Tweaks for speed

2022-01-29 08:22:08 +01:00 · 2022-01-29 08:22:08 +01:00 · d875416200
commit d875416200
parent 8ab094ad3d
2 changed files with 128 additions and 135 deletions
--- a/sample/simple3DFilter.cpp
+++ b/sample/simple3DFilter.cpp
@ -1,22 +1,22 @@
-#include "openmp.hpp"
-#include "omptl/algorithm"
-#include <cassert>
-#include "yorick.hpp"
-#include "sphSmooth.hpp"
-#include "mykdtree.hpp"
-#include "miniargs.hpp"
-#include <H5Cpp.h>
 #include "hdf5_array.hpp"
-#include <iostream>
-#include <boost/format.hpp>
+#include "miniargs.hpp"
+#include "mykdtree.hpp"
+#include "omptl/algorithm"
+#include "openmp.hpp"
+#include "sphSmooth.hpp"
+#include "yorick.hpp"
+#include <H5Cpp.h>
 #include <boost/bind.hpp>
+#include <boost/format.hpp>
+#include <cassert>
+#include <iostream>

 using namespace std;
 using namespace CosmoTool;

 #define N_SPH 32

-struct VCoord{
+struct VCoord {
  float v[3];
  float mass;
 };
@ -27,46 +27,42 @ typedef boost::multi_array<float, 2> array_type;
 typedef boost::multi_array<float, 3> array3_type;
 typedef boost::multi_array<float, 4> array4_type;

-ComputePrecision getVelocity(const VCoord& v, int i)
-{
-  return v.mass * v.v[i];
-}
+ComputePrecision getVelocity(const VCoord &v, int i) { return v.mass * v.v[i]; }

-ComputePrecision getMass(const VCoord& v)
-{
-  return v.mass;
-}
+ComputePrecision getMass(const VCoord &v) { return v.mass; }

 typedef SPHSmooth<VCoord> MySmooth;
 typedef MySmooth::SPHTree MyTree;
 typedef MyTree::Cell MyCell;

-template<typename FuncT>
-void computeInterpolatedField(MyTree *tree1, double boxsize, int Nres, double cx, double cy, double cz,
-                              array3_type& bins, array3_type& arr, FuncT func, double rLimit2)
-{
+template <typename FuncT>
+void computeInterpolatedField(MyTree *tree1, double boxsize, int Nres,
+                              double cx, double cy, double cz,
+                              array3_type &bins, array3_type &arr, FuncT func,
+                              double rLimit2) {
+  // Progress is logged by the one iteration that opens each z-slab (see below).
 #pragma omp parallel
  {
    MySmooth smooth1(tree1, N_SPH);

-#pragma omp for schedule(dynamic) 
-    for (int rz = 0; rz < Nres; rz++)
-      {
-        double pz = (rz)*boxsize/Nres-cz;
+#pragma omp for collapse(3) schedule(dynamic)
+    for (int rz = 0; rz < Nres; rz++) {

-        cout << format("[%d] %d / %d") % smp_get_thread_id() % rz % Nres << endl;
-        for (int ry = 0; ry < Nres; ry++)
-          {
-            double py = (ry)*boxsize/Nres-cy;
-            for (int rx = 0; rx < Nres; rx++)
-              {
-                double px = (rx)*boxsize/Nres-cx;
+      for (int ry = 0; ry < Nres; ry++) {
+        for (int rx = 0; rx < Nres; rx++) {
+          // Exactly one (rx, ry) pair per slab reports, so no shared counter
+          // has to be read/written by several threads (that was a data race).
+          if (rx == 0 && ry == 0)
+            cout << format("[%d] %d / %d") % smp_get_thread_id() % rz % Nres
+                 << endl;
+          double px = (rx)*boxsize / Nres - cx;
+          double py = (ry)*boxsize / Nres - cy;
+          double pz = (rz)*boxsize / Nres - cz;

-                MyTree::coords c = { float(px), float(py), float(pz) };
+          MyTree::coords c = {float(px), float(py), float(pz)};

-                double r2 = c[0]*c[0]+c[1]*c[1]+c[2]*c[2];
-                if (r2 > rLimit2)
-                  {
+          double r2 = c[0] * c[0] + c[1] * c[1] + c[2] * c[2];
+          if (r2 > rLimit2) {
            arr[rx][ry][rz] = 0;
            continue;
          }
@ -84,23 +80,20 @@ void computeInterpolatedField(MyTree *tree1, double boxsize, int Nres, double cx
  }
 }

-int main(int argc, char **argv)
-{
+int main(int argc, char **argv) {
  char *fname1, *outFile;
  double rLimit, boxsize, rLimit2, cx, cy, cz;
  int Nres;

-  MiniArgDesc args[] = {
-    { "INPUT DATA1", &fname1, MINIARG_STRING },
-    { "RADIUS LIMIT", &rLimit, MINIARG_DOUBLE },
-    { "BOXSIZE", &boxsize, MINIARG_DOUBLE },
-    { "RESOLUTION", &Nres, MINIARG_INT },
-    { "CX", &cx, MINIARG_DOUBLE },
-    { "CY", &cy, MINIARG_DOUBLE },
-    { "CZ", &cz, MINIARG_DOUBLE },
-    { "OUTPUT FILE", &outFile, MINIARG_STRING },
-    { 0, 0, MINIARG_NULL }
-  };
+  MiniArgDesc args[] = {{"INPUT DATA1", &fname1, MINIARG_STRING},
+                        {"RADIUS LIMIT", &rLimit, MINIARG_DOUBLE},
+                        {"BOXSIZE", &boxsize, MINIARG_DOUBLE},
+                        {"RESOLUTION", &Nres, MINIARG_INT},
+                        {"CX", &cx, MINIARG_DOUBLE},
+                        {"CY", &cy, MINIARG_DOUBLE},
+                        {"CZ", &cz, MINIARG_DOUBLE},
+                        {"OUTPUT FILE", &outFile, MINIARG_STRING},
+                        {0, 0, MINIARG_NULL}};

  if (!parseMiniArgs(argc, argv, args))
    return 1;
@ -112,7 +105,7 @@ int main(int argc, char **argv)

  array3_type bins(boost::extents[Nres][Nres][Nres]);

-  rLimit2 = rLimit*rLimit;
+  rLimit2 = rLimit * rLimit;

  hdf5_read_array(in_f, "particles", v1_data);
  assert(v1_data.shape()[1] == 7);
@ -124,30 +117,30 @ int main(int argc, char **argv)
  MyCell *allCells_1 = new MyCell[N1_points];

 #pragma omp parallel for schedule(static)
-  for (uint32_t i = 0; i < Nres*Nres*Nres; i++)
+  for (uint64_t i = 0; i < bins.num_elements(); i++) // no int overflow in Nres^3
    bins.data()[i] = 0;

  cout << "Shuffling data in cells..." << endl;
 #pragma omp parallel for schedule(static)
-  for (uint64_t i = 0 ; i < N1_points; i++)
-    {
+  for (uint64_t i = 0; i < N1_points; i++) {
    for (int j = 0; j < 3; j++)
      allCells_1[i].coord[j] = v1_data[i][j];
    for (int k = 0; k < 3; k++)
-        allCells_1[i].val.pValue.v[k] = v1_data[i][3+k];
+      allCells_1[i].val.pValue.v[k] = v1_data[i][3 + k];
    allCells_1[i].val.pValue.mass = v1_data[i][6];
    allCells_1[i].active = true;
    allCells_1[i].val.weight = 0.0;

-      long rx = floor((allCells_1[i].coord[0]+cx)*Nres/boxsize+0.5);
-      long ry = floor((allCells_1[i].coord[1]+cy)*Nres/boxsize+0.5);
-      long rz = floor((allCells_1[i].coord[2]+cz)*Nres/boxsize+0.5);
+    long rx = floor((allCells_1[i].coord[0] + cx) * Nres / boxsize + 0.5);
+    long ry = floor((allCells_1[i].coord[1] + cy) * Nres / boxsize + 0.5);
+    long rz = floor((allCells_1[i].coord[2] + cz) * Nres / boxsize + 0.5);

    if (rx < 0 || rx >= Nres || ry < 0 || ry >= Nres || rz < 0 || rz >= Nres)
      continue;

-//#pragma omp atomic update
-      bins[rx][ry][rz]++;
+    auto &b = bins[rx][ry][rz]; // several particles can fall in the same bin
+#pragma omp atomic
+    b++;
  }
  v1_data.resize(boost::extents[1][1]);

@ -158,32 +151,31 @ int main(int argc, char **argv)

  cout << "Creating smoothing filter..." << endl;

-//  array3_type out_rad_1(boost::extents[Nres][Nres][Nres]);
+  //  array3_type out_rad_1(boost::extents[Nres][Nres][Nres]);

  cout << "Weighing..." << endl;

+  // Progress is logged by the one iteration that opens each z-slab (see below).
 #pragma omp parallel
  {
    MySmooth smooth1(&tree1, N_SPH);

-#pragma omp for schedule(dynamic) 
-    for (int rz = 0; rz < Nres; rz++)
-     {
-        double pz = (rz)*boxsize/Nres-cz;
-
+#pragma omp for collapse(3) schedule(dynamic, 8)
+    for (int rz = 0; rz < Nres; rz++) {
+      for (int ry = 0; ry < Nres; ry++) {
+        for (int rx = 0; rx < Nres; rx++) {
+          if (rx == 0 && ry == 0) {
+            // One report per slab; avoids racing on a shared progress counter.
            (cout << rz << " / " << Nres << endl).flush();
-        for (int ry = 0; ry < Nres; ry++)
-          {
-            double py = (ry)*boxsize/Nres-cy;
-            for (int rx = 0; rx < Nres; rx++)
-              {
-                double px = (rx)*boxsize/Nres-cx;
+          }
+          double pz = (rz)*boxsize / Nres - cz;
+          double py = (ry)*boxsize / Nres - cy;
+          double px = (rx)*boxsize / Nres - cx;

-                MyTree::coords c = { float(px), float(py), float(pz) };
+          MyTree::coords c = {float(px), float(py), float(pz)};

-                double r2 = c[0]*c[0]+c[1]*c[1]+c[2]*c[2];
-                if (r2 > rLimit2)
-                  {
+          double r2 = c[0] * c[0] + c[1] * c[1] + c[2] * c[2];
+          if (r2 > rLimit2) {
            continue;
          }

@ -192,11 +184,9 @@ int main(int argc, char **argv)
            smooth1.fetchNeighbours(c, numInCell);
          else
            smooth1.fetchNeighbours(c);
-#pragma omp critical
          smooth1.addGridSite(c);
        }
      }
-        (cout << " Done " << rz << endl).flush();
    }
  }

@ -204,13 +194,14 @@ int main(int argc, char **argv)

  array3_type interpolated(boost::extents[Nres][Nres][Nres]);

-  computeInterpolatedField(&tree1, boxsize, Nres, cx, cy, cz,
-                           bins, interpolated, getMass, rLimit2);
+  computeInterpolatedField(&tree1, boxsize, Nres, cx, cy, cz, bins,
+                           interpolated, getMass, rLimit2);
  hdf5_write_array(out_f, "density", interpolated);
-  //out_f.flush();
+  // out_f.flush();
  for (int i = 0; i < 3; i++) {
-      computeInterpolatedField(&tree1, boxsize, Nres, cx, cy, cz,
-                               bins, interpolated, boost::bind(getVelocity, _1, i), rLimit2);
+    computeInterpolatedField(&tree1, boxsize, Nres, cx, cy, cz, bins,
+                             interpolated, boost::bind(getVelocity, _1, i),
+                             rLimit2);
    hdf5_write_array(out_f, str(format("p%d") % i), interpolated);
  }

--- a/src/sphSmooth.tcc
+++ b/src/sphSmooth.tcc
@ -192,7 +192,9 @@ void SPHSmooth<ValType,Ndims>::addGridSite(const typename SPHTree::coords& c)
    {
      ComputePrecision d = internal.distances[i];
      SPHCell& cell = *(internal.ngb[i]);
-      cell.val.weight += getKernel(d/internal.smoothRadius) / r3;
+      double kernel_value = getKernel(d/internal.smoothRadius) / r3;
+#pragma omp atomic
+      cell.val.weight += kernel_value;
    }
 }