/*+ ARES/HADES/BORG Package -- ./libLSS/physics/openmp_cic.hpp Copyright (C) 2014-2020 Guilhem Lavaux Copyright (C) 2009-2020 Jens Jasche Additional contributions from: Guilhem Lavaux (2023) +*/ #ifndef __LIBLSS_PHYSICS_OPENMP_CIC_HPP #define __LIBLSS_PHYSICS_OPENMP_CIC_HPP #include #include "libLSS/tools/console.hpp" #include #include #include #include #include "libLSS/tools/array_tools.hpp" #include "libLSS/physics/generic_cic.hpp" namespace LibLSS { template struct OpenMPCloudInCell_impl { typedef T Type; // Number of extra planes required in case of MPI static const int MPI_PLANE_LEAKAGE = 1; typedef boost::multi_array ListArray; typedef boost::multi_array AtomicListArray; template < typename ParticleArray, typename ProjectionDensityArray, typename WeightArray, typename PeriodicFunction> static void projection( const ParticleArray &particles, ProjectionDensityArray &density, T Lx, T Ly, T Lz, size_t N0, size_t N1, size_t N2, const PeriodicFunction &p, const WeightArray &weight, size_t Np) { using boost::extents; ConsoleContext ctx("OpenMP CIC projection"); T inv_dx = N0 / Lx; T inv_dy = N1 / Ly; T inv_dz = N2 / Lz; typedef UninitializedArray U_AtomicListArray; typedef UninitializedArray U_ListArray; U_AtomicListArray part_mesh_p(extents[long(N0) * long(N1) * long(N2)]); U_ListArray part_list_p(extents[Np]); U_AtomicListArray::array_type &part_mesh = part_mesh_p.get_array(); U_ListArray::array_type &part_list = part_list_p.get_array(); long Nmesh = part_mesh.num_elements(); { ConsoleContext ctx0("initialize arrays"); array::fill(part_mesh, -1); array::fill(part_list, -1); } { ConsoleContext ctx0("build mesh list"); // First build part -> mesh list #pragma omp parallel for schedule(static) for (size_t i_part = 0; i_part < Np; i_part++) { T x = particles[i_part][0] * inv_dx; T y = particles[i_part][1] * inv_dy; T z = particles[i_part][2] * inv_dz; size_t ix = (size_t)std::floor(x); size_t iy = (size_t)std::floor(y); size_t iz = (size_t)std::floor(z); size_t idx = iz + N2 * iy + N2 * N1 * ix; int initial_elt = __atomic_exchange_n(&part_mesh[idx], i_part, __ATOMIC_RELAXED); if (initial_elt != -1) { part_list[i_part] = initial_elt; } } } { ConsoleContext ctx0("reverse list"); // We built the list in the incorrect order, reverse it as fast as we can #pragma omp parallel for schedule(dynamic, 10000) for (size_t mid = 0; mid < Nmesh; mid++) { int current_part = part_mesh[mid]; if (current_part >= 0) { int next_part = part_list[current_part]; part_list[current_part] = -1; while (next_part != -1) { int p = part_list[next_part]; part_list[next_part] = current_part; current_part = next_part; next_part = p; } part_mesh[mid] = current_part; } } } { ConsoleContext ctx0("projection"); #pragma omp parallel { for (int looper0 = 0; looper0 < 2; looper0++) { for (int looper1 = 0; looper1 < 2; looper1++) { for (int looper2 = 0; looper2 < 2; looper2++) { int r[3] = {looper0, looper1, looper2}; #pragma omp barrier #pragma omp for schedule(dynamic, 10000) for (long mid = 0; mid < Nmesh; mid++) { int mz = mid % N2; int my = (mid / N2) % N1; int mx = (mid / (N2 * N1)); int i_part = part_mesh[mid]; T w = 0; while (i_part != -1) { T w0 = 1; T x = particles[i_part][0] * inv_dx; T y = particles[i_part][1] * inv_dy; T z = particles[i_part][2] * inv_dz; T qx = std::floor(x); T qy = std::floor(y); T qz = std::floor(z); T dx = x - qx; T dy = y - qy; T dz = z - qz; w0 = (r[0] == 1) ? dx : (T(1) - dx); w0 *= (r[1] == 1) ? dy : (T(1) - dy); w0 *= (r[2] == 1) ? dz : (T(1) - dz); w += w0 * weight[i_part]; i_part = part_list[i_part]; } size_t tx = (mx + looper0); size_t ty = (my + looper1); size_t tz = (mz + looper2); p(tx, ty, tz); density[tx][ty][tz] += w; } } } } } #pragma omp barrier } } template < typename GradientArray, typename ProjectionDensityArray, typename WeightArray> static inline typename std::enable_if::type __do_gradient( GradientArray &adj_gradient, const ProjectionDensityArray &density, WeightArray const &a_w, size_t i, int axis, size_t ix, size_t iy, size_t iz, size_t jx, size_t jy, size_t jz, T x, T y, T z, T global_w) { T rx, ry, rz; T qx, qy, qz; switch (axis) { case 0: rx = 1; qx = -1; ry = y - iy; qy = 1 - ry; rz = z - iz; qz = 1 - rz; break; case 1: rx = x - ix; qx = 1 - rx; ry = 1; qy = -1; rz = z - iz; qz = 1 - rz; break; case 2: rx = x - ix; qx = 1 - rx; ry = y - iy; qy = 1 - ry; rz = 1; qz = -1; break; } double w = density[ix][iy][iz] * qx * qy * qz + density[ix][iy][jz] * qx * qy * rz + density[ix][jy][iz] * qx * ry * qz + density[ix][jy][jz] * qx * ry * rz + density[jx][iy][iz] * rx * qy * qz + density[jx][iy][jz] * rx * qy * rz + density[jx][jy][iz] * rx * ry * qz + density[jx][jy][jz] * rx * ry * rz; adj_gradient[i][axis] = a_w[axis] * w * global_w; } template static inline void __do_gradient( GradientArray &adj_gradient, const ProjectionDensityArray &density, T a_w, size_t i, int axis, size_t ix, size_t iy, size_t iz, size_t jx, size_t jy, size_t jz, T x, T y, T z, T global_w) { T rx, ry, rz; T qx, qy, qz; switch (axis) { case 0: rx = 1; qx = -1; ry = y - iy; qy = 1 - ry; rz = z - iz; qz = 1 - rz; break; case 1: rx = x - ix; qx = 1 - rx; ry = 1; qy = -1; rz = z - iz; qz = 1 - rz; break; case 2: rx = x - ix; qx = 1 - rx; ry = y - iy; qy = 1 - ry; rz = 1; qz = -1; break; } double w = density[ix][iy][iz] * qx * qy * qz + density[ix][iy][jz] * qx * qy * rz + density[ix][jy][iz] * qx * ry * qz + density[ix][jy][jz] * qx * ry * rz + density[jx][iy][iz] * rx * qy * qz + density[jx][iy][jz] * rx * qy * rz + density[jx][jy][iz] * rx * ry * qz + density[jx][jy][jz] * rx * ry * rz; adj_gradient[i][axis] += a_w * w * global_w; } template < typename ParticleArray, typename ProjectionDensityArray, typename GradientArray, typename PeriodicFunction, typename WeightArray> static void adjoint( const ParticleArray &particles, ProjectionDensityArray &density, GradientArray &adjoint_gradient, const WeightArray &w, T Lx, T Ly, T Lz, size_t N0, size_t N1, size_t N2, const PeriodicFunction &p, T nmean, size_t Np) { ConsoleContext ctx("Classic CIC adjoint-projection"); T inv_dx = N0 / Lx; T inv_dy = N1 / Ly; T inv_dz = N2 / Lz; T inv_nmean = 1 / nmean; size_t minX = density.index_bases()[0], minY = density.index_bases()[1], minZ = density.index_bases()[2], maxX = density.index_bases()[0] + density.shape()[0], maxY = density.index_bases()[1] + density.shape()[1], maxZ = density.index_bases()[2] + density.shape()[2]; #pragma omp parallel for schedule(static) for (long i = 0; i < Np; i++) { T x = particles[i][0] * inv_dx; T y = particles[i][1] * inv_dy; T z = particles[i][2] * inv_dz; size_t ix = (size_t)std::floor(x); size_t iy = (size_t)std::floor(y); size_t iz = (size_t)std::floor(z); size_t jx = (ix + 1); size_t jy = (iy + 1); size_t jz = (iz + 1); p(jx, jy, jz); if (ix < minX || ix >= maxX || iy < minY || iy >= maxY || iz < minZ || iz >= maxZ) continue; __do_gradient( adjoint_gradient, density, w[i], i, 0, ix, iy, iz, jx, jy, jz, x, y, z, inv_dx * inv_nmean); __do_gradient( adjoint_gradient, density, w[i], i, 1, ix, iy, iz, jx, jy, jz, x, y, z, inv_dy * inv_nmean); __do_gradient( adjoint_gradient, density, w[i], i, 2, ix, iy, iz, jx, jy, jz, x, y, z, inv_dz * inv_nmean); } } }; template class OpenMPCloudInCell : public GenericCIC> { public: typedef T Type; // Number of extra planes required in case of MPI static const int MPI_PLANE_LEAKAGE = 1; static const int MPI_NEGATIVE_PLANE_LEAKAGE = 0; typedef CIC_Distribution Distribution; typedef CIC_Tools::Periodic_MPI Periodic_MPI; }; } // namespace LibLSS #endif