Initial import

Guilhem Lavaux 2023-05-29 10:41:03 +02:00
commit 56a50eead3
820 changed files with 192077 additions and 0 deletions


@ -0,0 +1,399 @@
/*+
ARES/HADES/BORG Package -- ./libLSS/physics/class_cosmo.cpp
Copyright (C) 2020 Jens Jasche <jens.jasche@fysik.su.se>
Copyright (C) 2021 Guilhem Lavaux <guilhem.lavaux@iap.fr>
Additional contributions from:
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
+*/
#include <iostream>
#include <fstream>
#include <locale.h>
#include <boost/algorithm/string/trim.hpp>
#include "libLSS/tools/console.hpp"
#include "class_cosmo.hpp"
#include <class_code/class.h>
#include "libLSS/tools/errors.hpp"
#include "libLSS/tools/string_tools.hpp"
#include "libLSS/tools/auto_interpolator.hpp"
using namespace LibLSS;
using namespace std;
namespace LibLSS {
struct OpaqueClass {
struct precision pr; // for precision parameters
struct background ba; // for cosmological background
struct thermo th; // for thermodynamics
struct perturbs pt; // for source functions
struct transfers tr; // for transfer functions
struct primordial pm; // for primordial spectra
struct spectra sp; // for output spectra
struct nonlinear nl; // for non-linear spectra
struct lensing le; // for lensed spectra
struct output op; // for output files
ErrorMsg errmsg; // for error messages
bool bg_init, th_init, pt_init, prim_init;
OpaqueClass() {
bg_init = false;
th_init = false;
pt_init = false;
prim_init = false;
ba.N_ncdm = 0;
}
~OpaqueClass() {
if (ba.N_ncdm > 0)
delete[] ba.Omega0_ncdm;
if (bg_init)
background_free(&ba);
if (th_init)
thermodynamics_free(&th);
if (pt_init)
perturb_free(&pt);
if (prim_init)
primordial_free(&pm);
}
LibLSS::auto_interpolator<double> interpolate_mTk;
};
} // namespace LibLSS
ClassCosmo::ClassCosmo(CosmologicalParameters const &cosmo) {
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
numInterpolationPoints = 1024;
opaque = std::make_unique<OpaqueClass>();
std::string previous_locale = std::string(setlocale(LC_NUMERIC, 0));
// CLASS is not safe w.r.t. locale settings. It reads tables with sscanf, which
// is sensitive to the locale setup.
setlocale(LC_NUMERIC, "C");
try {
// Set all class values to default
if (input_default_params(
&opaque->ba, &opaque->th, &opaque->pt, &opaque->tr, &opaque->pm,
&opaque->sp, &opaque->nl, &opaque->le, &opaque->op) == _FAILURE_) {
ctx.format2<LOG_ERROR>(
"Error running input_default_params => %s", opaque->op.error_message);
error_helper<ErrorBadState>("Error in CLASS");
}
{
auto &pba = opaque->ba;
double
sigma_B; /* Stefan-Boltzmann constant in \f$ W/m^2/K^4 = Kg/K^4/s^3 \f$*/
sigma_B =
2. * pow(_PI_, 5) * pow(_k_B_, 4) / 15. / pow(_h_P_, 3) / pow(_c_, 2);
double omega_cdm = cosmo.omega_m - cosmo.omega_b;
double Omega_tot = 0;
pba.h = cosmo.h;
pba.H0 = pba.h * 1.e5 / _c_;
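// Photon density today: Omega0_g = (4 sigma_B T_cmb^4 / c) / rho_crit, with rho_crit = 3 c^2 H0^2 / (8 pi G) the critical energy density.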
pba.Omega0_g = (4. * sigma_B / _c_ * pow(pba.T_cmb, 4.)) /
(3. * _c_ * _c_ * 1.e10 * pba.h * pba.h / _Mpc_over_m_ /
_Mpc_over_m_ / 8. / _PI_ / _G_);
Omega_tot += pba.Omega0_g;
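// Ultra-relativistic (massless) neutrinos: N_eff = 3.046 species, each contributing (7/8) (4/11)^{4/3} of the photon density.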
pba.Omega0_ur = 3.046 * 7. / 8. * pow(4. / 11., 4. / 3.) * pba.Omega0_g;
Omega_tot += pba.Omega0_ur;
pba.Omega0_idr = 0.0;
Omega_tot += pba.Omega0_idr;
pba.Omega0_idm_dr = 0.0;
pba.T_idr = 0.0;
pba.Omega0_b = cosmo.omega_b;
Omega_tot += pba.Omega0_b;
pba.Omega0_cdm = omega_cdm;
Omega_tot += pba.Omega0_cdm;
{
// CLP parametrization
pba.fluid_equation_of_state = CLP;
pba.w0_fld = cosmo.w;
pba.wa_fld = cosmo.wprime;
pba.Omega0_fld = cosmo.omega_q;
Omega_tot += pba.Omega0_fld;
}
pba.Omega0_k = cosmo.omega_k;
pba.N_ncdm = 1;
pba.Omega0_ncdm = new double[1];
pba.Omega0_ncdm[0] = cosmo.sum_mnu;
opaque->pt.alpha_idm_dr = nullptr;
opaque->pt.beta_idr = nullptr;
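// Whatever density remains after curvature and the components above goes into the cosmological constant; K is the curvature term in units of (a_today * H0)^2.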
pba.Omega0_lambda = 1 - pba.Omega0_k - Omega_tot;
pba.K = -pba.Omega0_k * pow(pba.a_today * pba.H0, 2);
/** - Set curvature sign */
if (pba.K > 0.)
pba.sgnK = 1;
else if (pba.K < 0.)
pba.sgnK = -1;
}
// Set all class precision values to default
if (input_default_precision(&opaque->pr) == _FAILURE_) {
ctx.format2<LOG_ERROR>(
"Error running input_default_precision => %s",
opaque->pr.error_message);
error_helper<ErrorBadState>("Error in CLASS");
}
opaque->pr.k_per_decade_for_pk = 30;
//initialize background calculations
if (background_init(&opaque->pr, &opaque->ba) == _FAILURE_) {
ctx.format2<LOG_ERROR>(
"Error running background_init => %s", opaque->ba.error_message);
error_helper<ErrorBadState>("Error in CLASS");
}
opaque->bg_init = true;
//opaque->th.thermodynamics_verbose = _TRUE_;
if (thermodynamics_init(&opaque->pr, &opaque->ba, &opaque->th) ==
_FAILURE_) {
ctx.format2<LOG_ERROR>(
"Error running thermodynamics_init => %s", opaque->th.error_message);
error_helper<ErrorBadState>("Error in CLASS");
}
opaque->th_init = true;
opaque->pt.has_perturbations = _TRUE_;
//opaque->pt.perturbations_verbose = 1;
opaque->pt.has_pk_matter = _TRUE_;
opaque->pt.has_density_transfers = _TRUE_;
opaque->pt.has_cls = _FALSE_;
//opaque->pt.k_max_for_pk = ;
if (perturb_init(&opaque->pr, &opaque->ba, &opaque->th, &opaque->pt) ==
_FAILURE_) {
ctx.format2<LOG_ERROR>(
"Error running perturb_init => %s", opaque->pt.error_message);
error_helper<ErrorBadState>("Error in CLASS");
}
opaque->pt_init = true;
if (primordial_init(&opaque->pr, &opaque->pt, &opaque->pm) == _FAILURE_) {
ctx.format2<LOG_ERROR>(
"Error running primordial_init => %s", opaque->pm.error_message);
error_helper<ErrorBadState>("Error in CLASS");
}
opaque->prim_init = true;
retrieve_Tk();
} catch (std::exception &e) {
setlocale(LC_NUMERIC, previous_locale.c_str());
throw;
}
setlocale(LC_NUMERIC, previous_locale.c_str());
}
double ClassCosmo::primordial_Pk(double k) {
//Input: wavenumber k in 1/Mpc (linear mode)
//Output: primordial spectra P(k) in \f$Mpc^3\f$ (linear mode)
double output;
primordial_spectrum_at_k(
&opaque->pm,
0, //choose scalar mode
linear, k, &output);
return output;
}
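// The transfer function is tabulated as log(-T(k)) against log(k) (see retrieve_Tk/reinterpolate); undo the log and restore the sign here.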
double ClassCosmo::get_Tk(double k) {
return -std::exp(opaque->interpolate_mTk(std::log(k)));
}
void ClassCosmo::retrieve_Tk() {
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
char *c_titles;
std::string titles;
double const output_redshift = 0;
// Query the available columns
c_titles = new char[_MAXTITLESTRINGLENGTH_];
std::fill(c_titles, c_titles + _MAXTITLESTRINGLENGTH_, 0);
if (perturb_output_titles(&opaque->ba, &opaque->pt, class_format, c_titles) ==
_FAILURE_) {
delete[] c_titles;
ctx.format2<LOG_ERROR>(
"Error running perturb_output_titles => %s", opaque->pt.error_message);
error_helper<ErrorBadState>("Error in CLASS");
}
titles = c_titles;
delete[] c_titles;
// Retrieve relevant data
auto names = LibLSS::tokenize(boost::algorithm::trim_copy(titles), "\t");
ctx.print(LibLSS::to_string(names));
auto index_md = opaque->pt.index_md_scalars;
auto number_of_titles = names.size();
auto number_of_ic = opaque->pt.ic_size[index_md];
auto timesteps = opaque->pt.k_size[index_md];
auto size_ic_data = timesteps * number_of_titles;
auto ic_num = opaque->pt.ic_size[index_md];
auto data = new double[size_ic_data * ic_num];
if (perturb_output_data(
&opaque->ba, &opaque->pt, class_format, output_redshift,
number_of_titles, data) == _FAILURE_) {
delete[] data;
ctx.format2<LOG_ERROR>(
"Error running perturb_output_data => %s", opaque->pt.error_message);
error_helper<ErrorBadState>("Error in CLASS");
}
// Adiabatic mode is referenced at opaque->pt.index_ic_ad
auto index_ic = opaque->pt.index_ic_ad;
auto result_k = std::find(names.begin(), names.end(), "k (h/Mpc)");
Console::instance().c_assert(
result_k != names.end(), "Invalid returned arrays for k from CLASS");
auto k_title = std::distance(names.begin(), result_k);
auto result = std::find(names.begin(), names.end(), "d_tot");
Console::instance().c_assert(
result != names.end(), "Invalid returned arrays from CLASS");
auto mTk_title = std::distance(names.begin(), result);
ctx.format("k_title=%d, mTk_title=%d", k_title, mTk_title);
auto get_data = [&](size_t step, size_t title) {
return data[index_ic * size_ic_data + step * number_of_titles + title];
};
array_1d k, Tk;
k.resize(boost::extents[timesteps]);
Tk.resize(boost::extents[timesteps]);
for (size_t step = 0; step < timesteps; step++) {
Tk[step] = -get_data(
step, mTk_title); // Laplacian between density and potential is negative
k[step] = get_data(step, k_title);
}
reinterpolate(k, Tk);
delete[] data;
}
ClassCosmo::~ClassCosmo() {}
void ClassCosmo::reinterpolate(array_ref_1d const &k, array_ref_1d const &Tk) {
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
double k_min = opaque->pt.k_min / opaque->ba.h;
double k_max = opaque->pt.k_max / opaque->ba.h;
double delta_ln_k = std::log(k_max / k_min) / (numInterpolationPoints + 1);
double log_k_min = std::log(k_min);
double log_k_max = std::log(k_max);
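// Resample the tabulated transfer function onto a regular grid in log(k), interpolating linearly in log(k)-log(T) space.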
size_t i_in_k = 0;
auto newTk =
new boost::multi_array<double, 1>(boost::extents[numInterpolationPoints]);
ctx.format(
"numInterpolationPoints = %d, k.size() = %d, k_min=%g, k_max=%g",
numInterpolationPoints, k.size(), k_min, k_max);
for (size_t i = 0; i < numInterpolationPoints; i++) {
double target_k = std::exp(delta_ln_k * i + log_k_min);
while (i_in_k < k.size() && k[i_in_k] < target_k) {
i_in_k++;
}
Console::instance().c_assert(i_in_k < k.size(), "Bad reinterpolation");
if (i_in_k == 0 && k[i_in_k] == k_min) {
(*newTk)[i] = std::log(Tk[0]);
} else if (k[i_in_k - 1] == 0) {
(*newTk)[i] =
std::log(Tk[i_in_k]) / std::log(k[i_in_k]) * std::log(target_k);
} else {
double alpha = std::log(target_k / k[i_in_k - 1]) /
std::log(k[i_in_k] / k[i_in_k - 1]);
Console::instance().c_assert(
alpha > 0 && alpha < 1, "Bad alpha for interpolation");
(*newTk)[i] =
std::log(Tk[i_in_k - 1]) * (1 - alpha) + std::log(Tk[i_in_k]) * alpha;
}
}
opaque->interpolate_mTk = LibLSS::auto_interpolator<double>(
log_k_min, log_k_max, delta_ln_k, std::log(Tk[0]), 0.0, newTk);
opaque->interpolate_mTk.setThrowOnOverflow();
}
void ClassCosmo::updateCosmo() {
//ba.h = 0.67556;
auto &ba = opaque->ba;
auto &pba = opaque->ba;
ba.H0 = pba.h * 1.e5 / _c_;
ba.T_cmb = 2.7255;
ba.Omega0_b = 0.022032 / pow(pba.h, 2);
ba.Omega0_cdm = 0.12038 / pow(pba.h, 2);
ba.Omega0_dcdmdr = 0.0;
ba.Omega0_dcdm = 0.0;
ba.Gamma_dcdm = 0.0;
ba.N_ncdm = 0;
ba.Omega0_ncdm_tot = 0.;
ba.ksi_ncdm_default = 0.;
ba.ksi_ncdm = NULL;
ba.Omega0_scf = 0.; // Scalar field defaults
ba.attractor_ic_scf = _TRUE_;
ba.scf_parameters = NULL;
ba.scf_parameters_size = 0;
ba.scf_tuning_index = 0;
ba.Omega0_k = 0.;
ba.K = 0.;
ba.sgnK = 0;
ba.Omega0_lambda = 1. - pba.Omega0_k - pba.Omega0_g - pba.Omega0_ur -
pba.Omega0_b - pba.Omega0_cdm - pba.Omega0_ncdm_tot -
pba.Omega0_dcdmdr - pba.Omega0_idm_dr - pba.Omega0_idr;
ba.Omega0_fld = 0.;
ba.w0_fld = -1.;
ba.wa_fld = 0.;
ba.Omega_EDE = 0.;
ba.cs2_fld = 1.;
ba.shooting_failed = _FALSE_;
}
ClassCosmo::DictCosmology ClassCosmo::getCosmology() {
DictCosmology state;
state["Omega_g"] = opaque->ba.Omega0_g;
state["Omega_m"] = opaque->ba.Omega0_m;
state["N_ncdm"] = opaque->ba.N_ncdm;
state[lssfmt::format("Omega0_ncdm_%d", 0)] = opaque->ba.Omega0_ncdm[0];
state["Omega_k"] = opaque->ba.Omega0_k;
state["Omega_lambda"] = opaque->ba.Omega0_lambda;
state["Omega_m"] = opaque->ba.Omega0_m;
return state;
}
void ClassCosmo::setInterpolation(size_t numPoints) {
numInterpolationPoints = numPoints;
}
// ARES TAG: num_authors = 2
// ARES TAG: name(0) = Jens Jasche
// ARES TAG: email(0) = jens.jasche@fysik.su.se
// ARES TAG: year(0) = 2020
// ARES TAG: name(1) = Guilhem Lavaux
// ARES TAG: email(1) = guilhem.lavaux@iap.fr
// ARES TAG: year(1) = 2021


@ -0,0 +1,55 @@
/*+
ARES/HADES/BORG Package -- ./libLSS/physics/class_cosmo.hpp
Copyright (C) 2020 Jens Jasche <jens.jasche@fysik.su.se>
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
Additional contributions from:
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
+*/
#ifndef __LIBLSS_CLASS_COSMO_HPP
# define __LIBLSS_CLASS_COSMO_HPP
# include <map>
# include <string>
# include <memory>
# include "libLSS/physics/cosmo.hpp"
namespace LibLSS {
struct OpaqueClass;
class ClassCosmo {
private:
std::unique_ptr<OpaqueClass> opaque;
typedef boost::multi_array_ref<double, 1> array_ref_1d;
typedef boost::multi_array<double, 1> array_1d;
size_t numInterpolationPoints;
public:
typedef std::map<std::string, double> DictCosmology;
ClassCosmo(CosmologicalParameters const &params); // This is the constructor
~ClassCosmo();
void setInterpolation(size_t numPoints);
double primordial_Pk(double k);
void updateCosmo();
double get_Tk(double k);
void retrieve_Tk();
DictCosmology getCosmology();
protected:
void reinterpolate(array_ref_1d const &k, array_ref_1d const &Tk);
};
} // namespace LibLSS
#endif
// ARES TAG: num_authors = 1
// ARES TAG: name(0) = Jens Jasche
// ARES TAG: email(0) = jens.jasche@fysik.su.se
// ARES TAG: year(0) = 2020

File diff suppressed because it is too large


@ -0,0 +1,71 @@
/*+
ARES/HADES/BORG Package -- ./libLSS/physics/classic_gpot.hpp
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
Additional contributions from:
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
+*/
#ifndef __LIBLSS_PHYSICS_CLASSIC_GPOT_HPP
#define __LIBLSS_PHYSICS_CLASSIC_GPOT_HPP
#include <cmath>
#include "libLSS/tools/console.hpp"
#include <boost/multi_array.hpp>
using namespace LibLSS;
typedef boost::multi_array_types::extent_range range;
using boost::extents;
using boost::format;
namespace LibLSS {
template<typename T>
struct ClassicGravitationalPotential {
typedef T Type;
typedef boost::multi_array<T, 3> DensityArray;
typedef boost::multi_array<T, 3> GravityArray;
template<typename PotentialArray>
static void potential(const PotentialArray& dens, PotentialArray& pot, T Om, T L0, T L1, T L2,
int N0, int N1, int N2) {
ConsoleContext<LOG_DEBUG> ctx("Classic GravitationalPotential estimation");
//transform density to F-space
MFCalls::execute_r2c(analysis_plan, dens.data(), AUX0.data());
double normphi=3./2.*Om;
#pragma omp parallel for
for (int i=0 ; i<startN0+localN0;i++)
for (int j=0 ; j<N1;j++)
for (int k=0; k<N2_HC;k++)
{
double kk[3];
kk[0]=kmode(i,N0,L0);
kk[1]=kmode(j,N1,L1);
kk[2]=kmode(k,N2,L2);
double sin20 = sin(kk[0]/2.)*sin(kk[0]/2.);
double sin21 = sin(kk[1]/2.)*sin(kk[1]/2.);
double sin22 = sin(kk[2]/2.)*sin(kk[2]/2.);
double Greens = - normphi/4./(sin20+sin21+sin22);
AUX0[i][j][k] *= Greens;
}
//fix zero mode by hand
if (startN0 == 0 && localN0 > 0) {
AUX0[0][0][0]=0;
}
MFCalls::execute_c2r(synthesis_plan, AUX0.data(), pot.data());
}
};
}
#endif

libLSS/physics/cosmo.cpp (new file, 482 lines)

@ -0,0 +1,482 @@
/*+
ARES/HADES/BORG Package -- ./libLSS/physics/cosmo.cpp
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
Additional contributions from:
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
+*/
#include <gsl/gsl_const_num.h>
#include <gsl/gsl_const_mksa.h>
#include <gsl/gsl_errno.h>
#include <gsl/gsl_odeiv.h>
#include <gsl/gsl_linalg.h>
#include <gsl/gsl_integration.h>
#include <cmath>
#include <CosmoTool/algo.hpp>
#include "cosmo.hpp"
#include "libLSS/tools/gsl_error.hpp"
#include "libLSS/tools/gslIntegrate.hpp"
using namespace LibLSS;
using namespace std;
using CosmoTool::spower;
using CosmoTool::square;
#define epsabs COSMO_EPS
#define epsrel COSMO_EPS
static const int NEVAL = 1000;
static const double cosmo_Ggrav = GSL_CONST_MKSA_GRAVITATIONAL_CONSTANT;
static const double cosmo_clight =
GSL_CONST_MKSA_SPEED_OF_LIGHT; /* speed of light in m/s */
static const double cosmo_kB =
GSL_CONST_MKSA_BOLTZMANN; /* Boltzmann constant in m^2 * kg /s^2 /K */
static const double cosmo_parsec = GSL_CONST_MKSA_PARSEC; /* parsec in m */
static const double cosmo_kparsec = (1.0e3 * cosmo_parsec);
static const double cosmo_mparsec = (1.0e6 * cosmo_parsec);
static const double cosmo_gparsec = (1.0e9 * cosmo_parsec);
static const double cosmo_hubble =
(1.0e5 / cosmo_mparsec); /* Hubble constant in 1/s */
static const double cosmo_mp =
GSL_CONST_MKSA_MASS_PROTON; /* Mass of proton kg */
static const double cosmo_Mpc_cm = (1.0e2 * cosmo_mparsec); // cm
static const double cosmo_Mpc_m = (1.0e0 * cosmo_mparsec); // m
static const double cosmo_Mpc_km = (1.0e-3 * cosmo_mparsec); // km
static const double cosmo_H100_s = (100. / cosmo_Mpc_km); // s^-1
static const double cosmo_M_sun = (1.98892e33); // g
static const double cosmo_G_const_Mpc_Msun_s =
(cosmo_M_sun * (6.673e-8) / cosmo_Mpc_cm / cosmo_Mpc_cm /
cosmo_Mpc_cm); // Mpc^3 msun^-1 s^-2
static const double AMIN = 1e-6;
static double x_plus(double a, const CosmologicalParameters &p);
/* --- function w [dark energy eos parameter - time evolution] --- */
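/* CPL parametrization: w(a) = w0 + wa * (1 - a) */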
static double w_Q(double a, const CosmologicalParameters &p) {
return p.w + p.wprime * (1.0 - a);
}
/* --- function aux_q --- */
static double aux_q(double a, const CosmologicalParameters &p) {
return 3.0 / 2.0 * (1.0 - w_Q(a, p) / (1.0 + x_plus(a, p))) / a;
}
/* --- function x_plus [auxiliary function, see Linder+Jenkins MNRAS 346, 573-583 (2003) for definition] --- */
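/* x_plus(a) = Omega_m / (1 - Omega_m) * a^{3 (w0 + wa)} * exp(3 wa (1 - a)) */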
static double x_plus(double a, const CosmologicalParameters &p) {
double aux = 3 * p.wprime * (1 - a);
return p.omega_m / (1 - p.omega_m) * pow(a, 3 * (p.w + p.wprime)) * exp(aux);
}
/* --- function dx_plus [derivative of x_plus, dx_plus(a) = d(x_plus(a))/da] --- */
double x_plus_prime(double a, const CosmologicalParameters &p) {
return 3 * x_plus(a, p) * w_Q(a, p) / a;
}
/* --- function aux_r --- */
static double aux_r(double a, const CosmologicalParameters &p) {
double aux = x_plus(a, p);
return 3.0 / 2.0 * aux / (1.0 + aux) / spower<2, double>(a);
}
/* --- --- */
static double aux_dr(double a, const CosmologicalParameters &p) {
double ra;
ra = aux_r(a, p);
return x_plus_prime(a, p) / (1.0 + x_plus(a, p)) *
(3.0 / 2.0 / spower<2, double>(a) - ra) -
2.0 * ra / a;
}
/* --- --- */
static double aux_dq(double a, const CosmologicalParameters &p) {
double xp, result;
xp = 1.0 + x_plus(a, p);
result = -aux_q(a, p) / a;
result -=
3.0 / 2.0 / a / xp * (p.wprime - w_Q(a, p) * x_plus_prime(a, p) / xp);
result /= a;
return result;
}
/* --- function dplus_function - defines f0 = dy1/da and f1 = dy2/da --- */
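/* The linear growth factor obeys D''(a) = aux_r(a) * D - aux_q(a) * D', integrated here as the first-order system y[0] = D, y[1] = dD/da. */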
static int d_plus_function(double t, const double y[], double f[], void *data) {
CosmologicalParameters *params = (CosmologicalParameters *)data;
/* derivatives f_i = dy_i/dt */
f[0] = y[1];
f[1] = aux_r(t, *params) * y[0] - aux_q(t, *params) * y[1];
return (GSL_SUCCESS);
}
static int d_plus_jacobian(
double t, const double y[], double *dfdy, double dfdt[], void *data) {
gsl_matrix_view dfdy_mat = gsl_matrix_view_array(dfdy, 2, 2);
gsl_matrix *m = &dfdy_mat.matrix;
CosmologicalParameters *params = (CosmologicalParameters *)data;
/* jacobian df_i(t,y(t)) / dy_j */
gsl_matrix_set(m, 0, 0, 0.0);
gsl_matrix_set(m, 0, 1, 1.0);
gsl_matrix_set(m, 1, 0, 0.0);
gsl_matrix_set(m, 1, 1, -aux_q(t, *params));
/* gradient df_i/dt, explicit dependence */
dfdt[0] = 0.0;
dfdt[1] = aux_dr(t, *params) * y[0] - aux_dq(t, *params) * y[1];
return GSL_SUCCESS;
}
static double hubble(double a, const CosmologicalParameters &p) {
using CosmoTool::spower;
double result;
double aux;
result = p.omega_r / spower<4, double>(a);
result += p.omega_m / spower<3, double>(a);
result += p.omega_k / spower<2, double>(a);
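// CPL dark energy: rho_q(a) / rho_q(1) = a^{-3 (1 + w0 + wa)} * exp(3 wa (a - 1))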
aux = -(1 + p.w + p.wprime) * log(a) + p.wprime * (a - 1);
result += p.omega_q * exp(3 * aux);
return p.h * 100 * sqrt(result);
}
Cosmology::Cosmology(const CosmologicalParameters &p) : parameters(p) {
// Do a check if the cosmological parameters sum up to 1
Console::instance().print<LOG_DEBUG>(
"Checking the normalization of cosmological parameters");
double sum = 0;
sum = p.omega_r + p.omega_m + p.omega_k + p.omega_q;
if (sum != 1.0) {
error_helper<ErrorBadState>("omega_r + omega_m + omega_k + omega_q != 1");
}
norm_d_plus = aux_d_plus(1.0);
}
double Cosmology::Hubble(double a) const { return hubble(a, parameters); }
void Cosmology::precompute_d_plus() {
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
double result;
int status;
const gsl_odeiv_step_type *T = gsl_odeiv_step_bsimp;
gsl_odeiv_step *s = gsl_odeiv_step_alloc(T, 2);
gsl_odeiv_control *c = gsl_odeiv_control_y_new(0.0, epsrel);
gsl_odeiv_evolve *e = gsl_odeiv_evolve_alloc(2);
double t = AMIN,
habs = 1e-4; /* t = initial scale factor, habs = initial step size */
// TODO: Improve the initial condition
// The matter-dominated era is used to anchor the solution:
// D(a) \propto a for a -> 0, thus D'(a) = D(a)/a
double y[2] = {1.0, 1.0 / AMIN}; /* initial conditions: D(AMIN) = 1, dD/da(AMIN) = 1/AMIN */
const double AMAX = 1.0;
/* result from solution of a 2nd order differential equation, transformed to a system of 2 1st order deqs */
gsl_odeiv_system sys = {
d_plus_function, d_plus_jacobian, 2, (void *)&parameters};
unsigned int NUM_D = 10000;
const double log_a_min = std::log(AMIN);
const double log_a_max = std::log(AMAX);
const double delta_log_a = (log_a_max - log_a_min) / (NUM_D - 1);
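// Tabulate log(D) and log(dD/da) on a regular grid in log(a); the tables are wrapped in auto_interpolator objects below.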
auto D_data = new boost::multi_array<double, 1>(boost::extents[NUM_D + 1]);
auto Dprime_data =
new boost::multi_array<double, 1>(boost::extents[NUM_D + 1]);
unsigned int j = 0;
auto get_a = [&](unsigned int k) {
return std::exp(log_a_min + k * delta_log_a);
};
double a_current = get_a(0);
(*D_data)[0] = std::log(y[0]);
(*Dprime_data)[0] = std::log(y[1]);
for (j = 1; j <= NUM_D; j++) {
a_current = get_a(j);
while (t < a_current) {
status = gsl_odeiv_evolve_apply(e, c, s, &sys, &t, a_current, &habs, y);
if (status != GSL_SUCCESS) {
error_helper<ErrorBadState>("Error during ODE integration of Dplus");
}
}
(*D_data)[j] = std::log(y[0]);
(*Dprime_data)[j] = std::log(y[1]);
}
gsl_odeiv_evolve_free(e);
gsl_odeiv_control_free(c);
gsl_odeiv_step_free(s);
pre_dplus = std::make_shared<auto_interpolator<double>>(
log_a_min, log_a_max, delta_log_a, 0,
std::numeric_limits<double>::infinity(), D_data);
pre_dplus->setThrowOnOverflow();
pre_dplus_prime = std::make_shared<auto_interpolator<double>>(
log_a_min, log_a_max, delta_log_a, 0,
std::numeric_limits<double>::infinity(), Dprime_data);
pre_dplus_prime->setThrowOnOverflow();
norm_d_plus = std::exp((*pre_dplus)(std::log(1.0)));
}
double Cosmology::aux_d_plus(double a, double *result_d_plus_prime) const {
if (pre_dplus && pre_dplus_prime) {
double result = std::exp((*pre_dplus)(std::log(a)));
if (result_d_plus_prime != 0)
*result_d_plus_prime = std::exp((*pre_dplus_prime)(std::log(a)));
return result;
}
double result;
int status;
const gsl_odeiv_step_type *T = gsl_odeiv_step_bsimp;
gsl_odeiv_step *s = gsl_odeiv_step_alloc(T, 2);
gsl_odeiv_control *c = gsl_odeiv_control_y_new(0.0, epsrel);
gsl_odeiv_evolve *e = gsl_odeiv_evolve_alloc(2);
double t = AMIN,
habs = 1e-4; /* t = initial scale factor, habs = initial step size */
double y[2] = {
1.0, 1.0 / AMIN}; /* initial conditions: D(AMIN) = 1, dD/da(AMIN) = 1/AMIN */
/* result from solution of a 2nd order differential equation, transformed to a system of 2 1st order deqs */
gsl_odeiv_system sys = {
d_plus_function, d_plus_jacobian, 2, (void *)&parameters};
while (t < a) {
status = gsl_odeiv_evolve_apply(e, c, s, &sys, &t, a, &habs, y);
if (status != GSL_SUCCESS)
break;
}
gsl_odeiv_evolve_free(e);
gsl_odeiv_control_free(c);
gsl_odeiv_step_free(s);
result = y[0]; /* d_plus */
if (result_d_plus_prime)
*result_d_plus_prime = y[1]; /* d(d_plus)/da */
return result;
}
double Cosmology::dcom_dz(double z) const {
double result;
double a = 1. / (z + 1.);
result = cosmo_clight / Hubble(a) / cosmo_mparsec;
return (result);
}
double aux_dcom(double a, void *params) {
double result;
const CosmologicalParameters &p = *(const CosmologicalParameters *)params;
result = -1. / square(a) / hubble(a, p);
double clight = cosmo_clight / 1000.; ///km/s
return (clight * result);
}
double Cosmology::a2com(double a) const {
double result, error;
gsl_integration_workspace *wspace = gsl_integration_workspace_alloc(NEVAL);
gsl_function F;
F.function = &aux_dcom;
F.params = (void *)&parameters;
gsl_integration_qag(
&F, 1.0, a, epsabs, epsrel, NEVAL, GSL_INTEG_GAUSS61, wspace, &result,
&error);
gsl_integration_workspace_free(wspace);
return (result);
}
void Cosmology::precompute_com2a() {
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
if (pre_com2a)
return;
const unsigned int NUM_A = 10000; // TODO: benchmark precision
const double log_a_min = std::log(1e-4);
const double delta_log_a = std::log(1.0 / 1e-4) / NUM_A;
boost::multi_array<double, 1> log_d(boost::extents[NUM_A]);
#pragma omp parallel for
for (unsigned int i = 0; i < NUM_A; i++) {
const double a = std::exp(delta_log_a * i + log_a_min);
log_d[i] = std::log(a2com(a));
}
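// a2com(a) decreases monotonically with a, so log_d[0] (a = 1e-4) is the largest distance; invert the table onto a regular grid in log distance to obtain a(d).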
const double log_d_min = log_d[NUM_A - 1];
const double log_d_max = log_d[0];
const double delta_log_d = (log_d_max - log_d_min) / NUM_A;
auto log_a = new boost::multi_array<double, 1>(boost::extents[NUM_A]);
double current_log_d = log_d_min;
(*log_a)[0] = delta_log_a * (NUM_A - 1) + log_a_min;
unsigned int j = NUM_A - 1;
for (unsigned int i = 1; i < NUM_A; i++) {
current_log_d += delta_log_d;
while (current_log_d > log_d[j]) {
if (j == 0) {
ctx.print2<LOG_ERROR>("Bad reinterpolation state.");
MPI_Communication::instance()->abort();
}
j--;
}
Console::instance().c_assert(
j < NUM_A - 1, "Invalid state of the reinterpolation");
const double w = (current_log_d - log_d[j]) / (log_d[j + 1] - log_d[j]);
(*log_a)[i] = log_a_min + delta_log_a * ((1 - w) * j + (j + 1) * w);
}
pre_com2a = std::make_shared<auto_interpolator<double>>(
log_d_min, log_d_max, delta_log_d, 0,
std::numeric_limits<double>::infinity(), log_a);
pre_com2a->setThrowOnOverflow();
}
double Cosmology::com2a(double com) const {
if (pre_com2a) {
return std::exp((*pre_com2a)(std::log(com)));
}
return bisection(
A_MIN, A_MAX, 1e-6, com, [this](double a) { return a2com(a); });
}
double Cosmology::comph2a(double r) const {
double result = com2a(comph2com(r));
return (result);
}
double Cosmology::comph2d_plus(double r) const {
double a = com2a(comph2com(r));
double result = d_plus(a);
return (result);
}
double Cosmology::comph2g_plus(double r) const {
double a = com2a(comph2com(r));
double result = g_plus(a);
return (result);
}
double Cosmology::comph2Hubble(double r) const {
double a = com2a(comph2com(r));
double result = Hubble(a);
return (result);
}
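/* --- PM time-stepping factors: dtr = int da / (a^3 E(a)) and dtv = int da / (a^2 E(a)), with E(a) = H(a) / (100 h); these are the drift and kick factors of the particle-mesh integrator --- */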
/* --- function aux_dtr [auxiliary function for dtr] --- */
double aux_dtr(double a, void *params) {
double result;
const CosmologicalParameters &p = *(const CosmologicalParameters *)params;
///Fhubble=H0/adot
double H0 = 100.; ///km/s/Mpc
double FHubble = (p.h * H0 / hubble(a, p) / (a * a * a));
result = FHubble;
return (result);
}
/* --- function pm_time-stepping dtr --- */
double Cosmology::dtr(double ai, double af) {
double result, error;
gsl_integration_workspace *wspace = gsl_integration_workspace_alloc(NEVAL);
gsl_function F;
F.function = &aux_dtr;
F.params = (void *)&parameters;
gsl_integration_qag(
&F, ai, af, epsabs, epsrel, NEVAL, GSL_INTEG_GAUSS61, wspace, &result,
&error);
gsl_integration_workspace_free(wspace);
return (result);
}
/* --- end of function dtv --- */
/* --- function aux_dtv [auxiliary function for dtv] --- */
double aux_dtv(double a, void *params) {
double result;
const CosmologicalParameters &p = *(const CosmologicalParameters *)params;
///Fhubble=H0/adot
double H0 = 100.; ///km/s/Mpc
double FHubble = (p.h * H0 / hubble(a, p) / a / a);
result = FHubble;
return (result);
}
/* --- function pm_time-stepping dtv --- */
double Cosmology::dtv(double ai, double af) {
double result, error;
gsl_integration_workspace *wspace = gsl_integration_workspace_alloc(NEVAL);
gsl_function F;
F.function = &aux_dtv;
F.params = (void *)&parameters;
gsl_integration_qag(
&F, ai, af, epsabs, epsrel, NEVAL, GSL_INTEG_GAUSS61, wspace, &result,
&error);
gsl_integration_workspace_free(wspace);
return (result);
}
/* --- end of function dtv --- */
/* --- COLA time stepping --- */
double Cosmology::integral_d_plus(double ai, double af) {
return gslIntegrate(
[this](double a) -> double {
return aux_dtv(a, &parameters) * d_plus(a);
},
ai, af, epsrel, NEVAL);
}
/* --- end --- */
/* --- function critical density --- */
double Cosmology::rho_crit() {
double rho_c = 3. * pow(parameters.h * cosmo_H100_s, 2.) /
(8. * M_PI * cosmo_G_const_Mpc_Msun_s); //units [Msun/Mpc^3]
//calculates the critical density in units [Msun/(Mpc h^-1)^3]
return rho_c / parameters.h / parameters.h / parameters.h;
}
/* --- end of function critical density --- */
/* --- function mass of volume --- */
double Cosmology::mass_of_volume(double V) {
//returns the mean mass of a volume in units[Msun]
return rho_crit() * parameters.omega_m * V;
}
/* --- end of function mass of volume --- */

libLSS/physics/cosmo.hpp (new file, 196 lines)

@ -0,0 +1,196 @@
/*+
ARES/HADES/BORG Package -- ./libLSS/physics/cosmo.hpp
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
Additional contributions from:
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
+*/
#ifndef __LIBLSS_COSMO_HPP
#define __LIBLSS_COSMO_HPP
#include <string>
#include <functional>
#include <CosmoTool/hdf5_array.hpp>
#include "libLSS/tools/bisection.hpp"
#include "libLSS/tools/auto_interpolator.hpp"
namespace LibLSS {
struct CosmologicalParameters {
double omega_r; /* negligible radiation density */
double omega_k; /* curvature - flat prior for everything! */
double omega_m;
double omega_b;
double omega_q;
double w;
double n_s;
double fnl; /* non-linearity parameter, for primordial non-Gaussianity */
double wprime;
double sigma8;
double rsmooth;
double h;
double beta;
double z0;
double a0; /* scale factor at epoch of observation usually 1*/
double sum_mnu; /* sum of neutrino masses */
CosmologicalParameters()
: omega_r(0), omega_k(0), omega_m(0), omega_b(0), omega_q(0), w(0),
n_s(0), fnl(0), wprime(0), sigma8(0), h(0), beta(0), z0(0), a0(0),
sum_mnu(0) {}
bool operator==(CosmologicalParameters const &p2) const {
return omega_r == p2.omega_r && omega_k == p2.omega_k &&
omega_m == p2.omega_m && omega_b == p2.omega_b &&
omega_q == p2.omega_q && w == p2.w && n_s == p2.n_s &&
wprime == p2.wprime && sigma8 == p2.sigma8 && h == p2.h &&
sum_mnu == p2.sum_mnu;
}
bool operator!=(CosmologicalParameters const &p2) const {
return !operator==(p2);
}
};
static const double A_MIN = 0.;
static const double A_MAX = 30000.;
static const double COSMO_EPS = 1e-6;
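// Defines the inverse function TARGET(X) of the monotonic mapping ORIGINAL by bisection over [RANGE_MIN, RANGE_MAX].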
#define LIBLSS_COSMOLOGY_INVERSE_FUNCTION( \
TARGET, ORIGINAL, RANGE_MIN, RANGE_MAX) \
double TARGET(double X) const { \
return bisection( \
RANGE_MIN, RANGE_MAX, 1e-6, X, \
std::bind(&Cosmology::ORIGINAL, this, std::placeholders::_1)); \
}
class Cosmology {
private:
CosmologicalParameters parameters;
double A_spec; /* Normalization of the power spectrum */
int spec_type; /* indicates which power spectrum is currently used, and whether the normalization has to be reevaluated*/
double norm_d_plus;
double aux_d_plus(double a, double *result_d_plus_prime = 0) const;
std::shared_ptr<auto_interpolator<double>> pre_com2a, pre_dplus,
pre_dplus_prime;
public:
Cosmology(const CosmologicalParameters &parameters);
void precompute_com2a();
void precompute_d_plus();
CosmologicalParameters const &getParameters() const { return parameters; }
double a2z(double a) const { return 1 / a - 1; }
double z2a(double z) const { return 1 / (1 + z); }
double d_plus(double a) const { return aux_d_plus(a) / norm_d_plus; }
double d2dlum(double z, double d) const { return (1 + z) * d; }
double dlum2d(double z, double dlum) const { return dlum / (1 + z); }
double g_plus(double a) const {
double d_plus, d_plus_prime;
d_plus = aux_d_plus(a, &d_plus_prime);
return (a > COSMO_EPS) ? (a / d_plus * d_plus_prime) : 1.0;
}
double a2com(double a) const;
double com2a(double com) const;
double z2com(double z) const {
double a = z2a(z);
double dcom = a2com(a);
return dcom;
};
double dcom_dz(double z) const;
double com2comph(double r) const { return parameters.h * r; }
double comph2com(double r) const { return r / parameters.h; }
double comph2d_plus(double r) const;
double comph2g_plus(double r) const;
double comph2Hubble(double r) const;
double comph2a(double r) const;
double a2dlum(double a) const {
double z = a2z(a);
double dcom = a2com(a);
return (1 + z) * dcom;
};
double z2dlum(double z) const {
double a = z2a(z);
double dcom = a2com(a);
return (1 + z) * dcom;
};
LIBLSS_COSMOLOGY_INVERSE_FUNCTION(dlum2a, a2dlum, A_MIN, A_MAX);
double a2dA(double a) const {
double z = a2z(a);
double dcom = a2com(a);
return dcom / (1 + z);
};
double z2dA(double z) const {
double a = z2a(z);
double dcom = a2com(a);
return dcom / (1 + z);
};
//a2dA is not invertible over the full redshift range
LIBLSS_COSMOLOGY_INVERSE_FUNCTION(dA2a, a2dA, 0.5, A_MAX);
double Hubble(double a) const;
double hNow() const { return parameters.h; }
double k_J(double a);
double kF_baryon(double a);
double kSZ_kernel(double a);
void print_cdmspec2file(std::string outputFileName);
double power_spectrum(double k, int type);
double transfer_function(double k);
double power_spectrum_grav(double k, int type);
double rho_background_matter(double a);
double gravpot_norm();
double return_cosmo_par(std::string cosmopar);
double FHubble(double a) { return (parameters.h * 100 / (a * Hubble(a))); }
double dtr(double ai, double af);
double dtv(double ai, double af);
double integral_d_plus(double ai, double af);
double rho_crit();
double mass_of_volume(double V);
};
#undef LIBLSS_COSMOLOGY_INVERSE_FUNCTION
} // namespace LibLSS
// clang-format off
CTOOL_STRUCT_TYPE(LibLSS::CosmologicalParameters,
HDF5T_CosmologicalParameters,
((double, omega_r))
((double, omega_k))
((double, omega_m))
((double, omega_b))
((double, omega_q))
((double, w))
((double, n_s))
((double, fnl))
((double, wprime))
((double, sigma8))
((double, rsmooth))
((double, h))
((double, beta))
((double, z0))
((double, a0))
);
// clang-format on
#endif


@ -0,0 +1,61 @@
/*+
ARES/HADES/BORG Package -- ./libLSS/physics/cosmo_power.hpp
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
Additional contributions from:
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
+*/
#ifndef __ARES_COSMO_POWER_HPP
#define __ARES_COSMO_POWER_HPP
#include <CosmoTool/algo.hpp>
#include <CosmoTool/cosmopower.hpp>
#include "libLSS/tools/console.hpp"
#include "libLSS/tools/log_traits.hpp"
#include "libLSS/mcmc/global_state.hpp"
#include <boost/format.hpp>
#include "libLSS/physics/cosmo.hpp"
#include "libLSS/samplers/core/types_samplers.hpp"
namespace LibLSS {
inline void createCosmologicalPowerSpectrum(
MarkovState &state, CosmologicalParameters &cosmo_params,
double adjust = 1) {
double h;
using CosmoTool::square;
ConsoleContext<LOG_INFO_SINGLE> ctx("filling cosmological power spectrum");
CosmoTool::CosmoPower cpower;
double Rsmooth = 0; // 1.6;
h = cpower.h = cosmo_params.h;
cpower.OMEGA_B = cosmo_params.omega_b;
cpower.OMEGA_C = cosmo_params.omega_m - cosmo_params.omega_b;
cpower.SIGMA8 = cosmo_params.sigma8;
cpower.n = cosmo_params.n_s;
ctx.print(
boost::format(
"sigma8 = %g, OmegaB = %g, Omega_C = %g, Omega_M = %g, h = %g") %
cpower.SIGMA8 % cpower.OMEGA_B % cpower.OMEGA_C % cosmo_params.omega_m %
h);
cpower.updateCosmology();
cpower.setFunction(CosmoTool::CosmoPower::HU_WIGGLES);
cpower.normalize();
ArrayType1d::ArrayType &k = *state.get<ArrayType1d>("k_modes")->array;
ArrayType1d::ArrayType &Pk =
*state.get<ArrayType1d>("powerspectrum")->array;
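// k_modes are in h/Mpc while CosmoPower works in Mpc units, hence the k*h argument and the h^3 volume factor; the exponential applies an optional Gaussian smoothing of radius Rsmooth.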
for (long i = 0; i < k.num_elements(); i++) {
Pk[i] = cpower.power(k[i] * h) * h * h * h * adjust *
std::exp(-square(k[i] * Rsmooth));
}
// Notify that the power spectrum is ready.
// state.get<ArrayType1d>("powerspectrum")->deferInit.submit_ready();
}
} // namespace LibLSS
#endif


@ -0,0 +1,228 @@
/*+
ARES/HADES/BORG Package -- ./libLSS/physics/generic_cic.hpp
Copyright (C) 2009-2019 Jens Jasche <jens.jasche@fysik.su.se>
Copyright (C) 2014-2019 Guilhem Lavaux <guilhem.lavaux@iap.fr>
Copyright (C) 2019 Florent Leclercq <florent.leclercq@polytechnique.org>
Additional contributions from:
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
+*/
#ifndef __LIBLSS_GENERIC_CIC_HPP
#define __LIBLSS_GENERIC_CIC_HPP
#include <boost/config.hpp>
namespace LibLSS {
namespace CIC_Tools {
struct NonPeriodic {
NonPeriodic(int, int, int ) {}
template<typename I>
void operator()(I& i, I& j, I& k) const {}
};
struct Periodic {
int N0, N1, N2;
Periodic(int fN0, int fN1, int fN2) :
N0(fN0), N1(fN1), N2(fN2) {}
template<typename I>
void operator()(I& i, I& j, I& k) const {
if (i>=N0) i %= N0;
if (j>=N1) j %= N1;
if (k>=N2) k %= N2;
}
};
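// MPI variant: the first axis is slab-decomposed across ranks and handled through ghost planes, so only the second and third indices are wrapped here.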
struct Periodic_MPI {
size_t N0, N1, N2;
Periodic_MPI(size_t fN0, size_t fN1, size_t fN2, MPI_Communication *comm) :
N0(fN0), N1(fN1), N2(fN2) {}
template<typename I>
void operator()(I& i, I& j, I& k) const {
if (j>=N1) j %= N1;
if (k>=N2) k %= N2;
}
};
struct DefaultWeight {
BOOST_STATIC_CONSTANT(size_t, dimensionality = 1);
double operator[](long) const { return 1; }
};
struct DefaultWeightDim2 {
BOOST_STATIC_CONSTANT(size_t, dimensionality = 2);
auto operator[](long) const { return DefaultWeight(); }
};
}
template<typename T,typename ImplType>
class GenericCIC {
public:
typedef ImplType impl;
template<typename ParticleArray, typename ProjectionDensityArray, typename WeightArray,
typename PeriodicFunction>
static void projection(const ParticleArray& particles, ProjectionDensityArray& density,
T Lx, T Ly, T Lz,
int N0, int N1, int N2,
const PeriodicFunction& p, const WeightArray& weight, size_t Np) {
impl::projection(particles, density, Lx, Ly, Lz, N0, N1, N2, p, weight, Np);
}
template<typename ParticleArray, typename ProjectionDensityArray, typename WeightArray,
typename PeriodicFunction>
static void projection(const ParticleArray& particles, ProjectionDensityArray& density,
T Lx, T Ly, T Lz,
int N0, int N1, int N2,
const PeriodicFunction& p, const WeightArray& weight) {
impl::projection(particles, density, Lx, Ly, Lz, N0, N1, N2, p, weight, particles.shape()[0]);
}
template<typename ParticleArray, typename ProjectionDensityArray, typename PeriodicFunction>
static void projection(const ParticleArray& particles, ProjectionDensityArray& density,
T Lx, T Ly, T Lz,
int N0, int N1, int N2,
const PeriodicFunction& p) {
impl::projection(particles, density, Lx, Ly, Lz, N0, N1, N2, p,
CIC_Tools::DefaultWeight(), particles.shape()[0]);
}
template<typename ParticleArray, typename ProjectionDensityArray>
static void projection(const ParticleArray& particles, ProjectionDensityArray& density,
T Lx, T Ly, T Lz,
int N0, int N1, int N2) {
impl::projection(particles, density, Lx, Ly, Lz, N0, N1, N2, CIC_Tools::Periodic(N0, N1, N2),
CIC_Tools::DefaultWeight(), particles.shape()[0]);
}
template<typename ParticleBasedScalar, typename ParticleArray, typename ProjectionDensityArray,
typename WeightArray, typename PeriodicFunction>
static void interpolation_scalar(ParticleBasedScalar &A, const ParticleArray &particles, const ProjectionDensityArray &field, T Lx,
T Ly, T Lz, int N0, int N1, int N2, const PeriodicFunction &p,
const WeightArray &weight, size_t Np) {
impl::interpolation_scalar(A, particles, field, Lx, Ly, Lz, N0, N1, N2, p, weight, Np);
}
template<typename ParticleBasedScalar, typename ParticleArray, typename ProjectionDensityArray,
typename WeightArray, typename PeriodicFunction>
static void interpolation_scalar(ParticleBasedScalar &A, const ParticleArray &particles, const ProjectionDensityArray &field, T Lx,
T Ly, T Lz, int N0, int N1, int N2, const PeriodicFunction &p,
const WeightArray &weight) {
impl::interpolation_scalar(A, particles, field, Lx, Ly, Lz, N0, N1, N2, p, weight, particles.shape()[0]);
}
template<typename ParticleBasedScalar, typename ParticleArray, typename ProjectionDensityArray,
typename WeightArray, typename PeriodicFunction>
static void interpolation_scalar(ParticleBasedScalar &A, const ParticleArray &particles, const ProjectionDensityArray &field, T Lx,
T Ly, T Lz, int N0, int N1, int N2, const PeriodicFunction &p) {
impl::interpolation_scalar(A, particles, field, Lx, Ly, Lz, N0, N1, N2, p, CIC_Tools::DefaultWeight(), particles.shape()[0]);
}
template<typename ParticleBasedScalar, typename ParticleArray, typename ProjectionDensityArray,
typename WeightArray, typename PeriodicFunction>
static void interpolation_scalar(ParticleBasedScalar &A, const ParticleArray &particles, const ProjectionDensityArray &field, T Lx,
T Ly, T Lz, int N0, int N1, int N2) {
impl::interpolation_scalar(A, particles, field, Lx, Ly, Lz, N0, N1, N2, CIC_Tools::Periodic(N0, N1, N2), CIC_Tools::DefaultWeight(), particles.shape()[0]);
}
template<typename ParticleBasedArray, typename ParticleArray, typename ProjectionDensityArray,
typename WeightArray, typename PeriodicFunction>
static void interpolation(ParticleBasedArray &A, const ParticleArray &particles, const ProjectionDensityArray &field, T Lx,
T Ly, T Lz, int N0, int N1, int N2, const PeriodicFunction &p,
const WeightArray &weight, size_t Np) {
impl::interpolation(A, particles, field, Lx, Ly, Lz, N0, N1, N2, p, weight, Np);
}
template<typename ParticleBasedArray, typename ParticleArray, typename ProjectionDensityArray,
typename WeightArray, typename PeriodicFunction>
static void interpolation(ParticleBasedArray &A, const ParticleArray &particles, const ProjectionDensityArray &field, T Lx,
T Ly, T Lz, int N0, int N1, int N2, const PeriodicFunction &p,
const WeightArray &weight) {
impl::interpolation(A, particles, field, Lx, Ly, Lz, N0, N1, N2, p, weight, particles.shape()[0]);
}
template<typename ParticleBasedArray, typename ParticleArray, typename ProjectionDensityArray,
typename WeightArray, typename PeriodicFunction>
static void interpolation(ParticleBasedArray &A, const ParticleArray &particles, const ProjectionDensityArray &field, T Lx,
T Ly, T Lz, int N0, int N1, int N2, const PeriodicFunction &p) {
impl::interpolation(A, particles, field, Lx, Ly, Lz, N0, N1, N2, p, CIC_Tools::DefaultWeightDim2(), particles.shape()[0]);
}
template<typename ParticleBasedArray, typename ParticleArray, typename ProjectionDensityArray,
typename WeightArray, typename PeriodicFunction>
static void interpolation(ParticleBasedArray &A, const ParticleArray &particles, const ProjectionDensityArray &field, T Lx,
T Ly, T Lz, int N0, int N1, int N2) {
impl::interpolation(A, particles, field, Lx, Ly, Lz, N0, N1, N2, CIC_Tools::Periodic(N0, N1, N2), CIC_Tools::DefaultWeightDim2(), particles.shape()[0]);
}
template<typename ParticleArray, typename GradientArray, typename ProjectionDensityArray, typename PeriodicFunction>
static void adjoint(const ParticleArray& particles, ProjectionDensityArray& density,
GradientArray& adjoint_gradient,
T Lx, T Ly, T Lz,
int N0, int N1, int N2,
const PeriodicFunction& p,
T nmean, size_t Np) {
impl::adjoint(particles, density, adjoint_gradient,CIC_Tools::DefaultWeight(), Lx, Ly, Lz, N0, N1, N2, p, nmean, Np);
}
template<typename ParticleArray, typename GradientArray, typename ProjectionDensityArray, typename PeriodicFunction>
static void adjoint(const ParticleArray& particles, ProjectionDensityArray& density,
GradientArray& adjoint_gradient,
T Lx, T Ly, T Lz,
int N0, int N1, int N2,
const PeriodicFunction& p,
T nmean) {
impl::adjoint(particles, density, adjoint_gradient,CIC_Tools::DefaultWeight(), Lx, Ly, Lz, N0, N1, N2, p, nmean, particles.shape()[0]);
}
template<typename ParticleArray, typename GradientArray, typename ProjectionDensityArray>
static void adjoint(const ParticleArray& particles, ProjectionDensityArray& density,
GradientArray& adjoint_gradient,
T Lx, T Ly, T Lz,
int N0, int N1, int N2,
T nmean) {
impl::adjoint(particles, density, adjoint_gradient,CIC_Tools::DefaultWeight(), Lx, Ly, Lz, N0, N1, N2, CIC_Tools::Periodic(N0, N1, N2), nmean, particles.shape()[0]);
}
template<typename ParticleBasedScalar, typename ParticleArray, typename ProjectionDensityArray,
typename WeightArray, typename PeriodicFunction>
static void adjoint_interpolation_scalar(
int axis, ParticleBasedScalar &A, const ParticleArray &particles, const ProjectionDensityArray &field, T Lx, T Ly, T Lz, int N0, int N1, int N2, const PeriodicFunction &p, const WeightArray &weight, size_t Np) {
impl::adjoint_interpolation_scalar(axis, A, particles, field, Lx, Ly, Lz, N0, N1, N2, p, weight, Np);
}
template<typename ParticleBasedArray, typename ParticleArray, typename ProjectionDensityArray,
typename WeightArray, typename PeriodicFunction>
static void adjoint_interpolation(
int axis, ParticleBasedArray &A, const ParticleArray &particles, const ProjectionDensityArray &field, T Lx,
T Ly, T Lz, int N0, int N1, int N2, const PeriodicFunction &p,
const WeightArray &weight, size_t Np) {
impl::adjoint_interpolation(axis, A, particles, field, Lx, Ly, Lz, N0, N1, N2, p, weight, Np);
}
};
}
#endif
// ARES TAG: authors_num = 3
// ARES TAG: name(0) = Jens Jasche
// ARES TAG: year(0) = 2009-2019
// ARES TAG: email(0) = jens.jasche@fysik.su.se
// ARES TAG: name(1) = Guilhem Lavaux
// ARES TAG: year(1) = 2014-2019
// ARES TAG: email(1) = guilhem.lavaux@iap.fr
// ARES TAG: name(2) = Florent Leclercq
// ARES TAG: year(2) = 2019
// ARES TAG: email(2) = florent.leclercq@polytechnique.org


@ -0,0 +1,454 @@
/*+
ARES/HADES/BORG Package -- ./libLSS/physics/modified_ngp.hpp
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
Additional contributions from:
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
+*/
#ifndef __LIBLSS_PHYSICS_MODIFIED_NGP_HPP
#define __LIBLSS_PHYSICS_MODIFIED_NGP_HPP
#include <cmath>
#include "libLSS/tools/console.hpp"
#include <boost/multi_array.hpp>
#include "libLSS/physics/generic_cic.hpp"
#include "libLSS/tools/mpi_fftw_helper.hpp"
#include "libLSS/tools/compiler_tools.hpp"
namespace LibLSS {
template <typename T, typename SubgridSpec, bool ignore_overflow>
struct ModifiedNGP_impl {
typedef T Type;
// Number of extra planes required in case of MPI
static const int MPI_PLANE_LEAKAGE = 1;
static const bool EXTRA_CHECK = true;
typedef boost::multi_array<T, 3> DensityArray;
//get virtual grid spacing
//for testing we choose subres=1; this should reproduce CIC
//particles will be assumed to be little boxes of size dx*subres
// subres = 1 corresponds to CIC
// subres -> 0 approaches NGP
static constexpr double subres = SubgridSpec::value;
template <typename A>
static inline void _safe_set(
A &&density, size_t const ix, size_t const iy, size_t const iz,
ssize_t const bounds[3][2], T const &value) {
if (ix >= bounds[0][0] && ix < bounds[0][1] && iy >= bounds[1][0] &&
iy < bounds[1][1] && iz >= bounds[2][0] && iz < bounds[2][1]) {
density[ix][iy][iz] += value;
}
}
// This function implements the particle projection to a grid.
// Arguments:
// - particles (2d array: Nx3)
// - density (3d array: N0xN1xN2, or slice thereof)
// - Lx, Ly, Lz: physical size
// - N0, N1, N2: grid size
// - p: a function applying optional periodic boundary enforcement (depends on MPI for ghost plane)
// - weight: per-particle weight functor, maybe returning only "1"
// - Np: number of particles to project
template <
typename ParticleArray, typename ProjectionDensityArray,
typename WeightArray, typename PeriodicFunction>
static void projection(
const ParticleArray &particles, ProjectionDensityArray &density, T Lx,
T Ly, T Lz, int N0, int N1, int N2, const PeriodicFunction &p,
const WeightArray &weight, size_t Np) {
ConsoleContext<LOG_DEBUG> ctx("Modified NGP projection");
T inv_dx = N0 / Lx;
T inv_dy = N1 / Ly;
T inv_dz = N2 / Lz;
ssize_t minX = density.index_bases()[0];
ssize_t minY = density.index_bases()[1];
ssize_t minZ = density.index_bases()[2];
ssize_t maxX = minX + density.shape()[0];
ssize_t maxY = minY + density.shape()[1];
ssize_t maxZ = minZ + density.shape()[2];
ssize_t const bounds[3][2] = {{minX, maxX}, {minY, maxY}, {minZ, maxZ}};
ctx.format("minX=%d, maxX=%d, N0=%d", minX, maxX, N0);
ctx.format("minY=%d, maxY=%d, N1=%d", minY, maxY, N1);
ctx.format("minZ=%d, maxZ=%d, N2=%d", minZ, maxZ, N2);
for (long i = 0; i < Np; i++) {
//divide particle positions by target grid-size
//Note: all integer numbers are therefore defined at target resolution
T x = particles[i][0] * inv_dx;
T y = particles[i][1] * inv_dy;
T z = particles[i][2] * inv_dz;
//Note, we want to find the nearest lower left corner of a voxel that fully contains
//the box-shaped particle.
//We therefore have to find the nearest voxel for the lower left corner of the particle box.
size_t ix = (size_t)std::floor(
x +
0.5 * (1. - subres)); //the offset of half a subresolution factor
size_t iy = (size_t)std::floor(
y +
0.5 *
(1. -
subres)); //ensures the edges of the particle cloud are within
size_t iz = (size_t)std::floor(
z + 0.5 * (1. - subres)); //the lower voxel boundaries
//Note, it can be easily seen that for subres=1 the CIC scheme is recovered.
//now calculate distances before wrap-around
//if particle is fully contained in voxel assign the total mass
T rx = 0.;
T qx = 1.;
T ry = 0.;
T qy = 1.;
T rz = 0.;
T qz = 1.;
// clang-format off
DISABLE_WARN_DIV_BY_ZERO;
// clang-format on
//if fraction of particle is contained in the next cell assign a fraction of mass
double dd = x - ix - 0.5 * (1 - subres);
if (dd > 0. && subres > 0) {
rx = dd / subres;
qx = 1. - rx;
}
dd = y - iy - 0.5 * (1 - subres);
if (dd > 0. && subres > 0) {
ry = dd / subres;
qy = 1. - ry;
}
dd = z - iz - 0.5 * (1 - subres);
if (dd > 0. && subres > 0) {
rz = dd / subres;
qz = 1. - rz;
}
// clang-format off
ENABLE_WARN_DIV_BY_ZERO;
// clang-format on
//we need to check for periodicity
p(ix, iy, iz);
//if the particle is fully contained within a voxel
//then we can attribute its entire mass to this bin.
//otherwise a fraction of mass will be assigned to
//the next bin.
//find next cells
size_t jx = (ix + 1);
size_t jy = (iy + 1);
size_t jz = (iz + 1);
//check for periodicity
p(jx, jy, jz);
double w = weight[i];
if (!ignore_overflow) {
if (EXTRA_CHECK && jx >= maxX) {
Console::instance().print<LOG_ERROR>(
boost::format("Overflow at ix=%d, jx=%d (maxX=%d)") % ix % jx %
maxX);
}
if (EXTRA_CHECK && ix < minX) {
Console::instance().print<LOG_ERROR>(
boost::format("Underflow at ix=%d, jx=%d") % ix % jx);
}
if (EXTRA_CHECK && ix >= maxX) {
Console::instance().print<LOG_ERROR>(
boost::format("Overflow at ix=%d, jx=%d with x=%g") % ix % jx %
x);
}
if (EXTRA_CHECK && jy >= maxY) {
Console::instance().print<LOG_ERROR>(
boost::format("Overflow at iy=%d, jy=%d (maxY=%d)") % iy % jy %
maxY);
}
if (EXTRA_CHECK && iy < minY) {
Console::instance().print<LOG_ERROR>(
boost::format("Underflow at iy=%d, jy=%d") % iy % jy);
}
density[ix][iy][iz] += (qx) * (qy) * (qz)*w;
density[ix][iy][jz] += (qx) * (qy) * (rz)*w;
density[ix][jy][iz] += (qx) * (ry) * (qz)*w;
density[ix][jy][jz] += (qx) * (ry) * (rz)*w;
density[jx][iy][iz] += (rx) * (qy) * (qz)*w;
density[jx][iy][jz] += (rx) * (qy) * (rz)*w;
density[jx][jy][iz] += (rx) * (ry) * (qz)*w;
density[jx][jy][jz] += (rx) * (ry) * (rz)*w;
} else {
_safe_set(density, ix, iy, iz, bounds, qx * qy * qz * w);
_safe_set(density, ix, iy, jz, bounds, qx * qy * rz * w);
_safe_set(density, ix, jy, iz, bounds, qx * ry * qz * w);
_safe_set(density, ix, jy, jz, bounds, qx * ry * rz * w);
_safe_set(density, jx, iy, iz, bounds, rx * qy * qz * w);
_safe_set(density, jx, iy, jz, bounds, rx * qy * rz * w);
_safe_set(density, jx, jy, iz, bounds, rx * ry * qz * w);
_safe_set(density, jx, jy, jz, bounds, rx * ry * rz * w);
}
}
}
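// Accumulates one component of the adjoint gradient: the derivative of the assignment weights with respect to the particle position along `axis`, contracted with the adjoint field passed as `density` and scaled by global_w.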
template <typename GradientArray, typename ProjectionDensityArray>
static inline void __do_gradient(
GradientArray &adj_gradient, const ProjectionDensityArray &density,
size_t i, int axis, int ix, int iy, int iz, int jx, int jy, int jz,
T rx, T ry, T rz, T qx, T qy, T qz, T global_w) {
switch (axis) {
case 0:
//Note the derivative of the Heaviside function is zero
if (rx > 0. && subres > 0) {
rx = 1. / subres;
qx = -1. / subres;
} else {
rx = 0;
qx = 0;
}
break;
case 1:
//Note the derivative of the Heaviside function is zero
if (ry > 0. && subres > 0) {
ry = 1. / subres;
qy = -1. / subres;
} else {
ry = 0;
qy = 0;
}
break;
case 2:
//Note the derivative of the Heaviside function is zero
if (rz > 0. && subres > 0) {
rz = 1. / subres;
qz = -1. / subres;
} else {
rz = 0;
qz = 0;
}
break;
}
double w = density[ix][iy][iz] * qx * qy * qz +
density[ix][iy][jz] * qx * qy * rz +
density[ix][jy][iz] * qx * ry * qz +
density[ix][jy][jz] * qx * ry * rz +
density[jx][iy][iz] * rx * qy * qz +
density[jx][iy][jz] * rx * qy * rz +
density[jx][jy][iz] * rx * ry * qz +
density[jx][jy][jz] * rx * ry * rz;
adj_gradient[i][axis] += w * global_w;
}
template <
typename ParticleArray, typename GradientArray,
typename ProjectionDensityArray, typename PeriodicFunction,
typename WeightArray>
static void adjoint(
const ParticleArray &particles, ProjectionDensityArray &density,
GradientArray &adjoint_gradient, const WeightArray &weight, T Lx, T Ly,
T Lz, int N0, int N1, int N2, const PeriodicFunction &p, T nmean,
size_t Np) {
ConsoleContext<LOG_DEBUG> ctx("Modified NGP adjoint-projection");
T inv_dx = N0 / Lx;
T inv_dy = N1 / Ly;
T inv_dz = N2 / Lz;
T inv_nmean = T(1) / nmean;
ssize_t minX = density.index_bases()[0];
ssize_t minY = density.index_bases()[1];
ssize_t minZ = density.index_bases()[2];
ssize_t maxX = minX + density.shape()[0];
ssize_t maxY = minY + density.shape()[1];
ssize_t maxZ = minZ + density.shape()[2];
ctx.print(
boost::format(
"Number of particles = %d (array is %d), minX=%d maxX=%d") %
Np % particles.shape()[0] % minX % maxX);
ctx.print(
boost::format("Adjoint gradient = %d") % adjoint_gradient.shape()[0]);
#pragma omp parallel for schedule(static)
for (size_t i = 0; i < Np; i++) {
T x = particles[i][0] * inv_dx;
T y = particles[i][1] * inv_dy;
T z = particles[i][2] * inv_dz;
ssize_t ix = (ssize_t)std::floor(x + 0.5 * (1. - subres));
ssize_t iy = (ssize_t)std::floor(y + 0.5 * (1. - subres));
ssize_t iz = (ssize_t)std::floor(z + 0.5 * (1. - subres));
T rx = 0.;
T qx = 1.;
T ry = 0.;
T qy = 1.;
T rz = 0.;
T qz = 1.;
double dd = x - ix - 0.5 * (1 - subres);
if (dd > 0. && subres > 0) {
rx = dd / subres;
qx = 1. - rx;
}
dd = y - iy - 0.5 * (1 - subres);
if (dd > 0. && subres > 0) {
ry = dd / subres;
qy = 1. - ry;
}
dd = z - iz - 0.5 * (1 - subres);
if (dd > 0. && subres > 0) {
rz = dd / subres;
qz = 1. - rz;
}
p(ix, iy, iz);
size_t jx = (ix + 1);
size_t jy = (iy + 1);
size_t jz = (iz + 1);
p(jx, jy, jz);
if (ignore_overflow) {
error_helper<ErrorBadState>("Overflow cannot be ignored in adjoint.");
}
if (EXTRA_CHECK && jx >= maxX) {
Console::instance().print<LOG_ERROR>(
boost::format("Overflow at ix=%d, jx=%d (maxX adj = %d)") % ix %
jx % maxX);
}
if (EXTRA_CHECK && ix < minX) {
Console::instance().print<LOG_ERROR>(
boost::format("Underflow at ix=%d, jx=%d (adj)") % ix % jx);
}
if (EXTRA_CHECK && jy >= maxY) {
Console::instance().print<LOG_ERROR>(
boost::format("Overflow at iy=%d, jy=%d (maxY=%d) adj") % iy %
jy % maxY);
}
if (EXTRA_CHECK && iy < minY) {
Console::instance().print<LOG_ERROR>(
boost::format("Underflow at iy=%d, jy=%d adj") % iy % jy);
}
if (EXTRA_CHECK && jz >= maxZ) {
Console::instance().print<LOG_ERROR>(
boost::format("Overflow at iz=%d, jz=%d (maxZ=%d) adj") % iz %
jz % maxZ);
}
if (EXTRA_CHECK && iz < minZ) {
Console::instance().print<LOG_ERROR>(
boost::format("Underflow at iz=%d, jz=%d adj") % iz % jz);
}
__do_gradient(
adjoint_gradient, density, i, 0, ix, iy, iz, jx, jy, jz, rx, ry, rz,
qx, qy, qz, inv_nmean * inv_dx);
__do_gradient(
adjoint_gradient, density, i, 1, ix, iy, iz, jx, jy, jz, rx, ry, rz,
qx, qy, qz, inv_nmean * inv_dy);
__do_gradient(
adjoint_gradient, density, i, 2, ix, iy, iz, jx, jy, jz, rx, ry, rz,
qx, qy, qz, inv_nmean * inv_dz);
}
}
};
namespace NGPGrid {
struct NGP {
static constexpr double value = 0.0;
};
struct CIC {
static constexpr double value = 1.0;
};
struct Double {
static constexpr double value = 0.5;
};
struct Quad {
static constexpr double value = 0.25;
};
} // namespace NGPGrid
// This implements the ModifiedNGP kernel. By default it behaves like CIC, at an additional cost.
// It relies on GenericCIC to provide the auxiliary overloads missing from the base implementation
// given in ModifiedNGP_impl.
template <
typename T, typename SubgridSpec = NGPGrid::CIC,
bool ignore_overflow = false>
class ModifiedNGP
: public GenericCIC<
T, ModifiedNGP_impl<T, SubgridSpec, ignore_overflow>> {
public:
typedef ModifiedNGP_impl<T, SubgridSpec, ignore_overflow> Base;
typedef T Type;
// Number of extra ghost planes required in case of MPI. Only post planes are
// supported.
// In practice only ONE plane is supported at the moment.
static const int MPI_PLANE_LEAKAGE = 1;
static const int MPI_NEGATIVE_PLANE_LEAKAGE = 0;
typedef CIC_Tools::Periodic_MPI Periodic_MPI;
// This defines the policy of load balancing distribution for MNGP.
// This class translates the requirements of slab decomposition by FFTW to particle
// positions. As we are still using the ghost plane mechanism to adjust for
// edge effects this decision class is required to be able to do correct parallel
// projection.
// Its task is quite minimal as most of the complexity is in "get_peer" and
// load balancing in samplers/borg/pm/particle_distribution.hpp
struct Distribution {
typedef long LongElt;
typedef LibLSS::FFTW_Manager_3d<T> Manager;
std::shared_ptr<Manager> &force_mgr;
size_t f_N0;
size_t f_startN0;
size_t f_localN0;
double L0;
Distribution(
std::shared_ptr<Manager> &mgr, double L0, double = 0, double = 0)
: force_mgr(mgr), f_N0(mgr->N0), f_startN0(mgr->startN0),
f_localN0(mgr->localN0) {
this->L0 = L0;
Console::instance().print<LOG_DEBUG>(
boost::format(
"Initialize particle distribution decider: N0 = %d, L0 = %g") %
f_N0 % L0);
}
template <typename Position, typename... U>
inline LongElt operator()(Position &&pos, U &&...) {
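// Map the x coordinate to a slab index using the same half-subres shift as the
// projection above, then ask the FFTW manager which MPI rank owns that slab.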
T x = pos[0] * f_N0 / L0;
LongElt i0 = LongElt(std::floor(x + 0.5 * (1. - Base::subres))) % f_N0;
LongElt peer = force_mgr->get_peer(i0);
//Console::instance().print<LOG_DEBUG>(boost::format("Pos %g, peer = %d") % x % peer);
return peer;
}
};
};
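// Illustrative instantiations (a hedged sketch, not part of the library API; the
// aliases are hypothetical and simply restate the SubgridSpec values defined above):
//
//   using ProjectNGP = ModifiedNGP<double, NGPGrid::NGP>;  // subres = 0    -> pure nearest-grid-point
//   using ProjectCIC = ModifiedNGP<double, NGPGrid::CIC>;  // subres = 1    -> recovers cloud-in-cell
//   using ProjectQ   = ModifiedNGP<double, NGPGrid::Quad>; // subres = 0.25 -> quarter-cell particle box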
} // namespace LibLSS
#endif

View file

@ -0,0 +1,366 @@
/*+
ARES/HADES/BORG Package -- ./libLSS/physics/modified_ngp_smooth.hpp
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
Additional contributions from:
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
+*/
#ifndef __LIBLSS_PHYSICS_SMOOTH_MODIFIED_NGP_HPP
#define __LIBLSS_PHYSICS_SMOOTH_MODIFIED_NGP_HPP
#include <cmath>
#include <CosmoTool/algo.hpp>
#include "libLSS/tools/console.hpp"
#include "libLSS/physics/generic_cic.hpp"
#include "libLSS/tools/mpi_fftw_helper.hpp"
namespace LibLSS {
template<typename T, typename SubgridSpec>
struct SmoothModifiedNGP_impl {
typedef T Type;
// Number of extra planes required in case of MPI
static const int MPI_PLANE_LEAKAGE = 1;
static const bool EXTRA_CHECK = true;
//get the virtual grid spacing
//for testing we choose subres = 1; this should reproduce CIC
//particles are assumed to be little boxes of size dx*subres
// subres = 1 corresponds to CIC
// subres -> 0 approaches NGP
static constexpr double subres=SubgridSpec::value;
static constexpr double C0 = (1. - subres)/6.;
static inline T kernel(T delta) {
double const a = subres > 0 ? delta/subres : 0;
if (a < 0.5 && a > -0.5) {
return 0.5 + (a - CosmoTool::cube(2*a) * C0);
} else if (a > 0.5) {
return 1 - 8 * C0 * CosmoTool::cube((0.5 - delta)/(1-subres));
} else {
return 8 * C0 * CosmoTool::cube((0.5 + delta)/(1-subres));
}
}
static inline T adjoint(T delta) {
double const a = subres > 0 ? delta/subres : 0;
if (a < 0.5 && a > -0.5) {
return (1 - (6*C0)*CosmoTool::square(2*a))/subres;
} else if (a > 0.5) {
return (24 * C0/(1-subres)) * CosmoTool::square((0.5 - delta)/(1-subres));
} else {
return (24 * C0/(1-subres)) * CosmoTool::square((0.5 + delta)/(1-subres));
}
}
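// Restating the code above in formula form, with a = delta/subres and
// C0 = (1 - subres)/6:
//
//   kernel(delta) = 1/2 + a - C0 (2a)^3                      for |a| < 1/2
//   kernel(delta) = 1 - 8 C0 ((1/2 - delta)/(1 - subres))^3  for a > 1/2
//   kernel(delta) = 8 C0 ((1/2 + delta)/(1 - subres))^3      otherwise
//
// adjoint(delta) is the derivative of kernel with respect to delta; the pieces
// match in value and slope at |a| = 1/2, so the assignment function is C^1.
// For subres = 1 (C0 = 0) kernel(delta) reduces to 1/2 + delta, i.e. the usual
// CIC weight.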
// This function implements the particle projection to a grid.
// Arguments:
// - particles (2d array: Nx3)
// - density (3d array: N0xN1xN2, or slice thereof)
// - Lx, Ly, Lz: physical size
// - N0, N1, N2: grid size
// - p: a functor applying periodic boundary enforcement where needed (depends on MPI for the ghost plane)
// - weight: per-particle weight functor (may simply return 1)
// - Np: number of particles to project
template<typename ParticleArray, typename ProjectionDensityArray, typename WeightArray,
typename PeriodicFunction >
static void projection(const ParticleArray& particles, ProjectionDensityArray& density,
T Lx, T Ly, T Lz,
int N0, int N1, int N2, const PeriodicFunction& p, const WeightArray& weight, size_t Np) {
ConsoleContext<LOG_DEBUG> ctx("Modified NGP projection");
T inv_dx = N0/Lx;
T inv_dy = N1/Ly;
T inv_dz = N2/Lz;
int minX = density.index_bases()[0];
int minY = density.index_bases()[1];
int minZ = density.index_bases()[2];
int maxX = density.index_bases()[0] + density.shape()[0];
int maxY = density.index_bases()[1] + density.shape()[1];
int maxZ = density.index_bases()[2] + density.shape()[2];
ctx.print(boost::format("minX=%d, maxX=%d, N0=%d") % minX % maxX % N0);
for (long i = 0; i < Np; i++) {
//divide particle positions by target grid-size
//Note: all integer numbers are therefore defined at target resolution
T x = particles[i][0]*inv_dx;
T y = particles[i][1]*inv_dy;
T z = particles[i][2]*inv_dz;
//Note: we want to find the nearest lower-left corner of a voxel that fully contains
//the box-shaped particle. We therefore have to find the nearest voxel for the
//lower-left corner of the particle box.
size_t ix = (size_t)std::floor(x); //index of the voxel whose lower-left corner
size_t iy = (size_t)std::floor(y); //lies just below the particle; the kernel is
size_t iz = (size_t)std::floor(z); //evaluated at the offset from that voxel's centre
T rx, qx;
T ry, qy;
T rz, qz;
// delta is taken with respect to the voxel centre:
// dx = ix + 0.5 - x, which lies in (-0.5, 0.5] since ix = floor(x)
qx = kernel((double(ix)-x) + 0.5);
rx = 1-qx;
qy = kernel((double(iy)-y) + 0.5);
ry = 1-qy;
qz = kernel((double(iz)-z) + 0.5);
rz = 1-qz;
//we need to check for periodicity
p(ix, iy, iz);
//if the particle is fully contained within a voxel
//then we can attribute its entire mass to this bin.
//otherwise a fraction of mass will be assigned to
//the next bin.
//find next cells
size_t jx = (ix+1);
size_t jy = (iy+1);
size_t jz = (iz+1);
//check for periodicity
p(jx, jy, jz);
double w = weight[i];
if (EXTRA_CHECK && jx >= maxX) {
Console::instance().print<LOG_ERROR>(boost::format("Overflow at ix=%d, jx=%d (maxX=%d)") % ix % jx % maxX);
}
if (EXTRA_CHECK && ix < minX) {
Console::instance().print<LOG_ERROR>(boost::format("Underflow at ix=%d, jx=%d") % ix % jx);
}
if (EXTRA_CHECK && ix >= maxX) {
Console::instance().print<LOG_ERROR>(boost::format("Overflow at ix=%d, jx=%d with x=%g") % ix % jx % x);
}
if (EXTRA_CHECK && jy >= maxY) {
Console::instance().print<LOG_ERROR>(boost::format("Overflow at iy=%d, jy=%d (maxY=%d)") % iy % jy % maxY);
}
if (EXTRA_CHECK && iy < minY) {
Console::instance().print<LOG_ERROR>(boost::format("Underflow at iy=%d, jy=%d") % iy % jy);
}
density[ix][iy][iz] += ( qx)*( qy)*( qz)*w;
density[ix][iy][jz] += ( qx)*( qy)*( rz)*w;
density[ix][jy][iz] += ( qx)*( ry)*( qz)*w;
density[ix][jy][jz] += ( qx)*( ry)*( rz)*w;
density[jx][iy][iz] += ( rx)*( qy)*( qz)*w;
density[jx][iy][jz] += ( rx)*( qy)*( rz)*w;
density[jx][jy][iz] += ( rx)*( ry)*( qz)*w;
density[jx][jy][jz] += ( rx)*( ry)*( rz)*w;
}
}
template<typename GradientArray, typename ProjectionDensityArray>
static inline void __do_gradient(GradientArray& adj_gradient,
const ProjectionDensityArray& density,
size_t i,
int axis,
int ix, int iy, int iz,
int jx, int jy, int jz,
T dx, T dy, T dz,
T rx, T ry, T rz, T qx, T qy, T qz, T global_w)
{
switch (axis) {
case 0:
qx = -adjoint(dx);
rx= -qx;
break;
case 1:
qy = -adjoint(dy);
ry= -qy;
break;
case 2:
qz = -adjoint(dz);
rz= -qz;
break;
}
double w =
density[ix][iy][iz] * qx * qy * qz +
density[ix][iy][jz] * qx * qy * rz +
density[ix][jy][iz] * qx * ry * qz +
density[ix][jy][jz] * qx * ry * rz +
density[jx][iy][iz] * rx * qy * qz +
density[jx][iy][jz] * rx * qy * rz +
density[jx][jy][iz] * rx * ry * qz +
density[jx][jy][jz] * rx * ry * rz;
adj_gradient[i][axis] += w*global_w;
}
template<typename ParticleArray, typename GradientArray, typename ProjectionDensityArray, typename PeriodicFunction, typename WeightArray>
static void adjoint(const ParticleArray& particles, ProjectionDensityArray& density,
GradientArray& adjoint_gradient, const WeightArray& weight,
T Lx, T Ly, T Lz,
int N0, int N1, int N2,
const PeriodicFunction& p,
T nmean, size_t Np) {
ConsoleContext<LOG_DEBUG> ctx("Modified NGP adjoint-projection");
T inv_dx = N0/Lx;
T inv_dy = N1/Ly;
T inv_dz = N2/Lz;
T inv_nmean = T(1)/nmean;
int minX = density.index_bases()[0];
int minY = density.index_bases()[1];
int minZ = density.index_bases()[2];
int maxX = minX + density.shape()[0];
int maxY = minY + density.shape()[1];
int maxZ = minZ + density.shape()[2];
ctx.print(boost::format("Number of particles = %d (array is %d), minX=%d maxX=%d") % Np %particles.shape()[0] % minX % maxX);
ctx.print(boost::format("Adjoint gradient = %d") % adjoint_gradient.shape()[0]);
#pragma omp parallel for schedule(static)
for (size_t i = 0; i < Np; i++) {
T x = particles[i][0]*inv_dx;
T y = particles[i][1]*inv_dy;
T z = particles[i][2]*inv_dz;
size_t ix = (size_t)std::floor(x);
size_t iy = (size_t)std::floor(y);
size_t iz = (size_t)std::floor(z);
T rx, qx;
T ry, qy;
T rz, qz;
T dx = (double(ix)-x)+0.5;
T dy = (double(iy)-y)+0.5;
T dz = (double(iz)-z)+0.5;
qx = kernel(dx);
rx = 1-qx;
qy = kernel(dy);
ry = 1-qy;
qz = kernel(dz);
rz = 1-qz;
p(ix, iy, iz);
size_t jx = (ix+1);
size_t jy = (iy+1);
size_t jz = (iz+1);
p(jx, jy, jz);
if (EXTRA_CHECK && jx >= maxX) {
Console::instance().print<LOG_ERROR>(boost::format("Overflow at ix=%d, jx=%d (maxX adj = %d)") % ix % jx % maxX);
}
if (EXTRA_CHECK && ix < minX) {
Console::instance().print<LOG_ERROR>(boost::format("Underflow at ix=%d, jx=%d (adj)") % ix % jx);
}
if (EXTRA_CHECK && jy >= maxY) {
Console::instance().print<LOG_ERROR>(boost::format("Overflow at iy=%d, jy=%d (maxY=%d) adj") % iy % jy % maxY);
}
if (EXTRA_CHECK && iy < minY) {
Console::instance().print<LOG_ERROR>(boost::format("Underflow at iy=%d, jy=%d adj") % iy % jy);
}
if (EXTRA_CHECK && jz >= maxZ) {
Console::instance().print<LOG_ERROR>(boost::format("Overflow at iz=%d, jz=%d (maxZ=%d) adj") % iz % jz % maxZ);
}
if (EXTRA_CHECK && iz < minZ) {
Console::instance().print<LOG_ERROR>(boost::format("Underflow at iz=%d, jz=%d adj") % iz % jz);
}
__do_gradient(adjoint_gradient, density, i, 0, ix, iy, iz, jx, jy, jz, dx, dy, dz, rx, ry, rz, qx, qy, qz, inv_nmean*inv_dx);
__do_gradient(adjoint_gradient, density, i, 1, ix, iy, iz, jx, jy, jz, dx, dy, dz, rx, ry, rz, qx, qy, qz, inv_nmean*inv_dy);
__do_gradient(adjoint_gradient, density, i, 2, ix, iy, iz, jx, jy, jz, dx, dy, dz, rx, ry, rz, qx, qy, qz, inv_nmean*inv_dz);
}
}
};
namespace SmoothNGPGrid {
struct CIC { static constexpr double value = 1; };
struct Double { static constexpr double value = 0.5; };
struct Quad { static constexpr double value = 0.3; };
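// Note: Quad here uses 0.3, whereas NGPGrid::Quad in modified_ngp.hpp uses 0.25.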
} // namespace SmoothNGPGrid
// This implements the smoothed ModifiedNGP kernel. By default it behaves like CIC, at some
// additional cost. It relies on GenericCIC to provide the auxiliary functions that are
// missing from the base functions given in SmoothModifiedNGP_impl.
template<typename T,typename SubgridSpec = SmoothNGPGrid::CIC>
class SmoothModifiedNGP: public GenericCIC<T, SmoothModifiedNGP_impl<T,SubgridSpec> > {
public:
typedef SmoothModifiedNGP_impl<T,SubgridSpec> Base;
typedef T Type;
// Number of extra ghost planes required in case of MPI. Only post planes are
// supported.
// In practice only ONE plane is supported at the moment.
static const int MPI_PLANE_LEAKAGE = 1;
static const int MPI_NEGATIVE_PLANE_LEAKAGE = 1;
struct Periodic_MPI
{
bool start;
size_t N0, N1, N2;
Periodic_MPI(size_t _N0, size_t _N1, size_t _N2, MPI_Communication *comm)
: N0(_N0), N1(_N1), N2(_N2) {
start = comm->rank() == 0;
}
void operator()(size_t& i, size_t& j, size_t& k) const {
if (start)
if (i >= N0) i %= N0;
if (j >= N1) j %= N1;
if (k >= N2) k %= N2;
}
};
// This defines the policy of load balancing distribution for MNGP.
// This class translates the slab decomposition required by FFTW into particle
// positions. As we are still using the ghost-plane mechanism to adjust for
// edge effects, this decision class is required for correct parallel
// projection.
// Its task is quite minimal as most of the complexity is in "get_peer" and
// load balancing in samplers/borg/pm/particle_distribution.hpp
struct Distribution {
typedef long LongElt;
typedef LibLSS::FFTW_Manager_3d<T> Manager;
std::shared_ptr<Manager>& force_mgr;
size_t f_N0;
size_t f_startN0;
size_t f_localN0;
double L0;
Distribution(std::shared_ptr<Manager>& mgr, double L0, double = 0, double = 0)
: force_mgr(mgr), f_N0(mgr->N0), f_startN0(mgr->startN0),
f_localN0(mgr->localN0) {
this->L0 = L0;
Console::instance().print<LOG_DEBUG>(boost::format("Initialize particle distribution decisioner: N0 = %d, L0 = %g") % f_N0 % L0);
}
template<typename Position, typename... U>
inline LongElt operator()(Position&& pos, U&&...) {
T x = pos[0]*f_N0/L0;
LongElt i0 = LongElt(std::floor(x)) % f_N0;
LongElt peer = force_mgr->get_peer(i0);
//Console::instance().print<LOG_DEBUG>(boost::format("Pos %g, peer = %d") % x % peer);
return peer;
}
};
};
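// Usage sketch (illustrative only; `parts`, `rho`, `weights`, `comm`, `L` and `N`
// are hypothetical names, and production code reaches this projector through the
// particle-mesh machinery rather than calling it directly):
//
//   typedef SmoothModifiedNGP<double, SmoothNGPGrid::Double> Projector;
//   Projector::Base::projection(parts, rho, L, L, L, N, N, N,
//                               Projector::Periodic_MPI(N, N, N, comm),
//                               weights, Np);
//
// With SmoothNGPGrid::CIC the kernel reduces to the standard cloud-in-cell
// weights; Double and Quad shrink the effective particle box.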
} // namespace LibLSS
#endif

View file

@ -0,0 +1,328 @@
/*+
ARES/HADES/BORG Package -- ./libLSS/physics/openmp_cic.hpp
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
Additional contributions from:
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
+*/
#ifndef __LIBLSS_PHYSICS_OPENMP_CIC_HPP
#define __LIBLSS_PHYSICS_OPENMP_CIC_HPP
#include <cmath>
#include "libLSS/tools/console.hpp"
#include <boost/multi_array.hpp>
#include <CosmoTool/omptl/omptl>
#include <CosmoTool/omptl/omptl_algorithm>
#include <iostream>
#include "libLSS/tools/array_tools.hpp"
#include "libLSS/physics/generic_cic.hpp"
namespace LibLSS {
template <typename T>
struct OpenMPCloudInCell_impl {
typedef T Type;
// Number of extra planes required in case of MPI
static const int MPI_PLANE_LEAKAGE = 1;
typedef boost::multi_array<int, 1> ListArray;
typedef boost::multi_array<int, 1> AtomicListArray;
template <
typename ParticleArray, typename ProjectionDensityArray,
typename WeightArray, typename PeriodicFunction>
static void projection(
const ParticleArray &particles, ProjectionDensityArray &density, T Lx,
T Ly, T Lz, size_t N0, size_t N1, size_t N2, const PeriodicFunction &p,
const WeightArray &weight, size_t Np) {
using boost::extents;
ConsoleContext<LOG_DEBUG> ctx("OpenMP CIC projection");
T inv_dx = N0 / Lx;
T inv_dy = N1 / Ly;
T inv_dz = N2 / Lz;
typedef UninitializedArray<AtomicListArray> U_AtomicListArray;
typedef UninitializedArray<ListArray> U_ListArray;
U_AtomicListArray part_mesh_p(extents[long(N0) * long(N1) * long(N2)]);
U_ListArray part_list_p(extents[Np]);
U_AtomicListArray::array_type &part_mesh = part_mesh_p.get_array();
U_ListArray::array_type &part_list = part_list_p.get_array();
long Nmesh = part_mesh.num_elements();
{
ConsoleContext<LOG_DEBUG> ctx0("initialize arrays");
array::fill(part_mesh, -1);
array::fill(part_list, -1);
}
{
ConsoleContext<LOG_DEBUG> ctx0("build mesh list");
// First build part -> mesh list
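// Each particle is pushed onto the head of a per-cell singly linked list:
// part_mesh[idx] holds the index of the most recently inserted particle for
// cell idx (-1 if empty) and part_list[i_part] holds the next particle in that
// cell's chain. The atomic exchange makes the head insertion safe when several
// threads hit the same cell concurrently.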
#pragma omp parallel for schedule(static)
for (size_t i_part = 0; i_part < Np; i_part++) {
T x = particles[i_part][0] * inv_dx;
T y = particles[i_part][1] * inv_dy;
T z = particles[i_part][2] * inv_dz;
size_t ix = (size_t)std::floor(x);
size_t iy = (size_t)std::floor(y);
size_t iz = (size_t)std::floor(z);
size_t idx = iz + N2 * iy + N2 * N1 * ix;
int initial_elt =
__atomic_exchange_n(&part_mesh[idx], i_part, __ATOMIC_RELAXED);
if (initial_elt != -1) {
part_list[i_part] = initial_elt;
}
}
}
{
ConsoleContext<LOG_DEBUG> ctx0("reverse list");
// The per-cell chains were built by head insertion and are therefore reversed; flip them back as fast as we can
#pragma omp parallel for schedule(dynamic, 10000)
for (size_t mid = 0; mid < Nmesh; mid++) {
int current_part = part_mesh[mid];
if (current_part >= 0) {
int next_part = part_list[current_part];
part_list[current_part] = -1;
while (next_part != -1) {
int p = part_list[next_part];
part_list[next_part] = current_part;
current_part = next_part;
next_part = p;
}
part_mesh[mid] = current_part;
}
}
}
{
ConsoleContext<LOG_DEBUG> ctx0("projection");
#pragma omp parallel
{
for (int looper0 = 0; looper0 < 2; looper0++) {
for (int looper1 = 0; looper1 < 2; looper1++) {
for (int looper2 = 0; looper2 < 2; looper2++) {
int r[3] = {looper0, looper1, looper2};
#pragma omp barrier
#pragma omp for schedule(dynamic, 10000)
for (long mid = 0; mid < Nmesh; mid++) {
int mz = mid % N2;
int my = (mid / N2) % N1;
int mx = (mid / (N2 * N1));
int i_part = part_mesh[mid];
T w = 0;
while (i_part != -1) {
T w0 = 1;
T x = particles[i_part][0] * inv_dx;
T y = particles[i_part][1] * inv_dy;
T z = particles[i_part][2] * inv_dz;
T qx = std::floor(x);
T qy = std::floor(y);
T qz = std::floor(z);
T dx = x - qx;
T dy = y - qy;
T dz = z - qz;
w0 = (r[0] == 1) ? dx : (T(1) - dx);
w0 *= (r[1] == 1) ? dy : (T(1) - dy);
w0 *= (r[2] == 1) ? dz : (T(1) - dz);
w += w0 * weight[i_part];
i_part = part_list[i_part];
}
size_t tx = (mx + looper0);
size_t ty = (my + looper1);
size_t tz = (mz + looper2);
p(tx, ty, tz);
density[tx][ty][tz] += w;
}
}
}
}
}
#pragma omp barrier
}
}
template <
typename GradientArray, typename ProjectionDensityArray,
typename WeightArray>
static inline
typename std::enable_if<WeightArray::dimensionality == 1>::type
__do_gradient(
GradientArray &adj_gradient, const ProjectionDensityArray &density,
WeightArray const &a_w, size_t i, int axis, size_t ix, size_t iy,
size_t iz, size_t jx, size_t jy, size_t jz, T x, T y, T z,
T global_w) {
T rx, ry, rz;
T qx, qy, qz;
switch (axis) {
case 0:
rx = 1;
qx = -1;
ry = y - iy;
qy = 1 - ry;
rz = z - iz;
qz = 1 - rz;
break;
case 1:
rx = x - ix;
qx = 1 - rx;
ry = 1;
qy = -1;
rz = z - iz;
qz = 1 - rz;
break;
case 2:
rx = x - ix;
qx = 1 - rx;
ry = y - iy;
qy = 1 - ry;
rz = 1;
qz = -1;
break;
}
double w = density[ix][iy][iz] * qx * qy * qz +
density[ix][iy][jz] * qx * qy * rz +
density[ix][jy][iz] * qx * ry * qz +
density[ix][jy][jz] * qx * ry * rz +
density[jx][iy][iz] * rx * qy * qz +
density[jx][iy][jz] * rx * qy * rz +
density[jx][jy][iz] * rx * ry * qz +
density[jx][jy][jz] * rx * ry * rz;
adj_gradient[i][axis] = a_w[axis] * w * global_w;
}
template <typename GradientArray, typename ProjectionDensityArray>
static inline void __do_gradient(
GradientArray &adj_gradient, const ProjectionDensityArray &density,
T a_w, size_t i, int axis, size_t ix, size_t iy, size_t iz, size_t jx,
size_t jy, size_t jz, T x, T y, T z, T global_w) {
T rx, ry, rz;
T qx, qy, qz;
switch (axis) {
case 0:
rx = 1;
qx = -1;
ry = y - iy;
qy = 1 - ry;
rz = z - iz;
qz = 1 - rz;
break;
case 1:
rx = x - ix;
qx = 1 - rx;
ry = 1;
qy = -1;
rz = z - iz;
qz = 1 - rz;
break;
case 2:
rx = x - ix;
qx = 1 - rx;
ry = y - iy;
qy = 1 - ry;
rz = 1;
qz = -1;
break;
}
double w = density[ix][iy][iz] * qx * qy * qz +
density[ix][iy][jz] * qx * qy * rz +
density[ix][jy][iz] * qx * ry * qz +
density[ix][jy][jz] * qx * ry * rz +
density[jx][iy][iz] * rx * qy * qz +
density[jx][iy][jz] * rx * qy * rz +
density[jx][jy][iz] * rx * ry * qz +
density[jx][jy][jz] * rx * ry * rz;
adj_gradient[i][axis] += a_w * w * global_w;
}
template <
typename ParticleArray, typename ProjectionDensityArray,
typename GradientArray, typename PeriodicFunction, typename WeightArray>
static void adjoint(
const ParticleArray &particles, ProjectionDensityArray &density,
GradientArray &adjoint_gradient, const WeightArray &w, T Lx, T Ly, T Lz,
size_t N0, size_t N1, size_t N2, const PeriodicFunction &p, T nmean,
size_t Np) {
ConsoleContext<LOG_DEBUG> ctx("Classic CIC adjoint-projection");
T inv_dx = N0 / Lx;
T inv_dy = N1 / Ly;
T inv_dz = N2 / Lz;
T inv_nmean = 1 / nmean;
size_t minX = density.index_bases()[0], minY = density.index_bases()[1],
minZ = density.index_bases()[2],
maxX = density.index_bases()[0] + density.shape()[0],
maxY = density.index_bases()[1] + density.shape()[1],
maxZ = density.index_bases()[2] + density.shape()[2];
#pragma omp parallel for schedule(static)
for (long i = 0; i < Np; i++) {
T x = particles[i][0] * inv_dx;
T y = particles[i][1] * inv_dy;
T z = particles[i][2] * inv_dz;
size_t ix = (size_t)std::floor(x);
size_t iy = (size_t)std::floor(y);
size_t iz = (size_t)std::floor(z);
size_t jx = (ix + 1);
size_t jy = (iy + 1);
size_t jz = (iz + 1);
p(jx, jy, jz);
if (ix < minX || ix >= maxX || iy < minY || iy >= maxY || iz < minZ ||
iz >= maxZ)
continue;
__do_gradient(
adjoint_gradient, density, w[i], i, 0, ix, iy, iz, jx, jy, jz, x, y,
z, inv_dx * inv_nmean);
__do_gradient(
adjoint_gradient, density, w[i], i, 1, ix, iy, iz, jx, jy, jz, x, y,
z, inv_dy * inv_nmean);
__do_gradient(
adjoint_gradient, density, w[i], i, 2, ix, iy, iz, jx, jy, jz, x, y,
z, inv_dz * inv_nmean);
}
}
};
template <typename T>
class OpenMPCloudInCell : public GenericCIC<T, OpenMPCloudInCell_impl<T>> {
public:
typedef T Type;
// Number of extra planes required in case of MPI
static const int MPI_PLANE_LEAKAGE = 1;
static const int MPI_NEGATIVE_PLANE_LEAKAGE = 0;
typedef CIC_Distribution<T> Distribution;
typedef CIC_Tools::Periodic_MPI Periodic_MPI;
};
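// Usage sketch (illustrative only; `parts`, `rho`, `weights` and `periodic` are
// hypothetical names, and real callers go through the particle-mesh layer):
//
//   OpenMPCloudInCell_impl<double>::projection(parts, rho, L, L, L, N, N, N,
//                                              periodic, weights, Np);
//
// where `periodic` is a functor enforcing the boundary conditions, like the
// Periodic_MPI helpers above.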
} // namespace LibLSS
#endif