Initial import
commit 56a50eead3
820 changed files with 192077 additions and 0 deletions
104  extra/borg/libLSS/borg.cmake  Normal file
@@ -0,0 +1,104 @@
require_ares_module(hades)

include_directories(${JULIA_INCLUDE_DIRS})
include(${CMAKE_SOURCE_DIR}/extra/hades/scripts/models.cmake)

# Retrieve current git revision
SET(save_dir ${CMAKE_CURRENT_SOURCE_DIR})
SET(CMAKE_CURRENT_SOURCE_DIR ${CMAKE_SOURCE_DIR}/extra/borg/libLSS/)
get_git_head_revision(HEAD BORG_GIT_VER)
SET(CMAKE_CURRENT_SOURCE_DIR ${save_dir})

SET(BASE_BORG_LIBLSS ${CMAKE_SOURCE_DIR}/extra/borg/libLSS/)
configure_file(${BASE_BORG_LIBLSS}/borg_version.cpp.in ${CMAKE_CURRENT_BINARY_DIR}/borg_version.cpp)

SET(EXTRA_BORG ${BASE_BORG_LIBLSS}/samplers/borg)
SET(EXTRA_PHY_BORG ${BASE_BORG_LIBLSS}/physics)
SET(EXTRA_LIBLSS ${EXTRA_LIBLSS}
    ${CMAKE_CURRENT_BINARY_DIR}/borg_version.cpp
    ${EXTRA_BORG}/borg_poisson_likelihood.cpp
    ${EXTRA_BORG}/borg_poisson_meta.cpp
    ${EXTRA_PHY_BORG}/bias/biases.cpp
    ${EXTRA_PHY_BORG}/forwards/borg_lpt.cpp
    ${EXTRA_PHY_BORG}/forwards/borg_qlpt.cpp
    ${EXTRA_PHY_BORG}/forwards/borg_qlpt_rsd.cpp
    ${EXTRA_PHY_BORG}/forwards/borg_2lpt.cpp
    ${EXTRA_PHY_BORG}/forwards/transfer.cpp
    ${EXTRA_PHY_BORG}/forwards/softplus.cpp
    ${EXTRA_PHY_BORG}/forwards/enforceMass.cpp
    ${EXTRA_PHY_BORG}/forwards/downgrade.cpp
    ${EXTRA_PHY_BORG}/forwards/patch_model.cpp
    ${EXTRA_PHY_BORG}/forwards/adapt_generic_bias.cpp
    ${EXTRA_PHY_BORG}/forwards/altair_ap.cpp
    ${EXTRA_PHY_BORG}/forwards/particle_balancer/dyn/particle_distribute.cpp
    ${EXTRA_PHY_BORG}/velocity/velocity_cic.cpp
    ${BASE_BORG_LIBLSS}/io/gadget3.cpp
    ${EXTRA_PHY_BORG}/likelihoods/robust_poisson.cpp
    ${CMAKE_SOURCE_DIR}/extra/borg/libLSS/samplers/generic/run_forward.cpp
    ${CMAKE_SOURCE_DIR}/extra/borg/libLSS/samplers/generic/impl_generic.cpp
    ${CMAKE_SOURCE_DIR}/extra/borg/libLSS/samplers/generic/impl_gaussian.cpp
    ${CMAKE_SOURCE_DIR}/extra/borg/libLSS/samplers/generic/impl_poisson.cpp
    ${CMAKE_SOURCE_DIR}/extra/borg/libLSS/samplers/generic/impl_robust.cpp
    ${CMAKE_SOURCE_DIR}/extra/borg/libLSS/samplers/generic/generic_sigma8.cpp
    ${CMAKE_SOURCE_DIR}/extra/borg/libLSS/samplers/generic/generic_sigma8_second.cpp
    ${CMAKE_SOURCE_DIR}/extra/borg/libLSS/samplers/rgen/qnhmc/qnhmc_density_sampler.cpp
    ${BASE_BORG_LIBLSS}/samplers/lya/base_lya_likelihood.cpp
    ${BASE_BORG_LIBLSS}/samplers/lya/hades_lya_likelihood.cpp
    ${BASE_BORG_LIBLSS}/samplers/lya_rsd_qlpt/hades_lya_likelihood_rsd.cpp
    ${BASE_BORG_LIBLSS}/samplers/altair/altair_meta_sampler.cpp
)

#
# Julia components
#

IF (BUILD_JULIA)
  SET(ARES_JULIA ${CMAKE_SOURCE_DIR}/extra/borg)
  SET(EXTRA_LIBLSS
      ${EXTRA_LIBLSS}
      ${ARES_JULIA}/libLSS/julia/julia_box.cpp
      ${ARES_JULIA}/libLSS/julia/julia_calls.cpp
      ${ARES_JULIA}/libLSS/julia/julia_mcmc.cpp
      ${ARES_JULIA}/libLSS/julia/julia.cpp
      ${ARES_JULIA}/libLSS/julia/julia_ghosts.cpp
      ${ARES_JULIA}/libLSS/physics/forwards/julia.cpp
      ${ARES_JULIA}/libLSS/samplers/julia/julia_likelihood.cpp
  )

  include(FindPythonInterp)

  add_custom_command(
    OUTPUT ${CMAKE_BINARY_DIR}/libLSS/julia/julia_module.hpp
    COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/build_tools/gen_code_in_header.py ${ARES_JULIA}/libLSS/julia/julia_module.jl ${CMAKE_BINARY_DIR}/libLSS/julia/julia_module.hpp
    MAIN_DEPENDENCY ${ARES_JULIA}/libLSS/julia/julia_module.jl
  )

  file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/libLSS/julia)
  set_property(SOURCE ${ARES_JULIA}/libLSS/julia/julia.cpp
    APPEND PROPERTY OBJECT_DEPENDS ${CMAKE_BINARY_DIR}/libLSS/julia/julia_module.hpp
  )

  set_property(
    SOURCE
      ${ARES_JULIA}/libLSS/julia/julia.cpp
      ${ARES_JULIA}/libLSS/julia/julia_calls.cpp
      ${ARES_JULIA}/libLSS/julia/julia_box.cpp
    APPEND PROPERTY COMPILE_DEFINITIONS ${JULIA_DEFS}
  )
ENDIF()

# List of forward-model headers to register, so that the linker keeps all the
# symbols needed for automatic model registration.
hades_register_forward_models(
    libLSS/physics/forwards/transfer.hpp
    libLSS/physics/forwards/borg_lpt.hpp
    libLSS/physics/forwards/borg_qlpt.hpp
    libLSS/physics/forwards/borg_qlpt_rsd.hpp
    libLSS/physics/forwards/borg_2lpt.hpp
    libLSS/physics/forwards/downgrade.hpp
    libLSS/physics/forwards/softplus.hpp
    libLSS/physics/forwards/enforceMass.hpp
    libLSS/physics/forwards/patch_model.hpp
    libLSS/physics/forwards/adapt_generic_bias.hpp
    libLSS/physics/forwards/altair_ap.hpp
)
62  extra/borg/libLSS/borg_splash.hpp  Normal file
@@ -0,0 +1,62 @@
/*+
    ARES/HADES/BORG Package -- ./extra/borg/libLSS/borg_splash.hpp
    Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
    Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>

    Additional contributions from:
       Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)

+*/
#ifndef __LIBLSS_BORG_SPLASH_HPP
#define __LIBLSS_BORG_SPLASH_HPP

#include <string>
#include "libLSS/tools/color_mod.hpp"
#include "libLSS/borg_version.hpp"

namespace LibLSS {

  namespace BORG {
    using namespace LibLSS::Color;

    static void splash_borg() {

      static std::string splash_str[] = {
          " ___________ ",
          " /-/_\"/-/_/-/| __________________________ ",
          " /\"-/-_\"/-_//|| " + fg(BLUE, "BORG3", BRIGHT) +
              " model",
          " /__________/|/| (c) Jens Jasche 2012 - 2019",
          " |\"|_'='-]:+|/|| Guilhem Lavaux 2014 - 2019",
          " |-+-|.|_'-\"||// __________________________ ",
          " |[\".[:!+-'=|// ",
          " |='!+|-:]|-|/ ",
          " ---------- ",
          "",
          "Please acknowledge the following papers:",
          " - Jasche & Lavaux (A&A, 2019, arXiv 1806.11117)",
          " - Jasche & Wandelt (MNRAS, 2012, arXiv 1203.3639)",
          " - Jasche & Kitaura (MNRAS, 2010, arXiv 0911.2496)",
          " - And relevant papers depending on the used sub-module/contribution",
          "\n",
          "This is BORG version " + BORG_GIT_VERSION};

      static const int numSplashStr =
          sizeof(splash_str) / sizeof(splash_str[0]);

      for (int i = 0; i < numSplashStr; i++)
        Console::instance().print<LOG_STD>(splash_str[i]);
    }

  } // namespace BORG
}; // namespace LibLSS

#endif
// ARES TAG: authors_num = 2
// ARES TAG: name(0) = Guilhem Lavaux
// ARES TAG: email(0) = guilhem.lavaux@iap.fr
// ARES TAG: year(0) = 2014-2020
// ARES TAG: name(1) = Jens Jasche
// ARES TAG: email(1) = jens.jasche@fysik.su.se
// ARES TAG: year(1) = 2009-2020
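For orientation, a minimal illustrative call site (not part of the import): the banner above is printed through the LibLSS console, which is assumed to be initialized already.

#include "libLSS/borg_splash.hpp"

void show_banner() {
  // Prints the BORG banner and citation request via Console::instance().
  LibLSS::BORG::splash_borg();
}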
3  extra/borg/libLSS/borg_version.cpp.in  Normal file
@@ -0,0 +1,3 @@
#include "libLSS/borg_version.hpp"

const std::string LibLSS::BORG_GIT_VERSION = "@BORG_GIT_VER@";
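For reference, once CMake's configure_file step has run, the generated borg_version.cpp simply carries the substituted revision string; using this commit's abbreviated hash purely as an illustration, it would read:

#include "libLSS/borg_version.hpp"

const std::string LibLSS::BORG_GIT_VERSION = "56a50eead3";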
21  extra/borg/libLSS/borg_version.hpp  Normal file
@@ -0,0 +1,21 @@
/*+
    ARES/HADES/BORG Package -- ./extra/borg/libLSS/borg_version.hpp
    Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
    Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>

    Additional contributions from:
       Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)

+*/
#ifndef __LIBLSS_BORG_VERSION_HPP
#define __LIBLSS_BORG_VERSION_HPP

#include <string>

namespace LibLSS {

  extern const std::string BORG_GIT_VERSION;

}

#endif
38  extra/borg/libLSS/data/lyman_alpha.hpp  Normal file
@@ -0,0 +1,38 @@
/*+
    ARES/HADES/BORG Package -- ./extra/borg/libLSS/data/lyman_alpha.hpp
    Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
    Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>

    Additional contributions from:
       Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)

+*/
#ifndef __LIBLSS_LYMAN_ALPHA_HPP
#define __LIBLSS_LYMAN_ALPHA_HPP

#include <CosmoTool/hdf5_array.hpp>
#include "libLSS/physics/projector.hpp"

namespace LibLSS {

  struct BaseLymanAlphaDescriptor {
    unsigned long long id;
    double phi, theta;
    double zo;
    double z;
    double r;
  };

};

/* HDF5 compound type */
CTOOL_STRUCT_TYPE(LibLSS::BaseLymanAlphaDescriptor, HDF5T_BaseLymanAlphaDescriptor,
    ((unsigned long long, id))
    ((double, phi))
    ((double, theta))
    ((double, zo))
    ((double, z))
    ((double, r))
);

#endif
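For orientation, a sketch (not part of the import) of how the registered compound type is typically exercised through CosmoTool's HDF5 helpers; the file name and field values are placeholders.

#include <H5Cpp.h>
#include <boost/multi_array.hpp>
#include <CosmoTool/hdf5_array.hpp>
#include "libLSS/data/lyman_alpha.hpp"

void write_descriptors_example() {
  boost::multi_array<LibLSS::BaseLymanAlphaDescriptor, 1> qsos(boost::extents[2]);
  qsos[0].id = 0; qsos[0].phi = 0.1; qsos[0].theta = 0.2;   // placeholder values
  qsos[0].zo = 2.5; qsos[0].z = 2.4; qsos[0].r = 3800.;
  qsos[1] = qsos[0];
  qsos[1].id = 1;

  H5::H5File f("qso_descriptors.h5", H5F_ACC_TRUNC);  // placeholder file name
  CosmoTool::hdf5_write_array(f, "QSO", qsos);        // uses the registered compound type
}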
120  extra/borg/libLSS/data/lyman_alpha_load_txt.hpp  Normal file
@@ -0,0 +1,120 @@
/*+
    ARES/HADES/BORG Package -- ./extra/borg/libLSS/data/lyman_alpha_load_txt.hpp
    Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
    Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>

    Additional contributions from:
       Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)

+*/
#ifndef __LIBLSS_LYMAN_ALPHA_LOAD_TXT_HPP
#define __LIBLSS_LYMAN_ALPHA_LOAD_TXT_HPP

#include <string>
#include <fstream>
#include <iostream>
#include <sstream>
#include <boost/format.hpp>
#include "libLSS/tools/console.hpp"
#include "libLSS/tools/errors.hpp"
#include "boost/multi_array.hpp"
#include <CosmoTool/hdf5_array.hpp>
#include <libLSS/physics/projector.hpp>
#include <H5Cpp.h>

namespace LibLSS {

  struct BinsSpectraStruct {
    size_t id;
    double phi, theta;
    double r;
    double z;
  };
}

CTOOL_STRUCT_TYPE(LibLSS::BinsSpectraStruct, HDF5T_LibLSS_BinsSpectraStruct,
    ((size_t, id))
    ((double, phi))
    ((double, theta))
    ((double, z))
    ((double, r))
);

namespace LibLSS {
  template<typename LymanAlphaSurvey>
  void loadLymanAlphaFromHDF5(
      const std::string& fname,
      LymanAlphaSurvey& survey, MarkovState& state) {

    std::cout << "ENTER load from hades_lya folder" << std::endl;
    using namespace std;
    using boost::format;
    Console& cons = Console::instance();
    long originalSize = survey.NumberQSO();

    long N0 = static_cast<SLong&>(state["N0"]);
    long N1 = static_cast<SLong&>(state["N1"]);
    long N2 = static_cast<SLong&>(state["N2"]);

    bool warningDefault = false;

    boost::multi_array<BinsSpectraStruct, 1> QSO;

    H5::H5File fg(fname, H5F_ACC_RDONLY);

    CosmoTool::hdf5_read_array(fg, "QSO", QSO);

    boost::multi_array<LOSContainer, 1> proj;
    proj.resize(boost::extents[QSO.shape()[0]]);

    typename LymanAlphaSurvey::QSOType qso;
    typename LymanAlphaSurvey::LOSType los;

    RandomGen *rgen = state.get<RandomGen>("random_generator");

    //int step = N0 / pow(double(QSO.size()),0.5);
    int ix = 0, iy = 0;
    int l;
    for (int i = 0; i < QSO.size(); i++)
    {
      qso.id = QSO[i].id;
      qso.theta = QSO[i].theta;
      qso.phi = QSO[i].phi;
      qso.z = QSO[i].z;

      survey.addLOS(proj[i]);
      survey.addQSO(qso);

      CosmoTool::hdf5_read_array(fg, str(boost::format("flux_%d") % i), survey.getProjection()[i].flux);

      l = survey.getProjection()[i].flux.size();
      survey.getProjection()[i].dlos.resize(boost::extents[l]);
      survey.getProjection()[i].z.resize(boost::extents[l]);
      survey.getProjection()[i].voxel_id.resize(boost::extents[l][3]);

      ix = (N0-2 -2) * rgen->get().uniform() + 2;
      iy = (N1-2 -2) * rgen->get().uniform() + 2;

      for (int ii = 0; ii < l; ii++){
        survey.getProjection()[i].voxel_id[ii][0] = int(ix);
        survey.getProjection()[i].voxel_id[ii][1] = int(iy);
        survey.getProjection()[i].voxel_id[ii][2] = int(ii+4);
      }

      //iy += step;
      //if (iy>N2){
      //  iy = 0;
      //  ix += step;
      //}

    }

    cons.print<LOG_STD>(format("Got %d los") % QSO.size());

  }
}

#endif
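For orientation, a hedged sketch (not part of the import) of invoking the loader; the survey type and catalogue file name are placeholders, and `state` must already provide N0, N1, N2 and a "random_generator" entry.

#include "libLSS/data/lyman_alpha.hpp"
#include "libLSS/data/lyman_alpha_qso.hpp"
#include "libLSS/data/lyman_alpha_load_txt.hpp"

void load_lya_example(LibLSS::MarkovState &state) {
  // Placeholder survey type and file name, for illustration only.
  LibLSS::LymanAlphaSurvey<LibLSS::BaseLymanAlphaDescriptor> survey;
  LibLSS::loadLymanAlphaFromHDF5("lya_catalog.h5", survey, state);
}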
75  extra/borg/libLSS/data/lyman_alpha_qso.hpp  Normal file
@@ -0,0 +1,75 @@
/*+
    ARES/HADES/BORG Package -- ./extra/borg/libLSS/data/lyman_alpha_qso.hpp
    Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
    Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>

    Additional contributions from:
       Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)

+*/
#ifndef __LIBLSS_DATA_LYMAN_ALPHA_HPP
#define __LIBLSS_DATA_LYMAN_ALPHA_HPP

#include <H5Cpp.h>
#include <boost/utility/base_from_member.hpp>
#include <boost/mpl/assert.hpp>
#include <boost/utility/enable_if.hpp>
#include <boost/multi_array.hpp>
#include <boost/function.hpp>
#include <healpix_cxx/pointing.h>
#include "libLSS/data/base.hpp"
#include "libLSS/tools/allocator_policy.hpp"
#include "libLSS/tools/checkmem.hpp"
#include "libLSS/physics/cosmo.hpp"
#include "libLSS/physics/projector.hpp"

namespace LibLSS
{
  template<class GT, class AllocationPolicy = DefaultAllocationPolicy>
  class LymanAlphaSurvey: virtual LibLSS::Base_Data
  {
  public:
    typedef GT QSOType;
    typedef LOSContainer LOSType;
    typedef typename boost::multi_array<QSOType, 1> QSOArray;
    typedef typename boost::multi_array<LOSType, 1> ProjectionArray;
  protected:
    QSOArray QSO;
    ProjectionArray projection;
    long numQSO;
    long numLOS;

  public:
    LymanAlphaSurvey() : numQSO(0), numLOS(0) {}
    ~LymanAlphaSurvey() {}

    long NumberQSO() const { return numQSO; }
    long NumberLOS() const { return numLOS; }

    // Methods defined in the tcc file
    void addQSO(const QSOType& qso);
    void addLOS(LOSType& los);

    QSOArray& getQSO() { return QSO; }
    const QSOArray& getQSO() const { return QSO; }

    ProjectionArray& getProjection() { return projection; }
    const ProjectionArray& getProjection() const { return projection; }

    void optimize() {
      QSO.resize(boost::extents[numQSO]);
    }

    void saveMain(H5::H5Location& fg);
    void restoreMain(H5::H5Location& fg);

    void save(H5::H5Location& fg) { saveMain(fg); }
    void restore(H5::H5Location& fg) { restoreMain(fg); }

  };

};

#include "lyman_alpha_qso.tcc"

#endif
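For orientation, a sketch (not part of the import) of building a small survey in memory and saving it; the file name and field values are placeholders, and DefaultAllocationPolicy is assumed.

#include <H5Cpp.h>
#include "libLSS/data/lyman_alpha.hpp"
#include "libLSS/data/lyman_alpha_qso.hpp"

void survey_save_example() {
  using Survey = LibLSS::LymanAlphaSurvey<LibLSS::BaseLymanAlphaDescriptor>;
  Survey survey;

  LibLSS::BaseLymanAlphaDescriptor qso;
  qso.id = 0; qso.phi = 0.1; qso.theta = 0.2;   // placeholder values
  qso.zo = 2.5; qso.z = 2.4; qso.r = 3800.;

  LibLSS::LOSContainer los;   // per-QSO line-of-sight container (from projector.hpp)
  survey.addQSO(qso);         // grows the QSO array by the allocation-policy increment
  survey.addLOS(los);

  H5::H5File f("lya_survey.h5", H5F_ACC_TRUNC);  // placeholder file name
  survey.save(f);             // writes the "qso_array" group via saveMain()
}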
65  extra/borg/libLSS/data/lyman_alpha_qso.tcc  Normal file
@@ -0,0 +1,65 @@
#include <iostream>
#include <CosmoTool/hdf5_array.hpp>
#include "libLSS/tools/hdf5_scalar.hpp"

namespace LibLSS {

  template <class QSOType, class AllocationPolicy>
  void LymanAlphaSurvey<QSOType, AllocationPolicy>::addQSO(const QSOType &qso) {
    if (numQSO == QSO.size()) {
      QSO.resize(boost::extents[numQSO + AllocationPolicy::getIncrement()]);
    }

    QSO[numQSO] = qso;

    numQSO++;
  }

  template <class QSOType, class AllocationPolicy>
  void LymanAlphaSurvey<QSOType, AllocationPolicy>::addLOS(LOSType &los) {
    if (numLOS == projection.size()) {
      projection.resize(
          boost::extents[numLOS + AllocationPolicy::getIncrement()]);
    }

    projection[numLOS] = los;
    numLOS++;
  }

  template <class QSOType, class AllocationPolicy>
  void
  LymanAlphaSurvey<QSOType, AllocationPolicy>::saveMain(H5::H5Location &fg) {
    optimize();

    H5::Group g0 = fg.createGroup("qso_array");
    CosmoTool::hdf5_write_array(g0, "QSO", QSO);

    for (int i = 0; i < numQSO; i++) {
      H5::Group g = g0.createGroup(str(boost::format("qso_%d") % i));
      CosmoTool::hdf5_write_array(g, "voxel_id", projection[i].voxel_id);
      CosmoTool::hdf5_write_array(g, "dlos", projection[i].dlos);
      CosmoTool::hdf5_write_array(g, "flux", projection[i].flux);
    }
  }

  template <class QSOType, class AllocationPolicy>
  void
  LymanAlphaSurvey<QSOType, AllocationPolicy>::restoreMain(H5::H5Location &fg) {
    H5::Group g0 = fg.openGroup("qso_array");
    CosmoTool::hdf5_read_array(g0, "QSO", QSO);

    numQSO = QSO.size();

    projection.resize(boost::extents[numQSO]);

    for (int i = 0; i < numQSO; i++) {
      int s = 0;
      H5::Group g = g0.openGroup(str(boost::format("qso_%d") % i));

      CosmoTool::hdf5_read_array(g, "voxel_id", projection[i].voxel_id);
      CosmoTool::hdf5_read_array(g, "dlos", projection[i].dlos);
      CosmoTool::hdf5_read_array(g, "flux", projection[i].flux);
    }
  }

}; // namespace LibLSS
681  extra/borg/libLSS/io/gadget3.cpp  Normal file
@@ -0,0 +1,681 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/io/gadget3.cpp
|
||||
Copyright (C) 2016-2018 Florent Leclercq <florent.leclercq@polytechnique.org>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#include "libLSS/io/gadget3.hpp"
|
||||
#include <cstring>
|
||||
|
||||
using namespace std;
|
||||
using namespace LibLSS;
|
||||
using namespace LibLSS::IO;
|
||||
|
||||
///-------------------------------------------------------------------------------------
|
||||
/** @fn get_bytes_per_blockelement
|
||||
* This function tells the size of one data entry in each of the blocks
|
||||
* defined for the output file. If one wants to add a new output-block, this
|
||||
* function should be augmented accordingly.
|
||||
* @param blocknr
|
||||
*/
|
||||
static int get_bytes_per_blockelement(enum iofields blocknr) {
|
||||
int bytes_per_blockelement = 0;
|
||||
switch (blocknr) {
|
||||
case IO_POS:
|
||||
case IO_VEL:
|
||||
case IO_ACCEL:
|
||||
bytes_per_blockelement = 3 * sizeof(float);
|
||||
break;
|
||||
case IO_ID:
|
||||
bytes_per_blockelement = sizeof(particleID_t);
|
||||
break;
|
||||
case IO_MASS:
|
||||
case IO_U:
|
||||
case IO_RHO:
|
||||
case IO_HSML:
|
||||
case IO_POT:
|
||||
case IO_DTENTR:
|
||||
case IO_TSTP:
|
||||
bytes_per_blockelement = sizeof(float);
|
||||
break;
|
||||
}
|
||||
return bytes_per_blockelement;
|
||||
} //get_bytes_per_blockelement
|
||||
///-------------------------------------------------------------------------------------
|
||||
/** @fn get_values_per_blockelement
|
||||
* This function informs about the number of elements stored per particle for
|
||||
* the given block of the output file. If one wants to add a new
|
||||
* output-block, this function should be augmented accordingly.
|
||||
* @param blocknr
|
||||
*/
|
||||
static int get_values_per_blockelement(enum iofields blocknr) {
|
||||
int values = 0;
|
||||
switch (blocknr) {
|
||||
case IO_POS:
|
||||
case IO_VEL:
|
||||
case IO_ACCEL:
|
||||
values = 3;
|
||||
break;
|
||||
case IO_ID:
|
||||
case IO_MASS:
|
||||
case IO_U:
|
||||
case IO_RHO:
|
||||
case IO_HSML:
|
||||
case IO_POT:
|
||||
case IO_DTENTR:
|
||||
case IO_TSTP:
|
||||
values = 1;
|
||||
break;
|
||||
}
|
||||
return values;
|
||||
} //get_values_per_blockelement
|
||||
///-------------------------------------------------------------------------------------
|
||||
/** @fn get_dataset_name
|
||||
* This function returns a descriptive character string that describes the
|
||||
* name of the block when the HDF5 file format is used. If one wants to add
|
||||
* a new output-block, this function should be augmented accordingly.
|
||||
* @param blocknr
|
||||
* @param buf
|
||||
*/
|
||||
static void get_dataset_name(enum iofields blocknr, char *buf) {
|
||||
strcpy(buf, "default");
|
||||
|
||||
switch (blocknr) {
|
||||
case IO_POS:
|
||||
strcpy(buf, "Coordinates");
|
||||
break;
|
||||
case IO_VEL:
|
||||
strcpy(buf, "Velocities");
|
||||
break;
|
||||
case IO_ID:
|
||||
strcpy(buf, "ParticleIDs");
|
||||
break;
|
||||
case IO_MASS:
|
||||
strcpy(buf, "Masses");
|
||||
break;
|
||||
case IO_U:
|
||||
strcpy(buf, "InternalEnergy");
|
||||
break;
|
||||
case IO_RHO:
|
||||
strcpy(buf, "Density");
|
||||
break;
|
||||
case IO_HSML:
|
||||
strcpy(buf, "SmoothingLength");
|
||||
break;
|
||||
case IO_POT:
|
||||
strcpy(buf, "Potential");
|
||||
break;
|
||||
case IO_ACCEL:
|
||||
strcpy(buf, "Acceleration");
|
||||
break;
|
||||
case IO_DTENTR:
|
||||
strcpy(buf, "RateOfChangeOfEntropy");
|
||||
break;
|
||||
case IO_TSTP:
|
||||
strcpy(buf, "TimeStep");
|
||||
break;
|
||||
}
|
||||
} //get_dataset_name
|
||||
///-------------------------------------------------------------------------------------
|
||||
/** @fn read_header_attributes_in_hdf5
|
||||
* This function reads the header information in case the HDF5 file format is
|
||||
* used.
|
||||
* @param hdf5_file
|
||||
* @param header
|
||||
*/
|
||||
void read_header_attributes_in_hdf5(hid_t hdf5_file, header_t *header) {
|
||||
hid_t hdf5_headergrp, hdf5_attribute;
|
||||
|
||||
hdf5_headergrp = H5Gopen(hdf5_file, "/Header", H5P_DEFAULT);
|
||||
|
||||
hdf5_attribute = H5Aopen_name(hdf5_headergrp, "NumPart_ThisFile");
|
||||
H5Aread(hdf5_attribute, H5T_NATIVE_UINT, header->npart);
|
||||
H5Aclose(hdf5_attribute);
|
||||
|
||||
hdf5_attribute = H5Aopen_name(hdf5_headergrp, "MassTable");
|
||||
H5Aread(hdf5_attribute, H5T_NATIVE_DOUBLE, header->mass);
|
||||
H5Aclose(hdf5_attribute);
|
||||
|
||||
hdf5_attribute = H5Aopen_name(hdf5_headergrp, "Time");
|
||||
H5Aread(hdf5_attribute, H5T_NATIVE_DOUBLE, &header->time);
|
||||
H5Aclose(hdf5_attribute);
|
||||
|
||||
hdf5_attribute = H5Aopen_name(hdf5_headergrp, "Redshift");
|
||||
H5Aread(hdf5_attribute, H5T_NATIVE_DOUBLE, &header->redshift);
|
||||
H5Aclose(hdf5_attribute);
|
||||
|
||||
hdf5_attribute = H5Aopen_name(hdf5_headergrp, "Flag_Sfr");
|
||||
H5Aread(hdf5_attribute, H5T_NATIVE_INT, &header->flag_sfr);
|
||||
H5Aclose(hdf5_attribute);
|
||||
|
||||
hdf5_attribute = H5Aopen_name(hdf5_headergrp, "Flag_Feedback");
|
||||
H5Aread(hdf5_attribute, H5T_NATIVE_INT, &header->flag_feedback);
|
||||
H5Aclose(hdf5_attribute);
|
||||
|
||||
hdf5_attribute = H5Aopen_name(hdf5_headergrp, "NumPart_Total");
|
||||
H5Aread(hdf5_attribute, H5T_NATIVE_UINT, header->npartTotal);
|
||||
H5Aclose(hdf5_attribute);
|
||||
|
||||
hdf5_attribute = H5Aopen_name(hdf5_headergrp, "Flag_Cooling");
|
||||
H5Aread(hdf5_attribute, H5T_NATIVE_INT, &header->flag_cooling);
|
||||
H5Aclose(hdf5_attribute);
|
||||
|
||||
hdf5_attribute = H5Aopen_name(hdf5_headergrp, "NumFilesPerSnapshot");
|
||||
H5Aread(hdf5_attribute, H5T_NATIVE_INT, &header->num_files);
|
||||
H5Aclose(hdf5_attribute);
|
||||
|
||||
hdf5_attribute = H5Aopen_name(hdf5_headergrp, "BoxSize");
|
||||
H5Aread(hdf5_attribute, H5T_NATIVE_DOUBLE, &header->BoxSize);
|
||||
H5Aclose(hdf5_attribute);
|
||||
|
||||
hdf5_attribute = H5Aopen_name(hdf5_headergrp, "Omega0");
|
||||
H5Aread(hdf5_attribute, H5T_NATIVE_DOUBLE, &header->Omega0);
|
||||
H5Aclose(hdf5_attribute);
|
||||
|
||||
hdf5_attribute = H5Aopen_name(hdf5_headergrp, "OmegaLambda");
|
||||
H5Aread(hdf5_attribute, H5T_NATIVE_DOUBLE, &header->OmegaLambda);
|
||||
H5Aclose(hdf5_attribute);
|
||||
|
||||
hdf5_attribute = H5Aopen_name(hdf5_headergrp, "HubbleParam");
|
||||
H5Aread(hdf5_attribute, H5T_NATIVE_DOUBLE, &header->HubbleParam);
|
||||
H5Aclose(hdf5_attribute);
|
||||
|
||||
hdf5_attribute = H5Aopen_name(hdf5_headergrp, "Flag_StellarAge");
|
||||
H5Aread(hdf5_attribute, H5T_NATIVE_INT, &header->flag_stellarage);
|
||||
H5Aclose(hdf5_attribute);
|
||||
|
||||
hdf5_attribute = H5Aopen_name(hdf5_headergrp, "Flag_Metals");
|
||||
H5Aread(hdf5_attribute, H5T_NATIVE_INT, &header->flag_metals);
|
||||
H5Aclose(hdf5_attribute);
|
||||
|
||||
hdf5_attribute = H5Aopen_name(hdf5_headergrp, "NumPart_Total_HighWord");
|
||||
H5Aread(hdf5_attribute, H5T_NATIVE_UINT, header->npartTotalHighWord);
|
||||
H5Aclose(hdf5_attribute);
|
||||
|
||||
hdf5_attribute = H5Aopen_name(hdf5_headergrp, "Flag_Entropy_ICs");
|
||||
H5Aread(hdf5_attribute, H5T_NATIVE_INT, &header->flag_entropy_instead_u);
|
||||
H5Aclose(hdf5_attribute);
|
||||
|
||||
H5Gclose(hdf5_headergrp);
|
||||
} //read_header_attributes_in_hdf5
|
||||
///-------------------------------------------------------------------------------------
|
||||
/** @fn write_header_attributes_in_hdf5
|
||||
* This function writes the header information in case HDF5 is selected as
|
||||
* file format.
|
||||
* @param hdf5_file
|
||||
* @param header
|
||||
*/
|
||||
void write_header_attributes_in_hdf5(hid_t hdf5_file, header_t header) {
|
||||
hsize_t adim[1] = {6};
|
||||
hid_t hdf5_headergrp, hdf5_dataspace, hdf5_attribute;
|
||||
|
||||
hdf5_headergrp =
|
||||
H5Gcreate2(hdf5_file, "/Header", 0, H5P_DEFAULT, H5P_DEFAULT);
|
||||
|
||||
hdf5_dataspace = H5Screate(H5S_SIMPLE);
|
||||
H5Sset_extent_simple(hdf5_dataspace, 1, adim, NULL);
|
||||
hdf5_attribute = H5Acreate2(
|
||||
hdf5_headergrp, "NumPart_ThisFile", H5T_NATIVE_INT, hdf5_dataspace,
|
||||
H5P_DEFAULT, H5P_DEFAULT);
|
||||
H5Awrite(hdf5_attribute, H5T_NATIVE_UINT, header.npart);
|
||||
H5Aclose(hdf5_attribute);
|
||||
H5Sclose(hdf5_dataspace);
|
||||
|
||||
hdf5_dataspace = H5Screate(H5S_SIMPLE);
|
||||
H5Sset_extent_simple(hdf5_dataspace, 1, adim, NULL);
|
||||
hdf5_attribute = H5Acreate2(
|
||||
hdf5_headergrp, "MassTable", H5T_NATIVE_DOUBLE, hdf5_dataspace,
|
||||
H5P_DEFAULT, H5P_DEFAULT);
|
||||
H5Awrite(hdf5_attribute, H5T_NATIVE_DOUBLE, header.mass);
|
||||
H5Aclose(hdf5_attribute);
|
||||
H5Sclose(hdf5_dataspace);
|
||||
|
||||
hdf5_dataspace = H5Screate(H5S_SCALAR);
|
||||
hdf5_attribute = H5Acreate2(
|
||||
hdf5_headergrp, "Time", H5T_NATIVE_DOUBLE, hdf5_dataspace, H5P_DEFAULT,
|
||||
H5P_DEFAULT);
|
||||
H5Awrite(hdf5_attribute, H5T_NATIVE_DOUBLE, &header.time);
|
||||
H5Aclose(hdf5_attribute);
|
||||
H5Sclose(hdf5_dataspace);
|
||||
|
||||
hdf5_dataspace = H5Screate(H5S_SCALAR);
|
||||
hdf5_attribute = H5Acreate2(
|
||||
hdf5_headergrp, "Redshift", H5T_NATIVE_DOUBLE, hdf5_dataspace,
|
||||
H5P_DEFAULT, H5P_DEFAULT);
|
||||
H5Awrite(hdf5_attribute, H5T_NATIVE_DOUBLE, &header.redshift);
|
||||
H5Aclose(hdf5_attribute);
|
||||
H5Sclose(hdf5_dataspace);
|
||||
|
||||
hdf5_dataspace = H5Screate(H5S_SCALAR);
|
||||
hdf5_attribute = H5Acreate2(
|
||||
hdf5_headergrp, "Flag_Sfr", H5T_NATIVE_INT, hdf5_dataspace, H5P_DEFAULT,
|
||||
H5P_DEFAULT);
|
||||
H5Awrite(hdf5_attribute, H5T_NATIVE_INT, &header.flag_sfr);
|
||||
H5Aclose(hdf5_attribute);
|
||||
H5Sclose(hdf5_dataspace);
|
||||
|
||||
hdf5_dataspace = H5Screate(H5S_SCALAR);
|
||||
hdf5_attribute = H5Acreate2(
|
||||
hdf5_headergrp, "Flag_Feedback", H5T_NATIVE_INT, hdf5_dataspace,
|
||||
H5P_DEFAULT, H5P_DEFAULT);
|
||||
H5Awrite(hdf5_attribute, H5T_NATIVE_INT, &header.flag_feedback);
|
||||
H5Aclose(hdf5_attribute);
|
||||
H5Sclose(hdf5_dataspace);
|
||||
|
||||
hdf5_dataspace = H5Screate(H5S_SIMPLE);
|
||||
H5Sset_extent_simple(hdf5_dataspace, 1, adim, NULL);
|
||||
hdf5_attribute = H5Acreate2(
|
||||
hdf5_headergrp, "NumPart_Total", H5T_NATIVE_UINT, hdf5_dataspace,
|
||||
H5P_DEFAULT, H5P_DEFAULT);
|
||||
H5Awrite(hdf5_attribute, H5T_NATIVE_UINT, header.npartTotal);
|
||||
H5Aclose(hdf5_attribute);
|
||||
H5Sclose(hdf5_dataspace);
|
||||
|
||||
hdf5_dataspace = H5Screate(H5S_SCALAR);
|
||||
hdf5_attribute = H5Acreate2(
|
||||
hdf5_headergrp, "Flag_Cooling", H5T_NATIVE_INT, hdf5_dataspace,
|
||||
H5P_DEFAULT, H5P_DEFAULT);
|
||||
H5Awrite(hdf5_attribute, H5T_NATIVE_INT, &header.flag_cooling);
|
||||
H5Aclose(hdf5_attribute);
|
||||
H5Sclose(hdf5_dataspace);
|
||||
|
||||
hdf5_dataspace = H5Screate(H5S_SCALAR);
|
||||
hdf5_attribute = H5Acreate2(
|
||||
hdf5_headergrp, "NumFilesPerSnapshot", H5T_NATIVE_INT, hdf5_dataspace,
|
||||
H5P_DEFAULT, H5P_DEFAULT);
|
||||
H5Awrite(hdf5_attribute, H5T_NATIVE_INT, &header.num_files);
|
||||
H5Aclose(hdf5_attribute);
|
||||
H5Sclose(hdf5_dataspace);
|
||||
|
||||
hdf5_dataspace = H5Screate(H5S_SCALAR);
|
||||
hdf5_attribute = H5Acreate2(
|
||||
hdf5_headergrp, "Omega0", H5T_NATIVE_DOUBLE, hdf5_dataspace, H5P_DEFAULT,
|
||||
H5P_DEFAULT);
|
||||
H5Awrite(hdf5_attribute, H5T_NATIVE_DOUBLE, &header.Omega0);
|
||||
H5Aclose(hdf5_attribute);
|
||||
H5Sclose(hdf5_dataspace);
|
||||
|
||||
hdf5_dataspace = H5Screate(H5S_SCALAR);
|
||||
hdf5_attribute = H5Acreate2(
|
||||
hdf5_headergrp, "OmegaLambda", H5T_NATIVE_DOUBLE, hdf5_dataspace,
|
||||
H5P_DEFAULT, H5P_DEFAULT);
|
||||
H5Awrite(hdf5_attribute, H5T_NATIVE_DOUBLE, &header.OmegaLambda);
|
||||
H5Aclose(hdf5_attribute);
|
||||
H5Sclose(hdf5_dataspace);
|
||||
|
||||
hdf5_dataspace = H5Screate(H5S_SCALAR);
|
||||
hdf5_attribute = H5Acreate2(
|
||||
hdf5_headergrp, "HubbleParam", H5T_NATIVE_DOUBLE, hdf5_dataspace,
|
||||
H5P_DEFAULT, H5P_DEFAULT);
|
||||
H5Awrite(hdf5_attribute, H5T_NATIVE_DOUBLE, &header.HubbleParam);
|
||||
H5Aclose(hdf5_attribute);
|
||||
H5Sclose(hdf5_dataspace);
|
||||
|
||||
hdf5_dataspace = H5Screate(H5S_SCALAR);
|
||||
hdf5_attribute = H5Acreate2(
|
||||
hdf5_headergrp, "BoxSize", H5T_NATIVE_DOUBLE, hdf5_dataspace, H5P_DEFAULT,
|
||||
H5P_DEFAULT);
|
||||
H5Awrite(hdf5_attribute, H5T_NATIVE_DOUBLE, &header.BoxSize);
|
||||
H5Aclose(hdf5_attribute);
|
||||
H5Sclose(hdf5_dataspace);
|
||||
|
||||
hdf5_dataspace = H5Screate(H5S_SCALAR);
|
||||
hdf5_attribute = H5Acreate2(
|
||||
hdf5_headergrp, "Flag_StellarAge", H5T_NATIVE_INT, hdf5_dataspace,
|
||||
H5P_DEFAULT, H5P_DEFAULT);
|
||||
H5Awrite(hdf5_attribute, H5T_NATIVE_INT, &header.flag_stellarage);
|
||||
H5Aclose(hdf5_attribute);
|
||||
H5Sclose(hdf5_dataspace);
|
||||
|
||||
hdf5_dataspace = H5Screate(H5S_SCALAR);
|
||||
hdf5_attribute = H5Acreate2(
|
||||
hdf5_headergrp, "Flag_Metals", H5T_NATIVE_INT, hdf5_dataspace,
|
||||
H5P_DEFAULT, H5P_DEFAULT);
|
||||
H5Awrite(hdf5_attribute, H5T_NATIVE_INT, &header.flag_metals);
|
||||
H5Aclose(hdf5_attribute);
|
||||
H5Sclose(hdf5_dataspace);
|
||||
|
||||
hdf5_dataspace = H5Screate(H5S_SIMPLE);
|
||||
H5Sset_extent_simple(hdf5_dataspace, 1, adim, NULL);
|
||||
hdf5_attribute = H5Acreate2(
|
||||
hdf5_headergrp, "NumPart_Total_HighWord", H5T_NATIVE_UINT, hdf5_dataspace,
|
||||
H5P_DEFAULT, H5P_DEFAULT);
|
||||
H5Awrite(hdf5_attribute, H5T_NATIVE_UINT, header.npartTotalHighWord);
|
||||
H5Aclose(hdf5_attribute);
|
||||
H5Sclose(hdf5_dataspace);
|
||||
|
||||
hdf5_dataspace = H5Screate(H5S_SIMPLE);
|
||||
H5Sset_extent_simple(hdf5_dataspace, 1, adim, NULL);
|
||||
hdf5_attribute = H5Acreate2(
|
||||
hdf5_headergrp, "Flag_Entropy_ICs", H5T_NATIVE_UINT, hdf5_dataspace,
|
||||
H5P_DEFAULT, H5P_DEFAULT);
|
||||
H5Awrite(hdf5_attribute, H5T_NATIVE_UINT, &header.flag_entropy_instead_u);
|
||||
H5Aclose(hdf5_attribute);
|
||||
H5Sclose(hdf5_dataspace);
|
||||
|
||||
H5Gclose(hdf5_headergrp);
|
||||
} //write_header_attributes_in_hdf5
|
||||
///-------------------------------------------------------------------------------------
|
||||
/** @fn write_hdf5_block
|
||||
* This function writes one block into a Gadget format-3 snapshot file.
|
||||
* @param blocknr
|
||||
* @param hdf5_grp
|
||||
* @param Ids
|
||||
* @param positions
|
||||
* @param velocities
|
||||
* @param header
|
||||
* @param pc
|
||||
*/
|
||||
static void write_hdf5_block(
|
||||
enum iofields blocknr, hid_t hdf5_grp[6], arrayID_t Ids,
|
||||
arrayPosition_t positions, arrayPosition_t velocities, header_t header,
|
||||
particleID_t pc) {
|
||||
int rank, type, bytes_per_blockelement;
|
||||
particleID_t pc_new;
|
||||
unsigned int n;
|
||||
char h5buf[100];
|
||||
void *TestBuffer;
|
||||
hid_t hdf5_dataspace_in_file, hdf5_dataspace_in_memory, hdf5_dataset;
|
||||
hsize_t dims[2], offset[2];
|
||||
|
||||
for (type = 0, pc_new = pc; type < 6; type++) {
|
||||
if (header.npart[type] <= 0)
|
||||
continue;
|
||||
get_dataset_name(blocknr, h5buf);
|
||||
bytes_per_blockelement = get_bytes_per_blockelement(blocknr);
|
||||
|
||||
dims[0] = header.npart[type]; // write all particles in file
|
||||
dims[1] = get_values_per_blockelement(blocknr);
|
||||
if (dims[1] == 1)
|
||||
rank = 1;
|
||||
else
|
||||
rank = 2;
|
||||
hdf5_dataspace_in_file = H5Screate_simple(rank, dims, NULL);
|
||||
|
||||
if ((TestBuffer = malloc(bytes_per_blockelement * header.npart[type])) !=
|
||||
NULL) // try to allocate a buffer to write the hdf5 block all at once
|
||||
{
|
||||
free(TestBuffer);
|
||||
dims[0] = header.npart[type]; // write all particles in memory
|
||||
offset[1] = 0;
|
||||
hdf5_dataspace_in_memory = H5Screate_simple(rank, dims, NULL);
|
||||
|
||||
offset[0] = 0;
|
||||
H5Sselect_hyperslab(
|
||||
hdf5_dataspace_in_file, H5S_SELECT_SET, offset, NULL, dims, NULL);
|
||||
|
||||
// malloc an array
|
||||
float *FloatBuffer;
|
||||
particleID_t *ParticleIDtypeBuffer;
|
||||
switch (blocknr) {
|
||||
case IO_ID:
|
||||
ParticleIDtypeBuffer =
|
||||
(particleID_t *)malloc(bytes_per_blockelement * header.npart[type]);
|
||||
break;
|
||||
default:
|
||||
FloatBuffer =
|
||||
(float *)malloc(bytes_per_blockelement * header.npart[type]);
|
||||
break;
|
||||
}
|
||||
|
||||
// fill buffer array and write it to hdf5
|
||||
switch (blocknr) {
|
||||
case IO_POS:
|
||||
for (n = 0; n < header.npart[type]; n++) {
|
||||
// cast to float
|
||||
FloatBuffer[3 * n + 0] = float(positions[pc_new][0]);
|
||||
FloatBuffer[3 * n + 1] = float(positions[pc_new][1]);
|
||||
FloatBuffer[3 * n + 2] = float(positions[pc_new][2]);
|
||||
|
||||
pc_new++;
|
||||
}
|
||||
hdf5_dataset = H5Dcreate2(
|
||||
hdf5_grp[type], h5buf, H5T_NATIVE_FLOAT, hdf5_dataspace_in_file,
|
||||
H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
|
||||
H5Dwrite(
|
||||
hdf5_dataset, H5T_NATIVE_FLOAT, hdf5_dataspace_in_memory,
|
||||
hdf5_dataspace_in_file, H5P_DEFAULT, FloatBuffer);
|
||||
break;
|
||||
case IO_VEL:
|
||||
for (n = 0; n < header.npart[type]; n++) {
|
||||
// cast to float
|
||||
FloatBuffer[3 * n + 0] = float(velocities[pc_new][0]);
|
||||
FloatBuffer[3 * n + 1] = float(velocities[pc_new][1]);
|
||||
FloatBuffer[3 * n + 2] = float(velocities[pc_new][2]);
|
||||
|
||||
pc_new++;
|
||||
}
|
||||
hdf5_dataset = H5Dcreate2(
|
||||
hdf5_grp[type], h5buf, H5T_NATIVE_FLOAT, hdf5_dataspace_in_file,
|
||||
H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
|
||||
H5Dwrite(
|
||||
hdf5_dataset, H5T_NATIVE_FLOAT, hdf5_dataspace_in_memory,
|
||||
hdf5_dataspace_in_file, H5P_DEFAULT, FloatBuffer);
|
||||
break;
|
||||
case IO_ID:
|
||||
for (n = 0; n < header.npart[type]; n++) {
|
||||
ParticleIDtypeBuffer[n] = particleID_t(Ids[pc_new]);
|
||||
pc_new++;
|
||||
}
|
||||
hdf5_dataset = H5Dcreate2(
|
||||
hdf5_grp[type], h5buf, H5T_NATIVE_UINT64, hdf5_dataspace_in_file,
|
||||
H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
|
||||
H5Dwrite(
|
||||
hdf5_dataset, H5T_NATIVE_UINT64, hdf5_dataspace_in_memory,
|
||||
hdf5_dataspace_in_file, H5P_DEFAULT, ParticleIDtypeBuffer);
|
||||
break;
|
||||
case IO_MASS:
|
||||
case IO_U:
|
||||
case IO_RHO:
|
||||
case IO_HSML:
|
||||
case IO_POT:
|
||||
case IO_ACCEL:
|
||||
case IO_DTENTR:
|
||||
case IO_TSTP:
|
||||
break;
|
||||
}
|
||||
|
||||
// free memory
|
||||
switch (blocknr) {
|
||||
case IO_ID:
|
||||
free(ParticleIDtypeBuffer);
|
||||
break;
|
||||
default:
|
||||
free(FloatBuffer);
|
||||
break;
|
||||
}
|
||||
} else // we write the hdf5 block particle per particle
|
||||
{
|
||||
dims[0] = 1; // write particles one by one in memory
|
||||
offset[1] = 0;
|
||||
hdf5_dataspace_in_memory = H5Screate_simple(rank, dims, NULL);
|
||||
|
||||
switch (blocknr) {
|
||||
case IO_POS:
|
||||
hdf5_dataset = H5Dcreate2(
|
||||
hdf5_grp[type], h5buf, H5T_NATIVE_FLOAT, hdf5_dataspace_in_file,
|
||||
H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
|
||||
break;
|
||||
case IO_VEL:
|
||||
hdf5_dataset = H5Dcreate2(
|
||||
hdf5_grp[type], h5buf, H5T_NATIVE_FLOAT, hdf5_dataspace_in_file,
|
||||
H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
|
||||
break;
|
||||
case IO_ID:
|
||||
hdf5_dataset = H5Dcreate2(
|
||||
hdf5_grp[type], h5buf, H5T_NATIVE_UINT64, hdf5_dataspace_in_file,
|
||||
H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
|
||||
break;
|
||||
case IO_MASS:
|
||||
case IO_U:
|
||||
case IO_RHO:
|
||||
case IO_HSML:
|
||||
case IO_POT:
|
||||
case IO_ACCEL:
|
||||
case IO_DTENTR:
|
||||
case IO_TSTP:
|
||||
break;
|
||||
}
|
||||
|
||||
for (n = 0; n < header.npart[type]; n++) {
|
||||
offset[0] = n;
|
||||
H5Sselect_hyperslab(
|
||||
hdf5_dataspace_in_file, H5S_SELECT_SET, offset, NULL, dims, NULL);
|
||||
|
||||
float Vector[3];
|
||||
switch (blocknr) {
|
||||
case IO_POS:
|
||||
// cast to float
|
||||
Vector[0] = float(positions[pc_new][0]);
|
||||
Vector[1] = float(positions[pc_new][1]);
|
||||
Vector[2] = float(positions[pc_new][2]);
|
||||
H5Dwrite(
|
||||
hdf5_dataset, H5T_NATIVE_FLOAT, hdf5_dataspace_in_memory,
|
||||
hdf5_dataspace_in_file, H5P_DEFAULT, &Vector);
|
||||
break;
|
||||
case IO_VEL:
|
||||
// cast to float
|
||||
Vector[0] = float(velocities[pc_new][0]);
|
||||
Vector[1] = float(velocities[pc_new][1]);
|
||||
Vector[2] = float(velocities[pc_new][2]);
|
||||
H5Dwrite(
|
||||
hdf5_dataset, H5T_NATIVE_FLOAT, hdf5_dataspace_in_memory,
|
||||
hdf5_dataspace_in_file, H5P_DEFAULT, &Vector);
|
||||
break;
|
||||
case IO_ID:
|
||||
H5Dwrite(
|
||||
hdf5_dataset, H5T_NATIVE_UINT64, hdf5_dataspace_in_memory,
|
||||
hdf5_dataspace_in_file, H5P_DEFAULT, &Ids[pc_new]);
|
||||
break;
|
||||
case IO_MASS:
|
||||
case IO_U:
|
||||
case IO_RHO:
|
||||
case IO_HSML:
|
||||
case IO_POT:
|
||||
case IO_ACCEL:
|
||||
case IO_DTENTR:
|
||||
case IO_TSTP:
|
||||
break;
|
||||
}
|
||||
|
||||
pc_new++;
|
||||
}
|
||||
}
|
||||
H5Sclose(hdf5_dataspace_in_memory);
|
||||
H5Sclose(hdf5_dataspace_in_file);
|
||||
H5Dclose(hdf5_dataset);
|
||||
}
|
||||
} //write_hdf5_block
|
||||
|
||||
namespace LibLSS {
|
||||
namespace IO {
|
||||
|
||||
void readGadget(
|
||||
H5::H5File hdf5_file, arrayID_t &Ids, arrayPosition_t &Pos,
|
||||
arrayVelocity_t &Vel, CosmologicalParameters &cosmo, size_t &Np,
|
||||
double &L0, double &L1, double &L2) {
|
||||
// read header
|
||||
header_t header1;
|
||||
read_header_attributes_in_hdf5(hdf5_file.getId(), &header1);
|
||||
|
||||
cosmo.omega_m = header1.Omega0;
|
||||
cosmo.omega_q = header1.OmegaLambda;
|
||||
cosmo.h = header1.HubbleParam;
|
||||
Np = (size_t)header1.npart[1];
|
||||
L0 = L1 = L2 = header1.BoxSize;
|
||||
|
||||
// read positions
|
||||
CosmoTool::hdf5_read_array(hdf5_file, "PartType1/Coordinates", Pos);
|
||||
|
||||
// read velocities
|
||||
CosmoTool::hdf5_read_array(hdf5_file, "PartType1/Velocities", Vel);
|
||||
|
||||
// read Ids
|
||||
CosmoTool::hdf5_read_array(hdf5_file, "PartType1/ParticleIDs", Ids);
|
||||
} //readGadget
|
||||
|
||||
void saveGadget(
|
||||
H5::H5File hdf5_file, arrayID_t Ids, arrayPosition_t Pos,
|
||||
arrayVelocity_t Vel, CosmologicalParameters cosmo, const size_t Np,
|
||||
const double L0, const double L1, const double L2) {
|
||||
hid_t hdf5_grp[6];
|
||||
char h5buf[100];
|
||||
particleID_t pc = 0;
|
||||
int type;
|
||||
|
||||
header_t header1;
|
||||
header1.npart[0] = header1.npart[2] = header1.npart[3] =
|
||||
header1.npart[4] = header1.npart[5] = 0;
|
||||
header1.npart[1] = (unsigned int)Np;
|
||||
header1.npartTotal[0] = header1.npartTotal[2] = header1.npartTotal[3] =
|
||||
header1.npartTotal[4] = header1.npartTotal[5] = 0;
|
||||
header1.npartTotal[1] = (unsigned int)Np;
|
||||
header1
|
||||
.mass[0] = header1
|
||||
.mass[2] = header1
|
||||
.mass[3] = header1
|
||||
.mass[4] = header1
|
||||
.mass[5] =
|
||||
1.; // Shouldn't be zero, otherwise interpreted as "variable particle mass" by Gadget
|
||||
header1.mass[1] = cosmo.omega_m * 3 * P_Hubble * P_Hubble /
|
||||
(8 * M_PI * P_G) * pow(L0, 3) /
|
||||
Np; // First Friedmann equation in cosmic time
|
||||
header1.time;
|
||||
header1.redshift;
|
||||
header1.flag_sfr = 0;
|
||||
header1.flag_feedback = 0;
|
||||
header1.flag_cooling = 0;
|
||||
header1.num_files = 1;
|
||||
if (fabs(L1 - L0) > 1e-3 || fabs(L2 - L0) > 1e-3)
|
||||
error_helper<ErrorParams>(
|
||||
"L1 must be equal to L0, got L0=%g, L1=%g, L2=%g");
|
||||
header1.BoxSize = L0;
|
||||
header1.Omega0 = cosmo.omega_m;
|
||||
header1.OmegaLambda = cosmo.omega_q;
|
||||
header1.HubbleParam = cosmo.h;
|
||||
header1.flag_stellarage = 0;
|
||||
header1.flag_metals = 0;
|
||||
header1.npartTotalHighWord[0] = header1.npartTotalHighWord[1] =
|
||||
header1.npartTotalHighWord[2] = header1.npartTotalHighWord[3] =
|
||||
header1.npartTotalHighWord[4] = header1.npartTotalHighWord[5] = 0;
|
||||
header1.flag_entropy_instead_u = 0;
|
||||
header1.flag_doubleprecision = 0;
|
||||
header1.flag_ic_info = 0;
|
||||
header1.lpt_scalingfactor = 0.;
|
||||
|
||||
// create groups
|
||||
for (type = 0; type < 6; type++) {
|
||||
if (header1.npart[type] > 0) {
|
||||
sprintf(h5buf, "/PartType%d", type);
|
||||
hdf5_grp[type] =
|
||||
H5Gcreate2(hdf5_file.getId(), h5buf, 0, H5P_DEFAULT, H5P_DEFAULT);
|
||||
}
|
||||
}
|
||||
|
||||
// write header
|
||||
write_header_attributes_in_hdf5(hdf5_file.getId(), header1);
|
||||
|
||||
// write positions
|
||||
write_hdf5_block(IO_POS, hdf5_grp, Ids, Pos, Vel, header1, pc);
|
||||
|
||||
// write velocities
|
||||
write_hdf5_block(IO_VEL, hdf5_grp, Ids, Pos, Vel, header1, pc);
|
||||
|
||||
// write Ids
|
||||
write_hdf5_block(IO_ID, hdf5_grp, Ids, Pos, Vel, header1, pc);
|
||||
|
||||
// close groups
|
||||
for (type = 5; type >= 0; type--)
|
||||
if (header1.npart[type] > 0)
|
||||
H5Gclose(hdf5_grp[type]);
|
||||
} //saveGadget
|
||||
|
||||
} // namespace IO
|
||||
} // namespace LibLSS
|
||||
|
||||
// ARES TAG: authors_num = 1
|
||||
// ARES TAG: name(0) = Florent Leclercq
|
||||
// ARES TAG: year(0) = 2016-2018
|
||||
// ARES TAG: email(0) = florent.leclercq@polytechnique.org
|
103  extra/borg/libLSS/io/gadget3.hpp  Normal file
@@ -0,0 +1,103 @@
/*+
    ARES/HADES/BORG Package -- ./extra/borg/libLSS/io/gadget3.hpp
    Copyright (C) 2016-2018 Florent Leclercq <florent.leclercq@polytechnique.org>

    Additional contributions from:
       Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)

+*/
#ifndef __LIBLSS_BORG_GADGET3_HPP
# define __LIBLSS_BORG_GADGET3_HPP

# include <boost/multi_array.hpp>
# include "CosmoTool/hdf5_array.hpp"
# include "libLSS/tools/hdf5_scalar.hpp"
# include "libLSS/physics/cosmo.hpp"

namespace LibLSS {
  namespace IO {
    typedef size_t particleID_t;
    typedef boost::multi_array<particleID_t, 1> arrayID_t;
    typedef boost::multi_array<double, 2> arrayPosition_t;
    typedef boost::multi_array<double, 2> arrayVelocity_t;

    constexpr static double P_UnitLength_in_cm = 3.085678e24;   // 1.0 Mpc in cm
    constexpr static double P_UnitMass_in_g = 1.989e43;         // 1.0e10 solar masses in g
    constexpr static double P_UnitVelocity_in_cm_per_s = 1e5;   // 1 km/sec in cm/sec
    constexpr static double P_UnitTime_in_s = P_UnitLength_in_cm / P_UnitVelocity_in_cm_per_s;
    constexpr static double P_GRAVITY = 6.67384e-8;
    constexpr static double P_G = P_GRAVITY / (P_UnitLength_in_cm*P_UnitLength_in_cm*P_UnitLength_in_cm) * P_UnitMass_in_g * (P_UnitTime_in_s*P_UnitTime_in_s);
    constexpr static double P_Hubble = 100.;                    // so that HubbleParam is in units of 100 km/sec/Mpc

    /*! This enumeration lists the output blocks defined for snapshot files.
        Not all of them need to be present. */
    enum iofields {
      IO_POS,
      IO_VEL,
      IO_ID,
      IO_MASS,
      IO_U,
      IO_RHO,
      IO_HSML,
      IO_POT,
      IO_ACCEL,
      IO_DTENTR,
      IO_TSTP,
    };

    struct __header_G3 {
      unsigned int npart[6];      /*!< number of particles of each type in this file */
      double mass[6];             /*!< mass of particles of each type. If 0, the masses are stored explicitly in the mass block of the snapshot file, otherwise they are omitted */
      double time;                /*!< time of snapshot file */
      double redshift;            /*!< redshift of snapshot file */
      int flag_sfr;               /*!< flags whether the simulation included star formation */
      int flag_feedback;          /*!< flags whether feedback was included (obsolete) */
      unsigned int npartTotal[6]; /*!< total number of particles of each type in this snapshot. This can differ from npart for a multi-file snapshot. */
      int flag_cooling;           /*!< flags whether cooling was included */
      int num_files;              /*!< number of files in a multi-file snapshot */
      double BoxSize;             /*!< box size of the simulation in case periodic boundaries were used */
      double Omega0;              /*!< matter density in units of the critical density */
      double OmegaLambda;         /*!< cosmological constant parameter */
      double HubbleParam;         /*!< Hubble parameter in units of 100 km/sec/Mpc */
      int flag_stellarage;        /*!< flags whether the file contains formation times of star particles */
      int flag_metals;            /*!< flags whether the file contains metallicity values for gas and star particles */
      unsigned int npartTotalHighWord[6]; /*!< high word of the total number of particles of each type */
      int flag_entropy_instead_u; /*!< flags that the IC file contains entropy instead of u */
      // Specific to Gadget-3:
      int flag_doubleprecision;   /*!< flags that the snapshot contains double precision instead of single precision */
      int flag_ic_info;           /*!< informs whether IC files were generated with the ordinary Zeldovich approximation or contain 2nd-order
                                       Lagrangian perturbation theory initial conditions. For snapshot files, the value informs whether the
                                       simulation was evolved from Zeldovich or 2LPT ICs. Encoding is as follows:
                                         FLAG_ZELDOVICH_ICS     (1) - IC file based on Zeldovich
                                         FLAG_SECOND_ORDER_ICS  (2) - special IC file containing 2LPT masses
                                         FLAG_EVOLVED_ZELDOVICH (3) - snapshot evolved from Zeldovich ICs
                                         FLAG_EVOLVED_2LPT      (4) - snapshot evolved from 2LPT ICs
                                         FLAG_NORMALICS_2LPT    (5) - standard Gadget file format with 2LPT ICs
                                       All other values, including 0, are interpreted as "don't know" for backwards compatibility. */
      float lpt_scalingfactor;    /*!< scaling factor for 2LPT initial conditions */
      char fill[18];              /*!< fills to 256 bytes */
      char names[15][2];
    };
    typedef struct __header_G3 header_t;

    void readGadget(
        H5::H5File hdf5_file, arrayID_t &Ids, arrayPosition_t &Pos,
        arrayVelocity_t &Vel, CosmologicalParameters &cosmo, size_t &Np,
        double &L0, double &L1, double &L2);
    void saveGadget(
        H5::H5File hdf5_file, arrayID_t Ids, arrayPosition_t Pos,
        arrayVelocity_t Vel, CosmologicalParameters cosmo, const size_t Np,
        const double L0, const double L1, const double L2);

  } // namespace IO
} // namespace LibLSS

#endif

// ARES TAG: authors_num = 1
// ARES TAG: name(0) = Florent Leclercq
// ARES TAG: year(0) = 2016-2018
// ARES TAG: email(0) = florent.leclercq@polytechnique.org
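For orientation, a hedged sketch (not part of the import) of how these declarations are meant to be driven; the file names are placeholders, and the arrays are filled by the readers themselves.

#include <H5Cpp.h>
#include "libLSS/io/gadget3.hpp"
#include "libLSS/physics/cosmo.hpp"

void gadget_roundtrip_example(LibLSS::CosmologicalParameters &cosmo) {
  using namespace LibLSS::IO;

  arrayID_t ids;
  arrayPosition_t pos;
  arrayVelocity_t vel;
  size_t Np = 0;
  double L0 = 0., L1 = 0., L2 = 0.;

  // Read dark-matter particles (PartType1) and the header cosmology.
  H5::H5File in("snapshot_in.h5", H5F_ACC_RDONLY);
  readGadget(in, ids, pos, vel, cosmo, Np, L0, L1, L2);

  // Write them back out as a single-file Gadget-3 HDF5 snapshot.
  H5::H5File out("snapshot_out.h5", H5F_ACC_TRUNC);
  saveGadget(out, ids, pos, vel, cosmo, Np, L0, L1, L2);
}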
422  extra/borg/libLSS/julia/julia.cpp  Normal file
@@ -0,0 +1,422 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/julia/julia.cpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#include "libLSS/tools/static_init.hpp"
|
||||
#include "libLSS/tools/static_auto.hpp"
|
||||
#include <julia.h>
|
||||
#include <list>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include <iterator>
|
||||
#include <boost/algorithm/string/join.hpp>
|
||||
#include "libLSS/julia/julia.hpp"
|
||||
#include "libLSS/mcmc/global_state.hpp"
|
||||
#include "libLSS/mcmc/state_element.hpp"
|
||||
#include <boost/preprocessor/cat.hpp>
|
||||
#include <boost/preprocessor/repetition/repeat.hpp>
|
||||
#include <boost/preprocessor/seq/for_each.hpp>
|
||||
#include "libLSS/samplers/core/types_samplers.hpp"
|
||||
#include "src/common/preparation_types.hpp"
|
||||
#include "libLSS/tools/string_tools.hpp"
|
||||
#include <boost/algorithm/string.hpp>
|
||||
|
||||
using namespace LibLSS;
|
||||
|
||||
using boost::format;
|
||||
using boost::str;
|
||||
using LibLSS::Julia::Object;
|
||||
|
||||
//extern "C" void jl_exit_on_sigint(int);
|
||||
//
|
||||
void *LibLSS::Julia::details::julia_array_reorder = 0;
|
||||
|
||||
namespace {
|
||||
|
||||
void julia_console_print(int level, char const *msg) {
|
||||
Console &cons = Console::instance();
|
||||
std::vector<std::string> results;
|
||||
std::string s(msg);
|
||||
boost::split(results, s, [](char c) { return c == '\n'; });
|
||||
switch (level) {
|
||||
case 0:
|
||||
cons.print<LOG_STD>(msg);
|
||||
break;
|
||||
case 1:
|
||||
cons.print<LOG_ERROR>(msg);
|
||||
break;
|
||||
case 2:
|
||||
cons.print<LOG_WARNING>(msg);
|
||||
break;
|
||||
case 3:
|
||||
cons.print<LOG_INFO>(msg);
|
||||
break;
|
||||
case 4:
|
||||
cons.print<LOG_INFO_SINGLE>(msg);
|
||||
break;
|
||||
case 5:
|
||||
cons.print<LOG_VERBOSE>(msg);
|
||||
break;
|
||||
case 6:
|
||||
cons.print<LOG_DEBUG>(msg);
|
||||
break;
|
||||
default:
|
||||
cons.print<LOG_ERROR>(
|
||||
"Unknown log level for message '" + std::string(msg) + "'");
|
||||
}
|
||||
}
|
||||
|
||||
void *julia_console_progress_start(int level, char const *msg, int max) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
void julia_console_progress_step(void *p) {}
|
||||
|
||||
void julia_console_progress_end(void *p) {}
|
||||
|
||||
double rand_uniform(MarkovState *state) {
|
||||
return state->get<RandomGen>("random_generator")->get().uniform();
|
||||
}
|
||||
|
||||
double rand_gaussian(MarkovState *state) {
|
||||
return state->get<RandomGen>("random_generator")->get().gaussian();
|
||||
}
|
||||
|
||||
LibLSS_prepare::GalaxySurveyType::GalaxyType *
|
||||
get_galaxy_descriptor(MarkovState *state, int id, int *sz) {
|
||||
using LibLSS_prepare::GalaxyElement;
|
||||
|
||||
try {
|
||||
std::string cname = "galaxy_catalog_" + to_string(id);
|
||||
if (state->exists(cname)) {
|
||||
auto &survey = state->get<GalaxyElement>(cname)->get().getGalaxies();
|
||||
|
||||
*sz = survey.size();
|
||||
return survey.data();
|
||||
} else {
|
||||
std::string cname = "halo_catalog_" + to_string(id);
|
||||
auto &survey = state->get<GalaxyElement>(cname)->get().getGalaxies();
|
||||
|
||||
*sz = survey.size();
|
||||
return survey.data();
|
||||
}
|
||||
} catch (LibLSS::ErrorBase &e) {
|
||||
jl_error(e.what());
|
||||
}
|
||||
}
|
||||
|
||||
#define AUTO_STATE_EDIT_QUERY(type) \
|
||||
void BOOST_PP_CAT(state_edit_, type)( \
|
||||
MarkovState * state, char const *entry, type const *value, int sync) { \
|
||||
try { \
|
||||
if (sync != 0) \
|
||||
state->getSyncScalar<type>(entry) = *value; \
|
||||
else \
|
||||
state->getScalar<type>(entry) = *value; \
|
||||
} catch (LibLSS::ErrorBase & e) { \
|
||||
jl_error(e.what()); \
|
||||
} \
|
||||
} \
|
||||
void BOOST_PP_CAT(state_query_, type)( \
|
||||
MarkovState * state, char const *entry, type *v, int sync) { \
|
||||
try { \
|
||||
if (sync != 0) \
|
||||
*v = state->getSyncScalar<type>(entry); \
|
||||
else \
|
||||
*v = state->getScalar<type>(entry); \
|
||||
} catch (LibLSS::ErrorBase & e) { \
|
||||
jl_error(e.what()); \
|
||||
} \
|
||||
} \
|
||||
void BOOST_PP_CAT(state_new_, type)( \
|
||||
MarkovState * state, char const *entry, type const *value, int sync, \
|
||||
int mcmc_save) { \
|
||||
try { \
|
||||
if (sync != 0) \
|
||||
state->newSyScalar<type>(entry, *value, mcmc_save != 0); \
|
||||
else \
|
||||
state->newScalar<type>(entry, *value, mcmc_save != 0); \
|
||||
} catch (LibLSS::ErrorBase & e) { \
|
||||
jl_error(e.what()); \
|
||||
} \
|
||||
} \
|
||||
type *BOOST_PP_CAT(state_new_1d_array_, type)( \
|
||||
MarkovState * state, char const *entry, size_t N, int mcmc_save) { \
|
||||
ArrayStateElement<type, 1> *elt; \
|
||||
state->newElement( \
|
||||
entry, elt = new ArrayStateElement<type, 1>(boost::extents[N]), \
|
||||
mcmc_save != 0); \
|
||||
return elt->array->data(); \
|
||||
} \
|
||||
void BOOST_PP_CAT(state_1d_array_autosize_, type)( \
|
||||
MarkovState * state, char const *entry, int b) { \
|
||||
try { \
|
||||
auto a = state->get<ArrayStateElement<type, 1>>(entry); \
|
||||
a->setAutoResize(b == 1); \
|
||||
} catch (LibLSS::ErrorBase & e) { \
|
||||
jl_error(e.what()); \
|
||||
} \
|
||||
} \
|
||||
type *BOOST_PP_CAT(state_get_1d_array_, type)( \
|
||||
MarkovState * state, char const *entry, size_t *N) { \
|
||||
try { \
|
||||
auto a = state->get<ArrayStateElement<type, 1>>(entry)->array; \
|
||||
*N = a->size(); \
|
||||
return a->data(); \
|
||||
} catch (LibLSS::ErrorBase & e) { \
|
||||
jl_error(e.what()); \
|
||||
return 0; \
|
||||
} \
|
||||
} \
|
||||
type *BOOST_PP_CAT(state_resize_1d_array_, type)( \
|
||||
MarkovState * state, char const *entry, size_t N) { \
|
||||
try { \
|
||||
auto a = state->get<ArrayStateElement<type, 1>>(entry)->array; \
|
||||
a->resize(boost::extents[N]); \
|
||||
return a->data(); \
|
||||
} catch (LibLSS::ErrorBase & e) { \
|
||||
return 0; \
|
||||
} \
|
||||
} \
|
||||
type *BOOST_PP_CAT(state_new_2d_array_, type)( \
|
||||
MarkovState * state, char const *entry, size_t N0, size_t N1, \
|
||||
int mcmc_save) { \
|
||||
ArrayStateElement<type, 2> *elt; \
|
||||
state->newElement( \
|
||||
entry, elt = new ArrayStateElement<type, 2>(boost::extents[N0][N1]), \
|
||||
mcmc_save != 0); \
|
||||
return elt->array->data(); \
|
||||
} \
|
||||
type *BOOST_PP_CAT(state_get_2d_array_, type)( \
|
||||
MarkovState * state, char const *entry, size_t *N) { \
|
||||
try { \
|
||||
auto a = state->get<ArrayStateElement<type, 2>>(entry)->array; \
|
||||
N[1] = a->shape()[0]; \
|
||||
N[0] = a->shape()[1]; \
|
||||
return a->data(); \
|
||||
} catch (LibLSS::ErrorBase & e) { \
|
||||
jl_error(e.what()); \
|
||||
return 0; \
|
||||
} \
|
||||
} \
|
||||
void BOOST_PP_CAT(state_2d_array_autosize_, type)( \
|
||||
MarkovState * state, char const *entry, int b) { \
|
||||
try { \
|
||||
auto a = state->get<ArrayStateElement<type, 2>>(entry); \
|
||||
a->setAutoResize(b == 1); \
|
||||
} catch (LibLSS::ErrorBase & e) { \
|
||||
jl_error(e.what()); \
|
||||
} \
|
||||
} \
|
||||
type *BOOST_PP_CAT(state_resize_2d_array_, type)( \
|
||||
MarkovState * state, char const *entry, size_t N0, size_t N1) { \
|
||||
try { \
|
||||
auto a = state->get<ArrayStateElement<type, 2>>(entry)->array; \
|
||||
a->resize(boost::extents[N0][N1]); \
|
||||
return a->data(); \
|
||||
} catch (LibLSS::ErrorBase & e) { \
|
||||
return 0; \
|
||||
} \
|
||||
} \
|
||||
type *BOOST_PP_CAT(state_new_3d_array_, type)( \
|
||||
MarkovState * state, char const *entry, size_t N0, size_t N1, size_t N2, \
|
||||
int mcmc_save) { \
|
||||
ArrayStateElement<type, 3, FFTW_Allocator<type>, true> *elt; \
|
||||
state->newElement( \
|
||||
entry, \
|
||||
elt = new ArrayStateElement<type, 3, FFTW_Allocator<type>, true>( \
|
||||
boost::extents[N0][N1][N2]), \
|
||||
mcmc_save != 0); \
|
||||
return elt->array->data(); \
|
||||
} \
|
||||
type *BOOST_PP_CAT(state_get_3d_array_, type)( \
|
||||
MarkovState * state, char const *entry, size_t *N) { \
|
||||
try { \
|
||||
auto a = \
|
||||
state \
|
||||
->get<ArrayStateElement<type, 3, FFTW_Allocator<type>, true>>( \
|
||||
entry) \
|
||||
->array; \
|
||||
N[2] = a->shape()[0]; \
|
||||
N[1] = a->shape()[1]; \
|
||||
N[0] = a->shape()[2]; \
|
||||
return a->data(); \
|
||||
} catch (LibLSS::ErrorBase & e) { \
|
||||
jl_error(e.what()); \
|
||||
return 0; \
|
||||
} \
|
||||
} \
|
||||
void BOOST_PP_CAT(state_3d_array_autosize_, type)( \
|
||||
MarkovState * state, char const *entry, int b) { \
|
||||
try { \
|
||||
auto a = state->get<ArrayStateElement<type, 3>>(entry); \
|
||||
a->setAutoResize(b == 1); \
|
||||
} catch (LibLSS::ErrorBase & e) { \
|
||||
jl_error(e.what()); \
|
||||
} \
|
||||
} \
|
||||
type *BOOST_PP_CAT(state_resize_3d_array_, type)( \
|
||||
MarkovState * state, char const *entry, size_t N0, size_t N1, \
|
||||
size_t N2) { \
|
||||
try { \
|
||||
auto a = state->get<ArrayStateElement<type, 3>>(entry)->array; \
|
||||
a->resize(boost::extents[N0][N1][N2]); \
|
||||
return a->data(); \
|
||||
} catch (LibLSS::ErrorBase & e) { \
|
||||
return 0; \
|
||||
} \
|
||||
}
AUTO_STATE_EDIT_QUERY(int);
|
||||
AUTO_STATE_EDIT_QUERY(long);
|
||||
AUTO_STATE_EDIT_QUERY(double);
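// Illustrative sketch (not part of the original file): for `double`,
// AUTO_STATE_EDIT_QUERY(double) above expands to C entry points such as
//
//   double *state_new_1d_array_double(
//       MarkovState *state, char const *entry, size_t N, int mcmc_save);
//   double *state_get_2d_array_double(
//       MarkovState *state, char const *entry, size_t *N);
//   double *state_resize_3d_array_double(
//       MarkovState *state, char const *entry, size_t N0, size_t N1, size_t N2);
//
// whose addresses are pushed into the Julia function table further below.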
const std::string julia_module_code =
|
||||
#if !defined(DOXYGEN_SHOULD_SKIP_THIS)
|
||||
#include "libLSS/julia/julia_module.hpp"
|
||||
#else
|
||||
""
|
||||
#endif
|
||||
;
void initializeJulia() {
|
||||
auto &cons = Console::instance();
|
||||
|
||||
cons.print<LOG_INFO>("Initialize Julia core");
|
||||
|
||||
std::string thread_count = str(format("%d") % smp_get_max_threads());
|
||||
setenv("JULIA_NUM_THREADS", thread_count.c_str(), 1);
|
||||
setenv("JULIA_HOME", JULIA_HOME, 1);
|
||||
jl_init_with_image(JULIA_BINDIR, jl_get_default_sysimg_path());
|
||||
|
||||
jl_value_t *exc;
|
||||
jl_value_t **args;
|
||||
jl_function_t *func;
|
||||
|
||||
JL_GC_PUSH2(&exc, &func);
|
||||
// Load the special static module to make the wrapping easier.
|
||||
(void)jl_eval_string(julia_module_code.c_str());
|
||||
|
||||
exc = jl_exception_occurred();
|
||||
if (exc != 0) {
|
||||
cons.print<LOG_ERROR>("Fatal error in the initialization of Julia core");
|
||||
jl_call2(
|
||||
jl_get_function(jl_base_module, "showerror"), jl_stderr_obj(), exc);
|
||||
JL_GC_POP();
|
||||
jl_exception_clear();
|
||||
throw Julia::JuliaException(Object(exc));
|
||||
}
|
||||
|
||||
cons.print<LOG_VERBOSE>("Invoking _setup_module");
|
||||
// Now set up some hooks between Julia and the ARES core.
|
||||
{
|
||||
jl_value_t *func_entries;
|
||||
JL_GC_PUSH1(&func_entries);
|
||||
constexpr size_t maxNumArgs = 52;
|
||||
std::string array_creation =
|
||||
str(format("Array{Ptr{Nothing}}(undef,(%d,))") % maxNumArgs);
|
||||
|
||||
func_entries = jl_eval_string(array_creation.c_str());
|
||||
exc = jl_exception_occurred();
|
||||
if (exc != 0) {
|
||||
cons.print<LOG_ERROR>(
|
||||
"Fatal error in the initialization of Julia core");
|
||||
jl_call2(
|
||||
jl_get_function(jl_base_module, "showerror"), jl_stderr_obj(), exc);
|
||||
JL_GC_POP();
|
||||
jl_exception_clear();
|
||||
throw Julia::JuliaException(Object(exc));
|
||||
}
|
||||
void **func_entries_p = (void **)jl_array_data(func_entries);
|
||||
|
||||
size_t current_arg = 0;
|
||||
func_entries_p[current_arg++] =
|
||||
reinterpret_cast<void *>(&julia_console_print);
|
||||
|
||||
#define PUSH_FUNC(func) \
|
||||
func_entries_p[current_arg++] = reinterpret_cast<void *>(func)
|
||||
#define PUSH_STATE_EDIT_QUERY(type) \
|
||||
PUSH_FUNC(BOOST_PP_CAT(&state_new_, type)); \
|
||||
PUSH_FUNC(BOOST_PP_CAT(&state_edit_, type)); \
|
||||
PUSH_FUNC(BOOST_PP_CAT(&state_query_, type)); \
|
||||
PUSH_FUNC(BOOST_PP_CAT(&state_new_1d_array_, type)); \
|
||||
PUSH_FUNC(BOOST_PP_CAT(&state_get_1d_array_, type)); \
|
||||
PUSH_FUNC(BOOST_PP_CAT(&state_resize_1d_array_, type)); \
|
||||
PUSH_FUNC(BOOST_PP_CAT(&state_1d_array_autosize_, type)); \
|
||||
PUSH_FUNC(BOOST_PP_CAT(&state_new_2d_array_, type)); \
|
||||
PUSH_FUNC(BOOST_PP_CAT(&state_get_2d_array_, type)); \
|
||||
PUSH_FUNC(BOOST_PP_CAT(&state_resize_2d_array_, type)); \
|
||||
PUSH_FUNC(BOOST_PP_CAT(&state_2d_array_autosize_, type)); \
|
||||
PUSH_FUNC(BOOST_PP_CAT(&state_new_3d_array_, type)); \
|
||||
PUSH_FUNC(BOOST_PP_CAT(&state_get_3d_array_, type)); \
|
||||
PUSH_FUNC(BOOST_PP_CAT(&state_resize_3d_array_, type)); \
|
||||
PUSH_FUNC(BOOST_PP_CAT(&state_3d_array_autosize_, type));
|
||||
|
||||
PUSH_STATE_EDIT_QUERY(int);
|
||||
PUSH_STATE_EDIT_QUERY(long);
|
||||
PUSH_STATE_EDIT_QUERY(double);
|
||||
|
||||
PUSH_FUNC(&rand_uniform);
|
||||
PUSH_FUNC(&rand_gaussian);
|
||||
|
||||
PUSH_FUNC(&get_galaxy_descriptor);
|
||||
PUSH_FUNC(&julia_console_progress_start);
|
||||
PUSH_FUNC(&julia_console_progress_step);
|
||||
PUSH_FUNC(&julia_console_progress_end);
|
||||
|
||||
func = (jl_function_t *)jl_eval_string("libLSS._setup_module");
|
||||
exc = jl_exception_occurred();
|
||||
if (exc != 0) {
|
||||
cons.print<LOG_ERROR>(
|
||||
"Fatal error in the initialization of Julia core");
|
||||
jl_call2(
|
||||
jl_get_function(jl_base_module, "showerror"), jl_stderr_obj(), exc);
|
||||
JL_GC_POP();
|
||||
jl_exception_clear();
|
||||
throw Julia::JuliaException(Object(exc));
|
||||
}
|
||||
|
||||
cons.c_assert(func != 0, "Julia could not resolve our setup function");
|
||||
cons.c_assert(
|
||||
current_arg == maxNumArgs, "Invalid allocation for arguments");
|
||||
cons.print<LOG_VERBOSE>("Run _setup_module");
|
||||
jl_call1(func, (jl_value_t *)func_entries);
|
||||
cons.print<LOG_VERBOSE>("Done _setup");
|
||||
|
||||
Julia::details::julia_array_reorder = jl_get_global(
|
||||
(jl_module_t *)jl_get_global(jl_main_module, jl_symbol("libLSS")),
|
||||
jl_symbol("_array_reorder"));
|
||||
cons.c_assert(
|
||||
Julia::details::julia_array_reorder != 0,
|
||||
"Array reordering symbol not found");
|
||||
|
||||
JL_GC_POP();
|
||||
}
|
||||
JL_GC_POP();
|
||||
|
||||
// jl_exit_on_sigint(1);
|
||||
}
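// Sketch of the contract established above (illustrative, relying on the index
// constants defined in julia_module.jl): `func_entries` holds 52 raw function
// pointers, indexed on the Julia side as CONSOLE_PRINT, then 15 state accessors
// per scalar type (int, long, double), then RANDOM_UNIFORM, RANDOM_GAUSSIAN,
// GET_GALAXY_DESCRIPTOR and the three CONSOLE_PROGRESS_* hooks, i.e.
// 1 + 3*15 + 2 + 1 + 3 == 52 == maxNumArgs. A Julia-side call then looks like
//
//   ccall(_internal[CONSOLE_PRINT], Nothing, (Cint, Cstring), Int(LOG_INFO), "hi")
//
// (the real wrappers live in julia_module.jl).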
|
||||
|
||||
void finalizeJulia() {
|
||||
Console::instance().print<LOG_INFO>("Cleaning julia core.");
|
||||
jl_atexit_hook(0);
|
||||
}
|
||||
|
||||
RegisterStaticInit reg(initializeJulia, finalizeJulia, 10);
|
||||
} // namespace
|
||||
|
||||
AUTO_REGISTRATOR_IMPL(JuliaInit);
|
||||
|
||||
Object Julia::global(std::string const &n) {
|
||||
return Object(jl_get_global(jl_main_module, jl_symbol(n.c_str())));
|
||||
}
|
||||
|
||||
void Julia::global(std::string const &n, Object o) {
|
||||
jl_set_global(jl_main_module, jl_symbol(n.c_str()), (jl_value_t *)o.ptr());
|
||||
}
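// Usage sketch (illustrative, not in the original file): the two overloads
// above move values between C++ and Julia's Main module, e.g.
//
//   Julia::Object o = Julia::evaluate("sqrt(2.0)");
//   Julia::global("my_value", o);                         // binds Main.my_value
//   double x = Julia::global("my_value").unbox<double>();
//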
|
148 extra/borg/libLSS/julia/julia.hpp Normal file
@@ -0,0 +1,148 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/julia/julia.hpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_TOOLS_JULIA_HPP
|
||||
#define __LIBLSS_TOOLS_JULIA_HPP
|
||||
|
||||
#include <boost/multi_array.hpp>
|
||||
#include "libLSS/tools/static_auto.hpp"
|
||||
#include "libLSS/tools/errors.hpp"
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
AUTO_REGISTRATOR_DECL(JuliaInit);
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
namespace Julia {
|
||||
|
||||
template <typename T, size_t N>
|
||||
using array = boost::multi_array_ref<T, N>;
|
||||
|
||||
struct Object;
|
||||
|
||||
struct Object {
|
||||
size_t opaquePosition;
|
||||
|
||||
template <typename T>
|
||||
void box(T v);
|
||||
|
||||
template <typename Ret>
|
||||
Ret unbox();
|
||||
|
||||
template <typename T, size_t N>
|
||||
array<T, N> unbox_array();
|
||||
|
||||
template <typename T, size_t N>
|
||||
void box_array(array<T, N> &a);
|
||||
|
||||
void *ptr() const;
|
||||
|
||||
Object(Object const &o);
|
||||
Object(Object &&o);
|
||||
Object(void *o);
|
||||
Object();
|
||||
~Object();
|
||||
Object &operator=(void *o);
|
||||
Object &operator=(Object const &o);
|
||||
|
||||
private:
|
||||
void protect();
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
Object translate(T x) {
|
||||
Object o;
|
||||
o.box(x);
|
||||
return o;
|
||||
}
|
||||
|
||||
struct Module;
|
||||
|
||||
class JuliaNotFound : virtual public ErrorBase {
|
||||
private:
|
||||
std::string symbol;
|
||||
|
||||
public:
|
||||
JuliaNotFound(const std::string &n)
|
||||
: ErrorBase("Not found symbol: " + n) {}
|
||||
};
|
||||
|
||||
class JuliaBadUnbox : virtual public ErrorBase {
|
||||
public:
|
||||
JuliaBadUnbox() : ErrorBase("Attempt to unbox a null pointer") {}
|
||||
};
|
||||
|
||||
class JuliaException : virtual public ErrorBase {
|
||||
private:
|
||||
Object j_obj;
|
||||
|
||||
static std::string prepare(Object &o);
|
||||
|
||||
public:
|
||||
Object getJuliaException() { return j_obj; }
|
||||
JuliaException(Object &&o) : ErrorBase(prepare(o)), j_obj(o) {}
|
||||
};
|
||||
|
||||
bool isBadGradient(JuliaException &e);
|
||||
|
||||
Object evaluate(std::string const &code);
|
||||
void load_file(std::string const &filename);
|
||||
void handle_julia_exception();
|
||||
|
||||
Module *module(std::string const &name);
|
||||
|
||||
Object manual_invoke(
|
||||
Module *module, std::string const &name,
|
||||
std::vector<Object> const &args);
|
||||
|
||||
Object
|
||||
manual_invoke(std::string const &name, std::vector<Object> const &args);
|
||||
|
||||
void global(std::string const &name, Object o);
|
||||
|
||||
Object global(std::string const &name);
|
||||
|
||||
namespace details {
|
||||
using std::vector;
|
||||
|
||||
extern void *julia_array_reorder;
|
||||
|
||||
template <typename T>
|
||||
void variadic_vector_emplace(vector<T> &) {}
|
||||
|
||||
template <typename T, typename First, typename... Args>
|
||||
void
|
||||
variadic_vector_emplace(vector<T> &v, First &&first, Args &&... args) {
|
||||
v.emplace_back(std::forward<First>(first));
|
||||
variadic_vector_emplace(v, std::forward<Args>(args)...);
|
||||
}
|
||||
|
||||
template <typename... Args>
|
||||
Object invoke(std::string const &name, Args &&... args) {
|
||||
vector<Object> vec;
|
||||
variadic_vector_emplace(vec, translate(args)...);
|
||||
return manual_invoke(name, vec);
|
||||
}
|
||||
|
||||
template <typename... Args>
|
||||
Object invoke(Module *module, std::string const &name, Args &&... args) {
|
||||
vector<Object> vec;
|
||||
variadic_vector_emplace(vec, translate(args)...);
|
||||
return manual_invoke(module, name, vec);
|
||||
}
|
||||
|
||||
} // namespace details
|
||||
|
||||
using details::invoke;
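// Usage sketch (illustrative): invoke() boxes each C++ argument with
// translate() and forwards the resulting vector to manual_invoke(); errors
// raised on the Julia side surface as Julia::JuliaException. For example:
//
//   Julia::Object r = Julia::invoke("Base.sin", 0.5);
//   double x = r.unbox<double>();
//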
|
||||
} // namespace Julia
|
||||
|
||||
} // namespace LibLSS
|
||||
|
||||
#endif
|
49 extra/borg/libLSS/julia/julia_array.hpp Normal file
@@ -0,0 +1,49 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/julia/julia_array.hpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_JULIA_ARRAY_HPP
|
||||
#define __LIBLSS_JULIA_ARRAY_HPP
|
||||
|
||||
#include <boost/format.hpp>
|
||||
#include "libLSS/julia/julia.hpp"
|
||||
#include <vector>
|
||||
#include <array>
|
||||
#include <tuple>
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
namespace Julia {
|
||||
|
||||
typedef std::tuple<ssize_t, ssize_t> IndexRange;
|
||||
|
||||
namespace helpers {
|
||||
typedef IndexRange _r;
|
||||
}
|
||||
|
||||
template <size_t N>
|
||||
Object view_array(Object a, std::array<IndexRange, N> const &indexes) {
|
||||
using boost::format;
|
||||
using boost::str;
|
||||
std::vector<Object> args(1 + N);
|
||||
|
||||
args[0] = a;
|
||||
for (size_t i = 0; i < N; i++) {
|
||||
auto const &t = indexes[i];
|
||||
args[1 + i] =
|
||||
evaluate(str(format("%d:%d") % std::get<0>(t) % std::get<1>(t)));
|
||||
}
|
||||
|
||||
return manual_invoke("view", args);
|
||||
}
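// Usage sketch (illustrative; `obj` is a previously boxed array Object): take
// a 2d sub-view using 1-based, inclusive ranges; the helper builds "a:b" range
// objects and calls Julia's `view`:
//
//   using LibLSS::Julia::helpers::_r;
//   Object sub = view_array<2>(obj, {_r(1, 16), _r(1, 32)});
//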
|
||||
|
||||
} // namespace Julia
|
||||
|
||||
} // namespace LibLSS
|
||||
|
||||
#endif
|
214 extra/borg/libLSS/julia/julia_box.cpp Normal file
@@ -0,0 +1,214 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/julia/julia_box.cpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#include "libLSS/tools/static_auto.hpp"
|
||||
#include "libLSS/tools/static_init.hpp"
|
||||
#include <array>
|
||||
#include <julia.h>
|
||||
#include <list>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include <iterator>
|
||||
#include <boost/algorithm/string/join.hpp>
|
||||
#include "libLSS/julia/julia.hpp"
|
||||
#include <boost/preprocessor/cat.hpp>
|
||||
#include <boost/preprocessor/repetition/repeat.hpp>
|
||||
#include <boost/preprocessor/seq/for_each.hpp>
|
||||
|
||||
using namespace LibLSS;
|
||||
using boost::format;
|
||||
|
||||
using LibLSS::Julia::Object;
|
||||
|
||||
static constexpr size_t LIBLSS_MAX_JULIA_STACK = 256;
|
||||
|
||||
static void *opaque_stack[LIBLSS_MAX_JULIA_STACK];
|
||||
static void *opaque_other_stack[3 * LIBLSS_MAX_JULIA_STACK];
|
||||
static bool opaque_stack_busy[LIBLSS_MAX_JULIA_STACK];
|
||||
static size_t currentOpaqueStackPosition = 0;
|
||||
|
||||
#define OPAQUE opaque_stack[opaquePosition]
|
||||
|
||||
namespace LibLSS {
|
||||
namespace Julia {
|
||||
|
||||
template <typename T>
|
||||
jl_value_t *julia_types();
|
||||
|
||||
template <typename T>
|
||||
std::string julia_type_name() {
|
||||
return jl_symbol_name(((jl_datatype_t *)julia_types<T>())->name->name);
|
||||
}
|
||||
|
||||
void Object::protect() {
|
||||
Console::instance().c_assert(currentOpaqueStackPosition < LIBLSS_MAX_JULIA_STACK, "Julia stack not large enough, increase LIBLSS_MAX_JULIA_STACK");
|
||||
opaque_stack_busy[opaquePosition = currentOpaqueStackPosition++] = true;
|
||||
OPAQUE = 0;
|
||||
|
||||
#if (JULIA_VERSION_MAJOR==1) && (JULIA_VERSION_MINOR >= 4)
|
||||
opaque_other_stack[3 * opaquePosition] = (void *)JL_GC_ENCODE_PUSH(1);
|
||||
opaque_other_stack[3 * opaquePosition + 1] = jl_pgcstack;
|
||||
opaque_other_stack[3 * opaquePosition + 2] = &OPAQUE;
|
||||
#else
|
||||
opaque_other_stack[3 * opaquePosition] = (void *)3;
|
||||
opaque_other_stack[3 * opaquePosition + 1] = jl_pgcstack;
|
||||
opaque_other_stack[3 * opaquePosition + 2] = &OPAQUE;
|
||||
#endif
|
||||
jl_pgcstack = (jl_gcframe_t *)&opaque_other_stack[3 * opaquePosition];
|
||||
}
|
||||
|
||||
Object::Object(Object const &o) {
|
||||
protect();
|
||||
OPAQUE = o.ptr();
|
||||
}
|
||||
|
||||
Object::Object(Object &&o) {
|
||||
protect();
|
||||
OPAQUE = o.ptr();
|
||||
}
|
||||
|
||||
void *Object::ptr() const { return OPAQUE; }
|
||||
|
||||
Object::~Object() {
|
||||
opaque_stack_busy[opaquePosition] = false;
|
||||
if (currentOpaqueStackPosition == opaquePosition + 1) {
|
||||
// OK, unwind the stack fully over the slots that are no longer busy
|
||||
do {
|
||||
if (opaque_stack_busy[currentOpaqueStackPosition - 1])
|
||||
break;
|
||||
currentOpaqueStackPosition--;
|
||||
Console::instance().c_assert(jl_pgcstack == (jl_gcframe_t *)&opaque_other_stack[3 * currentOpaqueStackPosition], "Julia stack ordering is broken.");
|
||||
jl_pgcstack = jl_pgcstack->prev;
|
||||
} while (currentOpaqueStackPosition > 0);
|
||||
}
|
||||
}
|
||||
|
||||
Object &Object::operator=(void *o) {
|
||||
OPAQUE = o;
|
||||
return *this;
|
||||
}
|
||||
|
||||
Object &Object::operator=(Object const &o) {
|
||||
OPAQUE = o.ptr();
|
||||
return *this;
|
||||
}
|
||||
|
||||
Object::Object() { protect(); }
|
||||
|
||||
Object::Object(void *v) {
|
||||
protect();
|
||||
OPAQUE = v;
|
||||
}
|
||||
|
||||
template <>
|
||||
void Object::box(Object o) {
|
||||
OPAQUE = o.ptr();
|
||||
}
|
||||
|
||||
#define IMPLEMENT_AUTO_TRANSLATE(type, jtype) \
|
||||
template <> \
|
||||
void Object::box<type>(type x) { \
|
||||
OPAQUE = BOOST_PP_CAT(jl_box_, jtype)(x); \
|
||||
} \
|
||||
template <> \
|
||||
type Object::unbox<type>() { \
|
||||
type x; \
|
||||
if (OPAQUE == 0) { \
|
||||
throw JuliaBadUnbox(); \
|
||||
} \
|
||||
x = BOOST_PP_CAT(jl_unbox_, jtype)( \
|
||||
reinterpret_cast<jl_value_t *>(OPAQUE)); \
|
||||
return x; \
|
||||
} \
|
||||
template <> \
|
||||
jl_value_t *julia_types<type>() { \
|
||||
return (jl_value_t *)BOOST_PP_CAT(BOOST_PP_CAT(jl_, jtype), _type); \
|
||||
}
|
||||
|
||||
#define ONE_AUTO_TRANSLATE(r, data, T) \
|
||||
IMPLEMENT_AUTO_TRANSLATE( \
|
||||
BOOST_PP_TUPLE_ELEM(2, 0, T), BOOST_PP_TUPLE_ELEM(2, 1, T))
|
||||
|
||||
BOOST_PP_SEQ_FOR_EACH(
|
||||
ONE_AUTO_TRANSLATE, X,
|
||||
((bool, bool))((double, float64))((float, float32))((int8_t, int8))(
|
||||
(uint8_t, uint8))((int16_t, int16))((uint16_t, uint16))(
|
||||
(int32_t, int32))((uint32_t, uint32))((int64_t, int64))(
|
||||
(uint64_t, uint64))((void *, voidpointer)));
|
||||
|
||||
template <typename T, size_t N>
|
||||
array<T, N> Object::unbox_array() {
|
||||
std::array<size_t, N> julia_extents;
|
||||
jl_value_t *jobj = reinterpret_cast<jl_value_t *>(OPAQUE);
|
||||
jl_datatype_t *el =
|
||||
reinterpret_cast<jl_datatype_t *>(jl_array_eltype(jobj));
|
||||
|
||||
if (el != (jl_datatype_t *)julia_types<T>()) {
|
||||
error_helper<ErrorBadState>(
|
||||
"Incompatible array type, got " +
|
||||
std::string(jl_symbol_name(el->name->name)) + " and expected " +
|
||||
julia_type_name<T>());
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < N; i++)
|
||||
julia_extents[i] = jl_array_dim(jobj, i);
|
||||
|
||||
return array<T, N>(
|
||||
reinterpret_cast<T *>(jl_array_data(jobj)), julia_extents);
|
||||
}
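// Usage sketch (illustrative): wrap a Julia Matrix{Float64} as a
// boost::multi_array_ref without copying; a mismatched element type is
// reported through error_helper<ErrorBadState>:
//
//   Object o = Julia::evaluate("zeros(Float64, 4, 8)");
//   auto view = o.unbox_array<double, 2>();   // extents follow jl_array_dim
//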
|
||||
|
||||
template <typename T, size_t N>
|
||||
void Object::box_array(array<T, N> &a) {
|
||||
jl_value_t *array_type = 0;
|
||||
jl_value_t *dims = 0;
|
||||
jl_value_t *tmp_array = 0;
|
||||
jl_value_t *cppOrder = 0;
|
||||
JL_GC_PUSH4(&array_type, &dims, &tmp_array, &cppOrder);
|
||||
|
||||
std::string tuple_string = "(";
|
||||
|
||||
for (ssize_t i = N - 1; i >= 0; i--)
|
||||
tuple_string += std::to_string(a.shape()[i]) + ",";
|
||||
tuple_string += ")";
|
||||
|
||||
array_type = jl_apply_array_type(julia_types<T>(), N);
|
||||
|
||||
cppOrder = jl_box_bool(true);
|
||||
|
||||
dims = jl_eval_string(tuple_string.c_str());
|
||||
handle_julia_exception();
|
||||
tmp_array = (jl_value_t *)jl_ptr_to_array(array_type, a.data(), dims, 0);
|
||||
handle_julia_exception();
|
||||
OPAQUE = jl_call2( (jl_value_t*) Julia::details::julia_array_reorder, tmp_array, cppOrder);
|
||||
handle_julia_exception();
|
||||
|
||||
JL_GC_POP();
|
||||
}
|
||||
|
||||
#define DECL_ARRAY_SINGLE(z, n, data) \
|
||||
template array<data, n> Object::unbox_array<data, n>(); \
|
||||
template void Object::box_array<data, n>(array<data, n> &);
|
||||
|
||||
#define DECL_ARRAY_MULTIPLE(r, data, elem) \
|
||||
BOOST_PP_REPEAT(4, DECL_ARRAY_SINGLE, elem)
|
||||
|
||||
BOOST_PP_SEQ_FOR_EACH(
|
||||
DECL_ARRAY_MULTIPLE, X,
|
||||
(bool)(double)(float)(int8_t)(uint8_t)(int16_t)(uint16_t)(int32_t)(
|
||||
uint32_t)(int64_t)(uint64_t));
|
||||
#ifdef __APPLE__
|
||||
// On macOS, long long and long are both 64-bit but distinct types, so unsigned long needs its own mapping to uint64.
|
||||
IMPLEMENT_AUTO_TRANSLATE(unsigned long, uint64)
|
||||
DECL_ARRAY_MULTIPLE(X, X, unsigned long)
|
||||
#endif
|
||||
|
||||
} // namespace Julia
|
||||
|
||||
} // namespace LibLSS
|
154 extra/borg/libLSS/julia/julia_calls.cpp Normal file
@@ -0,0 +1,154 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/julia/julia_calls.cpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#include "libLSS/tools/static_init.hpp"
|
||||
#include "libLSS/tools/static_auto.hpp"
|
||||
#include <julia.h>
|
||||
#include <list>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include <iterator>
|
||||
#include <boost/algorithm/string/join.hpp>
|
||||
#include "libLSS/julia/julia.hpp"
|
||||
#include <boost/preprocessor/cat.hpp>
|
||||
#include <boost/preprocessor/repetition/repeat.hpp>
|
||||
#include <boost/preprocessor/seq/for_each.hpp>
|
||||
#include <boost/algorithm/string/classification.hpp>
|
||||
#include <boost/algorithm/string/split.hpp>
|
||||
|
||||
using namespace LibLSS;
|
||||
|
||||
using LibLSS::Julia::Object;
|
||||
|
||||
namespace LibLSS {
|
||||
namespace Julia {
|
||||
|
||||
std::string JuliaException::prepare(Object &j_obj) {
|
||||
Object pipe, data;
|
||||
|
||||
std::string msg =
|
||||
jl_typeof_str(reinterpret_cast<jl_value_t *>(j_obj.ptr()));
|
||||
msg += ": ";
|
||||
pipe = evaluate("PipeBuffer()");
|
||||
jl_call2(
|
||||
jl_get_function(jl_base_module, "showerror"),
|
||||
(jl_value_t *)pipe.ptr(), (jl_value_t *)j_obj.ptr());
|
||||
|
||||
data = jl_call1(
|
||||
jl_get_function(jl_base_module, "read"), (jl_value_t *)pipe.ptr());
|
||||
char *full_msg = (char *)jl_array_data(data.ptr());
|
||||
size_t msg_len = jl_array_dim(data.ptr(), 0);
|
||||
|
||||
std::vector<std::string> splitted_error;
|
||||
std::string s_msg(full_msg, msg_len);
|
||||
boost::algorithm::split(splitted_error, s_msg, boost::is_any_of("\n"));
|
||||
|
||||
Console::instance().print<LOG_ERROR>("Julia exception trace:");
|
||||
Console::instance().indent();
|
||||
|
||||
for (auto &one_msg : splitted_error) {
|
||||
Console::instance().print<LOG_ERROR>(one_msg);
|
||||
msg += one_msg;
|
||||
}
|
||||
Console::instance().unindent();
|
||||
Console::instance().print<LOG_ERROR>("End of Julia exception");
|
||||
|
||||
return msg;
|
||||
}
|
||||
|
||||
void handle_julia_exception() {
|
||||
Object exc = jl_exception_occurred();
|
||||
if (exc.ptr() != 0) {
|
||||
jl_exception_clear();
|
||||
throw JuliaException(std::move(exc));
|
||||
}
|
||||
}
|
||||
|
||||
Object evaluate(std::string const &code) {
|
||||
Object ret = jl_eval_string(code.c_str());
|
||||
handle_julia_exception();
|
||||
return ret;
|
||||
}
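// Usage sketch (illustrative): evaluate() is the exception-checked analogue of
// jl_eval_string(); a pending Julia exception is turned into a C++
// JuliaException by handle_julia_exception(). For example:
//
//   Object two = evaluate("1 + 1");
//   int64_t v = two.unbox<int64_t>();   // unboxes the Julia Int64 result
//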
|
||||
|
||||
bool isBadGradient(JuliaException &e) {
|
||||
return manual_invoke("libLSS._checkBadGradient", {e.getJuliaException()})
|
||||
.unbox<bool>();
|
||||
}
|
||||
|
||||
void load_file(std::string const &name) {
|
||||
std::string cmd = "Base.include(Main, \"" + name + "\");";
|
||||
Console::instance().print<LOG_DEBUG>("Loading command " + cmd);
|
||||
(void)jl_eval_string(cmd.c_str());
|
||||
handle_julia_exception();
|
||||
}
|
||||
|
||||
Object
|
||||
manual_invoke(std::string const &name, std::vector<Object> const &args) {
|
||||
Object func, ret, exc;
|
||||
|
||||
func = jl_eval_string(name.c_str());
|
||||
handle_julia_exception();
|
||||
if (func.ptr() == 0) {
|
||||
throw JuliaNotFound(name);
|
||||
}
|
||||
|
||||
{
|
||||
jl_value_t **j_args;
|
||||
JL_GC_PUSHARGS(j_args, args.size());
|
||||
for (size_t i = 0; i < args.size(); i++)
|
||||
j_args[i] = reinterpret_cast<jl_value_t *>(args[i].ptr());
|
||||
|
||||
ret = jl_call((jl_function_t *)func.ptr(), j_args, args.size());
|
||||
JL_GC_POP();
|
||||
}
|
||||
|
||||
exc = jl_exception_occurred();
|
||||
if (exc.ptr() != 0) {
|
||||
jl_exception_clear();
|
||||
throw JuliaException(std::move(exc));
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
Object manual_invoke(
|
||||
Module *module, std::string const &name,
|
||||
std::vector<Object> const &args) {
|
||||
Object ret, func;
|
||||
|
||||
func = jl_get_function((jl_module_t *)module, name.c_str());
|
||||
|
||||
if (func.ptr() == 0) {
|
||||
throw JuliaNotFound(name);
|
||||
}
|
||||
|
||||
{
|
||||
jl_value_t **j_args;
|
||||
JL_GC_PUSHARGS(j_args, args.size());
|
||||
|
||||
for (size_t i = 0; i < args.size(); i++)
|
||||
j_args[i] = reinterpret_cast<jl_value_t *>(args[i].ptr());
|
||||
|
||||
ret = jl_call((jl_function_t *)func.ptr(), j_args, args.size());
|
||||
|
||||
JL_GC_POP();
|
||||
}
|
||||
|
||||
jl_value_t *exc = jl_exception_occurred();
|
||||
if (exc != 0) {
|
||||
jl_exception_clear();
|
||||
throw JuliaException(Object(exc));
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
} // namespace Julia
|
||||
} // namespace LibLSS
|
60 extra/borg/libLSS/julia/julia_ghosts.cpp Normal file
@@ -0,0 +1,60 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/julia/julia_ghosts.cpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#include "libLSS/julia/julia.hpp"
|
||||
#include "libLSS/julia/julia_ghosts.hpp"
|
||||
#include "libLSS/julia/julia_array.hpp"
|
||||
#include "libLSS/tools/string_tools.hpp"
|
||||
|
||||
using namespace LibLSS;
|
||||
using LibLSS::Julia::helpers::_r;
|
||||
|
||||
namespace {
|
||||
void *_get_ghost_plane(void *j_ghosts, size_t plane, size_t maxN2) {
|
||||
Julia::Object o;
|
||||
|
||||
auto ghosts = (GhostPlanes<double, 2> *)j_ghosts;
|
||||
|
||||
Console::instance().print<LOG_DEBUG>(
|
||||
boost::format("Get plane %d, ghosts %p") % plane % ghosts);
|
||||
auto &g_plane = ghosts->getPlane(plane);
|
||||
o.box_array(g_plane);
|
||||
|
||||
return Julia::view_array<2>(o, {_r(1, g_plane.shape()[0]), _r(1, maxN2)})
|
||||
.ptr();
|
||||
}
|
||||
|
||||
void *_get_ag_ghost_plane(void *j_ghosts, size_t plane, size_t maxN2) {
|
||||
Julia::Object o;
|
||||
|
||||
auto ghosts = (GhostPlanes<double, 2> *)j_ghosts;
|
||||
|
||||
Console::instance().print<LOG_DEBUG>(
|
||||
boost::format("Get AG plane %d, ghosts %p") % plane % ghosts);
|
||||
auto &g_plane = ghosts->ag_getPlane(plane);
|
||||
o.box_array(g_plane);
|
||||
|
||||
return Julia::view_array<2>(o, {_r(1, g_plane.shape()[0]), _r(1, maxN2)})
|
||||
.ptr();
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
Julia::Object
|
||||
Julia::newGhostManager(GhostPlanes<double, 2> *ghosts, size_t maxN2) {
|
||||
std::vector<Object> args(4);
|
||||
|
||||
Console::instance().print<LOG_DEBUG>(
|
||||
boost::format("Setup ghosts %p") % ghosts);
|
||||
args[0].box((void *)ghosts);
|
||||
args[1].box((void *)&_get_ghost_plane);
|
||||
args[2].box((void *)&_get_ag_ghost_plane);
|
||||
args[3].box(maxN2);
|
||||
return Julia::manual_invoke("libLSS._new_ghost_plane", args);
|
||||
}
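// Usage sketch (illustrative): the returned Object is a libLSS.GhostPlanes
// instance on the Julia side; a plane is then obtained as
//
//   plane = libLSS.get_ghost_plane(ghosts, 3)   # 2d Float64 view
//
// which ccall's back into _get_ghost_plane() above with (opaque, plane, maxN2).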
|
23 extra/borg/libLSS/julia/julia_ghosts.hpp Normal file
@@ -0,0 +1,23 @@
/*+
    ARES/HADES/BORG Package -- ./extra/borg/libLSS/julia/julia_ghosts.hpp
    Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
    Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>

    Additional contributions from:
       Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)

+*/
#ifndef __LIBLSS_JULIA_GHOST_HPP
#define __LIBLSS_JULIA_GHOST_HPP

#include "libLSS/julia/julia.hpp"
#include "libLSS/tools/mpi/ghost_planes.hpp"

namespace LibLSS {
  namespace Julia {

    Object newGhostManager(GhostPlanes<double, 2> *planes, size_t maxN2);
  }
} // namespace LibLSS

#endif
|
18 extra/borg/libLSS/julia/julia_mcmc.cpp Normal file
@@ -0,0 +1,18 @@
/*+
    ARES/HADES/BORG Package -- ./extra/borg/libLSS/julia/julia_mcmc.cpp
    Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
    Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>

    Additional contributions from:
       Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)

+*/
#include "libLSS/julia/julia.hpp"
#include "libLSS/julia/julia_mcmc.hpp"
#include "libLSS/mcmc/global_state.hpp"

LibLSS::Julia::Object LibLSS::Julia::pack(MarkovState &state) {
  using namespace LibLSS::Julia;

  return invoke("libLSS._setup_state", (void *)&state);
}
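// Usage sketch (illustrative; "MyLikelihood.initialize" is a hypothetical
// Julia function): pack() hands the MarkovState pointer to Julia, where
// libLSS._setup_state() wraps it in a libLSS.State:
//
//   Julia::Object j_state = Julia::pack(state);
//   Julia::invoke("MyLikelihood.initialize", j_state);
//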
|
25 extra/borg/libLSS/julia/julia_mcmc.hpp Normal file
@@ -0,0 +1,25 @@
/*+
    ARES/HADES/BORG Package -- ./extra/borg/libLSS/julia/julia_mcmc.hpp
    Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
    Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>

    Additional contributions from:
       Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)

+*/
#ifndef __LIBLSS_TOOLS_JULIA_MCMC_HPP
#define __LIBLSS_TOOLS_JULIA_MCMC_HPP

#include "libLSS/julia/julia.hpp"
#include "libLSS/mcmc/global_state.hpp"

namespace LibLSS {

  namespace Julia {

    Object pack(MarkovState &state);
  }

} // namespace LibLSS

#endif
|
389 extra/borg/libLSS/julia/julia_module.jl Normal file
@@ -0,0 +1,389 @@
#+
|
||||
# ARES/HADES/BORG Package -- ./extra/borg/libLSS/julia/julia_module.jl
|
||||
# Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
# Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
#
|
||||
# Additional contributions from:
|
||||
# Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
#
|
||||
#+
|
||||
baremodule libLSS
|
||||
|
||||
using Base
|
||||
|
||||
module_is_setup = false
|
||||
_internal = 0
|
||||
|
||||
@enum LOG LOG_STD=0 LOG_ERROR=1 LOG_WARNING=2 LOG_INFO=3 LOG_INFO_SINGLE=4 LOG_VERBOSE=5 LOG_DEBUG=6
|
||||
|
||||
const CONSOLE_PRINT=1
|
||||
const STATE_NEW_INT=2
|
||||
const STATE_EDIT_INT=3
|
||||
const STATE_QUERY_INT=4
|
||||
const STATE_NEW_1D_INT=5
|
||||
const STATE_GET_1D_INT=6
|
||||
const STATE_RESIZE_1D_INT=7
|
||||
const STATE_1D_INT_AUTOSIZE=8
|
||||
const STATE_NEW_2D_INT=9
|
||||
const STATE_GET_2D_INT=10
|
||||
const STATE_RESIZE_2D_INT=11
|
||||
const STATE_2D_INT_AUTOSIZE=12
|
||||
const STATE_NEW_3D_INT=13
|
||||
const STATE_GET_3D_INT=14
|
||||
const STATE_RESIZE_3D_INT=15
|
||||
const STATE_3D_INT_AUTOSIZE=16
|
||||
const STATE_NEW_LONG=17
|
||||
const STATE_EDIT_LONG=18
|
||||
const STATE_QUERY_LONG=19
|
||||
const STATE_NEW_1D_LONG=20
|
||||
const STATE_GET_1D_LONG=21
|
||||
const STATE_RESIZE_1D_LONG=22
|
||||
const STATE_1D_LONG_AUTOSIZE=23
|
||||
const STATE_NEW_2D_LONG=24
|
||||
const STATE_GET_2D_LONG=25
|
||||
const STATE_RESIZE_2D_LONG=26
|
||||
const STATE_2D_LONG_AUTOSIZE=27
|
||||
const STATE_NEW_3D_LONG=28
|
||||
const STATE_GET_3D_LONG=29
|
||||
const STATE_RESIZE_3D_LONG=30
|
||||
const STATE_3D_LONG_AUTOSIZE=31
|
||||
const STATE_NEW_DBL=32
|
||||
const STATE_EDIT_DBL=33
|
||||
const STATE_QUERY_DBL=34
|
||||
const STATE_NEW_1D_DBL=35
|
||||
const STATE_GET_1D_DBL=36
|
||||
const STATE_RESIZE_1D_DBL=37
|
||||
const STATE_1D_DBL_AUTOSIZE=38
|
||||
const STATE_NEW_2D_DBL=39
|
||||
const STATE_GET_2D_DBL=40
|
||||
const STATE_RESIZE_2D_DBL=41
|
||||
const STATE_2D_DBL_AUTOSIZE=42
|
||||
const STATE_NEW_3D_DBL=43
|
||||
const STATE_GET_3D_DBL=44
|
||||
const STATE_RESIZE_3D_DBL=45
|
||||
const STATE_3D_DBL_AUTOSIZE=46
|
||||
const RANDOM_UNIFORM=47
|
||||
const RANDOM_GAUSSIAN=48
|
||||
const GET_GALAXY_DESCRIPTOR=49
|
||||
const CONSOLE_PROGRESS_START=50
|
||||
const CONSOLE_PROGRESS_STEP=51
|
||||
const CONSOLE_PROGRESS_END=52
|
||||
const MODEL_SET_REQUEST_IO=53
|
||||
const MODEL_INPUT_GET_REAL=54
|
||||
const MODEL_INPUT_GET_FOURIER=55
|
||||
const MODEL_OUTPUT_GET_REAL=56
const MODEL_OUTPUT_GET_FOURIER=57
|
||||
|
||||
struct DimensionSpec{N} end
|
||||
|
||||
d1d = DimensionSpec{1}()
|
||||
d2d = DimensionSpec{2}()
|
||||
d3d = DimensionSpec{3}()
|
||||
|
||||
export d1d, d2d, d3d
|
||||
|
||||
_code_new(::Type{Cint}) = STATE_NEW_INT
|
||||
_code_new(::Type{Cdouble}) = STATE_NEW_DBL
|
||||
_code_new(::Type{Clong}) = STATE_NEW_LONG
|
||||
|
||||
_code_query(::Type{Cdouble}) = STATE_QUERY_DBL
|
||||
_code_edit(::Type{Cdouble}) = STATE_EDIT_DBL
|
||||
|
||||
_code_query(::Type{Clong}) = STATE_QUERY_LONG
|
||||
_code_edit(::Type{Clong}) = STATE_EDIT_LONG
|
||||
|
||||
_code_query(::Type{Cint}) = STATE_QUERY_INT
|
||||
_code_edit(::Type{Cint}) = STATE_EDIT_INT
|
||||
|
||||
_code_new_1d(::Type{Cint}) = STATE_NEW_1D_INT
|
||||
_code_get_1d(::Type{Cint}) = STATE_GET_1D_INT
|
||||
_code_resize_1d(::Type{Cint}) = STATE_RESIZE_1D_INT
|
||||
_code_1d_autosize(::Type{Cint}) = STATE_1D_INT_AUTOSIZE
|
||||
_code_new_2d(::Type{Cint}) = STATE_NEW_2D_INT
|
||||
_code_get_2d(::Type{Cint}) = STATE_GET_2D_INT
|
||||
_code_resize_2d(::Type{Cint}) = STATE_RESIZE_2D_INT
|
||||
_code_2d_autosize(::Type{Cint}) = STATE_2D_INT_AUTOSIZE
|
||||
_code_new_3d(::Type{Cint}) = STATE_NEW_3D_INT
|
||||
_code_get_3d(::Type{Cint}) = STATE_GET_3D_INT
|
||||
_code_resize_3d(::Type{Cint}) = STATE_RESIZE_3D_INT
|
||||
_code_3d_autosize(::Type{Cint}) = STATE_3D_INT_AUTOSIZE
|
||||
|
||||
_code_new_1d(::Type{Clong}) = STATE_NEW_1D_LONG
|
||||
_code_get_1d(::Type{Clong}) = STATE_GET_1D_LONG
|
||||
_code_resize_1d(::Type{Clong}) = STATE_RESIZE_1D_LONG
|
||||
_code_1d_autosize(::Type{Clong}) = STATE_1D_LONG_AUTOSIZE
|
||||
_code_new_2d(::Type{Clong}) = STATE_NEW_2D_LONG
|
||||
_code_get_2d(::Type{Clong}) = STATE_GET_2D_LONG
|
||||
_code_resize_2d(::Type{Clong}) = STATE_RESIZE_2D_LONG
|
||||
_code_2d_autosize(::Type{Clong}) = STATE_2D_LONG_AUTOSIZE
|
||||
_code_new_3d(::Type{Clong}) = STATE_NEW_3D_LONG
|
||||
_code_get_3d(::Type{Clong}) = STATE_GET_3D_LONG
|
||||
_code_resize_3d(::Type{Clong}) = STATE_RESIZE_3D_LONG
|
||||
_code_3d_autosize(::Type{Clong}) = STATE_3D_LONG_AUTOSIZE
|
||||
|
||||
_code_new_1d(::Type{Cdouble}) = STATE_NEW_1D_DBL
|
||||
_code_get_1d(::Type{Cdouble}) = STATE_GET_1D_DBL
|
||||
_code_resize_1d(::Type{Cdouble}) = STATE_RESIZE_1D_DBL
|
||||
_code_1d_autosize(::Type{Cdouble}) = STATE_1D_DBL_AUTOSIZE
|
||||
_code_new_2d(::Type{Cdouble}) = STATE_NEW_2D_DBL
|
||||
_code_get_2d(::Type{Cdouble}) = STATE_GET_2D_DBL
|
||||
_code_resize_2d(::Type{Cdouble}) = STATE_RESIZE_2D_DBL
|
||||
_code_2d_autosize(::Type{Cdouble}) = STATE_2D_DBL_AUTOSIZE
|
||||
_code_new_3d(::Type{Cdouble}) = STATE_NEW_3D_DBL
|
||||
_code_get_3d(::Type{Cdouble}) = STATE_GET_3D_DBL
|
||||
_code_resize_3d(::Type{Cdouble}) = STATE_RESIZE_3D_DBL
|
||||
_code_3d_autosize(::Type{Cdouble}) = STATE_3D_DBL_AUTOSIZE
|
||||
|
||||
|
||||
struct AlreadyInitialized <: Exception
|
||||
end
|
||||
|
||||
Cptr = Ptr{Nothing}
|
||||
|
||||
struct State
|
||||
opaque::Cptr
|
||||
end
|
||||
|
||||
export State, AlreadyInitialized
|
||||
|
||||
function new(state::State, element::String, value::T; synchronous::Bool=false, mcmc_save::Bool=false) where {T}
|
||||
ccall(_internal[_code_new(T)], Nothing, (Cptr, Cstring, Ref{T}, Cint, Cint), state.opaque, element, value, synchronous, mcmc_save)
|
||||
end
|
||||
put(state::State, element::String, value::T; synchronous::Bool=false) where {T} =
|
||||
ccall(_internal[_code_edit(T)], Nothing, (Cptr, Cstring, Ref{T}, Cint), state.opaque, element, value, synchronous)
|
||||
function get(state::State, element::String, ::Type{T}; synchronous::Bool=false ) where {T}
|
||||
y = Ref{T}(0)
|
||||
ccall(_internal[_code_query(T)], Nothing, (Cptr, Cstring, Ref{T}, Cint), state.opaque, element, y, synchronous)
|
||||
y[]
|
||||
end
|
||||
function new_array(state::State, element::String, size::Int, ::Type{T}, cppOrder=true; mcmc_save::Bool=false) where {T}
|
||||
ptr_array = ccall(_internal[_code_new_1d(T)], Ptr{T}, (Cptr, Cstring, Csize_t, Cint), state.opaque, element, Csize_t(size), mcmc_save)
|
||||
_array_reorder(unsafe_wrap(Array, ptr_array, size), cppOrder)
|
||||
end
|
||||
function new_array(state::State, element::String, size::NTuple{2, Int}, ::Type{T}, cppOrder=true; mcmc_save::Bool=false) where {T}
|
||||
swapped_size = _swap_order(size, cppOrder)
|
||||
ptr_array = ccall(_internal[_code_new_2d(T)], Ptr{T}, (Cptr, Cstring, Csize_t, Csize_t, Cint), state.opaque, element, swapped_size[1], swapped_size[2], mcmc_save)
|
||||
_array_reorder(unsafe_wrap(Array, ptr_array, size), cppOrder)
|
||||
end
|
||||
function new_array(state::State, element::String, size::NTuple{3, Int}, ::Type{T}, cppOrder=true; mcmc_save::Bool=false) where {T}
|
||||
swapped_size = _swap_order(size, cppOrder)
|
||||
ptr_array = ccall(_internal[_code_new_3d(T)], Ptr{T}, (Cptr, Cstring, Csize_t, Csize_t, Csize_t, Cint), state.opaque, element, swapped_size[1], swapped_size[2], swapped_size[3], mcmc_save)
|
||||
_array_reorder(unsafe_wrap(Array, ptr_array, size), cppOrder)
|
||||
end
|
||||
function get_array_1d(state::State, element::String, ::Type{T}, cppOrder=true) where {T}
|
||||
a_size = Vector{Csize_t}(undef, 1)
|
||||
ptr_array = ccall(_internal[_code_get_1d(T)], Ptr{T}, (Cptr, Cstring, Ptr{Csize_t}), state.opaque, element, a_size)
|
||||
_array_reorder(unsafe_wrap(Array, ptr_array, a_size[1]), cppOrder)
|
||||
end
|
||||
function get_array_2d(state::State, element::String, ::Type{T}, cppOrder=true) where {T}
|
||||
a_size = Vector{Csize_t}(undef, 2)
|
||||
ptr_array = ccall(_internal[_code_get_2d(T)], Ptr{T}, (Cptr, Cstring, Ptr{Csize_t}), state.opaque, element, a_size)
|
||||
_array_reorder(unsafe_wrap(Array, ptr_array, (a_size[1],a_size[2])), cppOrder)
|
||||
end
|
||||
function get_array_3d(state::State, element::String, ::Type{T}, cppOrder=true) where {T}
|
||||
a_size = Vector{Csize_t}(undef, 3)
|
||||
ptr_array = ccall(_internal[_code_get_3d(T)], Ptr{T}, (Cptr, Cstring, Ptr{Csize_t}), state.opaque, element, a_size)
|
||||
_array_reorder(unsafe_wrap(Array, ptr_array, (a_size[1],a_size[2],a_size[3])), cppOrder)
|
||||
end
|
||||
get_array(state::State, element::String, ::Type{T}, ::DimensionSpec{1}, cppOrder=true) where {T}=get_array_1d(state, element, T, cppOrder)
|
||||
get_array(state::State, element::String, ::Type{T}, ::DimensionSpec{2}, cppOrder=true) where {T}=get_array_2d(state, element, T, cppOrder)
|
||||
get_array(state::State, element::String, ::Type{T}, ::DimensionSpec{3}, cppOrder=true) where {T}=get_array_3d(state, element, T, cppOrder)
|
||||
|
||||
function resize_array(state::State, element::String, size::Int, ::Type{T}, cppOrder=true) where{T}
|
||||
ptr_array = ccall(_internal[_code_resize_1d(T)], Ptr{T}, (Cptr, Cstring, Csize_t), state.opaque, element, Csize_t(size))
|
||||
_array_reorder(unsafe_wrap(Array, ptr_array, size), cppOrder)
|
||||
end
|
||||
function resize_array(state::State, element::String, size::NTuple{2, Int}, ::Type{T}; cppOrder=true) where{T}
|
||||
ptr_array = ccall(_internal[_code_resize_2d(T)], Ptr{T}, (Cptr, Cstring, Csize_t, Csize_t), state.opaque, element, Csize_t(size[1]), Csize_t(size[2]))
|
||||
_array_reorder(unsafe_wrap(Array, ptr_array, size), cppOrder)
|
||||
end
|
||||
function resize_array(state::State, element::String, size::NTuple{3, Int}, ::Type{T}; cppOrder=true) where{T}
|
||||
ptr_array = ccall(_internal[_code_resize_3d(T)], Ptr{T}, (Cptr, Cstring, Csize_t, Csize_t, Csize_t), state.opaque, element, Csize_t(size[1]), Csize_t(size[2]), Csize_t(size[3]))
|
||||
_array_reorder(unsafe_wrap(Array, ptr_array, size), cppOrder)
|
||||
end
|
||||
function autosize_array(state::State, element::String, b::Bool, ::Type{T}, ::DimensionSpec{1}) where {T}
|
||||
ccall(_internal[_code_1d_autosize(T)], Cvoid, (Cptr, Cstring, Cint), state.opaque, element, Cint(b))
|
||||
end
|
||||
function autosize_array(state::State, element::String, b::Bool, ::Type{T}, ::DimensionSpec{2}) where {T}
|
||||
ccall(_internal[_code_2d_autosize(T)], Cvoid, (Cptr, Cstring, Cint), state.opaque, element, Cint(b))
|
||||
end
|
||||
function autosize_array(state::State, element::String, b::Bool, ::Type{T}, ::DimensionSpec{3}) where {T}
|
||||
ccall(_internal[_code_3d_autosize(T)], Cvoid, (Cptr, Cstring, Cint), state.opaque, element, Cint(b))
|
||||
end
|
||||
|
||||
export new, put, get, new_array, get_array_1d, get_array_2d, get_array_3d, get_array, resize_array, autosize_array
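# Usage sketch (illustrative only; the entry names below are hypothetical):
#
#   function my_initialize(state::State)
#       new(state, "my_nmean", Float64(0.1), mcmc_save=true)
#       nmean = get(state, "my_nmean", Float64)
#       rho = get_array(state, "my_density", Float64, d3d)
#       put(state, "my_nmean", 2.0 * nmean)
#   end
#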
|
||||
|
||||
print(level::LOG, msg::String) =
|
||||
ccall(_internal[CONSOLE_PRINT], Nothing, (Cint, Cstring), Int(level), msg)
|
||||
|
||||
export print, LOG_STD, LOG_ERROR, LOG_WARNING, LOG_INFO, LOG_INFO_SINGLE, LOG_VERBOSE, LOG_DEBUG
|
||||
|
||||
function progress_start(level::LOG, msg::String, steps::Int, func)
|
||||
p = ccall(_internal[CONSOLE_PROGRESS_START], Cptr, (Cint, Cstring, Cint), Int(level), msg, steps)
|
||||
for i in 1:steps
|
||||
func(i)
|
||||
ccall(_internal[CONSOLE_PROGRESS_STEP], Nothing, (Cptr,), p)
|
||||
end
|
||||
ccall(_internal[CONSOLE_PROGRESS_END], Nothing, (Cptr,), p)
|
||||
Nothing
|
||||
end
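# Usage sketch (illustrative): drive a loop with console progress reporting;
# the callback runs once per step before the progress bar is advanced:
#
#   progress_start(LOG_INFO, "Processing planes", 10,
#                  i -> print(LOG_DEBUG, "plane $(i)"))
#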
|
||||
|
||||
# Extract "galaxy" information
|
||||
|
||||
# WARNING: the GalaxyDescriptor struct must mirror the corresponding structure in libLSS/data/galaxies.hpp
|
||||
struct GalaxyDescriptor
|
||||
id::Clonglong
|
||||
phi::Cdouble
|
||||
theta::Cdouble
|
||||
zo::Cdouble
|
||||
m::Cdouble
|
||||
M_abs::Cdouble
|
||||
Mgal::Cdouble
|
||||
z::Cdouble
|
||||
r::Cdouble
|
||||
w::Cdouble
|
||||
final_w::Cdouble
|
||||
radius::Cdouble
|
||||
spin::Cdouble
|
||||
posx::Cdouble
|
||||
posy::Cdouble
|
||||
posz::Cdouble
|
||||
vx::Cdouble
|
||||
vy::Cdouble
|
||||
vz::Cdouble
|
||||
end
|
||||
|
||||
function get_galaxy_descriptor(state::State, id)
|
||||
sz = Vector{Cint}(undef, 1)
|
||||
ptr_array = ccall(_internal[GET_GALAXY_DESCRIPTOR], Ptr{GalaxyDescriptor}, (Cptr, Cint, Cptr), state.opaque, Cint(id), sz)
|
||||
unsafe_wrap(Array, ptr_array, sz[1])
|
||||
end
|
||||
|
||||
export get_galaxy_descriptor
|
||||
|
||||
# Random numbers
|
||||
uniform(state::State) =
|
||||
ccall(_internal[RANDOM_UNIFORM], Cdouble, (Cptr,), state.opaque)
|
||||
gaussian(state::State) =
|
||||
ccall(_internal[RANDOM_GAUSSIAN], Cdouble, (Cptr,), state.opaque)
|
||||
|
||||
export uniform, gaussian
|
||||
|
||||
struct BoxModel
|
||||
L::Tuple{Float64, Float64, Float64}
|
||||
N::Tuple{UInt64, UInt64, UInt64}
|
||||
end
|
||||
|
||||
_setup_state(ptr_state::Ptr{Nothing}) = State(ptr_state)
|
||||
|
||||
function _setup_module(entries)
|
||||
global _internal
|
||||
global module_is_setup
|
||||
|
||||
if module_is_setup
|
||||
throw(AlreadyInitialized())
|
||||
end
|
||||
_internal = entries
|
||||
module_is_setup = true
|
||||
|
||||
true
|
||||
end
|
||||
|
||||
struct BadGradient <: Exception
|
||||
end
|
||||
|
||||
export BadGradient
|
||||
|
||||
_checkBadGradient(e::BadGradient) = true
|
||||
_checkBadGradient(e) = false
|
||||
|
||||
struct GhostPlanes
|
||||
opaque::Ptr{Nothing}
|
||||
access::Ptr{Nothing}
|
||||
ag_access::Ptr{Nothing}
|
||||
maxN2::UInt64
|
||||
end
|
||||
|
||||
function _new_ghost_plane(ghost_obj::Ptr{Nothing}, ghost_access::Ptr{Nothing}, ghost_ag_access::Ptr{Nothing}, maxN2::Csize_t)
|
||||
return GhostPlanes(ghost_obj, ghost_access, ghost_ag_access, maxN2)
|
||||
end
|
||||
|
||||
function get_ag_ghost_plane(ghosts::GhostPlanes, plane)
|
||||
return ccall(ghosts.ag_access, AbstractArray{Float64,2}, (Cptr,Csize_t,Csize_t), ghosts.opaque, plane, ghosts.maxN2)
|
||||
end
|
||||
|
||||
function get_ghost_plane(ghosts::GhostPlanes, plane)
|
||||
return ccall(
|
||||
ghosts.access, AbstractArray{Float64,2},
|
||||
(Cptr,Csize_t,Csize_t), ghosts.opaque, plane, ghosts.maxN2)
|
||||
end
|
||||
|
||||
export GhostPlanes, get_ag_ghost_plane, get_ghost_plane
|
||||
|
||||
function _array_reorder(a::X, cppOrder::Bool) where {X<:AbstractArray{T,N}} where {T,N}
|
||||
if cppOrder
|
||||
return PermutedDimsArray(a, N:-1:1)
|
||||
else
|
||||
return PermutedDimsArray(a, 1:N) # To preserve type transparency
|
||||
end
|
||||
end
|
||||
|
||||
function _swap_order(Ns::NTuple{M,Int}, cppOrder::Bool) where {M}
|
||||
if cppOrder
|
||||
return Ns
|
||||
else
|
||||
return reverse(Ns)
|
||||
end
|
||||
end
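# Illustrative note (not in the original file): C++ buffers are row-major while
# Julia arrays are column-major, so _array_reorder/_swap_order present a C++
# (N0,N1) buffer with its indices in C++ order, e.g.
#
#   a = reshape(collect(1.0:6.0), (3, 2))   # raw column-major wrap
#   b = _array_reorder(a, true)             # size(b) == (2, 3)
#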
|
||||
|
||||
abstract type ModelIO{N}
|
||||
end
|
||||
|
||||
struct ModelInput{N} <: ModelIO{N}
|
||||
opaque::Cptr
|
||||
end
|
||||
|
||||
struct ModelOutput{N} <: ModelIO{N}
|
||||
opaque::Cptr
|
||||
end
|
||||
|
||||
@enum IOType IO_REAL=0 IO_FOURIER=1
|
||||
|
||||
export ModelInput, ModelOutput
|
||||
|
||||
function _get_opaque_model(m::ModelInput{N}) where {N<:Int}
|
||||
m.opaque
|
||||
end
|
||||
|
||||
function _get_opaque_model(m::ModelOutput{N}) where {N<:Int}
|
||||
m.opaque
|
||||
end
|
||||
|
||||
function _new_model_input(opaque::Cptr) where {N<:Int}
|
||||
ModelInput{N}(opaque)
|
||||
end
|
||||
|
||||
function _new_model_output(opaque::Cptr) where {N<:Int}
|
||||
ModelOutput{N}(opaque)
|
||||
end
|
||||
|
||||
function requestIO(m::ModelIO{N}, io::IOType) where {N<:Int}
|
||||
ccall(_internal[MODEL_SET_REQUEST_IO], Nothing, (Cptr,Cint,Cint), _get_opaque_model(m), N, Int(io))
|
||||
end
|
||||
|
||||
function getReal(m::ModelInput{N}) where {N<:Int}
|
||||
ccall(_internal[MODEL_INPUT_GET_REAL], AbstractArray{Float64,N}, (Cptr,Cint), m.opaque, Int(N))
|
||||
end
|
||||
|
||||
function getFourier(m::ModelInput{N}) where {N<:Int}
|
||||
ccall(_internal[MODEL_INPUT_GET_FOURIER], AbstractArray{ComplexF64,N}, (Cptr,Cint), m.opaque, Int(N))
|
||||
end
|
||||
|
||||
function getReal(m::ModelOutput{N}) where {N<:Int}
|
||||
ccall(_internal[MODEL_OUTPUT_GET_REAL], AbstractArray{Float64,N}, (Cptr,Cint), m.opaque, Int(N))
|
||||
end
|
||||
|
||||
function getFourier(m::ModelOutput{N}) where {N<:Int}
|
||||
ccall(_internal[MODEL_OUTPUT_GET_FOURIER], AbstractArray{ComplexF64,N}, (Cptr,Cint), m.opaque, Int(N))
|
||||
end
|
||||
end
|
158 extra/borg/libLSS/physics/adapt_classic_to_gauss.hpp Normal file
@@ -0,0 +1,158 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/adapt_classic_to_gauss.hpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_ADAPT_CLASSIC_TO_GAUSS_HPP
|
||||
#define __LIBLSS_ADAPT_CLASSIC_TO_GAUSS_HPP
|
||||
|
||||
#include "libLSS/tools/fusewrapper.hpp"
|
||||
#include "libLSS/physics/bias/base.hpp"
|
||||
#include "libLSS/physics/bias/power_law.hpp"
|
||||
|
||||
// This class extends a classic bias function to support a Gaussian noise model.
// It does so by adding a noise parameter and repackaging the tuples.
|
||||
#include <cxxabi.h>
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
namespace bias {
|
||||
|
||||
template <typename T>
|
||||
inline T copy_if_rref(T const &&x) {
|
||||
return x;
|
||||
}
|
||||
template <typename T>
|
||||
inline auto copy_if_rref(T &x) -> decltype(std::ref(x)) {
|
||||
return std::ref(x);
|
||||
}
|
||||
|
||||
struct NoiseModulatedAdaptor {
|
||||
template <typename SelectArray, typename BiasedArray>
|
||||
inline auto apply(SelectArray &&select, BiasedArray const &bias)
|
||||
-> decltype(std::make_tuple(
|
||||
b_va_fused<double>(
|
||||
_p1 * _p2, std::forward<SelectArray>(select),
|
||||
std::move(std::get<0>(bias))),
|
||||
b_va_fused<double>(
|
||||
_p1 *_p2, std::forward<SelectArray>(select),
|
||||
std::move(std::get<1>(bias))))) {
|
||||
// Take a biased density x noise and multiply both by selection
|
||||
return std::make_tuple(
|
||||
b_va_fused<double>(
|
||||
_p1 * _p2, std::forward<SelectArray>(select),
|
||||
std::move(std::get<0>(bias))),
|
||||
b_va_fused<double>(
|
||||
_p1 * _p2, std::forward<SelectArray>(select),
|
||||
std::move(std::get<1>(bias))));
|
||||
}
|
||||
|
||||
template <
|
||||
typename LikelihoodGradient, typename SelectArray,
|
||||
typename BiasedArray>
|
||||
inline auto adjoint_gradient(
|
||||
LikelihoodGradient ag_like, SelectArray &&select, BiasedArray const &)
|
||||
-> decltype(std::tuple_cat(
|
||||
std::make_tuple(b_va_fused<double>(
|
||||
_p1 * _p2, std::forward<SelectArray>(select),
|
||||
std::move(std::get<0>(ag_like)))),
|
||||
last_of_tuple<1>(ag_like))) {
|
||||
// In practice the adjoint gradient operator is the same as simple selection
|
||||
return std::tuple_cat(
|
||||
std::make_tuple(b_va_fused<double>(
|
||||
_p1 * _p2, std::forward<SelectArray>(select),
|
||||
std::move(std::get<0>(ag_like)))),
|
||||
last_of_tuple<1>(ag_like));
|
||||
}
|
||||
};
|
||||
} // namespace bias
|
||||
|
||||
template <typename T>
|
||||
struct AdaptBias_Gauss {
|
||||
static const auto numParams = 1 + T::numParams;
|
||||
static constexpr const bool NmeanIsBias = T::NmeanIsBias;
|
||||
|
||||
T classic;
|
||||
double r;
|
||||
|
||||
bias::NoiseModulatedAdaptor selection_adaptor;
|
||||
// bias::SimpleAdaptor selection_adaptor;
|
||||
//
|
||||
AdaptBias_Gauss(LikelihoodInfo const& info = LikelihoodInfo()) : classic(info) {}
|
||||
|
||||
template <typename BiasParameters>
|
||||
static inline void setup_default(BiasParameters ¶ms) {
|
||||
T::setup_default(params);
|
||||
params[T::numParams] = 3.; //0.002;
|
||||
}
|
||||
|
||||
template <typename BiasParameters>
|
||||
inline double log_prior_params(BiasParameters const ¶ms) const {
|
||||
return bias::bias_get_log_prior(classic, params);
|
||||
}
|
||||
|
||||
// This adaptor copies the relevant Gaussian bias parameter before handing
// control further down to the wrapped bias model.
// It does not need to optimize anything away based on the selector.
// Arguments:
//  * fwd_model: current full forward model
//  * final_density: final matter density obtained by the forward model
//  * params: current bias parameters
//  * select: currently sampled bias parameter, NoSelector by default. This
//    argument allows optimizations when generating the bias function
//    (i.e. trimming components, or avoiding some re-evaluation on later calls).
|
||||
template <
|
||||
class ForwardModel, typename FinalDensityArray, typename BiasParameters,
|
||||
typename MetaSelector = NoSelector>
|
||||
inline void prepare(
|
||||
ForwardModel &fwd_model, const FinalDensityArray &final_density,
|
||||
double const nmean_, const BiasParameters ¶ms, bool density_updated,
|
||||
MetaSelector select = MetaSelector()) {
|
||||
classic.prepare(
|
||||
fwd_model, final_density, nmean_, params, density_updated, select);
|
||||
r = params[T::numParams];
|
||||
}
|
||||
|
||||
inline void cleanup() { classic.cleanup(); }
|
||||
|
||||
inline double get_linear_bias() const { return classic.get_linear_bias(); }
|
||||
|
||||
template <typename Array>
|
||||
inline bool check_bias_constraints(Array &&a) {
|
||||
return T::check_bias_constraints(a) && (a[T::numParams] > 0) &&
|
||||
(a[T::numParams] < 10000);
|
||||
}
|
||||
|
||||
template <typename FinalDensityArray>
|
||||
inline auto compute_density(const FinalDensityArray &array)
|
||||
-> decltype(std::make_tuple(
|
||||
std::move(std::get<0>(classic.compute_density(array))),
|
||||
b_va_fused<double, 3>(
|
||||
FuseWrapper_detail::constantFunctor<double>(r)))) {
|
||||
return std::make_tuple(
|
||||
std::move(std::get<0>(classic.compute_density(
|
||||
array))), // Grab the biased density from the parent bias function.
|
||||
b_va_fused<double, 3>(FuseWrapper_detail::constantFunctor<double>(
|
||||
r)) // Add a constant noise array
|
||||
);
|
||||
}
|
||||
|
||||
template <typename FinalDensityArray, typename TupleGradientLikelihoodArray>
|
||||
inline auto apply_adjoint_gradient(
|
||||
const FinalDensityArray &array, TupleGradientLikelihoodArray grad_array)
|
||||
-> decltype(classic.apply_adjoint_gradient(
|
||||
array, std::make_tuple(std::move(std::get<0>(grad_array))))) {
|
||||
BOOST_STATIC_ASSERT(
|
||||
(std::tuple_size<TupleGradientLikelihoodArray>::value == 1));
|
||||
return classic.apply_adjoint_gradient(
|
||||
array,
|
||||
std::make_tuple(std::move(std::get<0>(
|
||||
grad_array)))); // Pass down the first component of the AG to the bias function.
|
||||
}
|
||||
};
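// Usage sketch (illustrative, assuming the power-law bias functor included
// above is exposed as bias::PowerLaw): the adaptor appends one noise parameter
// at index T::numParams and exposes it as a constant auxiliary field:
//
//   using GaussianPowerLaw = AdaptBias_Gauss<bias::PowerLaw>;
//   // GaussianPowerLaw::numParams == bias::PowerLaw::numParams + 1
//
// compute_density() then returns (biased density, constant noise field r).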
|
||||
} // namespace LibLSS
|
||||
|
||||
#endif
|
96 extra/borg/libLSS/physics/adapt_classic_to_nb.hpp Normal file
@@ -0,0 +1,96 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/adapt_classic_to_nb.hpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_ADAPT_CLASSIC_TO_NB_HPP
|
||||
#define __LIBLSS_ADAPT_CLASSIC_TO_NB_HPP
|
||||
|
||||
#include "libLSS/tools/fusewrapper.hpp"
|
||||
#include "libLSS/physics/bias/base.hpp"
|
||||
|
||||
// This class extends a classic bias function to support negative binomial. It does so by adding
|
||||
// a parameter and repackaging the tuples
|
||||
#include <cxxabi.h>
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
template <typename T>
|
||||
struct AdaptBias_NB {
|
||||
static const auto numParams = 1 + T::numParams;
|
||||
static constexpr const bool NmeanIsBias = T::NmeanIsBias;
|
||||
|
||||
T classic;
|
||||
double r;
|
||||
|
||||
selection::SimpleAdaptor selection_adaptor;
|
||||
|
||||
AdaptBias_NB(LikelihoodInfo const& info = LikelihoodInfo()) : classic(info) {}
|
||||
|
||||
template <typename BiasParameters>
|
||||
static inline void setup_default(BiasParameters ¶ms) {
|
||||
T::setup_default(params);
|
||||
params[T::numParams] = 0.1; //0.002;
|
||||
}
|
||||
|
||||
// This adaptor copies the relevant negative-binomial bias parameter before
// handing control further down to the wrapped bias model.
// It does not need to optimize anything away based on the selector.
// Arguments:
//  * fwd_model: current full forward model
//  * final_density: final matter density obtained by the forward model
//  * params: current bias parameters
//  * select: currently sampled bias parameter, NoSelector by default. This
//    argument allows optimizations when generating the bias function
//    (i.e. trimming components, or avoiding some re-evaluation on later calls).
|
||||
template <
|
||||
class ForwardModel, typename FinalDensityArray, typename BiasParameters,
|
||||
typename MetaSelector = NoSelector>
|
||||
inline void prepare(
|
||||
ForwardModel &fwd_model, const FinalDensityArray &final_density,
|
||||
double const nmean_, const BiasParameters ¶ms, bool density_updated,
|
||||
MetaSelector select = MetaSelector()) {
|
||||
classic.prepare(
|
||||
fwd_model, final_density, nmean_, params, density_updated, select);
|
||||
r = params[T::numParams];
|
||||
}
|
||||
|
||||
inline void cleanup() { classic.cleanup(); }
|
||||
|
||||
inline double get_linear_bias() const { return classic.get_linear_bias(); }
|
||||
|
||||
template <typename Array>
|
||||
inline bool check_bias_constraints(Array &&a) {
|
||||
return T::check_bias_constraints(a) && (a[T::numParams] > 0) &&
|
||||
(a[T::numParams] < 10000);
|
||||
}
|
||||
|
||||
template <typename FinalDensityArray>
|
||||
inline auto compute_density(const FinalDensityArray &array)
|
||||
-> decltype(std::make_tuple(
|
||||
std::move(std::get<0>(classic.compute_density(array))),
|
||||
b_va_fused<double, 3>(
|
||||
FuseWrapper_detail::constantFunctor<double>(r)))) {
|
||||
return std::make_tuple(
|
||||
std::move(std::get<0>(classic.compute_density(array))),
|
||||
b_va_fused<double, 3>(
|
||||
FuseWrapper_detail::constantFunctor<double>(r)));
|
||||
}
|
||||
|
||||
template <typename FinalDensityArray, typename TupleGradientLikelihoodArray>
|
||||
inline auto apply_adjoint_gradient(
|
||||
const FinalDensityArray &array, TupleGradientLikelihoodArray grad_array)
|
||||
-> decltype(classic.apply_adjoint_gradient(
|
||||
array, std::make_tuple(std::move(std::get<0>(grad_array))))) {
|
||||
BOOST_STATIC_ASSERT(
|
||||
(std::tuple_size<TupleGradientLikelihoodArray>::value == 1));
|
||||
return classic.apply_adjoint_gradient(
|
||||
array, std::make_tuple(std::move(std::get<0>(grad_array))));
|
||||
}
|
||||
};
|
||||
} // namespace LibLSS
|
||||
|
||||
#endif
|
725 extra/borg/libLSS/physics/bias/2nd_order_bias.hpp Normal file
@@ -0,0 +1,725 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/bias/2nd_order_bias.hpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_PHYSICS_2ND_ORDER_BIAS_HPP
|
||||
#define __LIBLSS_PHYSICS_2ND_ORDER_BIAS_HPP
|
||||
|
||||
// This header provides the implementation of the LSS bias model to second order PT.
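// Schematically (see density_lambda() below), the galaxy density is modelled as
//   n_g(x) = nmean * T( 1 + b1*delta + (b2/2)*delta^2 + (bk/2)*K^2 + r2*laplace(delta) ),
// where K^2 is the squared traceless tidal field (tidal_sqr below) and T is the
// thresholder() keeping its argument above EPSILON_VOIDS.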
|
||||
|
||||
#include <cmath>
|
||||
#include <boost/bind.hpp>
|
||||
#include <boost/format.hpp>
|
||||
#include "libLSS/tools/fused_array.hpp"
|
||||
#include <tuple>
|
||||
#include "libLSS/tools/phoenix_vars.hpp"
|
||||
#include <boost/phoenix/operator.hpp>
|
||||
#include <boost/phoenix/stl/cmath.hpp>
|
||||
#include "libLSS/tools/tuple_helper.hpp"
|
||||
#include "libLSS/physics/bias/base.hpp"
|
||||
#include "libLSS/physics/bias/power_law.hpp"
|
||||
#include "libLSS/tools/fftw_allocator.hpp"
|
||||
#include "libLSS/samplers/core/powerspec_tools.hpp"
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
namespace bias {
|
||||
|
||||
namespace detail_SecondOrderBias {
|
||||
|
||||
using boost::format;
|
||||
namespace ph = std::placeholders;
|
||||
|
||||
typedef FFTW_Manager_3d<double> DFT_Manager;
|
||||
typedef Uninit_FFTW_Real_Array U_Array;
|
||||
typedef Uninit_FFTW_Complex_Array U_CArray;
|
||||
typedef Uninit_FFTW_Real_Array::array_type U_ArrayRef;
|
||||
typedef Uninit_FFTW_Complex_Array::array_type U_CArrayRef;
|
||||
|
||||
struct SecondOrderBias {
|
||||
|
||||
// By default just do truncation, otherwise a smooth cut can be applied
|
||||
static constexpr const bool NmeanIsBias = false;
|
||||
static const bool SHARP_THRESHOLDER = true;
|
||||
static constexpr double EPSILON_VOIDS = 1e-6;
|
||||
static const auto numParams = 4;
|
||||
long N0;
|
||||
long N1;
|
||||
long N2;
|
||||
long N2_HC;
|
||||
long startN0;
|
||||
long localN0;
|
||||
double L0;
|
||||
double L1;
|
||||
double L2;
|
||||
double nmean;
|
||||
double b1;
|
||||
double b2;
|
||||
double bk;
|
||||
double r2;
|
||||
|
||||
MPI_Communication *comm;
|
||||
DFT_Manager *mgr;
|
||||
FCalls::plan_type analysis_plan;
|
||||
FCalls::plan_type synthesis_plan;
|
||||
|
||||
U_Array *delta_sqr_arr;
|
||||
U_Array *tidal_00_arr, *tidal_11_arr, *tidal_22_arr, *tidal_01_arr,
|
||||
*tidal_02_arr, *tidal_12_arr, *tidal_sqr_arr;
|
||||
U_Array *laplace_delta_arr;
|
||||
U_ArrayRef *delta_sqr_ref;
|
||||
U_ArrayRef *tidal_00_ref, *tidal_11_ref, *tidal_22_ref, *tidal_01_ref,
|
||||
*tidal_02_ref, *tidal_12_ref, *tidal_sqr_ref;
|
||||
U_ArrayRef *laplace_delta_ref;
|
||||
|
||||
U_Array *t00_dt00_arr, *t11_dt11_arr, *t22_dt22_arr, *t01_dt01_arr,
|
||||
*t02_dt02_arr, *t12_dt12_arr;
|
||||
U_Array *dlaplace_delta_arr;
|
||||
U_ArrayRef *t00_dt00_ref, *t11_dt11_ref, *t22_dt22_ref, *t01_dt01_ref,
|
||||
*t02_dt02_ref, *t12_dt12_ref;
|
||||
U_ArrayRef *dlaplace_delta_ref;
|
||||
|
||||
U_Array *dlogL_drho_arr;
|
||||
U_Array *dlogL_ddelta_arr;
|
||||
U_ArrayRef *dlogL_drho_ref;
|
||||
U_ArrayRef *dlogL_ddelta_ref;
|
||||
|
||||
U_Array *tmp_array_real_arr;
|
||||
U_ArrayRef *tmp_array_real_ref;
|
||||
U_CArray *tmp_array_complex_arr;
|
||||
U_CArrayRef *tmp_array_complex_ref;
|
||||
|
||||
// This adaptor transforms an unselected galaxy density (with appropriate
|
||||
// auxiliary arrays) to a selected array. It must be a functor accepting two
|
||||
// parameters: a selection virtual array and a bias density virtual array.
|
||||
selection::SimpleAdaptor selection_adaptor;
|
||||
|
||||
SecondOrderBias() {
|
||||
delta_sqr_arr = 0;
|
||||
tidal_00_arr = 0;
|
||||
tidal_11_arr = 0;
|
||||
tidal_22_arr = 0;
|
||||
tidal_01_arr = 0;
|
||||
tidal_02_arr = 0;
|
||||
tidal_12_arr = 0;
|
||||
laplace_delta_arr = 0;
|
||||
dlaplace_delta_arr = 0;
|
||||
t00_dt00_arr = 0;
|
||||
t11_dt11_arr = 0;
|
||||
t22_dt22_arr = 0;
|
||||
t01_dt01_arr = 0;
|
||||
t02_dt02_arr = 0;
|
||||
t12_dt12_arr = 0;
|
||||
tidal_sqr_arr = 0;
|
||||
|
||||
dlogL_drho_arr = 0;
|
||||
dlogL_ddelta_arr = 0;
|
||||
|
||||
tmp_array_real_arr = 0;
|
||||
tmp_array_complex_arr = 0;
|
||||
|
||||
analysis_plan = 0;
|
||||
synthesis_plan = 0;
|
||||
}
|
||||
|
||||
template <typename BiasParameters>
|
||||
static inline void setup_default(BiasParameters ¶ms) {
|
||||
params[0] = 1.4;
|
||||
params[1] = 0.8;
|
||||
params[2] = 0.5;
|
||||
params[3] = 0.2;
|
||||
};
|
||||
|
||||
// Note: fwd_model and final_density arrays cannot be stored in this step. But
|
||||
// they can be used.
|
||||
template <
|
||||
class ForwardModel, typename FinalDensityArray,
|
||||
typename BiasParameters, typename MetaSelect = NoSelector>
|
||||
inline void prepare(
|
||||
ForwardModel &fwd_model, const FinalDensityArray &final_density,
|
||||
double const _nmean, const BiasParameters ¶ms,
|
||||
bool density_changed, MetaSelect _select = MetaSelect()) {
|
||||
|
||||
ConsoleContext<LOG_DEBUG> ctx("bias model preparation");
|
||||
|
||||
comm = fwd_model.lo_mgr->getComm();
|
||||
mgr = fwd_model.lo_mgr.get();
|
||||
N0 = mgr->N0;
|
||||
N1 = mgr->N1;
|
||||
N2 = mgr->N2;
|
||||
N2_HC = N2 / 2 + 1;
|
||||
startN0 = mgr->startN0;
|
||||
localN0 = mgr->localN0;
|
||||
L0 = fwd_model.get_box_model().L0;
|
||||
L1 = fwd_model.get_box_model().L1;
|
||||
L2 = fwd_model.get_box_model().L2;
|
||||
nmean = _nmean;
|
||||
b1 = params[0];
|
||||
b2 = params[1];
|
||||
bk = params[2];
|
||||
r2 = params[3];
|
||||
|
||||
ctx.print(
|
||||
boost::format("Got a box %dx%dx%d (%gx%gx%g)") % N0 % N1 % N2 %
|
||||
L0 % L1 % L2);
|
||||
ctx.print("Allocating temporary arrays");
|
||||
if (delta_sqr_arr == NULL) {
|
||||
ctx.print("...delta_sqr_arr");
|
||||
delta_sqr_arr =
|
||||
new U_Array(mgr->extents_real(), mgr->allocator_real);
|
||||
delta_sqr_ref = &delta_sqr_arr->get_array();
|
||||
}
|
||||
|
||||
if (tidal_00_arr == NULL) {
|
||||
ctx.print("...tidal_00_arr");
|
||||
tidal_00_arr =
|
||||
new U_Array(mgr->extents_real(), mgr->allocator_real);
|
||||
tidal_00_ref = &tidal_00_arr->get_array();
|
||||
}
|
||||
|
||||
if (tidal_11_arr == NULL) {
|
||||
ctx.print("...tidal_11_arr");
|
||||
tidal_11_arr =
|
||||
new U_Array(mgr->extents_real(), mgr->allocator_real);
|
||||
tidal_11_ref = &tidal_11_arr->get_array();
|
||||
}
|
||||
|
||||
if (tidal_22_arr == NULL) {
|
||||
ctx.print("...tidal_22_arr");
|
||||
tidal_22_arr =
|
||||
new U_Array(mgr->extents_real(), mgr->allocator_real);
|
||||
tidal_22_ref = &tidal_22_arr->get_array();
|
||||
}
|
||||
|
||||
if (tidal_01_arr == NULL) {
|
||||
ctx.print("...tidal_01_arr");
|
||||
tidal_01_arr =
|
||||
new U_Array(mgr->extents_real(), mgr->allocator_real);
|
||||
tidal_01_ref = &tidal_01_arr->get_array();
|
||||
}
|
||||
|
||||
if (tidal_02_arr == NULL) {
|
||||
ctx.print("...tidal_02_arr");
|
||||
tidal_02_arr =
|
||||
new U_Array(mgr->extents_real(), mgr->allocator_real);
|
||||
tidal_02_ref = &tidal_02_arr->get_array();
|
||||
}
|
||||
|
||||
if (tidal_12_arr == NULL) {
|
||||
ctx.print("...tidal_12_arr");
|
||||
tidal_12_arr =
|
||||
new U_Array(mgr->extents_real(), mgr->allocator_real);
|
||||
tidal_12_ref = &tidal_12_arr->get_array();
|
||||
}
|
||||
|
||||
if (tidal_sqr_arr == NULL) {
|
||||
ctx.print("...tidal_sqr_arr");
|
||||
tidal_sqr_arr =
|
||||
new U_Array(mgr->extents_real(), mgr->allocator_real);
|
||||
tidal_sqr_ref = &tidal_sqr_arr->get_array();
|
||||
}
|
||||
|
||||
if (laplace_delta_arr == NULL) {
|
||||
ctx.print("...laplace_delta_arr");
|
||||
laplace_delta_arr =
|
||||
new U_Array(mgr->extents_real(), mgr->allocator_real);
|
||||
laplace_delta_ref = &laplace_delta_arr->get_array();
|
||||
}
|
||||
|
||||
if (t00_dt00_arr == NULL) {
|
||||
ctx.print("...t00_dt00_arr");
|
||||
t00_dt00_arr =
|
||||
new U_Array(mgr->extents_real(), mgr->allocator_real);
|
||||
t00_dt00_ref = &t00_dt00_arr->get_array();
|
||||
}
|
||||
|
||||
if (t11_dt11_arr == NULL) {
|
||||
ctx.print("...t11_dt11_arr");
|
||||
t11_dt11_arr =
|
||||
new U_Array(mgr->extents_real(), mgr->allocator_real);
|
||||
t11_dt11_ref = &t11_dt11_arr->get_array();
|
||||
}
|
||||
|
||||
if (t22_dt22_arr == NULL) {
|
||||
ctx.print("...t22_dt22_arr");
|
||||
t22_dt22_arr =
|
||||
new U_Array(mgr->extents_real(), mgr->allocator_real);
|
||||
t22_dt22_ref = &t22_dt22_arr->get_array();
|
||||
}
|
||||
|
||||
if (t01_dt01_arr == NULL) {
|
||||
ctx.print("...t01_dt01_arr");
|
||||
t01_dt01_arr =
|
||||
new U_Array(mgr->extents_real(), mgr->allocator_real);
|
||||
t01_dt01_ref = &t01_dt01_arr->get_array();
|
||||
}
|
||||
|
||||
if (t02_dt02_arr == NULL) {
|
||||
ctx.print("...t02_dt02_arr");
|
||||
t02_dt02_arr =
|
||||
new U_Array(mgr->extents_real(), mgr->allocator_real);
|
||||
t02_dt02_ref = &t02_dt02_arr->get_array();
|
||||
}
|
||||
|
||||
if (t12_dt12_arr == NULL) {
|
||||
ctx.print("...t12_dt12_arr");
|
||||
t12_dt12_arr =
|
||||
new U_Array(mgr->extents_real(), mgr->allocator_real);
|
||||
t12_dt12_ref = &t12_dt12_arr->get_array();
|
||||
}
|
||||
|
||||
if (dlaplace_delta_arr == NULL) {
|
||||
ctx.print("...dlaplace_delta_arr");
|
||||
dlaplace_delta_arr =
|
||||
new U_Array(mgr->extents_real(), mgr->allocator_real);
|
||||
dlaplace_delta_ref = &dlaplace_delta_arr->get_array();
|
||||
}
|
||||
|
||||
if (dlogL_drho_arr == NULL) {
|
||||
ctx.print("...dlogL_drho_arr");
|
||||
dlogL_drho_arr =
|
||||
new U_Array(mgr->extents_real(), mgr->allocator_real);
|
||||
dlogL_drho_ref = &dlogL_drho_arr->get_array();
|
||||
}
|
||||
|
||||
if (dlogL_ddelta_arr == NULL) {
|
||||
ctx.print("...dlogL_ddelta_arr");
|
||||
dlogL_ddelta_arr =
|
||||
new U_Array(mgr->extents_real(), mgr->allocator_real);
|
||||
dlogL_ddelta_ref = &dlogL_ddelta_arr->get_array();
|
||||
}
|
||||
|
||||
if (tmp_array_real_arr == NULL) {
|
||||
ctx.print("...tmp_array_real_arr");
|
||||
tmp_array_real_arr =
|
||||
new U_Array(mgr->extents_real(), mgr->allocator_real);
|
||||
tmp_array_real_ref = &tmp_array_real_arr->get_array();
|
||||
}
|
||||
|
||||
if (tmp_array_complex_arr == NULL) {
|
||||
ctx.print("...tmp_array_complex_arr");
|
||||
tmp_array_complex_arr =
|
||||
new U_CArray(mgr->extents_complex(), mgr->allocator_complex);
|
||||
tmp_array_complex_ref = &tmp_array_complex_arr->get_array();
|
||||
}
|
||||
ctx.print("FFT plans now");
|
||||
|
||||
if (analysis_plan == NULL) {
|
||||
ctx.print("...analysis");
|
||||
analysis_plan = mgr->create_r2c_plan(
|
||||
tmp_array_real_ref->data(), tmp_array_complex_ref->data());
|
||||
}
|
||||
|
||||
if (synthesis_plan == NULL) {
|
||||
ctx.print("...synthesis");
|
||||
synthesis_plan = mgr->create_c2r_plan(
|
||||
tmp_array_complex_ref->data(), tmp_array_real_ref->data());
|
||||
}
|
||||
|
||||
ctx.print("Prepare the arrays");
|
||||
if (density_changed)
|
||||
prepare_bias_model_arrays(
|
||||
*delta_sqr_ref, *tidal_00_ref, *tidal_11_ref, *tidal_22_ref,
|
||||
*tidal_01_ref, *tidal_02_ref, *tidal_12_ref, *tidal_sqr_ref,
|
||||
*laplace_delta_ref, final_density);
|
||||
ctx.print("Done preparation");
|
||||
};
|
||||
|
||||
inline void cleanup(){
|
||||
// Array deallocation only in final destructor
|
||||
};
|
||||
|
||||
inline double get_linear_bias() const { return b1; };
|
||||
|
||||
template <typename Array>
|
||||
static inline bool check_bias_constraints(Array &&a) {
|
||||
return true;
|
||||
};
|
||||
|
||||
void subtract_mean_array3d(U_ArrayRef &data_ref) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("subtract_mean_array3d");
|
||||
|
||||
double mean_data = 0.0;
|
||||
|
||||
#pragma omp parallel for collapse(3) reduction(+ : mean_data)
|
||||
for (size_t n0 = startN0; n0 < startN0 + localN0; n0++)
|
||||
for (size_t n1 = 0; n1 < N1; n1++)
|
||||
for (size_t n2 = 0; n2 < N2; n2++) {
|
||||
mean_data += data_ref[n0][n1][n2];
|
||||
}
|
||||
|
||||
comm->all_reduce_t(MPI_IN_PLACE, &mean_data, 1, MPI_SUM);
|
||||
mean_data /= N0 * N1 * N2;
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (size_t n0 = startN0; n0 < startN0 + localN0; n0++)
|
||||
for (size_t n1 = 0; n1 < N1; n1++)
|
||||
for (size_t n2 = 0; n2 < N2; n2++) {
|
||||
data_ref[n0][n1][n2] -= mean_data;
|
||||
}
|
||||
};
|
||||
|
||||
void get_spatial_derivative_array3d(
|
||||
U_ArrayRef &deriv_array_real_out,
|
||||
const U_CArrayRef &array_complex_in, const int axis0,
|
||||
const int axis1, const std::string type) {
|
||||
|
||||
double fft_normalization = 1.0 / (N0 * N1 * N2);
|
||||
|
||||
Console::instance().print<LOG_DEBUG>("Spatial derivative: " + type);
|
||||
array::copyArray3d(*tmp_array_complex_ref, array_complex_in);
|
||||
|
||||
if (type == "laplace") {
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (size_t i = startN0; i < startN0 + localN0; i++)
|
||||
for (size_t j = 0; j < N1; j++)
|
||||
for (size_t k = 0; k < N2_HC; k++) {
|
||||
double kk[3] = {kmode(i, N0, L0), kmode(j, N1, L1),
|
||||
kmode(k, N2, L2)};
|
||||
|
||||
double ksquared =
|
||||
kk[0] * kk[0] + kk[1] * kk[1] + kk[2] * kk[2];
|
||||
double factor = -1.0 * ksquared * fft_normalization;
|
||||
(*tmp_array_complex_ref)[i][j][k] *= factor;
|
||||
}
|
||||
} else if (type == "tidal") {
|
||||
Console::instance().c_assert(
|
||||
(axis0 >= 0) && (axis0 <= 2) && (axis1 >= 0) && (axis1 <= 2),
|
||||
"Invalid 'axis0/1' argument in "
|
||||
"'get_spatial_derivative_array3d'");
|
||||
|
||||
double delta_ij = (axis0 == axis1) ? 1.0 / 3.0 : 0.0;
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (size_t i = startN0; i < startN0 + localN0; i++)
|
||||
for (size_t j = 0; j < N1; j++)
|
||||
for (size_t k = 0; k < N2_HC; k++) {
|
||||
double kk[3] = {kmode(i, N0, L0), kmode(j, N1, L1),
|
||||
kmode(k, N2, L2)};
|
||||
|
||||
double ksquared =
|
||||
kk[0] * kk[0] + kk[1] * kk[1] + kk[2] * kk[2];
|
||||
double factor =
|
||||
(kk[axis0] * kk[axis1] / ksquared - delta_ij) *
|
||||
fft_normalization;
|
||||
(*tmp_array_complex_ref)[i][j][k] *= factor;
|
||||
}
|
||||
} else {
|
||||
Console::instance().c_assert(
|
||||
false,
|
||||
"Invalid 'type' argument in 'get_spatial_derivative_array3d'");
|
||||
}
|
||||
|
||||
if ((startN0 == 0) && (localN0 > 0)) {
|
||||
(*tmp_array_complex_ref)[0][0][0] = 0.0;
|
||||
(*tmp_array_complex_ref)[0][0][N2_HC - 1] = 0.0;
|
||||
(*tmp_array_complex_ref)[0][N1 / 2][0] = 0.0;
|
||||
(*tmp_array_complex_ref)[0][N1 / 2][N2_HC - 1] = 0.0;
|
||||
}
|
||||
|
||||
if ((startN0 <= N0 / 2) && (startN0 + localN0 > N0 / 2)) {
|
||||
(*tmp_array_complex_ref)[N0 / 2][0][0] = 0.0;
|
||||
(*tmp_array_complex_ref)[N0 / 2][0][N2_HC - 1] = 0.0;
|
||||
(*tmp_array_complex_ref)[N0 / 2][N1 / 2][0] = 0.0;
|
||||
(*tmp_array_complex_ref)[N0 / 2][N1 / 2][N2_HC - 1] = 0.0;
|
||||
}
|
||||
|
||||
mgr->execute_c2r(
|
||||
synthesis_plan, tmp_array_complex_ref->data(),
|
||||
deriv_array_real_out.data());
|
||||
};
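// Summary of the Fourier-space operators applied above (together with the 1/(N0*N1*N2)
// FFT normalization):
//   "laplace":  delta(k) -> -k^2 * delta(k)
//   "tidal":    delta(k) -> (k_i k_j / k^2 - delta_ij / 3) * delta(k)   (traceless tidal tensor K_ij)
// The explicit zeroing of the DC and Nyquist self-conjugate modes removes, in particular,
// the k = 0 term for which the tidal factor (a division by k^2) would be ill-defined.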
|
||||
|
||||
void get_density_derivative_array3d(
|
||||
U_ArrayRef &deriv_array_real_out, const U_ArrayRef &array1_real_in,
|
||||
const U_ArrayRef &array2_real_in, const int axis0, const int axis1,
|
||||
const std::string type) {
|
||||
|
||||
if (type == "dlaplace") {
|
||||
|
||||
array::copyArray3d(*tmp_array_real_ref, array1_real_in);
|
||||
|
||||
mgr->execute_r2c(
|
||||
analysis_plan, tmp_array_real_ref->data(),
|
||||
tmp_array_complex_ref->data());
|
||||
|
||||
get_spatial_derivative_array3d(
|
||||
deriv_array_real_out, *tmp_array_complex_ref, -1, -1,
|
||||
"laplace");
|
||||
|
||||
} else if (type == "dtidal") {
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (size_t i = startN0; i < startN0 + localN0; i++)
|
||||
for (size_t j = 0; j < N1; j++)
|
||||
for (size_t k = 0; k < N2; k++) {
|
||||
(*tmp_array_real_ref)[i][j][k] =
|
||||
array1_real_in[i][j][k] * array2_real_in[i][j][k];
|
||||
}
|
||||
|
||||
mgr->execute_r2c(
|
||||
analysis_plan, tmp_array_real_ref->data(),
|
||||
tmp_array_complex_ref->data());
|
||||
|
||||
get_spatial_derivative_array3d(
|
||||
deriv_array_real_out, *tmp_array_complex_ref, axis0, axis1,
|
||||
"tidal");
|
||||
|
||||
} else {
|
||||
Console::instance().c_assert(
|
||||
false,
|
||||
"Invalid 'type' argument in 'get_density_derivative_array3d'");
|
||||
}
|
||||
};
|
||||
|
||||
void prepare_bias_model_arrays(
|
||||
U_ArrayRef &delta_sqr_ref, U_ArrayRef &tidal_00_ref,
|
||||
U_ArrayRef &tidal_11_ref, U_ArrayRef &tidal_22_ref,
|
||||
U_ArrayRef &tidal_01_ref, U_ArrayRef &tidal_02_ref,
|
||||
U_ArrayRef &tidal_12_ref, U_ArrayRef &tidal_sqr_ref,
|
||||
U_ArrayRef &laplace_delta_ref, const U_ArrayRef &delta) {
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (size_t n0 = startN0; n0 < startN0 + localN0; n0++)
|
||||
for (size_t n1 = 0; n1 < N1; n1++)
|
||||
for (size_t n2 = 0; n2 < N2; n2++) {
|
||||
(*tmp_array_real_ref)[n0][n1][n2] = delta[n0][n1][n2];
|
||||
}
|
||||
Console::instance().print<LOG_DEBUG>("Input backed up");
|
||||
|
||||
mgr->execute_r2c(
|
||||
analysis_plan, tmp_array_real_ref->data(),
|
||||
tmp_array_complex_ref->data());
|
||||
Console::instance().print<LOG_DEBUG>("Transformed");
|
||||
|
||||
get_spatial_derivative_array3d(
|
||||
laplace_delta_ref, *tmp_array_complex_ref, -1, -1, "laplace");
|
||||
|
||||
get_spatial_derivative_array3d(
|
||||
tidal_00_ref, *tmp_array_complex_ref, 0, 0, "tidal");
|
||||
get_spatial_derivative_array3d(
|
||||
tidal_01_ref, *tmp_array_complex_ref, 0, 1, "tidal");
|
||||
get_spatial_derivative_array3d(
|
||||
tidal_02_ref, *tmp_array_complex_ref, 0, 2, "tidal");
|
||||
|
||||
get_spatial_derivative_array3d(
|
||||
tidal_11_ref, *tmp_array_complex_ref, 1, 1, "tidal");
|
||||
get_spatial_derivative_array3d(
|
||||
tidal_12_ref, *tmp_array_complex_ref, 1, 2, "tidal");
|
||||
|
||||
get_spatial_derivative_array3d(
|
||||
tidal_22_ref, *tmp_array_complex_ref, 2, 2, "tidal");
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (size_t n0 = startN0; n0 < startN0 + localN0; n0++)
|
||||
for (size_t n1 = 0; n1 < N1; n1++)
|
||||
for (size_t n2 = 0; n2 < N2; n2++) {
|
||||
delta_sqr_ref[n0][n1][n2] =
|
||||
delta[n0][n1][n2] * delta[n0][n1][n2];
|
||||
tidal_sqr_ref[n0][n1][n2] =
|
||||
tidal_00_ref[n0][n1][n2] * tidal_00_ref[n0][n1][n2] +
|
||||
tidal_11_ref[n0][n1][n2] * tidal_11_ref[n0][n1][n2] +
|
||||
tidal_22_ref[n0][n1][n2] * tidal_22_ref[n0][n1][n2] +
|
||||
2.0 * (tidal_01_ref[n0][n1][n2] * tidal_01_ref[n0][n1][n2] +
|
||||
tidal_02_ref[n0][n1][n2] * tidal_02_ref[n0][n1][n2] +
|
||||
tidal_12_ref[n0][n1][n2] * tidal_12_ref[n0][n1][n2]);
|
||||
}
|
||||
|
||||
subtract_mean_array3d(delta_sqr_ref);
|
||||
|
||||
subtract_mean_array3d(tidal_sqr_ref);
|
||||
};
|
||||
|
||||
void prepare_bias_model_derivative_arrays(
|
||||
U_ArrayRef &t00_dt00_ref, U_ArrayRef &t11_dt11_ref,
|
||||
U_ArrayRef &t22_dt22_ref, U_ArrayRef &t01_dt01_ref,
|
||||
U_ArrayRef &t02_dt02_ref, U_ArrayRef &t12_dt12_ref,
|
||||
U_ArrayRef &dlaplace_delta_ref, const U_ArrayRef &dlogL_drho_ref,
|
||||
const U_ArrayRef &tidal_00_ref, const U_ArrayRef &tidal_11_ref,
|
||||
const U_ArrayRef &tidal_22_ref, const U_ArrayRef &tidal_01_ref,
|
||||
const U_ArrayRef &tidal_02_ref, const U_ArrayRef &tidal_12_ref) {
|
||||
|
||||
|
||||
|
||||
get_density_derivative_array3d(
|
||||
t00_dt00_ref, dlogL_drho_ref, tidal_00_ref, 0, 0, "dtidal");
|
||||
get_density_derivative_array3d(
|
||||
t01_dt01_ref, dlogL_drho_ref, tidal_01_ref, 0, 1, "dtidal");
|
||||
get_density_derivative_array3d(
|
||||
t02_dt02_ref, dlogL_drho_ref, tidal_02_ref, 0, 2, "dtidal");
|
||||
|
||||
get_density_derivative_array3d(
|
||||
t11_dt11_ref, dlogL_drho_ref, tidal_11_ref, 1, 1, "dtidal");
|
||||
get_density_derivative_array3d(
|
||||
t12_dt12_ref, dlogL_drho_ref, tidal_12_ref, 1, 2, "dtidal");
|
||||
|
||||
get_density_derivative_array3d(
|
||||
t22_dt22_ref, dlogL_drho_ref, tidal_22_ref, 2, 2, "dtidal");
|
||||
|
||||
// The second input array is unused by the "dlaplace" branch; we pass dlogL_drho_ref
// again rather than dereferencing an uninitialized pointer.
get_density_derivative_array3d(
    dlaplace_delta_ref, dlogL_drho_ref, dlogL_drho_ref, -1, -1,
    "dlaplace");
|
||||
};
|
||||
|
||||
static inline double thresholder(double a, double b) {
|
||||
if (a < b) {
|
||||
if (SHARP_THRESHOLDER) {
|
||||
return b;
|
||||
} else {
|
||||
return b * std::exp((-b / a + 1));
|
||||
}
|
||||
} else {
|
||||
return a;
|
||||
}
|
||||
}
|
||||
|
||||
static inline double adjoint_thresholder(double a, double b, double g) {
|
||||
if (a < b) {
|
||||
if (SHARP_THRESHOLDER) {
|
||||
return 0;
|
||||
} else {
|
||||
return b * b / (a * a) * std::exp((1 - b / a)) * g;
|
||||
}
|
||||
} else {
|
||||
return g;
|
||||
}
|
||||
}
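// Consistency sketch: for a < b the smooth branch of thresholder() is f(a) = b * exp(1 - b/a),
// whose derivative is df/da = (b^2 / a^2) * exp(1 - b/a); this is exactly the factor applied
// to the incoming gradient g in adjoint_thresholder(). In the sharp case the derivative is 0
// below the threshold and 1 above it.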
|
||||
|
||||
static inline double density_lambda(
|
||||
double nmean, double b1, double b2, double bk, double r2,
|
||||
double delta, double delta_sqr, double tidal_sqr,
|
||||
double laplace_delta) {
|
||||
double rho = b1 * delta + 0.5 * b2 * delta_sqr +
|
||||
0.5 * bk * tidal_sqr + r2 * laplace_delta;
|
||||
double nu = nmean * thresholder(1.0 + rho, EPSILON_VOIDS);
|
||||
return nu;
|
||||
};
|
||||
|
||||
// This function returns an array-like object. That object
// depends on the existence of the final density array.
// The return type is quite complex, so we let the compiler deduce it.
// C++11 does not allow automatic return type deduction (C++14 would be
// needed for that), so we have to rely on an auxiliary function that
// allows a compact decltype to be written.
|
||||
template <typename FinalDensityArray>
|
||||
inline auto compute_density(const FinalDensityArray &array)
|
||||
-> decltype(std::make_tuple(b_va_fused<double>(
|
||||
std::bind(
|
||||
density_lambda, nmean, b1, b2, bk, r2, ph::_1, ph::_2,
|
||||
ph::_3, ph::_4),
|
||||
array, *delta_sqr_ref, *tidal_sqr_ref, *laplace_delta_ref))) {
|
||||
return std::make_tuple(b_va_fused<double>(
|
||||
std::bind(
|
||||
density_lambda, nmean, b1, b2, bk, r2, ph::_1, ph::_2, ph::_3,
|
||||
ph::_4),
|
||||
array, *delta_sqr_ref, *tidal_sqr_ref, *laplace_delta_ref));
|
||||
};
|
||||
|
||||
// This function returns an array-like object. That object
// depends on the existence of the final density array and of the gradient likelihood array.
// It is the job of the caller to ensure that temporary variables are not cleared
// before the final use.
// The return type is quite complex, so we let the compiler deduce it.
// The likelihood has the form L(b_0(delta, p), b_1(delta, p), ..., b_n(delta, p));
// here we take the tuple of gradients and collapse it into a gradient with respect to delta.
|
||||
template <
|
||||
typename FinalDensityArray, typename TupleGradientLikelihoodArray>
|
||||
auto apply_adjoint_gradient(
|
||||
const FinalDensityArray &final_density,
|
||||
TupleGradientLikelihoodArray grad_array)
|
||||
-> decltype(std::make_tuple(std::ref(*dlogL_ddelta_ref))) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("bias model gradient computation");
|
||||
|
||||
ctx.print("Zero output array");
|
||||
|
||||
array::fill(*dlogL_ddelta_ref, 0.0);
|
||||
|
||||
ctx.print("Transfer the input gradient");
|
||||
|
||||
const double epsilon = EPSILON_VOIDS;
|
||||
LibLSS::copy_array_rv(
|
||||
array::slice_array((*dlogL_drho_ref), mgr->strict_range()),
|
||||
std::get<0>(grad_array));
|
||||
ctx.print("Data backed up");
|
||||
|
||||
prepare_bias_model_derivative_arrays(
|
||||
*t00_dt00_ref, *t11_dt11_ref, *t22_dt22_ref, *t01_dt01_ref,
|
||||
*t02_dt02_ref, *t12_dt12_ref, *dlaplace_delta_ref,
|
||||
*dlogL_drho_ref, *tidal_00_ref, *tidal_11_ref, *tidal_22_ref,
|
||||
*tidal_01_ref, *tidal_02_ref, *tidal_12_ref);
|
||||
|
||||
ctx.print("Computing the transform.");
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (size_t n0 = startN0; n0 < startN0 + localN0; n0++)
|
||||
for (size_t n1 = 0; n1 < N1; n1++)
|
||||
for (size_t n2 = 0; n2 < N2; n2++) {
|
||||
|
||||
double delta = (final_density)[n0][n1][n2];
|
||||
double delta_sqr = (*delta_sqr_ref)[n0][n1][n2];
|
||||
double tidal_sqr = (*tidal_sqr_ref)[n0][n1][n2];
|
||||
double laplace_delta = (*laplace_delta_ref)[n0][n1][n2];
|
||||
|
||||
double rho = b1 * delta + 0.5 * b2 * delta_sqr +
|
||||
0.5 * bk * tidal_sqr + r2 * laplace_delta;
|
||||
double dmu_drho = adjoint_thresholder(
|
||||
1 + rho, EPSILON_VOIDS, (*dlogL_drho_ref)[n0][n1][n2]);
|
||||
|
||||
double dlaplace_delta = (*dlaplace_delta_ref)[n0][n1][n2];
|
||||
double t00_dt00 = (*t00_dt00_ref)[n0][n1][n2];
|
||||
double t11_dt11 = (*t11_dt11_ref)[n0][n1][n2];
|
||||
double t22_dt22 = (*t22_dt22_ref)[n0][n1][n2];
|
||||
double t01_dt01 = (*t01_dt01_ref)[n0][n1][n2];
|
||||
double t02_dt02 = (*t02_dt02_ref)[n0][n1][n2];
|
||||
double t12_dt12 = (*t12_dt12_ref)[n0][n1][n2];
|
||||
|
||||
double drho_ddelta = b1 + b2 * delta;
|
||||
|
||||
(*dlogL_ddelta_ref)[n0][n1][n2] =
|
||||
nmean * (dmu_drho * drho_ddelta +
|
||||
bk * (t00_dt00 + t11_dt11 + t22_dt22 +
|
||||
2.0 * (t01_dt01 + t02_dt02 + t12_dt12)) +
|
||||
r2 * dlaplace_delta);
|
||||
}
|
||||
|
||||
return std::make_tuple(std::ref(*dlogL_ddelta_ref));
|
||||
};
|
||||
|
||||
virtual ~SecondOrderBias() {
|
||||
delete delta_sqr_arr;
|
||||
delete tidal_00_arr;
|
||||
delete tidal_11_arr;
|
||||
delete tidal_22_arr;
|
||||
delete tidal_01_arr;
|
||||
delete tidal_02_arr;
|
||||
delete tidal_12_arr;
|
||||
delete tidal_sqr_arr;
|
||||
delete laplace_delta_arr;
|
||||
delete t00_dt00_arr;
|
||||
delete t11_dt11_arr;
|
||||
delete t22_dt22_arr;
|
||||
delete t01_dt01_arr;
|
||||
delete t02_dt02_arr;
|
||||
delete t12_dt12_arr;
|
||||
delete dlaplace_delta_arr;
|
||||
delete dlogL_drho_arr;
|
||||
delete dlogL_ddelta_arr;
|
||||
delete tmp_array_real_arr;
|
||||
delete tmp_array_complex_arr;
|
||||
mgr->destroy_plan(analysis_plan);
|
||||
mgr->destroy_plan(synthesis_plan);
|
||||
};
|
||||
};
|
||||
|
||||
}; // namespace detail_SecondOrderBias
|
||||
|
||||
using detail_SecondOrderBias::SecondOrderBias;
|
||||
|
||||
}; // namespace bias
|
||||
|
||||
}; // namespace LibLSS
|
||||
|
||||
#endif
|
105
extra/borg/libLSS/physics/bias/base.hpp
Normal file
@@ -0,0 +1,105 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/bias/base.hpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_GENERIC_BIAS_BASE_HPP
|
||||
#define __LIBLSS_GENERIC_BIAS_BASE_HPP
|
||||
|
||||
#include <functional>
|
||||
#include <tuple>
|
||||
#include "libLSS/tools/fused_array.hpp"
|
||||
#include "libLSS/tools/phoenix_vars.hpp"
|
||||
#include <boost/phoenix/operator.hpp>
|
||||
#include <boost/phoenix/stl/cmath.hpp>
|
||||
#include "libLSS/tools/tuple_helper.hpp"
|
||||
#include "libLSS/tools/phoenix_vars.hpp"
|
||||
#include "libLSS/tools/array_concepts.hpp"
|
||||
#include "libLSS/physics/likelihoods/base.hpp"
|
||||
#include <CosmoTool/hdf5_array.hpp>
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
struct NoSelector {};
|
||||
class MarkovState;
|
||||
|
||||
namespace bias {
|
||||
namespace optional_feature_details {
|
||||
MEMBER_FUNC_CHECKER(
|
||||
has_log_prior_params, log_prior_params, ((double *)0));
|
||||
MEMBER_FUNC_CHECKER(has_dump_fields, dump_fields, ((void *)0));
|
||||
|
||||
template <typename Bias, typename BiasParameters>
|
||||
typename std::enable_if<
|
||||
has_log_prior_params<Bias, double>::value, double>::type
|
||||
bias_get_log_prior(Bias const &b, BiasParameters const ¶ms) {
|
||||
return b.log_prior_params(params);
|
||||
}
|
||||
|
||||
template <typename Bias, typename BiasParameters>
|
||||
typename std::enable_if<
|
||||
!has_log_prior_params<Bias, double>::value, double>::type
|
||||
bias_get_log_prior(Bias const &b, BiasParameters const ¶ms) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
template <typename Bias>
|
||||
typename std::enable_if<has_dump_fields<Bias, void>::value, void>::type
|
||||
bias_dump_fields(Bias const &b, MarkovState &state) {
|
||||
return b.dump_fields(state);
|
||||
}
|
||||
|
||||
template <typename Bias>
|
||||
typename std::enable_if<!has_dump_fields<Bias, void>::value, void>::type
|
||||
bias_dump_fields(Bias const &b, MarkovState &state) {}
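// Dispatch summary for the overload pairs above: for a bias type B,
//   bias_get_log_prior(b, p)   calls b.log_prior_params(p) when B declares such a member,
//                              and returns 0 otherwise;
//   bias_dump_fields(b, state) calls b.dump_fields(state) when available, and is a no-op
//                              otherwise.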
|
||||
} // namespace optional_feature_details
|
||||
|
||||
using optional_feature_details::bias_dump_fields;
|
||||
using optional_feature_details::bias_get_log_prior;
|
||||
} // namespace bias
|
||||
|
||||
namespace selection {
|
||||
|
||||
struct SimpleAdaptor {
|
||||
template <typename SelectArray, typename BiasedArray>
|
||||
inline auto apply(SelectArray &&select, BiasedArray const &bias)
|
||||
-> decltype(std::tuple_cat(
|
||||
std::make_tuple(b_va_fused<double>(
|
||||
_p1 * _p2, std::forward<SelectArray>(select),
|
||||
std::move(std::get<0>(bias)))),
|
||||
last_of_tuple<1>(bias))) {
|
||||
return std::tuple_cat(
|
||||
std::make_tuple(b_va_fused<double>(
|
||||
_p1 * _p2, std::forward<SelectArray>(select),
|
||||
std::move(std::get<0>(bias)))),
|
||||
last_of_tuple<1>(bias));
|
||||
}
|
||||
|
||||
template <
|
||||
typename LikelihoodGradient, typename SelectArray,
|
||||
typename BiasedArray>
|
||||
inline auto adjoint_gradient(
|
||||
LikelihoodGradient ag_like, SelectArray &&select, BiasedArray const &)
|
||||
-> decltype(std::tuple_cat(
|
||||
std::make_tuple(b_va_fused<double>(
|
||||
_p1 * _p2, std::forward<SelectArray>(select),
|
||||
std::move(std::get<0>(ag_like)))),
|
||||
last_of_tuple<1>(ag_like))) {
|
||||
// In practice the adjoint gradient operator is the same as simple selection
|
||||
return std::tuple_cat(
|
||||
std::make_tuple(b_va_fused<double>(
|
||||
_p1 * _p2, std::forward<SelectArray>(select),
|
||||
std::move(std::get<0>(ag_like)))),
|
||||
last_of_tuple<1>(ag_like));
|
||||
}
|
||||
};
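// Note: apply() lazily builds the selected galaxy density as the element-wise product
// S(x) * n_g(x) of the selection array and the first biased array (via a fused virtual
// array), passing the remaining tuple components through untouched. Since this operation
// is linear in n_g, the adjoint gradient is the same multiplication applied to the
// incoming likelihood gradient.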
|
||||
|
||||
} // namespace selection
|
||||
|
||||
} // namespace LibLSS
|
||||
|
||||
#endif
|
53
extra/borg/libLSS/physics/bias/biases.cpp
Normal file
@@ -0,0 +1,53 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/bias/biases.cpp
|
||||
Copyright (C) 2019 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#include "libLSS/physics/bias/noop.hpp"
|
||||
#include "libLSS/physics/bias/linear_bias.hpp"
|
||||
#include "libLSS/physics/bias/power_law.hpp"
|
||||
#include "libLSS/physics/bias/double_power_law.hpp"
|
||||
#include "libLSS/physics/bias/broken_power_law.hpp"
|
||||
#include "libLSS/physics/bias/many_power.hpp"
|
||||
#include "libLSS/physics/bias/eft_bias.hpp"
|
||||
#include "libLSS/physics/bias/eft_bias_marg.hpp"
|
||||
|
||||
constexpr const int LibLSS::bias::detail_noop::Noop::numParams;
|
||||
constexpr const bool LibLSS::bias::detail_noop::Noop::NmeanIsBias;
|
||||
|
||||
constexpr int LibLSS::bias::detail_linear_bias::LinearBias::numParams;
|
||||
constexpr const bool LibLSS::bias::detail_linear_bias::LinearBias::NmeanIsBias;
|
||||
|
||||
constexpr int LibLSS::bias::detail::PowerLaw::numParams;
|
||||
constexpr double LibLSS::bias::detail::PowerLaw::EPSILON_VOIDS;
|
||||
constexpr const bool LibLSS::bias::detail::PowerLaw::NmeanIsBias;
|
||||
|
||||
constexpr int LibLSS::bias::detail::DoubleBrokenPowerLaw::numParams;
|
||||
constexpr const bool LibLSS::bias::detail::DoubleBrokenPowerLaw::NmeanIsBias;
|
||||
|
||||
constexpr int LibLSS::bias::detail::BrokenPowerLaw::numParams;
|
||||
constexpr const bool LibLSS::bias::detail::BrokenPowerLaw::NmeanIsBias;
|
||||
|
||||
template<typename Levels> const int LibLSS::bias::ManyPower<Levels>::numParams;
|
||||
template<typename Levels> const bool LibLSS::bias::ManyPower<Levels>::NmeanIsBias;
|
||||
|
||||
template struct LibLSS::bias::ManyPower<LibLSS::bias::ManyPowerLevels<double, 1>>;
|
||||
template struct LibLSS::bias::ManyPower<LibLSS::bias::ManyPowerLevels<double, 1, 1>>;
|
||||
template struct LibLSS::bias::ManyPower<LibLSS::bias::ManyPowerLevels<double, 1, 1, 1, 1>>;
|
||||
template struct LibLSS::bias::ManyPower<LibLSS::bias::ManyPowerLevels<double, 2>>;
|
||||
template struct LibLSS::bias::ManyPower<LibLSS::bias::ManyPowerLevels<double, 2, 2>>;
|
||||
|
||||
template<bool b> const int LibLSS::bias::detail_EFTBias::EFTBias<b>::numParams;
|
||||
template<bool b> const bool LibLSS::bias::detail_EFTBias::EFTBias<b>::NmeanIsBias;
|
||||
|
||||
template struct LibLSS::bias::detail_EFTBias::EFTBias<false>;
|
||||
template struct LibLSS::bias::detail_EFTBias::EFTBias<true>;
|
||||
|
||||
|
||||
// ARES TAG: authors_num = 1
|
||||
// ARES TAG: name(0) = Guilhem Lavaux
|
||||
// ARES TAG: email(0) = guilhem.lavaux@iap.fr
|
||||
// ARES TAG: year(0) = 2019
|
131
extra/borg/libLSS/physics/bias/broken_power_law.hpp
Normal file
@@ -0,0 +1,131 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/bias/broken_power_law.hpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_PHYSICS_BROKEN_POWER_LAW_BIAS_HPP
|
||||
#define __LIBLSS_PHYSICS_BROKEN_POWER_LAW_BIAS_HPP
|
||||
|
||||
#include <cmath>
|
||||
#include <boost/bind.hpp>
|
||||
#include "libLSS/tools/console.hpp"
|
||||
#include "libLSS/tools/fused_array.hpp"
|
||||
#include "libLSS/physics/bias/base.hpp"
|
||||
#include <boost/tuple/tuple.hpp>
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
namespace bias {
|
||||
|
||||
namespace detail {
|
||||
|
||||
struct BrokenPowerLaw {
|
||||
static constexpr const int numParams = 4;
|
||||
static constexpr const bool NmeanIsBias = true;
|
||||
|
||||
double alpha, epsilon, rho_g;
|
||||
double nmean;
|
||||
|
||||
selection::SimpleAdaptor selection_adaptor;
|
||||
|
||||
BrokenPowerLaw(LikelihoodInfo const& = LikelihoodInfo()) {}
|
||||
|
||||
template <typename Array>
|
||||
static inline bool check_bias_constraints(Array &&a) {
|
||||
return a[0] > 0 && a[0] < 1e8 && a[1] > 0 && a[1] < 6 && a[2] > 0 && a[2] < 3 &&
       a[3] > 0 && a[3] < 1e5; // All four parameters must be positive and within broad ranges
|
||||
}
|
||||
|
||||
template <typename BiasParameters>
|
||||
static inline void setup_default(BiasParameters ¶ms) {
|
||||
params[0] = 100;
|
||||
params[1] = 1.;
|
||||
params[2] = 0.01;
|
||||
params[3] = 0.001;
|
||||
}
|
||||
|
||||
// Note: fwd_model and final_density arrays cannot be stored in this step, but
|
||||
// they can be used.
|
||||
template <
|
||||
class ForwardModel, typename FinalDensityArray,
|
||||
typename BiasParameters, typename MetaSelector = NoSelector>
|
||||
inline void prepare(
|
||||
ForwardModel &fwd_model, const FinalDensityArray &final_density,
|
||||
double const nmean_, const BiasParameters ¶ms,
|
||||
bool density_updated, MetaSelector _select = MetaSelector()) {
|
||||
// No need for complex preparation. Just copy parameters.
|
||||
alpha = params[1];
|
||||
epsilon = params[2];
|
||||
rho_g = params[3];
|
||||
nmean = params[0];
|
||||
}
|
||||
|
||||
inline void cleanup() {
|
||||
// No need for cleanup
|
||||
}
|
||||
|
||||
// This is a helper function to determine the pure linear behaviour. It
// is helpful for setting up acceptable initial conditions in order to reduce severe transients.
|
||||
inline double get_linear_bias() const { return alpha; }
|
||||
|
||||
static inline double gradient_density_lambda(
|
||||
double nmean, double alpha, double epsilon, double rho_g, double v,
|
||||
double g) {
|
||||
double const x = 1 + 1e-6 + v;
|
||||
double const rho = x / rho_g;
|
||||
double a = std::pow(x, alpha - 1);
|
||||
double b = std::pow(rho, -epsilon);
|
||||
double f = std::exp(-b);
|
||||
return (alpha + epsilon * b) * a * f * nmean * g;
|
||||
}
|
||||
|
||||
static inline double density_lambda(
|
||||
double nmean, double alpha, double epsilon, double rho_g,
|
||||
double v) {
|
||||
double const x = 1 + 1e-6 + v;
|
||||
double rho = x / rho_g;
|
||||
double ret = nmean * std::pow(x, alpha) *
|
||||
std::exp(-std::pow(rho, -epsilon));
|
||||
return 1e-6 + ret;
|
||||
}
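// Derivation sketch: with x = 1 + 1e-6 + delta and b = (x / rho_g)^(-epsilon), the model
// above is n_g = 1e-6 + nmean * x^alpha * exp(-b), and
//   d n_g / d delta = nmean * (alpha + epsilon * b) * x^(alpha - 1) * exp(-b),
// which is the expression implemented in gradient_density_lambda() above (g being the
// incoming likelihood gradient).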
|
||||
|
||||
template <typename FinalDensityArray>
|
||||
inline auto compute_density(const FinalDensityArray &array) {
|
||||
return std::make_tuple(b_va_fused<double>(
|
||||
std::bind(
|
||||
density_lambda, nmean, alpha, epsilon, rho_g,
|
||||
std::placeholders::_1),
|
||||
array));
|
||||
}
|
||||
|
||||
// This function returns an array-like object. That object
// depends on the existence of the final density array and of the gradient likelihood array.
// It is the job of the caller to ensure that temporary variables are not cleared
// before the final use.
// The return type is quite complex, so we let the compiler deduce it.
|
||||
template <
|
||||
typename FinalDensityArray, typename TupleGradientLikelihoodArray>
|
||||
inline auto apply_adjoint_gradient(
|
||||
const FinalDensityArray &array,
|
||||
TupleGradientLikelihoodArray grad_array) {
|
||||
return std::make_tuple(b_va_fused<double>(
|
||||
std::bind(
|
||||
gradient_density_lambda, nmean, alpha, epsilon, rho_g,
|
||||
std::placeholders::_1, std::placeholders::_2),
|
||||
array, std::move(std::get<0>(grad_array))));
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace detail
|
||||
|
||||
using detail::BrokenPowerLaw;
|
||||
|
||||
} // namespace bias
|
||||
|
||||
} // namespace LibLSS
|
||||
|
||||
#endif
|
143
extra/borg/libLSS/physics/bias/broken_power_law_sigmoid.hpp
Normal file
@@ -0,0 +1,143 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/bias/broken_power_law_sigmoid.hpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_PHYSICS_BROKEN_POWER_LAW_SIGMOID_BIAS_HPP
|
||||
#define __LIBLSS_PHYSICS_BROKEN_POWER_LAW_SIGMOID_BIAS_HPP
|
||||
|
||||
#include <cmath>
|
||||
#include <boost/bind.hpp>
|
||||
#include "libLSS/tools/console.hpp"
|
||||
#include "libLSS/tools/fused_array.hpp"
|
||||
#include "libLSS/physics/bias/base.hpp"
|
||||
#include <boost/tuple/tuple.hpp>
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
namespace bias {
|
||||
|
||||
namespace detail {
|
||||
|
||||
struct BrokenPowerLawSigmoid {
|
||||
static constexpr const int numParams = 6;
|
||||
static constexpr const bool NmeanIsBias = true;
|
||||
|
||||
double alpha, epsilon, rho_g, k, rho_th;
|
||||
double nmean;
|
||||
|
||||
selection::SimpleAdaptor selection_adaptor;
|
||||
|
||||
BrokenPowerLawSigmoid(LikelihoodInfo const& = LikelihoodInfo()) {}
|
||||
|
||||
template <typename Array>
|
||||
static inline bool check_bias_constraints(Array &&a) {
|
||||
return (a[0] > 0 && a[0] < 1e8
        && a[1] > 0 && a[1] < 6
        && a[2] > 0 && a[2] < 3
        && a[3] > 0 && a[3] < 1e5
        && a[4] > 0 && a[4] < 100
        && a[5] > 0 && a[5] < 1e8); // All parameters (in particular rho_g, k, rho_th) must be positive and bounded
|
||||
}
|
||||
|
||||
template <typename BiasParameters>
|
||||
static inline void setup_default(BiasParameters ¶ms) {
|
||||
params[0] = 1;
|
||||
params[1] = 1.;
|
||||
params[2] = 1e-8;
|
||||
params[3] = 0.001;
|
||||
params[4] = 1.;
|
||||
params[5] = 1.;
|
||||
}
|
||||
|
||||
// Note: fwd_model and final_density arrays cannot be stored in this step, but
|
||||
// they can be used.
|
||||
template <
|
||||
class ForwardModel, typename FinalDensityArray,
|
||||
typename BiasParameters, typename MetaSelector = NoSelector>
|
||||
inline void prepare(
|
||||
ForwardModel &fwd_model, const FinalDensityArray &final_density,
|
||||
double const nmean_, const BiasParameters ¶ms,
|
||||
bool density_updated, MetaSelector _select = MetaSelector()) {
|
||||
// No need for complex preparation. Just copy parameters.
|
||||
nmean = params[0];
|
||||
alpha = params[1];
|
||||
epsilon = params[2];
|
||||
rho_g = params[3];
|
||||
k = params[4];
|
||||
rho_th = params[5];
|
||||
}
|
||||
|
||||
inline void cleanup() {
|
||||
// No need for cleanup
|
||||
}
|
||||
|
||||
// This is a helper function to determine the pure linear behaviour. It
// is helpful for setting up acceptable initial conditions in order to reduce severe transients.
|
||||
inline double get_linear_bias() const { return alpha; }
|
||||
|
||||
static inline double gradient_density_lambda(
|
||||
double nmean, double alpha, double epsilon, double rho_g, double k, double rho_th, double v,
|
||||
double g) {
|
||||
double const x = 1 + 1e-6 + v;
|
||||
double const rho = x / rho_g;
|
||||
double a = std::pow(x, alpha - 1);
|
||||
double b = std::pow(rho, -epsilon);
|
||||
double f = std::exp(-b);
|
||||
double h0 = std::exp(-k*(x-rho_th));
|
||||
double h1 = 1./(1. + h0);
|
||||
|
||||
return (alpha + epsilon * b + h1 * h0 * k * x) * a * f * nmean * h1*g;
|
||||
}
|
||||
|
||||
static inline double density_lambda(
|
||||
double nmean, double alpha, double epsilon, double rho_g, double k, double rho_th,
|
||||
double v) {
|
||||
double const x = 1 + 1e-6 + v;
|
||||
double rho = x / rho_g;
|
||||
double ret = nmean * std::pow(x, alpha) *
|
||||
1./(1. + std::exp(-k*(x-rho_th))) *
|
||||
std::exp(-std::pow(rho, -epsilon));
|
||||
return 1e-6 + ret;
|
||||
}
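// Derivation sketch: with x = 1 + 1e-6 + delta, b = (x / rho_g)^(-epsilon),
// h0 = exp(-k*(x - rho_th)) and the sigmoid S = 1/(1 + h0), the model above is
// n_g = 1e-6 + nmean * x^alpha * S * exp(-b), so
//   d n_g / d delta = nmean * (alpha + epsilon*b + k*x*h0*S) * x^(alpha - 1) * exp(-b) * S,
// which matches gradient_density_lambda() above.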
|
||||
|
||||
template <typename FinalDensityArray>
|
||||
inline auto compute_density(const FinalDensityArray &array) {
|
||||
return std::make_tuple(b_va_fused<double>(
|
||||
std::bind(
|
||||
density_lambda, nmean, alpha, epsilon, rho_g,k,rho_th,
|
||||
std::placeholders::_1),
|
||||
array));
|
||||
}
|
||||
|
||||
// This function returns an array-like object. That object
// depends on the existence of the final density array and of the gradient likelihood array.
// It is the job of the caller to ensure that temporary variables are not cleared
// before the final use.
// The return type is quite complex, so we let the compiler deduce it.
|
||||
template <
|
||||
typename FinalDensityArray, typename TupleGradientLikelihoodArray>
|
||||
inline auto apply_adjoint_gradient(
|
||||
const FinalDensityArray &array,
|
||||
TupleGradientLikelihoodArray grad_array) {
|
||||
return std::make_tuple(b_va_fused<double>(
|
||||
std::bind(
|
||||
gradient_density_lambda, nmean, alpha, epsilon,rho_g,k,rho_th,
|
||||
std::placeholders::_1, std::placeholders::_2),
|
||||
array, std::move(std::get<0>(grad_array))));
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace detail
|
||||
|
||||
using detail::BrokenPowerLawSigmoid;
|
||||
|
||||
} // namespace bias
|
||||
|
||||
} // namespace LibLSS
|
||||
|
||||
#endif
|
145
extra/borg/libLSS/physics/bias/double_power_law.hpp
Normal file
@@ -0,0 +1,145 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/bias/double_power_law.hpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_PHYSICS_DOUBLE_POWER_LAW_BIAS_HPP
|
||||
#define __LIBLSS_PHYSICS_DOUBLE_POWER_LAW_BIAS_HPP
|
||||
|
||||
#include <cmath>
|
||||
#include <boost/bind.hpp>
|
||||
#include "libLSS/tools/console.hpp"
|
||||
#include "libLSS/tools/fused_array.hpp"
|
||||
#include "libLSS/physics/bias/base.hpp"
|
||||
#include <boost/tuple/tuple.hpp>
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
namespace bias {
|
||||
|
||||
namespace detail {
|
||||
|
||||
struct DoubleBrokenPowerLaw {
|
||||
static constexpr const int numParams = 3;
|
||||
static constexpr const bool NmeanIsBias = false;
|
||||
|
||||
// We implement Mo & White Equation 15.38 (page 687)
|
||||
double L0, beta, gamma;
|
||||
double nmean;
|
||||
|
||||
selection::SimpleAdaptor selection_adaptor;
|
||||
|
||||
DoubleBrokenPowerLaw(LikelihoodInfo const& = LikelihoodInfo()) {}
|
||||
|
||||
template <typename Array>
|
||||
static inline bool check_bias_constraints(Array &&a) {
|
||||
Console::instance().print<LOG_DEBUG>(
|
||||
boost::format("Attempting biases: %g, %g, %g") % a[0] % a[1] %
a[2]); // this model has only three parameters
|
||||
return a[0] > 0 && a[0] < 5000 && a[1] > 0 && a[1] < 3 && a[2] > 0 &&
       a[2] < 3; // All three parameters must be positive and bounded
|
||||
}
|
||||
|
||||
template <typename BiasParameters>
|
||||
static inline void setup_default(BiasParameters ¶ms) {
|
||||
params[0] = 10;
|
||||
params[1] = 1.5;
|
||||
params[2] = 0.75;
|
||||
}
|
||||
|
||||
// Note: fwd_model and final_density arrays cannot be stored in this step, but
|
||||
// they can be used.
|
||||
template <
|
||||
class ForwardModel, typename FinalDensityArray,
|
||||
typename BiasParameters, typename MetaSelector = NoSelector>
|
||||
inline void prepare(
|
||||
ForwardModel &fwd_model, const FinalDensityArray &final_density,
|
||||
double const nmean_, const BiasParameters ¶ms,
|
||||
bool density_updated, MetaSelector _select = MetaSelector()) {
|
||||
// No need for complex preparation. Just copy parameters.
|
||||
L0 = params[0];
|
||||
beta = params[1];
|
||||
gamma = params[2];
|
||||
nmean = nmean_;
|
||||
}
|
||||
|
||||
inline void cleanup() {
|
||||
// No need for cleanup
|
||||
}
|
||||
|
||||
// This is a helper function to determine the pure linear behaviour. It
// is helpful for setting up acceptable initial conditions in order to reduce severe transients.
|
||||
inline double get_linear_bias() const { return beta; }
|
||||
|
||||
static constexpr double EPSILON_VOIDS = 1e-6;
|
||||
|
||||
static inline double gradient_density_lambda(
|
||||
double nmean, double L0, double beta, double gamma, double v,
|
||||
double g) {
|
||||
double rho = 1 + EPSILON_VOIDS + v;
|
||||
double x = rho / L0;
|
||||
double A = std::pow(x, beta - 1);
|
||||
double B = std::pow(x, beta - gamma - 1);
|
||||
double C = 1 + x * B;
|
||||
|
||||
return nmean * (beta * C - (beta - gamma) * B * x) * A /
|
||||
(C * C * L0) * g;
|
||||
}
|
||||
|
||||
static inline double density_lambda(
|
||||
double nmean, double L0, double beta, double gamma, double v) {
|
||||
double rho = 1 + EPSILON_VOIDS + v;
|
||||
double x = rho / L0;
|
||||
return nmean * std::pow(x, beta) / (1 + std::pow(x, beta - gamma));
|
||||
}
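// Derivation sketch: with x = (1 + EPSILON_VOIDS + delta) / L0 and C = 1 + x^(beta - gamma),
// the model above (Mo & White eq. 15.38) reads n_g = nmean * x^beta / C, so
//   d n_g / d delta = nmean * [beta*C - (beta - gamma)*x^(beta - gamma)] * x^(beta - 1) / (C^2 * L0),
// which is what gradient_density_lambda() computes (with B = x^(beta - gamma - 1), so that
// x^(beta - gamma) = x*B).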
|
||||
|
||||
template <typename FinalDensityArray>
|
||||
inline auto compute_density(const FinalDensityArray &array)
|
||||
-> decltype(std::make_tuple(b_va_fused<double>(
|
||||
boost::bind(
|
||||
density_lambda, nmean, L0, beta, gamma,
|
||||
boost::placeholders::_1),
|
||||
array))) {
|
||||
return std::make_tuple(b_va_fused<double>(
|
||||
boost::bind(
|
||||
density_lambda, nmean, L0, beta, gamma,
|
||||
boost::placeholders::_1),
|
||||
array));
|
||||
}
|
||||
|
||||
// This function returns an array-like object. That object
// depends on the existence of the final density array and of the gradient likelihood array.
// It is the job of the caller to ensure that temporary variables are not cleared
// before the final use.
// The return type is quite complex, so we let the compiler deduce it.
|
||||
template <
|
||||
typename FinalDensityArray, typename TupleGradientLikelihoodArray>
|
||||
inline auto apply_adjoint_gradient(
|
||||
const FinalDensityArray &array,
|
||||
TupleGradientLikelihoodArray grad_array)
|
||||
-> decltype(std::make_tuple(b_va_fused<double>(
|
||||
boost::bind(
|
||||
gradient_density_lambda, nmean, L0, beta, gamma,
|
||||
boost::placeholders::_1, boost::placeholders::_2),
|
||||
array, std::move(std::get<0>(grad_array))))) {
|
||||
return std::make_tuple(b_va_fused<double>(
|
||||
boost::bind(
|
||||
gradient_density_lambda, nmean, L0, beta, gamma,
|
||||
boost::placeholders::_1, boost::placeholders::_2),
|
||||
array, std::move(std::get<0>(grad_array))));
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace detail
|
||||
|
||||
using detail::DoubleBrokenPowerLaw;
|
||||
|
||||
} // namespace bias
|
||||
|
||||
} // namespace LibLSS
|
||||
|
||||
#endif
|
239
extra/borg/libLSS/physics/bias/downgrader.hpp
Normal file
@@ -0,0 +1,239 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/bias/downgrader.hpp
|
||||
Copyright (C) 2019-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_PHYSICS_DOWNGRADER_HPP
|
||||
# define __LIBLSS_PHYSICS_DOWNGRADER_HPP
|
||||
|
||||
/**
|
||||
* This header provides the implementation of a bias model wrapper that downgrades
* the resolution of another ("lower") bias model.
* A generic bias model must implement the following concept:
* a "selection_adaptor" functor, available from the object
* a "compute_density(final_density)" function accepting a virtual array and returning a tuple of virtual arrays
* an "apply_adjoint_gradient(final_density, gradient_likelihood_array)" function, also accepting virtual arrays and returning a virtual array
* a "prepare" function which is called before the other two calls, to allow for precomputing auxiliary complex fields
* a "cleanup" function which allows for tearing down any auxiliary fields and memory allocated in the prepare phase
|
||||
*/
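// Illustrative composition (hypothetical, not prescribed by this header): assuming the
// power-law model from power_law.hpp is visible as LibLSS::bias::PowerLaw, evaluating it
// on a degraded grid could look like
//
//   using DegradedPowerLaw =
//       LibLSS::bias::Downgrader<LibLSS::bias::PowerLaw,
//                                LibLSS::bias::DegradeGenerator<1, 1>>;
//
// The DegradeGenerator arguments select the level structure; the wrapper forwards
// setup_default(), cleanup() and the parameter count of the lower bias unchanged.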
|
||||
# include <cmath>
|
||||
# include <functional>
|
||||
# include "libLSS/tools/fused_array.hpp"
|
||||
# include <tuple>
|
||||
# include "libLSS/tools/phoenix_vars.hpp"
|
||||
# include <boost/phoenix/operator.hpp>
|
||||
# include <boost/phoenix/stl/cmath.hpp>
|
||||
# include "libLSS/tools/tuple_helper.hpp"
|
||||
# include "libLSS/physics/bias/base.hpp"
|
||||
# include "libLSS/physics/bias/level_combinator.hpp"
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
namespace bias {
|
||||
|
||||
namespace detail_downgrader {
|
||||
using namespace LibLSS::Combinator;
|
||||
|
||||
namespace ph = std::placeholders;
|
||||
|
||||
template <size_t... l>
|
||||
struct DegradeGenerator {
|
||||
typedef Levels<double, l...> Level_t;
|
||||
};
|
||||
|
||||
/**
|
||||
* Downgrader bias
|
||||
*/
|
||||
template <
|
||||
typename LowerBias, typename Generator = DegradeGenerator<1, 1, 1>>
|
||||
class Downgrader {
|
||||
public:
|
||||
// static constexpr const bool NmeanIsBias = LowerBias::NmeanIsBias;
|
||||
// static const auto numParams = LowerBias::numParams;
|
||||
|
||||
// This adaptor transforms an unselected galaxy density (with appropriate
|
||||
// auxiliary arrays) to a selected array. It must be a functor accepting two
|
||||
// parameters: a selection virtual array and a bias density virtual array.
|
||||
// selection::SimpleAdaptor selection_adaptor;
|
||||
|
||||
// double nmean;
|
||||
|
||||
// template <typename BiasParameters>
|
||||
// static inline void setup_default(BiasParameters ¶ms) {
|
||||
//
|
||||
// }
|
||||
|
||||
typedef typename Generator::Level_t Level_t;
|
||||
static constexpr const bool NmeanIsBias = LowerBias::NmeanIsBias;
|
||||
static constexpr const auto numParams = LowerBias::numParams;
|
||||
|
||||
|
||||
decltype(
|
||||
std::declval<LowerBias>().selection_adaptor) &selection_adaptor;
|
||||
|
||||
Level_t level;
|
||||
LowerBias lowerBias;
|
||||
|
||||
GhostPlanes<double, 2> ghosts;
|
||||
bool needInit;
|
||||
size_t N2;
|
||||
std::shared_ptr<U_Array<double, 3>> bias_density, ag_bias_density;
|
||||
|
||||
Downgrader(LikelihoodInfo const& = LikelihoodInfo())
|
||||
: needInit(true), selection_adaptor(lowerBias.selection_adaptor) {}
|
||||
|
||||
template <typename Array>
|
||||
inline bool check_bias_constraints(Array &&a) {
|
||||
return lowerBias.check_bias_constraints(a);
|
||||
}
|
||||
|
||||
template <typename Array>
|
||||
static inline void setup_default(Array &&a) {
|
||||
LowerBias::setup_default(a);
|
||||
}
|
||||
|
||||
// Note: fwd_model and final_density arrays cannot be stored in this step, but
|
||||
// they can be used.
|
||||
template <
|
||||
class ForwardModel, typename FinalDensityArray,
|
||||
typename BiasParameters, typename MetaSelect = NoSelector>
|
||||
inline void prepare(
|
||||
ForwardModel &fwd_model, const FinalDensityArray &final_density,
|
||||
double const _nmean, const BiasParameters ¶ms,
|
||||
bool density_updated, MetaSelect _select = MetaSelect()) {
|
||||
auto &mgr = fwd_model.lo_mgr;
|
||||
LIBLSS_AUTO_CONTEXT(LOG_DEBUG, ctx);
|
||||
|
||||
this->N2 = mgr->N2;
|
||||
|
||||
if (needInit) {
|
||||
auto box = fwd_model.get_box_model();
|
||||
MPI_Communication *comm = mgr->getComm();
|
||||
std::tuple<ssize_t, ssize_t> bounds[Level_t::numLevel];
|
||||
|
||||
for (int r = 0; r < Level_t::numLevel; r++) {
|
||||
size_t factor =
|
||||
Combinator::const_pow(2, Level_t::numLevel - 1 - r);
|
||||
ssize_t start =
|
||||
(mgr->N0 / factor) * comm->rank() / comm->size(); // FIXME
|
||||
ssize_t end = (mgr->N0 / factor) * (comm->rank() + 1) /
|
||||
comm->size(); // FIXME
|
||||
bounds[r] = std::make_tuple(start, end);
|
||||
ctx.format(
|
||||
"Factor %d for level %d (bounds=[%d - %d])", factor, r,
|
||||
std::get<0>(bounds[r]), std::get<1>(bounds[r]));
|
||||
}
|
||||
|
||||
level.allocate(
|
||||
box.N0, box.N1, box.N2, mgr->N2, mgr->startN0, mgr->localN0,
|
||||
bounds);
|
||||
level.setup(ghosts, fwd_model.communicator());
|
||||
|
||||
bias_density = std::make_shared<U_Array<double, 3>>(
|
||||
mgr->extents_real_strict());
|
||||
ag_bias_density = std::make_shared<U_Array<double, 3>>(
|
||||
mgr->extents_real_strict());
|
||||
needInit = false;
|
||||
}
|
||||
|
||||
lowerBias.prepare(
|
||||
fwd_model, final_density, _nmean, params, density_updated,
|
||||
_select);
|
||||
|
||||
fwrap(bias_density->get_array()) =
|
||||
std::get<0>(lowerBias.compute_density(final_density));
|
||||
|
||||
ghosts.synchronize(bias_density->get_array());
|
||||
// Now build the different levels from the planes.
|
||||
level.buildLevels(ghosts, bias_density->get_array());
|
||||
}
|
||||
|
||||
inline void cleanup() { lowerBias.cleanup(); }
|
||||
|
||||
template <typename FinalDensityArray>
|
||||
inline auto compute_density(const FinalDensityArray &array) {
|
||||
constexpr int numLevel = Level_t::numLevel;
|
||||
auto const &barray = bias_density->get_array();
|
||||
|
||||
return std::make_tuple(b_fused_idx<double, 3>(
|
||||
[this, &barray, numLevel](size_t i, size_t j, size_t k) {
|
||||
double out;
|
||||
if (k >= N2 / Combinator::const_pow(2, numLevel - 1)) {
|
||||
auto &cons = Console::instance();
|
||||
cons.format<LOG_ERROR>(
|
||||
"Going above limits with k=%d, numLevel=%d!", k,
|
||||
numLevel);
|
||||
return 0.0;
|
||||
}
|
||||
out = level.template get_density_level<Level_t::numLevel - 1>(
|
||||
barray, i, j, k);
|
||||
if (out < 0 || out == 0 || std::isnan(out) || std::isinf(out)) {
|
||||
auto &cons = Console::instance();
|
||||
//cons.c_assert(!std::isnan(out[numLevel-1]), "Nan in density");
|
||||
cons.format<LOG_ERROR>(
    "Invalid density value (%g, non-positive or NaN/Inf) at %dx%dx%d", out, i, j, k);
|
||||
MPI_Communication::instance()->abort();
|
||||
}
|
||||
return out;
|
||||
}));
|
||||
}
|
||||
|
||||
// This function returns an array-like object. That object
// depends on the existence of the final density array and of the gradient likelihood array.
// It is the job of the caller to ensure that temporary variables are not cleared
// before the final use.
// The return type is quite complex, so we let the compiler deduce it.
// The likelihood has the form L(b_0(delta, p), b_1(delta, p), ..., b_n(delta, p));
// here we take the tuple of gradients and collapse it into a gradient with respect to delta.
|
||||
template <
|
||||
typename FinalDensityArray, typename TupleGradientLikelihoodArray>
|
||||
inline auto apply_adjoint_gradient(
|
||||
const FinalDensityArray &array,
|
||||
TupleGradientLikelihoodArray grad_array) {
|
||||
constexpr int numLevel = Level_t::numLevel;
|
||||
BOOST_STATIC_ASSERT(
|
||||
(std::tuple_size<TupleGradientLikelihoodArray>::value == 1));
|
||||
auto &ag_array = ag_bias_density->get_array();
|
||||
auto const &grad = std::get<0>(grad_array);
|
||||
size_t startN0 = grad.index_bases()[0];
|
||||
size_t endN0 = startN0 + grad.shape()[0];
|
||||
size_t N1 = grad.shape()[1];
|
||||
size_t N2 = grad.shape()[2];
|
||||
|
||||
ghosts.clear_ghosts();
|
||||
level.clear_cache();
|
||||
array::fill(ag_array, 0);
|
||||
|
||||
# pragma omp parallel for collapse(3)
|
||||
for (size_t i = startN0; i < endN0; i++) {
|
||||
for (size_t j = 0; j < N1; j++) {
|
||||
for (size_t k = 0; k < N2; k++) {
|
||||
level.template push_ag_density_level<numLevel - 1>(
|
||||
grad[i][j][k], ag_array, i, j, k);
|
||||
}
|
||||
}
|
||||
}
|
||||
level.ag_buildLevels(ghosts, ag_array);
|
||||
ghosts.synchronize_ag(ag_array);
|
||||
|
||||
return lowerBias.apply_adjoint_gradient(
|
||||
array, std::make_tuple(std::cref(ag_array)));
|
||||
}
|
||||
};
|
||||
}; // namespace detail_downgrader
|
||||
|
||||
using detail_downgrader::DegradeGenerator;
|
||||
/// Import the Noop class into LibLSS::bias
|
||||
using detail_downgrader::Downgrader;
|
||||
|
||||
} // namespace bias
|
||||
|
||||
} // namespace LibLSS
|
||||
|
||||
#endif
|
||||
// ARES TAG: authors_num = 1
|
||||
// ARES TAG: name(0) = Guilhem Lavaux
|
||||
// ARES TAG: year(0) = 2019-2020
|
||||
// ARES TAG: email(0) = guilhem.lavaux@iap.fr
|
732
extra/borg/libLSS/physics/bias/eft_bias.hpp
Normal file
@@ -0,0 +1,732 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/bias/eft_bias.hpp
|
||||
Copyright (C) 2018-2019 Franz Elsner <>
|
||||
Copyright (C) 2019-2021 Fabian Schmidt <fabians@mpa-garching.mpg.de>
|
||||
Copyright (C) 2019-2021 Martin Reinecke <martin@mpa-garching.mpg.de>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
/*
|
||||
Key features:
|
||||
- implements bias fields \delta, \delta^2, (K_ij)^2, \laplace\delta
|
||||
- bias fields are precomputed for speed up during bias block sampling
|
||||
- biased field is returned in real space but BEFORE sharp-k filter
|
||||
- sharp-k filtered density is available in arrs->deltaLambda
|
||||
- Gaussian priors on bias parameters can be set using
|
||||
"bias_prior_mean", "bias_prior_sigma" in "likelihood" section of ini file,
|
||||
as comma-separated float values
|
||||
|
||||
A word on numParams = 6: we have
|
||||
nmean (should be fixed to 1),
|
||||
4 bias parameters,
|
||||
sigma_0 (sqrt of constant noise variance).
|
||||
|
||||
This program is free software; you can redistribute it and/or modify it
|
||||
under the terms of either the CeCILL license or the GNU General Public
|
||||
license, as included with the software package.
|
||||
|
||||
The text of the license is located in Licence_CeCILL_V2.1-en.txt
|
||||
and GPL.txt in the root directory of the source package.
|
||||
|
||||
*/
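// For illustration only: a minimal sketch of the "likelihood" section keys
// described above (key names come from this header; the exact ini layout is
// an assumption, and the values are illustrative). Values are comma-separated,
// one per parameter, in the order nmean, b1, b2, bk, r2, sigma_0:
//
//   [likelihood]
//   bias_prior_mean  = 1.0, 1.4, 0.8, 0.5, 0.2, 1.0
//   bias_prior_sigma = -1.0, 0.5, 0.5, 0.5, 0.5, -1.0
//
// A sigma value <= 0 disables the Gaussian prior for that parameter.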
|
||||
#ifndef __LIBLSS_PHYSICS_EFT_BIAS_HPP
|
||||
# define __LIBLSS_PHYSICS_EFT_BIAS_HPP
|
||||
|
||||
// This header provides the implementation of the LSS bias model to second order in PT.
|
||||
|
||||
# include <cmath>
|
||||
# include "libLSS/tools/fused_array.hpp"
|
||||
# include <tuple>
|
||||
# include "libLSS/tools/phoenix_vars.hpp"
|
||||
# include <boost/phoenix/operator.hpp>
|
||||
# include <boost/phoenix/stl/cmath.hpp>
|
||||
# include "libLSS/tools/tuple_helper.hpp"
|
||||
# include "libLSS/tools/ptree_vectors.hpp"
|
||||
# include "libLSS/tools/string_tools.hpp"
|
||||
# include "libLSS/physics/bias/base.hpp"
|
||||
# include "libLSS/physics/bias/power_law.hpp"
|
||||
# include "libLSS/tools/fftw_allocator.hpp"
|
||||
# include "libLSS/samplers/core/powerspec_tools.hpp"
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
namespace bias {
|
||||
|
||||
namespace detail_EFTBias {
|
||||
|
||||
namespace ph = std::placeholders;
|
||||
|
||||
using DFT_Manager = FFTW_Manager_3d<double>;
|
||||
using U_Array = Uninit_FFTW_Real_Array;
|
||||
using U_CArray = Uninit_FFTW_Complex_Array;
|
||||
using U_ArrayRef = U_Array::array_type;
|
||||
using U_CArrayRef = U_CArray::array_type;
|
||||
|
||||
template <typename T>
|
||||
struct myarr {
|
||||
T arr;
|
||||
typename T::array_type ref;
|
||||
template <typename T1, typename T2>
|
||||
myarr(const T1 &extent, const T2 &alloc)
|
||||
: arr(extent, alloc), ref(arr.get_array()) {}
|
||||
};
|
||||
|
||||
template <bool SHARP_THRESHOLDER>
|
||||
class EFTBias {
|
||||
public:
|
||||
static constexpr const bool NmeanIsBias = true;
|
||||
static constexpr int numParams = 6;
|
||||
|
||||
protected:
|
||||
struct Arrs {
|
||||
MPI_Communication &comm;
|
||||
DFT_Manager &mgr;
|
||||
myarr<U_Array> deltaLambda, delta_sqr, tidal_sqr, laplace_delta,
|
||||
dlogL_ddelta;
|
||||
myarr<U_Array> tidal_00, tidal_01, tidal_02, tidal_11, tidal_12,
|
||||
tidal_22;
|
||||
FCalls::plan_type analysis_plan;
|
||||
FCalls::plan_type synthesis_plan;
|
||||
template <class Mgr>
|
||||
Arrs(MPI_Communication &comm_, Mgr &mgr_)
|
||||
: comm(comm_), mgr(mgr_),
|
||||
deltaLambda(mgr.extents_real(), mgr.allocator_real),
|
||||
delta_sqr(mgr.extents_real(), mgr.allocator_real),
|
||||
tidal_sqr(mgr.extents_real(), mgr.allocator_real),
|
||||
laplace_delta(mgr.extents_real(), mgr.allocator_real),
|
||||
dlogL_ddelta(mgr.extents_real(), mgr.allocator_real),
|
||||
tidal_00(mgr.extents_real(), mgr.allocator_real),
|
||||
tidal_01(mgr.extents_real(), mgr.allocator_real),
|
||||
tidal_02(mgr.extents_real(), mgr.allocator_real),
|
||||
tidal_11(mgr.extents_real(), mgr.allocator_real),
|
||||
tidal_12(mgr.extents_real(), mgr.allocator_real),
|
||||
tidal_22(mgr.extents_real(), mgr.allocator_real) {
|
||||
myarr<U_Array> tmp(mgr.extents_real(), mgr.allocator_real);
|
||||
myarr<U_CArray> ctmp(mgr.extents_complex(), mgr.allocator_complex);
|
||||
analysis_plan =
|
||||
mgr.create_r2c_plan(tmp.ref.data(), ctmp.ref.data());
|
||||
synthesis_plan =
|
||||
mgr.create_c2r_plan(ctmp.ref.data(), tmp.ref.data());
|
||||
}
|
||||
};
|
||||
std::unique_ptr<Arrs> arrs;
|
||||
|
||||
static constexpr double EPSILON_VOIDS = 1e-6; // for thresholder
|
||||
|
||||
long N0, N1, N2, N2_HC;
|
||||
long startN0, localN0;
|
||||
double L0, L1, L2;
|
||||
// nmean and bias/likelihood parameter:
|
||||
double nmean;
|
||||
double b1, b2;
|
||||
double bk;
|
||||
double r2;
|
||||
double sigma_0;
|
||||
// cutoff:
|
||||
double EFT_Lambda;
|
||||
// sigma0 limits
|
||||
double sigma0min, sigma0max;
|
||||
|
||||
// priors on bias/likelihood parameters
|
||||
// - if priorsigma <= 0, means no prior
|
||||
std::vector<double> priormean;
|
||||
std::vector<double> priorsigma;
|
||||
// store prior ln P
|
||||
double lnPprior;
|
||||
|
||||
// hard-coded renormalization; this should correspond to RMS of
|
||||
// Eulerian density field AFTER cut
|
||||
// - no renorm:
|
||||
static constexpr double rmsdelta_renorm = 0.;
|
||||
// - 2LPT, Lambda=0.1:
|
||||
// static constexpr double rmsdelta_renorm = 0.413;
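// For reference (no new behavior): with a nonzero rmsdelta_renorm, the
// effective linear bias used in compute_density() below is
//   b1eff = b1 - (68/21) * (b2 + (2/3) * bk) * rmsdelta_renorm^2,
// so the default rmsdelta_renorm = 0 leaves b1 unrenormalized.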
|
||||
|
||||
// FS: disable thresholders; in the future, they should be removed properly
|
||||
static inline double thresholder(double a, double b) {
|
||||
return a;
|
||||
// if (a>=b) return a;
|
||||
// return SHARP_THRESHOLDER ? b : b * std::exp((-b/a + 1));
|
||||
}
|
||||
static inline double adjoint_thresholder(double a, double b, double g) {
|
||||
return g;
|
||||
// if (a>=b) return g;
|
||||
// return SHARP_THRESHOLDER ? 0 : ( b*b / (a*a) * std::exp((1-b/a)) * g);
|
||||
}
|
||||
|
||||
// construct bias field
|
||||
static inline double density_lambda(
|
||||
double nmean, double b1eff, double b2, double bk, double r2,
|
||||
double delta, double delta_sqr, double tidal_sqr,
|
||||
double laplace_delta) {
|
||||
double rho = b1eff * delta + b2 * delta_sqr + bk * tidal_sqr +
|
||||
r2 * laplace_delta;
|
||||
return nmean * thresholder(1.0 + rho, EPSILON_VOIDS);
|
||||
}
|
||||
|
||||
void fix_symmetry(myarr<U_CArray> &ctmp) const {
|
||||
if ((startN0 == 0) && (localN0 > 0)) {
|
||||
ctmp.ref[0][0][0] = 0.0;
|
||||
ctmp.ref[0][0][N2_HC - 1] = 0.0;
|
||||
ctmp.ref[0][N1 / 2][0] = 0.0;
|
||||
ctmp.ref[0][N1 / 2][N2_HC - 1] = 0.0;
|
||||
}
|
||||
|
||||
if ((startN0 <= N0 / 2) && (startN0 + localN0 > N0 / 2)) {
|
||||
ctmp.ref[N0 / 2][0][0] = 0.0;
|
||||
ctmp.ref[N0 / 2][0][N2_HC - 1] = 0.0;
|
||||
ctmp.ref[N0 / 2][N1 / 2][0] = 0.0;
|
||||
ctmp.ref[N0 / 2][N1 / 2][N2_HC - 1] = 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
// computes (axis0,axis1) component of tidal field:
|
||||
// K_ij = \Del_ij \delta
|
||||
// where
|
||||
// \Del_ij \equiv \partial_i \partial_j/\nabla^2 - 1/3 \delta_ij
|
||||
//
|
||||
// and i-1 = axis0, j-1 = axis1, \delta = array_complex_in
|
||||
// - input array is in Fourier space (complex)
|
||||
// - output array is in real space
|
||||
void get_spatial_derivative_array3d_tidal(
|
||||
U_ArrayRef &deriv_array_real_out,
|
||||
const U_CArrayRef &array_complex_in, const int axis0,
|
||||
const int axis1) const {
|
||||
|
||||
myarr<U_CArray> ctmp(
|
||||
arrs->mgr.extents_complex(), arrs->mgr.allocator_complex);
|
||||
double fft_normalization = 1.0 / (N0 * N1 * N2);
|
||||
|
||||
Console::instance().print<LOG_DEBUG>("Spatial derivative: dtidal");
|
||||
array::copyArray3d(ctmp.ref, array_complex_in);
|
||||
|
||||
Console::instance().c_assert(
|
||||
(axis0 >= 0) && (axis0 <= 2) && (axis1 >= 0) && (axis1 <= 2),
|
||||
"Invalid 'axis0/1' argument in "
|
||||
"'get_spatial_derivative_array3d'");
|
||||
|
||||
double delta_ij = (axis0 == axis1) ? 1.0 / 3.0 : 0.0;
|
||||
|
||||
# pragma omp parallel for collapse(3)
|
||||
for (size_t i = startN0; i < startN0 + localN0; i++)
|
||||
for (size_t j = 0; j < N1; j++)
|
||||
for (size_t k = 0; k < N2_HC; k++) {
|
||||
double kk[3] = {
|
||||
kmode(i, N0, L0), kmode(j, N1, L1), kmode(k, N2, L2)};
|
||||
|
||||
double ksquared = kk[0] * kk[0] + kk[1] * kk[1] + kk[2] * kk[2];
|
||||
double factor = (kk[axis0] * kk[axis1] / ksquared - delta_ij) *
|
||||
fft_normalization;
|
||||
ctmp.ref[i][j][k] *= factor;
|
||||
}
|
||||
|
||||
fix_symmetry(ctmp);
|
||||
|
||||
// transform ctmp array to real space -> deriv_array_real_out
|
||||
arrs->mgr.execute_c2r(
|
||||
arrs->synthesis_plan, ctmp.ref.data(),
|
||||
deriv_array_real_out.data());
|
||||
}
|
||||
|
||||
// computes Laplacian of density field, \nabla^2 \delta
|
||||
// where \delta = array_complex_in
|
||||
// - input array is in Fourier space (complex)
|
||||
// - output array is in real space
|
||||
void get_spatial_derivative_array3d_laplace(
|
||||
U_ArrayRef &deriv_array_real_out,
|
||||
const U_CArrayRef &array_complex_in) const {
|
||||
|
||||
myarr<U_CArray> ctmp(
|
||||
arrs->mgr.extents_complex(), arrs->mgr.allocator_complex);
|
||||
double fft_normalization = 1.0 / (N0 * N1 * N2);
|
||||
|
||||
Console::instance().print<LOG_DEBUG>("Spatial derivative: dlaplace");
|
||||
array::copyArray3d(ctmp.ref, array_complex_in);
|
||||
|
||||
# pragma omp parallel for collapse(3)
|
||||
for (size_t i = startN0; i < startN0 + localN0; i++)
|
||||
for (size_t j = 0; j < N1; j++)
|
||||
for (size_t k = 0; k < N2_HC; k++) {
|
||||
double kk[3] = {
|
||||
kmode(i, N0, L0), kmode(j, N1, L1), kmode(k, N2, L2)};
|
||||
|
||||
double ksquared = kk[0] * kk[0] + kk[1] * kk[1] + kk[2] * kk[2];
|
||||
ctmp.ref[i][j][k] *= -1.0 * ksquared * fft_normalization;
|
||||
}
|
||||
|
||||
fix_symmetry(ctmp);
|
||||
|
||||
arrs->mgr.execute_c2r(
|
||||
arrs->synthesis_plan, ctmp.ref.data(),
|
||||
deriv_array_real_out.data());
|
||||
}
|
||||
|
||||
// computes \Del_ij (a_1 a_2), where
|
||||
// a_1 = array1_real_in, a_2 = array2_real_in, and i-1 = axis0, j-1 = axis1
|
||||
// - both input arrays and output array are real
|
||||
void get_density_derivative_array3d_dtidal(
|
||||
U_ArrayRef &deriv_array_real_out, const U_ArrayRef &array1_real_in,
|
||||
const U_ArrayRef &array2_real_in, const int axis0,
|
||||
const int axis1) const {
|
||||
myarr<U_Array> tmp(
|
||||
arrs->mgr.extents_real(), arrs->mgr.allocator_real);
|
||||
myarr<U_CArray> ctmp(
|
||||
arrs->mgr.extents_complex(), arrs->mgr.allocator_complex);
|
||||
fwrap(tmp.ref) = fwrap(array1_real_in) * fwrap(array2_real_in);
|
||||
|
||||
arrs->mgr.execute_r2c(
|
||||
arrs->analysis_plan, tmp.ref.data(), ctmp.ref.data());
|
||||
|
||||
get_spatial_derivative_array3d_tidal(
|
||||
deriv_array_real_out, ctmp.ref, axis0, axis1);
|
||||
}
|
||||
|
||||
// computes \nabla^2 a, where a = array1_real_in
|
||||
// - both input arrays and output array are real
|
||||
// - copy operations necessary to preserve input array (r2c destroys input array)
|
||||
void get_density_derivative_array3d_dlaplace(
|
||||
U_ArrayRef &deriv_array_real_out,
|
||||
const U_ArrayRef &array1_real_in) const {
|
||||
|
||||
myarr<U_Array> tmp(
|
||||
arrs->mgr.extents_real(), arrs->mgr.allocator_real);
|
||||
myarr<U_CArray> ctmp(
|
||||
arrs->mgr.extents_complex(), arrs->mgr.allocator_complex);
|
||||
array::copyArray3d(tmp.ref, array1_real_in);
|
||||
|
||||
arrs->mgr.execute_r2c(
|
||||
arrs->analysis_plan, tmp.ref.data(), ctmp.ref.data());
|
||||
|
||||
get_spatial_derivative_array3d_laplace(
|
||||
deriv_array_real_out, ctmp.ref);
|
||||
}
|
||||
|
||||
// apply sharp-k cut to field: modes with k > limit are set to zero
|
||||
// - also sets to zero \vec k==0 mode -> subtract mean
|
||||
void
|
||||
sharpk_filter(U_CArrayRef &field, double limit, double norm) const {
|
||||
# pragma omp parallel for collapse(3)
|
||||
for (size_t i = startN0; i < startN0 + localN0; i++)
|
||||
for (size_t j = 0; j < N1; j++)
|
||||
for (size_t k = 0; k < N2_HC; k++) {
|
||||
double kk[3] = {
|
||||
kmode(i, N0, L0), kmode(j, N1, L1), kmode(k, N2, L2)};
|
||||
|
||||
double ksquared = kk[0] * kk[0] + kk[1] * kk[1] + kk[2] * kk[2];
|
||||
if (ksquared > limit * limit || ksquared < 1.e-15)
|
||||
field[i][j][k] = 0.;
|
||||
else {
|
||||
field[i][j][k] *= norm;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// evaluate prior for current parameter values
|
||||
template <typename BiasParameters>
|
||||
double getPriorLogProbability(const BiasParameters &par) const {
|
||||
double f = 0.;
|
||||
// note: start from i=1, since nmean is ignored throughout
|
||||
for (size_t i = 1; i < numParams; i++) {
|
||||
if (priorsigma[i] > 0.) {
|
||||
double D = par[i] - priormean[i];
|
||||
double sig2 = priorsigma[i] * priorsigma[i];
|
||||
f += D * D / sig2;
|
||||
f += log(sig2); // normalization (modulo 2pi)
|
||||
}
|
||||
}
|
||||
|
||||
return -0.5 * f;
|
||||
}
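// In other words, the prior evaluated above is (2pi constant dropped)
//   ln P = -1/2 * sum_{i >= 1, sigma_i > 0} [ (par_i - mu_i)^2 / sigma_i^2 + ln sigma_i^2 ].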
|
||||
|
||||
// fill bias fields in 'arrs'
|
||||
void prepare_bias_model_arrays(const U_ArrayRef &delta) {
|
||||
// myarr<U_Array> tmp(arrs->mgr.extents_real(), arrs->mgr.allocator_real);
|
||||
myarr<U_CArray> ctmp(
|
||||
arrs->mgr.extents_complex(), arrs->mgr.allocator_complex);
|
||||
|
||||
fwrap(LibLSS::array::slice_array(
|
||||
arrs->deltaLambda.ref, arrs->mgr.strict_range())) = fwrap(delta);
|
||||
|
||||
// ctmp contains delta in Fourier space
|
||||
arrs->mgr.execute_r2c(
|
||||
arrs->analysis_plan, arrs->deltaLambda.ref.data(),
|
||||
ctmp.ref.data());
|
||||
|
||||
// Eulerian sharpk filter
|
||||
sharpk_filter(ctmp.ref, EFT_Lambda, 1.);
|
||||
|
||||
// compute \nabla^2 \delta and components of tidal field, all in real space
|
||||
get_spatial_derivative_array3d_laplace(
|
||||
arrs->laplace_delta.ref, ctmp.ref);
|
||||
get_spatial_derivative_array3d_tidal(
|
||||
arrs->tidal_00.ref, ctmp.ref, 0, 0);
|
||||
get_spatial_derivative_array3d_tidal(
|
||||
arrs->tidal_01.ref, ctmp.ref, 0, 1);
|
||||
get_spatial_derivative_array3d_tidal(
|
||||
arrs->tidal_02.ref, ctmp.ref, 0, 2);
|
||||
get_spatial_derivative_array3d_tidal(
|
||||
arrs->tidal_11.ref, ctmp.ref, 1, 1);
|
||||
get_spatial_derivative_array3d_tidal(
|
||||
arrs->tidal_12.ref, ctmp.ref, 1, 2);
|
||||
get_spatial_derivative_array3d_tidal(
|
||||
arrs->tidal_22.ref, ctmp.ref, 2, 2);
|
||||
|
||||
// iFFT of ctmp to get sharp-k filtered density itself -> deltaLambda
|
||||
// -- normalization is applied below
|
||||
arrs->mgr.execute_c2r(
|
||||
arrs->synthesis_plan, ctmp.ref.data(),
|
||||
arrs->deltaLambda.ref.data());
|
||||
const double fft_normalization = 1.0 / (N0 * N1 * N2);
|
||||
|
||||
// (K_ij)^2, delta^2
|
||||
// -- FS hand-looped version:
|
||||
# pragma omp parallel for collapse(3)
|
||||
for (size_t n0 = startN0; n0 < startN0 + localN0; n0++)
|
||||
for (size_t n1 = 0; n1 < N1; n1++)
|
||||
for (size_t n2 = 0; n2 < N2; n2++) {
|
||||
// K^2
|
||||
// - notice the factor of 2 in front of off-diagonal terms
|
||||
double K2 = arrs->tidal_00.ref[n0][n1][n2] *
|
||||
arrs->tidal_00.ref[n0][n1][n2] +
|
||||
arrs->tidal_11.ref[n0][n1][n2] *
|
||||
arrs->tidal_11.ref[n0][n1][n2] +
|
||||
arrs->tidal_22.ref[n0][n1][n2] *
|
||||
arrs->tidal_22.ref[n0][n1][n2] +
|
||||
2. * (arrs->tidal_01.ref[n0][n1][n2] *
|
||||
arrs->tidal_01.ref[n0][n1][n2] +
|
||||
arrs->tidal_02.ref[n0][n1][n2] *
|
||||
arrs->tidal_02.ref[n0][n1][n2] +
|
||||
arrs->tidal_12.ref[n0][n1][n2] *
|
||||
arrs->tidal_12.ref[n0][n1][n2]);
|
||||
arrs->tidal_sqr.ref[n0][n1][n2] = K2;
|
||||
|
||||
// delta
|
||||
arrs->deltaLambda.ref[n0][n1][n2] *= fft_normalization;
|
||||
|
||||
// delta^2
|
||||
double d = arrs->deltaLambda.ref[n0][n1][n2];
|
||||
arrs->delta_sqr.ref[n0][n1][n2] = d * d;
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
// This adaptor transforms an unselected galaxy density (with appropriate
|
||||
// auxiliary arrays) to a selected array. It must be a functor accepting two
|
||||
// parameters: a selection virtual array and a bias density virtual array.
|
||||
// - SimpleAdaptor multiplies the first returned field of compute_density by the mask/selection array,
// leaving the other return values untouched
|
||||
selection::SimpleAdaptor selection_adaptor;
|
||||
|
||||
EFTBias(LikelihoodInfo const &info = LikelihoodInfo()) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("EFTBias constructor");
|
||||
|
||||
// get Lambda
|
||||
EFT_Lambda = Likelihood::query<double>(info, "EFT_Lambda");
|
||||
|
||||
// get sigma0 limits
|
||||
sigma0min = Likelihood::query<double>(info, "sigma0_min");
|
||||
if (!(sigma0min > 0.))
|
||||
sigma0min = 0.;
|
||||
sigma0max = Likelihood::query<double>(info, "sigma0_max");
|
||||
if (!(sigma0max > sigma0min))
|
||||
sigma0max = 1.e30;
|
||||
ctx.format("sigma0 limits: [%g, %g]", sigma0min, sigma0max);
|
||||
|
||||
// reset priors
|
||||
lnPprior = 0.;
|
||||
priormean.resize(numParams);
|
||||
priorsigma.resize(numParams);
|
||||
for (size_t i = 0; i < numParams; i++) {
|
||||
priormean[i] = 0.;
|
||||
priorsigma[i] = -1.;
|
||||
}
|
||||
|
||||
// get prior mean, sigma if available
|
||||
std::string smean =
|
||||
Likelihood::query<std::string>(info, "bias_prior_mean");
|
||||
if (smean.length()) {
|
||||
auto bias_double = string_as_vector<double>(smean, ", ");
|
||||
for (size_t i = 0; i < numParams; i++) {
|
||||
priormean[i] = i < bias_double.size() ? bias_double[i] : 0.;
|
||||
}
|
||||
ctx.print(
|
||||
"EFTBias: Set the bias prior mean to [" + to_string(priormean) +
|
||||
"]");
|
||||
}
|
||||
|
||||
std::string ssigma =
|
||||
Likelihood::query<std::string>(info, "bias_prior_sigma");
|
||||
if (ssigma.length()) {
|
||||
auto bias_double = string_as_vector<double>(ssigma, ", ");
|
||||
for (size_t i = 0; i < numParams; i++) {
|
||||
priorsigma[i] = i < bias_double.size() ? bias_double[i] : 0.;
|
||||
}
|
||||
ctx.print(
|
||||
"EFTBias: Set the bias prior sigma to [" +
|
||||
to_string(priorsigma) + "]");
|
||||
}
|
||||
}
|
||||
|
||||
template <typename BiasParameters>
|
||||
static inline void setup_default(BiasParameters ¶ms) {
|
||||
params[0] = 1.;
|
||||
params[1] = 1.4;
|
||||
params[2] = 0.8;
|
||||
params[3] = 0.5;
|
||||
params[4] = 0.2;
|
||||
params[5] = 1.; //sigma_0
|
||||
}
|
||||
|
||||
// Note: fwd_model and final_density arrays cannot be stored in this step. But
|
||||
// they can be used.
|
||||
template <
|
||||
class ForwardModel, typename FinalDensityArray,
|
||||
typename BiasParameters, typename MetaSelect = NoSelector>
|
||||
inline void prepare(
|
||||
ForwardModel &fwd_model, const FinalDensityArray &final_density,
|
||||
double const _nmean, const BiasParameters ¶ms,
|
||||
bool density_changed, MetaSelect _select = MetaSelect()) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("EFTBias preparation");
|
||||
|
||||
// allocate bias fields
|
||||
if (arrs == nullptr)
|
||||
arrs.reset(new Arrs(
|
||||
*(fwd_model.lo_mgr->getComm()), *(fwd_model.lo_mgr.get())));
|
||||
|
||||
// fill variables
|
||||
N0 = arrs->mgr.N0;
|
||||
N1 = arrs->mgr.N1;
|
||||
N2 = arrs->mgr.N2;
|
||||
N2_HC = N2 / 2 + 1;
|
||||
startN0 = arrs->mgr.startN0;
|
||||
localN0 = arrs->mgr.localN0;
|
||||
L0 = fwd_model.get_box_model().L0;
|
||||
L1 = fwd_model.get_box_model().L1;
|
||||
L2 = fwd_model.get_box_model().L2;
|
||||
nmean = params[0];
|
||||
b1 = params[1];
|
||||
b2 = params[2];
|
||||
bk = params[3];
|
||||
r2 = params[4];
|
||||
sigma_0 = params[5];
|
||||
|
||||
// compute prior
|
||||
lnPprior = getPriorLogProbability(params);
|
||||
|
||||
ctx.format("Got a box %dx%dx%d (%gx%gx%g)", N0, N1, N2, L0, L1, L2);
|
||||
if (density_changed) {
|
||||
// prepare density squared, Laplace delta, and tidal field squared
|
||||
// - note that these fields have nonzero mean, but this is removed in likelihood eft::sharpk_filter
|
||||
ctx.print("Prepare the arrays");
|
||||
prepare_bias_model_arrays(final_density);
|
||||
|
||||
// compute variance of delta, delta^2, K^2 for checking
|
||||
// (note this is BEFORE sharp-k cut)
|
||||
double Md = 0., Md2 = 0., MK2 = 0.;
|
||||
double Vd = 0., Vd2 = 0., VK2 = 0.;
|
||||
# pragma omp parallel for collapse(3) reduction(+ : Md, Md2, MK2, Vd, Vd2, VK2)
|
||||
for (size_t n0 = startN0; n0 < startN0 + localN0; n0++)
|
||||
for (size_t n1 = 0; n1 < N1; n1++)
|
||||
for (size_t n2 = 0; n2 < N2; n2++) {
|
||||
Md += final_density[n0][n1][n2];
|
||||
Md2 += arrs->delta_sqr.ref[n0][n1][n2];
|
||||
MK2 += arrs->tidal_sqr.ref[n0][n1][n2];
|
||||
Vd += pow(final_density[n0][n1][n2], 2.);
|
||||
Vd2 += pow(arrs->delta_sqr.ref[n0][n1][n2], 2.);
|
||||
VK2 += pow(arrs->tidal_sqr.ref[n0][n1][n2], 2.);
|
||||
}
|
||||
double Md_glob = 0., Md2_glob = 0., MK2_glob = 0.;
|
||||
arrs->comm.all_reduce_t(&Md, &Md_glob, 1, MPI_SUM);
|
||||
arrs->comm.all_reduce_t(&Md2, &Md2_glob, 1, MPI_SUM);
|
||||
arrs->comm.all_reduce_t(&MK2, &MK2_glob, 1, MPI_SUM);
|
||||
Md_glob /= double(N0 * N1 * N2);
|
||||
Md2_glob /= double(N0 * N1 * N2);
|
||||
MK2_glob /= double(N0 * N1 * N2);
|
||||
double Vd_glob = 0., Vd2_glob = 0., VK2_glob = 0.;
|
||||
arrs->comm.all_reduce_t(&Vd, &Vd_glob, 1, MPI_SUM);
|
||||
arrs->comm.all_reduce_t(&Vd2, &Vd2_glob, 1, MPI_SUM);
|
||||
arrs->comm.all_reduce_t(&VK2, &VK2_glob, 1, MPI_SUM);
|
||||
Vd_glob = Vd_glob / double(N0 * N1 * N2) - Md_glob * Md_glob;
|
||||
Vd2_glob = Vd2_glob / double(N0 * N1 * N2) - Md2_glob * Md2_glob;
|
||||
VK2_glob = VK2_glob / double(N0 * N1 * N2) - MK2_glob * MK2_glob;
|
||||
ctx.format(
|
||||
"Mean of delta (BEFORE), delta^2, K^2 (AFTER Eulerian sharp-k "
|
||||
"cut): %.5e, %.5e, %.5e (Lambda = %.2e)",
|
||||
Md_glob, Md2_glob, MK2_glob, EFT_Lambda);
|
||||
ctx.format(
|
||||
"Variance of delta (BEFORE), delta^2, K^2 (AFTER Eulerian "
|
||||
"sharp-k cut): %.5e, %.5e, %.5e (Lambda = %.2e)",
|
||||
Vd_glob, Vd2_glob, VK2_glob, EFT_Lambda);
|
||||
}
|
||||
}
|
||||
|
||||
inline void cleanup() {
|
||||
// Array deallocation only in final destructor
|
||||
}
|
||||
|
||||
template <typename Array>
|
||||
inline bool check_bias_constraints(Array &¶ms) {
|
||||
// enforce sigma0 within parameter limits
|
||||
// FS: - while testing, also force b1 > 0;
|
||||
// just remember to remove this before running on voids ;-)
|
||||
return (
|
||||
params[5] < sigma0max && params[5] > sigma0min && params[1] > 0.);
|
||||
}
|
||||
|
||||
// This function returns an array-like array. That array
|
||||
// depends on the existence of the final density array.
|
||||
// The return type is quite complex. Let the compiler decide.
|
||||
template <typename FinalDensityArray>
|
||||
inline auto compute_density(const FinalDensityArray &array) const {
|
||||
// add rough renormalization
|
||||
double b1eff =
|
||||
(b1 - (b2 + 2. / 3. * bk) * 68. / 21. * rmsdelta_renorm *
|
||||
rmsdelta_renorm);
|
||||
return std::make_tuple(
|
||||
b_va_fused<double>(
|
||||
std::bind(
|
||||
// bind biased-field function density_lambda to bias
|
||||
// parameters and bias fields
|
||||
density_lambda, nmean, b1eff, b2, bk, r2, ph::_1, ph::_2,
|
||||
ph::_3, ph::_4),
|
||||
// Note that no sharp-k cut is applied here to the bias fields, including array = density.
// This cut is applied in EFTlikelihood.
|
||||
array, arrs->delta_sqr.ref, arrs->tidal_sqr.ref,
|
||||
arrs->laplace_delta.ref),
|
||||
*LibLSS::constant<double, 3>(
|
||||
sigma_0, arrs->mgr.extents_real_strict()),
|
||||
*LibLSS::constant<double, 3>(
|
||||
lnPprior, arrs->mgr.extents_real_strict()));
|
||||
}
|
||||
|
||||
// This function returns an array-like array. That array
|
||||
// depends on the existence of the final density array and the gradient likelihood array.
|
||||
// It is the job of the caller to ensure that temporary variables are not cleared
|
||||
// before the final use.
|
||||
// The return type is quite complex. Let the compiler decide.
|
||||
// L(b_0(delta, p), b_1(delta, p), ..., b_n(delta, p))
|
||||
// Now we take a tuple of gradient and collapse this to a gradient of delta.
|
||||
//
|
||||
// FS: we pre-compute the bias fields (see prepare( )) but not their adjoints.
|
||||
// Does this make sense? I guess adjoint is only called when phases are changed,
|
||||
// in which case we anyway have to recompute; bias fields are also needed in
|
||||
// block sampling of bias parameters, where their precomputation helps.
|
||||
template <
|
||||
typename FinalDensityArray, typename TupleGradientLikelihoodArray>
|
||||
auto apply_adjoint_gradient(
|
||||
const FinalDensityArray &final_density,
|
||||
TupleGradientLikelihoodArray grad_array) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("EFTBias gradient computation");
|
||||
|
||||
ctx.print("Transfer the input gradient");
|
||||
|
||||
myarr<U_Array> dlogL_drho(
|
||||
arrs->mgr.extents_real(), arrs->mgr.allocator_real);
|
||||
LibLSS::copy_array_rv(
|
||||
array::slice_array((dlogL_drho.ref), arrs->mgr.strict_range()),
|
||||
std::get<0>(grad_array));
|
||||
ctx.print("Data backed up");
|
||||
|
||||
myarr<U_Array> tmp(
|
||||
arrs->mgr.extents_real(), arrs->mgr.allocator_real),
|
||||
// this will contain functional derivative of K^2 term:
|
||||
deriv_sum(arrs->mgr.extents_real(), arrs->mgr.allocator_real),
|
||||
// this will contain functional derivative of \nabla^2\delta term:
|
||||
dlaplace_delta(
|
||||
arrs->mgr.extents_real(), arrs->mgr.allocator_real);
|
||||
|
||||
// Functional derivatives of fields under derivative operators are
|
||||
// treated through integration by parts. See pt_borg/notes/borg_implementation_notes.
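// Sketch of the identities used here (up to boundary terms): both the
// Laplacian and the traceless derivative operator \Del_ij are self-adjoint,
//   \int d^3x (dlogL/drho) \nabla^2 \delta = \int d^3x \delta \nabla^2 (dlogL/drho),
//   \int d^3x (dlogL/drho) \Del_ij \delta  = \int d^3x \delta \Del_ij (dlogL/drho),
// which is why, below, the Laplacian acts on dlogL_drho directly and \Del_ij
// acts on the products dlogL_drho * K^ij.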
|
||||
|
||||
// derivative of K^2 term:
|
||||
// compute \Del_ij ( dlogL_drho K^ij )
|
||||
// - component by component and sum up
|
||||
// - notice the factor of 2 in front of off-diagonal terms
|
||||
// - overall factor of 2 from product rule added in loop below
|
||||
get_density_derivative_array3d_dtidal(
|
||||
tmp.ref, dlogL_drho.ref, arrs->tidal_01.ref, 0, 1);
|
||||
fwrap(deriv_sum.ref) = 2 * fwrap(tmp.ref);
|
||||
get_density_derivative_array3d_dtidal(
|
||||
tmp.ref, dlogL_drho.ref, arrs->tidal_02.ref, 0, 2);
|
||||
fwrap(deriv_sum.ref) = fwrap(deriv_sum.ref) + 2 * fwrap(tmp.ref);
|
||||
get_density_derivative_array3d_dtidal(
|
||||
tmp.ref, dlogL_drho.ref, arrs->tidal_12.ref, 1, 2);
|
||||
fwrap(deriv_sum.ref) = fwrap(deriv_sum.ref) + 2 * fwrap(tmp.ref);
|
||||
get_density_derivative_array3d_dtidal(
|
||||
tmp.ref, dlogL_drho.ref, arrs->tidal_00.ref, 0, 0);
|
||||
fwrap(deriv_sum.ref) = fwrap(deriv_sum.ref) + fwrap(tmp.ref);
|
||||
get_density_derivative_array3d_dtidal(
|
||||
tmp.ref, dlogL_drho.ref, arrs->tidal_11.ref, 1, 1);
|
||||
fwrap(deriv_sum.ref) = fwrap(deriv_sum.ref) + fwrap(tmp.ref);
|
||||
get_density_derivative_array3d_dtidal(
|
||||
tmp.ref, dlogL_drho.ref, arrs->tidal_22.ref, 2, 2);
|
||||
fwrap(deriv_sum.ref) = fwrap(deriv_sum.ref) + fwrap(tmp.ref);
|
||||
|
||||
// derivative of \nabla^2 \delta: take Laplacian of dlogL_drho times (-1)^2
|
||||
get_density_derivative_array3d_dlaplace(
|
||||
dlaplace_delta.ref, dlogL_drho.ref);
|
||||
|
||||
// now assemble total adjoint gradient
|
||||
double b1eff =
|
||||
(b1 - (b2 + 2. / 3. * bk) * 68. / 21. * rmsdelta_renorm *
|
||||
rmsdelta_renorm);
|
||||
ctx.print("Computing the transform.");
|
||||
# pragma omp parallel for collapse(3)
|
||||
for (size_t n0 = startN0; n0 < startN0 + localN0; n0++)
|
||||
for (size_t n1 = 0; n1 < N1; n1++)
|
||||
for (size_t n2 = 0; n2 < N2; n2++) {
|
||||
double rho = b1eff * final_density[n0][n1][n2] +
|
||||
b2 * arrs->delta_sqr.ref[n0][n1][n2] +
|
||||
bk * arrs->tidal_sqr.ref[n0][n1][n2] +
|
||||
r2 * arrs->laplace_delta.ref[n0][n1][n2];
|
||||
double dmu_drho = adjoint_thresholder(
|
||||
1 + rho, EPSILON_VOIDS, dlogL_drho.ref[n0][n1][n2]);
|
||||
|
||||
double drho_ddelta = b1 + 2. * b2 * final_density[n0][n1][n2];
|
||||
|
||||
arrs->dlogL_ddelta.ref[n0][n1][n2] =
|
||||
nmean * (dmu_drho * drho_ddelta +
|
||||
2. * bk * deriv_sum.ref[n0][n1][n2] +
|
||||
r2 * dlaplace_delta.ref[n0][n1][n2]);
|
||||
}
|
||||
|
||||
// apply sharp-k filter to dlogL_ddelta.
|
||||
myarr<U_CArray> ctmp(
|
||||
arrs->mgr.extents_complex(), arrs->mgr.allocator_complex);
|
||||
arrs->mgr.execute_r2c(
|
||||
arrs->analysis_plan, arrs->dlogL_ddelta.ref.data(),
|
||||
ctmp.ref.data());
|
||||
double fft_normalization = 1.0 / (N0 * N1 * N2);
|
||||
sharpk_filter(ctmp.ref, EFT_Lambda, fft_normalization);
|
||||
arrs->mgr.execute_c2r(
|
||||
arrs->synthesis_plan, ctmp.ref.data(),
|
||||
arrs->dlogL_ddelta.ref.data());
|
||||
|
||||
return std::make_tuple(std::ref(arrs->dlogL_ddelta.ref));
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace detail_EFTBias
|
||||
|
||||
using detail_EFTBias::EFTBias;
|
||||
|
||||
// define type names for convenience
|
||||
using EFTBiasThresh = EFTBias<true>;
|
||||
using EFTBiasDefault = EFTBias<false>;
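// Note: the boolean template parameter selects the sharp variant of the
// (currently disabled) thresholder()/adjoint_thresholder(); with the
// thresholders short-circuited above, both aliases behave identically.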
|
||||
|
||||
} // namespace bias
|
||||
|
||||
} // namespace LibLSS
|
||||
|
||||
#endif
|
||||
// ARES TAG: authors_num = 3
|
||||
// ARES TAG: name(0) = Franz Elsner
|
||||
// ARES TAG: year(0) = 2018-2019
|
||||
// ARES TAG: email(0) =
|
||||
// ARES TAG: name(1) = Fabian Schmidt
|
||||
// ARES TAG: year(1) = 2019-2021
|
||||
// ARES TAG: email(1) = fabians@mpa-garching.mpg.de
|
||||
// ARES TAG: name(2) = Martin Reinecke
|
||||
// ARES TAG: year(2) = 2019-2021
|
||||
// ARES TAG: email(2) = martin@mpa-garching.mpg.de
|
389
extra/borg/libLSS/physics/bias/eft_bias_marg.hpp
Normal file
@@ -0,0 +1,389 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/bias/eft_bias_marg.hpp
|
||||
Copyright (C) 2019-2021 Fabian Schmidt <fabians@mpa-garching.mpg.de>
|
||||
Copyright (C) 2019-2021 Martin Reinecke <martin@mpa-garching.mpg.de>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
|
||||
/*
|
||||
Key features:
|
||||
- derives from EFTBias which implements bias fields
|
||||
\delta, \delta^2, (K_ij)^2, \laplace\delta
|
||||
- all fields are returned in real space AFTER sharp-k filter (unlike EFTBias)
|
||||
- uses same namespace detail_EFTBias
|
||||
- version to be bundled with marginalized EFT likelihood; returns fields
|
||||
individually, and takes corresponding gradients in adjoint
|
||||
- although implemented differently, Gaussian priors on bias parameters
|
||||
can still be set using "bias_prior_mean", "bias_prior_sigma" (in
|
||||
this case, prior is implemented by EFTMargLikelihood)
|
||||
|
||||
This program is free software; you can redistribute it and/or modify it
|
||||
under the terms of either the CeCILL license or the GNU General Public
|
||||
license, as included with the software package.
|
||||
|
||||
The text of the license is located in Licence_CeCILL_V2.1-en.txt
|
||||
and GPL.txt in the root directory of the source package.
|
||||
|
||||
*/
|
||||
#ifndef __LIBLSS_PHYSICS_EFT_BIAS_MARG_HPP
|
||||
# define __LIBLSS_PHYSICS_EFT_BIAS_MARG_HPP
|
||||
|
||||
// This header provides the implementation of the LSS bias model to second order in PT.
|
||||
|
||||
# include <cmath>
|
||||
# include "libLSS/tools/fused_array.hpp"
|
||||
# include <tuple>
|
||||
# include "libLSS/tools/phoenix_vars.hpp"
|
||||
# include <boost/phoenix/operator.hpp>
|
||||
# include <boost/phoenix/stl/cmath.hpp>
|
||||
# include "libLSS/tools/tuple_helper.hpp"
|
||||
# include "libLSS/physics/bias/base.hpp"
|
||||
# include "libLSS/tools/fftw_allocator.hpp"
|
||||
# include "libLSS/samplers/core/powerspec_tools.hpp"
|
||||
|
||||
# include "eft_bias.hpp"
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
namespace bias {
|
||||
|
||||
namespace detail_EFTBias {
|
||||
|
||||
namespace ph = std::placeholders;
|
||||
|
||||
using DFT_Manager = FFTW_Manager_3d<double>;
|
||||
using U_Array = Uninit_FFTW_Real_Array;
|
||||
using U_CArray = Uninit_FFTW_Complex_Array;
|
||||
using U_ArrayRef = U_Array::array_type;
|
||||
using U_CArrayRef = U_CArray::array_type;
|
||||
|
||||
class EFTBiasMarg : public EFTBias<false> {
|
||||
public:
|
||||
protected:
|
||||
// EFT_kmax parameter: right now, only implemented in marginalized
|
||||
// likelihood
|
||||
double kmax;
|
||||
|
||||
// apply sharp-k cut to field: modes with k > limit are set to zero
|
||||
// - also sets to zero \vec k==0 mode -> subtract mean
|
||||
// - version that takes real-space field 'in' and returns filtered real-space field
|
||||
void
|
||||
sharpk_filter_r2r(U_ArrayRef &in, U_CArrayRef &ctmp, double limit) {
|
||||
// FFT
|
||||
arrs->mgr.execute_r2c(arrs->analysis_plan, in.data(), ctmp.data());
|
||||
|
||||
// sharp-k filter with norm
|
||||
double norm = 1.0 / (N0 * N1 * N2);
|
||||
sharpk_filter(ctmp, limit, norm);
|
||||
|
||||
// iFFT
|
||||
arrs->mgr.execute_c2r(arrs->synthesis_plan, ctmp.data(), in.data());
|
||||
}
|
||||
|
||||
public:
|
||||
// This adaptor transforms an unselected galaxy density (with appropriate
|
||||
// auxiliary arrays) to a selected array. It must be a functor accepting two
|
||||
// parameters: a selection virtual array and a bias density virtual array.
|
||||
// - SimpleAdaptor multiplies the first returned field of compute_density by the mask/selection array,
// leaving the other return values untouched
|
||||
selection::SimpleAdaptor selection_adaptor;
|
||||
|
||||
EFTBiasMarg(LikelihoodInfo const &info = LikelihoodInfo())
|
||||
: EFTBias(info) {
|
||||
// get kmax
|
||||
kmax = Likelihood::query<double>(info, "EFT_kmax");
|
||||
if (!(kmax > 0. && kmax < EFT_Lambda))
|
||||
kmax = EFT_Lambda;
|
||||
|
||||
ConsoleContext<LOG_DEBUG> ctx("EFTBiasMarg constructor");
|
||||
ctx.format("Lambda = %.3f, kmax = %.3f", EFT_Lambda, kmax);
|
||||
}
|
||||
|
||||
template <typename Array>
|
||||
inline bool check_bias_constraints(Array &¶ms) {
|
||||
// enforce sigma0 within parameter limits
|
||||
return (params[5] < sigma0max && params[5] > sigma0min);
|
||||
}
|
||||
|
||||
// fill bias fields in 'arrs' (base class) and apply sharp-k filter
|
||||
void prepare_bias_model_arrays(const U_ArrayRef &delta) {
|
||||
// base class preparation
|
||||
EFTBias::prepare_bias_model_arrays(delta);
|
||||
|
||||
// now apply sharp-k filters
|
||||
// (sharp-k filtered density already present in arrs->deltaLambda)
|
||||
myarr<U_CArray> ctmp(
|
||||
arrs->mgr.extents_complex(), arrs->mgr.allocator_complex);
|
||||
|
||||
// - also sharp-k filter delta here, since we allow for kmax != Lambda
|
||||
sharpk_filter_r2r(arrs->deltaLambda.ref, ctmp.ref, kmax);
|
||||
|
||||
sharpk_filter_r2r(arrs->delta_sqr.ref, ctmp.ref, kmax);
|
||||
sharpk_filter_r2r(arrs->tidal_sqr.ref, ctmp.ref, kmax);
|
||||
sharpk_filter_r2r(arrs->laplace_delta.ref, ctmp.ref, kmax);
|
||||
}
|
||||
|
||||
// Note: fwd_model and final_density arrays cannot be stored in this step. But
|
||||
// they can be used.
|
||||
// -- essentially copied from EFTbias (note that prepare_bias_model_arrays is overloaded)
|
||||
// -- should think about removing some of the code duplication here
|
||||
template <
|
||||
class ForwardModel, typename FinalDensityArray,
|
||||
typename BiasParameters, typename MetaSelect = NoSelector>
|
||||
inline void prepare(
|
||||
ForwardModel &fwd_model, const FinalDensityArray &final_density,
|
||||
double const _nmean, const BiasParameters ¶ms,
|
||||
bool density_changed, MetaSelect _select = MetaSelect()) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("EFTBiasMarg preparation");
|
||||
|
||||
// allocate bias fields
|
||||
if (arrs == nullptr)
|
||||
arrs.reset(new Arrs(
|
||||
*(fwd_model.lo_mgr->getComm()), *(fwd_model.lo_mgr.get())));
|
||||
|
||||
// fill variables
|
||||
N0 = arrs->mgr.N0;
|
||||
N1 = arrs->mgr.N1;
|
||||
N2 = arrs->mgr.N2;
|
||||
N2_HC = N2 / 2 + 1;
|
||||
startN0 = arrs->mgr.startN0;
|
||||
localN0 = arrs->mgr.localN0;
|
||||
L0 = fwd_model.get_box_model().L0;
|
||||
L1 = fwd_model.get_box_model().L1;
|
||||
L2 = fwd_model.get_box_model().L2;
|
||||
nmean = params[0];
|
||||
b1 = params[1];
|
||||
b2 = params[2];
|
||||
bk = params[3];
|
||||
r2 = params[4];
|
||||
sigma_0 = params[5];
|
||||
|
||||
// no need to compute prior, added in EFTLikeMarg
|
||||
|
||||
ctx.format("Got a box %dx%dx%d (%gx%gx%g)", N0, N1, N2, L0, L1, L2);
|
||||
if (density_changed) {
|
||||
// prepare density squared, Laplace delta, and tidal field squared
|
||||
// - note that these fields have nonzero mean, but this is removed in likelihood eft::sharpk_filter
|
||||
ctx.print("Prepare the arrays");
|
||||
prepare_bias_model_arrays(final_density);
|
||||
|
||||
// compute variance of delta, delta^2, K^2 for checking
|
||||
// (note this is now AFTER sharp-k cut)
|
||||
double Md = 0., Md2 = 0., MK2 = 0.;
|
||||
double Vd = 0., Vd2 = 0., VK2 = 0.;
|
||||
# pragma omp parallel for collapse(3) reduction(+ : Md, Md2, MK2, Vd, Vd2, VK2)
|
||||
for (size_t n0 = startN0; n0 < startN0 + localN0; n0++)
|
||||
for (size_t n1 = 0; n1 < N1; n1++)
|
||||
for (size_t n2 = 0; n2 < N2; n2++) {
|
||||
Md += arrs->deltaLambda.ref[n0][n1][n2];
|
||||
Md2 += arrs->delta_sqr.ref[n0][n1][n2];
|
||||
MK2 += arrs->tidal_sqr.ref[n0][n1][n2];
|
||||
Vd += pow(arrs->deltaLambda.ref[n0][n1][n2], 2.);
|
||||
Vd2 += pow(arrs->delta_sqr.ref[n0][n1][n2], 2.);
|
||||
VK2 += pow(arrs->tidal_sqr.ref[n0][n1][n2], 2.);
|
||||
}
|
||||
double Md_glob = 0., Md2_glob = 0., MK2_glob = 0.;
|
||||
arrs->comm.all_reduce_t(&Md, &Md_glob, 1, MPI_SUM);
|
||||
arrs->comm.all_reduce_t(&Md2, &Md2_glob, 1, MPI_SUM);
|
||||
arrs->comm.all_reduce_t(&MK2, &MK2_glob, 1, MPI_SUM);
|
||||
Md_glob /= double(N0 * N1 * N2);
|
||||
Md2_glob /= double(N0 * N1 * N2);
|
||||
MK2_glob /= double(N0 * N1 * N2);
|
||||
double Vd_glob = 0., Vd2_glob = 0., VK2_glob = 0.;
|
||||
arrs->comm.all_reduce_t(&Vd, &Vd_glob, 1, MPI_SUM);
|
||||
arrs->comm.all_reduce_t(&Vd2, &Vd2_glob, 1, MPI_SUM);
|
||||
arrs->comm.all_reduce_t(&VK2, &VK2_glob, 1, MPI_SUM);
|
||||
Vd_glob = Vd_glob / double(N0 * N1 * N2) - Md_glob * Md_glob;
|
||||
Vd2_glob = Vd2_glob / double(N0 * N1 * N2) - Md2_glob * Md2_glob;
|
||||
VK2_glob = VK2_glob / double(N0 * N1 * N2) - MK2_glob * MK2_glob;
|
||||
ctx.format(
|
||||
"Mean of delta (AFTER), delta^2, K^2 (AFTER Eulerian "
|
||||
"sharp-k cut): %.5e, %.5e, %.5e (Lambda = %.2e)\n",
|
||||
Md_glob, Md2_glob, MK2_glob, EFT_Lambda);
|
||||
ctx.format(
|
||||
"Variance of delta (AFTER), delta^2, K^2 (AFTER Eulerian "
|
||||
"sharp-k cut): %.5e, %.5e, %.5e (Lambda = %.2e)\n",
|
||||
Vd_glob, Vd2_glob, VK2_glob, EFT_Lambda);
|
||||
}
|
||||
|
||||
ctx.print("Done preparation");
|
||||
}
|
||||
|
||||
// This function returns an array-like array. That array
|
||||
// depends on the existence of the final density array.
|
||||
// The return type is quite complex. Let the compiler decide.
|
||||
// The return tuple contains sigma field as well as a vector with
|
||||
// bias fields tuple<0> = sigma, tuple<1> = biasfieldvector, with
|
||||
// 0: delta
|
||||
// 1: delta^2
|
||||
// 2: K^2
|
||||
// 3: lapl delta
|
||||
// 4: sigma
|
||||
// -- all in real space after sharp-k filter
|
||||
// -- worry about mask later
|
||||
// -- sigma0 is first element in tuple, as that is what selection is applied to
|
||||
template <typename FinalDensityArray>
|
||||
inline auto compute_density(const FinalDensityArray &array) const {
|
||||
std::vector<U_ArrayRef> bias;
|
||||
bias.push_back(arrs->deltaLambda.ref);
|
||||
bias.push_back(arrs->delta_sqr.ref);
|
||||
bias.push_back(arrs->tidal_sqr.ref);
|
||||
bias.push_back(arrs->laplace_delta.ref);
|
||||
|
||||
// return the tuple of bias fields as well as sigma0
|
||||
return std::make_tuple(
|
||||
*LibLSS::constant<double, 3>(
|
||||
sigma_0, arrs->mgr.extents_real_strict()),
|
||||
bias);
|
||||
}
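// A minimal usage sketch (hypothetical caller; the local names are assumed):
//   auto result = bias.compute_density(delta);
//   auto const &sigma_field = std::get<0>(result); // constant sigma_0 field
//   auto const &bias_fields = std::get<1>(result); // 0: delta, 1: delta^2, 2: K^2, 3: lapl delta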
|
||||
|
||||
// This function returns an array-like array. That array
|
||||
// depends on the existence of the final density array and the gradient likelihood array.
|
||||
// It is the job of the caller to ensure that temporary variables are not cleared
|
||||
// before the final use.
|
||||
// The return type is quite complex. Let the compiler decide.
|
||||
// L(b_0(delta, p), b_1(delta, p), ..., b_n(delta, p))
|
||||
// Now we take a tuple of gradient and collapse this to a gradient of delta.
|
||||
//
|
||||
// grad_array contains vector of gradients of the (marginalized) likelihood
|
||||
// w.r.t the bias fields, see below
|
||||
template <
|
||||
typename FinalDensityArray, typename TupleGradientLikelihoodArray>
|
||||
auto apply_adjoint_gradient(
|
||||
const FinalDensityArray &final_density,
|
||||
TupleGradientLikelihoodArray grad_array) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("EFTBiasMarg gradient computation");
|
||||
|
||||
auto &grad = std::get<1>(grad_array);
|
||||
// grad is vector which contains
|
||||
// 0: dlogL/ddelta
|
||||
// 1: dlogL/ddelta^2
|
||||
// 2: dlogL/dK^2
|
||||
// 3: dlogL/d(lapl delta)
|
||||
// -- the last two we need for integration by parts
|
||||
|
||||
ctx.print("Transfer the input gradient");
|
||||
// - copy of the first two fields only necessary if kmax != EFT_Lambda...
|
||||
auto dlogL_dd = grad[0];
|
||||
auto dlogL_ddelta2 = grad[1];
|
||||
// - actually, copies of the above two are not needed
|
||||
// myarr<U_Array> dlogL_dd(arrs->mgr.extents_real(),
|
||||
// arrs->mgr.allocator_real);
|
||||
// myarr<U_Array> dlogL_ddelta2(arrs->mgr.extents_real(),
|
||||
// arrs->mgr.allocator_real);
|
||||
// LibLSS::copy_array_rv(
|
||||
// array::slice_array((dlogL_dd.ref), arrs->mgr.strict_range()),
|
||||
// grad[0]);
|
||||
// LibLSS::copy_array_rv(
|
||||
// array::slice_array((dlogL_ddelta2.ref), arrs->mgr.strict_range()),
|
||||
// grad[1]);
|
||||
|
||||
myarr<U_Array> dlogL_dK2(
|
||||
arrs->mgr.extents_real(), arrs->mgr.allocator_real);
|
||||
myarr<U_Array> dlogL_dlapl(
|
||||
arrs->mgr.extents_real(), arrs->mgr.allocator_real);
|
||||
LibLSS::copy_array_rv(
|
||||
array::slice_array((dlogL_dK2.ref), arrs->mgr.strict_range()),
|
||||
grad[2]);
|
||||
LibLSS::copy_array_rv(
|
||||
array::slice_array((dlogL_dlapl.ref), arrs->mgr.strict_range()),
|
||||
grad[3]);
|
||||
ctx.print("Data backed up");
|
||||
|
||||
myarr<U_CArray> ctmp(
|
||||
arrs->mgr.extents_complex(), arrs->mgr.allocator_complex);
|
||||
|
||||
// // sharp-k filter gradients at kmax
|
||||
// // - formally necessary if kmax != EFT_Lambda; however, since
|
||||
// // dlogL/dO only has support for k < kmax, it is not necessary
|
||||
// // in practice
|
||||
// if (kmax != EFT_Lambda) {
|
||||
// sharpk_filter_r2r(dlogL_dd.ref, ctmp.ref, kmax);
|
||||
// sharpk_filter_r2r(dlogL_ddelta2.ref, ctmp.ref, kmax);
|
||||
// sharpk_filter_r2r(dlogL_dK2.ref, ctmp.ref, kmax);
|
||||
// sharpk_filter_r2r(dlogL_dlapl.ref, ctmp.ref, kmax);
|
||||
// }
|
||||
|
||||
myarr<U_Array> tmp(
|
||||
arrs->mgr.extents_real(), arrs->mgr.allocator_real),
|
||||
// this will contain functional derivative of K^2 term:
|
||||
dK2(arrs->mgr.extents_real(), arrs->mgr.allocator_real),
|
||||
// this will contain functional derivative of \nabla^2\delta term:
|
||||
dlaplace_delta(
|
||||
arrs->mgr.extents_real(), arrs->mgr.allocator_real);
|
||||
|
||||
// Functional derivatives of fields under derivative operators are
|
||||
// treated through integration by parts. See pt_borg/notes/borg_implementation_notes.
|
||||
|
||||
// derivative of K^2 term:
|
||||
// compute K^ij \Del_ij ( dlogL_dK2 K^ij )
|
||||
// - component by component and sum up
|
||||
// - notice the factor of 2 in front of off-diagonal terms
|
||||
get_density_derivative_array3d_dtidal(
|
||||
tmp.ref, dlogL_dK2.ref, arrs->tidal_01.ref, 0, 1);
|
||||
fwrap(dK2.ref) = 2 * fwrap(tmp.ref);
|
||||
get_density_derivative_array3d_dtidal(
|
||||
tmp.ref, dlogL_dK2.ref, arrs->tidal_02.ref, 0, 2);
|
||||
fwrap(dK2.ref) = fwrap(dK2.ref) + 2 * fwrap(tmp.ref);
|
||||
get_density_derivative_array3d_dtidal(
|
||||
tmp.ref, dlogL_dK2.ref, arrs->tidal_12.ref, 1, 2);
|
||||
fwrap(dK2.ref) = fwrap(dK2.ref) + 2 * fwrap(tmp.ref);
|
||||
get_density_derivative_array3d_dtidal(
|
||||
tmp.ref, dlogL_dK2.ref, arrs->tidal_00.ref, 0, 0);
|
||||
fwrap(dK2.ref) = fwrap(dK2.ref) + fwrap(tmp.ref);
|
||||
get_density_derivative_array3d_dtidal(
|
||||
tmp.ref, dlogL_dK2.ref, arrs->tidal_11.ref, 1, 1);
|
||||
fwrap(dK2.ref) = fwrap(dK2.ref) + fwrap(tmp.ref);
|
||||
get_density_derivative_array3d_dtidal(
|
||||
tmp.ref, dlogL_dK2.ref, arrs->tidal_22.ref, 2, 2);
|
||||
fwrap(dK2.ref) = fwrap(dK2.ref) + fwrap(tmp.ref);
|
||||
|
||||
// derivative of \nabla^2 \delta: take Laplacian of dlogL_drho times (-1)^2
|
||||
get_density_derivative_array3d_dlaplace(
|
||||
dlaplace_delta.ref, dlogL_dlapl.ref);
|
||||
|
||||
// we also need delta_Lambda (rather than deltaLambda which is filtered at k_max...)
|
||||
// - get here from final_density
|
||||
fwrap(tmp.ref) = fwrap(final_density);
|
||||
sharpk_filter_r2r(tmp.ref, ctmp.ref, EFT_Lambda);
|
||||
|
||||
// now assemble total adjoint gradient
|
||||
ctx.print("Computing the gradient.");
|
||||
# pragma omp parallel for collapse(3)
|
||||
for (size_t n0 = startN0; n0 < startN0 + localN0; n0++)
|
||||
for (size_t n1 = 0; n1 < N1; n1++)
|
||||
for (size_t n2 = 0; n2 < N2; n2++) {
|
||||
arrs->dlogL_ddelta.ref[n0][n1][n2] =
|
||||
dlogL_dd[n0][n1][n2]
|
||||
// factor of 2 for quadratic terms from product rule
|
||||
// - note that we use tmp = delta_Lambda for delta^2
|
||||
+ 2. * dlogL_ddelta2[n0][n1][n2] * tmp.ref[n0][n1][n2] +
|
||||
2. * dK2.ref[n0][n1][n2] + dlaplace_delta.ref[n0][n1][n2];
|
||||
}
|
||||
|
||||
// finally, apply sharp-k filter to dlogL_ddelta at EFT_Lambda
|
||||
sharpk_filter_r2r(arrs->dlogL_ddelta.ref, ctmp.ref, EFT_Lambda);
|
||||
|
||||
return std::make_tuple(std::ref(arrs->dlogL_ddelta.ref));
|
||||
}
|
||||
}; // class EFTBiasMarg
|
||||
|
||||
} // namespace detail_EFTBias
|
||||
|
||||
using detail_EFTBias::EFTBiasMarg;
|
||||
|
||||
} // namespace bias
|
||||
|
||||
} // namespace LibLSS
|
||||
|
||||
#endif
|
||||
// ARES TAG: authors_num = 2
|
||||
// ARES TAG: name(0) = Fabian Schmidt
|
||||
// ARES TAG: year(0) = 2019-2021
|
||||
// ARES TAG: email(0) = fabians@mpa-garching.mpg.de
|
||||
// ARES TAG: name(1) = Martin Reinecke
|
||||
// ARES TAG: year(1) = 2019-2021
|
||||
// ARES TAG: email(1) = martin@mpa-garching.mpg.de
|
426
extra/borg/libLSS/physics/bias/level_combinator.hpp
Normal file
@@ -0,0 +1,426 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/bias/level_combinator.hpp
|
||||
Copyright (C) 2019-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2019-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_LEVEL_COMBINATOR_HPP
|
||||
# define __LIBLSS_LEVEL_COMBINATOR_HPP
|
||||
|
||||
# include <cmath>
|
||||
# include <functional>
|
||||
# include <array>
|
||||
# include "libLSS/tools/fused_array.hpp"
|
||||
# include <tuple>
|
||||
# include "libLSS/tools/phoenix_vars.hpp"
|
||||
# include <boost/phoenix/operator.hpp>
|
||||
# include <boost/phoenix/stl/cmath.hpp>
|
||||
# include "libLSS/tools/tuple_helper.hpp"
|
||||
# include "libLSS/physics/bias/base.hpp"
|
||||
# include "libLSS/tools/mpi/ghost_planes.hpp"
|
||||
# include "libLSS/tools/array_tools.hpp"
|
||||
# include <boost/config.hpp>
|
||||
# include "libLSS/tools/string_tools.hpp"
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
namespace Combinator {
|
||||
static constexpr bool ULTRA_VERBOSE = false;
|
||||
|
||||
namespace ph = std::placeholders;
|
||||
|
||||
// This is a pow that works as a constant expression for all compilers.
|
||||
template <typename T>
|
||||
static inline constexpr T const_pow(T x, size_t p) {
|
||||
return p == 0 ? 1 : (x * const_pow(x, p - 1));
|
||||
}
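// e.g. const_pow(2, 3) == 8 at compile time; used below for the per-level
// resolution const_pow(2, numLevel - 1).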
|
||||
|
||||
template <typename T, size_t... N>
|
||||
struct Levels;
|
||||
|
||||
template <typename T>
|
||||
struct Levels<T> {
|
||||
static constexpr const int numParams = 1;
|
||||
static constexpr const int Nmax = 1;
|
||||
static constexpr const int numLevel = 0;
|
||||
typedef boost::multi_array_ref<T, 3> ArrayRef;
|
||||
|
||||
template <size_t maxLevel, typename SomeArray>
|
||||
inline void get_density(
|
||||
std::array<double, maxLevel> &delta_out,
|
||||
SomeArray const &density_levels, size_t i, size_t j, size_t k) {}
|
||||
|
||||
template <size_t level, typename SomeArray>
|
||||
inline double
|
||||
get_density_level(SomeArray const &, size_t i, size_t j, size_t k) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
template <size_t level, typename SomeArray>
|
||||
inline void
|
||||
push_ag_density_level(double, SomeArray &, size_t i, size_t j, size_t k) {
|
||||
}
|
||||
|
||||
void allocate(
|
||||
size_t, size_t, size_t, size_t, size_t, size_t,
|
||||
std::tuple<ssize_t, ssize_t> * = 0) {}
|
||||
void clear_cache() {}
|
||||
|
||||
template <size_t maxLevel>
|
||||
inline void push_ag_density(
|
||||
std::array<double, maxLevel> &ag_delta, ArrayRef &ag_density,
|
||||
size_t i, size_t j, size_t k) {}
|
||||
|
||||
constexpr static inline size_t getPower(size_t) { return 0; }
|
||||
|
||||
void get_requirement(std::vector<size_t> &) {}
|
||||
|
||||
void ag_buildLevels(GhostPlanes<T, 2> &ghosts, ArrayRef &ag_density) {}
|
||||
void buildLevels(
|
||||
GhostPlanes<T, 2> &ghosts,
|
||||
boost::multi_array_ref<T, 3> const &final_density) {}
|
||||
};
|
||||
|
||||
template <typename T, size_t thisN, size_t... N>
|
||||
struct Levels<T, thisN, N...> {
|
||||
typedef Levels<T, N...> PreviousLevels;
|
||||
typedef typename PreviousLevels::ArrayRef ArrayRef;
|
||||
PreviousLevels previousLevels;
|
||||
static constexpr const int Npower = thisN;
|
||||
static constexpr const int Nmax = thisN + PreviousLevels::Nmax;
|
||||
static constexpr const int numParams = Nmax * (Nmax + 1) / 2;
|
||||
static constexpr const int numLevel = PreviousLevels::numLevel + 1;
|
||||
|
||||
size_t N0, N1, N2, N2real, startN0, localN0;
|
||||
|
||||
std::vector<size_t> required_ghosts;
|
||||
boost::multi_array<T, 3> this_level_cache, ag_this_level_cache;
|
||||
|
||||
constexpr static inline size_t getPower(size_t level) {
|
||||
return (level == numLevel) ? Npower : PreviousLevels::getPower(level);
|
||||
}
|
||||
|
||||
void allocate(
|
||||
size_t N0_, size_t N1_, size_t N2_, size_t N2real_, size_t startN0_,
|
||||
size_t localN0_, std::tuple<ssize_t, ssize_t> *cache_bound = 0) {
|
||||
ConsoleContext<LOG_DEBUG> ctx(
|
||||
"allocate multi-level<" + to_string(numLevel) + ">");
|
||||
N0 = N0_;
|
||||
N1 = N1_;
|
||||
N2 = N2_;
|
||||
N2real = N2real_;
|
||||
startN0 = startN0_;
|
||||
localN0 = localN0_;
|
||||
|
||||
constexpr size_t resolution = const_pow(2, numLevel - 1);
|
||||
if (numLevel > 1) {
|
||||
ssize_t start_cache, end_cache;
|
||||
|
||||
if (cache_bound == 0) {
|
||||
start_cache = (startN0 >= resolution)
|
||||
? (startN0 - resolution + 1) / resolution
|
||||
: 0;
|
||||
end_cache = (startN0 + localN0 + resolution - 1) / resolution;
|
||||
} else {
|
||||
std::tie(start_cache, end_cache) = *cache_bound;
|
||||
}
|
||||
if (ULTRA_VERBOSE) {
|
||||
ctx.print(
|
||||
"numLevel=" + to_string(numLevel) +
|
||||
" resolution=" + to_string(resolution));
|
||||
ctx.print(
|
||||
"start_cache=" + to_string(start_cache) +
|
||||
" end_cache=" + to_string(end_cache));
|
||||
}
|
||||
auto cache_ext =
|
||||
boost::extents[boost::multi_array_types::extent_range(
|
||||
start_cache, end_cache)][N1 / resolution][N2 / resolution];
|
||||
this_level_cache.resize(cache_ext);
|
||||
ag_this_level_cache.resize(cache_ext);
|
||||
}
|
||||
previousLevels.allocate(
|
||||
N0, N1, N2, N2real, startN0, localN0,
|
||||
cache_bound == 0 ? 0 : (cache_bound + 1));
|
||||
if (numLevel > 1) {
|
||||
required_ghosts.clear();
|
||||
required_ghosts.reserve(resolution);
|
||||
size_t start_cache = this_level_cache.index_bases()[0];
|
||||
size_t end_cache = start_cache + this_level_cache.shape()[0];
|
||||
for (size_t i = start_cache; i < end_cache; i++)
|
||||
for (size_t q = 0; q < resolution; q++) {
|
||||
size_t plane = i * resolution + q;
|
||||
// We have a boundary effect here.
|
||||
if (((plane < startN0) || (plane >= (startN0 + localN0))) &&
|
||||
plane < N0)
|
||||
required_ghosts.push_back(plane);
|
||||
}
|
||||
if (ULTRA_VERBOSE)
|
||||
ctx.print(
|
||||
"Required ghost planes (N0=" + to_string(N0) +
|
||||
"): " + to_string(required_ghosts));
|
||||
}
|
||||
}
|
||||
|
||||
void ag_buildLevels(GhostPlanes<T, 2> &ghosts, ArrayRef &ag_density) {
|
||||
ConsoleContext<LOG_DEBUG> ctx(
|
||||
"adjoint gradient multi-level<" + std::to_string(numLevel) + ">");
|
||||
|
||||
// The top level has no need for that.
|
||||
if (numLevel == 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
size_t r = const_pow(2, numLevel - 1);
|
||||
double inv_r3 = 1.0 / (r * r * r);
|
||||
size_t finalN0 = startN0 + localN0;
|
||||
|
||||
// We have to update ag_density with all the cached values of the
|
||||
// sub levels
|
||||
previousLevels.ag_buildLevels(ghosts, ag_density);
|
||||
|
||||
size_t level_start = ag_this_level_cache.index_bases()[0];
|
||||
size_t level_end = level_start + ag_this_level_cache.shape()[0];
|
||||
ctx.print("Add contribution from the cache");
|
||||
for (size_t i = level_start; i < level_end; i++) {
|
||||
for (size_t j = 0; j < N1 / r; j++) {
|
||||
for (size_t k = 0; k < N2 / r; k++) {
|
||||
typename ArrayRef::element ag =
|
||||
ag_this_level_cache[i][j][k] * inv_r3;
|
||||
Console::instance().c_assert(!std::isnan(ag), "AG is Nan (0)");
|
||||
for (size_t a = 0; a < r; a++) {
|
||||
size_t n = r * i + a;
|
||||
if (n < startN0 || n >= finalN0)
|
||||
continue;
|
||||
auto out_ag = ag_density[n];
|
||||
for (size_t b = 0; b < r; b++)
|
||||
for (size_t c = 0; c < r; c++)
|
||||
out_ag[r * j + b][r * k + c] += ag;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Now we need to update the ag ghost planes to propagate
|
||||
// the ag caches.
|
||||
ctx.print("Propagate to the ghost planes");
|
||||
for (auto other_plane : required_ghosts) {
|
||||
auto this_plane = ag_this_level_cache[other_plane / r];
|
||||
auto &g_plane = ghosts.ag_getPlane(other_plane);
|
||||
|
||||
for (size_t j = 0; j < N1 / r; j++) {
|
||||
for (size_t k = 0; k < N2 / r; k++) {
|
||||
typename ArrayRef::element ag = this_plane[j][k] * inv_r3;
|
||||
// The first loop is implicit from the top-level one.
|
||||
// Explicit looping would be more complicated and involve an extra
// lookup.
|
||||
Console::instance().c_assert(!std::isnan(ag), "AG is Nan");
|
||||
for (size_t b = 0; b < r; b++)
|
||||
for (size_t c = 0; c < r; c++)
|
||||
g_plane[r * j + b][r * k + c] += ag;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
buildLevels(GhostPlanes<T, 2> &ghosts, ArrayRef const &final_density) {
|
||||
ConsoleContext<LOG_DEBUG> ctx(
|
||||
"precompute multi-level<" + std::to_string(numLevel) + ">");
|
||||
|
||||
if (numLevel == 1)
|
||||
return; // Do nothing
|
||||
|
||||
// First build the lower level (high resolution)
|
||||
previousLevels.buildLevels(ghosts, final_density);
|
||||
|
||||
// This is redundant. It is possible to use the information on previous Levels
|
||||
// to build this one.
|
||||
size_t level_start = this_level_cache.index_bases()[0];
|
||||
size_t level_final = level_start + this_level_cache.shape()[0];
|
||||
size_t r = const_pow(2, numLevel - 1);
|
||||
double inv_r3 = 1.0 / (r * r * r);
|
||||
array::fill(this_level_cache, 0);
|
||||
if (ULTRA_VERBOSE)
|
||||
ctx.print("Building cache, r = " + to_string(r));
|
||||
// We do not collapse more than 3 because of the injection operation.
|
||||
// FIXME: this code assumes that all the ranges have some divisibility properties.
// We accept this for the moment, but the future might need a more general
// scheme.
|
||||
# pragma omp parallel for collapse(3)
|
||||
for (size_t i = level_start; i < level_final; i++) {
|
||||
for (size_t j = 0; j < N1 / r; j++) {
|
||||
for (size_t k = 0; k < N2 / r; k++) {
|
||||
typename ArrayRef::element V = 0;
|
||||
for (size_t a = 0; a < r; a++) {
|
||||
size_t n = r * i + a;
|
||||
if (n < startN0 || n >= (startN0 + localN0))
|
||||
continue;
|
||||
auto D = final_density[n];
|
||||
for (size_t b = 0; b < r; b++)
|
||||
for (size_t c = 0; c < r; c++) {
|
||||
V += D[r * j + b][k * r + c];
|
||||
if (std::isnan(V)) {
|
||||
ctx.format(
|
||||
"Nan(%g) detected at %d,%d,%d", V, n, r * j + b,
|
||||
k * r + c);
|
||||
MPI_Communication::instance()->abort();
|
||||
}
|
||||
}
|
||||
}
|
||||
if (ULTRA_VERBOSE)
|
||||
ctx.print(
|
||||
"Setting i,j,k=" +
|
||||
to_string(std::array<size_t, 3>{i, j, k}) + " with " +
|
||||
to_string(V));
|
||||
this_level_cache[i][j][k] = V * inv_r3;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (ULTRA_VERBOSE)
|
||||
ctx.print(
|
||||
"Use ghosts plane \"" + LibLSS::to_string(required_ghosts) +
|
||||
"\"");
|
||||
for (auto other_plane : required_ghosts) {
|
||||
auto this_plane = this_level_cache[other_plane / r];
|
||||
auto &g_plane = ghosts.getPlane(other_plane);
|
||||
# pragma omp parallel for collapse(2)
|
||||
for (size_t j = 0; j < N1 / r; j++) {
|
||||
for (size_t k = 0; k < N2 / r; k++) {
|
||||
typename ArrayRef::element V = 0;
|
||||
for (size_t a = 0; a < r; a++)
|
||||
for (size_t b = 0; b < r; b++) {
|
||||
V += g_plane[j * r + a][k * r + b];
|
||||
if (std::isnan(V)) {
|
||||
ctx.format(
|
||||
"Nan detected at %d,%d,%d", other_plane, j * r + a,
|
||||
k * r + b);
|
||||
MPI_Communication::instance()->abort();
|
||||
}
|
||||
}
|
||||
if (ULTRA_VERBOSE)
|
||||
ctx.print(
|
||||
"Adding to i,j,k=" +
|
||||
to_string(std::array<size_t, 3>{other_plane / r, j, k}) +
|
||||
" with " + to_string(V));
|
||||
this_plane[j][k] += V * inv_r3;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void clear_cache() {
|
||||
previousLevels.clear_cache();
|
||||
array::fill(ag_this_level_cache, 0);
|
||||
}
|
||||
|
||||
void get_requirement(std::vector<size_t> &all_required_ghosts) {
|
||||
all_required_ghosts.insert(
|
||||
all_required_ghosts.end(), required_ghosts.begin(),
|
||||
required_ghosts.end());
|
||||
previousLevels.get_requirement(all_required_ghosts);
|
||||
}
|
||||
|
||||
void setup(GhostPlanes<T, 2> &ghosts, MPI_Communication *comm) {
|
||||
std::vector<size_t> local_planes, all_required_planes;
|
||||
for (size_t j = startN0; j < startN0 + localN0; j++)
|
||||
local_planes.push_back(j);
|
||||
|
||||
get_requirement(all_required_planes);
|
||||
|
||||
ghosts.setup(
|
||||
comm, all_required_planes, local_planes,
|
||||
std::array<size_t, 2>{N1, N2real}, N0);
|
||||
}
|
||||
|
||||
template <size_t maxLevel>
|
||||
inline void push_ag_density(
|
||||
std::array<double, maxLevel> &ag_delta, ArrayRef &ag_density,
|
||||
size_t i, size_t j, size_t k) {
|
||||
previousLevels.push_ag_density(ag_delta, ag_density, i, j, k);
|
||||
|
||||
if (numLevel == 1) {
|
||||
// Only the top level updates the ag directly.
|
||||
ag_density[i][j][k] += ag_delta[0];
|
||||
} else {
|
||||
auto reduction = const_pow(2, numLevel - 1);
|
||||
size_t i_l = i / reduction;
|
||||
size_t j_l = j / reduction;
|
||||
size_t k_l = k / reduction;
|
||||
|
||||
// All other levels go through the cache to update the ag later.
|
||||
ag_this_level_cache[i_l][j_l][k_l] += ag_delta[numLevel - 1];
|
||||
}
|
||||
}
|
||||
|
||||
template <size_t level>
|
||||
inline void push_ag_density_level(
|
||||
double ag_delta, ArrayRef &ag_density, size_t i, size_t j, size_t k) {
|
||||
if (level == numLevel - 1) {
|
||||
if (level == 0)
|
||||
ag_density[i][j][k] += ag_delta;
|
||||
else
|
||||
ag_this_level_cache[i][j][k] += ag_delta;
|
||||
} else
|
||||
previousLevels.template push_ag_density_level<level>(
|
||||
ag_delta, ag_density, i, j, k);
|
||||
}
|
||||
|
||||
template <size_t level>
|
||||
inline double get_density_level(
|
||||
ArrayRef const &final_density, size_t i, size_t j, size_t k) {
|
||||
if (level == numLevel - 1) {
|
||||
if (level == 0)
|
||||
return final_density[i][j][k];
|
||||
else
|
||||
return this_level_cache[i][j][k];
|
||||
} else {
|
||||
return previousLevels.template get_density_level<level>(
|
||||
final_density, i, j, k);
|
||||
}
|
||||
}
|
||||
|
||||
template <size_t maxLevel>
|
||||
inline void get_density(
|
||||
std::array<double, maxLevel> &delta_out,
|
||||
ArrayRef const &final_density, size_t i, size_t j, size_t k) {
|
||||
previousLevels.get_density(delta_out, final_density, i, j, k);
|
||||
|
||||
if (numLevel == 1) {
|
||||
delta_out[0] = final_density[i][j][k];
|
||||
} else {
|
||||
auto reduction = const_pow(2, numLevel - 1);
|
||||
size_t i_l = i / reduction;
|
||||
size_t j_l = j / reduction;
|
||||
size_t k_l = k / reduction;
|
||||
|
||||
delta_out[numLevel - 1] = this_level_cache[i_l][j_l][k_l];
|
||||
}
|
||||
}
|
||||
|
||||
inline size_t minPlaneRequest() {
|
||||
size_t reduction = const_pow(2, numLevel - 1);
|
||||
|
||||
return reduction * (startN0 / reduction);
|
||||
}
|
||||
|
||||
inline size_t maxPlaneRequest() {
|
||||
size_t reduction = const_pow(2, numLevel - 1);
|
||||
|
||||
return reduction * ((startN0 + localN0) / reduction);
|
||||
}
|
||||
};
|
||||
} // namespace Combinator
|
||||
} // namespace LibLSS
|
||||
|
||||
#endif
|
||||
// ARES TAG: authors_num = 2
|
||||
// ARES TAG: name(0) = Guilhem Lavaux
|
||||
// ARES TAG: year(0) = 2019-2020
|
||||
// ARES TAG: email(0) = guilhem.lavaux@iap.fr
|
||||
// ARES TAG: name(1) = Jens Jasche
|
||||
// ARES TAG: year(1) = 2019-2020
|
||||
// ARES TAG: email(1) = jens.jasche@fysik.su.se
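
The Levels combinator implemented above caches, for every level, block averages of the density over r = 2^(level-1) cells per axis, and the adjoint (ag) pass spreads each cached gradient value back over the same r^3 fine cells with the identical 1/r^3 weight. Below is a minimal standalone sketch of that forward/adjoint pair on a flat-indexed cubic grid; the names coarsen and adjoint_spread are illustrative and not part of the library.

// Sketch only: block averaging (forward) and its adjoint (backward) for one
// level, mirroring buildLevels / ag_buildLevels above on a serial grid.
#include <cassert>
#include <cstddef>
#include <vector>

// Average every block of r*r*r fine cells into one coarse cell.
static std::vector<double>
coarsen(const std::vector<double> &fine, std::size_t N, std::size_t r) {
  assert(N % r == 0);
  const std::size_t M = N / r;
  const double inv_r3 = 1.0 / double(r * r * r);
  std::vector<double> coarse(M * M * M, 0.0);
  for (std::size_t i = 0; i < N; i++)
    for (std::size_t j = 0; j < N; j++)
      for (std::size_t k = 0; k < N; k++)
        coarse[(i / r) * M * M + (j / r) * M + (k / r)] +=
            fine[(i * N + j) * N + k] * inv_r3;
  return coarse;
}

// Adjoint: spread each coarse gradient value back over its r*r*r fine cells
// with the same 1/r^3 weight.
static void adjoint_spread(
    const std::vector<double> &coarse_ag, std::vector<double> &fine_ag,
    std::size_t N, std::size_t r) {
  const std::size_t M = N / r;
  const double inv_r3 = 1.0 / double(r * r * r);
  for (std::size_t i = 0; i < N; i++)
    for (std::size_t j = 0; j < N; j++)
      for (std::size_t k = 0; k < N; k++)
        fine_ag[(i * N + j) * N + k] +=
            coarse_ag[(i / r) * M * M + (j / r) * M + (k / r)] * inv_r3;
}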
124
extra/borg/libLSS/physics/bias/linear_bias.hpp
Normal file
@@ -0,0 +1,124 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/bias/linear_bias.hpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_PHYSICS_LINEAR_BIAS_HPP
|
||||
#define __LIBLSS_PHYSICS_LINEAR_BIAS_HPP
|
||||
|
||||
#include <cmath>
|
||||
#include <functional>
|
||||
#include "libLSS/tools/fused_array.hpp"
|
||||
#include <tuple>
|
||||
#include "libLSS/tools/phoenix_vars.hpp"
|
||||
#include <boost/phoenix/operator.hpp>
|
||||
#include <boost/phoenix/stl/cmath.hpp>
|
||||
#include "libLSS/tools/tuple_helper.hpp"
|
||||
#include "libLSS/physics/bias/base.hpp"
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
namespace bias {
|
||||
|
||||
namespace detail_linear_bias {
|
||||
namespace ph = std::placeholders;
|
||||
|
||||
struct LinearBias {
|
||||
|
||||
static constexpr const bool NmeanIsBias = true;
|
||||
static constexpr int numParams = 2;
|
||||
|
||||
// This adaptor transforms an unselected galaxy density (with appropriate
|
||||
// auxiliary arrays) to a selected array. It must be a functor accepting two
|
||||
// parameters: a selection virtual array and a bias density virtual array.
|
||||
selection::SimpleAdaptor selection_adaptor;
|
||||
|
||||
double nmean, bias;
|
||||
|
||||
LinearBias(LikelihoodInfo const& = LikelihoodInfo()) {}
|
||||
|
||||
template <typename BiasParameters>
|
||||
static inline void setup_default(BiasParameters ¶ms) {
|
||||
params[0] = 1;
|
||||
params[1] = 1;
|
||||
}
|
||||
|
||||
// Nota: fwd_model and final_density arrays cannot be stored in this step. But
|
||||
// they can be used.
|
||||
template <
|
||||
class ForwardModel, typename FinalDensityArray,
|
||||
typename BiasParameters, typename MetaSelect = NoSelector>
|
||||
inline void prepare(
|
||||
ForwardModel &fwd_model, const FinalDensityArray &final_density,
|
||||
double const _nmean, const BiasParameters ¶ms,
|
||||
bool density_updated, MetaSelect _select = MetaSelect()) {
|
||||
nmean = params[0];
|
||||
bias = params[1];
|
||||
}
|
||||
|
||||
inline void cleanup() {
|
||||
// No need for cleanup
|
||||
}
|
||||
|
||||
static inline double
|
||||
gradient_density_lambda(double nmean, double bias, double g) {
|
||||
return nmean * bias * g;
|
||||
}
|
||||
|
||||
static inline double
|
||||
density_lambda(double nmean, double bias, double v) {
|
||||
return nmean * (1 + bias * v);
|
||||
}
|
||||
|
||||
inline double get_linear_bias() const { return bias; }
|
||||
|
||||
template <typename Array>
|
||||
static inline bool check_bias_constraints(Array &&a) {
|
||||
return a[0] > 0 and a[1] > 0;
|
||||
}
|
||||
|
||||
// This function returns an array-like object. That array
// depends on the existence of the final density array.
// The return type is quite complex. Let the compiler decide.
// C++11 does not allow automatic return type deduction. C++14 would be
// needed for that. So we would have to rely on an auxiliary function that
// allows for a compact decltype to be written.
|
||||
template <typename FinalDensityArray>
|
||||
inline auto compute_density(const FinalDensityArray &array) {
|
||||
return std::make_tuple(b_va_fused<double>(
|
||||
std::bind(density_lambda, nmean, bias, ph::_1), array));
|
||||
}
|
||||
|
||||
// This function returns an array-like object. That array
// depends on the existence of the final density array and the gradient likelihood array.
// It is the job of the caller to ensure that temporary variables are not cleared
// before the final use.
// The return type is quite complex. Let the compiler decide.
// L(b_0(delta, p), b_1(delta, p), ..., b_n(delta, p))
// Now we take a tuple of gradients and collapse it to a gradient of delta.
|
||||
template <
|
||||
typename FinalDensityArray, typename TupleGradientLikelihoodArray>
|
||||
inline auto apply_adjoint_gradient(
|
||||
const FinalDensityArray &array,
|
||||
TupleGradientLikelihoodArray grad_array) {
|
||||
BOOST_STATIC_ASSERT(
|
||||
(std::tuple_size<TupleGradientLikelihoodArray>::value == 1));
|
||||
return std::make_tuple(b_va_fused<double>(
|
||||
std::bind(gradient_density_lambda, nmean, bias, ph::_1),
|
||||
std::move(std::get<0>(grad_array))));
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace detail_linear_bias
|
||||
|
||||
using detail_linear_bias::LinearBias;
|
||||
|
||||
} // namespace bias
|
||||
|
||||
} // namespace LibLSS
|
||||
|
||||
#endif
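
LinearBias above maps the matter density contrast delta to nmean * (1 + bias * delta), so the adjoint gradient is simply nmean * bias times the incoming likelihood gradient. Below is a minimal standalone check of that pair against a central finite difference; the helper names are illustrative and not taken from the library.

// Sketch only: the relation used by density_lambda / gradient_density_lambda.
#include <cassert>
#include <cmath>

static double linear_bias_density(double nmean, double bias, double delta) {
  return nmean * (1.0 + bias * delta);
}

static double linear_bias_gradient(double nmean, double bias) {
  return nmean * bias; // d(density)/d(delta), independent of delta
}

int main() {
  const double nmean = 2.0, bias = 1.5, delta = 0.3, h = 1e-6;
  const double fd = (linear_bias_density(nmean, bias, delta + h) -
                     linear_bias_density(nmean, bias, delta - h)) /
                    (2 * h);
  assert(std::abs(fd - linear_bias_gradient(nmean, bias)) < 1e-6);
  return 0;
}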
391
extra/borg/libLSS/physics/bias/many_power.hpp
Normal file
@@ -0,0 +1,391 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/bias/many_power.hpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_PHYSICS_MANY_POWER_HPP
|
||||
#define __LIBLSS_PHYSICS_MANY_POWER_HPP
|
||||
|
||||
// This header provides the implementation of the multi-level "many power" bias model.
// A generic bias model must implement the following concept:
// a "selection_adaptor" functor, available from the object
// a "compute_density(final_density)" function accepting a virtual array and returning a tuple of virtual arrays.
// an "apply_adjoint_gradient(final_density, gradient_likelihood_array)" also accepting virtual arrays and returning a virtual array
// a "prepare" function which is called before the other two calls, to allow for precomputing auxiliary complex fields.
// a "cleanup" function which allows for tearing down any auxiliary fields and memory allocated in the prepare phase.
|
||||
|
||||
#include <cmath>
|
||||
#include <functional>
|
||||
#include <array>
|
||||
#include "libLSS/tools/fused_array.hpp"
|
||||
#include <tuple>
|
||||
#include "libLSS/tools/phoenix_vars.hpp"
|
||||
#include <boost/phoenix/operator.hpp>
|
||||
#include <boost/phoenix/stl/cmath.hpp>
|
||||
#include "libLSS/tools/tuple_helper.hpp"
|
||||
#include "libLSS/physics/bias/base.hpp"
|
||||
#include "libLSS/tools/mpi/ghost_planes.hpp"
|
||||
#include "libLSS/tools/array_tools.hpp"
|
||||
#include <boost/config.hpp>
|
||||
#include "libLSS/tools/string_tools.hpp"
|
||||
#include "libLSS/physics/bias/level_combinator.hpp"
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
namespace bias {
|
||||
|
||||
namespace detail_manypower {
|
||||
|
||||
using namespace LibLSS::Combinator;
|
||||
|
||||
template <typename LevelCombinator = Levels<double, 0>>
|
||||
struct ManyPower {
|
||||
|
||||
static constexpr const bool NmeanIsBias = true;
|
||||
typedef ManyPower<LevelCombinator> Self;
|
||||
typedef LevelCombinator Levels;
|
||||
static constexpr const int numParams = LevelCombinator::numParams;
|
||||
static constexpr const int Nmax = LevelCombinator::Nmax;
|
||||
|
||||
size_t startN0, localN0;
|
||||
|
||||
// This adaptor transforms an unselected galaxy density (with appropriate
|
||||
// auxiliary arrays) to a selected array. It must be a functor accepting two
|
||||
// parameters: a selection virtual array and a bias density virtual array.
|
||||
selection::SimpleAdaptor selection_adaptor;
|
||||
|
||||
typedef boost::multi_array<double, 2> Matrix;
|
||||
Matrix A;
|
||||
|
||||
LevelCombinator combinator;
|
||||
double nmean;
|
||||
GhostPlanes<double, 2> ghosts;
|
||||
bool needInit;
|
||||
size_t N0, N1, N2;
|
||||
double prior_width;
|
||||
|
||||
typedef UninitializedArray<boost::multi_array<double, 3>> U_Density_t;
|
||||
std::shared_ptr<U_Density_t> ag_density, full_density;
|
||||
|
||||
ManyPower(LikelihoodInfo const &info = LikelihoodInfo()) {
|
||||
needInit = true;
|
||||
prior_width =
|
||||
Likelihood::query<double>(info, "ManyPower_prior_width");
|
||||
Console::instance().format<LOG_INFO>(
|
||||
"ManyPower running with prior_width=%g", prior_width);
|
||||
}
|
||||
|
||||
template <typename BiasParameters>
|
||||
static inline void setup_default(BiasParameters ¶ms) {
|
||||
array::fill(params, 0);
|
||||
for (uint32_t i = 0; i < Nmax; i++) {
|
||||
uint32_t q = i * (i + 1) / 2;
|
||||
Console::instance().c_assert(q < params.size(), "Problem!");
|
||||
params[q] = 1;
|
||||
}
|
||||
params[0] = 120;
|
||||
Console::instance().print<LOG_DEBUG>(
|
||||
"setup_default Nparams=" + to_string(numParams) +
|
||||
" params=" + to_string(params));
|
||||
}
|
||||
|
||||
// Nota: fwd_model and final_density arrays cannot be stored in this step. But
|
||||
// they can be used.
|
||||
template <
|
||||
class ForwardModel, typename FinalDensityArray,
|
||||
typename BiasParameters, typename MetaSelect = NoSelector>
|
||||
inline void prepare(
|
||||
ForwardModel &fwd_model, const FinalDensityArray &final_density,
|
||||
double const _nmean, const BiasParameters ¶ms,
|
||||
bool density_updated, MetaSelect _select = MetaSelect()) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("prepare manypower");
|
||||
auto &mgr = fwd_model.out_mgr;
|
||||
if (needInit) {
|
||||
auto box = fwd_model.get_box_model_output();
|
||||
|
||||
Console::instance().c_assert(
|
||||
params.size() >= numParams,
|
||||
"Invalid number of bias parameters (" +
|
||||
std::to_string(params.size()) + "<" +
|
||||
std::to_string(numParams) + ")");
|
||||
startN0 = mgr->startN0;
|
||||
localN0 = mgr->localN0;
|
||||
|
||||
N0 = box.N0;
|
||||
N1 = box.N1;
|
||||
N2 = box.N2;
|
||||
|
||||
// First allocate all required caches and find out
|
||||
// which planes are required here.
|
||||
combinator.allocate(N0, N1, N2, mgr->N2real, startN0, localN0);
|
||||
// Setup the ghost plane manager of our requirements.
|
||||
combinator.setup(ghosts, fwd_model.communicator());
|
||||
|
||||
ag_density = std::make_shared<U_Density_t>(mgr->extents_real());
|
||||
full_density = std::make_shared<U_Density_t>(mgr->extents_real());
|
||||
|
||||
needInit = false;
|
||||
}
|
||||
|
||||
A.resize(boost::extents[Nmax][Nmax]);
|
||||
|
||||
// Build A = L L^T, with L the lower-triangular matrix packed in the parameters.
|
||||
std::array<double, numParams> mod_params;
|
||||
Console::instance().c_assert(
|
||||
params.size() >= mod_params.size(),
|
||||
"Input params does not have expected size");
|
||||
{
|
||||
auto iter = params.begin();
|
||||
auto iter_out = mod_params.begin();
|
||||
for (int i = 0; i < numParams; i++) {
|
||||
*iter_out = *iter;
|
||||
++iter;
|
||||
++iter_out;
|
||||
}
|
||||
}
|
||||
nmean = mod_params[0];
|
||||
mod_params[0] = 1;
|
||||
|
||||
for (uint32_t i = 0; i < Nmax; i++) {
|
||||
for (uint32_t j = 0; j < Nmax; j++) {
|
||||
A[i][j] = 0;
|
||||
for (uint32_t k = 0; k <= std::min(i, j); k++)
|
||||
A[i][j] += mod_params[i * (i + 1) / 2 + k] *
|
||||
mod_params[j * (j + 1) / 2 + k];
|
||||
// We ignore _nmean
|
||||
}
|
||||
ctx.print("A[" + to_string(i) + "]=" + to_string(A[i]));
|
||||
}
|
||||
|
||||
if (density_updated) {
|
||||
ctx.print("Density updated. Resynchronize.");
|
||||
auto ff = full_density->get_array()[mgr->strict_range()];
|
||||
auto s = ff.shape();
|
||||
auto ib = ff.index_bases();
|
||||
auto s2 = final_density.shape();
|
||||
auto ib2 = final_density.index_bases();
|
||||
ctx.print(
|
||||
boost::format(" fd: %dx%dx%d [%d,%d,%d] ") % s[0] % s[1] %
|
||||
s[2] % ib[0] % ib[1] % ib[2]);
|
||||
ctx.print(
|
||||
boost::format(" id: %dx%dx%d [%d,%d,%d] ") % s2[0] % s2[1] %
|
||||
s2[2] % ib2[0] % ib2[1] % ib2[2]);
|
||||
LibLSS::copy_array_rv(
|
||||
full_density->get_array()[mgr->strict_range()],
|
||||
final_density[mgr->strict_range()]);
|
||||
// Produce the multi-level representation
|
||||
// First synchronize data.
|
||||
ghosts.synchronize(full_density->get_array());
|
||||
// Now build the different levels from the planes.
|
||||
combinator.buildLevels(ghosts, final_density);
|
||||
}
|
||||
}
|
||||
|
||||
inline void cleanup() {
|
||||
Console::instance().print<LOG_DEBUG>("Cleanup ManyPower");
|
||||
// No need for cleanup
|
||||
}
|
||||
|
||||
template <typename BiasParameters>
|
||||
inline double log_prior_params(const BiasParameters ¶ms) const {
|
||||
double u = 0; // we skip the first one which should be uniform
|
||||
for (size_t i = 1; i < numParams; i++) {
|
||||
u += params[i] * params[i];
|
||||
}
|
||||
return -0.5 * u / prior_width;
|
||||
}
|
||||
|
||||
template <typename AGArrayType, typename ArrayType>
|
||||
inline void gradient_density_lambda(
|
||||
ArrayType const &final_density, AGArrayType const &ag_likelihood) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("many power gradient_density_lambda");
|
||||
std::array<double, LevelCombinator::Nmax> Delta, Delta_prime;
|
||||
std::array<double, LevelCombinator::numLevel> DensityLevel;
|
||||
size_t finalN0 = startN0 + localN0;
|
||||
auto &ag_array = ag_density->get_array();
|
||||
|
||||
ghosts.clear_ghosts();
|
||||
combinator.clear_cache();
|
||||
array::fill(ag_array, 0);
|
||||
ctx.print("Building separate ag components");
|
||||
for (size_t i = startN0; i < finalN0; i++) {
|
||||
for (size_t j = 0; j < N1; j++) {
|
||||
for (size_t k = 0; k < N2; k++) {
|
||||
size_t idx;
|
||||
double const ag = ag_likelihood[i][j][k] * nmean;
|
||||
|
||||
// Again the forward pass. We need the density at all levels.
|
||||
combinator.get_density(DensityLevel, final_density, i, j, k);
|
||||
|
||||
// However, we only need the gradient of the powers of these densities.
// Compute the gradient of the density operators.
|
||||
std::array<size_t, LevelCombinator::Nmax> to_level;
|
||||
std::array<double, LevelCombinator::numLevel> ret;
|
||||
Delta_prime[0] = 0;
|
||||
Delta[0] = 1;
|
||||
to_level[0] = -1;
|
||||
idx = 1;
|
||||
|
||||
for (size_t level = 0; level < LevelCombinator::numLevel;
|
||||
level++) {
|
||||
double const delta_level = DensityLevel[level];
|
||||
ret[level] = 0;
|
||||
if (LevelCombinator::getPower(level + 1) < 1)
|
||||
continue;
|
||||
Delta_prime[idx] = ag;
|
||||
Delta[idx] = delta_level;
|
||||
to_level[idx] = level;
|
||||
idx++;
|
||||
for (uint32_t i = 1; i < LevelCombinator::getPower(level + 1);
|
||||
i++, idx++) {
|
||||
Delta[idx] = Delta[idx - 1] *
|
||||
delta_level; // delta^(i+1) = delta^i * delta
|
||||
Delta_prime[idx] =
|
||||
(i + 1) * Delta[idx - 1] * ag; // (i+1) delta^i * AG
|
||||
to_level[idx] =
level; // transform the index to an averaging level
|
||||
}
|
||||
}
|
||||
auto &cons = Console::instance();
|
||||
//cons.print<LOG_DEBUG>("Delta = " + to_string(Delta));
|
||||
//cons.print<LOG_DEBUG>("DeltaPrime = " + to_string(Delta_prime));
|
||||
//cons.print<LOG_DEBUG>("to_level = " + to_string(to_level));
|
||||
cons.c_assert(idx == LevelCombinator::Nmax, "Internal error");
|
||||
|
||||
// Now we need to compute the real AG. The first pass gets
// the ag for each level.
// Index 0 points to the constant part, which has
// no equivalent level. We skip it.
|
||||
for (uint32_t q = 1; q < LevelCombinator::Nmax; q++) {
|
||||
size_t l_q = to_level[q];
|
||||
for (uint32_t p = 1; p < q; p++) {
|
||||
ret[l_q] += 2 * Delta_prime[q] * Delta[p] * A[q][p];
|
||||
ret[to_level[p]] += 2 * Delta_prime[p] * Delta[q] * A[q][p];
|
||||
}
|
||||
ret[l_q] += Delta_prime[q] *
|
||||
(2 * Delta[0] * A[q][0] + 2 * Delta[q] * A[q][q]);
|
||||
}
|
||||
// Now recombine all levels to give the final AG at voxel i,j,k.
|
||||
combinator.push_ag_density(ret, ag_array, i, j, k);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ctx.print("Build ag levels");
|
||||
combinator.ag_buildLevels(ghosts, ag_array);
|
||||
|
||||
ctx.print("Do ag synchronization");
|
||||
// Now we do global communication to reduce all planes
|
||||
ghosts.synchronize_ag(ag_array);
|
||||
}
|
||||
|
||||
template <typename ArrayType>
|
||||
inline double density_lambda(
|
||||
ArrayType const &final_density, size_t i, size_t j, size_t k) {
|
||||
std::array<double, LevelCombinator::Nmax> Delta;
|
||||
std::array<double, LevelCombinator::numLevel> DensityLevel;
|
||||
size_t idx = 0;
|
||||
|
||||
// If we touch the padding region, bail out.
if (k >= N2)
return 0;
|
||||
|
||||
// Forward collapse
|
||||
// DensityLevel receives all the density levels corresponding to
|
||||
// the voxel i,j,k
|
||||
// DensityLevel[0] = final_density[i][j][k];
|
||||
combinator.get_density(DensityLevel, final_density, i, j, k);
|
||||
|
||||
// Now build the vector. First is just a constant.
|
||||
Delta[0] = 1;
|
||||
idx = 1;
|
||||
for (size_t level = 0; level < LevelCombinator::numLevel; level++) {
|
||||
// Each level starts with just the initial value.
|
||||
if (LevelCombinator::getPower(level + 1) >= 1)
|
||||
Delta[idx++] = DensityLevel[level];
|
||||
for (uint32_t i = 1; i < LevelCombinator::getPower(level + 1);
|
||||
i++, idx++) {
|
||||
// Then we add power of that value.
|
||||
Delta[idx] = Delta[idx - 1] * DensityLevel[level];
|
||||
}
|
||||
}
|
||||
Console::instance().c_assert(
|
||||
idx == LevelCombinator::Nmax, "Internal error");
|
||||
// Now we have built the vector over which we need the quadratic form.

// Evaluate the quadratic form defined by the matrix A.
|
||||
double ret = 0;
|
||||
for (uint32_t i = 0; i < Nmax; i++) {
|
||||
for (uint32_t j = 0; j < i; j++) {
|
||||
ret += 2 * Delta[i] * Delta[j] * A[i][j];
|
||||
}
|
||||
ret += Delta[i] * Delta[i] * A[i][i];
|
||||
}
|
||||
Console::instance().c_assert(!std::isnan(ret), "NaN in density");
|
||||
Console::instance().c_assert(!std::isinf(ret), "Inf in density");
|
||||
Console::instance().c_assert(!std::isnan(nmean), "NaN in nmean");
|
||||
return ret * nmean;
|
||||
}
|
||||
|
||||
inline double get_linear_bias() const { return 1; }
|
||||
|
||||
template <typename Array>
|
||||
static inline bool check_bias_constraints(Array &&a) {
|
||||
bool condition = a[1] > 0; // Ensure linear bias positive
|
||||
Console::instance().c_assert(a.size() <= numParams, "Invalid number of parameters");
|
||||
// for (int j = 0; j < Nmax; j++)
|
||||
// condition = condition && (a[j * (j + 3) / 2] > 0);
|
||||
return condition;
|
||||
}
|
||||
|
||||
// This function returns a tuple of array-like arrays.
// In practice, for ManyPower we compute the content into a temporary array, as there were
// some padding/boundary issues in the past.
|
||||
template <typename FinalDensityArray>
|
||||
inline auto compute_density(const FinalDensityArray &a) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("many_power compute_density");
|
||||
LibLSS::copy_array(
|
||||
full_density->get_array(),
|
||||
b_fused_idx<double, 3>(std::bind(
|
||||
&Self::density_lambda<FinalDensityArray>, this, std::cref(a),
|
||||
ph::_1, ph::_2, ph::_3)));
|
||||
return std::make_tuple(std::cref(full_density->get_array()));
|
||||
}
|
||||
|
||||
// This function returns a tuple of array-like objects. Those arrays
// depend on the existence of the final density array and the gradient likelihood array.
// It is the job of the caller to ensure that temporary variables are not cleared
// before the final use.
// The return type is quite complex. Let the compiler decide.
// L(b_0(delta, p), b_1(delta, p), ..., b_n(delta, p))
// Now we take a tuple of gradients and collapse it to a gradient of delta.
|
||||
template <
|
||||
typename FinalDensityArray, typename TupleGradientLikelihoodArray>
|
||||
inline auto apply_adjoint_gradient(
|
||||
const FinalDensityArray &array,
|
||||
TupleGradientLikelihoodArray grad_array) {
|
||||
BOOST_STATIC_ASSERT(
|
||||
(std::tuple_size<TupleGradientLikelihoodArray>::value == 1));
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
|
||||
gradient_density_lambda(array, std::get<0>(grad_array));
|
||||
|
||||
return std::make_tuple(std::cref(ag_density->get_array()));
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace detail_manypower
|
||||
|
||||
using detail_manypower::ManyPower;
|
||||
template <typename T, size_t... N>
|
||||
using ManyPowerLevels = detail_manypower::Levels<T, N...>;
|
||||
} // namespace bias
|
||||
|
||||
} // namespace LibLSS
|
||||
|
||||
#endif
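
In ManyPower::prepare above, the bias parameters are read as a flattened lower-triangular matrix L, with L[i][k] stored at index i*(i+1)/2 + k, and the quadratic-form matrix is assembled as A = L L^T; this makes the form Delta^T A Delta evaluated in density_lambda non-negative. Below is a small standalone sketch of that construction, assuming the parameter vector holds at least Nmax*(Nmax+1)/2 entries; the names are illustrative and not part of the library.

// Sketch only: A = L * L^T from a flattened lower-triangular parameter vector.
#include <algorithm>
#include <cstddef>
#include <vector>

static std::vector<std::vector<double>>
build_quadratic_form(const std::vector<double> &params, std::size_t Nmax) {
  // params[i * (i + 1) / 2 + k] plays the role of L[i][k] for k <= i.
  std::vector<std::vector<double>> A(Nmax, std::vector<double>(Nmax, 0.0));
  for (std::size_t i = 0; i < Nmax; i++)
    for (std::size_t j = 0; j < Nmax; j++)
      for (std::size_t k = 0; k <= std::min(i, j); k++)
        A[i][j] += params[i * (i + 1) / 2 + k] * params[j * (j + 1) / 2 + k];
  return A;
}

// The biased density at a voxel is then nmean * Delta^T A Delta, where Delta
// collects powers of the density averaged at the different levels.
static double evaluate_quadratic_form(
    const std::vector<std::vector<double>> &A, const std::vector<double> &Delta) {
  double ret = 0;
  for (std::size_t i = 0; i < A.size(); i++)
    for (std::size_t j = 0; j < A.size(); j++)
      ret += Delta[i] * A[i][j] * Delta[j];
  return ret;
}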
158
extra/borg/libLSS/physics/bias/noop.hpp
Normal file
@@ -0,0 +1,158 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/bias/noop.hpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_PHYSICS_NOOP_HPP
|
||||
#define __LIBLSS_PHYSICS_NOOP_HPP
|
||||
|
||||
/**
|
||||
* This header provides the implementation of a no-op bias model.
* A generic bias model must implement the following concept:
* a "selection_adaptor" functor, available from the object
* a "compute_density(final_density)" function accepting a virtual array and returning a tuple of virtual arrays.
* an "apply_adjoint_gradient(final_density, gradient_likelihood_array)" also accepting virtual arrays and returning a virtual array
* a "prepare" function which is called before the other two calls, to allow for precomputing auxiliary complex fields.
* a "cleanup" function which allows for tearing down any auxiliary fields and memory allocated in the prepare phase.
*/
|
||||
#include <cmath>
|
||||
#include <functional>
|
||||
#include "libLSS/tools/fused_array.hpp"
|
||||
#include <tuple>
|
||||
#include "libLSS/tools/phoenix_vars.hpp"
|
||||
#include <boost/phoenix/operator.hpp>
|
||||
#include <boost/phoenix/stl/cmath.hpp>
|
||||
#include "libLSS/tools/tuple_helper.hpp"
|
||||
#include "libLSS/physics/bias/base.hpp"
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
namespace bias {
|
||||
|
||||
namespace detail_noop {
|
||||
namespace ph = std::placeholders;
|
||||
|
||||
/**
|
||||
* Noop bias
|
||||
*/
|
||||
struct Noop {
|
||||
|
||||
static constexpr const bool NmeanIsBias = true;
|
||||
static constexpr const int numParams = 1;
|
||||
|
||||
// This adaptor transforms an unselected galaxy density (with appropriate
|
||||
// auxiliary arrays) to a selected array. It must be a functor accepting two
|
||||
// parameters: a selection virtual array and a bias density virtual array.
|
||||
selection::SimpleAdaptor selection_adaptor;
|
||||
|
||||
double nmean;
|
||||
|
||||
Noop(LikelihoodInfo const& = LikelihoodInfo()) {}
|
||||
|
||||
template <typename BiasParameters>
|
||||
static inline void setup_default(BiasParameters ¶ms) {}
|
||||
|
||||
// Nota: fwd_model and final_density arrays cannot be stored in this step. But
|
||||
// they can be used.
|
||||
template <
|
||||
class ForwardModel, typename FinalDensityArray,
|
||||
typename BiasParameters, typename MetaSelect = NoSelector>
|
||||
inline void prepare(
|
||||
ForwardModel &fwd_model, const FinalDensityArray &final_density,
|
||||
double const _nmean, const BiasParameters ¶ms,
|
||||
bool density_updated, MetaSelect _select = MetaSelect()) {
|
||||
//nmean = _nmean;
|
||||
nmean = params[0];
|
||||
}
|
||||
|
||||
inline void cleanup() {
|
||||
// No need for cleanup
|
||||
}
|
||||
|
||||
static inline double gradient_density_lambda(double nmean, double g) {
|
||||
return nmean * g;
|
||||
}
|
||||
|
||||
static inline double density_lambda(double nmean, double v) {
|
||||
return nmean * (1 + v);
|
||||
}
|
||||
|
||||
inline double get_linear_bias() const { return 1; }
|
||||
|
||||
template <typename Array>
|
||||
static inline bool check_bias_constraints(Array &&a) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// compute_density takes a physical matter density field
// and produces a set of fields that can be consumed by a likelihood.
// For example, it can be a biased density field and the allowed variance
// per voxel.
// Formally, that generates a vector a_i = f_i({delta}).
//
// This function returns a tuple of array-like objects. Those objects
// depend on the existence of the final density array.
// The return type is quite complex as it can be an abstract expression
// and not a real array. Here we let the compiler decide with an auto
// type.
|
||||
template <typename FinalDensityArray>
|
||||
inline auto compute_density(const FinalDensityArray &array) {
|
||||
return std::make_tuple(b_va_fused<double>(
|
||||
std::bind(density_lambda, nmean, ph::_1), array));
|
||||
}
|
||||
|
||||
// This function returns a tuple of array-like arrays.
// Those arrays depend on the existence of the final density
// array and the gradient likelihood array. It is the job of
// the caller to ensure that temporary variables are not cleared
// before the final use.
//
// The return type may be quite complex depending on the details of the
// implementation of the gradient. If one decides to use a real
// array then the return type is just a tuple of arrays. However
// it is also allowed to be a tuple of expressions acting on arrays.
// In practice we let the compiler decide here.
// The "biased fields" are assumed to be a_i = f_i({delta}).
// The adjoint gradient of the density field must be returned as a 1-tuple.
|
||||
template <
|
||||
typename FinalDensityArray, typename TupleGradientLikelihoodArray>
|
||||
inline auto apply_adjoint_gradient(
|
||||
const FinalDensityArray &array,
|
||||
TupleGradientLikelihoodArray grad_array) {
|
||||
BOOST_STATIC_ASSERT(
|
||||
(std::tuple_size<TupleGradientLikelihoodArray>::value == 1));
|
||||
return std::make_tuple(b_va_fused<double>(
|
||||
std::bind(gradient_density_lambda, nmean, ph::_1),
|
||||
std::move(std::get<0>(grad_array))));
|
||||
}
|
||||
|
||||
/**
|
||||
* This function computes the adjoint gradient
* with respect to the bias coefficient i. It is provided with
* the same information as apply_adjoint_gradient.
|
||||
* It is expected to return an array-like object.
|
||||
*/
|
||||
template <
|
||||
typename FinalDensityArray, typename TupleGradientLikelihoodArray>
|
||||
inline auto apply_adjoint_gradient_bias(const FinalDensityArray& array, TupleGradientLikelihoodArray grad_array, unsigned int i) {
|
||||
Console::instance().c_assert(i < 1, "Invalid range of bias parameter");
|
||||
boost::multi_array<double, 1> output_grad(boost::extents[1]);
|
||||
output_grad[0] = ((1.0 + fwrap(array)) * fwrap(std::get<0>(grad_array))).sum();
|
||||
return output_grad;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace detail_noop
|
||||
|
||||
/// Import the Noop class into LibLSS::bias
|
||||
using detail_noop::Noop;
|
||||
|
||||
} // namespace bias
|
||||
|
||||
} // namespace LibLSS
|
||||
|
||||
#endif
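
Noop::apply_adjoint_gradient_bias above is the chain rule for the single parameter: with rho = nmean * (1 + delta), the derivative with respect to nmean is 1 + delta, so the parameter gradient is the sum over voxels of (1 + delta) times the likelihood gradient. Below is a scalar sketch of that reduction, using plain std::vector in place of the fused virtual arrays; the name is illustrative.

// Sketch only: gradient of the likelihood with respect to nmean for the Noop
// bias, i.e. sum_v (1 + delta_v) * dL/drho_v.
#include <cstddef>
#include <vector>

static double noop_bias_parameter_gradient(
    const std::vector<double> &delta, const std::vector<double> &grad_like) {
  double g = 0;
  for (std::size_t v = 0; v < delta.size(); v++)
    g += (1.0 + delta[v]) * grad_like[v];
  return g;
}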
85
extra/borg/libLSS/physics/bias/passthrough.hpp
Normal file
@@ -0,0 +1,85 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/bias/passthrough.hpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_PHYSICS_PASSTHROUGH_HPP
|
||||
#define __LIBLSS_PHYSICS_PASSTHROUGH_HPP
|
||||
|
||||
#include <cmath>
|
||||
#include <tuple>
|
||||
#include "libLSS/physics/bias/base.hpp"
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
namespace bias {
|
||||
|
||||
namespace detail_passthrough {
|
||||
|
||||
/**
* Passthrough bias: forwards the density field unchanged.
*/
|
||||
struct Passthrough {
|
||||
|
||||
static constexpr const bool NmeanIsBias = true;
|
||||
static const auto numParams = 0;
|
||||
|
||||
// This adaptor transforms an unselected galaxy density (with appropriate
|
||||
// auxiliary arrays) to a selected array. It must be a functor accepting two
|
||||
// parameters: a selection virtual array and a bias density virtual array.
|
||||
selection::SimpleAdaptor selection_adaptor;
|
||||
|
||||
template <typename BiasParameters>
|
||||
static inline void setup_default(BiasParameters ¶ms) {}
|
||||
|
||||
Passthrough(LikelihoodInfo const& = LikelihoodInfo()) {}
|
||||
|
||||
// Nota: fwd_model and final_density arrays cannot be stored in this step. But
|
||||
// they can be used.
|
||||
template <
|
||||
class ForwardModel, typename FinalDensityArray,
|
||||
typename BiasParameters, typename MetaSelect = NoSelector>
|
||||
inline void prepare(
|
||||
ForwardModel &fwd_model, const FinalDensityArray &final_density,
|
||||
double const _nmean, const BiasParameters ¶ms,
|
||||
bool density_updated, MetaSelect _select = MetaSelect()) {}
|
||||
|
||||
inline void cleanup() {
|
||||
// No need for cleanup
|
||||
}
|
||||
|
||||
inline double get_linear_bias() const { return 1; }
|
||||
|
||||
template <typename Array>
|
||||
static inline bool check_bias_constraints(Array &&a) {
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename FinalDensityArray>
|
||||
inline auto compute_density(const FinalDensityArray &array) {
|
||||
return std::make_tuple(std::cref(array));
|
||||
}
|
||||
|
||||
template <
|
||||
typename FinalDensityArray, typename TupleGradientLikelihoodArray>
|
||||
inline auto apply_adjoint_gradient(
|
||||
const FinalDensityArray &array,
|
||||
TupleGradientLikelihoodArray grad_array) {
|
||||
return grad_array;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace detail_passthrough
|
||||
|
||||
/// Import the Passthrough class into LibLSS::bias
|
||||
using detail_passthrough::Passthrough;
|
||||
|
||||
} // namespace bias
|
||||
|
||||
} // namespace LibLSS
|
||||
|
||||
#endif
141
extra/borg/libLSS/physics/bias/power_law.hpp
Normal file
@@ -0,0 +1,141 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/bias/power_law.hpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_PHYSICS_POWER_LAW_BIAS_HPP
|
||||
#define __LIBLSS_PHYSICS_POWER_LAW_BIAS_HPP
|
||||
|
||||
// This header provides the implementations of a simple power law bias model.
|
||||
// A generic bias model must implement the following concept:
|
||||
// a "selection_adaptor" functor, available from the object
|
||||
// a "compute_density(final_density)" function accepting a virtual array and returning a tuple of virtual array.
|
||||
// a "apply_adjoint_gradient(final_density,gradient_likelihood_array)" also accepting virtual arrays and returning a virtual array
|
||||
// a "prepare" function which is called before the other two calls, to allow for precomputing auxiliary complex fields.
|
||||
// a "cleanup" which allows for tearing down any auxiliary fields and memory allocated in the prepare phase.
|
||||
|
||||
#include <cmath>
|
||||
#include <boost/bind.hpp>
|
||||
#include "libLSS/tools/fused_array.hpp"
|
||||
#include <tuple>
|
||||
#include "libLSS/tools/phoenix_vars.hpp"
|
||||
#include <boost/phoenix/operator.hpp>
|
||||
#include <boost/phoenix/stl/cmath.hpp>
|
||||
#include "libLSS/tools/tuple_helper.hpp"
|
||||
#include "libLSS/physics/bias/base.hpp"
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
namespace bias {
|
||||
|
||||
namespace detail {
|
||||
|
||||
inline double dummy_lambda(double) { return 0; }
|
||||
|
||||
struct PowerLaw {
|
||||
static constexpr double EPSILON_VOIDS = 1e-6;
|
||||
static constexpr int numParams = 2;
|
||||
static constexpr const bool NmeanIsBias = true;
|
||||
|
||||
double alpha;
|
||||
double nmean;
|
||||
|
||||
// This adaptor transforms an unselected galaxy density (with appropriate
|
||||
// auxiliary arrays) to a selected array. It must be a functor accepting two
|
||||
// parameters: a selection virtual array and a bias density virtual array.
|
||||
selection::SimpleAdaptor selection_adaptor;
|
||||
|
||||
PowerLaw(LikelihoodInfo const& = LikelihoodInfo()) {}
|
||||
|
||||
template <typename BiasParameters>
|
||||
static inline void setup_default(BiasParameters ¶ms) {
|
||||
params[0] = 10;
|
||||
params[1] = 0.2;
|
||||
}
|
||||
|
||||
// Nota: fwd_model and final_density arrays cannot be stored in this step. But
|
||||
// they can be used.
|
||||
template <
|
||||
class ForwardModel, typename FinalDensityArray,
|
||||
typename BiasParameters, typename MetaSelect = NoSelector>
|
||||
inline void prepare(
|
||||
ForwardModel &fwd_model, const FinalDensityArray &final_density,
|
||||
double const _nmean, const BiasParameters ¶ms,
|
||||
bool density_updated, MetaSelect _select = MetaSelect()) {
|
||||
// No need for complex preparation. Just copy parameters.
|
||||
nmean = params[0];
|
||||
alpha = params[1];
|
||||
}
|
||||
|
||||
inline void cleanup() {
|
||||
// No need for cleanup
|
||||
}
|
||||
|
||||
static inline double gradient_density_lambda(
|
||||
double nmean, double alpha, double v, double g) {
|
||||
return alpha * nmean * std::pow(1 + v + EPSILON_VOIDS, alpha - 1) * g;
|
||||
}
|
||||
|
||||
static inline double
|
||||
density_lambda(double nmean, double alpha, double v) {
|
||||
double v0 = nmean * std::pow(1 + EPSILON_VOIDS + v, alpha);
|
||||
Console::instance().c_assert(!std::isinf(v0), "V is infinite");
|
||||
return v0;
|
||||
}
|
||||
|
||||
inline double get_linear_bias() const { return alpha; }
|
||||
|
||||
template <typename Array>
|
||||
static inline bool check_bias_constraints(Array &&a) {
|
||||
return a[0] > 0 && a[1] > 0 && a[1] < 5; // nmean > 0; alpha restricted to (0, 5)
|
||||
}
|
||||
|
||||
// This function returns an array-like object. That array
// depends on the existence of the final density array.
// The return type is quite complex. Let the compiler decide.
// C++11 does not allow automatic return type deduction. C++14 would be
// needed for that. So we would have to rely on an auxiliary function that
// allows for a compact decltype to be written.
|
||||
template <typename FinalDensityArray>
|
||||
inline auto compute_density(const FinalDensityArray &array) {
|
||||
return std::make_tuple(b_va_fused<double>(
|
||||
std::bind(
|
||||
density_lambda, nmean, alpha, std::placeholders::_1),
|
||||
array));
|
||||
}
|
||||
|
||||
// This function returns an array-like object. That array
// depends on the existence of the final density array and the gradient likelihood array.
// It is the job of the caller to ensure that temporary variables are not cleared
// before the final use.
// The return type is quite complex. Let the compiler decide.
// L(b_0(delta, p), b_1(delta, p), ..., b_n(delta, p))
// Now we take a tuple of gradients and collapse it to a gradient of delta.
|
||||
template <
|
||||
typename FinalDensityArray, typename TupleGradientLikelihoodArray>
|
||||
inline auto apply_adjoint_gradient(
|
||||
const FinalDensityArray &array,
|
||||
TupleGradientLikelihoodArray grad_array) {
|
||||
BOOST_STATIC_ASSERT(
|
||||
(std::tuple_size<TupleGradientLikelihoodArray>::value == 1));
|
||||
return std::make_tuple(b_va_fused<double>(
|
||||
std::bind(
|
||||
gradient_density_lambda, nmean, alpha,
|
||||
std::placeholders::_1, std::placeholders::_2),
|
||||
array, std::move(std::get<0>(grad_array))));
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace detail
|
||||
|
||||
using detail::PowerLaw;
|
||||
|
||||
} // namespace bias
|
||||
|
||||
} // namespace LibLSS
|
||||
|
||||
#endif
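
PowerLaw above implements rho = nmean * (1 + EPSILON_VOIDS + delta)^alpha, and its adjoint gradient uses the analytic derivative alpha * nmean * (1 + EPSILON_VOIDS + delta)^(alpha - 1). Below is a minimal standalone check of that derivative against a central finite difference; the helper names are illustrative and not part of the library.

// Sketch only: the power-law bias relation and its derivative in delta.
#include <cassert>
#include <cmath>

static const double EPS_VOIDS = 1e-6;

static double power_law_density(double nmean, double alpha, double delta) {
  return nmean * std::pow(1.0 + EPS_VOIDS + delta, alpha);
}

static double power_law_gradient(double nmean, double alpha, double delta) {
  return alpha * nmean * std::pow(1.0 + EPS_VOIDS + delta, alpha - 1.0);
}

int main() {
  const double nmean = 10.0, alpha = 0.2, delta = 0.4, h = 1e-6;
  const double fd = (power_law_density(nmean, alpha, delta + h) -
                     power_law_density(nmean, alpha, delta - h)) /
                    (2 * h);
  assert(std::abs(fd - power_law_gradient(nmean, alpha, delta)) < 1e-5);
  return 0;
}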
651
extra/borg/libLSS/physics/forwards/2lpt/borg_fwd_2lpt.cpp
Normal file
@@ -0,0 +1,651 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/2lpt/borg_fwd_2lpt.cpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
|
||||
// Be advised that deltao is already scaled. So the first Fourier transform from deltao -> real space
// is free of scaling; all the others have to be scaled by 1/(L^3).
|
||||
|
||||
#include "../pm/plane_xchg.hpp"
|
||||
|
||||
template <typename CIC>
|
||||
void Borg2LPTModel<CIC>::lpt2_ic(
|
||||
CArrayRef &deltao, PhaseArrayRef &pos, PhaseArrayRef &vel,
|
||||
PhaseArrayRef &lctim) {
|
||||
// set cosmological parameters
|
||||
// Initial density is scaled to initial redshift!
|
||||
ConsoleContext<LOG_DEBUG> ctx("lpt2_ic");
|
||||
Cosmology cosmo(cosmo_params);
|
||||
double fft_normalization = 1.0 / (c_N0 * c_N1 * c_N2);
|
||||
double inv_volume = 1 / (L0 * L1 * L2);
|
||||
|
||||
typedef UninitializedArray<
|
||||
FFTW_Complex_Array, FFTW_Allocator<std::complex<double>>>
|
||||
U_CArray;
|
||||
typedef U_CArray::array_type Ref_CArray;
|
||||
|
||||
U_CArray tmp_p(mgr->extents_complex(), mgr->allocator_complex);
|
||||
Ref_CArray &tmp = tmp_p.get_array();
|
||||
|
||||
for (int axis = 0; axis < 3; axis++) {
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (size_t i = c_startN0; i < c_startN0 + c_localN0; i++)
|
||||
for (size_t j = 0; j < c_N1; j++)
|
||||
for (size_t k = 0; k < c_N2_HC; k++) {
|
||||
double kk[3] = {kmode(i, c_N0, L0), kmode(j, c_N1, L1),
|
||||
kmode(k, c_N2, L2)};
|
||||
|
||||
double ksquared = kk[0] * kk[0] + kk[1] * kk[1] + kk[2] * kk[2];
|
||||
double fac = -kk[axis] / ksquared * inv_volume;
|
||||
|
||||
std::complex<double> &in_delta = deltao[i][j][k];
|
||||
|
||||
tmp[i][j][k] = std::complex<double>(
|
||||
-in_delta.imag() * fac, in_delta.real() * fac);
|
||||
|
||||
std::complex<double> &aux = tmp[i][j][k];
|
||||
|
||||
// calculate second order LPT terms
|
||||
if (axis == 0) {
|
||||
/* disp0,0 */
|
||||
(*c_psi_00)[i][j][k] =
|
||||
std::complex<double>(-aux.imag() * kk[0], aux.real() * kk[0]);
|
||||
|
||||
/* disp0,1 */
|
||||
(*c_psi_01)[i][j][k] =
|
||||
std::complex<double>(-aux.imag() * kk[1], aux.real() * kk[1]);
|
||||
|
||||
/* disp0,2 */
|
||||
(*c_psi_02)[i][j][k] =
|
||||
std::complex<double>(-aux.imag() * kk[2], aux.real() * kk[2]);
|
||||
}
|
||||
|
||||
// calculate second order LPT terms
|
||||
if (axis == 1) {
|
||||
/* disp1,1 */
|
||||
(*c_psi_11)[i][j][k] =
|
||||
std::complex<double>(-aux.imag() * kk[1], aux.real() * kk[1]);
|
||||
|
||||
/* disp1,2 */
|
||||
(*c_psi_12)[i][j][k] =
|
||||
std::complex<double>(-aux.imag() * kk[2], aux.real() * kk[2]);
|
||||
}
|
||||
|
||||
// calculate second order LPT terms
|
||||
if (axis == 2) {
|
||||
/* disp2,2 */
|
||||
(*c_psi_22)[i][j][k] =
|
||||
std::complex<double>(-aux.imag() * kk[2], aux.real() * kk[2]);
|
||||
}
|
||||
}
|
||||
|
||||
if (c_startN0 == 0 && c_localN0 > 0) {
|
||||
tmp[0][0][0] = 0;
|
||||
tmp[0][0][c_N2_HC - 1] = 0;
|
||||
tmp[0][c_N1 / 2][0] = 0;
|
||||
tmp[0][c_N1 / 2][c_N2_HC - 1] = 0;
|
||||
|
||||
(*c_psi_00)[0][0][0] = 0;
|
||||
(*c_psi_00)[0][0][c_N2_HC - 1] = 0;
|
||||
(*c_psi_00)[0][c_N1 / 2][0] = 0;
|
||||
(*c_psi_00)[0][c_N1 / 2][c_N2_HC - 1] = 0;
|
||||
|
||||
(*c_psi_01)[0][0][0] = 0;
|
||||
(*c_psi_01)[0][0][c_N2_HC - 1] = 0;
|
||||
(*c_psi_01)[0][c_N1 / 2][0] = 0;
|
||||
(*c_psi_01)[0][c_N1 / 2][c_N2_HC - 1] = 0;
|
||||
|
||||
(*c_psi_02)[0][0][0] = 0;
|
||||
(*c_psi_02)[0][0][c_N2_HC - 1] = 0;
|
||||
(*c_psi_02)[0][c_N1 / 2][0] = 0;
|
||||
(*c_psi_02)[0][c_N1 / 2][c_N2_HC - 1] = 0;
|
||||
|
||||
(*c_psi_11)[0][0][0] = 0;
|
||||
(*c_psi_11)[0][0][c_N2_HC - 1] = 0;
|
||||
(*c_psi_11)[0][c_N1 / 2][0] = 0;
|
||||
(*c_psi_11)[0][c_N1 / 2][c_N2_HC - 1] = 0;
|
||||
|
||||
(*c_psi_12)[0][0][0] = 0;
|
||||
(*c_psi_12)[0][0][c_N2_HC - 1] = 0;
|
||||
(*c_psi_12)[0][c_N1 / 2][0] = 0;
|
||||
(*c_psi_12)[0][c_N1 / 2][c_N2_HC - 1] = 0;
|
||||
|
||||
(*c_psi_22)[0][0][0] = 0;
|
||||
(*c_psi_22)[0][0][c_N2_HC - 1] = 0;
|
||||
(*c_psi_22)[0][c_N1 / 2][0] = 0;
|
||||
(*c_psi_22)[0][c_N1 / 2][c_N2_HC - 1] = 0;
|
||||
}
|
||||
|
||||
if (c_startN0 <= c_N0 / 2 && c_startN0 + c_localN0 > c_N0 / 2) {
|
||||
tmp[c_N0 / 2][0][0] = 0;
|
||||
tmp[c_N0 / 2][0][c_N2_HC - 1] = 0;
|
||||
tmp[c_N0 / 2][c_N1 / 2][0] = 0;
|
||||
tmp[c_N0 / 2][c_N1 / 2][c_N2_HC - 1] = 0;
|
||||
|
||||
(*c_psi_00)[c_N0 / 2][0][0] = 0;
|
||||
(*c_psi_00)[c_N0 / 2][0][c_N2_HC - 1] = 0;
|
||||
(*c_psi_00)[c_N0 / 2][c_N1 / 2][0] = 0;
|
||||
(*c_psi_00)[c_N0 / 2][c_N1 / 2][c_N2_HC - 1] = 0;
|
||||
|
||||
(*c_psi_01)[c_N0 / 2][0][0] = 0;
|
||||
(*c_psi_01)[c_N0 / 2][0][c_N2_HC - 1] = 0;
|
||||
(*c_psi_01)[c_N0 / 2][c_N1 / 2][0] = 0;
|
||||
(*c_psi_01)[c_N0 / 2][c_N1 / 2][c_N2_HC - 1] = 0;
|
||||
|
||||
(*c_psi_02)[c_N0 / 2][0][0] = 0;
|
||||
(*c_psi_02)[c_N0 / 2][0][c_N2_HC - 1] = 0;
|
||||
(*c_psi_02)[c_N0 / 2][c_N1 / 2][0] = 0;
|
||||
(*c_psi_02)[c_N0 / 2][c_N1 / 2][c_N2_HC - 1] = 0;
|
||||
|
||||
(*c_psi_11)[c_N0 / 2][0][0] = 0;
|
||||
(*c_psi_11)[c_N0 / 2][0][c_N2_HC - 1] = 0;
|
||||
(*c_psi_11)[c_N0 / 2][c_N1 / 2][0] = 0;
|
||||
(*c_psi_11)[c_N0 / 2][c_N1 / 2][c_N2_HC - 1] = 0;
|
||||
|
||||
(*c_psi_12)[c_N0 / 2][0][0] = 0;
|
||||
(*c_psi_12)[c_N0 / 2][0][c_N2_HC - 1] = 0;
|
||||
(*c_psi_12)[c_N0 / 2][c_N1 / 2][0] = 0;
|
||||
(*c_psi_12)[c_N0 / 2][c_N1 / 2][c_N2_HC - 1] = 0;
|
||||
|
||||
(*c_psi_22)[c_N0 / 2][0][0] = 0;
|
||||
(*c_psi_22)[c_N0 / 2][0][c_N2_HC - 1] = 0;
|
||||
(*c_psi_22)[c_N0 / 2][c_N1 / 2][0] = 0;
|
||||
(*c_psi_22)[c_N0 / 2][c_N1 / 2][c_N2_HC - 1] = 0;
|
||||
}
|
||||
|
||||
// FFT to Realspace
|
||||
mgr->execute_c2r(c_synthesis_plan, tmp.data(), c_tmp_real_field->data());
|
||||
|
||||
#pragma omp parallel for collapse(2)
|
||||
for (int l = c_startN0; l < c_startN0 + c_localN0; l++)
|
||||
for (int m = 0; m < c_N1; m++)
|
||||
for (int n = 0; n < c_N2; n++) {
|
||||
size_t idx = n + c_N2 * m + c_N2 * c_N1 * (l - c_startN0);
|
||||
vel[idx][axis] = (*c_tmp_real_field)[l][m][n];
|
||||
}
|
||||
}
|
||||
|
||||
// FFT to Realspace 2lpt
|
||||
mgr->execute_c2r(c_synthesis_plan, c_psi_00->data(), r_psi_00->data());
|
||||
mgr->execute_c2r(c_synthesis_plan, c_psi_01->data(), r_psi_01->data());
|
||||
mgr->execute_c2r(c_synthesis_plan, c_psi_02->data(), r_psi_02->data());
|
||||
mgr->execute_c2r(c_synthesis_plan, c_psi_11->data(), r_psi_11->data());
|
||||
mgr->execute_c2r(c_synthesis_plan, c_psi_12->data(), r_psi_12->data());
|
||||
mgr->execute_c2r(c_synthesis_plan, c_psi_22->data(), r_psi_22->data());
|
||||
|
||||
// Calculate source for second order displacement
|
||||
#pragma omp parallel for collapse(2)
|
||||
for (int l = c_startN0; l < c_startN0 + c_localN0; l++)
|
||||
for (int m = 0; m < c_N1; m++)
|
||||
for (int n = 0; n < c_N2; n++) {
|
||||
// Calculates source for second order displacement
|
||||
// RHS of eq. (D6) in Scoccimarro 1998, MNRAS 299, 1097
|
||||
|
||||
double t00 = (*r_psi_00)[l][m][n]; // 0
|
||||
double t01 = (*r_psi_01)[l][m][n]; // 1
|
||||
double t02 = (*r_psi_02)[l][m][n]; // 2
|
||||
double t11 = (*r_psi_11)[l][m][n]; // 3
|
||||
double t12 = (*r_psi_12)[l][m][n]; // 4
|
||||
double t22 = (*r_psi_22)[l][m][n]; // 5
|
||||
|
||||
(*c_tmp_real_field)[l][m][n] =
|
||||
t00 * (t11 + t22) + t11 * t22 - t01 * t01 - t02 * t02 - t12 * t12;
|
||||
(*c_tmp_real_field)[l][m][n] *=
|
||||
fft_normalization; // Multiply here to be ready for the r2c and c2r. L^3 cancels out.
|
||||
}
|
||||
|
||||
// FFT to Fourier-space
|
||||
mgr->execute_r2c(
|
||||
c_analysis_plan, c_tmp_real_field->data(), c_tmp_complex_field->data());
|
||||
|
||||
// create dummy array for second order displacement
|
||||
UninitializedArray<PhaseArray> vel2(extents[c_localN0 * c_N1 * c_N2][3]);
|
||||
|
||||
for (int axis = 0; axis < 3; axis++) {
|
||||
#pragma omp parallel for collapse(2)
|
||||
for (int i = c_startN0; i < c_startN0 + c_localN0; i++)
|
||||
for (int j = 0; j < c_N1; j++)
|
||||
for (int k = 0; k < c_N2_HC; k++) {
|
||||
double kk[3] = {kmode(i, c_N0, L0), kmode(j, c_N1, L1),
|
||||
kmode(k, c_N2, L2)};
|
||||
|
||||
double ksquared = kk[0] * kk[0] + kk[1] * kk[1] + kk[2] * kk[2];
|
||||
double fac = -kk[axis] / ksquared;
|
||||
|
||||
std::complex<double> &aux = (*c_tmp_complex_field)[i][j][k];
|
||||
|
||||
tmp[i][j][k] =
|
||||
std::complex<double>(-aux.imag() * fac, aux.real() * fac);
|
||||
}
|
||||
|
||||
if (c_startN0 == 0 && c_localN0 > 0) {
|
||||
tmp[0][0][0] = 0;
|
||||
tmp[0][0][c_N2_HC - 1] = 0;
|
||||
tmp[0][c_N1 / 2][0] = 0;
|
||||
tmp[0][c_N1 / 2][c_N2_HC - 1] = 0;
|
||||
}
|
||||
|
||||
if (c_startN0 <= c_N0 / 2 && c_startN0 + c_localN0 > c_N0 / 2) {
|
||||
tmp[c_N0 / 2][0][0] = 0;
|
||||
tmp[c_N0 / 2][0][c_N2_HC - 1] = 0;
|
||||
tmp[c_N0 / 2][c_N1 / 2][0] = 0;
|
||||
tmp[c_N0 / 2][c_N1 / 2][c_N2_HC - 1] = 0;
|
||||
}
|
||||
|
||||
// FFT to Realspace
|
||||
mgr->execute_c2r(c_synthesis_plan, tmp.data(), c_tmp_real_field->data());
|
||||
|
||||
size_t c_endN0 = c_startN0 + c_localN0;
|
||||
auto &v_array = vel2.get_array();
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (size_t l = c_startN0; l < c_endN0; l++)
|
||||
for (size_t m = 0; m < c_N1; m++)
|
||||
for (size_t n = 0; n < c_N2; n++) {
|
||||
size_t idx = n + c_N2 * m + c_N2 * c_N1 * (l - c_startN0);
|
||||
v_array[idx][axis] = (*c_tmp_real_field)[l][m][n];
|
||||
}
|
||||
}
|
||||
|
||||
auto &ids = *lagrangian_id;
|
||||
size_t base_id = c_N2 * c_N1 * c_startN0;
|
||||
|
||||
#pragma omp parallel for collapse(2)
|
||||
for (int l = c_startN0; l < c_startN0 + c_localN0; l++)
|
||||
for (int m = 0; m < c_N1; m++)
|
||||
for (int n = 0; n < c_N2; n++) {
|
||||
// sort particles on equidistant grid
|
||||
double q0 = L0 / double(c_N0) * double(l);
|
||||
double q1 = L1 / double(c_N1) * double(m);
|
||||
double q2 = L2 / double(c_N2) * double(n);
|
||||
|
||||
size_t idx = n + c_N2 * m + c_N2 * c_N1 * (l - c_startN0);
|
||||
|
||||
double DD1 = lctim[idx][0];
|
||||
double DD2 = lctim[idx][3];
|
||||
auto displ = vel[idx];
|
||||
auto displ_2 = vel2.get_array()[idx];
|
||||
|
||||
double x = q0 - DD1 * displ[0] + DD2 * displ_2[0];
|
||||
double y = q1 - DD1 * displ[1] + DD2 * displ_2[1];
|
||||
double z = q2 - DD1 * displ[2] + DD2 * displ_2[2];
|
||||
|
||||
// enforce periodic boundary conditions
|
||||
pos[idx][0] = periodic_fix(x, 0., L0);
|
||||
pos[idx][1] = periodic_fix(y, 0., L1);
|
||||
pos[idx][2] = periodic_fix(z, 0., L2);
|
||||
|
||||
ids[idx] = idx + base_id;
|
||||
|
||||
// NOTE: displacements are already stored in the velocity vectors. Only need to multiply by prefactor
|
||||
|
||||
// store velocities in km/sec
|
||||
// note we multiply by a^2 to get the correct momentum variable for the particle mesh code
|
||||
// and normalize to code units
|
||||
double v1_scaling = lctim[idx][1];
|
||||
double v2_scaling = lctim[idx][4];
|
||||
displ[0] = displ[0] * v1_scaling + v2_scaling * displ_2[0];
|
||||
displ[1] = displ[1] * v1_scaling + v2_scaling * displ_2[1];
|
||||
displ[2] = displ[2] * v1_scaling + v2_scaling * displ_2[2];
|
||||
}
|
||||
|
||||
realInfo.localNumParticlesAfter = realInfo.localNumParticlesBefore =
|
||||
c_localN0 * c_N1 * c_N2;
|
||||
redshiftInfo.localNumParticlesBefore = realInfo.localNumParticlesAfter;
|
||||
redshiftInfo.localNumParticlesAfter = realInfo.localNumParticlesAfter;
|
||||
}
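
// --- Illustrative sketch, not part of the original file ---------------------
// The loop above assembles the second-order Lagrangian source, i.e. the sum
// of the principal 2x2 minors of the tidal tensor psi_{ij} (RHS of eq. D6 in
// Scoccimarro 1998, MNRAS 299, 1097). As a scalar helper (hypothetical name,
// shown only for reference):
static inline double lpt2_source_term(
    double t00, double t01, double t02, double t11, double t12, double t22) {
  return t00 * (t11 + t22) + t11 * t22 - t01 * t01 - t02 * t02 - t12 * t12;
}
// -----------------------------------------------------------------------------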
|
||||
|
||||
template <typename CIC>
|
||||
void Borg2LPTModel<CIC>::lpt2_redshift_pos(
|
||||
PhaseArrayRef &pos, PhaseArrayRef &vel, PhaseArrayRef &s_pos,
|
||||
PhaseArrayRef &lctim) {
|
||||
Cosmology cosmo(cosmo_params);
|
||||
|
||||
// this routine generates particle positions in redshift space
|
||||
// the code uses particle momenta p=a^2 dx/dt where x is the co-moving position
|
||||
// peculiar velocities are then given by v_pec = p/a
|
||||
|
||||
// NOTE: Check coefficients
|
||||
ArrayType1d::ArrayType &observer = vobs;
|
||||
|
||||
#pragma omp parallel for
|
||||
for (size_t idx = 0; idx < realInfo.localNumParticlesAfter; idx++) {
|
||||
double x0g = pos[idx][0];
|
||||
double x1g = pos[idx][1];
|
||||
double x2g = pos[idx][2];
|
||||
|
||||
double x0 = x0g + xmin0;
|
||||
double x1 = x1g + xmin1;
|
||||
double x2 = x2g + xmin2;
|
||||
|
||||
double v0 = vel[idx][0] + observer[0];
|
||||
double v1 = vel[idx][1] + observer[1];
|
||||
double v2 = vel[idx][2] + observer[2];
|
||||
|
||||
double r2_los = x0 * x0 + x1 * x1 + x2 * x2;
|
||||
double v_los = v0 * x0 + v1 * x1 + v2 * x2;
|
||||
|
||||
double facRSD = lctim[idx][2]; // this factor is 1/(a H) for velocities in km/s; an additional factor arises from the momentum conversion
|
||||
|
||||
double A = facRSD * v_los / r2_los;
|
||||
|
||||
double s0 = x0g + A * x0;
|
||||
double s1 = x1g + A * x1;
|
||||
double s2 = x2g + A * x2;
|
||||
|
||||
// enforce periodic boundary conditions
|
||||
s_pos[idx][0] = periodic_fix(s0, 0., L0);
|
||||
s_pos[idx][1] = periodic_fix(s1, 0., L1);
|
||||
s_pos[idx][2] = periodic_fix(s2, 0., L2);
|
||||
}
|
||||
// Update the info for redshift particles
|
||||
redshiftInfo.localNumParticlesAfter = redshiftInfo.localNumParticlesBefore =
|
||||
realInfo.localNumParticlesAfter;
|
||||
}
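
// --- Illustrative sketch, not part of the original file ---------------------
// lpt2_redshift_pos above displaces each particle along its line of sight by
// A * x with A = facRSD * (v . x) / |x|^2, where facRSD bundles 1/(a H) and
// the momentum conversion. For a single particle (observer offset omitted,
// hypothetical name):
static inline void redshift_space_shift(
    const double x[3], const double v[3], double facRSD, double s[3]) {
  const double r2 = x[0] * x[0] + x[1] * x[1] + x[2] * x[2];
  const double v_los = v[0] * x[0] + v[1] * x[1] + v[2] * x[2];
  const double A = facRSD * v_los / r2;
  for (int a = 0; a < 3; a++)
    s[a] = x[a] + A * x[a];
}
// -----------------------------------------------------------------------------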
|
||||
|
||||
template <typename CIC>
|
||||
void Borg2LPTModel<CIC>::lpt2_density_obs(
|
||||
PhaseArrayRef &pos, ArrayRef &deltao, size_t numParts) {
|
||||
double const nmean = double(c_N0*c_N1*c_N2)/(box_output.N0*box_output.N1*box_output.N2);
|
||||
|
||||
if (ALWAYS_MPI(comm)) {
|
||||
typedef UninitializedArray<boost::multi_array<double, 3>> U_Array;
|
||||
typedef U_Array::array_type::index_range i_range;
|
||||
U_Array::array_type::index_gen indices;
|
||||
// Here we have to introduce ghost planes.
|
||||
U_Array tmp_delta(out_mgr->extents_real(CIC::MPI_PLANE_LEAKAGE));
|
||||
|
||||
array::fill(tmp_delta.get_array(), 0);
|
||||
Console::instance().print<LOG_VERBOSE>(format("numParts = %d") % numParts);
|
||||
CIC::projection(
|
||||
pos, tmp_delta.get_array(), L0, L1, L2, out_mgr->N0, out_mgr->N1, out_mgr->N2,
|
||||
typename CIC::Periodic_MPI(out_mgr->N0, out_mgr->N1, out_mgr->N2, comm),
|
||||
CIC_Tools::DefaultWeight(), numParts);
|
||||
// CIC has MPI_PLANE_LEAKAGE extra planes. They have to be sent to the adequate nodes.
|
||||
density_exchange_planes<true>(
|
||||
comm, tmp_delta.get_array(), out_mgr, CIC::MPI_PLANE_LEAKAGE);
|
||||
|
||||
fwrap(deltao[out_mgr->strict_range()]) =
|
||||
tmp_delta.get_array()[out_mgr->strict_range()];
|
||||
} else {
|
||||
array::fill(deltao, 0);
|
||||
cic.projection(
|
||||
pos, deltao, L0, L1, L2, out_mgr->N0, out_mgr->N1, out_mgr->N2, CIC_Tools::Periodic(out_mgr->N0, out_mgr->N1, out_mgr->N2),
|
||||
CIC_Tools::DefaultWeight(), numParts);
|
||||
}
|
||||
|
||||
array::density_rescale(deltao, nmean);
|
||||
|
||||
if (DUMP_BORG_DENSITY) {
|
||||
std::string fname = str(format("borg_density_%d.h5") % comm->rank());
|
||||
H5::H5File f(fname, H5F_ACC_TRUNC);
|
||||
CosmoTool::hdf5_write_array(f, "density", deltao);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
void Borg2LPTModel<CIC>::lpt2_fwd_model(
|
||||
CArrayRef &deltao, PhaseArrayRef &pos, PhaseArrayRef &vel,
|
||||
PhaseArrayRef &lctim) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("BORG forward model");
|
||||
|
||||
if (c_deltao != 0) {
|
||||
array::fill(*c_deltao, 0);
|
||||
mgr->upgrade_complex(*lo_mgr, deltao, *c_deltao);
|
||||
lpt2_ic(*c_deltao, pos, vel, lctim);
|
||||
} else {
|
||||
// NOTE: ICs are generated at ai
|
||||
lpt2_ic(deltao, pos, vel, lctim);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
void Borg2LPTModel<CIC>::forwardModelRsdField(
|
||||
ArrayRef &deltaf, double *vobs_ext) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("BORG forward model rsd density calculation");
|
||||
|
||||
// introduce redshift distortions
|
||||
if (do_rsd) {
|
||||
// ArrayType1d::ArrayType& dummy = vobs;
|
||||
|
||||
// set vobs to input
|
||||
double dummy[3];
|
||||
|
||||
dummy[0] = vobs[0];
|
||||
dummy[1] = vobs[1];
|
||||
dummy[2] = vobs[2];
|
||||
|
||||
vobs[0] = vobs_ext[0];
|
||||
vobs[1] = vobs_ext[1];
|
||||
vobs[2] = vobs_ext[2];
|
||||
// ctx.print(format("old_v = %g,%g,%g vnew = %g,%g,%g") % vobs[0] % vobs[1] % vobs[2] % vobs_ext[0] % vobs_ext[1] % vobs_ext[2]);
|
||||
|
||||
ctx.print("doing redshift space distortions.");
|
||||
lpt2_redshift_pos(
|
||||
u_pos->get_array(), u_vel->get_array(), u_s_pos->get_array(),
|
||||
lc_timing->get_array());
|
||||
|
||||
// Reset indexes
|
||||
LibLSS::initIndexes(
|
||||
redshiftInfo.u_idx->get_array(), redshiftInfo.localNumParticlesBefore);
|
||||
particle_redistribute(
|
||||
redshiftInfo, u_s_pos->get_array(),
|
||||
typename CIC::Distribution(lo_mgr, L0, L1, L2));
|
||||
|
||||
lpt2_density_obs(
|
||||
u_s_pos->get_array(), deltaf, redshiftInfo.localNumParticlesAfter);
|
||||
|
||||
// reset vobs
|
||||
vobs[0] = dummy[0];
|
||||
vobs[1] = dummy[1];
|
||||
vobs[2] = dummy[2];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
void Borg2LPTModel<CIC>::test_lpt2_velocities(MarkovState &state) {
|
||||
typedef ArrayStateElement<double, 2> PhaseArrayElement;
|
||||
auto s_hat_p = mgr->allocate_complex_array();
|
||||
auto &s_hat = s_hat_p.get_array();
|
||||
IArrayType::ArrayType &key_array = *state.get<IArrayType>("k_keys")->array;
|
||||
ArrayType1d::ArrayType &pspec =
|
||||
*state.get<ArrayType1d>("powerspectrum")->array;
|
||||
int k0_test = 2, k1_test = 5, k2_test = 3;
|
||||
double A_k = pspec[key_array[k0_test][k1_test][k2_test]] * volume;
|
||||
|
||||
PhaseArrayElement *pos_e =
|
||||
new PhaseArrayElement(extents[c_localN0 * c_N1 * c_N2][3]);
|
||||
PhaseArrayElement *vel_e =
|
||||
new PhaseArrayElement(extents[c_localN0 * c_N1 * c_N2][3]);
|
||||
PhaseArrayElement *timing_e =
|
||||
new PhaseArrayElement(extents[c_localN0 * c_N1 * c_N2][5]);
|
||||
|
||||
state.newElement("lpt2_pos", pos_e);
|
||||
state.newElement("lpt2_vel", vel_e);
|
||||
|
||||
auto &pos = *pos_e->array;
|
||||
auto &vel = *vel_e->array;
|
||||
auto &lc_timing = *timing_e->array;
|
||||
|
||||
gen_light_cone_timing(lc_timing);
|
||||
|
||||
array::EigenMap<CArray>::map(s_hat).fill(0);
|
||||
s_hat[k0_test][k1_test][k2_test] = std::sqrt(A_k) / volume;
|
||||
|
||||
// Hermiticity_fixup(s_hat);
|
||||
|
||||
state.newScalar<double>("A_k_test", std::sqrt(A_k));
|
||||
ArrayType1d *e_k_pos;
|
||||
state.newElement("k_pos_test", e_k_pos = new ArrayType1d(extents[3]));
|
||||
ArrayType1d::ArrayType &a_k_pos = *(e_k_pos->array);
|
||||
|
||||
a_k_pos[0] = kmode(k0_test, N0, L0);
|
||||
a_k_pos[1] = kmode(k1_test, N1, L1);
|
||||
a_k_pos[2] = kmode(k2_test, N2, L2);
|
||||
|
||||
lpt2_ic(s_hat, pos, vel, lc_timing);
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
void Borg2LPTModel<CIC>::gen_light_cone_timing(PhaseArrayRef &lctim) {
|
||||
Cosmology cosmo(cosmo_params);
|
||||
double D0 = cosmo.d_plus(a_init);
|
||||
|
||||
double a_lc = af;
|
||||
|
||||
double D1 = cosmo.d_plus(a_lc) / D0;
|
||||
double f1 = cosmo.g_plus(a_lc);
|
||||
double Hubble = cosmo.Hubble(a_lc) / cosmo_params.h; // km /sec /(Mpc/h)
|
||||
double v_scaling = -D1 * f1 * a_lc * a_lc * Hubble;
|
||||
double facRSD = 1. / a_lc / Hubble;
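// Layout of the per-particle timing array filled below (same in the light-cone and
// constant-time branches):
//   lctim[idx][0] = D1                        (1st-order growth, normalized to D(a_init))
//   lctim[idx][1] = -D1 * f1 * a^2 * H        (1st-order velocity scaling)
//   lctim[idx][2] = 1 / (a H)                 (facRSD, used for redshift-space displacements)
//   lctim[idx][3] = -3/7 * D1^2               (2nd-order growth D2)
//   lctim[idx][4] = -3/7 * D1^2 * 2 f1 a^2 H  (2nd-order velocity scaling)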
|
||||
|
||||
if (lctime) {
|
||||
using boost::lambda::_1;
|
||||
|
||||
double r0 = 0.;
|
||||
double r1 =
|
||||
2. * sqrt(
|
||||
(L0 + xmin0) * (L0 + xmin0) + (L1 + xmin1) * (L1 + xmin1) +
|
||||
(L2 + xmin2) * (L2 + xmin2));
|
||||
double step = 1.;
|
||||
|
||||
auto auxDplus = build_auto_interpolator(
|
||||
boost::bind(&Cosmology::comph2d_plus, &cosmo, _1), r0, r1, step,
|
||||
cosmo.comph2d_plus(r0), cosmo.comph2d_plus(r1));
|
||||
auto auxGplus = build_auto_interpolator(
|
||||
boost::bind(&Cosmology::comph2g_plus, &cosmo, _1), r0, r1, step,
|
||||
cosmo.comph2g_plus(r0), cosmo.comph2g_plus(r1));
|
||||
auto auxHubble = build_auto_interpolator(
|
||||
boost::bind(&Cosmology::comph2Hubble, &cosmo, _1), r0, r1, step,
|
||||
cosmo.comph2Hubble(r0), cosmo.comph2Hubble(r1));
|
||||
auto auxa = build_auto_interpolator(
|
||||
boost::bind(&Cosmology::comph2a, &cosmo, _1), r0, r1, step,
|
||||
cosmo.comph2a(r0), cosmo.comph2a(r1));
|
||||
|
||||
// For every particle calculate distance to observer
|
||||
size_t c_endN0 = c_startN0 + c_localN0;
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (size_t l = c_startN0; l < c_endN0; l++)
|
||||
for (size_t m = 0; m < c_N1; m++)
|
||||
for (size_t n = 0; n < c_N2; n++) {
|
||||
size_t idx = n + c_N2 * m + c_N2 * c_N1 * (l - c_startN0);
|
||||
|
||||
// sort particles on equidistant grid
|
||||
double x0 = L0 / double(c_N0) * double(l) + xmin0;
|
||||
double x1 = L1 / double(c_N1) * double(m) + xmin1;
|
||||
double x2 = L2 / double(c_N2) * double(n) + xmin2;
|
||||
|
||||
double r_obs = sqrt(x0 * x0 + x1 * x1 + x2 * x2);
|
||||
|
||||
D1 = auxDplus(r_obs) / D0;
|
||||
f1 = auxGplus(r_obs);
|
||||
Hubble = auxHubble(r_obs) / cosmo_params.h; // km /sec /(Mpc/h)
|
||||
a_lc = auxa(r_obs);
|
||||
v_scaling = -D1 * f1 * Hubble * a_lc * a_lc;
|
||||
facRSD = 1. / a_lc / Hubble;
|
||||
|
||||
lctim[idx][0] = D1;
|
||||
lctim[idx][1] = v_scaling;
|
||||
lctim[idx][2] = facRSD;
|
||||
lctim[idx][3] = -3. / 7. * D1 * D1;
|
||||
lctim[idx][4] = -3. / 7. * D1 * D1 * 2.0 * f1 * Hubble * a_lc * a_lc;
|
||||
}
|
||||
} else {
|
||||
size_t c_endN0 = c_startN0 + c_localN0;
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (size_t l = c_startN0; l < c_endN0; l++)
|
||||
for (size_t m = 0; m < c_N1; m++)
|
||||
for (size_t n = 0; n < c_N2; n++) {
|
||||
size_t idx = n + c_N2 * m + c_N2 * c_N1 * (l - c_startN0);
|
||||
|
||||
lctim[idx][0] = D1;
|
||||
lctim[idx][1] = v_scaling;
|
||||
lctim[idx][2] = facRSD;
|
||||
lctim[idx][3] = -3. / 7. * D1 * D1;
|
||||
lctim[idx][4] = -3. / 7. * D1 * D1 * 2.0 * f1 * Hubble * a_lc * a_lc;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
void Borg2LPTModel<CIC>::getDensityFinal(ModelOutput<3> delta_output) {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
delta_output.setRequestedIO(PREFERRED_REAL);
|
||||
|
||||
try {
|
||||
// introduce redshift distortions
|
||||
if (do_rsd) {
|
||||
|
||||
ctx.print("doing redshift space distortions.");
|
||||
lpt2_redshift_pos(
|
||||
u_pos->get_array(), u_vel->get_array(), u_s_pos->get_array(),
|
||||
lc_timing->get_array());
|
||||
|
||||
particle_redistribute(
|
||||
redshiftInfo, u_s_pos->get_array(),
|
||||
typename CIC::Distribution(out_mgr, L0, L1, L2),
|
||||
make_attribute_helper(Particles::scalar(*lagrangian_id)));
|
||||
|
||||
// Now we can project
|
||||
lpt2_density_obs(
|
||||
u_s_pos->get_array(), delta_output.getRealOutput(),
|
||||
redshiftInfo.localNumParticlesAfter);
|
||||
} else {
|
||||
particle_redistribute(
|
||||
realInfo, u_pos->get_array(),
|
||||
typename CIC::Distribution(out_mgr, L0, L1, L2),
|
||||
make_attribute_helper(
|
||||
Particles::vector(u_vel->get_array()),
|
||||
Particles::scalar(*lagrangian_id)));
|
||||
// Project now
|
||||
lpt2_density_obs(
|
||||
u_pos->get_array(), delta_output.getRealOutput(),
|
||||
realInfo.localNumParticlesAfter);
|
||||
redshiftInfo.localNumParticlesAfter = realInfo.localNumParticlesAfter;
|
||||
}
|
||||
} catch (const ErrorLoadBalance &) {
|
||||
// If load balancing fails, our sample is deeply wrong. Free resources and inform the caller.
|
||||
releaseParticles();
|
||||
forwardModelHold = false;
|
||||
throw;
|
||||
}
|
||||
/*
|
||||
if (!forwardModelHold && !adjointNext) {
|
||||
releaseParticles();
|
||||
}*/
|
||||
forwardModelHold = false;
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
void Borg2LPTModel<CIC>::forwardModel_v2(ModelInput<3> delta_init) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("BORG 2LPT MODEL SIMPLE");
|
||||
size_t partNum = size_t(c_localN0 * c_N1 * c_N2 * partFactor);
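// Note (assumption from the allocations below): partFactor > 1 gives head-room over the nominal
// one-particle-per-cell count, so that particles migrating between MPI slabs during
// redistribution still fit in the locally allocated arrays.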
|
||||
|
||||
delta_init.setRequestedIO(PREFERRED_FOURIER);
|
||||
u_pos.reset();
|
||||
u_vel.reset();
|
||||
lagrangian_id.reset();
|
||||
|
||||
lagrangian_id = std::unique_ptr<IdxArray>(new IdxArray(extents[partNum]));
|
||||
u_pos = std::make_shared<U_PArray>(extents[partNum][3]);
|
||||
u_vel = std::make_shared<U_PArray>(extents[partNum][3]);
|
||||
|
||||
realInfo.allocate(comm, partNum);
|
||||
if (do_rsd) {
|
||||
u_s_pos = std::make_shared<U_PArray>(extents[partNum][3]);
|
||||
redshiftInfo.allocate(comm, partNum);
|
||||
}
|
||||
|
||||
updateCosmo();
|
||||
|
||||
delta_init.needDestroyInput();
|
||||
lpt2_fwd_model(
|
||||
delta_init.getFourier(), u_pos->get_array(), u_vel->get_array(),
|
||||
lc_timing->get_array());
|
||||
}
|
|
@@ -0,0 +1,9 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/2lpt/borg_fwd_2lpt.hpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
383
extra/borg/libLSS/physics/forwards/2lpt/borg_fwd_2lpt_adj.cpp
Normal file
383
extra/borg/libLSS/physics/forwards/2lpt/borg_fwd_2lpt_adj.cpp
Normal file
|
@@ -0,0 +1,383 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/2lpt/borg_fwd_2lpt_adj.cpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
|
||||
#include "libLSS/tools/tuple_helper.hpp"
|
||||
#include "libLSS/tools/fused_assign.hpp"
|
||||
|
||||
template <typename CIC>
|
||||
void Borg2LPTModel<CIC>::lpt2_ic_ag(
|
||||
PhaseArrayRef &pos_ag, PhaseArrayRef &vel_ag, PhaseArrayRef &lctim) {
|
||||
// Initial density is scaled to initial redshift!!!
|
||||
|
||||
ConsoleContext<LOG_DEBUG> ctx("2LPT-IC adjoint");
|
||||
Cosmology cosmo(cosmo_params);
|
||||
double c_volNorm = 1 / volume;
|
||||
|
||||
// allocate auxiliary Fourier array
|
||||
auto &AUX1 = *AUX1_p;
|
||||
auto &aux = *aux_p;
|
||||
auto &AUX0 = *AUX0_p;
|
||||
|
||||
array::fill(AUX1, 0);
|
||||
|
||||
// Do position derivative 1st order
|
||||
//------------------------------------------------------------------------------
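// Descriptive note: the two loops below form the adjoint of the Zel'dovich displacement
// psi^(1) ~ grad(laplacian^-1) delta. The real-space loop combines the incoming position and
// velocity gradients with the growth/velocity factors stored in lctim; the Fourier-space loop
// then applies the (k_axis / k^2)-type gradient/inverse-Laplacian kernel and accumulates the
// result into AUX1.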
|
||||
for (unsigned axis = 0; axis < 3; axis++) {
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (size_t l = c_startN0; l < c_startN0 + c_localN0; l++)
|
||||
for (size_t m = 0; m < c_N1; m++)
|
||||
for (size_t n = 0; n < c_N2; n++) {
|
||||
size_t idx = n + c_N2 * m + c_N2 * c_N1 * (l - c_startN0);
|
||||
double DD1 = lctim[idx][0];
|
||||
double v1_scaling = lctim[idx][1];
|
||||
aux[l][m][n] =
|
||||
-DD1 * pos_ag[idx][axis] + v1_scaling * vel_ag[idx][axis];
|
||||
}
|
||||
|
||||
// FFT to F-space
|
||||
mgr->execute_r2c(c_analysis_plan, aux.data(), AUX0.data());
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (size_t i = c_startN0; i < c_startN0 + c_localN0; i++)
|
||||
for (size_t j = 0; j < c_N1; j++)
|
||||
for (size_t k = 0; k < c_N2_HC; k++) {
|
||||
double kk[3] = {kmode(i, c_N0, L0), kmode(j, c_N1, L1),
|
||||
kmode(k, c_N2, L2)};
|
||||
|
||||
double ksquared = kk[0] * kk[0] + kk[1] * kk[1] + kk[2] * kk[2];
|
||||
double fac = -kk[axis] / ksquared * c_volNorm;
|
||||
|
||||
std::complex<double> const &in_delta = AUX0[i][j][k];
|
||||
|
||||
AUX1[i][j][k] += std::complex<double>(
|
||||
fac * in_delta.imag(), -fac * in_delta.real());
|
||||
}
|
||||
|
||||
// Do position derivative 2nd order
|
||||
//--------------------------------------------------------------------------
|
||||
|
||||
lpt2_add_to_derivative(
|
||||
AUX1, pos_ag, vel_ag, lctim, axis,
|
||||
std::make_tuple(
|
||||
std::make_tuple(
|
||||
0, 0, 1.0,
|
||||
std::make_tuple(std::cref(*r_psi_11), std::cref(*r_psi_22))),
|
||||
std::make_tuple(
|
||||
1, 1, 1.0,
|
||||
std::make_tuple(std::cref(*r_psi_22), std::cref(*r_psi_00))),
|
||||
std::make_tuple(
|
||||
2, 2, 1.0,
|
||||
std::make_tuple(std::cref(*r_psi_00), std::cref(*r_psi_11))),
|
||||
std::make_tuple(0, 1, -2.0, std::make_tuple(std::cref(*r_psi_01))),
|
||||
std::make_tuple(0, 2, -2.0, std::make_tuple(std::cref(*r_psi_02))),
|
||||
std::make_tuple(
|
||||
1, 2, -2.0, std::make_tuple(std::cref(*r_psi_12)))));
|
||||
}
|
||||
|
||||
// fix hermiticity...unclear how to do that
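// Note: the entries zeroed below are the self-conjugate modes of the r2c half-complex layout
// (wavenumber components equal to 0 or N/2 on the planes owned by this rank).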
|
||||
if (c_startN0 == 0 && c_localN0 > 0) {
|
||||
AUX1[0][0][0] = 0;
|
||||
AUX1[0][0][c_N2_HC - 1] = 0;
|
||||
AUX1[0][c_N1 / 2][0] = 0;
|
||||
AUX1[0][c_N1 / 2][c_N2_HC - 1] = 0;
|
||||
}
|
||||
|
||||
if (c_startN0 <= c_N0 / 2 && c_startN0 + c_localN0 > c_N0 / 2) {
|
||||
AUX1[c_N0 / 2][0][0] = 0;
|
||||
AUX1[c_N0 / 2][0][c_N2_HC - 1] = 0;
|
||||
AUX1[c_N0 / 2][c_N1 / 2][0] = 0;
|
||||
AUX1[c_N0 / 2][c_N1 / 2][c_N2_HC - 1] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
namespace {
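// Small helper: tuple_sum(i, j, k, t) returns the sum of A[i][j][k] over every array A held in
// the tuple t; the std::tuple<> overload terminates the compile-time recursion.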
|
||||
inline double tuple_sum(ssize_t, ssize_t, ssize_t, std::tuple<>) { return 0; }
|
||||
|
||||
template <typename H, typename... A>
|
||||
inline double
|
||||
tuple_sum(ssize_t i, ssize_t j, ssize_t k, std::tuple<H, A...> const &t) {
|
||||
return std::get<0>(t)[i][j][k] + tuple_sum(i, j, k, last_of_tuple<1>(t));
|
||||
}
|
||||
} // namespace
|
||||
|
||||
template <typename CIC>
|
||||
template <typename... ArrayTuple>
|
||||
void Borg2LPTModel<CIC>::lpt2_add_to_derivative(
|
||||
U_F_Array::array_type &result, const PhaseArrayRef &pos_ag,
|
||||
const PhaseArrayRef &vel_ag, const PhaseArrayRef &lctim, const int axis0,
|
||||
std::tuple<ArrayTuple...> const &tuple_psi) {
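// Descriptive note: each entry of tuple_psi is (axis1, axis2, prefactor, {psi arrays}); together
// with the call site in lpt2_ic_ag this encodes the 2LPT source term
//   phi^(2) ~ sum_{a<b} (psi_aa psi_bb - psi_ab^2),
// whose contribution to the gradient with respect to each stored psi field is accumulated below
// (a reading of the code, not an independent derivation).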
|
||||
auto r_pos_psi = c_tmp_real_field;
|
||||
auto c_pos_psi = c_tmp_complex_field;
|
||||
|
||||
size_t const endN0 = c_startN0 + c_localN0;
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (size_t l = c_startN0; l < endN0; l++)
|
||||
for (size_t m = 0; m < c_N1; m++)
|
||||
for (size_t n = 0; n < c_N2; n++) {
|
||||
size_t idx = n + c_N2 * m + c_N2 * c_N1 * (l - c_startN0);
|
||||
double DD2 = lctim[idx][3];
|
||||
double DD2v = lctim[idx][4];
|
||||
(*r_pos_psi)[l][m][n] =
|
||||
(DD2 * pos_ag[idx][axis0] + DD2v * vel_ag[idx][axis0]);
|
||||
}
|
||||
|
||||
// FFT to F-space
|
||||
mgr->execute_r2c(c_analysis_plan, r_pos_psi->data(), c_pos_psi->data());
|
||||
|
||||
double const inv_N = 1.0 / (c_N0 * c_N1 * c_N2);
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (size_t i = c_startN0; i < endN0; i++)
|
||||
for (size_t j = 0; j < c_N1; j++)
|
||||
for (size_t k = 0; k < c_N2_HC; k++) {
|
||||
double const kk[3] = {kmode(i, c_N0, L0), kmode(j, c_N1, L1),
|
||||
kmode(k, c_N2, L2)};
|
||||
|
||||
double ksquared = kk[0] * kk[0] + kk[1] * kk[1] + kk[2] * kk[2];
|
||||
double fac = -kk[axis0] / ksquared * inv_N;
|
||||
|
||||
std::complex<double> &pos_psi_ijk = (*c_pos_psi)[i][j][k];
|
||||
pos_psi_ijk = std::complex<double>(
|
||||
fac * pos_psi_ijk.imag(), -fac * pos_psi_ijk.real());
|
||||
}
|
||||
|
||||
if (c_startN0 == 0 && c_localN0 > 0) {
|
||||
(*c_pos_psi)[0][0][0] = 0;
|
||||
}
|
||||
|
||||
// FFT to realspace
|
||||
mgr->execute_c2r(c_synthesis_plan, c_pos_psi->data(), r_pos_psi->data());
|
||||
|
||||
auto &AUX2 = *aux_p;
|
||||
LibLSS::copy_array(AUX2, *r_pos_psi);
|
||||
|
||||
tuple_for_each(tuple_psi, [&](auto const &t) {
|
||||
unsigned int const axis1 = std::get<0>(t);
|
||||
unsigned int const axis2 = std::get<1>(t);
|
||||
double const prefactor = std::get<2>(t);
|
||||
auto const &psi_list = std::get<3>(t);
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (size_t l = c_startN0; l < endN0; l++)
|
||||
for (size_t m = 0; m < c_N1; m++)
|
||||
for (size_t n = 0; n < c_N2; n++) {
|
||||
double &pos_ijk = (*r_pos_psi)[l][m][n];
|
||||
pos_ijk = prefactor * AUX2[l][m][n] * tuple_sum(l, m, n, psi_list);
|
||||
}
|
||||
|
||||
// FFT to F-space
|
||||
mgr->execute_r2c(c_analysis_plan, r_pos_psi->data(), c_pos_psi->data());
|
||||
|
||||
double const inv_volume = 1 / volume;
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (int i = c_startN0; i < endN0; i++)
|
||||
for (int j = 0; j < c_N1; j++)
|
||||
for (int k = 0; k < c_N2_HC; k++) {
|
||||
double kk[3] = {kmode(i, c_N0, L0), kmode(j, c_N1, L1),
|
||||
kmode(k, c_N2, L2)};
|
||||
|
||||
double ksquared = kk[0] * kk[0] + kk[1] * kk[1] + kk[2] * kk[2];
|
||||
double fac = kk[axis1] * kk[axis2] / ksquared * inv_volume;
|
||||
|
||||
std::complex<double> &pos_psi_ijk = (*c_pos_psi)[i][j][k];
|
||||
|
||||
result[i][j][k] += fac * pos_psi_ijk;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
void Borg2LPTModel<CIC>::lpt2_redshift_pos_ag(
|
||||
PhaseArrayRef &pos, PhaseArrayRef &vel, PhaseArrayRef &pos_ag,
|
||||
PhaseArrayRef &vel_ag, PhaseArrayRef &lctim) {
|
||||
Cosmology cosmo(cosmo_params);
|
||||
|
||||
// this routine computes the adjoint gradient of the redshift-space mapping of particle positions
|
||||
// the code uses particle momenta p=a^2 dx/dt where x is the co-moving position
|
||||
// peculiar velocities are then given by v_pec = p/a
|
||||
|
||||
// NOTE: Check coefficients
|
||||
boost::array<double, 3> observer = {vobs[0], vobs[1], vobs[2]};
|
||||
|
||||
#pragma omp parallel for
|
||||
for (size_t idx = 0; idx < redshiftInfo.localNumParticlesBefore; idx++) {
|
||||
double x0 = pos[idx][0] + xmin0;
|
||||
double x1 = pos[idx][1] + xmin1;
|
||||
double x2 = pos[idx][2] + xmin2;
|
||||
|
||||
double v0 = vel[idx][0] + observer[0];
|
||||
double v1 = vel[idx][1] + observer[1];
|
||||
double v2 = vel[idx][2] + observer[2];
|
||||
|
||||
double s_pos_ag0 = pos_ag[idx][0];
|
||||
double s_pos_ag1 = pos_ag[idx][1];
|
||||
double s_pos_ag2 = pos_ag[idx][2];
|
||||
|
||||
double r2_los = x0 * x0 + x1 * x1 + x2 * x2;
|
||||
double v_los = x0 * v0 + x1 * v1 + x2 * v2;
|
||||
double facRSD = lctim[idx][2]; // this factor is 1/H/a for velocities in [km/sec]; an additional factor arises from momentum conversion
|
||||
|
||||
double slos = s_pos_ag0 * x0 + s_pos_ag1 * x1 + s_pos_ag2 * x2;
|
||||
|
||||
double A = facRSD * slos / r2_los;
|
||||
double B = -2 * facRSD * v_los * slos / square(r2_los);
|
||||
double C = facRSD * v_los / r2_los;
|
||||
|
||||
pos_ag[idx][0] = s_pos_ag0 * (1 + C) + B * x0 + A * v0;
|
||||
pos_ag[idx][1] = s_pos_ag1 * (1 + C) + B * x1 + A * v1;
|
||||
pos_ag[idx][2] = s_pos_ag2 * (1 + C) + B * x2 + A * v2;
|
||||
|
||||
vel_ag[idx][0] = A * x0;
|
||||
vel_ag[idx][1] = A * x1;
|
||||
vel_ag[idx][2] = A * x2;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
template <typename PositionArray>
|
||||
void Borg2LPTModel<CIC>::lpt2_density_obs_ag(
|
||||
PositionArray &pos, PhaseArrayRef &pos_ag, PhaseArrayRef &vel_ag,
|
||||
ArrayRef &B, size_t numParts) {
|
||||
double const nmean = double(c_N0*c_N1*c_N2)/(box_output.N0*box_output.N1*box_output.N2);
|
||||
|
||||
typedef UninitializedArray<boost::multi_array<double, 3>> U_Array;
|
||||
|
||||
if (ALWAYS_MPI(comm)) {
|
||||
// Allocate a temporary density field with extra planes for
|
||||
// the projection leakage
|
||||
U_Array tmp_delta(out_mgr->extents_real(CIC::MPI_PLANE_LEAKAGE));
|
||||
|
||||
density_exchange_planes_ag(
|
||||
comm, tmp_delta.get_array(), B, out_mgr, CIC::MPI_PLANE_LEAKAGE);
|
||||
array::fill(pos_ag, 0);
|
||||
CIC::adjoint(
|
||||
pos, tmp_delta.get_array(), pos_ag, L0, L1, L2, out_mgr->N0, out_mgr->N1, out_mgr->N2,
|
||||
typename CIC::Periodic_MPI(out_mgr->N0, out_mgr->N1, out_mgr->N2, comm), nmean, numParts);
|
||||
} else {
|
||||
// This is simple, no copy, no adjustment
|
||||
array::fill(pos_ag, 0);
|
||||
CIC::adjoint(
|
||||
pos, B, pos_ag, L0, L1, L2, out_mgr->N0, out_mgr->N1, out_mgr->N2, CIC_Tools::Periodic(out_mgr->N0, out_mgr->N1, out_mgr->N2),
|
||||
nmean, numParts);
|
||||
}
|
||||
|
||||
array::fill(vel_ag, 0);
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
void Borg2LPTModel<CIC>::lpt2_fwd_model_ag(
|
||||
PhaseArrayRef &pos_ag, PhaseArrayRef &vel_ag, PhaseArrayRef &lctime,
|
||||
CArrayRef &DPSI) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("BORG adjoint model (particles)");
|
||||
|
||||
///NOTE: ICs are generated at ai
|
||||
|
||||
//N.) undo ICs
|
||||
lpt2_ic_ag(pos_ag, vel_ag, lctime);
|
||||
|
||||
// RESULT is in AUX1
|
||||
|
||||
if (c_deltao != 0) {
|
||||
array::fill(DPSI, 0);
|
||||
lo_mgr->degrade_complex(*mgr, *AUX1_p, DPSI);
|
||||
} else {
|
||||
fwrap(DPSI) = *AUX1_p;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
void Borg2LPTModel<CIC>::adjointModelParticles(
|
||||
PhaseArrayRef &grad_pos, PhaseArrayRef &grad_vel) {
|
||||
//lpt2_fwd_model_ag(grad_pos, grad_vel, *lc_timing, gradient_delta);
|
||||
releaseParticles();
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
void Borg2LPTModel<CIC>::adjointModel_v2(ModelInputAdjoint<3> gradient_delta) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("BORG adjoint model");
|
||||
|
||||
// This function computes the adjoint gradient in place: the adjoint gradient of the final
// density must be provided, and the adjoint gradient of the initial field is returned in exchange.
|
||||
|
||||
// introduce adjoint quantities
|
||||
// This must be allocated in two steps to avoid the implicit
|
||||
// zero initialization.
|
||||
size_t refPartNum = size_t(c_localN0 * c_N1 * c_N2 * partFactor);
|
||||
|
||||
// gradient_delta may be empty (if, for example, another mechanism feeds the adjoint gradients
// directly to the particles, e.g. a velocity field).
// If empty, we just ignore it and work with the current state of pos_ag, vel_ag.
|
||||
|
||||
u_pos_ag.reset();
|
||||
u_vel_ag.reset();
|
||||
|
||||
u_pos_ag = std::make_shared<U_PArray>(extents[refPartNum][3]);
|
||||
u_vel_ag = std::make_shared<U_PArray>(extents[refPartNum][3]);
|
||||
auto &pos_ag = u_pos_ag->get_array();
|
||||
auto &vel_ag = u_vel_ag->get_array();
|
||||
auto &pos = u_pos->get_array();
|
||||
auto &vel = u_vel->get_array();
|
||||
|
||||
// re-evaluate redshift distortions from forward run
|
||||
if (do_rsd) {
|
||||
ctx.print("doing redshift space distortions.");
|
||||
PhaseArrayRef &s_pos = u_s_pos->get_array();
|
||||
|
||||
if (gradient_delta) {
|
||||
gradient_delta.setRequestedIO(PREFERRED_REAL);
|
||||
gradient_delta.needDestroyInput();
|
||||
|
||||
///work backwards from final to initial conditions
|
||||
//1.) undo CIC
|
||||
lpt2_density_obs_ag(
|
||||
s_pos, pos_ag, vel_ag, gradient_delta.getReal(),
|
||||
redshiftInfo.localNumParticlesAfter);
|
||||
}
|
||||
particle_undistribute(redshiftInfo, pos_ag);
|
||||
|
||||
//2.) undo redshift distortions
|
||||
lpt2_redshift_pos_ag(pos, vel, pos_ag, vel_ag, *lc_timing);
|
||||
} else {
|
||||
// work backwards from final to initial conditions
|
||||
// 1.) undo CIC
|
||||
if (gradient_delta) {
|
||||
gradient_delta.setRequestedIO(PREFERRED_REAL);
|
||||
lpt2_density_obs_ag(
|
||||
pos, pos_ag, vel_ag, gradient_delta.getReal(),
|
||||
realInfo.localNumParticlesAfter);
|
||||
}
|
||||
particle_undistribute(
|
||||
realInfo, pos_ag, make_attribute_helper(Particles::vector(vel_ag)));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
void Borg2LPTModel<CIC>::getAdjointModelOutput(
|
||||
ModelOutputAdjoint<3> gradient_delta) {
|
||||
auto &pos_ag = u_pos_ag->get_array();
|
||||
auto &vel_ag = u_vel_ag->get_array();
|
||||
|
||||
gradient_delta.setRequestedIO(PREFERRED_FOURIER);
|
||||
|
||||
lpt2_fwd_model_ag(
|
||||
pos_ag, vel_ag, *lc_timing, gradient_delta.getFourierOutput());
|
||||
|
||||
clearAdjointGradient();
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
void Borg2LPTModel<CIC>::clearAdjointGradient() {
|
||||
|
||||
u_pos_ag.reset();
|
||||
u_vel_ag.reset();
|
||||
}
|
248
extra/borg/libLSS/physics/forwards/adapt_generic_bias.cpp
Normal file
248
extra/borg/libLSS/physics/forwards/adapt_generic_bias.cpp
Normal file
|
@@ -0,0 +1,248 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/adapt_generic_bias.cpp
|
||||
Copyright (C) 2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#include <string>
|
||||
#include <boost/core/demangle.hpp>
|
||||
#include "libLSS/tools/console.hpp"
|
||||
#include "libLSS/physics/model_io.hpp"
|
||||
#include "libLSS/physics/forwards/softplus.hpp"
|
||||
#include "libLSS/tools/fusewrapper.hpp"
|
||||
#include "libLSS/physics/forwards/registry.hpp"
|
||||
#include "libLSS/tools/ptree_proxy.hpp"
|
||||
#include "libLSS/physics/forwards/adapt_generic_bias.hpp"
|
||||
#include "libLSS/physics/forwards/transfer.hpp"
|
||||
|
||||
using namespace LibLSS;
|
||||
|
||||
template <typename T>
|
||||
void ForwardGenericBias<T>::commonSetup() {
|
||||
currentBiasParams.resize(boost::extents[bias_t::numParams]);
|
||||
|
||||
dummyModel = std::make_shared<ForwardTransfer>(comm, box_input);
|
||||
|
||||
if (bias)
|
||||
bias->setup_default(currentBiasParams);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
ForwardGenericBias<T>::~ForwardGenericBias() {
|
||||
if (!bias_cleaned)
|
||||
bias->cleanup();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void ForwardGenericBias<T>::forwardModel_v2(ModelInput<3> delta_init) {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
|
||||
// Setup the IO basis that is required for this forward.
|
||||
delta_init.setRequestedIO(PREFERRED_REAL);
|
||||
|
||||
hold_input = std::move(delta_init);
|
||||
|
||||
if (!bias)
|
||||
rebuildBias();
|
||||
|
||||
if (!bias_cleaned)
|
||||
bias->cleanup();
|
||||
|
||||
std::ostringstream oss;
|
||||
oss << "bias Params = ";
|
||||
for (int i = 0; i < bias_t::numParams; i++)
|
||||
oss << currentBiasParams[i] << " ";
|
||||
ctx.print(oss.str());
|
||||
bias->prepare(
|
||||
*dummyModel, hold_input.getRealConst(),
|
||||
0., /* nmean is ignored generally now, it is only there for backward compatibility */
|
||||
currentBiasParams, true); //densityUpdated);
|
||||
bias_cleaned = false;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void ForwardGenericBias<T>::getDensityFinal(ModelOutput<3> delta_output) {
|
||||
delta_output.setRequestedIO(PREFERRED_REAL);
|
||||
invalidDensity = false;
|
||||
fwrap(delta_output.getRealOutput()) =
|
||||
std::get<0>(bias->compute_density(hold_input.getRealConst()));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void ForwardGenericBias<T>::adjointModel_v2(
|
||||
ModelInputAdjoint<3> in_gradient_delta) {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
// Build strict range views (we do not want to see the FFTW padding);
// ensure that the view object lives till the end of this function.
|
||||
|
||||
in_gradient_delta.setRequestedIO(PREFERRED_REAL);
|
||||
hold_ag_input = std::move(in_gradient_delta);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void ForwardGenericBias<T>::getAdjointModelOutput(
|
||||
ModelOutputAdjoint<3> out_gradient_delta) {
|
||||
out_gradient_delta.setRequestedIO(PREFERRED_REAL);
|
||||
|
||||
fwrap(out_gradient_delta.getRealOutput()) =
|
||||
std::get<0>(bias->apply_adjoint_gradient(
|
||||
hold_input.getRealConst(),
|
||||
std::make_tuple(std::cref(hold_ag_input.getRealConst()))));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
boost::any ForwardGenericBias<T>::getModelParam(
|
||||
std::string const &n, std::string const ¶m) {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
|
||||
ctx.format("modelName = %s, n = %s", modelName, n);
|
||||
if (n == modelName && param == "biasParameters") {
|
||||
if (!bias)
|
||||
rebuildBias();
|
||||
return currentBiasParams;
|
||||
}
|
||||
return boost::any();
|
||||
}
|
||||
template <typename T>
|
||||
void ForwardGenericBias<T>::rebuildBias(std::shared_ptr<LikelihoodInfo> info)
|
||||
{
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
if (!bias_cleaned && bias)
|
||||
bias->cleanup();
|
||||
if (!info)
|
||||
bias = std::make_shared<bias_t>();
|
||||
else
|
||||
bias = std::make_shared<bias_t>(*info);
|
||||
bias_cleaned = true;
|
||||
|
||||
if (!biasSet) {
|
||||
bias->setup_default(currentBiasParams);
|
||||
biasSet = true;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void ForwardGenericBias<T>::setModelParams(
|
||||
ModelDictionnary const &model_params) {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
|
||||
{
|
||||
auto location = model_params.find("info");
|
||||
if (location != model_params.end()) {
|
||||
try {
|
||||
auto info =
|
||||
boost::any_cast<std::shared_ptr<LikelihoodInfo>>(location->second);
|
||||
|
||||
rebuildBias(info);
|
||||
} catch (boost::bad_any_cast const &) {
|
||||
error_helper<ErrorBadState>(
|
||||
"Bad type in info parameters, was: " +
|
||||
std::string(boost::core::demangle(location->second.type().name())));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: find a way to disambiguate in case of several bias models...
|
||||
{
|
||||
auto location = model_params.find("biasParameters");
|
||||
if (location != model_params.end()) {
|
||||
// Retrieve the array.
|
||||
try {
|
||||
auto params =
|
||||
boost::any_cast<LibLSS::multi_array<double, 1>>(location->second);
|
||||
|
||||
if (params.shape()[0] != bias_t::numParams) {
|
||||
throw std::invalid_argument("Invalid number of bias parameters");
|
||||
}
|
||||
|
||||
if (!bias->check_bias_constraints(params)) {
|
||||
ctx.print("Failing constraints: " + LibLSS::to_string(params));
|
||||
throw outOfBoundParam("Fail bias constraints");
|
||||
}
|
||||
|
||||
int diff;
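// Find the first index at which the new parameters differ from the current ones;
// if no component changed (diff == numParams) the cached bias state is left untouched.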
|
||||
for (diff = 0; diff < bias_t::numParams; diff++)
|
||||
if (currentBiasParams[diff] != params[diff])
|
||||
break;
|
||||
ctx.format("Bias changed (diff=%d, numParams=%d)", diff, bias_t::numParams);
|
||||
if (diff != bias_t::numParams) {
|
||||
currentBiasParams = params;
|
||||
biasSet = true;
|
||||
invalidDensity = true;
|
||||
}
|
||||
} catch (boost::bad_any_cast const &e) {
|
||||
error_helper<ErrorBadState>(
|
||||
"Bad type in bias parameters, was: " +
|
||||
std::string(boost::core::demangle(location->second.type().name())));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Remove the bias from the dictionary before going upward.
|
||||
BORGForwardModel::setModelParams(model_params);
|
||||
return;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static std::shared_ptr<BORGForwardModel> create_bias(
|
||||
MPI_Communication *comm, BoxModel const &box, PropertyProxy const ¶ms) {
|
||||
auto model = std::make_shared<ForwardGenericBias<T>>(comm, box);
|
||||
return model;
|
||||
}
|
||||
|
||||
AUTO_REGISTRATOR_IMPL(ForwardGenericBias);
|
||||
|
||||
#include "libLSS/physics/bias/noop.hpp"
|
||||
#include "libLSS/physics/bias/power_law.hpp"
|
||||
#include "libLSS/physics/bias/linear_bias.hpp"
|
||||
#include "libLSS/physics/bias/broken_power_law.hpp"
|
||||
#include "libLSS/physics/bias/double_power_law.hpp"
|
||||
#include "libLSS/physics/bias/many_power.hpp"
|
||||
#include "libLSS/physics/bias/eft_bias.hpp"
|
||||
|
||||
namespace {
|
||||
void bias_registrator() {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
|
||||
ForwardRegistry::instance().registerFactory(
|
||||
"bias::Noop", create_bias<bias::Noop>);
|
||||
ForwardRegistry::instance().registerFactory(
|
||||
"bias::Linear", create_bias<bias::LinearBias>);
|
||||
ForwardRegistry::instance().registerFactory(
|
||||
"bias::PowerLaw", create_bias<bias::PowerLaw>);
|
||||
ForwardRegistry::instance().registerFactory(
|
||||
"bias::BrokenPowerLaw", create_bias<bias::BrokenPowerLaw>);
|
||||
ForwardRegistry::instance().registerFactory(
|
||||
"bias::DoubleBrokenPowerLaw", create_bias<bias::DoubleBrokenPowerLaw>);
|
||||
ForwardRegistry::instance().registerFactory(
|
||||
"bias::ManyPower_1^1",
|
||||
create_bias<bias::ManyPower<bias::ManyPowerLevels<double, 1>>>);
|
||||
ForwardRegistry::instance().registerFactory(
|
||||
"bias::ManyPower_1^2",
|
||||
create_bias<bias::ManyPower<bias::ManyPowerLevels<double, 1, 1>>>);
|
||||
ForwardRegistry::instance().registerFactory(
|
||||
"bias::ManyPower_1^4",
|
||||
create_bias<
|
||||
bias::ManyPower<bias::ManyPowerLevels<double, 1, 1, 1, 1>>>);
|
||||
ForwardRegistry::instance().registerFactory(
|
||||
"bias::ManyPower_2^2",
|
||||
create_bias<bias::ManyPower<bias::ManyPowerLevels<double, 2, 2>>>);
|
||||
ForwardRegistry::instance().registerFactory(
|
||||
"bias::EFT", create_bias<bias::EFTBiasDefault>);
|
||||
ForwardRegistry::instance().registerFactory(
|
||||
"bias::EFT_Thresh", create_bias<bias::EFTBiasThresh>);
|
||||
}
|
||||
|
||||
RegisterStaticInit _initter(
|
||||
&bias_registrator,
|
||||
StaticInit::MIN_PRIORITY -
|
||||
1); // Bad priority patch. Basically we ask it to be run before the registry listing.
|
||||
} // namespace
|
||||
|
||||
// ARES TAG: num_authors = 1
|
||||
// ARES TAG: name(0) = Guilhem Lavaux
|
||||
// ARES TAG: year(0) = 2020
|
||||
// ARES TAG: email(0) = guilhem.lavaux@iap.fr
|
102
extra/borg/libLSS/physics/forwards/adapt_generic_bias.hpp
Normal file
102
extra/borg/libLSS/physics/forwards/adapt_generic_bias.hpp
Normal file
|
@@ -0,0 +1,102 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/adapt_generic_bias.hpp
|
||||
Copyright (C) 2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#pragma once
|
||||
#ifndef __LIBLSS_HADES_FORWARD_BIAS_GENERIC_HPP
|
||||
# define __LIBLSS_HADES_FORWARD_BIAS_GENERIC_HPP
|
||||
|
||||
# include "libLSS/physics/forward_model.hpp"
|
||||
# include "libLSS/tools/fusewrapper.hpp"
|
||||
# include "libLSS/tools/errors.hpp"
|
||||
# include "libLSS/physics/forwards/registry.hpp"
|
||||
# include "libLSS/physics/likelihoods/base.hpp"
|
||||
# include "libLSS/tools/static_init.hpp"
|
||||
# include "libLSS/tools/static_auto.hpp"
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
/**
|
||||
* This class adapts a generic bias model to the BORG forward model interface.
|
||||
*/
|
||||
template <typename T>
|
||||
class ForwardGenericBias : public BORGForwardModel {
|
||||
public:
|
||||
using BORGForwardModel::CArrayRef;
|
||||
typedef T bias_t;
|
||||
|
||||
protected:
|
||||
ModelInput<3> hold_input;
|
||||
ModelInputAdjoint<3> hold_ag_input;
|
||||
|
||||
bool bias_cleaned;
|
||||
std::shared_ptr<bias_t> bias;
|
||||
bool invalidDensity;
|
||||
bool biasSet;
|
||||
|
||||
LibLSS::multi_array<double, 1> currentBiasParams;
|
||||
|
||||
std::shared_ptr<BORGForwardModel> dummyModel;
|
||||
|
||||
void commonSetup();
|
||||
|
||||
void rebuildBias(std::shared_ptr<LikelihoodInfo> info = std::shared_ptr<LikelihoodInfo>());
|
||||
|
||||
public:
|
||||
/**
|
||||
* Constructor.
|
||||
*/
|
||||
explicit ForwardGenericBias(
|
||||
MPI_Communication *comm, const BoxModel &box, const BoxModel &box2)
|
||||
: BORGForwardModel(comm, box, box2), bias_cleaned(true),
|
||||
invalidDensity(true), biasSet(false) {
|
||||
commonSetup();
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
*/
|
||||
explicit ForwardGenericBias(MPI_Communication *comm, const BoxModel &box)
|
||||
: BORGForwardModel(comm, box), bias_cleaned(true),
|
||||
invalidDensity(true), biasSet(false) {
|
||||
commonSetup();
|
||||
}
|
||||
|
||||
virtual ~ForwardGenericBias();
|
||||
|
||||
// Difficult to guess directly at the moment. However, the classical default for bias
|
||||
// is to handle data in REAL representation.
|
||||
|
||||
PreferredIO getPreferredInput() const override { return PREFERRED_REAL; }
|
||||
PreferredIO getPreferredOutput() const override { return PREFERRED_REAL; }
|
||||
|
||||
void forwardModel_v2(ModelInput<3> delta_init) override;
|
||||
void getDensityFinal(ModelOutput<3> delta_output) override;
|
||||
|
||||
void adjointModel_v2(ModelInputAdjoint<3> in_gradient_delta) override;
|
||||
void
|
||||
getAdjointModelOutput(ModelOutputAdjoint<3> out_gradient_delta) override;
|
||||
|
||||
void releaseParticles() override {}
|
||||
|
||||
void setModelParams(ModelDictionnary const ¶ms) override;
|
||||
boost::any getModelParam(std::string const& n, std::string const& param) override;
|
||||
|
||||
bool densityInvalidated() const override { return invalidDensity; }
|
||||
}; // class ForwardGenericBias
|
||||
|
||||
} // namespace LibLSS
|
||||
|
||||
AUTO_REGISTRATOR_DECL(ForwardGenericBias);
|
||||
|
||||
#endif
|
||||
|
||||
// ARES TAG: num_authors = 1
|
||||
// ARES TAG: name(0) = Guilhem Lavaux
|
||||
// ARES TAG: year(0) = 2020
|
||||
// ARES TAG: email(0) = guilhem.lavaux@iap.fr
|
605
extra/borg/libLSS/physics/forwards/altair_ap.cpp
Normal file
605
extra/borg/libLSS/physics/forwards/altair_ap.cpp
Normal file
|
@@ -0,0 +1,605 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/altair_ap.cpp
|
||||
Copyright (C) 2018-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2018-2020 Doogesh Kodi Ramanah <ramanah@iap.fr>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#include "libLSS/physics/forward_model.hpp"
|
||||
#include "libLSS/physics/forwards/altair_ap.hpp"
|
||||
#include "libLSS/tools/fusewrapper.hpp"
|
||||
#include "libLSS/tools/fused_cond.hpp"
|
||||
#include "libLSS/tools/errors.hpp"
|
||||
#include <gsl/gsl_const_mksa.h>
|
||||
#include <Eigen/Dense>
|
||||
#include <cmath>
|
||||
#include "libLSS/tools/mpi/ghost_planes.hpp"
|
||||
#include "libLSS/tools/auto_interpolator.hpp"
|
||||
|
||||
using namespace LibLSS;
|
||||
using namespace LibLSS::ALTAIR;
|
||||
|
||||
AltairAPForward::AltairAPForward(
|
||||
MPI_Communication *comm, const BoxModel &box_c, const BoxModel &box_z,
|
||||
bool is_contrast_)
|
||||
: BORGForwardModel(comm, box_c, box_z),
|
||||
M_matrix(n_order * n_order * n_order, n_order * n_order * n_order),
|
||||
M_inverse(n_order * n_order * n_order, n_order * n_order * n_order),
|
||||
grid_transform(out_mgr->extents_real_strict()[4]),
|
||||
is_contrast(is_contrast_) {
|
||||
ConsoleContext<LOG_VERBOSE> ctx("Altair AP initialization");
|
||||
|
||||
ctx.format(
|
||||
"Redshift box [%g,%g] [%g,%g] [%g,%g]", box_z.xmin0,
|
||||
(box_z.xmin0 + box_z.L0), box_z.xmin1, (box_z.xmin1 + box_z.L1),
|
||||
box_z.xmin2, (box_z.xmin2 + box_z.L2));
|
||||
ctx.format(
|
||||
"Comoving box [%g,%g] [%g,%g] [%g,%g]", xmin0, (xmin0 + L0), xmin1,
|
||||
(xmin1 + L1), xmin2, (xmin2 + L2));
|
||||
|
||||
setupDefault();
|
||||
|
||||
// ASSERTION: box_c.N* == box.N* // For the time being, redshift box corresponds to comoving one
|
||||
L_z[0] = box_z.L0;
|
||||
L_z[1] = box_z.L1;
|
||||
L_z[2] = box_z.L2;
|
||||
corner_z[0] = box_z.xmin0;
|
||||
corner_z[1] = box_z.xmin1;
|
||||
corner_z[2] = box_z.xmin2;
|
||||
|
||||
delta_z[0] = L_z[0] / out_mgr->N0;
|
||||
delta_z[1] = L_z[1] / out_mgr->N1;
|
||||
delta_z[2] = L_z[2] / out_mgr->N2;
|
||||
|
||||
delta_c[0] = box_c.L0 / lo_mgr->N0;
|
||||
delta_c[1] = box_c.L1 / lo_mgr->N1;
|
||||
delta_c[2] = box_c.L2 / lo_mgr->N2;
|
||||
|
||||
prepareMatrix(); // Compute M_matrix only once
|
||||
COSMO_INIT = true; //FIXME
|
||||
}
|
||||
|
||||
void AltairAPForward::prepareMatrix() {
|
||||
// Declare variables required for tricubic interpolation
|
||||
// For case of trilinear interpolation
|
||||
switch (n_order) {
|
||||
case 2:
|
||||
x_inter[0] = 0;
|
||||
x_inter[1] = 1;
|
||||
break;
|
||||
case 4:
|
||||
x_inter[0] = -1;
|
||||
x_inter[1] = 0;
|
||||
x_inter[2] = 1;
|
||||
x_inter[3] = 2;
|
||||
break;
|
||||
case 6:
|
||||
x_inter[0] = -2;
|
||||
x_inter[1] = -1;
|
||||
x_inter[2] = 0;
|
||||
x_inter[3] = 1;
|
||||
x_inter[4] = 2;
|
||||
x_inter[5] = 3;
|
||||
break;
|
||||
case 8:
|
||||
x_inter[0] = -3;
|
||||
x_inter[1] = -2;
|
||||
x_inter[2] = -1;
|
||||
x_inter[3] = 0;
|
||||
x_inter[4] = 1;
|
||||
x_inter[5] = 2;
|
||||
x_inter[6] = 3;
|
||||
x_inter[7] = 4;
|
||||
break;
|
||||
default:
|
||||
error_helper<ErrorNotImplemented>("Interpolation order not implemented");
|
||||
break;
|
||||
}
|
||||
// Construct the M_matrix [(64x64) for tricubic interpolation] that encodes the interpolation coefficients
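// Concretely, row (alpha,beta,gamma) and column (i,j,k) of M hold
//   M[(alpha,beta,gamma)][(i,j,k)] = x_inter[alpha]^i * x_inter[beta]^j * x_inter[gamma]^k,
// so that M * interp_coeff reproduces the field values at the interpolation nodes.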
|
||||
for (int alpha = 0; alpha < n_order; alpha++) {
|
||||
for (int beta = 0; beta < n_order; beta++) {
|
||||
for (int gamma = 0; gamma < n_order; gamma++) {
|
||||
for (int i = 0; i < n_order; i++) {
|
||||
for (int j = 0; j < n_order; j++) {
|
||||
for (int k = 0; k < n_order; k++) {
|
||||
|
||||
M_matrix(
|
||||
n_order * n_order * alpha + n_order * beta + gamma,
|
||||
n_order * n_order * i + n_order * j + k) =
|
||||
std::pow(x_inter[alpha], i) * std::pow(x_inter[beta], j) *
|
||||
std::pow(x_inter[gamma], k);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Compute the inverse of M_matrix
|
||||
M_inverse = M_matrix.inverse();
|
||||
}
|
||||
|
||||
void AltairAPForward::updateCoordinateSystem() {
|
||||
ConsoleContext<LOG_VERBOSE> ctx("Altair AP: updating coordinate system");
|
||||
Console &cons = Console::instance();
|
||||
Cosmology cosmo(my_params);
|
||||
|
||||
double Qfactor = (H100 * M_IN_KM) / cosmo_clight; // Q has units h100/Mpc
|
||||
// Note that internal units are all Mpc/h100, so the external redshift has to be put on similar scales
|
||||
// This leaves only the physical effect due to distortion
|
||||
|
||||
// Owned planes
|
||||
std::vector<size_t> owned_planes(lo_mgr->localN0);
|
||||
|
||||
for (size_t i = 0; i < lo_mgr->localN0; i++)
|
||||
owned_planes[i] = lo_mgr->startN0 + i;
|
||||
|
||||
size_t startN0 = out_mgr->startN0;
|
||||
size_t endN0 = startN0 + out_mgr->localN0;
|
||||
size_t out_N1 = out_mgr->N1;
|
||||
size_t out_N2 = out_mgr->N2;
|
||||
|
||||
double far_z = 0;
|
||||
|
||||
for (int a = 0; a <= 1; a++) {
|
||||
for (int b = 0; b <= 1; b++) {
|
||||
for (int c = 0; c <= 1; c++) {
|
||||
double z_x = corner_z[0] + a * out_mgr->N0 * delta_z[0];
|
||||
double z_y = corner_z[1] + b * out_N1 * delta_z[1];
|
||||
double z_z = corner_z[2] + c * out_N2 * delta_z[2];
|
||||
|
||||
double z_r = std::sqrt(
|
||||
z_x * z_x + z_y * z_y + z_z * z_z); // z_r has units Mpc/h100
|
||||
far_z = std::max(far_z, z_r);
|
||||
}
|
||||
}
|
||||
}
|
||||
far_z *= Qfactor*1.1; // add 10% safety margin.
|
||||
ctx.format("far_z=%g", far_z);
|
||||
|
||||
auto fast_r_c =
|
||||
build_auto_interpolator(
|
||||
[&](double z) { return cosmo.com2comph(cosmo.a2com(cosmo.z2a(z))); },
|
||||
0.0, far_z, 1e-2, 0.0, 0.0)
|
||||
.setThrowOnOverflow();
|
||||
|
||||
auto fast_E_z =
|
||||
build_auto_interpolator(
|
||||
[&](double z) {
|
||||
return cosmo.Hubble(cosmo.z2a(z)) / (my_params.h * H100);
|
||||
},
|
||||
0.0, far_z, 1e-2, 0.0, 0.0)
|
||||
.setThrowOnOverflow();
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (size_t n0 = startN0; n0 < endN0; n0++) {
|
||||
for (size_t n1 = 0; n1 < out_N1; n1++) {
|
||||
for (size_t n2 = 0; n2 < out_N2; n2++) {
|
||||
double z_x = double(n0) * delta_z[0] + corner_z[0],
|
||||
z_y = double(n1) * delta_z[1] + corner_z[1],
|
||||
z_z = double(n2) * delta_z[2] + corner_z[2];
|
||||
|
||||
double z_r = std::sqrt(
|
||||
z_x * z_x + z_y * z_y + z_z * z_z); // z_r has units Mpc/h100
|
||||
|
||||
double r_c;
|
||||
try {
|
||||
r_c = fast_r_c(
|
||||
z_r *
|
||||
Qfactor); // z_r*Q_factor = adimensional;r_c has units Mpc/h100
|
||||
} catch(LibLSS::ErrorParams& e) {
|
||||
ctx.format("Problem at z_r*Qfactor=%g", z_r*Qfactor);
|
||||
throw;
|
||||
}
|
||||
|
||||
//double r_c = z_r; // NO EXPANSION!!! -> Remove distortion due to cosmology
|
||||
|
||||
double c_x = z_x / z_r * r_c, c_y = z_y / z_r * r_c,
|
||||
c_z = z_z / z_r * r_c;
|
||||
|
||||
if (r_c == 0) {
|
||||
c_x = 0;
|
||||
c_y = 0;
|
||||
c_z = 0;
|
||||
}
|
||||
|
||||
// Compute E(z)
|
||||
double E_z = fast_E_z(Qfactor * z_r); // E_z is dimensionless //FIXME
|
||||
//(cosmo_params.h * H100); // E_z is dimensionless
|
||||
|
||||
//double E_z = 1/(1+2*Qfactor*z_r); // NO EXPANSION!!! -> Remove distortion due to cosmology
|
||||
double del_z_del_x = E_z;
|
||||
double Jc = del_z_del_x - (z_r / r_c);
|
||||
|
||||
// Compute all 9 components of (3x3) matrix J for each voxel
|
||||
double J_00 = ((Jc * c_x * c_x) / (r_c * r_c)) + (z_r / r_c);
|
||||
double J_01 = ((Jc * c_x * c_y) / (r_c * r_c));
|
||||
double J_02 = ((Jc * c_x * c_z) / (r_c * r_c));
|
||||
double J_10 = J_01; // Symmetric about leading diagonal
|
||||
double J_11 = ((Jc * c_y * c_y) / (r_c * r_c)) + (z_r / r_c);
|
||||
double J_12 = ((Jc * c_y * c_z) / (r_c * r_c));
|
||||
double J_20 = J_02;
|
||||
double J_21 = J_12;
|
||||
double J_22 = ((Jc * c_z * c_z) / (r_c * r_c)) + (z_r / r_c);
|
||||
// Compute determinant of (3x3) matrix J -> Jacobian(voxel)
|
||||
double Jacobian = J_00 * (J_11 * J_22 - J_12 * J_21) -
|
||||
J_01 * (J_10 * J_22 - J_12 * J_20) +
|
||||
J_02 * (J_10 * J_21 - J_11 * J_20);
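// grid_transform stores, per redshift-space voxel:
//   [0..2] : the corresponding comoving coordinates expressed in (fractional) grid units of the input box,
//   [3]    : the inverse Jacobian of the redshift -> comoving mapping (volume distortion factor).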
|
||||
|
||||
grid_transform[n0][n1][n2][0] = (c_x - xmin0) / delta_c[0];
|
||||
grid_transform[n0][n1][n2][1] = (c_y - xmin1) / delta_c[1];
|
||||
grid_transform[n0][n1][n2][2] = (c_z - xmin2) / delta_c[2];
|
||||
grid_transform[n0][n1][n2][3] = 1.0 / Jacobian;
|
||||
|
||||
if (r_c == 0) {
|
||||
grid_transform[n0][n1][n2][3] = 1.;
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
if (grid_transform[n0][n1][n2][3] < 0) {
|
||||
ctx.format(
|
||||
"Jacobian = %g, Jc = %g, r_c = %g, z_r = %g, "
|
||||
"del_z_del_x = %g, E_z = %g, Qfactor = %g",
|
||||
grid_transform[n0][n1][n2][3], Jc, r_c, z_r, del_z_del_x, E_z,
|
||||
Qfactor);
|
||||
cons.c_assert((grid_transform[n0][n1][n2][3] > 0), "Jacobian < 0");
|
||||
}
|
||||
#endif
|
||||
// The following assertions ensure we do not go outside of box
|
||||
// We use error_helper to report to the caller that cosmology/box setup is wrong
|
||||
// but do not crash the code in case we are working in python.
|
||||
auto g = grid_transform[n0][n1][n2];
|
||||
if (g[0] < 0)
|
||||
error_helper<ErrorParams>("Underflow lower bound 0");
|
||||
if (g[0] >= lo_mgr->N0)
|
||||
error_helper<ErrorParams>("Overflow lower bound 0");
|
||||
if (g[1] < 0)
|
||||
error_helper<ErrorParams>("Underflow lower bound 1");
|
||||
if (g[1] >= lo_mgr->N1)
|
||||
error_helper<ErrorParams>("Overflow lower bound 1");
|
||||
if (g[2] < 0)
|
||||
error_helper<ErrorParams>("Underflow lower bound 2");
|
||||
if (g[2] >= lo_mgr->N2)
|
||||
error_helper<ErrorParams>("Overflow lower bound 2");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Setup ghost plane
|
||||
std::set<size_t> required_planes;
|
||||
|
||||
for (size_t n0 = startN0; n0 < endN0; n0++) {
|
||||
for (size_t n1 = 0; n1 < out_N1; n1++) {
|
||||
for (size_t n2 = 0; n2 < out_N2; n2++) {
|
||||
size_t base = std::floor(grid_transform[n0][n1][n2][0]);
|
||||
for (size_t j = 0; j < n_order; j++) {
|
||||
size_t k = (base + x_inter[j] + N0) % N0;
|
||||
if (!lo_mgr->on_core(k)) {
|
||||
required_planes.insert(k);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ghosts.setup(
|
||||
comm, required_planes, owned_planes,
|
||||
std::array<size_t, 2>{size_t(lo_mgr->N1), size_t(lo_mgr->N2real)},
|
||||
lo_mgr->N0);
|
||||
}
|
||||
|
||||
// Function for tricubic interpolation here. Input: s_field --> Output: z_field
|
||||
template <typename SArray>
|
||||
void AltairAPForward::interpolate_3d(SArray const &s_field, ArrayRef &z_field) {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
double shift;
|
||||
|
||||
fwrap(*tmp_real_field) = s_field;
|
||||
|
||||
ghosts.synchronize(*tmp_real_field);
|
||||
|
||||
if (is_contrast)
|
||||
shift = 1;
|
||||
else
|
||||
shift = 0;
|
||||
|
||||
// Coordinate transformation (redshift -> comoving), followed by the n_order interpolation
|
||||
#pragma omp parallel
|
||||
{
|
||||
size_t endN0 = out_mgr->startN0 + out_mgr->localN0;
|
||||
size_t startN0 = out_mgr->startN0;
|
||||
size_t out_N1 = out_mgr->N1;
|
||||
size_t out_N2 = out_mgr->N2;
|
||||
|
||||
VectorXd interp_coeff(n_order * n_order * n_order);
|
||||
VectorXd rho(
|
||||
n_order * n_order *
|
||||
n_order); // vector that encodes values of (n_order**3) voxels that will contribute to interpolated value
|
||||
#pragma omp for collapse( \
|
||||
3) //private(rho, interp_coeff) // private ensures that different arrays are used on different threads to avoid leakage
|
||||
for (size_t n0 = startN0; n0 < endN0; n0++) {
|
||||
for (size_t n1 = 0; n1 < out_N1; n1++) {
|
||||
for (size_t n2 = 0; n2 < out_N2; n2++) {
|
||||
|
||||
auto t = grid_transform[n0][n1][n2];
|
||||
double out = 0;
|
||||
|
||||
int ix = (int)std::floor(t[0]); // input x-coordinate
|
||||
int iy = (int)std::floor(t[1]); // input y-coordinate
|
||||
int iz = (int)std::floor(t[2]); // input z-coordinate
|
||||
|
||||
double rx = t[0] - ix;
|
||||
double ry = t[1] - iy;
|
||||
double rz = t[2] - iz;
|
||||
|
||||
// Construct rho to store the value of s_field at all vertices of interest -> vector of dimension 64 for the tricubic scheme
|
||||
// We use periodic BC here -> Add N0/N1/N2 to prevent negative values
|
||||
|
||||
for (size_t alpha = 0; alpha < n_order; alpha++) {
|
||||
size_t r_alpha = (ix + x_inter[alpha] + N0) % N0;
|
||||
if (lo_mgr->on_core(r_alpha)) {
|
||||
for (size_t beta = 0; beta < n_order; beta++) {
|
||||
size_t r_beta = (iy + x_inter[beta] + N1) % N1;
|
||||
for (size_t gamma = 0; gamma < n_order; gamma++) {
|
||||
size_t r_gamma = (iz + x_inter[gamma] + N2) % N2;
|
||||
rho(n_order * n_order * alpha + n_order * beta + gamma) =
|
||||
s_field[r_alpha][r_beta][r_gamma];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
auto selected_plane = ghosts.getPlane(r_alpha);
|
||||
for (size_t beta = 0; beta < n_order; beta++) {
|
||||
size_t r_beta = (iy + x_inter[beta] + N1) % N1;
|
||||
for (size_t gamma = 0; gamma < n_order; gamma++) {
|
||||
size_t r_gamma = (iz + x_inter[gamma] + N2) % N2;
|
||||
rho(n_order * n_order * alpha + n_order * beta + gamma) =
|
||||
selected_plane[r_beta][r_gamma];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Include an assertion below on all values of rho to ensure no negative values
|
||||
#ifndef NDEBUG
|
||||
for (int i_assert = 0; i_assert < (n_order * n_order * n_order);
|
||||
i_assert++) {
|
||||
Console::instance().c_assert(
|
||||
rho(i_assert) >= (-1 + epsilon), "rho[i_assert] below the -1+epsilon density floor");
|
||||
}
|
||||
#endif
|
||||
|
||||
// Construct interp_coeff via matricial operation -> vector of dimension 64 for tricubic scheme
|
||||
interp_coeff.noalias() =
|
||||
M_inverse *
|
||||
rho; // the interpolation coefficients follow from a single matrix-vector product
|
||||
|
||||
// Core of generic nth order interpolation
|
||||
boost::array<double, n_order> ax, ay, az;
|
||||
for (unsigned int i_tilde = 0; i_tilde < n_order; i_tilde++) {
|
||||
ax[i_tilde] = std::pow(rx, i_tilde);
|
||||
ay[i_tilde] = std::pow(ry, i_tilde);
|
||||
az[i_tilde] = std::pow(rz, i_tilde);
|
||||
}
|
||||
|
||||
for (unsigned int i_tilde = 0; i_tilde < n_order; i_tilde++) {
|
||||
for (unsigned int j_tilde = 0; j_tilde < n_order; j_tilde++) {
|
||||
for (unsigned int k_tilde = 0; k_tilde < n_order; k_tilde++) {
|
||||
out += interp_coeff(
|
||||
n_order * n_order * i_tilde + n_order * j_tilde +
|
||||
k_tilde) *
|
||||
ax[i_tilde] * ay[j_tilde] * az[k_tilde];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// z_field may become < -1 owing to numerical inaccuracies
|
||||
z_field[n0][n1][n2] = t[3] * (shift + out) - shift;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void AltairAPForward::forwardModelSimple(CArrayRef &delta_init) {
|
||||
error_helper<ErrorNotImplemented>(
|
||||
"No forwardModelSimple in ALTAIR forward model");
|
||||
}
|
||||
|
||||
void AltairAPForward::clearAdjointGradient() { hold_in_gradient.clear(); }
|
||||
|
||||
void AltairAPForward::forwardModel_v2(ModelInput<3> delta_init) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("forward Altair AP");
|
||||
|
||||
delta_init.setRequestedIO(PREFERRED_REAL);
|
||||
|
||||
hold_input = std::move(delta_init);
|
||||
}
|
||||
|
||||
void AltairAPForward::getDensityFinal(ModelOutput<3> delta_output) {
|
||||
|
||||
// Need the real space density field -> s_field
|
||||
delta_output.setRequestedIO(PREFERRED_REAL);
|
||||
|
||||
double G = 1; // Growth TBI
|
||||
|
||||
auto const &s_field = hold_input.getRealConst();
|
||||
|
||||
// Only bother with real values (no padding)
|
||||
// First part of the forward model, rescaling
|
||||
auto fdelta = fwrap(s_field);
|
||||
auto zero_array = b_fused_idx<double, 3>(
|
||||
[](int, int, int) -> double { return -1 + epsilon; });
|
||||
// Threshold the density at zero
|
||||
auto cond = *(fdelta > (-1 + epsilon));
|
||||
auto density = b_cond_fused<double>(cond, *fdelta, zero_array);
|
||||
|
||||
// The function below performs the n_order interpolation (trilinear, tricubic, ...) and outputs z_field
|
||||
// Output of forward model -> galaxy density field in redshift space.
|
||||
interpolate_3d(density, delta_output.getRealOutput());
|
||||
}
|
||||
|
||||
//FIXME
|
||||
void AltairAPForward::setModelParams(ModelDictionnary const ¶ms) {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
if (params.find("altair_cosmo") != params.end()) {
|
||||
ctx.print("Got new cosmology");
|
||||
my_params = boost::any_cast<CosmologicalParameters>(
|
||||
params.find("altair_cosmo")->second);
|
||||
updateCoordinateSystem();
|
||||
}
|
||||
}
|
||||
|
||||
void AltairAPForward::updateCosmo() {
|
||||
ConsoleContext<LOG_DEBUG> ctx("Altair AP cosmo update");
|
||||
|
||||
if (COSMO_INIT) { //FIXME
|
||||
my_params = cosmo_params;
|
||||
updateCoordinateSystem();
|
||||
COSMO_INIT = false;
|
||||
}
|
||||
}
|
||||
|
||||
void AltairAPForward::forwardModelRsdField(ArrayRef &deltaf, double *vobs_ext) {
|
||||
error_helper<ErrorNotImplemented>("No RSD support in Log forward model");
|
||||
}
|
||||
|
||||
void AltairAPForward::adjointModel_v2(ModelInputAdjoint<3> in_gradient_delta) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("adjoint Altair AP");
|
||||
|
||||
// Build strict range views (we do not want to see the
|
||||
// FFTW padding); ensure that the view object lives till the end of this function.
|
||||
|
||||
in_gradient_delta.setRequestedIO(PREFERRED_REAL);
|
||||
|
||||
hold_in_gradient = std::move(in_gradient_delta);
|
||||
}
|
||||
|
||||
void AltairAPForward::getAdjointModelOutput(
|
||||
ModelOutputAdjoint<3> ag_delta_output) {
|
||||
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
ag_delta_output.setRequestedIO(PREFERRED_REAL);
|
||||
|
||||
auto &in_gradient = hold_in_gradient.getRealConst();
|
||||
|
||||
auto in_gradient_view = in_gradient[out_mgr->strict_range()];
|
||||
|
||||
auto out_gradient_view =
|
||||
ag_delta_output.getRealOutput()[lo_mgr->strict_range()];
|
||||
|
||||
ghosts.clear_ghosts();
|
||||
|
||||
fwrap(out_gradient_view) = 0;
|
||||
|
||||
// Careful with MPI:
|
||||
// With strict range -> view where the first axis is no longer between [startN0, startN0+localNo] but between [0, localN0]
|
||||
//fwrap(in_gradient[lo_mgr->strict_range()]) = gradient_delta[lo_mgr->strict_range()];
|
||||
//fwrap(out_gradient_view) = 0;
|
||||
size_t endN0 = out_mgr->startN0 + out_mgr->localN0;
|
||||
size_t startN0 = out_mgr->startN0;
|
||||
size_t out_N1 = out_mgr->N1;
|
||||
size_t out_N2 = out_mgr->N2;
|
||||
|
||||
// Gradient of trilinear interpolation -> Loop over voxel of incoming gradient
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (size_t n0 = startN0; n0 < endN0; n0++) {
|
||||
for (size_t n1 = 0; n1 < out_N1; n1++) {
|
||||
for (size_t n2 = 0; n2 < out_N2; n2++) {
|
||||
|
||||
// For adjoint gradient purposes
|
||||
VectorXd interp_coeff_adj(n_order * n_order * n_order);
|
||||
VectorXd rho_adj(n_order * n_order * n_order);
|
||||
|
||||
auto input_gradient = in_gradient[n0][n1][n2];
|
||||
auto t = grid_transform[n0][n1][n2];
|
||||
|
||||
// Compute adjoint (tangent) gradient
|
||||
int ix = (int)std::floor(t[0]); // input x-coordinate
|
||||
int iy = (int)std::floor(t[1]); // input y-coordinate
|
||||
int iz = (int)std::floor(t[2]); // input z-coordinate
|
||||
|
||||
double rx = t[0] - ix;
|
||||
double ry = t[1] - iy;
|
||||
double rz = t[2] - iz;
|
||||
|
||||
// Generalize gradient of trilinear interpolation below to tricubic version
|
||||
for (size_t alpha = 0; alpha < n_order; alpha++) {
|
||||
for (size_t beta = 0; beta < n_order; beta++) {
|
||||
for (size_t gamma = 0; gamma < n_order; gamma++) {
|
||||
rho_adj(n_order * n_order * alpha + n_order * beta + gamma) =
|
||||
input_gradient * std::pow(rx, alpha) * std::pow(ry, beta) *
|
||||
std::pow(rz, gamma);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
interp_coeff_adj = (rho_adj.transpose() * M_inverse)
|
||||
.transpose(); // Same M_inverse as computed above
|
||||
|
||||
auto compute_value = [&interp_coeff_adj](size_t i, size_t j, size_t k) {
|
||||
return interp_coeff_adj(n_order * n_order * i + n_order * j + k);
|
||||
};
|
||||
|
||||
double volume_expansion = t[3];
|
||||
|
||||
auto accumulate = [this, iy, iz, &compute_value,
|
||||
volume_expansion](size_t i_tilde, auto &&out) {
|
||||
for (size_t j_tilde = 0; j_tilde < n_order; j_tilde++) {
|
||||
size_t index_j = (iy + x_inter[j_tilde] + N1) % N1;
|
||||
for (size_t k_tilde = 0; k_tilde < n_order; k_tilde++) {
|
||||
size_t index_k = (iz + x_inter[k_tilde] + N2) % N2;
|
||||
out[index_j][index_k] +=
|
||||
volume_expansion * compute_value(i_tilde, j_tilde, k_tilde);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
auto &ag_real_out = ag_delta_output.getRealOutput();
|
||||
// Gradient of tricubic interpolation (Do not forget Jacobian factor)
|
||||
for (size_t i_tilde = 0; i_tilde < n_order; i_tilde++) {
|
||||
size_t index_i = (ix + x_inter[i_tilde] + N0) % N0;
|
||||
|
||||
if (lo_mgr->on_core(index_i)) {
|
||||
accumulate(i_tilde, ag_real_out[index_i]);
|
||||
} else {
|
||||
accumulate(i_tilde, ghosts.ag_getPlane(index_i));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ghosts.synchronize_ag(ag_delta_output.getRealOutput());
|
||||
}
|
||||
|
||||
void AltairAPForward::releaseParticles() {}
|
||||
|
||||
static std::shared_ptr<BORGForwardModel> build_altair_ap(
|
||||
MPI_Communication *comm, BoxModel const &box, PropertyProxy const ¶ms) {
|
||||
bool is_contrast;
|
||||
BoxModel box_z;
|
||||
|
||||
box_z.xmin0 = params.get<double>("corner0_z");
|
||||
box_z.xmin1 = params.get<double>("corner1_z");
|
||||
box_z.xmin2 = params.get<double>("corner2_z");
|
||||
box_z.L0 = params.get<double>("L0_z");
|
||||
box_z.L1 = params.get<double>("L1_z");
|
||||
box_z.L2 = params.get<double>("L2_z");
|
||||
box_z.N0 = params.get<double>("N0_z");
|
||||
box_z.N1 = params.get<double>("N1_z");
|
||||
box_z.N2 = params.get<double>("N2_z");
|
||||
is_contrast = params.get<bool>("is_contrast", false);
|
||||
|
||||
return std::make_shared<AltairAPForward>(comm, box, box_z, is_contrast);
|
||||
}
|
||||
|
||||
LIBLSS_REGISTER_FORWARD_IMPL(ALTAIR_AP, build_altair_ap);
|
||||
|
||||
// ARES TAG: authors_num = 2
|
||||
// ARES TAG: name(0) = Guilhem Lavaux
|
||||
// ARES TAG: email(0) = guilhem.lavaux@iap.fr
|
||||
// ARES TAG: year(0) = 2018-2020
|
||||
// ARES TAG: name(1) = Doogesh Kodi Ramanah
|
||||
// ARES TAG: email(1) = ramanah@iap.fr
|
||||
// ARES TAG: year(1) = 2018-2020
|
115
extra/borg/libLSS/physics/forwards/altair_ap.hpp
Normal file
115
extra/borg/libLSS/physics/forwards/altair_ap.hpp
Normal file
|
@ -0,0 +1,115 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/altair_ap.hpp
|
||||
Copyright (C) 2018-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2018-2020 Doogesh Kodi Ramanah <ramanah@iap.fr>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_ALTAIR_AP_FORWARD_MODEL_HPP
|
||||
# define __LIBLSS_ALTAIR_AP_FORWARD_MODEL_HPP
|
||||
|
||||
# include "libLSS/physics/forward_model.hpp"
|
||||
# include "libLSS/physics/forwards/registry.hpp"
|
||||
# include "libLSS/tools/fusewrapper.hpp"
|
||||
# include "libLSS/tools/fused_cond.hpp"
|
||||
# include "libLSS/tools/errors.hpp"
|
||||
# include <gsl/gsl_const_mksa.h>
|
||||
# include <Eigen/Dense>
|
||||
# include <cmath>
|
||||
# include "libLSS/tools/mpi/ghost_planes.hpp"
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
namespace ALTAIR {
|
||||
using namespace Eigen;
|
||||
using boost::format;
|
||||
|
||||
// For case of tricubic interpolation
|
||||
static constexpr int TRICUBIC = 2;
|
||||
static constexpr int TRIQUINTIC = 4;
|
||||
static constexpr int TRISEPTIC = 6;
|
||||
static constexpr int TRIHEPTIC = 8;
|
||||
|
||||
constexpr static const int n_order = TRIQUINTIC;
|
||||
|
||||
class AltairAPForward : public BORGForwardModel {
|
||||
protected:
|
||||
double rho_mean;
|
||||
double L_z[3];
|
||||
double corner_z[3];
|
||||
double delta_z[3];
|
||||
double delta_c[3];
|
||||
|
||||
constexpr static const double H100 = 100.; // km/s/Mpc
|
||||
constexpr static const double M_IN_KM =
|
||||
1000.; // number of metres in one kilometer
|
||||
constexpr static const double cosmo_clight =
|
||||
GSL_CONST_MKSA_SPEED_OF_LIGHT; // speed of light in m/s
|
||||
static constexpr double epsilon = 1e-4;
|
||||
|
||||
MatrixXd
|
||||
M_matrix, // (n_order**3,n_order**3) matrix of interpolation coefficients
|
||||
M_inverse; // inverse of M_matrix
|
||||
|
||||
boost::multi_array<double, 4> grid_transform;
|
||||
boost::multi_array<double, 4> soft_factor;
|
||||
|
||||
int x_inter[n_order];
|
||||
|
||||
GhostPlanes<double, 2> ghosts; // 2d ghost planes
|
||||
ModelInput<3> hold_input;
|
||||
ModelInputAdjoint<3> hold_in_gradient;
|
||||
bool is_contrast;
|
||||
|
||||
public:
|
||||
explicit AltairAPForward(
|
||||
MPI_Communication *comm, const BoxModel &box_c,
|
||||
const BoxModel &box_z, bool is_contrast = true);
|
||||
|
||||
void prepareMatrix();
|
||||
|
||||
void setModelParams(ModelDictionnary const ¶ms) override; //FIXME
|
||||
CosmologicalParameters my_params; //FIXME
|
||||
bool COSMO_INIT; //FIXME
|
||||
|
||||
void updateCoordinateSystem();
|
||||
|
||||
template <typename SArray>
|
||||
void interpolate_3d(SArray const &s_field, ArrayRef &z_field);
|
||||
|
||||
void forwardModelSimple(CArrayRef &delta_init) override;
|
||||
|
||||
void forwardModel_v2(ModelInput<3> delta_init) override;
|
||||
|
||||
void getDensityFinal(ModelOutput<3> delta_output) override;
|
||||
|
||||
void updateCosmo() override;
|
||||
|
||||
void forwardModelRsdField(ArrayRef &deltaf, double *vobs_ext) override;
|
||||
|
||||
void adjointModel_v2(ModelInputAdjoint<3> in_gradient_delta) override;
|
||||
|
||||
void
|
||||
getAdjointModelOutput(ModelOutputAdjoint<3> ag_delta_output) override;
|
||||
|
||||
void clearAdjointGradient() override;
|
||||
|
||||
void releaseParticles() override;
|
||||
};
|
||||
|
||||
} // namespace ALTAIR
|
||||
} // namespace LibLSS
|
||||
|
||||
LIBLSS_REGISTER_FORWARD_DECL(ALTAIR_AP);
|
||||
|
||||
#endif
|
||||
|
||||
// ARES TAG: authors_num = 2
|
||||
// ARES TAG: name(0) = Guilhem Lavaux
|
||||
// ARES TAG: email(0) = guilhem.lavaux@iap.fr
|
||||
// ARES TAG: year(0) = 2018-2020
|
||||
// ARES TAG: name(1) = Doogesh Kodi Ramanah
|
||||
// ARES TAG: email(1) = ramanah@iap.fr
|
||||
// ARES TAG: year(1) = 2018-2020
|
22
extra/borg/libLSS/physics/forwards/always_mpi.hpp
Normal file
22
extra/borg/libLSS/physics/forwards/always_mpi.hpp
Normal file
|
@ -0,0 +1,22 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/always_mpi.hpp
|
||||
Copyright (C) 2019-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
namespace LibLSS {
|
||||
|
||||
static constexpr bool const SKIP_MPI_FOR_SINGLE_NODE = true;
|
||||
|
||||
inline bool ALWAYS_MPI(MPI_Communication *comm) {
|
||||
return (!SKIP_MPI_FOR_SINGLE_NODE || comm->size() > 1);
|
||||
}
|
||||
|
||||
} // namespace LibLSS
|
||||
|
||||
// ARES TAG: authors_num = 1
|
||||
// ARES TAG: name(0) = Guilhem Lavaux
|
||||
// ARES TAG: year(0) = 2019-2020
|
||||
// ARES TAG: email(0) = guilhem.lavaux@iap.fr
|
234
extra/borg/libLSS/physics/forwards/borg_2lpt.cpp
Normal file
234
extra/borg/libLSS/physics/forwards/borg_2lpt.cpp
Normal file
|
@ -0,0 +1,234 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/borg_2lpt.cpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#include "libLSS/cconfig.h"
|
||||
#include <CosmoTool/algo.hpp>
|
||||
#include <Eigen/Core>
|
||||
#include <boost/format.hpp>
|
||||
#include <boost/bind.hpp>
|
||||
#include <CosmoTool/fourier/fft/fftw_calls.hpp>
|
||||
#include "libLSS/samplers/core/random_number.hpp"
|
||||
#include "libLSS/samplers/core/powerspec_tools.hpp"
|
||||
#include "libLSS/tools/mpi_fftw_helper.hpp"
|
||||
#include "libLSS/tools/array_tools.hpp"
|
||||
#include "libLSS/samplers/rgen/hmc/hmc_density_sampler.hpp"
|
||||
#include "libLSS/physics/cosmo.hpp"
|
||||
#include "libLSS/tools/powerspectrum/measure.hpp"
|
||||
#include "libLSS/physics/forwards/borg_2lpt.hpp"
|
||||
#include "libLSS/borg_splash.hpp"
|
||||
#include "libLSS/physics/forwards/pm/part_decision.hpp"
|
||||
#include <H5Cpp.h>
|
||||
#include "libLSS/physics/forwards/particle_balancer/attributes.hpp"
|
||||
#include "always_mpi.hpp"
|
||||
|
||||
using namespace LibLSS;
|
||||
using namespace LibLSS::BORG_help;
|
||||
|
||||
using CosmoTool::square;
|
||||
|
||||
typedef Uninit_FFTW_Real_Array::array_type U_ArrayRef;
|
||||
static const bool TEST_MODEL_RESPONSE = false;
|
||||
static const bool VERBOSE_LPT = false;
|
||||
static const bool DUMP_BORG_DENSITY = false;
|
||||
|
||||
template <typename CIC>
|
||||
Borg2LPTModel<CIC>::Borg2LPTModel(
|
||||
MPI_Communication *comm, const BoxModel &box, const BoxModel &box_out,
|
||||
bool rsd, int p_ss_factor, double p_factor, double ai, double af,
|
||||
bool light_cone)
|
||||
: ParticleBasedForwardModel(comm, box, box_out), AUX1_p(0), AUX0_p(0),
|
||||
aux_p(0), partFactor(p_factor), lctime(light_cone) {
|
||||
using boost::c_storage_order;
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
|
||||
setupDefault();
|
||||
|
||||
BORG::splash_borg();
|
||||
this->do_rsd = rsd;
|
||||
this->a_init = ai;
|
||||
this->af = af;
|
||||
this->ss_factor = p_ss_factor;
|
||||
|
||||
c_N0 = ss_factor * N0;
|
||||
c_N1 = ss_factor * N1;
|
||||
c_N2 = ss_factor * N2;
|
||||
|
||||
c_N2_HC = c_N2 / 2 + 1;
|
||||
|
||||
mgr = new DFT_Manager(c_N0, c_N1, c_N2, comm);
|
||||
|
||||
c_N2real = mgr->N2real;
|
||||
c_startN0 = mgr->startN0;
|
||||
c_localN0 = mgr->localN0;
|
||||
|
||||
ctx.print("Allocate AUX1_p");
|
||||
AUX1_m = new U_F_Array(mgr->extents_complex(), mgr->allocator_complex);
|
||||
AUX1_p = &AUX1_m->get_array();
|
||||
ctx.print("Allocate aux_p");
|
||||
aux_m = new U_R_Array(mgr->extents_real(), mgr->allocator_real);
|
||||
aux_p = &aux_m->get_array();
|
||||
ctx.print("Allocate AUX0_p");
|
||||
AUX0_m = new U_F_Array(mgr->extents_complex(), mgr->allocator_complex);
|
||||
AUX0_p = &AUX0_m->get_array();
|
||||
|
||||
if (ss_factor > 1) {
|
||||
ctx.print("c_deltao");
|
||||
c_deltao_m = new U_F_Array(mgr->extents_complex(), mgr->allocator_complex);
|
||||
c_deltao = &c_deltao_m->get_array();
|
||||
ctx.print("c_tmp_real_field");
|
||||
c_tmp_real_field_m =
|
||||
new U_R_Array(mgr->extents_real(), mgr->allocator_real);
|
||||
c_tmp_real_field = &c_tmp_real_field_m->get_array();
|
||||
ctx.print("c_tmp_complex_field");
|
||||
c_tmp_complex_field_m =
|
||||
new U_F_Array(mgr->extents_complex(), mgr->allocator_complex);
|
||||
c_tmp_complex_field = &c_tmp_complex_field_m->get_array();
|
||||
} else {
|
||||
c_deltao = 0;
|
||||
c_tmp_real_field = &tmp_real_field->get_array();
|
||||
c_tmp_complex_field = &tmp_complex_field->get_array();
|
||||
}
|
||||
|
||||
ctx.print("Allocating more arrays");
|
||||
u_r_psi_00 = new U_R_Array(mgr->extents_real(), mgr->allocator_real);
|
||||
u_r_psi_01 = new U_R_Array(mgr->extents_real(), mgr->allocator_real);
|
||||
u_r_psi_02 = new U_R_Array(mgr->extents_real(), mgr->allocator_real);
|
||||
u_r_psi_11 = new U_R_Array(mgr->extents_real(), mgr->allocator_real);
|
||||
u_r_psi_12 = new U_R_Array(mgr->extents_real(), mgr->allocator_real);
|
||||
u_r_psi_22 = new U_R_Array(mgr->extents_real(), mgr->allocator_real);
|
||||
|
||||
u_c_psi_00 = new U_F_Array(mgr->extents_complex(), mgr->allocator_complex);
|
||||
u_c_psi_01 = new U_F_Array(mgr->extents_complex(), mgr->allocator_complex);
|
||||
u_c_psi_02 = new U_F_Array(mgr->extents_complex(), mgr->allocator_complex);
|
||||
u_c_psi_11 = new U_F_Array(mgr->extents_complex(), mgr->allocator_complex);
|
||||
u_c_psi_12 = new U_F_Array(mgr->extents_complex(), mgr->allocator_complex);
|
||||
u_c_psi_22 = new U_F_Array(mgr->extents_complex(), mgr->allocator_complex);
|
||||
|
||||
r_psi_00 = &u_r_psi_00->get_array();
|
||||
r_psi_01 = &u_r_psi_01->get_array();
|
||||
r_psi_02 = &u_r_psi_02->get_array();
|
||||
r_psi_11 = &u_r_psi_11->get_array();
|
||||
r_psi_12 = &u_r_psi_12->get_array();
|
||||
r_psi_22 = &u_r_psi_22->get_array();
|
||||
|
||||
c_psi_00 = &u_c_psi_00->get_array();
|
||||
c_psi_01 = &u_c_psi_01->get_array();
|
||||
c_psi_02 = &u_c_psi_02->get_array();
|
||||
c_psi_11 = &u_c_psi_11->get_array();
|
||||
c_psi_12 = &u_c_psi_12->get_array();
|
||||
c_psi_22 = &u_c_psi_22->get_array();
|
||||
|
||||
ctx.print("Planning");
|
||||
c_synthesis_plan = mgr->create_c2r_plan(AUX1_p->data(), aux_p->data());
|
||||
c_analysis_plan = mgr->create_r2c_plan(aux_p->data(), AUX1_p->data());
|
||||
|
||||
///initialize light cone timing
|
||||
ctx.print("Lc_timing allocation");
|
||||
lc_timing = std::make_shared<U_PArray>(extents[c_localN0 * c_N1 * c_N2][5]);
|
||||
oldParams.h = 0.0;
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
void Borg2LPTModel<CIC>::updateCosmo() {
|
||||
if (oldParams != cosmo_params) {
|
||||
oldParams = cosmo_params;
|
||||
|
||||
gen_light_cone_timing(lc_timing->get_array());
|
||||
}
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
Borg2LPTModel<CIC>::~Borg2LPTModel() {
|
||||
if (AUX1_p) {
|
||||
delete AUX1_m;
|
||||
delete aux_m;
|
||||
delete AUX0_m;
|
||||
|
||||
if (c_deltao != 0) {
|
||||
delete c_tmp_real_field_m;
|
||||
delete c_tmp_complex_field_m;
|
||||
delete c_deltao_m;
|
||||
}
|
||||
|
||||
delete u_r_psi_00;
|
||||
delete u_r_psi_01;
|
||||
delete u_r_psi_02;
|
||||
delete u_r_psi_11;
|
||||
delete u_r_psi_12;
|
||||
delete u_r_psi_22;
|
||||
delete u_c_psi_00;
|
||||
delete u_c_psi_01;
|
||||
delete u_c_psi_02;
|
||||
delete u_c_psi_11;
|
||||
delete u_c_psi_12;
|
||||
delete u_c_psi_22;
|
||||
|
||||
mgr->destroy_plan(c_synthesis_plan);
|
||||
mgr->destroy_plan(c_analysis_plan);
|
||||
|
||||
delete mgr;
|
||||
}
|
||||
releaseParticles();
|
||||
}
|
||||
|
||||
#include "2lpt/borg_fwd_2lpt.cpp"
|
||||
#include "2lpt/borg_fwd_2lpt_adj.cpp"
|
||||
|
||||
template class LibLSS::Borg2LPTModel<>;
|
||||
|
||||
#include "libLSS/physics/modified_ngp.hpp"
|
||||
template class LibLSS::Borg2LPTModel<ModifiedNGP<double, NGPGrid::Quad>>;
|
||||
template class LibLSS::Borg2LPTModel<ModifiedNGP<double, NGPGrid::Double>>;
|
||||
template class LibLSS::Borg2LPTModel<ModifiedNGP<double, NGPGrid::CIC>>;
|
||||
|
||||
#ifdef _OPENMP
|
||||
# include "libLSS/physics/openmp_cic.hpp"
|
||||
template class LibLSS::Borg2LPTModel<OpenMPCloudInCell<double>>;
|
||||
#endif
|
||||
|
||||
template <typename Grid = ClassicCloudInCell<double>>
|
||||
static std::shared_ptr<BORGForwardModel> build_borg_2lpt(
|
||||
MPI_Communication *comm, BoxModel const &box, PropertyProxy const ¶ms) {
|
||||
double ai = params.get<double>("a_initial");
|
||||
double af = params.get<double>("a_final");
|
||||
bool rsd = params.get<bool>("do_rsd");
|
||||
int ss_factor = params.get<double>("supersampling");
|
||||
bool light_cone = params.get<bool>("lightcone");
|
||||
double p_factor = params.get<double>("part_factor");
|
||||
int mul_out = params.get<int>("mul_out", 1);
|
||||
BoxModel box_out = box;
|
||||
|
||||
box_out.N0 *= mul_out;
|
||||
box_out.N1 *= mul_out;
|
||||
box_out.N2 *= mul_out;
|
||||
|
||||
return std::make_shared<Borg2LPTModel<Grid>>(
|
||||
comm, box, box_out, rsd, ss_factor, p_factor, ai, af, light_cone);
|
||||
}
|
||||
|
||||
AUTO_REGISTRATOR_IMPL(LIBLSS_REGISTER_NAME(2LPT_CIC));
|
||||
AUTO_REGISTRATOR_IMPL(LIBLSS_REGISTER_NAME(2LPT_NGP));
|
||||
AUTO_REGISTRATOR_IMPL(LIBLSS_REGISTER_NAME(2LPT_DOUBLE));
|
||||
|
||||
#ifdef _OPENMP
|
||||
AUTO_REGISTRATOR_IMPL(LIBLSS_REGISTER_NAME(2LPT_CIC_OPENMP));
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
_RegisterForwardModel
|
||||
MANGLED_LIBLSS_REGISTER_NAME(LPT_CIC)("2LPT_CIC", build_borg_2lpt<>);
|
||||
_RegisterForwardModel MANGLED_LIBLSS_REGISTER_NAME(2LPT_NGP)(
|
||||
"2LPT_NGP", build_borg_2lpt<ModifiedNGP<double, NGPGrid::NGP>>);
|
||||
_RegisterForwardModel MANGLED_LIBLSS_REGISTER_NAME(2LPT_DOUBLE)(
|
||||
"2LPT_DOUBLE", build_borg_2lpt<ModifiedNGP<double, NGPGrid::Double>>);
|
||||
#ifdef _OPENMP
|
||||
_RegisterForwardModel MANGLED_LIBLSS_REGISTER_NAME(2LPT_CIC_OPENMP)(
|
||||
"2LPT_CIC_OPENMP", build_borg_2lpt<OpenMPCloudInCell<double>>);
|
||||
#endif
|
||||
} // namespace
|
190
extra/borg/libLSS/physics/forwards/borg_2lpt.hpp
Normal file
190
extra/borg/libLSS/physics/forwards/borg_2lpt.hpp
Normal file
|
@ -0,0 +1,190 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/borg_2lpt.hpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_BORG_2LPT_HPP
|
||||
#define __LIBLSS_BORG_2LPT_HPP
|
||||
|
||||
#include "libLSS/tools/array_tools.hpp"
|
||||
#include "libLSS/tools/auto_interpolator.hpp"
|
||||
#include "libLSS/samplers/rgen/hmc/hmc_density_sampler.hpp"
|
||||
#include "libLSS/mcmc/state_element.hpp"
|
||||
#include "libLSS/mcmc/global_state.hpp"
|
||||
#include "libLSS/physics/forwards/borg_helpers.hpp"
|
||||
#include "libLSS/physics/forward_model.hpp"
|
||||
#include "libLSS/physics/classic_cic.hpp"
|
||||
#include "libLSS/tools/uninitialized_type.hpp"
|
||||
#include "libLSS/tools/mpi_fftw_helper.hpp"
|
||||
#include "libLSS/physics/forwards/particle_balancer/particle_distribute.hpp"
|
||||
#include <boost/lambda/lambda.hpp>
|
||||
#include <CosmoTool/hdf5_array.hpp>
|
||||
#include "libLSS/physics/forwards/registry.hpp"
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
template <typename CIC = ClassicCloudInCell<double>>
|
||||
class Borg2LPTModel : virtual public ParticleBasedForwardModel {
|
||||
protected:
|
||||
// Import member variables
|
||||
bool do_rsd;
|
||||
double a_init;
|
||||
double af;
|
||||
typedef Uninit_FFTW_Complex_Array U_F_Array;
|
||||
typedef Uninit_FFTW_Real_Array U_R_Array;
|
||||
typedef Uninit_FFTW_Real_Array::array_type U_ArrayRef;
|
||||
|
||||
typedef boost::multi_array<size_t, 1> IdxArray;
|
||||
typedef UninitializedArray<PhaseArray> U_PArray;
|
||||
|
||||
std::shared_ptr<U_PArray> u_pos, u_vel, lc_timing, u_s_pos, u_pos_ag,
|
||||
u_vel_ag;
|
||||
std::unique_ptr<IdxArray> lagrangian_id;
|
||||
|
||||
CIC cic;
|
||||
long c_N0, c_N1, c_N2, c_localN0, c_N2_HC, c_startN0, c_N2real;
|
||||
U_R_Array::array_type *aux_p, *c_tmp_real_field;
|
||||
U_F_Array::array_type *AUX1_p, *AUX0_p, *c_deltao, *c_tmp_complex_field;
|
||||
DFT_Manager *mgr;
|
||||
int ss_factor;
|
||||
double partFactor;
|
||||
bool lctime;
|
||||
BalanceInfo realInfo, redshiftInfo;
|
||||
|
||||
U_R_Array *c_tmp_real_field_m, *aux_m;
|
||||
U_F_Array *c_tmp_complex_field_m, *AUX1_m, *AUX0_m, *c_deltao_m;
|
||||
|
||||
U_R_Array *u_r_psi_00, *u_r_psi_01, *u_r_psi_02, *u_r_psi_11, *u_r_psi_12,
|
||||
*u_r_psi_22;
|
||||
U_F_Array *u_c_psi_00, *u_c_psi_01, *u_c_psi_02, *u_c_psi_11, *u_c_psi_12,
|
||||
*u_c_psi_22;
|
||||
|
||||
U_R_Array::array_type *r_psi_00, *r_psi_01, *r_psi_02, *r_psi_11, *r_psi_12,
|
||||
*r_psi_22;
|
||||
U_F_Array::array_type *c_psi_00, *c_psi_01, *c_psi_02, *c_psi_11, *c_psi_12,
|
||||
*c_psi_22;
|
||||
|
||||
DFT_Manager::Calls::plan_type c_analysis_plan, c_synthesis_plan;
|
||||
|
||||
// forward model lpt2
|
||||
void lpt2_ic(
|
||||
CArrayRef &deltao, PhaseArrayRef &pos, PhaseArrayRef &vel,
|
||||
PhaseArrayRef &lctim);
|
||||
void lpt2_redshift_pos(
|
||||
PhaseArrayRef &pos, PhaseArrayRef &vel, PhaseArrayRef &s_pos,
|
||||
PhaseArrayRef &lctim);
|
||||
void
|
||||
lpt2_density_obs(PhaseArrayRef &pos, ArrayRef &deltao, size_t numParts);
|
||||
void lpt2_fwd_model(
|
||||
CArrayRef &deltao, PhaseArrayRef &pos, PhaseArrayRef &vel,
|
||||
PhaseArrayRef &lctim);
|
||||
void forwardModel_rsd_field(ArrayRef &deltaf, double *vobs_ext);
|
||||
void gen_light_cone_timing(PhaseArrayRef &lctim);
|
||||
|
||||
// adjoint model lpt2
|
||||
void lpt2_ic_ag(
|
||||
PhaseArrayRef &pos_ag, PhaseArrayRef &vel_ag, PhaseArrayRef &lctim);
|
||||
template <typename... A>
|
||||
void lpt2_add_to_derivative(
|
||||
U_F_Array::array_type &result, const PhaseArrayRef &pos_ag,
|
||||
const PhaseArrayRef &vel_ag, const PhaseArrayRef &lctim,
|
||||
const int axis0, std::tuple<A...> const &t);
|
||||
void lpt2_redshift_pos_ag(
|
||||
PhaseArrayRef &pos, PhaseArrayRef &vel, PhaseArrayRef &pos_ag,
|
||||
PhaseArrayRef &vel_ag, PhaseArrayRef &lctim);
|
||||
|
||||
template <typename PositionArray>
|
||||
void lpt2_density_obs_ag(
|
||||
PositionArray &pos, PhaseArrayRef &pos_ag, PhaseArrayRef &vel_ag,
|
||||
ArrayRef &B, size_t numParts);
|
||||
void lpt2_fwd_model_ag(
|
||||
PhaseArrayRef &pos, PhaseArrayRef &vel, PhaseArrayRef &lctime,
|
||||
CArrayRef &DPSI);
|
||||
|
||||
CosmologicalParameters oldParams;
|
||||
virtual void updateCosmo();
|
||||
|
||||
public:
|
||||
typedef PhaseArrayRef &ParticleArray;
|
||||
typedef PhaseArrayRef &VelocityArray;
|
||||
|
||||
Borg2LPTModel(
|
||||
MPI_Communication *comm, const BoxModel &box, const BoxModel &box_out,
|
||||
bool rsd, int ss_factor, double p_factor, double ai, double af,
|
||||
bool light_cone);
|
||||
virtual ~Borg2LPTModel();
|
||||
|
||||
virtual void forwardModel_v2(ModelInput<3> delta_init);
|
||||
virtual void adjointModel_v2(ModelInputAdjoint<3> gradient_delta);
|
||||
virtual void getDensityFinal(ModelOutput<3> delta_output);
|
||||
virtual void getAdjointModelOutput(ModelOutputAdjoint<3> ag_delta_output);
|
||||
|
||||
virtual void clearAdjointGradient();
|
||||
|
||||
// This computes the adjoint gradient on the particle positions, velocities
|
||||
// Not all models may support this. The default implementation triggers an error.
|
||||
virtual void
|
||||
adjointModelParticles(PhaseArrayRef &grad_pos, PhaseArrayRef &grad_vel);
|
||||
|
||||
virtual IdSubArray getLagrangianIdentifiers() const {
|
||||
boost::multi_array_types::index_gen i_gen;
|
||||
typedef boost::multi_array_types::index_range range;
|
||||
return (*lagrangian_id)[i_gen[range()]];
|
||||
}
|
||||
|
||||
virtual PhaseSubArray getParticlePositions() {
|
||||
boost::multi_array_types::index_gen i_gen;
|
||||
typedef boost::multi_array_types::index_range range;
|
||||
|
||||
return u_pos->get_array()[i_gen[range()][range()]];
|
||||
}
|
||||
|
||||
virtual PhaseSubArray getParticleVelocities() {
|
||||
boost::multi_array_types::index_gen i_gen;
|
||||
typedef boost::multi_array_types::index_range range;
|
||||
|
||||
return u_vel->get_array()[i_gen[range()][range()]];
|
||||
}
|
||||
|
||||
virtual size_t getNumberOfParticles() const {
|
||||
return redshiftInfo.localNumParticlesAfter;
|
||||
}
|
||||
virtual unsigned int getSupersamplingRate() const { return ss_factor; }
|
||||
|
||||
template <typename ArrayOut>
|
||||
void copyParticlePositions(ArrayOut &a, int _ = -1) const {
|
||||
LibLSS::copy_array(a, u_pos->get_array());
|
||||
}
|
||||
|
||||
template <typename ArrayOut>
|
||||
void copyParticleVelocities(ArrayOut &a, int _ = -1) const {
|
||||
LibLSS::copy_array<ArrayOut>(a, u_vel->get_array());
|
||||
}
|
||||
|
||||
virtual void releaseParticles() {
|
||||
u_pos.reset();
|
||||
u_vel.reset();
|
||||
realInfo.clear();
|
||||
redshiftInfo.clear();
|
||||
lagrangian_id.reset();
|
||||
}
|
||||
|
||||
virtual void forwardModelRsdField(ArrayRef &deltaf, double *vobs_ext);
|
||||
|
||||
void test_lpt2_velocities(MarkovState &state);
|
||||
};
|
||||
|
||||
}; // namespace LibLSS
|
||||
|
||||
LIBLSS_REGISTER_FORWARD_DECL(2LPT_CIC);
|
||||
#ifdef _OPENMP
|
||||
LIBLSS_REGISTER_FORWARD_DECL(2LPT_CIC_OPENMP);
|
||||
#endif
|
||||
LIBLSS_REGISTER_FORWARD_DECL(2LPT_NGP);
|
||||
LIBLSS_REGISTER_FORWARD_DECL(2LPT_DOUBLE);
|
||||
|
||||
#endif
|
45
extra/borg/libLSS/physics/forwards/borg_helpers.hpp
Normal file
45
extra/borg/libLSS/physics/forwards/borg_helpers.hpp
Normal file
|
@ -0,0 +1,45 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/borg_helpers.hpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_BORG_HELP_HPP
|
||||
#define __LIBLSS_BORG_HELP_HPP
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
namespace BORG_help {
|
||||
|
||||
typedef boost::multi_array_types::extent_range range;
|
||||
using boost::extents;
|
||||
using boost::format;
|
||||
|
||||
template <typename T>
|
||||
T periodic_fix(T x, T xmin, T L) {
|
||||
T result = x;
|
||||
T y = x - xmin;
|
||||
|
||||
if (y < 0 || y >= L) {
|
||||
T intpart;
|
||||
if (y < 0)
|
||||
result = xmin + L + modf(y / L, &intpart) * L;
|
||||
if (y >= 0)
|
||||
result = xmin + modf(y / L, &intpart) * L;
|
||||
}
|
||||
|
||||
while (result < xmin)
|
||||
result += L;
|
||||
while (result >= (xmin + L))
|
||||
result -= L;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace BORG_help
|
||||
}; // namespace LibLSS
|
||||
|
||||
#endif
|
216
extra/borg/libLSS/physics/forwards/borg_lep.cpp
Normal file
216
extra/borg/libLSS/physics/forwards/borg_lep.cpp
Normal file
|
@ -0,0 +1,216 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/borg_lep.cpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#include <CosmoTool/algo.hpp>
|
||||
#include <Eigen/Core>
|
||||
#include <boost/array.hpp>
|
||||
#include <boost/format.hpp>
|
||||
#include <boost/bind.hpp>
|
||||
#include <CosmoTool/fourier/fft/fftw_calls.hpp>
|
||||
#include "libLSS/samplers/core/random_number.hpp"
|
||||
#include "libLSS/samplers/core/powerspec_tools.hpp"
|
||||
#include "libLSS/tools/mpi_fftw_helper.hpp"
|
||||
#include "libLSS/tools/array_tools.hpp"
|
||||
#include "libLSS/samplers/rgen/hmc/hmc_density_sampler.hpp"
|
||||
#include "libLSS/physics/cosmo.hpp"
|
||||
#include "libLSS/tools/powerspectrum/measure.hpp"
|
||||
#include "libLSS/physics/forwards/borg_lep.hpp"
|
||||
#include "libLSS/tools/fused_array.hpp"
|
||||
|
||||
#undef EXPERIMENTAL_ORDERING
|
||||
|
||||
using namespace LibLSS;
|
||||
using namespace LibLSS::BORG_help;
|
||||
using CosmoTool::square;
|
||||
|
||||
static const double unit_r0 = 1.0; // Units of distances = 1 Mpc/h
|
||||
static const double H0 = 100.0; // h km/s/Mpc
|
||||
static const double unit_t0 = 1 / H0; // Units of time
|
||||
static const double unit_v0 = unit_r0 / unit_t0; // Units of velocity
|
||||
|
||||
#include "lep/borg_fwd_lep.cpp"
|
||||
#include "lep/borg_fwd_lep_adj.cpp"
|
||||
|
||||
template <typename CIC>
|
||||
void BorgLEPModel<CIC>::tabulate_sin() {
|
||||
sin2K[0].resize(boost::extents[f_N0]);
|
||||
sin2K[1].resize(boost::extents[f_N1]);
|
||||
sin2K[2].resize(boost::extents[f_N2]);
|
||||
|
||||
for (int i = 0; i < f_N0; i++) {
|
||||
sin2K[0][i] = square(sin(M_PI * i / double(f_N0)) * 2 * f_N0 / L0);
|
||||
}
|
||||
|
||||
for (int i = 0; i < f_N1; i++) {
|
||||
sin2K[1][i] = square(sin(M_PI * i / double(f_N1)) * 2 * f_N1 / L1);
|
||||
}
|
||||
|
||||
for (int i = 0; i < f_N2; i++) {
|
||||
sin2K[2][i] = square(sin(M_PI * i / double(f_N2)) * 2 * f_N2 / L2);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
BorgLEPModel<CIC>::BorgLEPModel(
|
||||
MPI_Communication *comm, const BoxModel &box, int ss_factor, int f_factor,
|
||||
int lep_nsteps, bool do_rsd, double ai, double z_start)
|
||||
: BORGForwardModel(comm, box) {
|
||||
this->ss_factor = ss_factor;
|
||||
this->f_factor = f_factor;
|
||||
this->lep_nsteps = lep_nsteps;
|
||||
this->z_start = z_start;
|
||||
this->do_redshift = do_rsd;
|
||||
this->ai = ai;
|
||||
|
||||
alloc_arrays();
|
||||
tabulate_sin();
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
void BorgLEPModel<CIC>::alloc_arrays() {
|
||||
using boost::c_storage_order;
|
||||
|
||||
c_N0 = ss_factor * N0;
|
||||
c_N1 = ss_factor * N1;
|
||||
c_N2 = ss_factor * N2;
|
||||
c_N2_HC = c_N2 / 2 + 1;
|
||||
|
||||
f_N0 = f_factor * N0;
|
||||
f_N1 = f_factor * N1;
|
||||
f_N2 = f_factor * N2;
|
||||
f_N2_HC = f_N2 / 2 + 1;
|
||||
|
||||
force_mgr = new DFT_Manager(f_N0, f_N1, f_N2, comm);
|
||||
mgr = new DFT_Manager(c_N0, c_N1, c_N2, comm);
|
||||
|
||||
c_startN0 = mgr->startN0;
|
||||
c_localN0 = mgr->localN0;
|
||||
f_startN0 = force_mgr->startN0;
|
||||
f_localN0 = force_mgr->localN0;
|
||||
|
||||
AUX1_p = new CArray(
|
||||
mgr->extents_complex(), c_storage_order(), mgr->allocator_complex);
|
||||
aux_p =
|
||||
new Array(mgr->extents_real(), c_storage_order(), mgr->allocator_real);
|
||||
AUX0_p = new CArray(
|
||||
mgr->extents_complex(), c_storage_order(), mgr->allocator_complex);
|
||||
f_AUX0_p = new CArray(
|
||||
force_mgr->extents_complex(), c_storage_order(),
|
||||
force_mgr->allocator_complex);
|
||||
|
||||
if (ss_factor > 1) {
|
||||
c_deltao = new CArray(
|
||||
mgr->extents_complex(), c_storage_order(), mgr->allocator_complex);
|
||||
c_tmp_real_field =
|
||||
new Array(mgr->extents_real(), c_storage_order(), mgr->allocator_real);
|
||||
c_tmp_complex_field = new CArray(
|
||||
mgr->extents_complex(), c_storage_order(), mgr->allocator_complex);
|
||||
lo_AUX0_p = new CArray(
|
||||
lo_mgr->extents_complex(), c_storage_order(),
|
||||
lo_mgr->allocator_complex);
|
||||
} else {
|
||||
c_deltao = 0;
|
||||
c_tmp_real_field = tmp_real_field;
|
||||
c_tmp_complex_field = tmp_complex_field;
|
||||
lo_AUX0_p = AUX0_p;
|
||||
}
|
||||
|
||||
///setup array for large scale gravitational potential
|
||||
g_lep0 = new Uninit_FFTW_Real_Array(
|
||||
force_mgr->extents_real(), force_mgr->allocator_real);
|
||||
g_lep1 = new Uninit_FFTW_Real_Array(
|
||||
force_mgr->extents_real(), force_mgr->allocator_real);
|
||||
g_lep2 = new Uninit_FFTW_Real_Array(
|
||||
force_mgr->extents_real(), force_mgr->allocator_real);
|
||||
|
||||
Uninit_FFTW_Real_Array f_tmp_p(
|
||||
force_mgr->extents_real(), force_mgr->allocator_real);
|
||||
Uninit_FFTW_Real_Array::array_type &f_tmp = f_tmp_p.get_array();
|
||||
|
||||
c_synthesis_plan = mgr->create_c2r_plan(AUX1_p->data(), aux_p->data());
|
||||
c_analysis_plan = mgr->create_r2c_plan(aux_p->data(), AUX1_p->data());
|
||||
f_synthesis_plan = force_mgr->create_c2r_plan(f_AUX0_p->data(), f_tmp.data());
|
||||
f_analysis_plan = force_mgr->create_r2c_plan(f_tmp.data(), f_AUX0_p->data());
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
BorgLEPModel<CIC>::~BorgLEPModel() {
|
||||
delete AUX0_p;
|
||||
delete AUX1_p;
|
||||
delete aux_p;
|
||||
delete f_AUX0_p;
|
||||
|
||||
if (c_deltao != 0) {
|
||||
delete c_deltao;
|
||||
delete c_tmp_real_field;
|
||||
delete c_tmp_complex_field;
|
||||
delete lo_AUX0_p;
|
||||
}
|
||||
|
||||
mgr->destroy_plan(c_synthesis_plan);
|
||||
mgr->destroy_plan(c_analysis_plan);
|
||||
|
||||
force_mgr->destroy_plan(f_synthesis_plan);
|
||||
force_mgr->destroy_plan(f_analysis_plan);
|
||||
|
||||
delete force_mgr;
|
||||
delete mgr;
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
void BorgLEPModel<CIC>::forwardModel(
|
||||
CArrayRef &delta_init, ArrayRef &delta_output, bool adjointNext) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("BORG-lep MODEL");
|
||||
int nstep = lep_nsteps;
|
||||
|
||||
#ifdef EXPERIMENTAL_ORDERING
|
||||
typedef boost::general_storage_order<3> order_type;
|
||||
typedef order_type::size_type size_type;
|
||||
size_type ordering[] = {0, 2, 1};
|
||||
bool ascending[] = {true, true, true};
|
||||
order_type order(ordering, ascending);
|
||||
u_pos = new UninitializedArray<TapeArray>(
|
||||
extents[nstep][c_localN0 * c_N1 * c_N2][3]);
|
||||
u_vel = new UninitializedArray<TapeArray>(
|
||||
extents[nstep][c_localN0 * c_N1 * c_N2][3], std::allocator<double>(),
|
||||
order);
|
||||
#else
|
||||
u_pos = new UninitializedArray<TapeArray>(
|
||||
extents[nstep][c_localN0 * c_N1 * c_N2][3]);
|
||||
u_vel = new UninitializedArray<TapeArray>(
|
||||
extents[nstep][c_localN0 * c_N1 * c_N2][3]);
|
||||
#endif
|
||||
timing = new TimingArray(extents[6][nstep]);
|
||||
|
||||
lep_fwd_model(
|
||||
delta_init, delta_output, u_pos->get_array(), u_vel->get_array(),
|
||||
*timing);
|
||||
|
||||
if (!forwardModelHold && !adjointNext) {
|
||||
releaseParticles();
|
||||
}
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
void BorgLEPModel<CIC>::adjointModel(ArrayRef &gradient_delta) {
|
||||
int nstep = lep_nsteps;
|
||||
|
||||
lep_fwd_model_ag(
|
||||
gradient_delta, u_pos->get_array(), u_vel->get_array(), gradient_delta,
|
||||
*timing);
|
||||
|
||||
releaseParticles();
|
||||
}
|
||||
|
||||
template class LibLSS::BorgLEPModel<>;
|
||||
|
||||
#ifdef _OPENMP
|
||||
# include "libLSS/physics/openmp_cic.hpp"
|
||||
template class LibLSS::BorgLEPModel<OpenMPCloudInCell<double>>;
|
||||
#endif
|
174
extra/borg/libLSS/physics/forwards/borg_lep.hpp
Normal file
174
extra/borg/libLSS/physics/forwards/borg_lep.hpp
Normal file
|
@ -0,0 +1,174 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/borg_lep.hpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_BORG_LEP_HPP
|
||||
#define __LIBLSS_BORG_LEP_HPP
|
||||
|
||||
#include "libLSS/tools/array_tools.hpp"
|
||||
#include "libLSS/samplers/rgen/hmc/hmc_density_sampler.hpp"
|
||||
#include "libLSS/mcmc/state_element.hpp"
|
||||
#include "libLSS/mcmc/global_state.hpp"
|
||||
#include "libLSS/physics/forwards/borg_helpers.hpp"
|
||||
#include "libLSS/tools/uninitialized_type.hpp"
|
||||
#include "libLSS/tools/push_operators.hpp"
|
||||
#include "libLSS/physics/classic_cic.hpp"
|
||||
#include "libLSS/tools/mpi_fftw_helper.hpp"
|
||||
#include "libLSS/physics/forward_model.hpp"
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
template <typename CIC = ClassicCloudInCell<double>>
|
||||
class BorgLEPModel : public BORGForwardModel {
|
||||
protected:
|
||||
typedef boost::multi_array<double, 2> TimingArray;
|
||||
typedef boost::multi_array<double, 3> TapeArray;
|
||||
typedef boost::multi_array_ref<double, 3> TapeArrayRef;
|
||||
typedef boost::multi_array<double, 1> SinArray;
|
||||
|
||||
typedef typename LibLSS::array::EigenMap<ArrayRef> EMap;
|
||||
typedef UninitializedArray<PhaseArray> U_PhaseArray;
|
||||
typedef UninitializedArray<TapeArray> U_TapeArray;
|
||||
|
||||
TimingArray *timing;
|
||||
U_TapeArray *u_pos, *u_vel;
|
||||
SinArray sin2K[3];
|
||||
|
||||
CArray *AUX1_p, *AUX0_p, *lo_AUX0_p, *f_AUX0_p;
|
||||
CArray *c_deltao;
|
||||
long c_N0, c_N1, c_N2, c_localN0, c_N2_HC, c_startN0, c_N2real;
|
||||
long f_N0, f_N1, f_N2, f_localN0, f_N2_HC, f_startN0;
|
||||
Array *aux_p;
|
||||
DFT_Manager *mgr, *force_mgr;
|
||||
int ss_factor, f_factor, lep_nsteps;
|
||||
bool do_redshift;
|
||||
double z_start;
|
||||
double ai;
|
||||
|
||||
CArrayRef *c_tmp_complex_field;
|
||||
ArrayRef *c_tmp_real_field;
|
||||
|
||||
Uninit_FFTW_Real_Array *g_lep0;
|
||||
Uninit_FFTW_Real_Array *g_lep1;
|
||||
Uninit_FFTW_Real_Array *g_lep2;
|
||||
|
||||
DFT_Manager::Calls::plan_type c_analysis_plan, c_synthesis_plan;
|
||||
DFT_Manager::Calls::plan_type f_analysis_plan, f_synthesis_plan;
|
||||
|
||||
///forward model lep
|
||||
void lep_ic(
|
||||
CArrayRef &deltao, TapeArrayRef &pos, TapeArrayRef &vel,
|
||||
TimingArray &timing);
|
||||
template <typename PositionArray, typename RedshiftPosition>
|
||||
void lep_redshift_pos(
|
||||
const PositionArray &pos, const PositionArray &vel,
|
||||
RedshiftPosition &s_pos);
|
||||
|
||||
template <typename PositionArray>
|
||||
void lep_density_obs(const PositionArray &pos, ArrayRef &deltao);
|
||||
void lep_fwd_model(
|
||||
CArrayRef &deltao, ArrayRef &deltaf, TapeArrayRef &pos,
|
||||
TapeArrayRef &vel, TimingArray &timing);
|
||||
template <typename PositionArray>
|
||||
void lep_gravpot(const PositionArray &pos, FFTW_Real_Array_ref &pot);
|
||||
void lep_vel_update(
|
||||
TapeArrayRef &pos, TapeArrayRef &vel, double dtv, double dDv,
|
||||
int istep);
|
||||
void lep_pos_update(
|
||||
TapeArrayRef &pos, TapeArrayRef &vel, double dtr, double dDr,
|
||||
int istep);
|
||||
void lep_stepping(
|
||||
TapeArrayRef &pos, TapeArrayRef &vel, int nstep, TimingArray &timing);
|
||||
void
|
||||
lep_gen_timesteps(double ai, double af, TimingArray &timing, int nstep);
|
||||
|
||||
//adjoint model lep
|
||||
void lep_ic_ag(
|
||||
PhaseArrayRef &pos_ag, PhaseArrayRef &vel_ag, TimingArray &timing);
|
||||
|
||||
template <typename PositionArray, typename PosAgArray>
|
||||
void lep_redshift_pos_ag(
|
||||
const PositionArray &pos, const PositionArray &vel, PosAgArray &pos_ag,
|
||||
PosAgArray &vel_ag);
|
||||
|
||||
template <typename PositionArray, typename OutputArray>
|
||||
void lep_density_obs_ag(
|
||||
const PositionArray &pos, OutputArray &pos_ag, OutputArray &vel_ag,
|
||||
ArrayRef &B);
|
||||
|
||||
template <typename PositionArray>
|
||||
void lep_gravpot_ag(const PositionArray &pos, FFTW_Real_Array &pot);
|
||||
template <typename PositionArray>
|
||||
void lep_force_0_ag(
|
||||
const PositionArray &pos, const PositionArray &vel,
|
||||
PhaseArrayRef &pos_ag, PhaseArrayRef &vel_ag, PhaseArrayRef &F_ag,
|
||||
double dtr, double dtv);
|
||||
template <typename PositionArray>
|
||||
void lep_force_1_ag(
|
||||
const PositionArray &pos, const PositionArray &vel,
|
||||
PhaseArrayRef &pos_ag, PhaseArrayRef &vel_ag, PhaseArrayRef &F_ag,
|
||||
double dtr, double dtv);
|
||||
template <typename ForceArray>
|
||||
void lep_pos_update_ag(
|
||||
PhaseArrayRef &pos_ag, const ForceArray &F_ag, double dtr);
|
||||
void
|
||||
lep_vel_update_ag(PhaseArrayRef &pos_ag, PhaseArrayRef &vel_ag, double dtr);
|
||||
void lep_stepping_ag(
|
||||
TapeArrayRef &pos, TapeArrayRef &vel, PhaseArrayRef &pos_ag,
|
||||
PhaseArrayRef &vel_ag, int nstep, TimingArray &timing);
|
||||
void lep_fwd_model_ag(
|
||||
ArrayRef &B, TapeArrayRef &pos, TapeArrayRef &vel, ArrayRef &DPSI,
|
||||
TimingArray &timing);
|
||||
|
||||
template <int axis, bool accum, int sign>
|
||||
void compute_force(FFTW_Real_Array_ref &g, FFTW_Real_Array_ref &pot);
|
||||
|
||||
template <int axis, bool accum, int sign>
|
||||
void compute_lep_force(FFTW_Real_Array_ref &g, FFTW_Real_Array_ref &pot);
|
||||
|
||||
void alloc_arrays();
|
||||
void tabulate_sin();
|
||||
|
||||
public:
|
||||
BorgLEPModel(
|
||||
MPI_Communication *comm, const BoxModel &box, int ss_factor,
|
||||
int f_factor, int nsteps, bool do_rsd, double ai, double z_start);
|
||||
virtual ~BorgLEPModel();
|
||||
|
||||
virtual void forwardModel(
|
||||
CArrayRef &delta_init, ArrayRef &delta_output, bool adjointNext);
|
||||
virtual void adjointModel(ArrayRef &gradient_delta);
|
||||
TapeArray::reference getParticlePositions() {
|
||||
return u_pos->get_array()[lep_nsteps - 1];
|
||||
}
|
||||
TapeArray::reference getParticleVelocities() {
|
||||
return u_vel->get_array()[lep_nsteps - 1];
|
||||
}
|
||||
virtual void releaseParticles() {
|
||||
if (u_pos != 0) {
|
||||
delete u_pos;
|
||||
delete u_vel;
|
||||
delete timing;
|
||||
}
|
||||
}
|
||||
|
||||
virtual void forwardModelRsdField(ArrayRef &deltaf, double *vobs_ext);
|
||||
|
||||
virtual void clearAdjointGradient() {}
|
||||
virtual void pushAdjointModelParticles(
|
||||
PhaseArrayRef &grad_pos, PhaseArrayRef &grad_vel) {
|
||||
error_helper<ErrorNotImplemented>(
|
||||
"adjointModelParticles is not implemented in this model.");
|
||||
}
|
||||
|
||||
virtual void retrieveAdjointGradient(CArrayRef &agDeltaInit) {}
|
||||
};
|
||||
|
||||
}; // namespace LibLSS
|
||||
|
||||
#endif
|
202
extra/borg/libLSS/physics/forwards/borg_lpt.cpp
Normal file
202
extra/borg/libLSS/physics/forwards/borg_lpt.cpp
Normal file
|
@ -0,0 +1,202 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/borg_lpt.cpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#include "libLSS/cconfig.h"
|
||||
#include <CosmoTool/algo.hpp>
|
||||
#include <Eigen/Core>
|
||||
#include <boost/format.hpp>
|
||||
#include <boost/bind.hpp>
|
||||
#include <CosmoTool/fourier/fft/fftw_calls.hpp>
|
||||
#include "libLSS/samplers/core/random_number.hpp"
|
||||
#include "libLSS/samplers/core/powerspec_tools.hpp"
|
||||
#include "libLSS/tools/mpi_fftw_helper.hpp"
|
||||
#include "libLSS/tools/array_tools.hpp"
|
||||
#include "libLSS/samplers/rgen/hmc/hmc_density_sampler.hpp"
|
||||
#include "libLSS/physics/cosmo.hpp"
|
||||
#include "libLSS/tools/powerspectrum/measure.hpp"
|
||||
#include "libLSS/physics/forwards/borg_lpt.hpp"
|
||||
#include "libLSS/borg_splash.hpp"
|
||||
#include <H5Cpp.h>
|
||||
#include "libLSS/physics/forwards/particle_balancer/attributes.hpp"
|
||||
#include "libLSS/physics/forwards/particle_balancer/particle_distribute.hpp"
|
||||
#include "always_mpi.hpp"
|
||||
#include "libLSS/physics/forwards/registry.hpp"
|
||||
|
||||
using namespace LibLSS;
|
||||
using namespace LibLSS::BORG_help;
|
||||
|
||||
using CosmoTool::square;
|
||||
|
||||
static const bool TEST_MODEL_RESPONSE = false;
|
||||
static const bool VERBOSE_LPT = false;
|
||||
static const bool DUMP_BORG_DENSITY = false;
|
||||
|
||||
template <typename CIC>
|
||||
BorgLptModel<CIC>::BorgLptModel(
|
||||
MPI_Communication *comm, BoxModel const &box, BoxModel const &box_out,
|
||||
bool rsd, int p_ss_factor, double p_factor, double ai, double af,
|
||||
bool light_cone, double light_cone_boost)
|
||||
: ParticleBasedForwardModel(comm, box, box_out), partFactor(p_factor),
|
||||
lctime(light_cone), lcboost(light_cone_boost), firstTime(true) {
|
||||
using boost::c_storage_order;
|
||||
ConsoleContext<LOG_DEBUG> ctx("BorgLptModel::BorgLptModel");
|
||||
|
||||
setupDefault();
|
||||
BORG::splash_borg();
|
||||
this->invalidCache = true;
|
||||
this->do_rsd = rsd;
|
||||
this->a_init = ai;
|
||||
this->af = af;
|
||||
this->ss_factor = p_ss_factor;
|
||||
|
||||
ctx.format("Part factor = %g", partFactor);
|
||||
|
||||
c_N0 = ss_factor * N0;
|
||||
c_N1 = ss_factor * N1;
|
||||
c_N2 = ss_factor * N2;
|
||||
|
||||
c_N2_HC = c_N2 / 2 + 1;
|
||||
|
||||
ctx.print(
|
||||
format(
|
||||
"Building manager for supersampled resolution: N0=%d, N1=%d, N2=%d") %
|
||||
c_N0 % c_N1 % c_N2);
|
||||
mgr = new DFT_Manager(c_N0, c_N1, c_N2, comm);
|
||||
|
||||
c_N2real = mgr->N2real;
|
||||
c_startN0 = mgr->startN0;
|
||||
c_localN0 = mgr->localN0;
|
||||
|
||||
ctx.print("Allocating AUX1");
|
||||
AUX1_m = mgr->allocate_ptr_complex_array();
|
||||
AUX1_p = &AUX1_m->get_array();
|
||||
ctx.print("Allocating aux");
|
||||
aux_m = mgr->allocate_ptr_array();
|
||||
aux_p = &aux_m->get_array();
|
||||
ctx.print("Allocating AUX0");
|
||||
AUX0_m = mgr->allocate_ptr_complex_array();
|
||||
AUX0_p = &AUX0_m->get_array();
|
||||
|
||||
if (ss_factor > 1) {
|
||||
ctx.print("Allocating c_deltao");
|
||||
c_deltao_m = mgr->allocate_ptr_complex_array();
|
||||
c_deltao = &c_deltao_m->get_array();
|
||||
ctx.print("Allocating c_tmp_real_field");
|
||||
c_tmp_real_field_m = mgr->allocate_ptr_array();
|
||||
c_tmp_real_field = &c_tmp_real_field_m->get_array();
|
||||
ctx.print(
|
||||
format("Number of elements = %d") % c_tmp_real_field->num_elements());
|
||||
ctx.print("Allocating c_tmp_complex_field");
|
||||
c_tmp_complex_field_m = mgr->allocate_ptr_complex_array();
|
||||
c_tmp_complex_field = &c_tmp_complex_field_m->get_array();
|
||||
} else {
|
||||
c_deltao = 0;
|
||||
c_tmp_real_field = &tmp_real_field->get_array();
|
||||
c_tmp_complex_field = &tmp_complex_field->get_array();
|
||||
}
|
||||
|
||||
c_synthesis_plan = mgr->create_c2r_plan(AUX1_p->data(), aux_p->data());
|
||||
c_analysis_plan = mgr->create_r2c_plan(aux_p->data(), AUX1_p->data());
|
||||
|
||||
///initialize light cone timing
|
||||
lc_timing = std::make_shared<U_PArray>(extents[c_localN0 * c_N1 * c_N2][3]);
|
||||
oldParams.h = 0.0;
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
void BorgLptModel<CIC>::updateCosmo() {
|
||||
LIBLSS_AUTO_CONTEXT(LOG_DEBUG, ctx);
|
||||
if (firstTime || oldParams != cosmo_params) {
|
||||
firstTime = false;
|
||||
oldParams = cosmo_params;
|
||||
ctx.print("Cosmo Parameter changed. Rebuild light cone.");
|
||||
|
||||
invalidCache = true;
|
||||
|
||||
gen_light_cone_timing(lc_timing->get_array());
|
||||
}
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
bool BorgLptModel<CIC>::densityInvalidated() const {
|
||||
return invalidCache;
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
BorgLptModel<CIC>::~BorgLptModel() {
|
||||
ConsoleContext<LOG_DEBUG> ctx("BorgLptModel::~BorgLptModel");
|
||||
mgr->destroy_plan(c_synthesis_plan);
|
||||
mgr->destroy_plan(c_analysis_plan);
|
||||
delete mgr;
|
||||
releaseParticles();
|
||||
}
|
||||
|
||||
#include "lpt/borg_fwd_lpt.cpp"
|
||||
#include "lpt/borg_fwd_lpt_adj.cpp"
|
||||
|
||||
template class LibLSS::BorgLptModel<>;
|
||||
|
||||
#include "libLSS/physics/modified_ngp.hpp"
|
||||
template class LibLSS::BorgLptModel<ModifiedNGP<double, NGPGrid::NGP>>;
|
||||
template class LibLSS::BorgLptModel<ModifiedNGP<double, NGPGrid::Quad>>;
|
||||
template class LibLSS::BorgLptModel<ModifiedNGP<double, NGPGrid::Double>>;
|
||||
template class LibLSS::BorgLptModel<ModifiedNGP<double, NGPGrid::CIC>>;
|
||||
|
||||
#include "libLSS/physics/modified_ngp_smooth.hpp"
|
||||
template class LibLSS::BorgLptModel<SmoothModifiedNGP<double, NGPGrid::Quad>>;
|
||||
template class LibLSS::BorgLptModel<SmoothModifiedNGP<double, NGPGrid::CIC>>;
|
||||
|
||||
#ifdef _OPENMP
|
||||
# include "libLSS/physics/openmp_cic.hpp"
|
||||
template class LibLSS::BorgLptModel<OpenMPCloudInCell<double>>;
|
||||
#endif
|
||||
|
||||
template <typename Grid = ClassicCloudInCell<double>>
|
||||
static std::shared_ptr<BORGForwardModel> build_borg_lpt(
|
||||
MPI_Communication *comm, BoxModel const &box, PropertyProxy const ¶ms) {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
double ai = params.get<double>("a_initial");
|
||||
double af = params.get<double>("a_final");
|
||||
bool rsd = params.get<bool>("do_rsd");
|
||||
int ss_factor = params.get<double>("supersampling");
|
||||
bool light_cone = params.get<bool>("lightcone");
|
||||
double p_factor = params.get<double>("part_factor");
|
||||
BoxModel box_out = box;
|
||||
int mul_out = params.get<int>("mul_out", 1);
|
||||
|
||||
box_out.N0 *= mul_out;
|
||||
box_out.N1 *= mul_out;
|
||||
box_out.N2 *= mul_out;
|
||||
|
||||
ctx.format(
|
||||
"ai=%g, af=%g, rsd=%d, ss_factor=%d, p_factor=%d, light_cone=%d", ai, af,
|
||||
rsd, ss_factor, p_factor, light_cone);
|
||||
return std::make_shared<BorgLptModel<Grid>>(
|
||||
comm, box, box_out, rsd, ss_factor, p_factor, ai, af, light_cone);
|
||||
}
|
||||
|
||||
AUTO_REGISTRATOR_IMPL(LIBLSS_REGISTER_NAME(LPT_CIC));
|
||||
AUTO_REGISTRATOR_IMPL(LIBLSS_REGISTER_NAME(LPT_DOUBLE));
|
||||
AUTO_REGISTRATOR_IMPL(LIBLSS_REGISTER_NAME(LPT_NGP));
|
||||
|
||||
#ifdef _OPENMP
|
||||
AUTO_REGISTRATOR_IMPL(LIBLSS_REGISTER_NAME(LPT_CIC_OPENMP));
|
||||
#endif
|
||||
namespace {
|
||||
_RegisterForwardModel
|
||||
MANGLED_LIBLSS_REGISTER_NAME(LPT_CIC)("LPT_CIC", build_borg_lpt<>);
|
||||
_RegisterForwardModel MANGLED_LIBLSS_REGISTER_NAME(LPT_NGP)(
|
||||
"LPT_NGP", build_borg_lpt<ModifiedNGP<double, NGPGrid::NGP>>);
|
||||
_RegisterForwardModel MANGLED_LIBLSS_REGISTER_NAME(LPT_DOUBLE)(
|
||||
"LPT_DOUBLE", build_borg_lpt<ModifiedNGP<double, NGPGrid::Double>>);
|
||||
#ifdef _OPENMP
|
||||
_RegisterForwardModel MANGLED_LIBLSS_REGISTER_NAME(LPT_CIC_OPENMP)(
|
||||
"LPT_CIC_OPENMP", build_borg_lpt<OpenMPCloudInCell<double>>);
|
||||
#endif
|
||||
} // namespace
|
197
extra/borg/libLSS/physics/forwards/borg_lpt.hpp
Normal file
197
extra/borg/libLSS/physics/forwards/borg_lpt.hpp
Normal file
|
@ -0,0 +1,197 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/borg_lpt.hpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_BORG_LPT_HPP
|
||||
#define __LIBLSS_BORG_LPT_HPP
|
||||
|
||||
#include "libLSS/tools/array_tools.hpp"
|
||||
#include "libLSS/tools/auto_interpolator.hpp"
|
||||
#include "libLSS/samplers/rgen/hmc/hmc_density_sampler.hpp"
|
||||
#include "libLSS/mcmc/state_element.hpp"
|
||||
#include "libLSS/mcmc/global_state.hpp"
|
||||
#include "libLSS/physics/forwards/borg_helpers.hpp"
|
||||
#include "libLSS/physics/forward_model.hpp"
|
||||
#include "libLSS/physics/classic_cic.hpp"
|
||||
#include "libLSS/physics/modified_ngp.hpp"
|
||||
#include "libLSS/tools/uninitialized_type.hpp"
|
||||
#include "libLSS/tools/mpi_fftw_helper.hpp"
|
||||
#include "libLSS/physics/forwards/particle_balancer/particle_distribute.hpp"
|
||||
#include <boost/lambda/lambda.hpp>
|
||||
#include <CosmoTool/hdf5_array.hpp>
|
||||
#include "libLSS/physics/forwards/registry.hpp"
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
template <typename CIC = ClassicCloudInCell<double>>
|
||||
class BorgLptModel : virtual public ParticleBasedForwardModel {
|
||||
protected:
|
||||
// Import member variables
|
||||
bool do_rsd;
|
||||
bool invalidCache;
|
||||
double a_init;
|
||||
|
||||
typedef boost::multi_array<size_t, 1> IdxArray;
|
||||
typedef UninitializedArray<PhaseArray> U_PArray;
|
||||
|
||||
std::shared_ptr<U_PArray> u_pos, u_vel, lc_timing, u_s_pos, u_pos_ag,
|
||||
u_vel_ag;
|
||||
std::unique_ptr<IdxArray> lagrangian_id;
|
||||
U_CArray_p AUX1_m, AUX0_m, c_tmp_complex_field_m, c_deltao_m;
|
||||
U_Array_p c_tmp_real_field_m, aux_m;
|
||||
CIC cic;
|
||||
long c_N0, c_N1, c_N2, c_localN0, c_N2_HC, c_startN0, c_N2real;
|
||||
DFT_Manager *mgr;
|
||||
int ss_factor;
|
||||
double af;
|
||||
double partFactor;
|
||||
bool lctime;
|
||||
double lcboost;
|
||||
BalanceInfo realInfo, redshiftInfo;
|
||||
bool firstTime;
|
||||
bool adjointRequired;
|
||||
|
||||
CArray *c_tmp_complex_field, *AUX1_p, *AUX0_p;
|
||||
CArray *c_deltao;
|
||||
Array *c_tmp_real_field;
|
||||
Array *aux_p;
|
||||
|
||||
DFT_Manager::Calls::plan_type c_analysis_plan, c_synthesis_plan;
|
||||
|
||||
///forward model lpt
|
||||
void lpt_ic(
|
||||
CArrayRef &deltao, PhaseArrayRef &pos, PhaseArrayRef &vel,
|
||||
PhaseArrayRef &lctim);
|
||||
void lpt_redshift_pos(
|
||||
PhaseArrayRef &pos, PhaseArrayRef &vel, PhaseArrayRef &s_pos,
|
||||
PhaseArrayRef &lctim);
|
||||
void lpt_density_obs(PhaseArrayRef &pos, ArrayRef &deltao, size_t numParts);
|
||||
void lpt_fwd_model(
|
||||
CArrayRef &deltao, PhaseArrayRef &pos, PhaseArrayRef &vel,
|
||||
PhaseArrayRef &lctim);
|
||||
void forwardModel_rsd_field(ArrayRef &deltaf, double *vobs_ext);
|
||||
void gen_light_cone_timing(PhaseArrayRef &lctim);
|
||||
|
||||
///adjoint model lpt
|
||||
void lpt_ic_ag(
|
||||
PhaseArrayRef &pos_ag, PhaseArrayRef &vel_ag, PhaseArrayRef &lctim);
|
||||
void lpt_redshift_pos_ag(
|
||||
PhaseArrayRef &pos, PhaseArrayRef &vel, PhaseArrayRef &pos_ag,
|
||||
PhaseArrayRef &vel_ag, PhaseArrayRef &lctim);
|
||||
|
||||
template <typename PositionArray>
|
||||
void lpt_density_obs_ag(
|
||||
PositionArray &pos, PhaseArrayRef &pos_ag, PhaseArrayRef &vel_ag,
|
||||
ArrayRef const &B, size_t numParts);
|
||||
void lpt_fwd_model_ag(
|
||||
PhaseArrayRef &pos, PhaseArrayRef &vel, PhaseArrayRef &lctime,
|
||||
CArrayRef &out_ag);
|
||||
|
||||
CosmologicalParameters oldParams;
|
||||
|
||||
void preallocate();
|
||||
void updateCosmo() override;
|
||||
|
||||
public:
|
||||
typedef PhaseArrayRef &ParticleArray;
|
||||
typedef PhaseArrayRef &VelocityArray;
|
||||
|
||||
BorgLptModel(
|
||||
MPI_Communication *comm, BoxModel const &box, BoxModel const &box_out,
|
||||
bool rsd, int ss_factor, double p_factor, double ai, double af,
|
||||
bool light_cone, double light_cone_boost = 1.0);
|
||||
virtual ~BorgLptModel();
|
||||
|
||||
bool densityInvalidated() const override;
|
||||
|
||||
void forwardModel_v2(ModelInput<3> delta_init) override;
|
||||
void adjointModel_v2(ModelInputAdjoint<3> gradient_delta) override;
|
||||
void getDensityFinal(ModelOutput<3> delta_output) override;
|
||||
void getAdjointModelOutput(ModelOutputAdjoint<3> ag_delta_output) override;
|
||||
|
||||
void clearAdjointGradient() override;
|
||||
|
||||
void setAdjointRequired(bool on) override { adjointRequired = on; }
|
||||
|
||||
size_t getNumberOfParticles() const override {
|
||||
return redshiftInfo.localNumParticlesAfter;
|
||||
}
|
||||
unsigned int getSupersamplingRate() const override { return ss_factor; }
|
||||
|
||||
PhaseArrayRef const &lightConeTiming() const {
|
||||
return lc_timing->get_array();
|
||||
}
|
||||
|
||||
PhaseSubArray getParticlePositions() override {
|
||||
boost::multi_array_types::index_gen i_gen;
|
||||
typedef boost::multi_array_types::index_range range;
|
||||
if (!u_pos)
|
||||
error_helper<ErrorBadState>("Particle array already freed.");
|
||||
|
||||
return u_pos->get_array()[i_gen[range(
|
||||
0, redshiftInfo.localNumParticlesAfter)][range()]];
|
||||
}
|
||||
|
||||
PhaseSubArray getParticleVelocities() override {
|
||||
boost::multi_array_types::index_gen i_gen;
|
||||
typedef boost::multi_array_types::index_range range;
|
||||
if (!u_vel)
|
||||
error_helper<ErrorBadState>("Particle array already freed.");
|
||||
|
||||
return u_vel->get_array()[i_gen[range(
|
||||
0, redshiftInfo.localNumParticlesAfter)][range()]];
|
||||
}
|
||||
|
||||
template <typename ArrayOut>
|
||||
void copyParticlePositions(ArrayOut &a, int _ = -1) const {
|
||||
LibLSS::copy_array(a, u_pos->get_array());
|
||||
}
|
||||
|
||||
template <typename ArrayOut>
|
||||
void copyParticleVelocities(ArrayOut &a, int _ = -1) const {
|
||||
LibLSS::copy_array<ArrayOut>(a, u_vel->get_array());
|
||||
}
|
||||
|
||||
void releaseParticles() override {
|
||||
if (u_pos) {
|
||||
u_pos.reset();
|
||||
u_vel.reset();
|
||||
}
|
||||
if (u_s_pos) {
|
||||
u_s_pos.reset();
|
||||
}
|
||||
realInfo.clear();
|
||||
redshiftInfo.clear();
|
||||
lagrangian_id.reset();
|
||||
}
|
||||
|
||||
IdSubArray getLagrangianIdentifiers() const override {
|
||||
boost::multi_array_types::index_gen i_gen;
|
||||
typedef boost::multi_array_types::index_range range;
|
||||
|
||||
return (
|
||||
*lagrangian_id)[i_gen[range(0, redshiftInfo.localNumParticlesAfter)]];
|
||||
}
|
||||
|
||||
void adjointModelParticles(
|
||||
PhaseArrayRef &grad_pos, PhaseArrayRef &grad_vel) override;
|
||||
|
||||
void forwardModelRsdField(ArrayRef &deltaf, double *vobs_ext) override;
|
||||
|
||||
void test_lpt_velocities(MarkovState &state);
|
||||
};
|
||||
|
||||
}; // namespace LibLSS
|
||||
LIBLSS_REGISTER_FORWARD_DECL(LPT_CIC);
|
||||
#ifdef _OPENMP
|
||||
LIBLSS_REGISTER_FORWARD_DECL(LPT_CIC_OPENMP);
|
||||
#endif
|
||||
LIBLSS_REGISTER_FORWARD_DECL(LPT_NGP);
|
||||
LIBLSS_REGISTER_FORWARD_DECL(LPT_DOUBLE);
|
||||
|
||||
#endif
|
182
extra/borg/libLSS/physics/forwards/borg_qlpt.cpp
Normal file
182
extra/borg/libLSS/physics/forwards/borg_qlpt.cpp
Normal file
|
@ -0,0 +1,182 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/borg_qlpt.cpp
|
||||
Copyright (C) 2020 Natalia Porqueres <n.porqueres@imperial.ac.uk>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#include "libLSS/cconfig.h"
|
||||
#include "libLSS/mpi/generic_mpi.hpp"
|
||||
#include <CosmoTool/algo.hpp>
|
||||
#include <Eigen/Core>
|
||||
#include <boost/format.hpp>
|
||||
#include <boost/bind.hpp>
|
||||
#include <CosmoTool/fourier/fft/fftw_calls.hpp>
|
||||
#ifdef ARES_MPI_FFTW
|
||||
# include <CosmoTool/fourier/fft/fftw_calls_mpi.hpp>
|
||||
#endif
|
||||
#include "libLSS/samplers/core/random_number.hpp"
|
||||
#include "libLSS/samplers/core/powerspec_tools.hpp"
|
||||
#include "libLSS/tools/mpi_fftw_helper.hpp"
|
||||
#include "libLSS/tools/array_tools.hpp"
|
||||
#include "libLSS/samplers/rgen/hmc/hmc_density_sampler.hpp"
|
||||
#include "libLSS/physics/cosmo.hpp"
|
||||
#include "libLSS/tools/powerspectrum/measure.hpp"
|
||||
#include "libLSS/physics/forwards/borg_qlpt.hpp"
|
||||
#include "libLSS/borg_splash.hpp"
|
||||
#include <H5Cpp.h>
|
||||
#include "libLSS/physics/forwards/particle_balancer/attributes.hpp"
|
||||
#include "libLSS/physics/forwards/particle_balancer/particle_distribute.hpp"
|
||||
#include "always_mpi.hpp"
|
||||
#include "libLSS/physics/forwards/registry.hpp"
|
||||
|
||||
using namespace LibLSS;
|
||||
using namespace LibLSS::BORG_help;
|
||||
|
||||
using CosmoTool::square;
|
||||
|
||||
static const bool TEST_MODEL_RESPONSE = false;
|
||||
static const bool VERBOSE_QLPT = false;
|
||||
static const bool DUMP_BORG_DENSITY = false;
|
||||
|
||||
BorgQLptModel::BorgQLptModel(
|
||||
MPI_Communication *comm, BoxModel const &box, BoxModel const &box_out,
|
||||
double hbar, bool rsd, int p_ss_factor, double p_factor, double ai,
|
||||
double af, bool light_cone, double light_cone_boost)
|
||||
: BORGForwardModel(comm, box, box_out), partFactor(p_factor),
|
||||
lctime(light_cone), lcboost(light_cone_boost), firstTime(true) {
|
||||
using boost::c_storage_order;
|
||||
ConsoleContext<LOG_DEBUG> ctx("BorgQLptModel::BorgQLptModel");
|
||||
|
||||
setupDefault();
|
||||
BORG::splash_borg();
|
||||
this->do_rsd = rsd;
|
||||
this->a_init = ai;
|
||||
this->af = af;
|
||||
this->ss_factor = p_ss_factor;
|
||||
|
||||
this->hbar = hbar * L0 / 2000. * 256. / N0; //0.07*L0/2000.*256./N0;
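// Rescale hbar with the grid spacing: L0/2000. * 256./N0 equals
// (L0/N0) / (2000./256.), i.e. the ratio of this run's cell size to a
// reference cell size (a 2000 Mpc/h box sampled with 256 cells), so the
// effective hbar tracks the mesh resolution rather than being fixed in Mpc/h.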
|
||||
|
||||
/* if (comm->size() > 1) {
|
||||
error_helper<ErrorParams>("MPI is not fully supported for QLPT forward model. Please use PM in degenerate configuration (nsteps=2, zstart=0)");
|
||||
}
|
||||
*/
|
||||
|
||||
c_N0 = N0;
|
||||
c_N1 = N1;
|
||||
c_N2 = N2;
|
||||
|
||||
c_N2_HC = c_N2 / 2 + 1;
|
||||
|
||||
ctx.print(
|
||||
format(
|
||||
"Building manager for supersampled resolution: N0=%d, N1=%d, N2=%d") %
|
||||
c_N0 % c_N1 % c_N2);
|
||||
mgr = new DFT_Manager(c_N0, c_N1, c_N2, comm);
|
||||
|
||||
N2real = mgr->N2real;
|
||||
startN0 = mgr->startN0;
|
||||
localN0 = mgr->localN0;
|
||||
|
||||
c_localN0 = mgr->localN0;
|
||||
c_startN0 = mgr->startN0;
|
||||
|
||||
potential = lo_mgr->allocate_ptr_array();
|
||||
|
||||
ctx.print("Allocating AUX1");
|
||||
AUX1_m = mgr->allocate_ptr_complex_array();
|
||||
AUX1_p = &AUX1_m->get_array();
|
||||
ctx.print("Allocating aux");
|
||||
aux_m = mgr->allocate_ptr_array();
|
||||
aux_p = &aux_m->get_array();
|
||||
ctx.print("Allocating AUX0");
|
||||
AUX0_m = mgr->allocate_ptr_complex_array();
|
||||
AUX0_p = &AUX0_m->get_array();
|
||||
|
||||
if (ss_factor > 1) {
|
||||
ctx.print("Allocating c_deltao");
|
||||
c_deltao_m = mgr->allocate_ptr_complex_array();
|
||||
c_deltao = &c_deltao_m->get_array();
|
||||
ctx.print("Allocating c_tmp_real_field");
|
||||
c_tmp_real_field_m = mgr->allocate_ptr_array();
|
||||
c_tmp_real_field = &c_tmp_real_field_m->get_array();
|
||||
ctx.print(
|
||||
format("Number of elements = %d") % c_tmp_real_field->num_elements());
|
||||
ctx.print("Allocating c_tmp_complex_field");
|
||||
c_tmp_complex_field_m = mgr->allocate_ptr_complex_array();
|
||||
c_tmp_complex_field = &c_tmp_complex_field_m->get_array();
|
||||
} else {
|
||||
c_deltao = 0;
|
||||
c_tmp_real_field = &tmp_real_field->get_array();
|
||||
c_tmp_complex_field = &tmp_complex_field->get_array();
|
||||
}
|
||||
|
||||
synthesis_plan = mgr->create_c2r_plan(AUX1_p->data(), aux_p->data());
|
||||
analysis_plan = mgr->create_r2c_plan(aux_p->data(), AUX1_p->data());
|
||||
|
||||
///initialize light cone timing
|
||||
lc_timing = std::make_shared<U_PArray>(extents[localN0 * N1 * N2][3]);
|
||||
oldParams.h = 0.0;
|
||||
}
|
||||
|
||||
void BorgQLptModel::updateCosmo() {
|
||||
LIBLSS_AUTO_CONTEXT(LOG_DEBUG, ctx);
|
||||
if (firstTime || oldParams != cosmo_params) {
|
||||
firstTime = false;
|
||||
oldParams = cosmo_params;
|
||||
ctx.print("Cosmo Parameter changed. Rebuild light cone.");
|
||||
|
||||
Cosmology cosmo(cosmo_params);
|
||||
D0 = cosmo.d_plus(a_init);
|
||||
D1 = cosmo.d_plus(af) / D0;
|
||||
Df1 = cosmo.d_plus(af) / D0;
|
||||
f1 = cosmo.g_plus(af);
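// d_plus is normalized to the initial scale factor, so D1 = Df1 =
// D(af)/D(a_init) is the linear growth between the ICs and the output
// time; f1 = g_plus(af) is, presumably, the associated growth-rate
// factor used for velocities/RSD in the included QLPT codelets.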
|
||||
}
|
||||
}
|
||||
|
||||
BorgQLptModel::~BorgQLptModel() {
|
||||
ConsoleContext<LOG_DEBUG> ctx("BorgQLptModel::~BorgQLptModel");
|
||||
delete mgr;
|
||||
releaseParticles();
|
||||
}
|
||||
|
||||
#include "qlpt/borg_fwd_qlpt.cpp"
|
||||
#include "qlpt/borg_fwd_qlpt_adj.cpp"
|
||||
|
||||
static std::shared_ptr<BORGForwardModel> build_borg_qlpt(
|
||||
MPI_Communication *comm, BoxModel const &box, PropertyProxy const ¶ms) {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
double ai = params.get<double>("a_initial");
|
||||
double af = params.get<double>("a_final");
|
||||
bool rsd = params.get<bool>("do_rsd");
|
||||
int ss_factor = params.get<double>("supersampling");
|
||||
bool light_cone = params.get<bool>("lightcone");
|
||||
double p_factor = params.get<double>("part_factor");
|
||||
double hbar = params.get<double>("hbar");
|
||||
BoxModel box_out = box;
|
||||
int mul_out = params.get<int>("mul_out", 1);
|
||||
|
||||
box_out.N0 *= mul_out;
|
||||
box_out.N1 *= mul_out;
|
||||
box_out.N2 *= mul_out;
|
||||
|
||||
ctx.format(
    "ai=%g, af=%g, hbar=%g, rsd=%d, ss_factor=%d, p_factor=%g, light_cone=%d",
    ai, af, hbar, rsd, ss_factor, p_factor, light_cone);
|
||||
return std::make_shared<BorgQLptModel>(
|
||||
comm, box, box_out, hbar, rsd, ss_factor, p_factor, ai, af, light_cone);
|
||||
}
|
||||
|
||||
AUTO_REGISTRATOR_IMPL(LIBLSS_REGISTER_NAME(QLPT_NGP));
|
||||
|
||||
namespace {
|
||||
_RegisterForwardModel
|
||||
MANGLED_LIBLSS_REGISTER_NAME(QLPT_NGP)("QLPT", build_borg_qlpt);
|
||||
|
||||
} // namespace
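// For reference, build_borg_qlpt reads the following keys from the
// PropertyProxy (key names taken from the calls above; the ini-style
// layout and the values are an illustrative sketch only):
//   model         = QLPT
//   a_initial     = 0.001
//   a_final       = 1.0
//   do_rsd        = false
//   supersampling = 1
//   part_factor   = 1.2
//   lightcone     = false
//   hbar          = 0.07
//   mul_out       = 1        (optional, defaults to 1)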
|
||||
|
||||
// ARES TAG: authors_num = 1
|
||||
// ARES TAG: name(0) = Natalia Porqueres
|
||||
// ARES TAG: email(0) = n.porqueres@imperial.ac.uk
|
||||
// ARES TAG: year(0) = 2020
|
130
extra/borg/libLSS/physics/forwards/borg_qlpt.hpp
Normal file
@@ -0,0 +1,130 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/borg_qlpt.hpp
|
||||
Copyright (C) 2020 Natalia Porqueres <n.porqueres@imperial.ac.uk>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_BORG_QLPT_HPP
|
||||
# define __LIBLSS_BORG_QLPT_HPP
|
||||
|
||||
# include "libLSS/mpi/generic_mpi.hpp"
|
||||
# include "libLSS/tools/array_tools.hpp"
|
||||
# include "libLSS/tools/auto_interpolator.hpp"
|
||||
# include "libLSS/samplers/rgen/hmc/hmc_density_sampler.hpp"
|
||||
# include "libLSS/mcmc/state_element.hpp"
|
||||
# include "libLSS/mcmc/global_state.hpp"
|
||||
# include "libLSS/physics/forwards/borg_helpers.hpp"
|
||||
# include "libLSS/physics/forward_model.hpp"
|
||||
# include "libLSS/physics/modified_ngp.hpp"
|
||||
# include "libLSS/tools/uninitialized_type.hpp"
|
||||
# include "libLSS/tools/mpi_fftw_helper.hpp"
|
||||
# include "libLSS/physics/forwards/particle_balancer/particle_distribute.hpp"
|
||||
# include <boost/lambda/lambda.hpp>
|
||||
# include <CosmoTool/hdf5_array.hpp>
|
||||
# include "libLSS/physics/forwards/registry.hpp"
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
class BorgQLptModel : virtual public BORGForwardModel {
|
||||
protected:
|
||||
// Import member variables
|
||||
bool do_rsd;
|
||||
double a_init;
|
||||
|
||||
typedef boost::multi_array<size_t, 1> IdxArray;
|
||||
typedef UninitializedArray<PhaseArray> U_PArray;
|
||||
typedef UninitializedArray<DFT_Manager::ArrayReal, DFT_Manager::AllocReal>
|
||||
U_ArrayReal;
|
||||
|
||||
std::shared_ptr<U_PArray> u_pos, u_vel, lc_timing, u_s_pos, u_pos_ag,
|
||||
u_vel_ag;
|
||||
std::unique_ptr<IdxArray> lagrangian_id;
|
||||
std::unique_ptr<U_ArrayReal> potential;
|
||||
|
||||
U_CArray_p AUX1_m, AUX0_m, c_tmp_complex_field_m, c_deltao_m;
|
||||
U_Array_p c_tmp_real_field_m, aux_m;
|
||||
long c_N0, c_N1, c_N2, c_localN0, c_N2_HC, c_startN0, c_N2real;
|
||||
DFT_Manager *mgr;
|
||||
int ss_factor;
|
||||
double af;
|
||||
double partFactor;
|
||||
bool lctime;
|
||||
double lcboost;
|
||||
BalanceInfo realInfo, redshiftInfo;
|
||||
bool firstTime;
|
||||
bool adjointRequired;
|
||||
double hbar;
|
||||
double D0, D1, Df1, f1;
|
||||
|
||||
ModelInputAdjoint<3> hold_in_gradient;
|
||||
|
||||
CArray *c_tmp_complex_field, *AUX1_p, *AUX0_p;
|
||||
CArray *c_deltao;
|
||||
Array *c_tmp_real_field;
|
||||
Array *aux_p;
|
||||
|
||||
DFT_Manager::Calls::plan_type c_analysis_plan, c_synthesis_plan, plan;
|
||||
|
||||
///forward model qlpt
|
||||
void test();
|
||||
void qlpt_ic(CArrayRef &deltao, PhaseArrayRef &lctim);
|
||||
void qlpt_redshift_pos(PhaseArrayRef &pos, PhaseArrayRef &lctim);
|
||||
void qlpt_density_obs(ArrayRef &deltao, size_t numParts);
|
||||
void qlpt_fwd_model(CArrayRef &deltao, PhaseArrayRef &lctim);
|
||||
void forwardModel_rsd_field(ArrayRef &deltaf, double *vobs_ext);
|
||||
|
||||
///adjoint model qlpt
|
||||
void
|
||||
qlpt_fwd_model_ag(PhaseArrayRef &lctime, ArrayRef &in_ag, ArrayRef &out_ag);
|
||||
|
||||
CosmologicalParameters oldParams;
|
||||
void preallocate();
|
||||
void updateCosmo() override;
|
||||
|
||||
public:
|
||||
typedef PhaseArrayRef &ParticleArray;
|
||||
typedef PhaseArrayRef &VelocityArray;
|
||||
|
||||
BorgQLptModel(
|
||||
MPI_Communication *comm, BoxModel const &box, BoxModel const &box_out,
|
||||
double hbar, bool rsd, int ss_factor, double p_factor, double ai,
|
||||
double af, bool light_cone, double light_cone_boost = 1.0);
|
||||
virtual ~BorgQLptModel();
|
||||
|
||||
//virtual void forwardModelSimple(ModelInput<3> &delta_init);
|
||||
//virtual void forwardModel(
|
||||
// ModelInput<3> &delta_init, ModelInput<3> &delta_output, bool adjointNext);
|
||||
//virtual void adjointModel(ModelInput<3> &gradient_delta);
|
||||
|
||||
void forwardModel_v2(ModelInput<3> delta_init) override;
|
||||
void adjointModel_v2(ModelInputAdjoint<3> gradient_delta) override;
|
||||
void getDensityFinal(ModelOutput<3> delta_output) override;
|
||||
void getAdjointModelOutput(ModelOutputAdjoint<3> ag_delta_output) override;
|
||||
|
||||
void clearAdjointGradient() override;
|
||||
void setAdjointRequired(bool on) override { adjointRequired = on; }
|
||||
|
||||
unsigned int getSupersamplingRate() const { return ss_factor; }
|
||||
|
||||
PhaseArrayRef const &lightConeTiming() const {
|
||||
return lc_timing->get_array();
|
||||
}
|
||||
|
||||
void releaseParticles() override {}
|
||||
|
||||
void forwardModelRsdField(ArrayRef &deltaf, double *vobs_ext) override;
|
||||
|
||||
void test_qlpt_velocities(MarkovState &state);
|
||||
};
|
||||
|
||||
}; // namespace LibLSS
|
||||
|
||||
LIBLSS_REGISTER_FORWARD_DECL(QLPT_NGP);
|
||||
|
||||
#endif
|
||||
// ARES TAG: authors_num = 1
|
||||
// ARES TAG: name(0) = Natalia Porqueres
|
||||
// ARES TAG: email(0) = n.porqueres@imperial.ac.uk
|
||||
// ARES TAG: year(0) = 2020
|
181
extra/borg/libLSS/physics/forwards/borg_qlpt_rsd.cpp
Normal file
@@ -0,0 +1,181 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/borg_qlpt_rsd.cpp
|
||||
Copyright (C) 2020 Natalia Porqueres <n.porqueres@imperial.ac.uk>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#include "libLSS/cconfig.h"
|
||||
#include "libLSS/mpi/generic_mpi.hpp"
|
||||
#include <CosmoTool/algo.hpp>
|
||||
#include <Eigen/Core>
|
||||
#include <boost/format.hpp>
|
||||
#include <boost/bind.hpp>
|
||||
#include <CosmoTool/fourier/fft/fftw_calls.hpp>
|
||||
#ifdef ARES_MPI_FFTW
|
||||
# include <CosmoTool/fourier/fft/fftw_calls_mpi.hpp>
|
||||
#endif
|
||||
#include "libLSS/samplers/core/random_number.hpp"
|
||||
#include "libLSS/samplers/core/powerspec_tools.hpp"
|
||||
#include "libLSS/tools/mpi_fftw_helper.hpp"
|
||||
#include "libLSS/tools/array_tools.hpp"
|
||||
#include "libLSS/samplers/rgen/hmc/hmc_density_sampler.hpp"
|
||||
#include "libLSS/physics/cosmo.hpp"
|
||||
#include "libLSS/tools/powerspectrum/measure.hpp"
|
||||
#include "libLSS/physics/forwards/borg_qlpt_rsd.hpp"
|
||||
#include "libLSS/borg_splash.hpp"
|
||||
#include <H5Cpp.h>
|
||||
#include "libLSS/physics/forwards/particle_balancer/attributes.hpp"
|
||||
#include "libLSS/physics/forwards/particle_balancer/particle_distribute.hpp"
|
||||
#include "always_mpi.hpp"
|
||||
#include "libLSS/physics/forwards/registry.hpp"
|
||||
|
||||
using namespace LibLSS;
|
||||
using namespace LibLSS::BORG_help;
|
||||
|
||||
using CosmoTool::square;
|
||||
|
||||
static const bool TEST_MODEL_RESPONSE = false;
|
||||
static const bool VERBOSE_QLPT_RSD = false;
|
||||
static const bool DUMP_BORG_DENSITY = false;
|
||||
|
||||
BorgQLptRsdModel::BorgQLptRsdModel(
|
||||
MPI_Communication *comm, BoxModel const &box, BoxModel const &box_out,
|
||||
double hbar, bool rsd, int p_ss_factor, double p_factor, double ai,
|
||||
double af, bool light_cone, double light_cone_boost)
|
||||
: BORGForwardModel(comm, box, box_out), partFactor(p_factor),
|
||||
lctime(light_cone), lcboost(light_cone_boost), firstTime(true) {
|
||||
using boost::c_storage_order;
|
||||
ConsoleContext<LOG_DEBUG> ctx("BorgQLptRsdModel::BorgQLptRsdModel");
|
||||
|
||||
setupDefault();
|
||||
BORG::splash_borg();
|
||||
this->do_rsd = rsd;
|
||||
this->a_init = ai;
|
||||
this->af = af;
|
||||
this->ss_factor = p_ss_factor;
|
||||
|
||||
this->hbar = hbar * L0 / 2000. * 256. / N0; //0.07*L0/2000.*256./N0;
|
||||
|
||||
/* if (comm->size() > 1) {
|
||||
error_helper<ErrorParams>("MPI is not fully supported for QLPT-RSD forward model. Please use PM in degenerate configuration (nsteps=2, zstart=0)");
|
||||
}
|
||||
*/
|
||||
|
||||
c_N0 = N0;
|
||||
c_N1 = N1;
|
||||
c_N2 = N2;
|
||||
|
||||
c_N2_HC = c_N2 / 2 + 1;
|
||||
|
||||
ctx.print(
|
||||
format(
|
||||
"Building manager for supersampled resolution: N0=%d, N1=%d, N2=%d") %
|
||||
c_N0 % c_N1 % c_N2);
|
||||
mgr = new DFT_Manager(c_N0, c_N1, c_N2, comm);
|
||||
|
||||
N2real = mgr->N2real;
|
||||
startN0 = mgr->startN0;
|
||||
localN0 = mgr->localN0;
|
||||
|
||||
c_localN0 = mgr->localN0;
|
||||
c_startN0 = mgr->startN0;
|
||||
|
||||
potential = lo_mgr->allocate_ptr_array();
|
||||
|
||||
ctx.print("Allocating AUX1");
|
||||
AUX1_m = mgr->allocate_ptr_complex_array();
|
||||
AUX1_p = &AUX1_m->get_array();
|
||||
ctx.print("Allocating aux");
|
||||
aux_m = mgr->allocate_ptr_array();
|
||||
aux_p = &aux_m->get_array();
|
||||
ctx.print("Allocating AUX0");
|
||||
AUX0_m = mgr->allocate_ptr_complex_array();
|
||||
AUX0_p = &AUX0_m->get_array();
|
||||
|
||||
if (ss_factor > 1) {
|
||||
ctx.print("Allocating c_deltao");
|
||||
c_deltao_m = mgr->allocate_ptr_complex_array();
|
||||
c_deltao = &c_deltao_m->get_array();
|
||||
ctx.print("Allocating c_tmp_real_field");
|
||||
c_tmp_real_field_m = mgr->allocate_ptr_array();
|
||||
c_tmp_real_field = &c_tmp_real_field_m->get_array();
|
||||
ctx.print(
|
||||
format("Number of elements = %d") % c_tmp_real_field->num_elements());
|
||||
ctx.print("Allocating c_tmp_complex_field");
|
||||
c_tmp_complex_field_m = mgr->allocate_ptr_complex_array();
|
||||
c_tmp_complex_field = &c_tmp_complex_field_m->get_array();
|
||||
} else {
|
||||
c_deltao = 0;
|
||||
c_tmp_real_field = &tmp_real_field->get_array();
|
||||
c_tmp_complex_field = &tmp_complex_field->get_array();
|
||||
}
|
||||
|
||||
synthesis_plan = mgr->create_c2r_plan(AUX1_p->data(), aux_p->data());
|
||||
analysis_plan = mgr->create_r2c_plan(aux_p->data(), AUX1_p->data());
|
||||
|
||||
///initialize light cone timing
|
||||
lc_timing = std::make_shared<U_PArray>(extents[localN0 * N1 * N2][3]);
|
||||
oldParams.h = 0.0;
|
||||
}
|
||||
|
||||
void BorgQLptRsdModel::updateCosmo() {
|
||||
LIBLSS_AUTO_CONTEXT(LOG_DEBUG, ctx);
|
||||
if (firstTime || oldParams != cosmo_params) {
|
||||
firstTime = false;
|
||||
oldParams = cosmo_params;
|
||||
ctx.print("Cosmo Parameter changed. Rebuild light cone.");
|
||||
|
||||
Cosmology cosmo(cosmo_params);
|
||||
D0 = cosmo.d_plus(a_init);
|
||||
D1 = cosmo.d_plus(af) / D0;
|
||||
Df1 = cosmo.d_plus(af) / D0;
|
||||
f1 = cosmo.g_plus(af);
|
||||
}
|
||||
}
|
||||
|
||||
BorgQLptRsdModel::~BorgQLptRsdModel() {
|
||||
ConsoleContext<LOG_DEBUG> ctx("BorgQLptRsdModel::~BorgQLptRsdModel");
|
||||
delete mgr;
|
||||
releaseParticles();
|
||||
}
|
||||
|
||||
#include "qlpt_rsd/borg_fwd_qlpt_rsd.cpp"
|
||||
#include "qlpt_rsd/borg_fwd_qlpt_rsd_adj.cpp"
|
||||
|
||||
static std::shared_ptr<BORGForwardModel> build_borg_qlpt_rsd(
|
||||
MPI_Communication *comm, BoxModel const &box, PropertyProxy const ¶ms) {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
double ai = params.get<double>("a_initial");
|
||||
double af = params.get<double>("a_final");
|
||||
bool rsd = params.get<bool>("do_rsd");
|
||||
int ss_factor = params.get<double>("supersampling");
|
||||
bool light_cone = params.get<bool>("lightcone");
|
||||
double p_factor = params.get<double>("part_factor");
|
||||
double hbar = params.get<double>("hbar");
|
||||
BoxModel box_out = box;
|
||||
int mul_out = params.get<int>("mul_out", 1);
|
||||
|
||||
box_out.N0 *= mul_out;
|
||||
box_out.N1 *= mul_out;
|
||||
box_out.N2 *= mul_out;
|
||||
|
||||
ctx.format(
    "ai=%g, af=%g, hbar=%g, rsd=%d, ss_factor=%d, p_factor=%g, light_cone=%d",
    ai, af, hbar, rsd, ss_factor, p_factor, light_cone);
|
||||
return std::make_shared<BorgQLptRsdModel>(
|
||||
comm, box, box_out, hbar, rsd, ss_factor, p_factor, ai, af, light_cone);
|
||||
}
|
||||
|
||||
AUTO_REGISTRATOR_IMPL(LIBLSS_REGISTER_NAME(QLPT_RSD));
|
||||
|
||||
namespace {
|
||||
_RegisterForwardModel MANGLED_LIBLSS_REGISTER_NAME(QLPT_RSD)(
|
||||
"QLPT_RSD", build_borg_qlpt_rsd);
|
||||
} // namespace
|
||||
|
||||
// ARES TAG: authors_num = 1
|
||||
// ARES TAG: name(0) = Natalia Porqueres
|
||||
// ARES TAG: email(0) = n.porqueres@imperial.ac.uk
|
||||
// ARES TAG: year(0) = 2020
|
123
extra/borg/libLSS/physics/forwards/borg_qlpt_rsd.hpp
Normal file
@@ -0,0 +1,123 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/borg_qlpt_rsd.hpp
|
||||
Copyright (C) 2020 Natalia Porqueres <n.porqueres@imperial.ac.uk>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_BORG_QLPT_RSD_HPP
|
||||
# define __LIBLSS_BORG_QLPT_RSD_HPP
|
||||
|
||||
# include "libLSS/mpi/generic_mpi.hpp"
|
||||
# include "libLSS/tools/array_tools.hpp"
|
||||
# include "libLSS/tools/auto_interpolator.hpp"
|
||||
# include "libLSS/samplers/rgen/hmc/hmc_density_sampler.hpp"
|
||||
# include "libLSS/mcmc/state_element.hpp"
|
||||
# include "libLSS/mcmc/global_state.hpp"
|
||||
# include "libLSS/physics/forwards/borg_helpers.hpp"
|
||||
# include "libLSS/physics/forward_model.hpp"
|
||||
# include "libLSS/physics/modified_ngp.hpp"
|
||||
# include "libLSS/tools/uninitialized_type.hpp"
|
||||
# include "libLSS/tools/mpi_fftw_helper.hpp"
|
||||
# include "libLSS/physics/forwards/particle_balancer/particle_distribute.hpp"
|
||||
# include <boost/lambda/lambda.hpp>
|
||||
# include <CosmoTool/hdf5_array.hpp>
|
||||
# include "libLSS/physics/forwards/registry.hpp"
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
class BorgQLptRsdModel : virtual public BORGForwardModel {
|
||||
protected:
|
||||
// Import member variables
|
||||
bool do_rsd;
|
||||
double a_init;
|
||||
|
||||
typedef boost::multi_array<size_t, 1> IdxArray;
|
||||
typedef UninitializedArray<PhaseArray> U_PArray;
|
||||
typedef UninitializedArray<DFT_Manager::ArrayReal, DFT_Manager::AllocReal>
|
||||
U_ArrayReal;
|
||||
|
||||
std::shared_ptr<U_PArray> u_pos, u_vel, lc_timing, u_s_pos, u_pos_ag,
|
||||
u_vel_ag;
|
||||
std::unique_ptr<IdxArray> lagrangian_id;
|
||||
std::unique_ptr<U_ArrayReal> potential;
|
||||
|
||||
U_CArray_p AUX1_m, AUX0_m, c_tmp_complex_field_m, c_deltao_m;
|
||||
U_Array_p c_tmp_real_field_m, aux_m;
|
||||
long c_N0, c_N1, c_N2, c_localN0, c_N2_HC, c_startN0, c_N2real;
|
||||
DFT_Manager *mgr;
|
||||
int ss_factor;
|
||||
double af;
|
||||
double partFactor;
|
||||
bool lctime;
|
||||
double lcboost;
|
||||
BalanceInfo realInfo, redshiftInfo;
|
||||
bool firstTime;
|
||||
bool adjointRequired;
|
||||
double hbar;
|
||||
double D0, D1, Df1, f1;
|
||||
|
||||
ModelInputAdjoint<3> hold_in_gradient;
|
||||
|
||||
CArray *c_tmp_complex_field, *AUX1_p, *AUX0_p;
|
||||
CArray *c_deltao;
|
||||
Array *c_tmp_real_field;
|
||||
Array *aux_p;
|
||||
|
||||
DFT_Manager::Calls::plan_type c_analysis_plan, c_synthesis_plan, plan;
|
||||
|
||||
///forward model qlpt_rsd
|
||||
void test();
|
||||
void qlpt_rsd_ic(CArrayRef &deltao, PhaseArrayRef &lctim);
|
||||
void qlpt_rsd_redshift_pos(PhaseArrayRef &pos, PhaseArrayRef &lctim);
|
||||
void qlpt_rsd_density_obs(ArrayRef &deltao, size_t numParts);
|
||||
void qlpt_rsd_fwd_model(CArrayRef &deltao, PhaseArrayRef &lctim);
|
||||
void forwardModel_rsd_field(ArrayRef &deltaf, double *vobs_ext);
|
||||
|
||||
///adjoint model qlpt_rsd
|
||||
void qlpt_rsd_fwd_model_ag(PhaseArrayRef &lctime, ArrayRef &in_ag, ArrayRef &out_ag);
|
||||
|
||||
CosmologicalParameters oldParams;
|
||||
void preallocate();
|
||||
void updateCosmo() override;
|
||||
|
||||
public:
|
||||
typedef PhaseArrayRef &ParticleArray;
|
||||
typedef PhaseArrayRef &VelocityArray;
|
||||
|
||||
BorgQLptRsdModel(
|
||||
MPI_Communication *comm, BoxModel const &box, BoxModel const &box_out,
|
||||
double hbar, bool rsd, int ss_factor, double p_factor, double ai,
|
||||
double af, bool light_cone, double light_cone_boost = 1.0);
|
||||
virtual ~BorgQLptRsdModel();
|
||||
|
||||
//virtual void forwardModelSimple(ModelInput<3> &delta_init);
|
||||
//virtual void forwardModel(
|
||||
// ModelInput<3> &delta_init, ModelInput<3> &delta_output, bool adjointNext);
|
||||
//virtual void adjointModel(ModelInput<3> &gradient_delta);
|
||||
|
||||
void forwardModel_v2(ModelInput<3> delta_init) override;
|
||||
void adjointModel_v2(ModelInputAdjoint<3> gradient_delta) override;
|
||||
void getDensityFinal(ModelOutput<3> delta_output) override;
|
||||
void getAdjointModelOutput(ModelOutputAdjoint<3> ag_delta_output) override;
|
||||
|
||||
void clearAdjointGradient() override;
|
||||
void setAdjointRequired(bool on) override { adjointRequired = on; }
|
||||
|
||||
void releaseParticles() override {}
|
||||
|
||||
void forwardModelRsdField(ArrayRef &deltaf, double *vobs_ext) override;
|
||||
|
||||
void test_qlpt_rsd_velocities(MarkovState &state);
|
||||
};
|
||||
|
||||
}; // namespace LibLSS
|
||||
|
||||
LIBLSS_REGISTER_FORWARD_DECL(QLPT_RSD);
|
||||
|
||||
#endif
|
||||
// ARES TAG: authors_num = 1
|
||||
// ARES TAG: name(0) = Natalia Porqueres
|
||||
// ARES TAG: email(0) = n.porqueres@imperial.ac.uk
|
||||
// ARES TAG: year(0) = 2020
|
355
extra/borg/libLSS/physics/forwards/deprecated/borg_pm.cpp
Normal file
@@ -0,0 +1,355 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/deprecated/borg_pm.cpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#include <CosmoTool/algo.hpp>
|
||||
#include <Eigen/Core>
|
||||
#include <boost/array.hpp>
|
||||
#include <boost/format.hpp>
|
||||
#include <boost/bind.hpp>
|
||||
#include <boost/mpl/for_each.hpp>
|
||||
#include <boost/mpl/range_c.hpp>
|
||||
#include <CosmoTool/fourier/fft/fftw_calls.hpp>
|
||||
#include "libLSS/samplers/core/random_number.hpp"
|
||||
#include "libLSS/samplers/core/powerspec_tools.hpp"
|
||||
#include "libLSS/tools/mpi_fftw_helper.hpp"
|
||||
#include "libLSS/tools/array_tools.hpp"
|
||||
#include "libLSS/samplers/rgen/hmc/hmc_density_sampler.hpp"
|
||||
#include "libLSS/physics/cosmo.hpp"
|
||||
#include "libLSS/tools/powerspectrum/measure.hpp"
|
||||
#include "libLSS/physics/forwards/deprecated/borg_pm.hpp"
|
||||
#include "libLSS/tools/fused_array.hpp"
|
||||
#include "libLSS/borg_splash.hpp"
|
||||
#include "libLSS/physics/forwards/particle_balancer/attributes.hpp"
|
||||
#include "libLSS/physics/forwards/particle_balancer/particle_distribute.hpp"
|
||||
|
||||
#undef EXPERIMENTAL_ORDERING
|
||||
|
||||
using namespace LibLSS;
|
||||
using namespace LibLSS::BORG_help;
|
||||
using CosmoTool::square;
|
||||
|
||||
static const bool SKIP_MPI_FOR_SINGLE_NODE = true;
|
||||
static const bool FORCE_REDISTRIBUTE = false;
|
||||
|
||||
static inline bool ALWAYS_MPI(MPI_Communication *comm) {
|
||||
return (!SKIP_MPI_FOR_SINGLE_NODE || comm->size() > 1);
|
||||
}
|
||||
|
||||
#include "pm/borg_fwd_pm.cpp"
|
||||
#include "pm/borg_fwd_pm_adj.cpp"
|
||||
|
||||
template <typename FIC, typename CIC>
|
||||
void BorgPMModel<FIC, CIC>::tabulate_sin() {
|
||||
sin2K[0].resize(boost::extents[f_N0]);
|
||||
sin2K[1].resize(boost::extents[f_N1]);
|
||||
sin2K[2].resize(boost::extents[f_N2]);
|
||||
|
||||
for (int i = 0; i < f_N0; i++) {
|
||||
sin2K[0][i] = square(sin(M_PI * i / double(f_N0)) * 2 * f_N0 / L0);
|
||||
}
|
||||
|
||||
for (int i = 0; i < f_N1; i++) {
|
||||
sin2K[1][i] = square(sin(M_PI * i / double(f_N1)) * 2 * f_N1 / L1);
|
||||
}
|
||||
|
||||
for (int i = 0; i < f_N2; i++) {
|
||||
sin2K[2][i] = square(sin(M_PI * i / double(f_N2)) * 2 * f_N2 / L2);
|
||||
}
|
||||
}
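// The tables built above store the squared effective wavenumbers of the
// finite-difference Laplacian on the force mesh,
//   k_eff,i^2 = [ (2 N_i / L_i) * sin(pi * n / N_i) ]^2,
// which reduce to the continuum k^2 for small n. They are presumably used
// to invert the Poisson equation in Fourier space inside the included PM
// codelets.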
|
||||
|
||||
template <typename FIC, typename CIC>
|
||||
BorgPMModel<FIC, CIC>::BorgPMModel(
|
||||
MPI_Communication *comm, const BoxModel &box, int ss_factor, int f_factor,
|
||||
int pm_nsteps, double p_factor, bool do_rsd, double ai, double af,
|
||||
double z_start)
|
||||
: ParticleBasedForwardModel(comm, box) {
|
||||
this->ss_factor = ss_factor;
|
||||
this->f_factor = f_factor;
|
||||
this->pm_nsteps = pm_nsteps;
|
||||
this->z_start = z_start;
|
||||
this->do_redshift = do_rsd;
|
||||
this->ai = ai;
|
||||
this->af = af;
|
||||
this->part_factor = p_factor;
|
||||
|
||||
u_pos = 0;
|
||||
u_vel = 0;
|
||||
|
||||
if (pm_nsteps < 2) {
|
||||
error_helper<ErrorParams>(
|
||||
"BORG_PM is not defined for less than 2 PM steps.");
|
||||
}
|
||||
|
||||
BORG::splash_borg();
|
||||
|
||||
alloc_arrays();
|
||||
tabulate_sin();
|
||||
}
|
||||
|
||||
template <typename FIC, typename CIC>
|
||||
void BorgPMModel<FIC, CIC>::alloc_arrays() {
|
||||
using boost::c_storage_order;
|
||||
|
||||
c_N0 = ss_factor * N0;
|
||||
c_N1 = ss_factor * N1;
|
||||
c_N2 = ss_factor * N2;
|
||||
c_N2_HC = c_N2 / 2 + 1;
|
||||
|
||||
f_N0 = f_factor * N0;
|
||||
f_N1 = f_factor * N1;
|
||||
f_N2 = f_factor * N2;
|
||||
f_N2_HC = f_N2 / 2 + 1;
|
||||
|
||||
force_mgr = std::make_unique<DFT_Manager>(f_N0, f_N1, f_N2, comm);
|
||||
mgr = std::make_unique<DFT_Manager>(c_N0, c_N1, c_N2, comm);
|
||||
|
||||
// When RSD is activated we need another final step.
|
||||
int real_nsteps = pm_nsteps + (do_redshift ? 1 : 0);
|
||||
|
||||
numTransferStep.resize(boost::extents[real_nsteps][comm->size()]);
|
||||
numReceiveStep.resize(boost::extents[real_nsteps][comm->size()]);
|
||||
offsetReceiveStep.resize(boost::extents[real_nsteps][1 + comm->size()]);
|
||||
offsetSendStep.resize(boost::extents[real_nsteps][1 + comm->size()]);
|
||||
local_usedParticles.resize(boost::extents[real_nsteps]);
|
||||
|
||||
c_startN0 = mgr->startN0;
|
||||
c_localN0 = mgr->localN0;
|
||||
f_startN0 = force_mgr->startN0;
|
||||
f_localN0 = force_mgr->localN0;
|
||||
f_startN1 = force_mgr->startN1;
|
||||
f_localN1 = force_mgr->localN1;
|
||||
|
||||
AUX1_p = new CArray(
|
||||
mgr->extents_complex(), c_storage_order(), mgr->allocator_complex);
|
||||
aux_p =
|
||||
new Array(mgr->extents_real(), c_storage_order(), mgr->allocator_real);
|
||||
AUX0_p = new CArray(
|
||||
mgr->extents_complex(), c_storage_order(), mgr->allocator_complex);
|
||||
#ifdef ARES_MPI_FFTW
|
||||
f_AUX0_p = new CArray(
|
||||
force_mgr->extents_complex_transposed(), c_storage_order(),
|
||||
force_mgr->allocator_complex);
|
||||
#else
|
||||
f_AUX0_p = new CArray(
|
||||
force_mgr->extents_complex(), c_storage_order(),
|
||||
force_mgr->allocator_complex);
|
||||
#endif
|
||||
|
||||
if (ss_factor > 1) {
|
||||
c_deltao = new CArray(
|
||||
mgr->extents_complex(), c_storage_order(), mgr->allocator_complex);
|
||||
c_tmp_real_field =
|
||||
new Array(mgr->extents_real(), c_storage_order(), mgr->allocator_real);
|
||||
c_tmp_complex_field = new CArray(
|
||||
mgr->extents_complex(), c_storage_order(), mgr->allocator_complex);
|
||||
lo_AUX0_p = new CArray(
|
||||
lo_mgr->extents_complex(), c_storage_order(),
|
||||
lo_mgr->allocator_complex);
|
||||
} else {
|
||||
c_deltao = 0;
|
||||
c_tmp_real_field = tmp_real_field;
|
||||
c_tmp_complex_field = tmp_complex_field;
|
||||
lo_AUX0_p = new CArray(
|
||||
lo_mgr->extents_complex(), c_storage_order(),
|
||||
lo_mgr->allocator_complex); //AUX0_p;
|
||||
}
|
||||
|
||||
Uninit_FFTW_Real_Array f_tmp_p(
|
||||
force_mgr->extents_real(), force_mgr->allocator_real);
|
||||
Uninit_FFTW_Real_Array::array_type &f_tmp = f_tmp_p.get_array();
|
||||
|
||||
c_synthesis_plan = mgr->create_c2r_plan(AUX1_p->data(), aux_p->data());
|
||||
c_analysis_plan = mgr->create_r2c_plan(aux_p->data(), AUX1_p->data());
|
||||
f_synthesis_plan =
|
||||
force_mgr->create_c2r_plan(f_AUX0_p->data(), f_tmp.data(), true);
|
||||
f_analysis_plan =
|
||||
force_mgr->create_r2c_plan(f_tmp.data(), f_AUX0_p->data(), true);
|
||||
}
|
||||
|
||||
template <typename FIC, typename CIC>
|
||||
BorgPMModel<FIC, CIC>::~BorgPMModel() {
|
||||
delete AUX0_p;
|
||||
delete AUX1_p;
|
||||
delete aux_p;
|
||||
delete f_AUX0_p;
|
||||
delete lo_AUX0_p;
|
||||
|
||||
if (c_deltao != 0) {
|
||||
delete c_deltao;
|
||||
delete c_tmp_real_field;
|
||||
delete c_tmp_complex_field;
|
||||
}
|
||||
|
||||
mgr->destroy_plan(c_synthesis_plan);
|
||||
mgr->destroy_plan(c_analysis_plan);
|
||||
|
||||
force_mgr->destroy_plan(f_synthesis_plan);
|
||||
force_mgr->destroy_plan(f_analysis_plan);
|
||||
}
|
||||
|
||||
template <typename FIC, typename CIC>
|
||||
void BorgPMModel<FIC, CIC>::forwardModelSimple(CArrayRef &delta_init) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("BORG-PM MODEL (particles)");
|
||||
int nstep = pm_nsteps;
|
||||
int real_nstep = nstep + (do_redshift ? 1 : 0);
|
||||
size_t part_number =
|
||||
size_t(part_factor * size_t(c_localN0) * size_t(c_N1) * size_t(c_N2));
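// part_factor > 1 oversizes the particle tape relative to the
// c_localN0 * c_N1 * c_N2 particles initially owned by this rank, leaving
// headroom for particles that migrate in during the domain
// redistributions performed between PM steps.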
|
||||
|
||||
if (u_pos != 0)
|
||||
delete u_pos;
|
||||
if (u_vel != 0)
|
||||
delete u_vel;
|
||||
|
||||
#ifdef EXPERIMENTAL_ORDERING
|
||||
typedef boost::general_storage_order<3> order_type;
|
||||
typedef order_type::size_type size_type;
|
||||
size_type ordering[] = {0, 2, 1};
|
||||
bool ascending[] = {true, true, true};
|
||||
order_type order(ordering, ascending);
|
||||
u_pos =
|
||||
new UninitializedArray<TapeArray>(extents[real_nstep][part_number][3]);
|
||||
u_vel = new UninitializedArray<TapeArray>(
|
||||
extents[nstep][part_number][3], std::allocator<double>(), order);
|
||||
#else
|
||||
u_pos =
|
||||
new UninitializedArray<TapeArray>(extents[real_nstep][part_number][3]);
|
||||
u_vel = new UninitializedArray<TapeArray>(extents[nstep][part_number][3]);
|
||||
#endif
|
||||
lagrangian_id = std::unique_ptr<IdxArray>(new IdxArray(extents[part_number]));
|
||||
u_idx =
|
||||
new UninitializedArray<IdxTapeArray>(extents[real_nstep][part_number]);
|
||||
timing = new TimingArray(extents[4][nstep]);
|
||||
|
||||
pm_fwd_model(
|
||||
delta_init, u_pos->get_array(), u_vel->get_array(), u_idx->get_array(),
|
||||
*timing);
|
||||
}
|
||||
|
||||
template <typename FIC, typename CIC>
|
||||
void BorgPMModel<FIC, CIC>::forwardModel(
|
||||
CArrayRef &delta_init, ArrayRef &delta_output, bool adjointNext) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("BORG-PM MODEL");
|
||||
int nstep = pm_nsteps;
|
||||
int real_nstep = nstep + (do_redshift ? 1 : 0);
|
||||
int last_step = nstep - 1;
|
||||
TapeArrayRef::index_gen i_gen;
|
||||
typedef TapeArrayRef::index_range i_range;
|
||||
// Make some alias first
|
||||
|
||||
forwardModelSimple(delta_init);
|
||||
|
||||
auto &pos = u_pos->get_array();
|
||||
auto &vel = u_vel->get_array();
|
||||
auto &part_idx = u_idx->get_array();
|
||||
|
||||
//build density field
|
||||
if (do_redshift) {
|
||||
// Applying RSD is collectively as expensive as a full time step:
// particles are moved, then redistributed to nodes, and the density is
// built on each node. All of this has to be undone in the adjoint
// gradient.
|
||||
ctx.print("doing redshift space distortions.");
|
||||
int s_step = last_step + 1;
|
||||
TapeArrayRef::reference s_pos = pos[s_step];
|
||||
size_t loc_used = local_usedParticles[last_step];
|
||||
|
||||
// Move particles to their redshift position in the s_pos buffer (actually last entry of pos tape array)
|
||||
ctx.print("Produce redshift coordinates");
|
||||
pm_redshift_pos(pos[last_step], vel[last_step], s_pos, loc_used);
|
||||
// Reinit indexes to 0, 1, ..., loc_used-1
|
||||
ctx.print("init indexes");
|
||||
initIndexes(part_idx[s_step], loc_used);
|
||||
// Domain decomposition, use the FIC distribution criterion
|
||||
ctx.print("redistribute");
|
||||
pm_distribute_particles<false, FIC>(
|
||||
lo_mgr, s_step, pos, vel, part_idx, loc_used);
|
||||
// pos[s_step] is now guaranteed to live only in the acceptable domain for delta_output
|
||||
ctx.print("project");
|
||||
pm_density_obs(
|
||||
s_pos[i_gen[i_range(0, local_usedParticles[s_step])][i_range()]],
|
||||
delta_output);
|
||||
} else {
|
||||
auto slice_inds =
|
||||
i_gen[i_range(0, local_usedParticles[last_step])][i_range()];
|
||||
pm_density_obs(pos[last_step][slice_inds], delta_output);
|
||||
}
|
||||
|
||||
if (!forwardModelHold && !adjointNext) {
|
||||
releaseParticles();
|
||||
}
|
||||
forwardModelHold = false;
|
||||
}
|
||||
|
||||
template <typename FIC, typename CIC>
|
||||
void BorgPMModel<FIC, CIC>::adjointModelParticles(
|
||||
PhaseArrayRef &pos_ag, PhaseArrayRef &vel_ag, ArrayRef &gradient_delta) {
|
||||
int nstep = pm_nsteps;
|
||||
|
||||
pm_fwd_model_ag(pos_ag, vel_ag, u_idx->get_array(), gradient_delta, *timing);
|
||||
|
||||
releaseParticles();
|
||||
}
|
||||
|
||||
template <typename FIC, typename CIC>
|
||||
void BorgPMModel<FIC, CIC>::adjointModel(ArrayRef &gradient_delta) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("BORG-PM ADJOINT MODEL main");
|
||||
int nstep = pm_nsteps;
|
||||
size_t partNumber = size_t(part_factor * c_localN0 * c_N1 * c_N2);
|
||||
U_PhaseArray pos_ag_p(extents[partNumber][3]);
|
||||
U_PhaseArray vel_ag_p(extents[partNumber][3]);
|
||||
|
||||
PhaseArrayRef &pos_ag = pos_ag_p.get_array();
|
||||
PhaseArrayRef &vel_ag = vel_ag_p.get_array();
|
||||
|
||||
PhaseArrayRef::index_gen i_gen;
|
||||
TapeArrayRef &pos = u_pos->get_array();
|
||||
TapeArrayRef &vel = u_vel->get_array();
|
||||
|
||||
int last_step = nstep - 1;
|
||||
size_t lastParts = local_usedParticles[last_step];
|
||||
|
||||
if (do_redshift) {
|
||||
ctx.print("doing redshift space distortions.");
|
||||
int rsd_step = last_step + 1;
|
||||
size_t rsdParts = local_usedParticles[rsd_step];
|
||||
|
||||
// U_PhaseArray s_pos_u(extents[lastParts][3]);
|
||||
// PhaseArrayRef& s_pos = s_pos_u.get_array();
|
||||
//
|
||||
// pm_redshift_pos(pos[last_step], vel[last_step], s_pos);
|
||||
///work backwards from final to initial conditions
|
||||
//1.) undo CIC
|
||||
pm_density_obs_ag(pos[rsd_step], pos_ag, vel_ag, gradient_delta, rsdParts);
|
||||
pm_distribute_particles_ag<false>(
|
||||
nstep, pos_ag, vel_ag, pos, vel, u_idx->get_array());
|
||||
//2.) undo redshift distortions
|
||||
pm_redshift_pos_ag(
|
||||
pos[last_step], vel[last_step], pos_ag, vel_ag, lastParts);
|
||||
} else {
|
||||
pm_density_obs_ag(
|
||||
pos[last_step], pos_ag, vel_ag, gradient_delta, lastParts);
|
||||
}
|
||||
|
||||
pm_fwd_model_ag(pos_ag, vel_ag, u_idx->get_array(), gradient_delta, *timing);
|
||||
|
||||
releaseParticles();
|
||||
}
|
||||
|
||||
template class LibLSS::BorgPMModel<>;
|
||||
|
||||
#include "libLSS/physics/modified_ngp.hpp"
|
||||
template class LibLSS::BorgPMModel<
|
||||
ModifiedNGP<double, NGPGrid::Double>, ClassicCloudInCell<double>>;
|
||||
template class LibLSS::BorgPMModel<
|
||||
ModifiedNGP<double, NGPGrid::Quad>, ClassicCloudInCell<double>>;
|
||||
|
||||
#ifdef _OPENMP
|
||||
# include "libLSS/physics/openmp_cic.hpp"
|
||||
template class BorgPMModel<OpenMPCloudInCell<double>>;
|
||||
#endif
|
334
extra/borg/libLSS/physics/forwards/deprecated/borg_pm.hpp
Normal file
@@ -0,0 +1,334 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/deprecated/borg_pm.hpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_BORG_PM_HPP
|
||||
#define __LIBLSS_BORG_PM_HPP
|
||||
|
||||
#include "libLSS/tools/array_tools.hpp"
|
||||
#include "libLSS/samplers/rgen/hmc/hmc_density_sampler.hpp"
|
||||
#include "libLSS/mcmc/state_element.hpp"
|
||||
#include "libLSS/mcmc/global_state.hpp"
|
||||
#include "libLSS/physics/forwards/borg_helpers.hpp"
|
||||
#include "libLSS/tools/uninitialized_type.hpp"
|
||||
#include "libLSS/tools/push_operators.hpp"
|
||||
#include "libLSS/physics/classic_cic.hpp"
|
||||
#include "libLSS/tools/mpi_fftw_helper.hpp"
|
||||
#include "libLSS/physics/forward_model.hpp"
|
||||
#include "libLSS/tools/array_tools.hpp"
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
class BorgPMTypes {
|
||||
public:
|
||||
typedef boost::multi_array<double, 2> TimingArray;
|
||||
typedef boost::multi_array<double, 3> TapeArray;
|
||||
typedef boost::multi_array<long, 2> IdxTapeArray;
|
||||
typedef boost::multi_array_ref<double, 3> TapeArrayRef;
|
||||
typedef boost::multi_array_ref<long, 2> IdxTapeArrayRef;
|
||||
typedef boost::multi_array<double, 1> SinArray;
|
||||
typedef boost::multi_array<long, 2> TapeTransferArray;
|
||||
typedef boost::multi_array<size_t, 1> IdxArray;
|
||||
|
||||
constexpr static const double unit_r0 = 1.0; // Units of distances = 1 Mpc/h
|
||||
constexpr static const double H0 = 100.0; // h km/s/Mpc
|
||||
constexpr static const double unit_t0 = 1 / H0; // Units of time
|
||||
constexpr static const double unit_v0 =
|
||||
unit_r0 / unit_t0; // Units of velocity
|
||||
};
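// With these conventions unit_v0 = unit_r0 / unit_t0 = H0 * 1 Mpc/h
// = 100 km/s, so a stored velocity of 1 in code units corresponds to
// 100 km/s; the factor unit_v0 / af used elsewhere (e.g. in
// getVelocityMultiplier) converts code momenta back to km/s.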
|
||||
|
||||
template <
|
||||
typename FinalInCell = ClassicCloudInCell<double>,
|
||||
typename CIC = ClassicCloudInCell<double>>
|
||||
class BorgPMModel : virtual public ParticleBasedForwardModel,
|
||||
public BorgPMTypes {
|
||||
protected:
|
||||
typedef typename LibLSS::array::EigenMap<ArrayRef> EMap;
|
||||
typedef UninitializedArray<PhaseArray> U_PhaseArray;
|
||||
typedef UninitializedArray<TapeArray> U_TapeArray;
|
||||
typedef UninitializedArray<IdxTapeArray> U_IdxTapeArray;
|
||||
|
||||
TimingArray *timing;
|
||||
U_TapeArray *u_pos, *u_vel;
|
||||
U_IdxTapeArray *u_idx;
|
||||
std::unique_ptr<IdxArray> lagrangian_id;
|
||||
|
||||
TapeTransferArray numTransferStep, numReceiveStep, offsetSendStep,
|
||||
offsetReceiveStep;
|
||||
boost::multi_array<size_t, 1> local_usedParticles;
|
||||
SinArray sin2K[3];
|
||||
|
||||
U_CArray_p AUX1_m, AUX0_m, c_tmp_complex_field_m;
|
||||
|
||||
CArray *AUX1_p, *AUX0_p, *lo_AUX0_p, *f_AUX0_p;
|
||||
CArray *c_deltao;
|
||||
long c_N0, c_N1, c_N2, c_localN0, c_N2_HC, c_startN0, c_N2real;
|
||||
long f_N0, f_N1, f_N2, f_localN0, f_N2_HC, f_startN0;
|
||||
long f_startN1, f_localN1;
|
||||
Array *aux_p;
|
||||
std::unique_ptr<DFT_Manager> mgr, force_mgr;
|
||||
int ss_factor, f_factor, pm_nsteps;
|
||||
double part_factor;
|
||||
bool do_redshift;
|
||||
double z_start;
|
||||
double ai, af;
|
||||
|
||||
CArray *c_tmp_complex_field;
|
||||
Array *c_tmp_real_field;
|
||||
|
||||
DFT_Manager::Calls::plan_type c_analysis_plan, c_synthesis_plan;
|
||||
DFT_Manager::Calls::plan_type f_analysis_plan, f_synthesis_plan;
|
||||
|
||||
void pm_fwd_model(
|
||||
CArrayRef &deltao, TapeArrayRef &pos, TapeArrayRef &vel,
|
||||
IdxTapeArrayRef &part_idx, TimingArray &timing);
|
||||
void pm_fwd_model_ag(
|
||||
PhaseArrayRef &pos_ag, PhaseArrayRef &vel_ag, IdxTapeArrayRef &part_idx,
|
||||
ArrayRef &DPSI, TimingArray &timing);
|
||||
void pm_gen_timesteps(double ai, double af, TimingArray &timing, int nstep);
|
||||
|
||||
protected:
|
||||
// =====================================================
|
||||
// IC generation
|
||||
void pm_ic(
|
||||
CArrayRef &deltao, TapeArrayRef &pos, TapeArrayRef &vel,
|
||||
IdxTapeArrayRef &part_idx, TimingArray &timing);
|
||||
void
|
||||
pm_ic_ag(PhaseArrayRef &pos_ag, PhaseArrayRef &vel_ag, TimingArray &timing);
|
||||
|
||||
// =====================================================
|
||||
// Redshift space folding and its AG
|
||||
template <typename PositionArray, typename RedshiftPosition>
|
||||
void pm_redshift_pos(
|
||||
const PositionArray &pos, const PositionArray &vel,
|
||||
RedshiftPosition &s_pos, size_t numParticles);
|
||||
template <typename PositionArray, typename PosAgArray>
|
||||
void pm_redshift_pos_ag(
|
||||
const PositionArray &pos, const PositionArray &vel, PosAgArray &pos_ag,
|
||||
PosAgArray &vel_ag, size_t partNum);
|
||||
|
||||
// =====================================================
|
||||
// Density project and its AG
|
||||
|
||||
template <typename PositionArray>
|
||||
void pm_density_obs(const PositionArray &pos, ArrayRef &deltao);
|
||||
template <typename PositionArray, typename OutputArray>
|
||||
void pm_density_obs_ag(
|
||||
const PositionArray &pos, OutputArray &pos_ag, OutputArray &vel_ag,
|
||||
ArrayRef &B, size_t partNum);
|
||||
|
||||
// =====================================================
|
||||
// Special Force AG
|
||||
|
||||
template <typename PositionArray>
|
||||
void pm_force_0_ag(
|
||||
const PositionArray &pos, const PositionArray &vel,
|
||||
PhaseArrayRef &pos_ag, PhaseArrayRef &vel_ag, PhaseArrayRef &F_ag,
|
||||
double dtr, double dtv, size_t agNum);
|
||||
template <typename PositionArray>
|
||||
void pm_force_1_ag(
|
||||
const PositionArray &pos, const PositionArray &vel,
|
||||
PhaseArrayRef &pos_ag, PhaseArrayRef &vel_ag, PhaseArrayRef &F_ag,
|
||||
double dtr, double dtv, size_t agNum);
|
||||
|
||||
// =====================================================
|
||||
// Update position and its AG
|
||||
|
||||
void pm_pos_update(
|
||||
TapeArrayRef &pos, TapeArrayRef &vel, IdxTapeArrayRef &part_idx,
|
||||
double dtr, int istep);
|
||||
template <typename ForceArray>
|
||||
void pm_pos_update_ag(
|
||||
PhaseArrayRef &pos_ag, const ForceArray &F_ag, double dtr,
|
||||
size_t agNum);
|
||||
|
||||
// =====================================================
|
||||
// This is the main stepping routine and its AG.
|
||||
void pm_stepping(
|
||||
int nstep, TimingArray &timing, TapeArrayRef &pos, TapeArrayRef &vel,
|
||||
IdxTapeArrayRef &part_idx);
|
||||
|
||||
void pm_stepping_ag(
|
||||
int nstep, TimingArray &timing, TapeArrayRef &pos, TapeArrayRef &vel,
|
||||
PhaseArrayRef &pos_ag, PhaseArrayRef &vel_ag,
|
||||
IdxTapeArrayRef &part_idx);
|
||||
|
||||
// =====================================================
|
||||
// These routines are in charge of force over MPI.
|
||||
template <int axis, bool accum, int sign, typename PotentialArray>
|
||||
void codelet_force(
|
||||
int i, FFTW_Real_Array_ref &g, PotentialArray &pot_plus,
|
||||
PotentialArray &pot_minus);
|
||||
|
||||
template <int axis, bool accum, int sign>
|
||||
void compute_force(FFTW_Real_Array_ref &g, FFTW_Real_Array_ref &pot);
|
||||
|
||||
// =====================================================
|
||||
// This the velocity update step and its AG
|
||||
template <typename TapePos, typename TapeVel, typename Grav>
|
||||
void codelet_vel_update(
|
||||
int axis, int istep, double dtv, int i_g_plus, TapePos &pos,
|
||||
TapeVel &vel, Grav &g_plus, Grav &g);
|
||||
|
||||
void pm_vel_update(
|
||||
TapeArrayRef &pos, TapeArrayRef &vel, IdxTapeArrayRef &part_idx,
|
||||
double dtv, int istep);
|
||||
void pm_vel_update_ag(
|
||||
PhaseArrayRef &pos_ag, PhaseArrayRef &vel_ag, double dtr, size_t agNum);
|
||||
|
||||
// =====================================================
|
||||
// Gravitational potential computation
|
||||
template <typename PositionArray>
|
||||
void pm_grav_density(
|
||||
bool clear, const PositionArray &pos, size_t partNum,
|
||||
FFTW_Real_Array_ref &pot);
|
||||
|
||||
void pm_gravpot(FFTW_Real_Array_ref &pot);
|
||||
template <typename PositionArray>
|
||||
void pm_gravpot_ag(const PositionArray &pos, FFTW_Real_Array &pot);
|
||||
|
||||
// =====================================================
|
||||
// These are pure I/O routines to exchange data over MPI.
|
||||
template <bool doVel, typename Projector = CIC>
|
||||
void pm_distribute_particles(
|
||||
std::unique_ptr<DFT_Manager> &dmgr, int istep, TapeArrayRef &pos,
|
||||
TapeArrayRef &vel, IdxTapeArrayRef &part_idx, size_t inParts);
|
||||
|
||||
template <bool doVel>
|
||||
void pm_distribute_particles_ag(
|
||||
int istep, PhaseArrayRef &pos_ag, PhaseArrayRef &vel_ag,
|
||||
TapeArrayRef &pos, TapeArrayRef &vel, IdxTapeArrayRef &part_idx);
|
||||
|
||||
template <bool accum, typename PlaneArray>
|
||||
void pm_exchange_planes(
|
||||
PlaneArray &density, std::unique_ptr<DFT_Manager> &d_mgr,
|
||||
int extra_planes = CIC::MPI_PLANE_LEAKAGE);
|
||||
|
||||
template <typename OutPlaneArray, typename InPlaneArray>
|
||||
void pm_exchange_planes_ag(
|
||||
OutPlaneArray &loc_density, InPlaneArray &global_density,
|
||||
std::unique_ptr<DFT_Manager> &d_mgr);
|
||||
// =====================================================
|
||||
|
||||
void alloc_arrays();
|
||||
void tabulate_sin();
|
||||
|
||||
public:
|
||||
typedef TapeArray::reference ParticleArray;
|
||||
typedef TapeArray::reference VelocityArray;
|
||||
|
||||
BorgPMModel(
|
||||
MPI_Communication *comm, const BoxModel &box, int ss_factor,
|
||||
int f_factor, int nsteps, double part_factor, bool do_rsd, double ai,
|
||||
double af, double z_start);
|
||||
virtual ~BorgPMModel();
|
||||
|
||||
virtual void forwardModelSimple(CArrayRef &delta_init);
|
||||
virtual void forwardModel(
|
||||
CArrayRef &delta_init, ArrayRef &delta_output, bool adjointNext);
|
||||
virtual void adjointModel(ArrayRef &gradient_delta);
|
||||
virtual void adjointModelParticles(
|
||||
PhaseArrayRef &grad_pos, PhaseArrayRef &grad_vel,
|
||||
ArrayRef &gradient_delta);
|
||||
|
||||
virtual size_t getNumberOfParticles() const {
|
||||
return local_usedParticles[pm_nsteps - 1];
|
||||
}
|
||||
virtual unsigned int getSupersamplingRate() const { return ss_factor; }
|
||||
|
||||
virtual IdSubArray getLagrangianIdentifiers() const {
|
||||
boost::multi_array_types::index_gen i_gen;
|
||||
typedef boost::multi_array_types::index_range range;
|
||||
int last_step = pm_nsteps - 1;
|
||||
auto idx_parts = i_gen[range(0, local_usedParticles[last_step])];
|
||||
|
||||
return (*lagrangian_id)[idx_parts];
|
||||
}
|
||||
|
||||
virtual PhaseSubArray getParticlePositions() {
|
||||
boost::multi_array_types::index_gen i_gen;
|
||||
typedef boost::multi_array_types::index_range range;
|
||||
|
||||
int last_step = pm_nsteps - 1;
|
||||
auto idx_parts =
|
||||
i_gen[last_step][range(0, local_usedParticles[last_step])][range()];
|
||||
|
||||
return u_pos->get_array()[idx_parts];
|
||||
}
|
||||
|
||||
virtual PhaseSubArray getParticleVelocities() {
|
||||
boost::multi_array_types::index_gen i_gen;
|
||||
typedef boost::multi_array_types::index_range range;
|
||||
|
||||
int last_step = pm_nsteps - 1;
|
||||
auto idx_parts =
|
||||
i_gen[last_step][range(0, local_usedParticles[last_step])][range()];
|
||||
|
||||
return u_vel->get_array()[idx_parts];
|
||||
}
|
||||
|
||||
virtual double getVelocityMultiplier() { return unit_v0 / af; }
|
||||
|
||||
template <typename ArrayOut>
|
||||
void copyParticlePositions(ArrayOut &a, int pmstep = -1) const {
|
||||
typedef TapeArray::index_range i_range;
|
||||
typename TapeArray::index_gen i_gen;
|
||||
// We do not care about rsd.
|
||||
int last_step = (pmstep < 0) ? pm_nsteps - 1 : pmstep;
|
||||
auto idx_parts =
|
||||
i_gen[last_step][i_range(0, local_usedParticles[last_step])]
|
||||
[i_range()];
|
||||
|
||||
LibLSS::copy_array(a, u_pos->get_array()[idx_parts]);
|
||||
}
|
||||
|
||||
template <typename ArrayOut>
|
||||
void copyParticleVelocities(ArrayOut &a, int pmstep = -1) const {
|
||||
typedef TapeArray::index_range i_range;
|
||||
typename TapeArray::index_gen i_gen;
|
||||
// We do not care about rsd.
|
||||
int last_step = pmstep < 0 ? (pm_nsteps - 1) : pmstep;
|
||||
auto idx_parts =
|
||||
i_gen[last_step][i_range(0, local_usedParticles[last_step])]
|
||||
[i_range()];
|
||||
|
||||
double facRSD = unit_v0 / af;
|
||||
|
||||
LibLSS::copy_array(
|
||||
a, b_fused<double>(
|
||||
u_vel->get_array()[idx_parts], boost::lambda::_1 * facRSD));
|
||||
}
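// facRSD here is the same unit_v0 / af factor returned by
// getVelocityMultiplier(), so the copied velocities should come out as
// peculiar velocities in km/s rather than in internal momentum units.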
|
||||
|
||||
virtual void releaseParticles() {
|
||||
if (u_pos != 0) {
|
||||
delete u_idx;
|
||||
delete timing;
|
||||
delete u_pos;
|
||||
delete u_vel;
|
||||
lagrangian_id.reset();
|
||||
u_pos = 0;
|
||||
u_vel = 0;
|
||||
}
|
||||
lagrangian_id.reset();
|
||||
}
|
||||
|
||||
virtual void forwardModelRsdField(ArrayRef &deltaf, double *vobs_ext);
|
||||
|
||||
virtual void clearAdjointGradient() {}
|
||||
virtual void pushAdjointModelParticles(
|
||||
PhaseArrayRef &grad_pos, PhaseArrayRef &grad_vel) {
|
||||
error_helper<ErrorNotImplemented>(
|
||||
"adjointModelParticles is not implemented in this model.");
|
||||
}
|
||||
|
||||
virtual void retrieveAdjointGradient(CArrayRef &agDeltaInit) {}
|
||||
};
|
||||
|
||||
}; // namespace LibLSS
|
||||
|
||||
#endif
|
378
extra/borg/libLSS/physics/forwards/deprecated/pm/borg_fwd_pm.cpp
Normal file
@@ -0,0 +1,378 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/deprecated/pm/borg_fwd_pm.cpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
|
||||
template <typename FIC, typename CIC>
|
||||
void BorgPMModel<FIC, CIC>::pm_ic(
|
||||
CArrayRef &deltao, TapeArrayRef &pos, TapeArrayRef &vel,
|
||||
IdxTapeArrayRef &part_idx, TimingArray &timing) {
|
||||
///set cosmological parameters
|
||||
///Initial density is scaled to initial redshift!!!
|
||||
ConsoleContext<LOG_DEBUG> ctx("PM init ic");
|
||||
Cosmology cosmo(cosmo_params);
|
||||
TapeArrayRef::index_gen i_gen;
|
||||
typedef TapeArrayRef::index_range i_range;
|
||||
|
||||
double an = timing[0][0];  ///set position ics at r_{0}, calculate till present epoch
double anh = timing[1][0]; ///velocities are created at v_{0-1/2}, calculate till present epoch
|
||||
double D0 = cosmo.d_plus(ai);
|
||||
double D1 = cosmo.d_plus(an) / D0;
|
||||
double Df1 = cosmo.d_plus(anh) / D0;
|
||||
double f1 = cosmo.g_plus(anh);
|
||||
double Hubble = cosmo.Hubble(anh) / cosmo_params.h; ///km /sec /(Mpc/h)
|
||||
|
||||
ctx.print(
|
||||
format("D0=%g, D1=%g, Df1=%g, f1=%g, Hubble=%g") % D0 % D1 % Df1 % f1 %
|
||||
Hubble);
|
||||
|
||||
typedef UninitializedArray<
|
||||
FFTW_Complex_Array, FFTW_Allocator<std::complex<double>>>
|
||||
U_CArray;
|
||||
typedef U_CArray::array_type Ref_CArray;
|
||||
|
||||
U_CArray tmp_p(mgr->extents_complex(), mgr->allocator_complex);
|
||||
Ref_CArray &tmp = tmp_p.get_array();
|
||||
|
||||
for (int axis = 0; axis < 3; axis++) {
|
||||
ctx.print(format("Initialize displacement, axis=%d") % axis);
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (int i = c_startN0; i < c_startN0 + c_localN0; i++)
|
||||
for (int j = 0; j < c_N1; j++)
|
||||
for (int k = 0; k < c_N2_HC; k++) {
|
||||
double kk[3];
|
||||
kk[0] = kmode(i, c_N0, L0);
|
||||
kk[1] = kmode(j, c_N1, L1);
|
||||
kk[2] = kmode(k, c_N2, L2);
|
||||
|
||||
double ksquared = kk[0] * kk[0] + kk[1] * kk[1] + kk[2] * kk[2];
|
||||
double fac = -kk[axis] / (ksquared);
|
||||
|
||||
std::complex<double> &in_delta = deltao[i][j][k];
|
||||
|
||||
tmp[i][j][k] = std::complex<double>(
|
||||
-fac * in_delta.imag(), fac * in_delta.real());
|
||||
}
|
||||
|
||||
if (c_startN0 == 0 && c_localN0 > 0) {
|
||||
tmp[0][0][0] = 0;
|
||||
tmp[0][0][c_N2 / 2] = 0;
|
||||
tmp[0][c_N1 / 2][0] = 0;
|
||||
tmp[0][c_N1 / 2][c_N2 / 2] = 0;
|
||||
}
|
||||
|
||||
if (c_startN0 <= c_N0 / 2 && c_startN0 + c_localN0 > c_N0 / 2) {
|
||||
tmp[c_N0 / 2][0][0] = 0;
|
||||
tmp[c_N0 / 2][0][c_N2 / 2] = 0;
|
||||
tmp[c_N0 / 2][c_N1 / 2][0] = 0;
|
||||
tmp[c_N0 / 2][c_N1 / 2][c_N2 / 2] = 0;
|
||||
}
|
||||
|
||||
/// FFT to Realspace
|
||||
mgr->execute_c2r(c_synthesis_plan, tmp.data(), c_tmp_real_field->data());
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (long l = c_startN0; l < c_startN0 + c_localN0; l++)
|
||||
for (long m = 0; m < c_N1; m++)
|
||||
for (long n = 0; n < c_N2; n++) {
|
||||
size_t idx = n + c_N2 * m + c_N2 * c_N1 * (l - c_startN0);
|
||||
vel[0][idx][axis] = (*c_tmp_real_field)[l][m][n];
|
||||
}
|
||||
}
|
||||
|
||||
double vScaling = -Df1 * Hubble * f1 * anh * anh / unit_v0;
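// At this point the velocity array holds -Psi, the sign-flipped Zel'dovich
// displacement obtained from delta via Psi(k) = i k/k^2 delta(k).
// Multiplying by vScaling = -Df1 * Hubble(anh) * f1 * anh^2 / unit_v0
// therefore yields the 1LPT momentum p = a^2 dx/dt = a^2 H f D Psi in code
// units, evaluated at the half-step anh where velocities live.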
|
||||
|
||||
ctx.print("Move particles and rescale velocities");
|
||||
auto &ids = *lagrangian_id;
|
||||
size_t base_id = c_N2 * c_N1 * c_startN0;
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (long l = c_startN0; l < c_startN0 + c_localN0; l++)
|
||||
for (long m = 0; m < c_N1; m++)
|
||||
for (long n = 0; n < c_N2; n++) {
|
||||
/// sort particles on equidistant grid
|
||||
double q0 = L0 / double(c_N0) * double(l);
|
||||
double q1 = L1 / double(c_N1) * double(m);
|
||||
double q2 = L2 / double(c_N2) * double(n);
|
||||
size_t idx = n + c_N2 * m + c_N2 * c_N1 * (l - c_startN0);
|
||||
TapeArray::reference::reference loc_pos = pos[0][idx];
|
||||
TapeArray::reference::reference loc_vel = vel[0][idx];
|
||||
|
||||
double x = q0 - D1 * loc_vel[0];
|
||||
double y = q1 - D1 * loc_vel[1];
|
||||
double z = q2 - D1 * loc_vel[2];
|
||||
|
||||
///enforce periodic boundary conditions
|
||||
loc_pos[0] = periodic_fix(x, 0., L0);
|
||||
loc_pos[1] = periodic_fix(y, 0., L1);
|
||||
loc_pos[2] = periodic_fix(z, 0., L2);
|
||||
|
||||
ids[idx] = idx + base_id;
|
||||
|
||||
///NOTE: displacements are already stored in the velocity vectors. Only need to multiply by prefactor
|
||||
|
||||
///store velocities in km/sec
|
||||
///note we multiply by a² to get the correct momentum variable for the particle mesh code
|
||||
///and normalize to code units
|
||||
loc_vel[0] *= vScaling;
|
||||
loc_vel[1] *= vScaling;
|
||||
loc_vel[2] *= vScaling;
|
||||
}
|
||||
|
||||
// Start evenly distributed
|
||||
local_usedParticles[0] = size_t(c_localN0) * c_N1 * c_N2;
|
||||
copy_array_rv(
|
||||
part_idx[i_gen[0][i_range(0, local_usedParticles[0])]],
|
||||
b_fused_idx<long, 1>(boost::lambda::_1));
|
||||
}
|
||||
|
||||
template <typename FIC, typename CIC>
|
||||
template <typename PositionArray, typename RedshiftPosition>
|
||||
void BorgPMModel<FIC, CIC>::pm_redshift_pos(
|
||||
const PositionArray &pos, const PositionArray &vel, RedshiftPosition &s_pos,
|
||||
size_t numParticles) {
|
||||
Cosmology cosmo(cosmo_params);
|
||||
|
||||
//this routine generates particle positions in redshift space
|
||||
double anh = af;
|
||||
double Hubble = cosmo.Hubble(anh) / cosmo_params.h; ///km /sec /(Mpc/h)
|
||||
|
||||
// the code uses particle momenta p=a^2 dx/dt where x is the co-moving position
|
||||
// peculiar velocities are then given by v_pec = p/a
|
||||
|
||||
//NOTE: Check coefficients
|
||||
double facRSD =
    unit_v0 / af /
    Hubble; // 1/(a H) for velocities in [km/sec]; the additional unit_v0/af arises from converting the stored momenta back to peculiar velocities
|
||||
double facObs = af / Hubble / facRSD;
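// The loop below displaces each particle along its line of sight,
// s = x + A * x with A = facRSD * (v . x) / |x|^2, i.e. by the
// line-of-sight velocity converted to a comoving length; by construction
// the observer term contributes vobs * af / Hubble, since
// facObs * facRSD = af / Hubble.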
|
||||
|
||||
size_t usedParts = s_pos.shape()[0];
|
||||
boost::array<double, 3> observer = {vobs[0] * facObs, vobs[1] * facObs,
|
||||
vobs[2] * facObs};
|
||||
|
||||
#pragma omp parallel for
|
||||
for (size_t idx = 0; idx < numParticles; idx++) {
|
||||
typename PositionArray::const_reference cur_pos = pos[idx];
|
||||
typename PositionArray::const_reference cur_vel = vel[idx];
|
||||
|
||||
double x0g = cur_pos[0];
|
||||
double x1g = cur_pos[1];
|
||||
double x2g = cur_pos[2];
|
||||
|
||||
double x0 = x0g + xmin0;
|
||||
double x1 = x1g + xmin1;
|
||||
double x2 = x2g + xmin2;
|
||||
|
||||
double v0 = cur_vel[0] + observer[0];
|
||||
double v1 = cur_vel[1] + observer[1];
|
||||
double v2 = cur_vel[2] + observer[2];
|
||||
|
||||
double r2_los = x0 * x0 + x1 * x1 + x2 * x2;
|
||||
double v_los = v0 * x0 + v1 * x1 + v2 * x2;
|
||||
|
||||
double A = facRSD * v_los / r2_los;
|
||||
|
||||
double s0 = x0g + A * x0;
|
||||
double s1 = x1g + A * x1;
|
||||
double s2 = x2g + A * x2;
|
||||
|
||||
///enforce periodic boundary conditions
|
||||
s_pos[idx][0] = periodic_fix(s0, 0., L0);
|
||||
s_pos[idx][1] = periodic_fix(s1, 0., L1);
|
||||
s_pos[idx][2] = periodic_fix(s2, 0., L2);
|
||||
}
|
||||
}
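
// Note added for clarity (not in the original sources): geometrically, the
// loop above implements the standard redshift-space mapping
//   s = x + facRSD * (v . xhat) * xhat,   with xhat = x / |x|,
// since A * x = facRSD * (v . x) / |x|^2 * x. facRSD bundles the 1/(aH)
// line-of-sight conversion together with the momentum/unit conversion noted
// in the comment above; the observer velocity vobs is added to the particle
// velocity before projecting onto the line of sight.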

#include "pm_force.hpp"
#include "pm_grav.hpp"
#include "pm_vel_update.hpp"
#include "pm_pos_update.hpp"

template <typename FIC, typename CIC>
void BorgPMModel<FIC, CIC>::pm_stepping(
    int nstep, TimingArray &timing, TapeArrayRef &pos, TapeArrayRef &vel,
    IdxTapeArrayRef &part_idx) {
  ConsoleContext<LOG_DEBUG> ctx("BORG-PM stepping");
  TapeArrayRef::index_gen i_gen;
  typedef TapeArrayRef::index_range i_range;

  ctx.print(format("Doing %d timesteps of PM") % (nstep - 1));

  ///________________________________________________________
  /// PM code forward model
  ///________________________________________________________

  for (int nn = 0; nn < nstep - 1; nn++) {
    double dtr = timing[2][nn];
    double dtv = timing[3][nn];
    long loc_used = local_usedParticles[nn];

    initIndexes(part_idx[nn + 1], loc_used);
    pm_vel_update(pos, vel, part_idx, dtv, nn);
    pm_pos_update(pos, vel, part_idx, dtr, nn);
    // The last step is special
    if (nn < nstep - 2)
      pm_distribute_particles<true>(
          force_mgr, nn + 1, pos, vel, part_idx, local_usedParticles[nn]);
  }
  pm_distribute_particles<true, FIC>(
      lo_mgr, nstep - 1, pos, vel, part_idx, local_usedParticles[nstep - 2]);
}

template <typename FIC, typename CIC>
void BorgPMModel<FIC, CIC>::pm_gen_timesteps(
    double ai, double af, TimingArray &timing, int nstep) {
  ConsoleContext<LOG_DEBUG> ctx("BORG-PM gen_timesteps");
  Cosmology cosmo(cosmo_params);

  ctx.print(
      format("Building timesteps from ai=%g to af=%g in %d steps") % ai % af %
      nstep);
  double du = (log(af) - log(ai)) / double(nstep - 1);

  for (int i = 0; i < nstep; i++) {
    double an0 = ai * exp(du * i);
    double an1 = ai * exp(du * (i + 1));

    double anh0 = (an0 + ai * exp(du * (i - 1))) / 2.;
    double anh1 = (an0 + an1) / 2.;

    double dtr = cosmo.dtr(an0, an1);
    double dtv = cosmo.dtv(anh0, anh1);

    timing[0][i] = an0;
    timing[1][i] = anh0;
    timing[2][i] = dtr;
    timing[3][i] = dtv;
  }
}
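
// Worked example (added note, values purely illustrative): for ai = 0.05,
// af = 1.0 and nstep = 5, du = log(20)/4 ~ 0.749, so the step boundaries are
// a_n ~ 0.05, 0.106, 0.224, 0.473, 1.0 (log-spaced in the scale factor), and
// anh0 is the arithmetic mid-point between a_n and the previous boundary, at
// which the velocity (kick) coefficients dtv are evaluated.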

template <typename FIC, typename CIC>
template <typename PositionArray>
void BorgPMModel<FIC, CIC>::pm_density_obs(
    const PositionArray &pos, ArrayRef &deltao) {
  double nmean = CosmoTool::cube(ss_factor);

  if (ALWAYS_MPI(comm)) {
    typedef UninitializedArray<boost::multi_array<double, 3>> U_Array;
    typedef U_Array::array_type::index_range i_range;
    U_Array::array_type::index_gen indices;
    // Here we have to introduce ghost planes.
    U_Array tmp_delta(lo_mgr->extents_real(CIC::MPI_PLANE_LEAKAGE));

    array::fill(tmp_delta.get_array(), 0);
    FIC::projection(
        pos, tmp_delta.get_array(), L0, L1, L2, N0, N1, N2,
        typename FIC::Periodic_MPI(N0, N1, N2, comm),
        CIC_Tools::DefaultWeight(), pos.shape()[0]);
    // tmp_delta has MPI_PLANE_LEAKAGE extra planes. They have to be sent to the adequate nodes.
    pm_exchange_planes<true>(tmp_delta.get_array(), lo_mgr);

    copy_array_rv(
        deltao[indices[i_range(startN0, startN0 + localN0)][i_range()]
                      [i_range(0, N2)]],
        tmp_delta.get_array()[indices[i_range(startN0, startN0 + localN0)]
                                     [i_range()][i_range()]]);
  } else {
    array::fill(deltao, 0);
    FIC::projection(
        pos, deltao, L0, L1, L2, N0, N1, N2, CIC_Tools::Periodic(N0, N1, N2),
        CIC_Tools::DefaultWeight(), pos.shape()[0]);
  }

  array::density_rescale(deltao, nmean);
}

template <typename FIC, typename CIC>
void BorgPMModel<FIC, CIC>::forwardModelRsdField(
    ArrayRef &deltaf, double *vobs_ext) {
  TapeArrayRef::index_gen i_gen;
  typedef TapeArrayRef::index_range i_range;

  ConsoleContext<LOG_DEBUG> ctx("BORG forward model rsd density calculation");
  /// we also choose some time steps
  int nstep = pm_nsteps;
  int last_step = nstep - 1;
  TapeArrayRef &pos = u_pos->get_array();
  TapeArrayRef &vel = u_vel->get_array();
  IdxTapeArrayRef &part_idx = u_idx->get_array();

  /// introduce redshift distortions
  if (do_redshift) {
    //ArrayType1d::ArrayType& dummy = vobs;
    int s_step = last_step + 1;
    TapeArrayRef::reference s_pos = pos[s_step];
    size_t loc_used = local_usedParticles[last_step];

    // set vobs to input
    double dummy[3];

    dummy[0] = vobs[0];
    dummy[1] = vobs[1];
    dummy[2] = vobs[2];

    vobs[0] = vobs_ext[0];
    vobs[1] = vobs_ext[1];
    vobs[2] = vobs_ext[2];

    ctx.print("doing redshift space distortions.");
    // Move particles to their redshift position in the s_pos buffer (actually last entry of pos tape array)
    pm_redshift_pos(pos[last_step], vel[last_step], s_pos, loc_used);
    // Reinit indexes to 0, 1, ..., loc_used-1
    initIndexes(part_idx[s_step], loc_used);
    // Domain decomposition.
    pm_distribute_particles<false, FIC>(
        lo_mgr, s_step, pos, vel, part_idx, loc_used);
    // pos[s_step] is now guaranteed to live only in the acceptable domain for deltaf
    pm_density_obs(
        s_pos[i_gen[i_range(0, local_usedParticles[s_step])][i_range()]],
        deltaf);

    // reset vobs; the last tape index has been destroyed, so the adjoint gradient (AG) will no longer be correct
    vobs[0] = dummy[0];
    vobs[1] = dummy[1];
    vobs[2] = dummy[2];
  }
}

template <typename FIC, typename CIC>
void BorgPMModel<FIC, CIC>::pm_fwd_model(
    CArrayRef &deltao, TapeArrayRef &pos, TapeArrayRef &vel,
    IdxTapeArrayRef &part_idx, TimingArray &timing) {
  ConsoleContext<LOG_DEBUG> ctx("BORG-PM forward model");
  /// NOTE: ICs are generated at ai
  /// but the pmcode starts at ao and finishes at af
  double ao = 1. / (1. + z_start);

  /// we also choose some time steps
  int nstep = pm_nsteps;
  int last_step = nstep - 1;

  // generate time steps
  pm_gen_timesteps(ao, af, timing, nstep);

  // generate initial conditions at ao
  if (c_deltao != 0) {
    array::fill(*c_deltao, 0);
    mgr->upgrade_complex(*lo_mgr, deltao, *c_deltao);
    pm_ic(*c_deltao, pos, vel, part_idx, timing);
  } else
    pm_ic(deltao, pos, vel, part_idx, timing);

  if ((FORCE_REDISTRIBUTE || ALWAYS_MPI(comm)))
    // Redistribute first
    pm_distribute_particles<true>(
        force_mgr, 0, pos, vel, part_idx, local_usedParticles[0]);

  // do the pm stepping
  pm_stepping(nstep, timing, pos, vel, part_idx);
}
@@ -0,0 +1,573 @@
/*+
    ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/deprecated/pm/borg_fwd_pm_adj.cpp
    Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
    Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>

    Additional contributions from:
       Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)

+*/

template <typename FIC, typename CIC>
void BorgPMModel<FIC, CIC>::pm_ic_ag(
    PhaseArrayRef &pos_ag, PhaseArrayRef &vel_ag, TimingArray &timing) {
  /// set cosmological parameters
  /// Initial density is scaled to initial redshift!!!
  Cosmology cosmo(cosmo_params);

  double an =
      timing[0][0]; /// set position ics at r_{0}, calculate till present epoch
  double anh =
      timing[1][0]; /// velocities are created at v_{0-1/2}, calculate till present epoch
  double D0 = cosmo.d_plus(ai);
  double D1 = cosmo.d_plus(an) / D0;
  double Df1 = cosmo.d_plus(anh) / D0;
  double f1 = cosmo.g_plus(anh);
  double Hubble = cosmo.Hubble(anh) / cosmo_params.h; /// km/sec/(Mpc/h)

  /// allocate auxiliary Fourier array
  FFTW_Complex_Array &AUX1 = *AUX1_p;
  FFTW_Real_Array &aux = *aux_p;
  FFTW_Complex_Array &AUX0 = *AUX0_p;

  array::fill(AUX1, 0);

  for (int axis = 0; axis < 3; axis++) {
    /// 1. Do position/velocity derivative
    ///------------------------------------------------------------------------------

    double fac_vel = -Df1 * f1 * anh * anh * Hubble / unit_v0;
#pragma omp parallel for collapse(3)
    for (long l = c_startN0; l < c_startN0 + c_localN0; l++)
      for (long m = 0; m < c_N1; m++)
        for (long n = 0; n < c_N2; n++) {
          // pos_ag and vel_ag are ordered exactly as the initial conditions,
          // so this code works
          size_t idx = n + c_N2 * m + c_N2 * c_N1 * (l - c_startN0);
          aux[l][m][n] = -D1 * pos_ag[idx][axis] + fac_vel * vel_ag[idx][axis];
        }

    /// FFT to F-space
    mgr->execute_r2c(c_analysis_plan, aux.data(), AUX0.data());

#pragma omp parallel for collapse(3)
    for (long i = c_startN0; i < c_startN0 + c_localN0; i++) {
      for (long j = 0; j < c_N1; j++) {
        for (long k = 0; k < c_N2_HC; k++) {
          double kk[3];
          kk[0] = kmode(i, c_N0, L0);
          kk[1] = kmode(j, c_N1, L1);
          kk[2] = kmode(k, c_N2, L2);

          double ksquared = kk[0] * kk[0] + kk[1] * kk[1] + kk[2] * kk[2];
          double fac = -kk[axis] / ksquared * volNorm;

          std::complex<double> &in_delta = AUX0[i][j][k];

          AUX1[i][j][k] += std::complex<double>(
              fac * in_delta.imag(), -fac * in_delta.real());
        }
      }
    }
  }

  // fix hermiticity...unclear how to do that
  if (c_startN0 == 0 && c_localN0 > 0) {
    AUX1[0][0][0] = 0;
    AUX1[0][0][c_N2_HC - 1] = 0;
    AUX1[0][c_N1 / 2][0] = 0;
    AUX1[0][c_N1 / 2][c_N2_HC - 1] = 0;
  }

  if (c_startN0 <= c_N0 / 2 && c_startN0 + c_localN0 > c_N0 / 2) {
    AUX1[c_N0 / 2][0][0] = 0;
    AUX1[c_N0 / 2][0][c_N2_HC - 1] = 0;
    AUX1[c_N0 / 2][c_N1 / 2][0] = 0;
    AUX1[c_N0 / 2][c_N1 / 2][c_N2_HC - 1] = 0;
  }
}
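
// Note added for clarity (not in the original sources): the k-space loop above
// applies the transpose of the forward displacement operator. If the forward IC
// step multiplies delta(k) by (-i k_axis / k^2), its adjoint multiplies by the
// complex conjugate (+i k_axis / k^2); with fac = -k_axis / k^2 * volNorm, the
// update (fac * Im, -fac * Re) is exactly a multiplication by -i * fac, i.e.
// +i k_axis / k^2 (times the FFT volume normalization volNorm).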

///===============================
template <typename FIC, typename CIC>
template <typename PositionArray, typename PosAgArray>
void BorgPMModel<FIC, CIC>::pm_redshift_pos_ag(
    const PositionArray &pos, const PositionArray &vel, PosAgArray &pos_ag,
    PosAgArray &vel_ag, size_t partNum) {
  Cosmology cosmo(cosmo_params);

  // this routine is the adjoint of the redshift-space mapping in pm_redshift_pos
  double anh = af;
  double Hubble = cosmo.Hubble(anh) / cosmo_params.h; /// km/sec/(Mpc/h)

  // the code uses particle momenta p=a^2 dx/dt where x is the co-moving position
  // peculiar velocities are then given by v_pec = p/a

  // NOTE: check coefficients
  double facRSD =
      unit_v0 / af /
      Hubble; // this factor is 1/(aH) for velocities in [km/sec]; an additional factor arises from the momentum conversion
  double facObs = af / Hubble / facRSD;
  boost::array<double, 3> observer = {vobs[0] * facObs, vobs[1] * facObs,
                                      vobs[2] * facObs};

#pragma omp parallel for
  for (size_t idx = 0; idx < partNum; idx++) {
    typename PositionArray::const_reference cur_pos = pos[idx];
    typename PositionArray::const_reference cur_vel = vel[idx];
    typename PosAgArray::reference cur_pos_ag = pos_ag[idx];
    typename PosAgArray::reference cur_vel_ag = vel_ag[idx];

    double x0 = cur_pos[0] + xmin0;
    double x1 = cur_pos[1] + xmin1;
    double x2 = cur_pos[2] + xmin2;

    double v0 = cur_vel[0] + observer[0];
    double v1 = cur_vel[1] + observer[1];
    double v2 = cur_vel[2] + observer[2];

    double s_pos_ag0 = cur_pos_ag[0];
    double s_pos_ag1 = cur_pos_ag[1];
    double s_pos_ag2 = cur_pos_ag[2];

    double r2_los = x0 * x0 + x1 * x1 + x2 * x2;
    double v_los = v0 * x0 + v1 * x1 + v2 * x2;
    double slos = s_pos_ag0 * x0 + s_pos_ag1 * x1 + s_pos_ag2 * x2;

    double A = facRSD * slos / r2_los;
    double B = -2 * facRSD * v_los * slos / square(r2_los);
    double C = facRSD * v_los / r2_los;

    cur_pos_ag[0] = s_pos_ag0 + C * s_pos_ag0 + B * x0 + A * v0;
    cur_pos_ag[1] = s_pos_ag1 + C * s_pos_ag1 + B * x1 + A * v1;
    cur_pos_ag[2] = s_pos_ag2 + C * s_pos_ag2 + B * x2 + A * v2;

    cur_vel_ag[0] = A * x0;
    cur_vel_ag[1] = A * x1;
    cur_vel_ag[2] = A * x2;
  }
}
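
// Note added for clarity (not in the original sources): with the forward map
// s = x_g + facRSD * (v . x) / |x|^2 * x (see pm_redshift_pos), the loop above
// applies the transposed Jacobian to the incoming gradient zeta = cur_pos_ag:
//   d s / d x  contributes  zeta + C * zeta + B * x + A * v,
//   d s / d v  contributes  A * x,
// with A = facRSD (x . zeta)/|x|^2, B = -2 facRSD (v . x)(x . zeta)/|x|^4 and
// C = facRSD (v . x)/|x|^2, matching the A, B, C computed above.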
///===============================
template <typename FIC, typename CIC>
template <typename PositionArray, typename OutputArray>
void BorgPMModel<FIC, CIC>::pm_density_obs_ag(
    const PositionArray &pos, OutputArray &pos_ag, OutputArray &vel_ag,
    ArrayRef &B, size_t partNum) {
  double nmean = CosmoTool::cube(ss_factor);
  typedef UninitializedArray<boost::multi_array<double, 3>> U_Array;

  if (ALWAYS_MPI(comm)) {
    // Allocate a temporary density field with extra planes for the
    // projection leakage
    U_Array tmp_delta(lo_mgr->extents_real(CIC::MPI_PLANE_LEAKAGE));

    pm_exchange_planes_ag(tmp_delta.get_array(), B, lo_mgr);
    array::fill(pos_ag, 0);
    FIC::adjoint(
        pos, tmp_delta.get_array(), pos_ag, L0, L1, L2, N0, N1, N2,
        typename FIC::Periodic_MPI(N0, N1, N2, comm), nmean, partNum);
  } else {
    // This is simple, no copy, no adjustment
    array::fill(pos_ag, 0);
    FIC::adjoint(
        pos, B, pos_ag, L0, L1, L2, N0, N1, N2, CIC_Tools::Periodic(N0, N1, N2),
        nmean, partNum);
  }

  // Always zero here, not with RSD though
  array::fill(vel_ag, 0);
}

template <int axis, typename GravityArray>
double __gravity_interpolation_ag(
    const GravityArray &g, double x, double y, double z, int ix, int iy, int iz,
    int jx, int jy, int jz) {
  double rx, ry, rz, qx, qy, qz;

  switch (axis) {
  case 0:
    rx = 1;
    ry = (y - iy);
    rz = (z - iz);
    qx = -1;
    qy = 1 - ry;
    qz = 1 - rz;
    break;
  case 1:
    rx = (x - ix);
    ry = 1;
    rz = (z - iz);
    qx = 1 - rx;
    qy = -1;
    qz = 1 - rz;
    break;
  case 2:
    rx = (x - ix);
    ry = (y - iy);
    rz = 1;

    qx = 1 - rx;
    qy = 1 - ry;
    qz = -1;
    break;
  }

  return g[ix][iy][iz] * qx * qy * qz + g[ix][iy][jz] * qx * qy * rz +
         g[ix][jy][iz] * qx * ry * qz + g[ix][jy][jz] * qx * ry * rz +
         g[jx][iy][iz] * rx * qy * qz + g[jx][iy][jz] * rx * qy * rz +
         g[jx][jy][iz] * rx * ry * qz + g[jx][jy][jz] * rx * ry * rz;
}
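
// Note added for clarity (not in the original sources): the helper above is the
// partial derivative, along the chosen axis, of the trilinear (CIC)
// interpolation of g at (x, y, z). Differentiating the weight along 'axis'
// turns the pair (1 - r, r) into (-1, +1), which is why that component is
// replaced by q = -1, r = 1 in the corresponding switch branch.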

// This is the computation of the 1st term in the paper. It corresponds to
// the gradient of the interpolation kernel at the particle position
template <typename FIC, typename CIC>
template <typename PositionArray>
void BorgPMModel<FIC, CIC>::pm_force_1_ag(
    const PositionArray &pos, const PositionArray &vel, PhaseArrayRef &pos_ag,
    PhaseArrayRef &vel_ag, PhaseArrayRef &F_ag, double dtr, double dtv,
    size_t agNum) {
  // We need one additional plane to compute the interpolated gravity
  Uninit_FFTW_Real_Array g_p(
      force_mgr->extents_real(1), force_mgr->allocator_real);
  Uninit_FFTW_Real_Array pot_p(
      force_mgr->extents_real(CIC::MPI_PLANE_LEAKAGE),
      force_mgr->allocator_real);
  typedef Uninit_FFTW_Real_Array::array_type GArray;
  GArray &pot = pot_p.get_array();
  GArray &g = g_p.get_array();

  ConsoleContext<LOG_DEBUG> ctx("Build gravitational potential");
  // estimate gravitational potential
  // alternatively we can store the potential from the forward run
  // at the expense of higher memory requirements
  pm_grav_density(true, pos, agNum, pot);
  pm_gravpot(pot);

  double i_d0 = f_N0 / L0;
  double i_d1 = f_N1 / L1;
  double i_d2 = f_N2 / L2;

  // This is a hack to avoid branching all the time.
  long f_mpi_N0 = (ALWAYS_MPI(comm)) ? (f_N0 + 2) : f_N0;

  // calculate forces and update velocities
  for (int axis = 0; axis < 3; axis++) {
    ConsoleContext<LOG_DEBUG> ctx_force("axis handling");
    switch (axis) {
    case 0:
      array::fill(F_ag, 0);
      compute_force<0, false, 1>(g, pot);
      break;
    case 1:
      compute_force<1, false, 1>(g, pot);
      break;
    case 2:
      compute_force<2, false, 1>(g, pot);
      break;
    }

    if (ALWAYS_MPI(comm)) {
      // We need to exchange a plane (thus enforcing "1")
      auto g0 = g[f_startN0 + f_localN0];
      auto g1 = g[f_startN0];
      int plane0 = (f_startN0 + f_localN0) % f_N0;
      int plane1 = (f_startN0 + f_N0 - 1) % f_N0;
      MPI_Communication::Request req_recv, req_send;
      req_recv = comm->IrecvT(
          &g0[0][0], g0.num_elements(), force_mgr->get_peer(plane0), plane0);
      req_send = comm->IsendT(
          &g1[0][0], g1.num_elements(), force_mgr->get_peer(plane1), f_startN0);
      req_recv.wait();
      req_send.wait();
      //pm_exchange_planes<false>(g, force_mgr, 1);
      // Now all planes are available to compute derivatives
    }

    ctx_force.print(format("accumulate on F_ag, axis=%d") % axis);
#pragma omp parallel for schedule(static)
    for (long i = 0; i < agNum; i++) {
      double x = pos[i][0] * i_d0;
      double y = pos[i][1] * i_d1;
      double z = pos[i][2] * i_d2;

      int ix = (int)std::floor(x);
      int iy = (int)std::floor(y);
      int iz = (int)std::floor(z);

      int jx = (ix + 1) % f_mpi_N0;
      int jy = (iy + 1) % f_N1;
      int jz = (iz + 1) % f_N2;

      double ax, ay, az;
      // derivative of cic kernel with respect to x
      ax = __gravity_interpolation_ag<0, GArray>(
               g, x, y, z, ix, iy, iz, jx, jy, jz) *
           i_d0;

      // derivative of cic kernel with respect to y
      ay = __gravity_interpolation_ag<1, GArray>(
               g, x, y, z, ix, iy, iz, jx, jy, jz) *
           i_d1;

      // derivative of cic kernel with respect to z
      az = __gravity_interpolation_ag<2, GArray>(
               g, x, y, z, ix, iy, iz, jx, jy, jz) *
           i_d2;

      // now add terms to force
      F_ag[i][0] +=
          ax * pos_ag[i][axis] * dtr * dtv + ax * vel_ag[i][axis] * dtv;
      F_ag[i][1] +=
          ay * pos_ag[i][axis] * dtr * dtv + ay * vel_ag[i][axis] * dtv;
      F_ag[i][2] +=
          az * pos_ag[i][axis] * dtr * dtv + az * vel_ag[i][axis] * dtv;
    }
  }

  Console::instance().print<LOG_DEBUG>("Done force_1_ag");
}

// Computation of the second term in the PM paper. That's the computation
// of the gradient of the gridded force with respect to displacements of the
// previous timesteps.
template <typename FIC, typename CIC>
template <typename PositionArray>
void BorgPMModel<FIC, CIC>::pm_force_0_ag(
    const PositionArray &pos, const PositionArray &vel, PhaseArrayRef &pos_ag,
    PhaseArrayRef &vel_ag, PhaseArrayRef &F_ag, double dtr, double dtv,
    size_t adjNum) {
  double d0 = L0 / double(N0);
  double d1 = L1 / double(N1);
  double d2 = L2 / double(N2);
  typedef boost::multi_array<double, 1> WeightArray;
  typedef UninitializedArray<WeightArray> U_WeightArray;

  FFTW_Complex_Array &f_AUX0 = *f_AUX0_p;
  Uninit_FFTW_Real_Array B_p(
      force_mgr->extents_real(), force_mgr->allocator_real);
  Uninit_FFTW_Real_Array g_p(
      force_mgr->extents_real(CIC::MPI_PLANE_LEAKAGE),
      force_mgr->allocator_real);
  Uninit_FFTW_Real_Array::array_type &g = g_p.get_array();
  Uninit_FFTW_Real_Array::array_type &B = B_p.get_array();
  U_WeightArray u_weight(boost::extents[adjNum]);
  U_WeightArray::array_type &weight = u_weight.get_array();

  array::fill(B, 0);

  for (int axis = 0; axis < 3; axis++) {
#pragma omp parallel for
    for (long i = 0; i < adjNum; i++) {
      weight[i] = pos_ag[i][axis] * dtr * dtv + vel_ag[i][axis] * dtv;
    }

    // do cic
    array::fill(g, 0);
    if (ALWAYS_MPI(comm)) {
      Console::instance().print<LOG_DEBUG>(
          "Projecting positions to recompute force (Comm > 1)");
      CIC::projection(
          pos, g, L0, L1, L2, f_N0, f_N1, f_N2,
          typename CIC::Periodic_MPI(f_N0, f_N1, f_N2, comm), weight, adjNum);
      Console::instance().print<LOG_DEBUG>("Done. Now exchanging");
      pm_exchange_planes<true>(g, force_mgr);
    } else {
      CIC::projection(
          pos, g, L0, L1, L2, f_N0, f_N1, f_N2,
          CIC_Tools::Periodic(f_N0, f_N1, f_N2), weight, adjNum);
    }

    switch (axis) {
    case 0:
      BorgPMModel<FIC, CIC>::compute_force<0, true, -1>(B, g);
      break;
    case 1:
      BorgPMModel<FIC, CIC>::compute_force<1, true, -1>(B, g);
      break;
    case 2:
      BorgPMModel<FIC, CIC>::compute_force<2, true, -1>(B, g);
      break;
    }
    Console::instance().print<LOG_DEBUG>("Accrued forces");
  }

  Console::instance().print<LOG_DEBUG>("Fourier transforming");
  // transform density to F-space
  force_mgr->execute_r2c(f_analysis_plan, B.data(), f_AUX0.data());

  double normphi = 3. / 2. * cosmo_params.omega_m / double(f_N0 * f_N1 * f_N2) /
                   (unit_r0 * unit_r0);

#ifdef ARES_MPI_FFTW
# pragma omp parallel for
  for (int i = f_startN1; i < f_startN1 + f_localN1; i++) {
    double sin21 = sin2K[1][i];
    for (int j = 0; j < f_N0; j++) {
      double sin20 = sin2K[0][j];
      for (int k = 0; k < f_N2_HC; k++) {
        double sin22 = sin2K[2][k];

        double Greens = -normphi / (sin20 + sin21 + sin22);
        f_AUX0[i][j][k] *= Greens;
      }
    }
  }
  // fix zero mode by hand
  if (f_startN1 == 0 && f_localN1 > 0) {
    f_AUX0[0][0][0] = 0;
  }
#else
# pragma omp parallel for
  for (int i = f_startN0; i < f_startN0 + f_localN0; i++) {
    double sin20 = sin2K[0][i];
    for (int j = 0; j < f_N1; j++) {
      double sin21 = sin2K[1][j];
      for (int k = 0; k < f_N2_HC; k++) {
        double sin22 = sin2K[2][k];

        double Greens = -normphi / (sin20 + sin21 + sin22);
        f_AUX0[i][j][k] *= Greens;
      }
    }
  }
  // fix zero mode by hand
  if (f_startN0 == 0 && f_localN0 > 0) {
    f_AUX0[0][0][0] = 0;
  }
#endif

  force_mgr->execute_c2r(f_synthesis_plan, f_AUX0.data(), B.data());

  double nmean = CosmoTool::cube(double(ss_factor) / f_factor);

  Console::instance().print<LOG_DEBUG>(
      "Now exchange_plane_ag and final CIC adjoint for force_0");

  if (ALWAYS_MPI(comm)) {
    Uninit_FFTW_Real_Array tmp_B(
        force_mgr->extents_real(CIC::MPI_PLANE_LEAKAGE));

    pm_exchange_planes_ag(tmp_B.get_array(), B, force_mgr);
    array::fill(F_ag, 0);
    CIC::adjoint(
        pos, tmp_B.get_array(), F_ag, L0, L1, L2, f_N0, f_N1, f_N2,
        typename CIC::Periodic_MPI(f_N0, f_N1, f_N2, comm), nmean, adjNum);
  } else {
    array::fill(F_ag, 0);
    CIC::adjoint(
        pos, B, F_ag, L0, L1, L2, f_N0, f_N1, f_N2,
        CIC_Tools::Periodic(f_N0, f_N1, f_N2), nmean, adjNum);
  }
}
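
// Note added for clarity (not in the original sources): pm_force_0_ag chains
// the adjoints of the gridded force computation: (1) project the per-particle
// weights (pos_ag * dtr * dtv + vel_ag * dtv) onto the force grid with CIC,
// (2) apply the transposed finite-difference operator (compute_force with
// sign = -1 and accumulation), (3) multiply in Fourier space by a discrete
// Green's function of the same sin^2 form as in pm_gravpot (with its own
// normalization normphi; the kernel is symmetric, hence self-adjoint, and the
// zero mode is pinned to 0), and (4) apply the CIC adjoint back to the
// particles.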

template <typename FIC, typename CIC>
template <typename ForceArray>
void BorgPMModel<FIC, CIC>::pm_pos_update_ag(
    PhaseArrayRef &pos_ag, const ForceArray &F_ag, double dtr, size_t agNum) {
#pragma omp parallel for
  for (long i = 0; i < agNum; i++) {
    for (int j = 0; j < 3; j++)
      pos_ag[i][j] += F_ag[i][j];
  }
}

template <typename FIC, typename CIC>
void BorgPMModel<FIC, CIC>::pm_vel_update_ag(
    PhaseArrayRef &pos_ag, PhaseArrayRef &vel_ag, double dtr, size_t agNum) {
#pragma omp parallel for
  for (long i = 0; i < agNum; i++) {
    for (int j = 0; j < 3; j++)
      vel_ag[i][j] += pos_ag[i][j] * dtr;
  }
}

template <typename FIC, typename CIC>
void BorgPMModel<FIC, CIC>::pm_stepping_ag(
    int nstep, TimingArray &timing, TapeArrayRef &pos, TapeArrayRef &vel,
    PhaseArrayRef &pos_ag, PhaseArrayRef &vel_ag, IdxTapeArrayRef &part_idx) {
  // define adjoint force array
  using boost::lambda::_1;
  using boost::lambda::_2;
  size_t partNumber = pos.shape()[1];
  U_PhaseArray F_ag_0_p(extents[partNumber][3]);
  U_PhaseArray::array_type &F_ag_0 = F_ag_0_p.get_array();
  U_PhaseArray F_ag_1_p(extents[partNumber][3]);
  U_PhaseArray::array_type &F_ag_1 = F_ag_1_p.get_array();
  ConsoleContext<LOG_DEBUG> ctx("stepping ag");

  // MAIN LOOP : undo Leapfrog stepping
  for (int nn = nstep - 2; nn >= 0; nn--) {
    double dtr = timing[2][nn];
    double dtv = timing[3][nn];
    size_t agNum;

    ctx.print(format("handling step = %d") % nn);
    // Now redistribute
    if (FORCE_REDISTRIBUTE || ALWAYS_MPI(comm))
      pm_distribute_particles_ag<true>(
          nn + 1, pos_ag, vel_ag, pos, vel, part_idx);

    agNum = local_usedParticles[nn];
    ctx.print(
        format("Done undoing distribution, agNum = %d, now force ag") % agNum);

    // the order of the force terms is important as they will be added up
    // #pragma omp task shared(pos, vel, pos_ag, vel_ag, F_ag_0)
    {
      pm_force_0_ag(pos[nn], vel[nn], pos_ag, vel_ag, F_ag_0, dtr, dtv, agNum);
    }
    // #pragma omp task shared(pos, vel, pos_ag, vel_ag, F_ag_1)
    {
      pm_force_1_ag(pos[nn], vel[nn], pos_ag, vel_ag, F_ag_1, dtr, dtv, agNum);
    }
    // #pragma omp taskwait
    ctx.print("Done force ag, now vel update");
    pm_vel_update_ag(pos_ag, vel_ag, dtr, agNum);
    ctx.print("Done vel update, now pos update");
    pm_pos_update_ag(
        pos_ag,
        b_fused<U_PhaseArray::array_type::element>(F_ag_0, F_ag_1, _1 + _2),
        dtr, agNum);
    ctx.print("Done pos update, finished ag stepping");
  }
}
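
// Note added for clarity (not in the original sources): this is reverse-mode
// differentiation of the leapfrog integrator. The loop walks the timesteps
// backwards; for each step the two adjoint force terms F_ag_0 (gradient
// through the gridded force) and F_ag_1 (gradient of the interpolation kernel
// at the particle positions) are computed from the same (pos_ag, vel_ag)
// state and summed before the velocity and position adjoints are updated,
// mirroring the forward kick/drift order in pm_stepping.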

template <typename FIC, typename CIC>
void BorgPMModel<FIC, CIC>::pm_fwd_model_ag(
    PhaseArrayRef &pos_ag, PhaseArrayRef &vel_ag, IdxTapeArrayRef &part_idx,
    ArrayRef &DPSI, TimingArray &timing) {
  ConsoleContext<LOG_DEBUG> ctx("BORG-PM adjoint model");
  int nstep = pm_nsteps;
  TapeArrayRef &pos = u_pos->get_array();
  TapeArrayRef &vel = u_vel->get_array();

  /// NOTE: ICs are generated at ai

  /// introduce adjoint quantities
  typedef U_PhaseArray::array_type PhaseArrayRef;

  // 2.) undo pm-stepping
  pm_stepping_ag(nstep, timing, pos, vel, pos_ag, vel_ag, u_idx->get_array());

  // Now final redistribute
  if (FORCE_REDISTRIBUTE || ALWAYS_MPI(comm)) {
    pm_distribute_particles_ag<true>(0, pos_ag, vel_ag, pos, vel, part_idx);
  }
  // Now we are back in IC configuration (including pos_ag, vel_ag ordering)

  // N.) undo ICs
  pm_ic_ag(pos_ag, vel_ag, timing);

  // Apply gradient upgrade operator
  if (c_deltao != 0) {
    ctx.print("Gradient of upgrade and FFT...");
    array::fill(tmp_complex_field->get_array(), 0);
    lo_mgr->degrade_complex(*mgr, *AUX1_p, tmp_complex_field->get_array());
    lo_mgr->execute_c2r(
        synthesis_plan, tmp_complex_field->get_array().data(), DPSI.data());
  } else {
    ctx.print("Finishing up with an FFT");
    lo_mgr->execute_c2r(synthesis_plan, AUX1_p->data(), DPSI.data());
  }
}
142 extra/borg/libLSS/physics/forwards/deprecated/pm/pm_force.hpp Normal file
@@ -0,0 +1,142 @@
/*+
    ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/deprecated/pm/pm_force.hpp
    Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
    Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>

    Additional contributions from:
       Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)

+*/

template <typename FIC, typename CIC>
template <int axis, bool accum, int sign, typename PotentialArray>
void BorgPMModel<FIC, CIC>::codelet_force(
    int i, FFTW_Real_Array_ref &g, PotentialArray &pot_plus,
    PotentialArray &pot_minus) {
  long N[3] = {f_N0, f_N1, f_N2};
  double i_d[3] = {sign * f_N0 / (unit_r0 * L0), sign * f_N1 / (unit_r0 * L1),
                   sign * f_N2 / (unit_r0 * L2)};

  typedef FFTW_Real_Array::index index_type;

#pragma omp parallel for collapse(2)
  for (long j = 0; j < f_N1; j++)
    for (long k = 0; k < f_N2; k++) {
      boost::array<index_type, 3> idxp = {i, j, k};
      boost::array<index_type, 3> idxm = {i, j, k};

      idxp[axis]++;
      idxm[axis]--;
      if (idxp[axis] >= N[axis])
        idxp[axis] -= N[axis];
      if (idxm[axis] < 0)
        idxm[axis] += N[axis];
      double value = -0.5 * (pot_plus(idxp) - pot_minus(idxm)) * i_d[axis];
      push_to<accum>::apply(g[i][j][k], value);
    }
}
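
// Note added for clarity (not in the original sources): codelet_force evaluates
// a central finite difference of the potential along 'axis' on plane i,
//   g = -0.5 * (pot(idx + e_axis) - pot(idx - e_axis)) * i_d[axis],
// with periodic wrapping of the neighbour indices; the template parameters
// select accumulation vs. assignment (accum) and the overall sign (sign), and
// i_d folds in the grid spacing and the unit_r0 normalization.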

template <typename FIC, typename CIC>
template <int axis, bool accum, int sign>
void BorgPMModel<FIC, CIC>::compute_force(
    FFTW_Real_Array_ref &g, FFTW_Real_Array_ref &pot) {
  ConsoleContext<LOG_DEBUG> ctx("force computation");

  // For all axes other than the first, this is easy
  if (axis != 0) {
#pragma omp parallel for
    for (long i = f_startN0; i < f_startN0 + f_localN0; i++)
      codelet_force<axis, accum, sign>(i, g, pot, pot);

    return;
  }

  ctx.print("Force axis-0: Handling traditional case");
#pragma omp parallel for
  for (long i = f_startN0 + 1; i < f_startN0 + f_localN0 - 1; i++)
    codelet_force<0, accum, sign>(i, g, pot, pot);

  // No real MPI here. Just do direct computation
  if (SKIP_MPI_FOR_SINGLE_NODE && f_localN0 == f_N0) {
    ctx.print("No MPI: finish off");
    codelet_force<0, accum, sign>(f_startN0, g, pot, pot);
    codelet_force<0, accum, sign>(f_startN0 + f_localN0 - 1, g, pot, pot);
    return;
  }

  // No force. No data. Nothing. Skip.
  if (f_localN0 == 0) {
    ctx.print("No plane living here: finish");
    return;
  }

  // Here we exchange with neighbours to be able to compute the gradient in this slice
  long f_N2real = force_mgr->N2real;
  int lower_0 = (f_startN0 + f_N0 - 1) % f_N0;
  int upper_0 = (f_startN0 + f_localN0) % f_N0;

  typedef Uninit_FFTW_Real_Array U_Array;
  typedef U_Array::array_type U_ArrayType;
  U_Array lower_pot_plane_recv_p(
      extents[range(lower_0, lower_0 + 1)][f_N1][f_N2real]);
  U_Array upper_pot_plane_recv_p(
      extents[range(upper_0, upper_0 + 1)][f_N1][f_N2real]);
  U_Array lower_pot_plane_send_p(
      extents[range(f_startN0, f_startN0 + 1)][f_N1][f_N2real]);
  U_Array upper_pot_plane_send_p(extents[range(
      f_startN0 + f_localN0 - 1, f_startN0 + f_localN0)][f_N1][f_N2real]);
  U_ArrayType &lower_pot_plane_recv = lower_pot_plane_recv_p.get_array(),
              lower_pot_plane_send = lower_pot_plane_send_p.get_array(),
              upper_pot_plane_recv = upper_pot_plane_recv_p.get_array(),
              upper_pot_plane_send = upper_pot_plane_send_p.get_array();

  MPI_Communication::Request lower_req_recv, upper_req_recv;
  MPI_Communication::Request lower_req_send, upper_req_send;

  ctx.print(
      format("Copy arrays to be sent (%d, %d)") % f_startN0 %
      (f_startN0 + f_localN0 - 1));

  copy_array_rv(lower_pot_plane_send[f_startN0], pot[f_startN0]);
  copy_array_rv(
      upper_pot_plane_send[f_startN0 + f_localN0 - 1],
      pot[f_startN0 + f_localN0 - 1]);

  ctx.print("Sending/Receiving");

  lower_req_send = comm->IsendT(
      lower_pot_plane_send.data(), lower_pot_plane_send.num_elements(),
      force_mgr->get_peer(lower_0), f_startN0);
  upper_req_send = comm->IsendT(
      upper_pot_plane_send.data(), upper_pot_plane_send.num_elements(),
      force_mgr->get_peer(upper_0), f_startN0 + f_localN0 - 1);

  lower_req_recv = comm->IrecvT(
      lower_pot_plane_recv.data(), lower_pot_plane_recv.num_elements(),
      force_mgr->get_peer(lower_0), lower_0);
  upper_req_recv = comm->IrecvT(
      upper_pot_plane_recv.data(), upper_pot_plane_recv.num_elements(),
      force_mgr->get_peer(upper_0), upper_0);

  ctx.print("I/O scheduled.");
  if (f_localN0 > 1) {

    lower_req_recv.wait();
    ctx.print(" * Handling lower_pot_plane_recv");
    codelet_force<0, accum, sign>(f_startN0, g, pot, lower_pot_plane_recv);

    upper_req_recv.wait();
    ctx.print(" * Handling upper_pot_plane_recv");
    codelet_force<0, accum, sign>(
        f_startN0 + f_localN0 - 1, g, upper_pot_plane_recv, pot);
    ctx.print("Done handling");
  } else {
    ctx.print(" * Degenerate case, f_localN0 == 1");
    lower_req_recv.wait();
    upper_req_recv.wait();
    codelet_force<0, accum, sign>(
        f_startN0, g, upper_pot_plane_recv, lower_pot_plane_recv);
  }
  lower_req_send.wait();
  upper_req_send.wait();
}
109 extra/borg/libLSS/physics/forwards/deprecated/pm/pm_grav.hpp Normal file
@@ -0,0 +1,109 @@
/*+
    ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/deprecated/pm/pm_grav.hpp
    Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
    Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>

    Additional contributions from:
       Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)

+*/
#include "libLSS/physics/forwards/pm/plane_xchg.hpp"

template <typename FIC, typename CIC>
template <typename PositionArray>
void BorgPMModel<FIC, CIC>::pm_grav_density(
    bool clear, const PositionArray &pos, size_t partNum,
    FFTW_Real_Array_ref &pot) {
  if (clear)
    array::fill(pot, 0);

  Console &cons = Console::instance();

  cons.print<LOG_DEBUG>(
      format("shape = %d,%d") % pos.shape()[0] % pos.shape()[1]);

  if (comm->size() > 1) {
    CIC::projection(
        pos, pot, L0, L1, L2, f_N0, f_N1, f_N2,
        typename CIC::Periodic_MPI(f_N0, f_N1, f_N2, comm),
        CIC_Tools::DefaultWeight(), partNum);
    // pot has MPI_PLANE_LEAKAGE extra planes. They have to be sent to the adequate nodes.
    pm_exchange_planes<true>(pot, force_mgr);
  } else {
    CIC::projection(
        pos, pot, L0, L1, L2, f_N0, f_N1, f_N2,
        CIC_Tools::Periodic(f_N0, f_N1, f_N2), CIC_Tools::DefaultWeight(),
        partNum);
  }
}

template <typename FIC, typename CIC>
template <typename OutPlaneArray, typename InPlaneArray>
void BorgPMModel<FIC, CIC>::pm_exchange_planes_ag(
    OutPlaneArray &loc_density, InPlaneArray &global_density,
    std::unique_ptr<DFT_Manager> &d_mgr) {
  density_exchange_planes_ag(
      comm, loc_density, global_density, d_mgr, CIC::MPI_PLANE_LEAKAGE);
}

template <typename FIC, typename CIC>
template <bool accum, typename PlaneArray>
void BorgPMModel<FIC, CIC>::pm_exchange_planes(
    PlaneArray &density, std::unique_ptr<DFT_Manager> &d_mgr,
    int extra_planes) {
  density_exchange_planes<accum>(comm, density, d_mgr, extra_planes);
}

template <typename FIC, typename CIC>
void BorgPMModel<FIC, CIC>::pm_gravpot(FFTW_Real_Array_ref &pot) {
  ConsoleContext<LOG_DEBUG> ctx("gravitational solver");
  double nmean = CosmoTool::cube(double(ss_factor) / f_factor);

  // transform density to F-space
  CArray &f_AUX0 = *f_AUX0_p;
  force_mgr->execute_r2c(f_analysis_plan, pot.data(), f_AUX0.data());
  double normphi = 3. / 2. * cosmo_params.omega_m / double(f_N0 * f_N1 * f_N2) *
                   (unit_r0 * unit_r0) / nmean;

#ifdef ARES_MPI_FFTW
# pragma omp parallel for
  for (long i = f_startN1; i < f_startN1 + f_localN1; i++) {
    double sin21 = sin2K[1][i];
    for (long j = 0; j < f_N0; j++) {
      double sin20 = sin2K[0][j];
      for (long k = 0; k < f_N2_HC; k++) {
        double sin22 = sin2K[2][k];
        double Greens = -normphi / (sin20 + sin21 + sin22);

        f_AUX0[i][j][k] *= Greens;
      }
    }
  }

  // fix zero mode by hand
  if (f_startN1 == 0 && f_localN1 > 0) {
    f_AUX0[0][0][0] = 0;
  }
#else
# pragma omp parallel for
  for (long i = f_startN0; i < f_startN0 + f_localN0; i++) {
    double sin20 = sin2K[0][i];
    for (long j = 0; j < f_N1; j++) {
      double sin21 = sin2K[1][j];
      for (long k = 0; k < f_N2_HC; k++) {
        double sin22 = sin2K[2][k];
        double Greens = -normphi / (sin20 + sin21 + sin22);

        f_AUX0[i][j][k] *= Greens;
      }
    }
  }

  // fix zero mode by hand
  if (f_startN0 == 0 && f_localN0 > 0) {
    f_AUX0[0][0][0] = 0;
  }
#endif

  force_mgr->execute_c2r(f_synthesis_plan, f_AUX0.data(), pot.data());
}
@@ -0,0 +1,103 @@
/*+
    ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/deprecated/pm/pm_pos_update.hpp
    Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
    Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>

    Additional contributions from:
       Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)

+*/

template <typename FIC, typename CIC>
void BorgPMModel<FIC, CIC>::pm_pos_update(
    TapeArrayRef &pos, TapeArrayRef &vel, IdxTapeArrayRef &part_idx, double dtr,
    int istep) {
  ConsoleContext<LOG_DEBUG> ctx("pos update");
  typedef TapeArrayRef::reference::reference TapeElt;
  typedef IdxTapeArrayRef::reference::reference IdxTapeElt;

#pragma omp parallel for
  for (long i = 0; i < local_usedParticles[istep]; i++) {
    TapeElt prev_loc_pos = pos[istep][i];
    TapeElt prev_loc_vel = vel[istep + 1][i];
    TapeElt loc_pos = pos[istep + 1][i];
    double x = prev_loc_pos[0] + prev_loc_vel[0] * dtr;
    double y = prev_loc_pos[1] + prev_loc_vel[1] * dtr;
    double z = prev_loc_pos[2] + prev_loc_vel[2] * dtr;

    loc_pos[0] = periodic_fix(x, 0., L0);
    loc_pos[1] = periodic_fix(y, 0., L1);
    loc_pos[2] = periodic_fix(z, 0., L2);
  }

  // No MPI: exit now to save CPU cycles
  if (SKIP_MPI_FOR_SINGLE_NODE && comm->size() == 1) {
    local_usedParticles[istep + 1] = local_usedParticles[istep];
    return;
  }
}
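
// Note added for clarity (not in the original sources): this is the "drift"
// half of the leapfrog update, x_{n+1} = x_n + v_{n+1/2} * dtr, using the
// velocities already kicked to step istep+1 by pm_vel_update and wrapping the
// result back into the periodic box. dtr is the drift coefficient tabulated
// by pm_gen_timesteps.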

template <typename FIC, typename CIC>
template <bool doVel, typename Projector>
void BorgPMModel<FIC, CIC>::pm_distribute_particles(
    std::unique_ptr<DFT_Manager> &dmgr, int istep, TapeArrayRef &pos,
    TapeArrayRef &vel, IdxTapeArrayRef &part_idx, size_t inParticles) {
  ConsoleContext<LOG_DEBUG> ctx("pre pm_distribute_particles");

  auto in_pos = pos[istep];
  // We pick the first time step because we need an argument, but in practice it is not used.
  auto io_part_idx = part_idx[istep];
  auto io_numTransferStep = numTransferStep[istep];
  auto io_numReceiveStep = numReceiveStep[istep];
  auto io_offsetReceiveStep = offsetReceiveStep[istep];
  auto io_offsetSendStep = offsetSendStep[istep];

  if (doVel) {
    auto in_vel = vel[istep];
    particle_redistribute(
        comm, in_pos, io_part_idx, inParticles, local_usedParticles[istep],
        io_numTransferStep, io_numReceiveStep, io_offsetReceiveStep,
        io_offsetSendStep, typename Projector::Distribution(dmgr, L0),
        make_attribute_helper(
            Particles::vector(in_vel), Particles::scalar(*lagrangian_id)));
  } else {
    particle_redistribute(
        comm, in_pos, io_part_idx, inParticles, local_usedParticles[istep],
        io_numTransferStep, io_numReceiveStep, io_offsetReceiveStep,
        io_offsetSendStep, typename Projector::Distribution(dmgr, L0),
        make_attribute_helper(Particles::scalar(*lagrangian_id)));
  }
}

// This function does not compute a gradient per se, but redistributes pos_ag and vel_ag
// across MPI nodes according to the earlier forward reshuffling.
// Input state:
//   pos_ag, vel_ag must have local_usedParticles[istep] elements
//   pos, vel, part_idx are the full tape arrays of this node
template <typename FIC, typename CIC>
template <bool doVel>
void BorgPMModel<FIC, CIC>::pm_distribute_particles_ag(
    int istep, PhaseArrayRef &pos_ag, PhaseArrayRef &vel_ag, TapeArrayRef &pos,
    TapeArrayRef &vel, IdxTapeArrayRef &part_idx) {
  auto io_part_idx = part_idx[istep];
  auto numTransfer = numTransferStep[istep];
  auto numReceive = numReceiveStep[istep];
  auto offsetReceive = offsetReceiveStep[istep];
  auto offsetSend = offsetSendStep[istep];
  size_t target_usedParticles =
      istep == 0 ? (c_localN0 * c_N1 * c_N2) : (local_usedParticles[istep - 1]);

  if (doVel) {
    particle_undistribute(
        comm, pos_ag, io_part_idx, local_usedParticles[istep],
        target_usedParticles, numTransfer, numReceive, offsetReceive,
        offsetSend, make_attribute_helper(Particles::vector(vel_ag)));
  } else {
    particle_undistribute(
        comm, pos_ag, io_part_idx, local_usedParticles[istep],
        target_usedParticles, numTransfer, numReceive, offsetReceive,
        offsetSend);
  }
}
@@ -0,0 +1,135 @@
/*+
    ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/deprecated/pm/pm_vel_update.hpp
    Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
    Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>

    Additional contributions from:
       Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)

+*/

template <typename FIC, typename CIC>
template <typename TapePos, typename TapeVel, typename Grav>
void BorgPMModel<FIC, CIC>::codelet_vel_update(
    int axis, int istep, double dtv, int i_g_plus, TapePos &pos, TapeVel &vel,
    Grav &g_plus, Grav &g) {
  double i_d0 = f_N0 / L0;
  double i_d1 = f_N1 / L1;
  double i_d2 = f_N2 / L2;

  long Np = local_usedParticles[istep];

  typename TapeVel::reference vel_next = vel[istep + 1];
  typename TapePos::reference pos_current = pos[istep];
  typename TapeVel::reference vel_current = vel[istep];

#pragma omp parallel for schedule(static)
  for (long i = 0; i < Np; i++) {
    typename TapePos::reference::reference i_pos = pos_current[i];

    double x = i_pos[0] * i_d0;
    double y = i_pos[1] * i_d1;
    double z = i_pos[2] * i_d2;

    int ix = (int)std::floor(x);
    int iy = (int)std::floor(y);
    int iz = (int)std::floor(z);

    int jx = ix + 1;
    if (jx >= f_N0)
      jx -= f_N0;
    int jy = iy + 1;
    if (jy >= f_N1)
      jy -= f_N1;
    int jz = iz + 1;
    if (jz >= f_N2)
      jz -= f_N2;

    double rx = (x - ix);
    double ry = (y - iy);
    double rz = (z - iz);

    double qx = 1 - rx;
    double qy = 1 - ry;
    double qz = 1 - rz;

    Grav &g1 = (jx == i_g_plus) ? g_plus : g;

    auto &v1 = vel_next[i][axis];
    auto &v0 = vel_current[i][axis];

    double force =
        g[ix][iy][iz] * qx * qy * qz + g[ix][iy][jz] * qx * qy * rz +
        g[ix][jy][iz] * qx * ry * qz + g[ix][jy][jz] * qx * ry * rz +
        g1[jx][iy][iz] * rx * qy * qz + g1[jx][iy][jz] * rx * qy * rz +
        g1[jx][jy][iz] * rx * ry * qz + g1[jx][jy][jz] * rx * ry * rz;

    v1 = v0 + force * dtv;
  }
}
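
// Note added for clarity (not in the original sources): this is the "kick"
// half of the leapfrog update, v_{n+1} = v_n + F(x_n) * dtv, where the
// acceleration is obtained by trilinear (CIC) interpolation of the force grid
// g using the weights (qx, rx) x (qy, ry) x (qz, rz). When the x-neighbour jx
// falls on the plane received from the next MPI task (jx == i_g_plus), the
// interpolation reads from g_plus instead of g.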

template <typename FIC, typename CIC>
void BorgPMModel<FIC, CIC>::pm_vel_update(
    TapeArrayRef &pos, TapeArrayRef &vel, IdxTapeArrayRef &part_idx, double dtv,
    int istep) {
  ConsoleContext<LOG_DEBUG> ctx("vel update");

  typedef Uninit_FFTW_Real_Array U_Array;
  long f_N2real = force_mgr->N2real;
  long lower_0 = (f_startN0 + f_N0 - 1) % f_N0;
  long upper_0 = (f_startN0 + f_localN0) % f_N0;

  U_Array g_p(force_mgr->extents_real(), force_mgr->allocator_real);
  U_Array pot_p(
      force_mgr->extents_real(CIC::MPI_PLANE_LEAKAGE),
      force_mgr->allocator_real);
  U_Array g_plus_p(extents[range(upper_0, upper_0 + 1)][f_N1][f_N2real]);
  U_Array g_minus_send_p(
      extents[range(f_startN0, f_startN0 + 1)][f_N1][f_N2real]);
  U_Array::array_type &g = g_p.get_array();
  U_Array::array_type &pot = pot_p.get_array();
  U_Array::array_type &g_plus = g_plus_p.get_array();
  U_Array::array_type &g_minus_send = g_minus_send_p.get_array();
  MPI_Communication::Request req_minus_recv, req_plus_recv, req_minus_send,
      req_plus_send;
  TapeArrayRef::index_gen indices;
  typedef TapeArrayRef::index_range range;

  // estimate gravitational potential
  ctx.print(format("Projecting %ld particles") % local_usedParticles[istep]);
  pm_grav_density(true, pos[istep], local_usedParticles[istep], pot);
  pm_gravpot(pot);

  // calculate forces and update velocities
  for (int axis = 0; axis < 3; axis++) {
    switch (axis) {
    case 0:
      compute_force<0, false, 1>(g, pot);
      break;
    case 1:
      compute_force<1, false, 1>(g, pot);
      break;
    case 2:
      compute_force<2, false, 1>(g, pot);
      break;
    }

    if (f_localN0 == f_N0) {
      codelet_vel_update(axis, istep, dtv, -1, pos, vel, g, g);
    } else {
      ctx.print("Exchange one force plane");
      // This grabs and distributes the extra plane required for the interpolation at the edge.
      req_plus_recv = comm->IrecvT(
          g_plus.data(), g_plus.num_elements(), force_mgr->get_peer(upper_0),
          upper_0);
      copy_array_rv(g_minus_send[f_startN0], g[f_startN0], false);
      req_minus_send = comm->IsendT(
          g_minus_send.data(), g_minus_send.num_elements(),
          force_mgr->get_peer(lower_0), f_startN0);
      req_plus_recv.wait();
      // Done receiving; we can do the computations

      ctx.print("Computing accelerations");
      codelet_vel_update(
          axis, istep, dtv, (f_startN0 + f_localN0) % f_N0, pos, vel, g_plus,
          g);

      // Ensure the sending is done
      req_minus_send.wait();
    }
  }
}
164 extra/borg/libLSS/physics/forwards/downgrade.cpp Normal file
@@ -0,0 +1,164 @@
/*+
    ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/downgrade.cpp
    Copyright (C) 2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>

    Additional contributions from:
       Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)

+*/
#include <string>
#include "libLSS/tools/console.hpp"
#include "libLSS/physics/model_io.hpp"
#include "libLSS/physics/forwards/downgrade.hpp"
#include "libLSS/tools/fusewrapper.hpp"
#include "libLSS/physics/forwards/registry.hpp"
#include "libLSS/tools/ptree_proxy.hpp"
#include <boost/phoenix/stl/cmath.hpp>
#include "libLSS/tools/tuple_helper.hpp"
#include "libLSS/physics/bias/base.hpp"
#include "libLSS/physics/bias/level_combinator.hpp"

using namespace LibLSS;

static BoxModel half_box(BoxModel const &box) {
  BoxModel box2 = box;

  box2.N0 /= 2;
  box2.N1 /= 2;
  box2.N2 /= 2;
  return box2;
}

ForwardDowngrade::ForwardDowngrade(MPI_Communication *comm, BoxModel const &box)
    : BORGForwardModel(comm, box, half_box(box)) {
  LIBLSS_AUTO_DEBUG_CONTEXT(ctx);

  std::tuple<ssize_t, ssize_t> bounds[2];
  //Level_t::numLevel];
  //
  bounds[1] = std::make_tuple(0, 0); //out_mgr->startN0, out_mgr->startN0 + out_mgr->localN0);
  bounds[0] = std::make_tuple(out_mgr->startN0, out_mgr->startN0 + out_mgr->localN0);

  for (int r = 0; r < Level_t::numLevel; r++) {
    ctx.format(
        ".. Level %d (bounds=[%d - %d])", r,
        std::get<0>(bounds[r]), std::get<1>(bounds[r]));
  }

  level.allocate(
      box.N0, box.N1, box.N2, lo_mgr->N2real, lo_mgr->startN0, lo_mgr->localN0,
      bounds);
  level.setup(ghosts, comm);

  ag_array = std::make_shared<U_Array>(lo_mgr->extents_real_strict());
}

void ForwardDowngrade::forwardModel_v2(ModelInput<3> delta_init) {
  LIBLSS_AUTO_DEBUG_CONTEXT(ctx);

  // Setup the IO basis that is required for this forward.
  delta_init.setRequestedIO(PREFERRED_REAL);

  ghosts.updatePlaneDims(std::array<ssize_t, 2>{lo_mgr->N1, lo_mgr->N2real});
  ghosts.synchronize(delta_init.getReal());
  // Now build the different levels from the planes.
  level.buildLevels(ghosts, delta_init.getReal());

  hold_input = std::move(delta_init);
}

void ForwardDowngrade::getDensityFinal(ModelOutput<3> delta_output) {
  LIBLSS_AUTO_DEBUG_CONTEXT(ctx);

  delta_output.setRequestedIO(PREFERRED_REAL);
  auto w_delta_output = fwrap(array::slice_array(
      delta_output.getRealOutput(), out_mgr->strict_range()));

  ctx.format(
      "out = %dx%dx%d", (*w_delta_output).shape()[0],
      (*w_delta_output).shape()[1], (*w_delta_output).shape()[2]);
  ctx.format("in = %dx%dx%d", N0, N1, N2);

  w_delta_output =
      fwrap(b_fused_idx<double, 3>([this](size_t i, size_t j, size_t k) {
        constexpr int const numLevel = Level_t::numLevel;
        double out;
        if (k >= N2 / 2) {
          auto &cons = Console::instance();
          cons.format<LOG_ERROR>("Going above limits!");
          MPI_Communication::instance()->abort();
          return 0.0;
        }
        out = level.template get_density_level<1>(
            hold_input.getReal(), i, j, k);
        if (std::isnan(out) || std::isinf(out)) {
          auto &cons = Console::instance();
          //cons.c_assert(!std::isnan(out[numLevel-1]), "Nan in density");
          cons.format<LOG_ERROR>(
              "Nan (%g) in density at %dx%dx%d", out, i, j, k);
          MPI_Communication::instance()->abort();
        }
        return out;
      }));
}

void ForwardDowngrade::adjointModel_v2(ModelInputAdjoint<3> in_gradient_delta) {
  LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
  // Build strict range views (we do not want to see the FFTW padding);
  // ensure that the view object lives till the end of this function.

  in_gradient_delta.setRequestedIO(PREFERRED_REAL);

  ghosts.release();
  ghosts.updatePlaneDims(std::array<ssize_t, 2>{lo_mgr->N1, lo_mgr->N2});
  ghosts.allocate();
  ghosts.clear_ghosts();
  level.clear_cache();

  fwrap(*ag_array) = 0;

  size_t startN0 = out_mgr->startN0;
  size_t endN0 = startN0 + out_mgr->localN0;
  size_t N1 = out_mgr->N1;
  size_t N2 = out_mgr->N2;

  auto &in_grad = in_gradient_delta.getRealConst();

#pragma omp parallel for collapse(3)
  for (size_t i = startN0; i < endN0; i++) {
    for (size_t j = 0; j < N1; j++) {
      for (size_t k = 0; k < N2; k++) {
        level.template push_ag_density_level<1>(
            in_grad[i][j][k], ag_array->get_array(), i, j, k);
      }
    }
  }

  level.ag_buildLevels(ghosts, ag_array->get_array());

  ghosts.synchronize_ag(ag_array->get_array());
}

void ForwardDowngrade::getAdjointModelOutput(
    ModelOutputAdjoint<3> out_gradient_delta) {
  out_gradient_delta.setRequestedIO(PREFERRED_REAL);
  // Remember: the output of the gradient is conformal to the input of the model (thus slicing with lo_mgr).
  auto w_out_gradient = fwrap(array::slice_array(
      out_gradient_delta.getRealOutput(), lo_mgr->strict_range()));

  w_out_gradient = fwrap(*ag_array);
}

static std::shared_ptr<BORGForwardModel> build_downgrade(
    MPI_Communication *comm, BoxModel const &box, PropertyProxy const &params) {

  // TODO: Setup transfer function
  auto model = std::make_shared<ForwardDowngrade>(comm, box);
  return model;
}

LIBLSS_REGISTER_FORWARD_IMPL(Downgrade, build_downgrade);

// ARES TAG: authors_num = 1
// ARES TAG: name(0) = Guilhem Lavaux
// ARES TAG: year(0) = 2020
// ARES TAG: email(0) = guilhem.lavaux@iap.fr
68 extra/borg/libLSS/physics/forwards/downgrade.hpp Normal file
@@ -0,0 +1,68 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/downgrade.hpp
|
||||
Copyright (C) 2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_HADES_FORWARD_DOWNGRADE_HPP
|
||||
# define __LIBLSS_HADES_FORWARD_DOWNGRADE_HPP
|
||||
# pragma once
|
||||
|
||||
# include "libLSS/physics/forward_model.hpp"
|
||||
# include "libLSS/tools/fusewrapper.hpp"
|
||||
# include "libLSS/tools/errors.hpp"
|
||||
# include "libLSS/physics/forwards/registry.hpp"
|
||||
# include "libLSS/physics/bias/level_combinator.hpp"
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
/**
|
||||
* This class handles the convolution of a real field by some Fourier kernel.
|
||||
*/
|
||||
class ForwardDowngrade : public BORGForwardModel {
|
||||
|
||||
protected:
|
||||
ModelInput<3> hold_input;
|
||||
|
||||
typedef Combinator::Levels<double, 1, 1> Level_t;
|
||||
|
||||
std::shared_ptr<DFT_Manager::U_ArrayReal> ag_array;
|
||||
|
||||
Level_t level;
|
||||
GhostPlanes<double, 2> ghosts;
|
||||
|
||||
public:
|
||||
/**
|
||||
* Constructor.
|
||||
*/
|
||||
explicit ForwardDowngrade(MPI_Communication *comm, const BoxModel &box);
|
||||
|
||||
PreferredIO getPreferredInput() const override { return PREFERRED_REAL; }
|
||||
PreferredIO getPreferredOutput() const override { return PREFERRED_REAL; }
|
||||
|
||||
bool densityInvalidated() const override { return false; }
|
||||
|
||||
void forwardModel_v2(ModelInput<3> delta_init) override;
|
||||
void getDensityFinal(ModelOutput<3> delta_output) override;
|
||||
|
||||
void adjointModel_v2(ModelInputAdjoint<3> in_gradient_delta) override;
|
||||
void
|
||||
getAdjointModelOutput(ModelOutputAdjoint<3> out_gradient_delta) override;
|
||||
|
||||
void forwardModelRsdField(ArrayRef &, double *) override {}
|
||||
|
||||
void releaseParticles() override {}
|
||||
}; // class ForwardDowngrade
|
||||
|
||||
} // namespace LibLSS
|
||||
|
||||
LIBLSS_REGISTER_FORWARD_DECL(Downgrade);
|
||||
|
||||
#endif
|
||||
|
||||
// ARES TAG: authors_num = 1
|
||||
// ARES TAG: name(0) = Guilhem Lavaux
|
||||
// ARES TAG: year(0) = 2020
|
||||
// ARES TAG: email(0) = guilhem.lavaux@iap.fr
|
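The interface declared above is a two-phase protocol: forwardModel_v2 pushes the input and getDensityFinal pulls the result, and likewise adjointModel_v2 / getAdjointModelOutput for the gradient. A self-contained toy mirror of that calling order (a trivial scale-by-two operator standing in for the real model; none of this is libLSS API):

#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

// Toy stand-in for the two-phase forward-model protocol: push the input
// first, pull the result afterwards. The "model" is a scale-by-two
// operator, and the adjoint of a linear map reuses the same factor.
class ToyForward {
  std::vector<double> held;    // mirrors hold_input
  std::vector<double> held_ag; // mirrors the held adjoint input

public:
  void forwardModel_v2(std::vector<double> in) { held = std::move(in); }
  void getDensityFinal(std::vector<double> &out) {
    out.resize(held.size());
    for (std::size_t i = 0; i < held.size(); i++)
      out[i] = 2.0 * held[i];
  }
  void adjointModel_v2(std::vector<double> in_grad) { held_ag = std::move(in_grad); }
  void getAdjointModelOutput(std::vector<double> &out_grad) {
    out_grad.resize(held_ag.size());
    for (std::size_t i = 0; i < held_ag.size(); i++)
      out_grad[i] = 2.0 * held_ag[i];
  }
};

int main() {
  ToyForward model;
  model.forwardModel_v2({1.0, 2.0}); // phase 1: push the input
  std::vector<double> out;
  model.getDensityFinal(out);        // phase 2: pull the output
  assert(out[1] == 4.0);
  return 0;
}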
84
extra/borg/libLSS/physics/forwards/enforceMass.cpp
Normal file
|
@ -0,0 +1,84 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/enforceMass.cpp
|
||||
Copyright (C) 2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#include <string>
|
||||
#include "libLSS/tools/console.hpp"
|
||||
#include "libLSS/physics/model_io.hpp"
|
||||
#include "libLSS/physics/forwards/enforceMass.hpp"
|
||||
#include "libLSS/tools/fusewrapper.hpp"
|
||||
#include "libLSS/physics/forwards/registry.hpp"
|
||||
#include "libLSS/tools/ptree_proxy.hpp"
|
||||
|
||||
using namespace LibLSS;
|
||||
|
||||
void ForwardEnforceMass::forwardModel_v2(ModelInput<3> delta_init) {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
|
||||
// Setup the IO basis that is required for this forward.
|
||||
delta_init.setRequestedIO(PREFERRED_REAL);
|
||||
|
||||
hold_input = std::move(delta_init);
|
||||
full_sum =
|
||||
(fwrap(hold_input.getRealConst()[lo_mgr->strict_range()]) + 1.0).sum();
|
||||
|
||||
comm->all_reduce_t(MPI_IN_PLACE, &full_sum, 1, MPI_SUM);
|
||||
}
|
||||
|
||||
void ForwardEnforceMass::getDensityFinal(ModelOutput<3> delta_output) {
|
||||
delta_output.setRequestedIO(PREFERRED_REAL);
|
||||
auto w_delta_init = fwrap(hold_input.getRealConst()) + 1.0;
|
||||
auto w_delta_output = fwrap(delta_output.getRealOutput());
|
||||
|
||||
double mean = full_sum / get_box_model().numElements();
|
||||
|
||||
w_delta_output = w_delta_init / mean - 1.0;
|
||||
}
|
||||
|
||||
void ForwardEnforceMass::adjointModel_v2(
|
||||
ModelInputAdjoint<3> in_gradient_delta) {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
// Build strict-range views (we do not want to see the FFTW padding);
|
||||
// ensure that the view object lives until the end of this function.
|
||||
|
||||
in_gradient_delta.setRequestedIO(PREFERRED_REAL);
|
||||
hold_ag_input = std::move(in_gradient_delta);
|
||||
}
|
||||
|
||||
void ForwardEnforceMass::getAdjointModelOutput(
|
||||
ModelOutputAdjoint<3> out_gradient_delta) {
|
||||
out_gradient_delta.setRequestedIO(PREFERRED_REAL);
|
||||
auto w_in_gradient = fwrap(hold_ag_input.getRealConst());
|
||||
auto w_out_gradient = fwrap(out_gradient_delta.getRealOutput());
|
||||
auto w_delta_init = fwrap(hold_input.getRealConst()) + 1.0;
|
||||
|
||||
double mean = full_sum / get_box_model().numElements();
|
||||
|
||||
double full_sum_grad =
|
||||
(fwrap(hold_ag_input.getRealConst()[lo_mgr->strict_range()])).sum();
|
||||
|
||||
comm->all_reduce_t(MPI_IN_PLACE, &full_sum_grad, 1, MPI_SUM);
|
||||
|
||||
w_out_gradient =
|
||||
1.0 / mean * (w_in_gradient - w_delta_init * full_sum_grad / full_sum);
|
||||
}
|
||||
|
||||
static std::shared_ptr<BORGForwardModel> build_enforcemass(
|
||||
MPI_Communication *comm, BoxModel const &box, PropertyProxy const ¶ms) {
|
||||
|
||||
// TODO: Setup transfer function
|
||||
auto model = std::make_shared<ForwardEnforceMass>(comm, box);
|
||||
return model;
|
||||
}
|
||||
|
||||
LIBLSS_REGISTER_FORWARD_IMPL(EnforceMass, build_enforcemass);
|
||||
|
||||
// ARES TAG: authors_num = 1
|
||||
// ARES TAG: name(0) = Guilhem Lavaux
|
||||
// ARES TAG: year(0) = 2020
|
||||
// ARES TAG: email(0) = guilhem.lavaux@iap.fr
|
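The adjoint expression above, 1/mean * (g - (1 + delta) * sum(g)/S), follows by differentiating out_i = (1 + x_i)/mean(1 + x) - 1. A standalone, serial check (toy data, no MPI, no libLSS types) comparing that formula against a finite difference of sum_i g_i * out_i:

#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

// delta_out = (1 + delta_in) / mean(1 + delta_in) - 1
static std::vector<double> forward(const std::vector<double> &x) {
  double S = 0;
  for (double xi : x)
    S += 1.0 + xi;
  double mean = S / x.size();
  std::vector<double> out(x.size());
  for (std::size_t i = 0; i < x.size(); i++)
    out[i] = (1.0 + x[i]) / mean - 1.0;
  return out;
}

int main() {
  std::vector<double> x = {0.3, -0.2, 0.5, 0.1}, g = {1.0, -2.0, 0.5, 3.0};
  double S = 0, sum_g = 0;
  for (std::size_t i = 0; i < x.size(); i++) {
    S += 1.0 + x[i];
    sum_g += g[i];
  }
  double mean = S / x.size();

  for (std::size_t j = 0; j < x.size(); j++) {
    // analytic adjoint, same expression as getAdjointModelOutput above
    double adj = (g[j] - (1.0 + x[j]) * sum_g / S) / mean;

    // finite-difference check of d(sum_i g_i out_i)/d x_j
    const double eps = 1e-6;
    std::vector<double> xp = x, xm = x;
    xp[j] += eps;
    xm[j] -= eps;
    std::vector<double> op = forward(xp), om = forward(xm);
    double fd = 0;
    for (std::size_t i = 0; i < x.size(); i++)
      fd += g[i] * (op[i] - om[i]);
    fd /= 2 * eps;

    assert(std::abs(adj - fd) < 1e-5);
  }
  return 0;
}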
61
extra/borg/libLSS/physics/forwards/enforceMass.hpp
Normal file
|
@ -0,0 +1,61 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/enforceMass.hpp
|
||||
Copyright (C) 2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_HADES_FORWARD_ENFORCEMASS_HPP
|
||||
# define __LIBLSS_HADES_FORWARD_ENFORCEMASS_HPP
|
||||
# pragma once
|
||||
|
||||
# include "libLSS/physics/forward_model.hpp"
|
||||
# include "libLSS/tools/fusewrapper.hpp"
|
||||
# include "libLSS/tools/errors.hpp"
|
||||
# include "libLSS/physics/forwards/registry.hpp"
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
/**
|
||||
* This class enforces a total-mass constraint by rescaling (1 + delta) by its global mean.
|
||||
*/
|
||||
class ForwardEnforceMass : public BORGForwardModel {
|
||||
protected:
|
||||
ModelInput<3> hold_input;
|
||||
ModelInputAdjoint<3> hold_ag_input;
|
||||
|
||||
double full_sum;
|
||||
|
||||
public:
|
||||
/**
|
||||
* Constructor.
|
||||
*/
|
||||
explicit ForwardEnforceMass(MPI_Communication *comm, const BoxModel &box)
|
||||
: BORGForwardModel(comm, box) {
|
||||
ensureInputEqualOutput();
|
||||
}
|
||||
|
||||
virtual PreferredIO getPreferredInput() const { return PREFERRED_REAL; }
|
||||
virtual PreferredIO getPreferredOutput() const { return PREFERRED_REAL; }
|
||||
|
||||
virtual void forwardModel_v2(ModelInput<3> delta_init);
|
||||
virtual void getDensityFinal(ModelOutput<3> delta_output);
|
||||
|
||||
virtual void adjointModel_v2(ModelInputAdjoint<3> in_gradient_delta);
|
||||
virtual void
|
||||
getAdjointModelOutput(ModelOutputAdjoint<3> out_gradient_delta);
|
||||
|
||||
}; // class ForwardEnforceMass
|
||||
|
||||
} // namespace LibLSS
|
||||
|
||||
LIBLSS_REGISTER_FORWARD_DECL(EnforceMass);
|
||||
|
||||
#endif
|
||||
|
||||
// ARES TAG: authors_num = 1
|
||||
// ARES TAG: name(0) = Guilhem Lavaux
|
||||
// ARES TAG: year(0) = 2020
|
||||
// ARES TAG: email(0) = guilhem.lavaux@iap.fr
|
76
extra/borg/libLSS/physics/forwards/julia.cpp
Normal file
|
@ -0,0 +1,76 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/julia.cpp
|
||||
Copyright (C) 2019-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#include <boost/format.hpp>
|
||||
#include "libLSS/julia/julia.hpp"
|
||||
#include "libLSS/physics/forwards/julia.hpp"
|
||||
|
||||
using namespace LibLSS;
|
||||
using boost::format;
|
||||
using boost::str;
|
||||
|
||||
static inline std::string forward_module_initialize(std::string const &mname) {
|
||||
return mname + ".Forward";
|
||||
}
|
||||
|
||||
static inline std::string forward_module_adjoint(std::string const &mname) {
|
||||
return mname + ".adjoint_gradient";
|
||||
}
|
||||
|
||||
static inline std::string forward_module_forward(std::string const &mname) {
|
||||
return mname + ".forward";
|
||||
}
|
||||
|
||||
Julia::Object Julia::make_simulation_box(const BoxModel &box) {
|
||||
Object o;
|
||||
|
||||
return Julia::evaluate(
|
||||
str(format("libLSS.BoxModel((%g,%g,%g),[%u,%u,%u])") % box.L0 % box.L1 %
|
||||
box.L2 % box.N0 % box.N1 % box.N2));
|
||||
}
|
||||
|
||||
JuliaForward::JuliaForward(
|
||||
MPI_Communication *comm, const BoxModel &box, const std::string &code_name,
|
||||
const std::string &_module_name)
|
||||
: BORGForwardModel(comm, box), module_name(_module_name) {
|
||||
Console::instance().print<LOG_INFO>(
|
||||
"Loading code " + code_name + " in julia VM");
|
||||
// TODO, only reevaluate if needed
|
||||
Julia::evaluate("include(\"" + code_name + "\")");
|
||||
|
||||
// Create the adequate julia object.
|
||||
forward_object = Julia::invoke(
|
||||
forward_module_initialize(module_name), Julia::make_simulation_box(box));
|
||||
}
|
||||
|
||||
JuliaForward::~JuliaForward() {
|
||||
// forward_object self destruct here.
|
||||
}
|
||||
|
||||
void JuliaForward::forwardModel_v2(ModelInput<3> delta_init) {
|
||||
// Julia::Object init_d = Julia::box(delta_init);
|
||||
//
|
||||
// Julia::invoke(
|
||||
// forward_module_forward(module_name), forward_object, init_d);
|
||||
}
|
||||
|
||||
void JuliaForward::getDensityFinal(ModelOutput<3> delta_out) {
|
||||
}
|
||||
|
||||
|
||||
void JuliaForward::adjointModel_v2(ModelInputAdjoint<3> in_gradient) {}
|
||||
void JuliaForward::getAdjointModelOutput(ModelOutputAdjoint<3> in_gradient) {}
|
||||
|
||||
void JuliaForward::releaseParticles() {}
|
||||
|
||||
void JuliaForward::updateCosmo() {}
|
||||
|
||||
// ARES TAG: authors_num = 1
|
||||
// ARES TAG: name(0) = Guilhem Lavaux
|
||||
// ARES TAG: email(0) = guilhem.lavaux@iap.fr
|
||||
// ARES TAG: year(0) = 2019-2020
|
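make_simulation_box above builds a Julia expression as a plain string via boost::format. The following standalone snippet (arbitrary example box values, no Julia VM involved) shows the exact string that results, which can help when debugging the Julia side:

#include <boost/format.hpp>
#include <cassert>
#include <string>

// Build the same kind of Julia expression string as make_simulation_box.
int main() {
  using boost::format;
  using boost::str;
  double L0 = 250.0, L1 = 250.0, L2 = 250.0;
  unsigned N0 = 128, N1 = 128, N2 = 128;
  std::string expr = str(
      format("libLSS.BoxModel((%g,%g,%g),[%u,%u,%u])") % L0 % L1 % L2 % N0 %
      N1 % N2);
  assert(expr == "libLSS.BoxModel((250,250,250),[128,128,128])");
  return 0;
}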
49
extra/borg/libLSS/physics/forwards/julia.hpp
Normal file
|
@ -0,0 +1,49 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/julia.hpp
|
||||
Copyright (C) 2019-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_PHYSICS_JULIA_HPP
|
||||
#define __LIBLSS_PHYSICS_JULIA_HPP
|
||||
|
||||
#include "libLSS/samplers/core/types_samplers.hpp"
|
||||
#include "libLSS/samplers/rgen/hmc/hmc_density_sampler.hpp"
|
||||
#include "libLSS/physics/forward_model.hpp"
|
||||
#include "libLSS/mcmc/global_state.hpp"
|
||||
#include "libLSS/julia/julia.hpp"
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
namespace Julia {
|
||||
Object make_simulation_box(const BoxModel &box);
|
||||
}
|
||||
|
||||
class JuliaForward : public BORGForwardModel {
|
||||
protected:
|
||||
std::string module_name;
|
||||
Julia::Object forward_object;
|
||||
|
||||
public:
|
||||
JuliaForward(
|
||||
MPI_Communication *comm, const BoxModel &box,
|
||||
const std::string &code_name, const std::string &module_name);
|
||||
virtual ~JuliaForward();
|
||||
|
||||
virtual void forwardModel_v2(ModelInput<3> delta_init);
|
||||
virtual void getDensityFinal(ModelOutput<3> delta_out);
|
||||
virtual void adjointModel_v2(ModelInputAdjoint<3> gradient_delta);
|
||||
virtual void getAdjointModelOutput(ModelOutputAdjoint<3> gradient_delta);
|
||||
virtual void releaseParticles();
|
||||
virtual void updateCosmo();
|
||||
};
|
||||
} // namespace LibLSS
|
||||
|
||||
#endif
|
||||
// ARES TAG: authors_num = 1
|
||||
// ARES TAG: name(0) = Guilhem Lavaux
|
||||
// ARES TAG: email(0) = guilhem.lavaux@iap.fr
|
||||
// ARES TAG: year(0) = 2019-2020
|
||||
|
553
extra/borg/libLSS/physics/forwards/lep/borg_fwd_lep.cpp
Normal file
|
@ -0,0 +1,553 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/lep/borg_fwd_lep.cpp
|
||||
Copyright (C) 2014-2018 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2018 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
template <typename CIC>
|
||||
void BorgLEPModel<CIC>::lep_ic(
|
||||
CArrayRef &deltao, TapeArrayRef &pos, TapeArrayRef &vel,
|
||||
TimingArray &timing) {
|
||||
///set cosmological parameters
|
||||
///Initial density is scaled to initial redshift!!!
|
||||
Cosmology cosmo(cosmo_params);
|
||||
|
||||
double an = timing[0][0];   ///set position ics at r_{0}, calculate till present epoch
double anh = timing[1][0];  ///velocities are created at v_{0-1/2}, calculate till present epoch
|
||||
double D0 = cosmo.d_plus(ai);
|
||||
double D1 = cosmo.d_plus(an) / D0;
|
||||
double Df1 = cosmo.d_plus(anh) / D0;
|
||||
double f1 = cosmo.g_plus(anh);
|
||||
double Hubble = cosmo.Hubble(anh) / cosmo_params.h; ///km /sec /(Mpc/h)
|
||||
|
||||
Console::instance().print<LOG_DEBUG>(
|
||||
format("D0=%g, D1=%g, Df1=%g, f1=%g, Hubble=%g") % D0 % D1 % Df1 % f1 %
|
||||
Hubble);
|
||||
|
||||
typedef UninitializedArray<
|
||||
FFTW_Complex_Array, FFTW_Allocator<std::complex<double>>>
|
||||
U_CArray;
|
||||
typedef U_CArray::array_type Ref_CArray;
|
||||
|
||||
U_CArray tmp_p(mgr->extents_complex(), mgr->allocator_complex);
|
||||
Ref_CArray &tmp = tmp_p.get_array();
|
||||
|
||||
///allocate new array for Eulerian grav-pot calculation
|
||||
U_CArray tmp_g(mgr->extents_complex(), mgr->allocator_complex);
|
||||
Ref_CArray &tmp_grav = tmp_g.get_array();
|
||||
|
||||
Uninit_FFTW_Real_Array pot_p(mgr->extents_real(), mgr->allocator_real);
|
||||
Uninit_FFTW_Real_Array::array_type &pot = pot_p.get_array();
|
||||
|
||||
///set gravitational potential normalization
|
||||
//scale potential to first timestep
|
||||
double normphi =
|
||||
D1 * 3. / 2. * cosmo_params.omega_m *
|
||||
(unit_r0 * unit_r0); ///maybe we miss a Fourier normalization here
|
||||
|
||||
for (int axis = 0; axis < 3; axis++) {
|
||||
#pragma omp parallel for
|
||||
for (int i = c_startN0; i < c_startN0 + c_localN0; i++)
|
||||
for (int j = 0; j < c_N1; j++)
|
||||
for (int k = 0; k < c_N2_HC; k++) {
|
||||
double kk[3];
|
||||
kk[0] = kmode(i, c_N0, L0);
|
||||
kk[1] = kmode(j, c_N1, L1);
|
||||
kk[2] = kmode(k, c_N2, L2);
|
||||
|
||||
double ksquared = kk[0] * kk[0] + kk[1] * kk[1] + kk[2] * kk[2];
|
||||
double fac = -kk[axis] / (ksquared);
|
||||
|
||||
std::complex<double> &in_delta = deltao[i][j][k];
|
||||
|
||||
//calculate large scale gravitational potential
|
||||
tmp_grav[i][j][k] =
|
||||
(-1) * normphi * in_delta / (ksquared); //check sign!!!
|
||||
|
||||
tmp[i][j][k] = std::complex<double>(
|
||||
-fac * in_delta.imag(), fac * in_delta.real()
|
||||
|
||||
);
|
||||
}
|
||||
|
||||
if (c_startN0 == 0 && c_localN0 > 0) {
|
||||
tmp[0][0][0] = 0;
|
||||
tmp[0][0][c_N2 / 2] = 0;
|
||||
tmp[0][c_N1 / 2][0] = 0;
|
||||
tmp[0][c_N1 / 2][c_N2 / 2] = 0;
|
||||
tmp_grav[0][0][0] = 0;
|
||||
tmp_grav[0][0][c_N2 / 2] = 0;
|
||||
tmp_grav[0][c_N1 / 2][0] = 0;
|
||||
tmp_grav[0][c_N1 / 2][c_N2 / 2] = 0;
|
||||
}
|
||||
|
||||
if (c_startN0 <= c_N0 / 2 && c_startN0 + c_localN0 > c_N0 / 2) {
|
||||
tmp[c_N0 / 2][0][0] = 0;
|
||||
tmp[c_N0 / 2][0][c_N2 / 2] = 0;
|
||||
tmp[c_N0 / 2][c_N1 / 2][0] = 0;
|
||||
tmp[c_N0 / 2][c_N1 / 2][c_N2 / 2] = 0;
|
||||
tmp_grav[c_N0 / 2][0][0] = 0;
|
||||
tmp_grav[c_N0 / 2][0][c_N2 / 2] = 0;
|
||||
tmp_grav[c_N0 / 2][c_N1 / 2][0] = 0;
|
||||
tmp_grav[c_N0 / 2][c_N1 / 2][c_N2 / 2] = 0;
|
||||
}
|
||||
|
||||
///Now build lep forces and use pot.data() as a temporary field
|
||||
mgr->execute_c2r(c_synthesis_plan, tmp_grav.data(), pot.data());
|
||||
|
||||
compute_lep_force<0, false, 1>(g_lep0->get_array(), pot);
|
||||
compute_lep_force<1, false, 1>(g_lep1->get_array(), pot);
|
||||
compute_lep_force<2, false, 1>(g_lep2->get_array(), pot);
|
||||
|
||||
#pragma omp parallel for
|
||||
for (long l = c_startN0; l < c_startN0 + c_localN0; l++)
|
||||
for (long m = 0; m < c_N1; m++)
|
||||
for (long n = 0; n < c_N2; n++) {
|
||||
size_t idx = n + c_N2 * m + c_N2 * c_N1 * (l - c_startN0);
|
||||
vel[0][idx][axis] = (*c_tmp_real_field)[l][m][n];
|
||||
}
|
||||
}
|
||||
|
||||
double vScaling = -Df1 * Hubble * f1 * anh * anh / unit_v0;
|
||||
|
||||
#pragma omp parallel for
|
||||
for (long l = c_startN0; l < c_startN0 + c_localN0; l++)
|
||||
for (long m = 0; m < c_N1; m++)
|
||||
for (long n = 0; n < c_N2; n++) {
|
||||
/// sort particles on equidistant grid
|
||||
double q0 = L0 / double(c_N0) * double(l);
|
||||
double q1 = L1 / double(c_N1) * double(m);
|
||||
double q2 = L2 / double(c_N2) * double(n);
|
||||
|
||||
size_t idx = n + c_N2 * m + c_N2 * c_N1 * (l - c_startN0);
|
||||
double x = q0 - D1 * vel[0][idx][0];
|
||||
double y = q1 - D1 * vel[0][idx][1];
|
||||
double z = q2 - D1 * vel[0][idx][2];
|
||||
|
||||
///enforce periodic boundary conditions
|
||||
pos[0][idx][0] = periodic_fix(x, 0., L0);
|
||||
pos[0][idx][1] = periodic_fix(y, 0., L1);
|
||||
pos[0][idx][2] = periodic_fix(z, 0., L2);
|
||||
|
||||
///store velocities in km/sec
|
||||
///note we multiply by a² to get the correct momentum variable for the lep code
|
||||
///and normalize to code units
|
||||
vel[0][idx][0] *= vScaling;
|
||||
vel[0][idx][1] *= vScaling;
|
||||
vel[0][idx][2] *= vScaling;
|
||||
}
|
||||
}
|
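In Fourier space the initialization above amounts to the standard Zel'dovich-type setup; writing the relations out makes the stored quantities explicit (a sketch that matches the loop, with the same normalization caveat noted in the code comments):

\hat\Psi_a(\mathbf{k}) = -\,\frac{i\,k_a}{k^2}\,\hat\delta(\mathbf{k}),
\qquad
\hat\phi(\mathbf{k}) = -\,\tfrac{3}{2}\,\Omega_m\,D_1\,(r_0^{\mathrm{unit}})^2\,\frac{\hat\delta(\mathbf{k})}{k^2},

x_a = q_a - D_1\,\Psi_a,
\qquad
v_a = -\,\frac{D_{f1}\,f_1\,a_{nh}^2\,H(a_{nh})}{v_0^{\mathrm{unit}}}\;\Psi_a,

where Psi is the displacement held in vel before the final rescaling, D_1 and D_{f1} are growth factors normalized to D(a_i), and the zero mode and the Nyquist corner modes are zeroed by hand.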
||||
|
||||
template <typename CIC>
|
||||
template <typename PositionArray, typename RedshiftPosition>
|
||||
void BorgLEPModel<CIC>::lep_redshift_pos(
|
||||
const PositionArray &pos, const PositionArray &vel,
|
||||
RedshiftPosition &s_pos) {
|
||||
Cosmology cosmo(cosmo_params);
|
||||
|
||||
//this routine generates particle positions in redshift space
|
||||
double af = 1.; ///km /sec /Mpc
|
||||
double anh = 1.;
|
||||
double Hubble = cosmo.Hubble(anh) / cosmo_params.h; ///km /sec /(Mpc/h)
|
||||
|
||||
// the code uses particle momenta p=a^2 dx/dt where x is the co-moving position
|
||||
// peculiar velocities are then given by v_pec = p/a
|
||||
|
||||
//NOTE: Check coefficients
|
||||
ArrayType1d::ArrayType &observer = vobs;
|
||||
double facRSD = 1. / af / Hubble; //this factor is 1/(a H) for velocities in [km/sec]; an additional factor arises from momentum conversion
|
||||
|
||||
#pragma omp parallel for
|
||||
for (long l = c_startN0; l < c_startN0 + c_localN0; l++)
|
||||
for (long m = 0; m < c_N1; m++)
|
||||
for (long n = 0; n < c_N2; n++) {
|
||||
size_t idx = n + c_N2 * m + c_N2 * c_N1 * (l - c_startN0);
|
||||
double x0g = pos[idx][0];
|
||||
double x1g = pos[idx][1];
|
||||
double x2g = pos[idx][2];
|
||||
|
||||
double x0 = x0g + xmin0;
|
||||
double x1 = x1g + xmin1;
|
||||
double x2 = x2g + xmin2;
|
||||
|
||||
double v0 = vel[idx][0] + observer[0];
|
||||
double v1 = vel[idx][1] + observer[1];
|
||||
double v2 = vel[idx][2] + observer[2];
|
||||
|
||||
double r2_los = x0 * x0 + x1 * x1 + x2 * x2;
|
||||
double v_los = v0 * x0 + v1 * x1 + v2 * x2;
|
||||
|
||||
double A = facRSD * v_los / r2_los;
|
||||
|
||||
double s0 = x0g + A * x0;
|
||||
double s1 = x1g + A * x1;
|
||||
double s2 = x2g + A * x2;
|
||||
|
||||
///enforce periodic boundary conditions
|
||||
s_pos[idx][0] = periodic_fix(s0, 0., L0);
|
||||
s_pos[idx][1] = periodic_fix(s1, 0., L1);
|
||||
s_pos[idx][2] = periodic_fix(s2, 0., L2);
|
||||
}
|
||||
}
|
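The mapping implemented above can be isolated into a small helper: the box coordinate is shifted along the line of sight by the scaled radial velocity. The sketch below (standalone, no libLSS types) takes the observer-centred position x directly and ignores the xmin offset handled in the loop; facRSD stands for 1/(a H) as in the routine.

#include <array>

// Redshift-space mapping: s = x + [(v . x) / (|x|^2 a H)] x
std::array<double, 3> to_redshift_space(
    const std::array<double, 3> &x, const std::array<double, 3> &v,
    double facRSD) {
  double r2 = x[0] * x[0] + x[1] * x[1] + x[2] * x[2];
  double v_los = v[0] * x[0] + v[1] * x[1] + v[2] * x[2];
  double A = facRSD * v_los / r2;
  return {x[0] + A * x[0], x[1] + A * x[1], x[2] + A * x[2]};
}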
||||
|
||||
template <typename CIC>
|
||||
template <typename PositionArray>
|
||||
void BorgLEPModel<CIC>::lep_gravpot(
|
||||
const PositionArray &pos, FFTW_Real_Array_ref &pot) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("gravitational solver");
|
||||
double nmean = CosmoTool::cube(double(ss_factor) / f_factor);
|
||||
|
||||
array::fill(pot, 0);
|
||||
CIC::projection(pos, pot, L0, L1, L2, f_N0, f_N1, f_N2);
|
||||
array::density_rescale(pot, nmean);
|
||||
|
||||
//transform density to F-space
|
||||
CArray &f_AUX0 = *f_AUX0_p;
|
||||
force_mgr->execute_r2c(f_analysis_plan, pot.data(), f_AUX0.data());
|
||||
double normphi = 3. / 2. * cosmo_params.omega_m / double(f_N0 * f_N1 * f_N2) *
|
||||
(unit_r0 * unit_r0);
|
||||
|
||||
#pragma omp parallel for
|
||||
for (long i = f_startN0; i < f_startN0 + f_localN0; i++) {
|
||||
double sin20 = sin2K[0][i];
|
||||
for (long j = 0; j < f_N1; j++) {
|
||||
double sin21 = sin2K[1][j];
|
||||
for (long k = 0; k < f_N2_HC; k++) {
|
||||
double sin22 = sin2K[2][k];
|
||||
double Greens = -normphi / (sin20 + sin21 + sin22);
|
||||
|
||||
f_AUX0[i][j][k] *= Greens;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//fix zero mode by hand
|
||||
if (f_startN0 == 0 && f_localN0 > 0) {
|
||||
f_AUX0[0][0][0] = 0;
|
||||
}
|
||||
|
||||
force_mgr->execute_c2r(f_synthesis_plan, f_AUX0.data(), pot.data());
|
||||
}
|
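The Fourier-space loop above is a Poisson solve with the Green's function of the finite-difference Laplacian on the force mesh; assuming sin2K[a] tabulates the squared-sine eigenvalues of that Laplacian, the relation is

\hat\phi(\mathbf{k}) \;=\; -\,\frac{\tfrac{3}{2}\,\Omega_m\,(r_0^{\mathrm{unit}})^2}{f_{N_0}\,f_{N_1}\,f_{N_2}}\;
\frac{\hat\delta(\mathbf{k})}{\sin^2 K_0 + \sin^2 K_1 + \sin^2 K_2},
\qquad \hat\phi(\mathbf{0}) = 0,

where the 1/(f_N0 f_N1 f_N2) factor absorbs the unnormalized FFTW round trip and the zero mode is fixed by hand.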
||||
|
||||
template <typename CIC>
|
||||
template <int axis, bool accum, int sign>
|
||||
void BorgLEPModel<CIC>::compute_lep_force(
|
||||
FFTW_Real_Array_ref &g, FFTW_Real_Array_ref &pot) {
|
||||
long N[3] = {N0, N1, N2};
|
||||
double i_d[3] = {sign * N0 / (unit_r0 * L0), sign * N1 / (unit_r0 * L1),
|
||||
sign * N2 / (unit_r0 * L2)};
|
||||
typedef FFTW_Real_Array::index index_type;
|
||||
|
||||
#pragma omp parallel for
|
||||
for (long i = startN0; i < startN0 + localN0; i++)
|
||||
for (long j = 0; j < N1; j++)
|
||||
for (long k = 0; k < N2; k++) {
|
||||
boost::array<index_type, 3> idxp = {i, j, k};
|
||||
boost::array<index_type, 3> idxm = {i, j, k};
|
||||
|
||||
idxp[axis]++;
|
||||
idxm[axis]--;
|
||||
if (idxp[axis] > N[axis] - 1)
|
||||
idxp[axis] -= N[axis];
|
||||
if (idxm[axis] < 0)
|
||||
idxm[axis] += N[axis];
|
||||
|
||||
double value = -0.5 * (pot(idxp) - pot(idxm)) * i_d[axis];
|
||||
push_to<accum>::apply(g[i][j][k], value);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
template <int axis, bool accum, int sign>
|
||||
void BorgLEPModel<CIC>::compute_force(
|
||||
FFTW_Real_Array_ref &g, FFTW_Real_Array_ref &pot) {
|
||||
long N[3] = {f_N0, f_N1, f_N2};
|
||||
double i_d[3] = {sign * f_N0 / (unit_r0 * L0), sign * f_N1 / (unit_r0 * L1),
|
||||
sign * f_N2 / (unit_r0 * L2)};
|
||||
typedef FFTW_Real_Array::index index_type;
|
||||
|
||||
#pragma omp parallel for
|
||||
for (long i = f_startN0; i < f_startN0 + f_localN0; i++)
|
||||
for (long j = 0; j < f_N1; j++)
|
||||
for (long k = 0; k < f_N2; k++) {
|
||||
boost::array<index_type, 3> idxp = {i, j, k};
|
||||
boost::array<index_type, 3> idxm = {i, j, k};
|
||||
|
||||
idxp[axis]++;
|
||||
idxm[axis]--;
|
||||
if (idxp[axis] > N[axis] - 1)
|
||||
idxp[axis] -= N[axis];
|
||||
if (idxm[axis] < 0)
|
||||
idxm[axis] += N[axis];
|
||||
|
||||
double value = -0.5 * (pot(idxp) - pot(idxm)) * i_d[axis];
|
||||
push_to<accum>::apply(g[i][j][k], value);
|
||||
}
|
||||
}
|
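Both force routines above are the same central finite difference of the potential with periodic wrapping, applied axis by axis. A one-dimensional standalone analogue (dropping the unit_r0 factor and the sign/accumulate template parameters):

#include <cstddef>
#include <vector>

// 1D analogue of compute_force / compute_lep_force: central finite
// difference of a potential on a periodic grid, g = -(1/2) dphi/dx * (N/L).
std::vector<double> central_gradient_periodic(
    const std::vector<double> &phi, double L) {
  const std::size_t N = phi.size();
  const double i_d = double(N) / L; // inverse grid spacing
  std::vector<double> g(N);
  for (std::size_t i = 0; i < N; i++) {
    std::size_t ip = (i + 1) % N;
    std::size_t im = (i + N - 1) % N;
    g[i] = -0.5 * (phi[ip] - phi[im]) * i_d;
  }
  return g;
}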
||||
|
||||
template <typename CIC>
|
||||
void BorgLEPModel<CIC>::lep_vel_update(
|
||||
TapeArrayRef &pos, TapeArrayRef &vel, double dtv, double dDv, int istep) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("vel update");
|
||||
|
||||
double i_d0 = f_N0 / L0;
|
||||
double i_d1 = f_N1 / L1;
|
||||
double i_d2 = f_N2 / L2;
|
||||
|
||||
long Np = pos.shape()[1];
|
||||
|
||||
//calculate forces and update velocities
|
||||
for (int axis = 0; axis < 3; axis++) {
|
||||
///interpolate lep forces to particles
|
||||
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (long i = 0; i < Np; i++) {
|
||||
|
||||
double x = pos[istep][i][0] * i_d0;
|
||||
double y = pos[istep][i][1] * i_d1;
|
||||
double z = pos[istep][i][2] * i_d2;
|
||||
|
||||
int ix = (int)std::floor(x);
|
||||
int iy = (int)std::floor(y);
|
||||
int iz = (int)std::floor(z);
|
||||
|
||||
int jx = (ix + 1) % f_N0;
|
||||
int jy = (iy + 1) % f_N1;
|
||||
int jz = (iz + 1) % f_N2;
|
||||
|
||||
double rx = (x - ix);
|
||||
double ry = (y - iy);
|
||||
double rz = (z - iz);
|
||||
|
||||
double qx = 1 - rx;
|
||||
double qy = 1 - ry;
|
||||
double qz = 1 - rz;
|
||||
|
||||
double force = 0.;
|
||||
switch (axis) {
|
||||
case 0:
|
||||
force = g_lep0->get_array()[ix][iy][iz] * qx * qy * qz +
|
||||
g_lep0->get_array()[ix][iy][jz] * qx * qy * rz +
|
||||
g_lep0->get_array()[ix][jy][iz] * qx * ry * qz +
|
||||
g_lep0->get_array()[ix][jy][jz] * qx * ry * rz +
|
||||
g_lep0->get_array()[jx][iy][iz] * rx * qy * qz +
|
||||
g_lep0->get_array()[jx][iy][jz] * rx * qy * rz +
|
||||
g_lep0->get_array()[jx][jy][iz] * rx * ry * qz +
|
||||
g_lep0->get_array()[jx][jy][jz] * rx * ry * rz;
|
||||
break;
|
||||
case 1:
|
||||
force = g_lep1->get_array()[ix][iy][iz] * qx * qy * qz +
|
||||
g_lep1->get_array()[ix][iy][jz] * qx * qy * rz +
|
||||
g_lep1->get_array()[ix][jy][iz] * qx * ry * qz +
|
||||
g_lep1->get_array()[ix][jy][jz] * qx * ry * rz +
|
||||
g_lep1->get_array()[jx][iy][iz] * rx * qy * qz +
|
||||
g_lep1->get_array()[jx][iy][jz] * rx * qy * rz +
|
||||
g_lep1->get_array()[jx][jy][iz] * rx * ry * qz +
|
||||
g_lep1->get_array()[jx][jy][jz] * rx * ry * rz;
|
||||
break;
|
||||
case 2:
|
||||
force = g_lep2->get_array()[ix][iy][iz] * qx * qy * qz +
|
||||
g_lep2->get_array()[ix][iy][jz] * qx * qy * rz +
|
||||
g_lep2->get_array()[ix][jy][iz] * qx * ry * qz +
|
||||
g_lep2->get_array()[ix][jy][jz] * qx * ry * rz +
|
||||
g_lep2->get_array()[jx][iy][iz] * rx * qy * qz +
|
||||
g_lep2->get_array()[jx][iy][jz] * rx * qy * rz +
|
||||
g_lep2->get_array()[jx][jy][iz] * rx * ry * qz +
|
||||
g_lep2->get_array()[jx][jy][jz] * rx * ry * rz;
|
||||
break;
|
||||
}
|
||||
force *=
|
||||
dDv; //multiply with linear growth factor for lep potential evolution
|
||||
vel[istep + 1][i][axis] = vel[istep][i][axis] + force * dtv;
|
||||
}
|
||||
}
|
||||
}
|
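The eight-term weighting in the force interpolation above is ordinary trilinear (CIC) interpolation. A standalone version for a flattened periodic grid, with the same (q, r) corner weights:

#include <cmath>
#include <cstddef>
#include <vector>

// Trilinear (CIC) interpolation of a periodic grid field at fractional grid
// coordinates (x, y, z); the field is stored flattened as f[(ix*N1+iy)*N2+iz].
double trilinear(
    const std::vector<double> &f, int N0, int N1, int N2, double x, double y,
    double z) {
  int ix = int(std::floor(x)), iy = int(std::floor(y)), iz = int(std::floor(z));
  int jx = (ix + 1) % N0, jy = (iy + 1) % N1, jz = (iz + 1) % N2;
  double rx = x - ix, ry = y - iy, rz = z - iz; // fractional offsets
  double qx = 1 - rx, qy = 1 - ry, qz = 1 - rz;
  auto at = [&](int a, int b, int c) {
    return f[(std::size_t(a) * N1 + b) * N2 + c];
  };
  return at(ix, iy, iz) * qx * qy * qz + at(ix, iy, jz) * qx * qy * rz +
         at(ix, jy, iz) * qx * ry * qz + at(ix, jy, jz) * qx * ry * rz +
         at(jx, iy, iz) * rx * qy * qz + at(jx, iy, jz) * rx * qy * rz +
         at(jx, jy, iz) * rx * ry * qz + at(jx, jy, jz) * rx * ry * rz;
}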
||||
|
||||
template <typename CIC>
|
||||
void BorgLEPModel<CIC>::lep_pos_update(
|
||||
TapeArrayRef &pos, TapeArrayRef &vel, double dtr, double dDr, int istep) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("pos update");
|
||||
long Np = pos.shape()[1];
|
||||
|
||||
#pragma omp parallel for
|
||||
for (long i = 0; i < Np; i++) {
|
||||
//NOTE: we stored the initial displacement vector in the initial velocity component
|
||||
double x = pos[istep][i][0] + vel[istep + 1][i][0] * dtr;
|
||||
double y = pos[istep][i][1] + vel[istep + 1][i][1] * dtr;
|
||||
double z = pos[istep][i][2] + vel[istep + 1][i][2] * dtr;
|
||||
|
||||
pos[istep + 1][i][0] = periodic_fix(x, 0., L0);
|
||||
pos[istep + 1][i][1] = periodic_fix(y, 0., L1);
|
||||
pos[istep + 1][i][2] = periodic_fix(z, 0., L2);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
void BorgLEPModel<CIC>::lep_stepping(
|
||||
TapeArrayRef &pos, TapeArrayRef &vel, int nstep, TimingArray &timing) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("BORG-lep stepping");
|
||||
|
||||
ctx.print(format("Doing %d timesteps of lep") % (nstep - 1));
|
||||
|
||||
///________________________________________________________
|
||||
/// lep code forward model
|
||||
///________________________________________________________
|
||||
|
||||
for (int nn = 0; nn < nstep - 1; nn++) {
|
||||
double dtr = timing[2][nn];
|
||||
double dtv = timing[3][nn];
|
||||
double dDr = timing[4][nn];
|
||||
double dDv = timing[5][nn];
|
||||
|
||||
lep_vel_update(pos, vel, dtv, dDv, nn);
|
||||
lep_pos_update(pos, vel, dtr, dDr, nn);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
void BorgLEPModel<CIC>::lep_gen_timesteps(
|
||||
double ai, double af, TimingArray &timing, int nstep) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("BORG-lep gen_timesteps");
|
||||
Cosmology cosmo(cosmo_params);
|
||||
|
||||
ctx.print(
|
||||
format("Building timesteps from ai=%g to af=%d in %d steps") % ai % af %
|
||||
nstep);
|
||||
|
||||
double du = (log(af) - log(ai)) / double(nstep - 1);
|
||||
|
||||
//need to scale lep potential according to pm initial conds.
|
||||
double D00 = cosmo.d_plus(ai);
|
||||
|
||||
for (int i = 0; i < nstep; i++) {
|
||||
double an0 = ai * exp(du * i);
|
||||
double an1 = ai * exp(du * (i + 1));
|
||||
|
||||
double anh0 = (an0 + ai * exp(du * (i - 1))) / 2.;
|
||||
double anh1 = (an0 + an1) / 2.;
|
||||
|
||||
double dtr = cosmo.dtr(an0, an1);
|
||||
double dtv = cosmo.dtv(anh0, anh1);
|
||||
|
||||
double D0 = cosmo.d_plus(ai);
|
||||
double dDr = (cosmo.d_plus(an1) - cosmo.d_plus(an0)) / D0;
|
||||
//double dDv=cosmo.d_plus(an0)/D00;
|
||||
//need to do a proper integral here
|
||||
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
double dDv = 0.5 * (cosmo.d_plus(an1) + cosmo.d_plus(an0)) / D00;
|
||||
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
|
||||
timing[0][i] = an0;
|
||||
timing[1][i] = anh0;
|
||||
timing[2][i] = dtr;
|
||||
timing[3][i] = dtv;
|
||||
timing[4][i] = dDr;
|
||||
timing[5][i] = dDv;
|
||||
}
|
||||
}
|
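The scale factors generated above are logarithmically spaced between ai and af, i.e. a_n = ai * exp(n * du) with du = (ln af - ln ai)/(nstep - 1). A standalone sketch of just that spacing:

#include <cmath>
#include <vector>

// Log-spaced scale factors between ai and af, as in lep_gen_timesteps.
std::vector<double> log_spaced_scale_factors(double ai, double af, int nstep) {
  std::vector<double> a(nstep);
  double du = (std::log(af) - std::log(ai)) / double(nstep - 1);
  for (int n = 0; n < nstep; n++)
    a[n] = ai * std::exp(du * n);
  return a; // a.front() == ai, a.back() == af (up to rounding)
}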
||||
|
||||
template <typename CIC>
|
||||
template <typename PositionArray>
|
||||
void BorgLEPModel<CIC>::lep_density_obs(
|
||||
const PositionArray &pos, ArrayRef &deltao) {
|
||||
double nmean = CosmoTool::cube(ss_factor);
|
||||
|
||||
array::fill(deltao, 0);
|
||||
CIC::projection(pos, deltao, L0, L1, L2, N0, N1, N2);
|
||||
array::density_rescale(deltao, nmean);
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
void BorgLEPModel<CIC>::forwardModelRsdField(
|
||||
ArrayRef &deltaf, double *vobs_ext) {
|
||||
|
||||
std::cout << "ERROR: VOBS META_SAMPLER NOT IMPLEMENTED" << std::endl;
|
||||
exit(1);
|
||||
/*
|
||||
ConsoleContext<LOG_DEBUG> ctx("BORG forward model rsd density calculation");
|
||||
|
||||
///introduce redshift distortions
|
||||
if (do_rsd) {
|
||||
UninitializedArray<PhaseArray> s_pos_p(extents[c_localN0*c_N1*c_N2][3]);
|
||||
|
||||
//ArrayType1d::ArrayType& dummy = vobs;
|
||||
|
||||
//set vobs to input
|
||||
double dummy[3];
|
||||
|
||||
dummy[0]=vobs[0];
|
||||
dummy[1]=vobs[1];
|
||||
dummy[2]=vobs[2];
|
||||
|
||||
vobs[0]=vobs_ext[0];
|
||||
vobs[1]=vobs_ext[1];
|
||||
vobs[2]=vobs_ext[2];
|
||||
|
||||
ctx.print("doing redshift space distortions.");
|
||||
lpt_redshift_pos(u_pos->get_array(), u_vel->get_array(), s_pos_p.get_array());
|
||||
|
||||
lpt_density_obs(s_pos_p.get_array(), deltaf);
|
||||
|
||||
//reset vobs
|
||||
vobs[0]=dummy[0];
|
||||
vobs[1]=dummy[1];
|
||||
vobs[2]=dummy[2];
|
||||
;
|
||||
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
void BorgLEPModel<CIC>::lep_fwd_model(
|
||||
CArrayRef &deltao, ArrayRef &deltaf, TapeArrayRef &pos, TapeArrayRef &vel,
|
||||
TimingArray &timing) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("BORG-LEP forward model");
|
||||
///NOTE: ICs are generated at ai
|
||||
///but the lepcode starts at ao and finishes at af
|
||||
double ao = 1. / (1. + z_start);
|
||||
double af = 1.;
|
||||
|
||||
/// we also choose some time steps
|
||||
int nstep = timing.shape()[1];
|
||||
|
||||
//generate time steps
|
||||
lep_gen_timesteps(ao, af, timing, nstep);
|
||||
|
||||
//generate initial conditions at ao
|
||||
if (c_deltao != 0) {
|
||||
array::fill(*c_deltao, 0);
|
||||
mgr->upgrade_complex(*lo_mgr, deltao, *c_deltao);
|
||||
lep_ic(*c_deltao, pos, vel, timing);
|
||||
} else
|
||||
lep_ic(deltao, pos, vel, timing);
|
||||
|
||||
//do the lep stepping
|
||||
lep_stepping(pos, vel, nstep, timing);
|
||||
|
||||
//build density field
|
||||
if (do_redshift) {
|
||||
UninitializedArray<PhaseArray> s_pos_p(extents[c_localN0 * c_N1 * c_N2][3]);
|
||||
ctx.print("doing redshift space distortions.");
|
||||
lep_redshift_pos(pos[nstep - 1], vel[nstep - 1], s_pos_p.get_array());
|
||||
lep_density_obs(s_pos_p.get_array(), deltaf);
|
||||
} else {
|
||||
lep_density_obs(pos[nstep - 1], deltaf);
|
||||
}
|
||||
}
|
||||
// ARES TAG: authors_num = 2
|
||||
// ARES TAG: name(0) = Guilhem Lavaux
|
||||
// ARES TAG: email(0) = guilhem.lavaux@iap.fr
|
||||
// ARES TAG: year(0) = 2014-2018
|
||||
// ARES TAG: name(1) = Jens Jasche
|
||||
// ARES TAG: email(1) = jens.jasche@fysik.su.se
|
||||
// ARES TAG: year(1) = 2009-2018
|
||||
|
468
extra/borg/libLSS/physics/forwards/lep/borg_fwd_lep_adj.cpp
Normal file
|
@ -0,0 +1,468 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/lep/borg_fwd_lep_adj.cpp
|
||||
Copyright (C) 2014-2018 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2018 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
|
||||
template <typename CIC>
|
||||
void BorgLEPModel<CIC>::lep_ic_ag(
|
||||
PhaseArrayRef &pos_ag, PhaseArrayRef &vel_ag, TimingArray &timing) {
|
||||
///set cosmological parameters
|
||||
///Initial density is scaled to initial redshift!!!
|
||||
Cosmology cosmo(cosmo_params);
|
||||
|
||||
double an = timing[0][0];   ///set position ics at r_{0}, calculate till present epoch
double anh = timing[1][0];  ///velocities are created at v_{0-1/2}, calculate till present epoch
|
||||
double D0 = cosmo.d_plus(ai);
|
||||
double D1 = cosmo.d_plus(an) / D0;
|
||||
double Df1 = cosmo.d_plus(anh) / D0;
|
||||
double f1 = cosmo.g_plus(anh);
|
||||
double Hubble = cosmo.Hubble(anh) / cosmo_params.h; ///km /sec /(Mpc/h)
|
||||
|
||||
///allocate auxiliary Fourier array
|
||||
FFTW_Complex_Array &AUX1 = *AUX1_p;
|
||||
FFTW_Real_Array &aux = *aux_p;
|
||||
FFTW_Complex_Array &AUX0 = *AUX0_p;
|
||||
|
||||
for (int axis = 0; axis < 3; axis++) {
|
||||
///1. Do position/velocity derivative
|
||||
///------------------------------------------------------------------------------
|
||||
|
||||
double fac_vel = -Df1 * f1 * anh * anh * Hubble / unit_v0;
|
||||
#pragma omp parallel for
|
||||
for (long l = c_startN0; l < c_startN0 + c_localN0; l++)
|
||||
for (long m = 0; m < c_N1; m++)
|
||||
for (long n = 0; n < c_N2; n++) {
|
||||
size_t idx = n + c_N2 * m + c_N2 * c_N1 * (l - c_startN0);
|
||||
aux[l][m][n] = -D1 * pos_ag[idx][axis] + fac_vel * vel_ag[idx][axis];
|
||||
}
|
||||
|
||||
/// FFT to F-space
|
||||
mgr->execute_r2c(c_analysis_plan, aux.data(), AUX0.data());
|
||||
|
||||
#pragma omp parallel for
|
||||
for (long i = c_startN0; i < c_startN0 + c_localN0; i++) {
|
||||
for (long j = 0; j < c_N1; j++) {
|
||||
for (long k = 0; k < c_N2_HC; k++) {
|
||||
double kk[3];
|
||||
kk[0] = kmode(i, c_N0, L0);
|
||||
kk[1] = kmode(j, c_N1, L1);
|
||||
kk[2] = kmode(k, c_N2, L2);
|
||||
|
||||
double ksquared = kk[0] * kk[0] + kk[1] * kk[1] + kk[2] * kk[2];
|
||||
double fac = -kk[axis] / ksquared * volNorm;
|
||||
|
||||
std::complex<double> &in_delta = AUX0[i][j][k];
|
||||
|
||||
AUX1[i][j][k] += std::complex<double>(
|
||||
fac * in_delta.imag(), -fac * in_delta.real());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//fix hermiticity...unclear how to do that
|
||||
if (c_startN0 == 0 && c_localN0 > 0) {
|
||||
AUX1[0][0][0] = 0;
|
||||
AUX1[0][0][c_N2_HC - 1] = 0;
|
||||
AUX1[0][c_N1 / 2][0] = 0;
|
||||
AUX1[0][c_N1 / 2][c_N2_HC - 1] = 0;
|
||||
}
|
||||
|
||||
if (c_startN0 <= c_N0 / 2 && c_startN0 + c_localN0 > c_N0 / 2) {
|
||||
AUX1[c_N0 / 2][0][0] = 0;
|
||||
AUX1[c_N0 / 2][0][c_N2_HC - 1] = 0;
|
||||
AUX1[c_N0 / 2][c_N1 / 2][0] = 0;
|
||||
AUX1[c_N0 / 2][c_N1 / 2][c_N2_HC - 1] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
///===============================
|
||||
template <typename CIC>
|
||||
template <typename PositionArray, typename PosAgArray>
|
||||
void BorgLEPModel<CIC>::lep_redshift_pos_ag(
|
||||
const PositionArray &pos, const PositionArray &vel, PosAgArray &pos_ag,
|
||||
PosAgArray &vel_ag) {
|
||||
Cosmology cosmo(cosmo_params);
|
||||
|
||||
//this routine generates particle positions in redshift space
|
||||
double af = 1.; ///km /sec /Mpc
|
||||
double anh = 1.;
|
||||
double Hubble = cosmo.Hubble(anh) / cosmo_params.h; ///km /sec /(Mpc/h)
|
||||
|
||||
// the code uses particle momenta p=a^2 dx/dt where x is the co-moving position
|
||||
// peculiar velocities are then given by v_pec = p/a
|
||||
|
||||
//NOTE: Check coefficients
|
||||
ArrayType1d::ArrayType &observer = vobs;
|
||||
double facRSD = 1. / af / Hubble; //this factor is 1/(a H) for velocities in [km/sec]; an additional factor arises from momentum conversion
|
||||
|
||||
#pragma omp parallel for
|
||||
for (long l = c_startN0; l < c_startN0 + c_localN0; l++)
|
||||
for (long m = 0; m < c_N1; m++)
|
||||
for (long n = 0; n < c_N2; n++) {
|
||||
size_t idx = n + c_N2 * m + c_N1 * c_N2 * (l - c_startN0);
|
||||
double x0 = pos[idx][0] + xmin0;
|
||||
double x1 = pos[idx][1] + xmin1;
|
||||
double x2 = pos[idx][2] + xmin2;
|
||||
|
||||
double v0 = vel[idx][0] + observer[0];
|
||||
double v1 = vel[idx][1] + observer[1];
|
||||
double v2 = vel[idx][2] + observer[2];
|
||||
|
||||
double s_pos_ag0 = pos_ag[idx][0];
|
||||
double s_pos_ag1 = pos_ag[idx][1];
|
||||
double s_pos_ag2 = pos_ag[idx][2];
|
||||
|
||||
double r2_los = x0 * x0 + x1 * x1 + x2 * x2;
|
||||
double v_los = v0 * x0 + v1 * x1 + v2 * x2;
|
||||
double slos = s_pos_ag0 * x0 + s_pos_ag1 * x1 + s_pos_ag2 * x2;
|
||||
|
||||
double A = facRSD * slos / r2_los;
|
||||
double B = -2 * facRSD * v_los * slos / square(r2_los);
|
||||
double C = facRSD * v_los / r2_los;
|
||||
|
||||
pos_ag[idx][0] = s_pos_ag0 + C * s_pos_ag0 + B * x0 + A * v0;
|
||||
pos_ag[idx][1] = s_pos_ag1 + C * s_pos_ag1 + B * x1 + A * v1;
|
||||
pos_ag[idx][2] = s_pos_ag2 + C * s_pos_ag2 + B * x2 + A * v2;
|
||||
|
||||
vel_ag[idx][0] = A * x0;
|
||||
vel_ag[idx][1] = A * x1;
|
||||
vel_ag[idx][2] = A * x2;
|
||||
}
|
||||
}
|
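The A, B, C coefficients above are the reverse-mode (adjoint) derivative of the forward mapping in lep_redshift_pos. With x = x^g + x_min, r^2 = x.x, v_los = v.x, f_RSD = 1/(a H) and s_a = x^g_a + f_RSD (v_los/r^2) x_a, pulling an upstream adjoint \hat{s} back through the chain rule gives, with s_los = \hat{s}.x,

\hat{x}^{g}_a = \hat{s}_a + C\,\hat{s}_a + B\,x_a + A\,v_a,
\qquad
\hat{v}_a = A\,x_a,

A = \frac{f_{\mathrm{RSD}}\,s_{\mathrm{los}}}{r^2},
\qquad
B = -\,\frac{2\,f_{\mathrm{RSD}}\,v_{\mathrm{los}}\,s_{\mathrm{los}}}{r^4},
\qquad
C = \frac{f_{\mathrm{RSD}}\,v_{\mathrm{los}}}{r^2},

which is exactly what the loop assigns to pos_ag and vel_ag.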
||||
///===============================
|
||||
template <typename CIC>
|
||||
template <typename PositionArray, typename OutputArray>
|
||||
void BorgLEPModel<CIC>::lep_density_obs_ag(
|
||||
const PositionArray &pos, OutputArray &pos_ag, OutputArray &vel_ag,
|
||||
ArrayRef &B) {
|
||||
double nmean = CosmoTool::cube(ss_factor);
|
||||
|
||||
CIC::adjoint(pos, B, pos_ag, L0, L1, L2, N0, N1, N2, nmean);
|
||||
array::fill(vel_ag, 0);
|
||||
}
|
||||
|
||||
template <int axis, typename GravityArray>
|
||||
double __lep_gravity_interpolation_ag(
|
||||
const GravityArray &g, double x, double y, double z, int ix, int iy, int iz,
|
||||
int jx, int jy, int jz) {
|
||||
double rx, ry, rz, qx, qy, qz;
|
||||
|
||||
switch (axis) {
|
||||
case 0:
|
||||
rx = 1;
|
||||
ry = (y - iy);
|
||||
rz = (z - iz);
|
||||
qx = -1;
|
||||
qy = 1 - ry;
|
||||
qz = 1 - rz;
|
||||
break;
|
||||
case 1:
|
||||
rx = (x - ix);
|
||||
ry = 1;
|
||||
rz = (z - iz);
|
||||
qx = 1 - rx;
|
||||
qy = -1;
|
||||
qz = 1 - rz;
|
||||
break;
|
||||
case 2:
|
||||
rx = (x - ix);
|
||||
ry = (y - iy);
|
||||
rz = 1;
|
||||
|
||||
qx = 1 - rx;
|
||||
qy = 1 - ry;
|
||||
qz = -1;
|
||||
break;
|
||||
}
|
||||
|
||||
return g[ix][iy][iz] * qx * qy * qz + g[ix][iy][jz] * qx * qy * rz +
|
||||
g[ix][jy][iz] * qx * ry * qz + g[ix][jy][jz] * qx * ry * rz +
|
||||
g[jx][iy][iz] * rx * qy * qz + g[jx][iy][jz] * rx * qy * rz +
|
||||
g[jx][jy][iz] * rx * ry * qz + g[jx][jy][jz] * rx * ry * rz;
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
template <typename PositionArray>
|
||||
void BorgLEPModel<CIC>::lep_force_1_ag(
|
||||
const PositionArray &pos, const PositionArray &vel, PhaseArrayRef &pos_ag,
|
||||
PhaseArrayRef &vel_ag, PhaseArrayRef &F_ag, double dtr, double dtv) {
|
||||
Uninit_FFTW_Real_Array g_p(
|
||||
force_mgr->extents_real(), force_mgr->allocator_real);
|
||||
Uninit_FFTW_Real_Array pot_p(
|
||||
force_mgr->extents_real(), force_mgr->allocator_real);
|
||||
typedef Uninit_FFTW_Real_Array::array_type GArray;
|
||||
GArray &pot = pot_p.get_array();
|
||||
GArray &g = g_p.get_array();
|
||||
|
||||
//estimate gravitational potential
|
||||
//alternatively we can store the potential from the forward run
|
||||
//at the expense of higher memory requirements
|
||||
lep_gravpot(pos, pot);
|
||||
|
||||
double i_d0 = f_N0 / L0;
|
||||
double i_d1 = f_N1 / L1;
|
||||
double i_d2 = f_N2 / L2;
|
||||
|
||||
long Np = pos.shape()[0];
|
||||
|
||||
//calculate forces and update velocities
|
||||
for (int axis = 0; axis < 3; axis++) {
|
||||
switch (axis) {
|
||||
case 0:
|
||||
array::fill(F_ag, 0);
|
||||
compute_force<0, false, 1>(g, pot);
|
||||
break;
|
||||
case 1:
|
||||
compute_force<1, false, 1>(g, pot);
|
||||
break;
|
||||
case 2:
|
||||
compute_force<2, false, 1>(g, pot);
|
||||
break;
|
||||
}
|
||||
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (long i = 0; i < Np; i++) {
|
||||
double x = pos[i][0] * i_d0;
|
||||
double y = pos[i][1] * i_d1;
|
||||
double z = pos[i][2] * i_d2;
|
||||
|
||||
int ix = (int)std::floor(x);
|
||||
int iy = (int)std::floor(y);
|
||||
int iz = (int)std::floor(z);
|
||||
|
||||
int jx = (ix + 1) % f_N0;
|
||||
int jy = (iy + 1) % f_N1;
|
||||
int jz = (iz + 1) % f_N2;
|
||||
|
||||
double ax, ay, az;
|
||||
//derivative of cic kernel with respect to x
|
||||
|
||||
ax = __lep_gravity_interpolation_ag<0, GArray>(
|
||||
g, x, y, z, ix, iy, iz, jx, jy, jz) *
|
||||
i_d0;
|
||||
|
||||
//derivative of cic kernel with respect to y
|
||||
ay = __lep_gravity_interpolation_ag<1, GArray>(
|
||||
g, x, y, z, ix, iy, iz, jx, jy, jz) *
|
||||
i_d1;
|
||||
|
||||
//derivative of cic kernel with respect to z
|
||||
az = __lep_gravity_interpolation_ag<2, GArray>(
|
||||
g, x, y, z, ix, iy, iz, jx, jy, jz) *
|
||||
i_d2;
|
||||
|
||||
//now add terms to force
|
||||
|
||||
F_ag[i][0] +=
|
||||
ax * pos_ag[i][axis] * dtr * dtv + ax * vel_ag[i][axis] * dtv;
|
||||
F_ag[i][1] +=
|
||||
ay * pos_ag[i][axis] * dtr * dtv + ay * vel_ag[i][axis] * dtv;
|
||||
F_ag[i][2] +=
|
||||
az * pos_ag[i][axis] * dtr * dtv + az * vel_ag[i][axis] * dtv;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
template <typename PositionArray>
|
||||
void BorgLEPModel<CIC>::lep_force_0_ag(
|
||||
const PositionArray &pos, const PositionArray &vel, PhaseArrayRef &pos_ag,
|
||||
PhaseArrayRef &vel_ag, PhaseArrayRef &F_ag, double dtr, double dtv) {
|
||||
long Np = pos_ag.shape()[0];
|
||||
|
||||
double d0 = L0 / double(N0);
|
||||
double d1 = L1 / double(N1);
|
||||
double d2 = L2 / double(N2);
|
||||
typedef boost::multi_array<double, 1> WeightArray;
|
||||
|
||||
FFTW_Complex_Array &f_AUX0 = *f_AUX0_p;
|
||||
Uninit_FFTW_Real_Array B_p(
|
||||
force_mgr->extents_real(), force_mgr->allocator_real);
|
||||
Uninit_FFTW_Real_Array g_p(
|
||||
force_mgr->extents_real(), force_mgr->allocator_real);
|
||||
Uninit_FFTW_Real_Array::array_type &g = g_p.get_array();
|
||||
Uninit_FFTW_Real_Array::array_type &B = B_p.get_array();
|
||||
WeightArray weight(boost::extents[Np]);
|
||||
|
||||
array::fill(B, 0);
|
||||
|
||||
for (int axis = 0; axis < 3; axis++) {
|
||||
#pragma omp parallel for
|
||||
for (long i = 0; i < Np; i++) {
|
||||
weight[i] = pos_ag[i][axis] * dtr * dtv + vel_ag[i][axis] * dtv;
|
||||
}
|
||||
|
||||
//do cic
|
||||
array::fill(g, 0);
|
||||
CIC::projection(
|
||||
pos, g, L0, L1, L2, f_N0, f_N1, f_N2,
|
||||
CIC_Tools::Periodic(f_N0, f_N1, f_N2), weight);
|
||||
|
||||
switch (axis) {
|
||||
case 0:
|
||||
BorgLEPModel<CIC>::compute_force<0, true, -1>(B, g);
|
||||
break;
|
||||
case 1:
|
||||
BorgLEPModel<CIC>::compute_force<1, true, -1>(B, g);
|
||||
break;
|
||||
case 2:
|
||||
BorgLEPModel<CIC>::compute_force<2, true, -1>(B, g);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
//transform density to F-space
|
||||
force_mgr->execute_r2c(f_analysis_plan, B.data(), f_AUX0.data());
|
||||
|
||||
double normphi = 3. / 2. * cosmo_params.omega_m / double(f_N0 * f_N1 * f_N2) /
|
||||
(unit_r0 * unit_r0);
|
||||
|
||||
#pragma omp parallel for
|
||||
for (long i = f_startN0; i < f_startN0 + f_localN0; i++) {
|
||||
double sin20 = sin2K[0][i];
|
||||
for (int j = 0; j < f_N1; j++) {
|
||||
double sin21 = sin2K[1][j];
|
||||
for (int k = 0; k < f_N2_HC; k++) {
|
||||
double sin22 = sin2K[2][k];
|
||||
|
||||
double Greens = -normphi / (sin20 + sin21 + sin22);
|
||||
f_AUX0[i][j][k] *= Greens;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//fix zero mode by hand
|
||||
if (f_startN0 == 0 && f_localN0 > 0) {
|
||||
f_AUX0[0][0][0] = 0;
|
||||
}
|
||||
|
||||
force_mgr->execute_c2r(f_synthesis_plan, f_AUX0.data(), B.data());
|
||||
|
||||
double nmean = CosmoTool::cube(double(ss_factor) / f_factor);
|
||||
CIC::adjoint(pos, B, F_ag, L0, L1, L2, f_N0, f_N1, f_N2, nmean);
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
template <typename ForceArray>
|
||||
void BorgLEPModel<CIC>::lep_pos_update_ag(
|
||||
PhaseArrayRef &pos_ag, const ForceArray &F_ag, double dtr) {
|
||||
long Np = pos_ag.shape()[0];
|
||||
|
||||
#pragma omp parallel for
|
||||
for (long i = 0; i < Np; i++) {
|
||||
for (int j = 0; j < 3; j++)
|
||||
pos_ag[i][j] += F_ag[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
void BorgLEPModel<CIC>::lep_vel_update_ag(
|
||||
PhaseArrayRef &pos_ag, PhaseArrayRef &vel_ag, double dtr) {
|
||||
long Np = pos_ag.shape()[0];
|
||||
|
||||
#pragma omp parallel for
|
||||
for (long i = 0; i < Np; i++) {
|
||||
for (int j = 0; j < 3; j++)
|
||||
vel_ag[i][j] += pos_ag[i][j] * dtr;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
void BorgLEPModel<CIC>::lep_stepping_ag(
|
||||
TapeArrayRef &pos, TapeArrayRef &vel, PhaseArrayRef &pos_ag,
|
||||
PhaseArrayRef &vel_ag, int nstep, TimingArray &timing) {
|
||||
//define adjoint force array
|
||||
using boost::lambda::_1;
|
||||
using boost::lambda::_2;
|
||||
U_PhaseArray F_ag_0_p(extents[c_localN0 * c_N1 * c_N2][3]);
|
||||
U_PhaseArray::array_type &F_ag_0 = F_ag_0_p.get_array();
|
||||
U_PhaseArray F_ag_1_p(extents[c_localN0 * c_N1 * c_N2][3]);
|
||||
U_PhaseArray::array_type &F_ag_1 = F_ag_1_p.get_array();
|
||||
|
||||
// MAIN LOOP : undo Leapfrog stepping
|
||||
for (int nn = nstep - 2; nn > -1; nn--) {
|
||||
double dtr = timing[2][nn];
|
||||
double dtv = timing[3][nn];
|
||||
|
||||
//order of the force terms is important as they will be added up
|
||||
#pragma omp task shared(pos, vel, pos_ag, vel_ag, F_ag_0)
|
||||
{ lep_force_0_ag(pos[nn], vel[nn], pos_ag, vel_ag, F_ag_0, dtr, dtv); }
|
||||
#pragma omp task shared(pos, vel, pos_ag, vel_ag, F_ag_1)
|
||||
{ lep_force_1_ag(pos[nn], vel[nn], pos_ag, vel_ag, F_ag_1, dtr, dtv); }
|
||||
#pragma omp taskwait
|
||||
lep_vel_update_ag(pos_ag, vel_ag, dtr);
|
||||
lep_pos_update_ag(
|
||||
pos_ag,
|
||||
b_fused<U_PhaseArray::array_type::element>(F_ag_0, F_ag_1, _1 + _2),
|
||||
dtr);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
void BorgLEPModel<CIC>::lep_fwd_model_ag(
|
||||
ArrayRef &B, TapeArrayRef &pos, TapeArrayRef &vel, ArrayRef &DPSI,
|
||||
TimingArray &timing) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("BORG-lep adjoint model");
|
||||
int nstep = timing.shape()[1];
|
||||
|
||||
///NOTE: ICs are generated at ai
|
||||
|
||||
///introduce adjoint quantities
|
||||
typedef U_PhaseArray::array_type PhaseArrayRef;
|
||||
U_PhaseArray pos_ag_p(extents[c_localN0 * c_N1 * c_N2][3]);
|
||||
U_PhaseArray vel_ag_p(extents[c_localN0 * c_N1 * c_N2][3]);
|
||||
|
||||
PhaseArrayRef &pos_ag = pos_ag_p.get_array();
|
||||
PhaseArrayRef &vel_ag = vel_ag_p.get_array();
|
||||
|
||||
if (do_redshift) {
|
||||
ctx.print("doing redshift space distortions.");
|
||||
|
||||
U_PhaseArray s_pos_u(extents[c_localN0 * c_N1 * c_N2][3]);
|
||||
PhaseArrayRef &s_pos = s_pos_u.get_array();
|
||||
|
||||
lep_redshift_pos(pos[nstep - 1], vel[nstep - 1], s_pos);
|
||||
///work backwards from final to initial conditions
|
||||
//1.) undo CIC
|
||||
lep_density_obs_ag(s_pos, pos_ag, vel_ag, B);
|
||||
|
||||
//2.) undo redshift distortions
|
||||
lep_redshift_pos_ag(pos[nstep - 1], vel[nstep - 1], pos_ag, vel_ag);
|
||||
} else {
|
||||
lep_density_obs_ag(pos[nstep - 1], pos_ag, vel_ag, B);
|
||||
}
|
||||
|
||||
//2.) undo lep-stepping
|
||||
lep_stepping_ag(pos, vel, pos_ag, vel_ag, nstep, timing);
|
||||
|
||||
//N.) undo ICs
|
||||
lep_ic_ag(pos_ag, vel_ag, timing);
|
||||
|
||||
// Apply gradient upgrade operator
|
||||
if (c_deltao != 0) {
|
||||
array::fill(*tmp_complex_field, 0);
|
||||
lo_mgr->degrade_complex(*mgr, *AUX1_p, *tmp_complex_field);
|
||||
lo_mgr->execute_c2r(synthesis_plan, tmp_complex_field->data(), DPSI.data());
|
||||
} else {
|
||||
lo_mgr->execute_c2r(synthesis_plan, AUX1_p->data(), DPSI.data());
|
||||
}
|
||||
}
|
||||
// ARES TAG: authors_num = 2
|
||||
// ARES TAG: name(0) = Guilhem Lavaux
|
||||
// ARES TAG: email(0) = guilhem.lavaux@iap.fr
|
||||
// ARES TAG: year(0) = 2014-2018
|
||||
// ARES TAG: name(1) = Jens Jasche
|
||||
// ARES TAG: email(1) = jens.jasche@fysik.su.se
|
||||
// ARES TAG: year(1) = 2009-2018
|
||||
|
518
extra/borg/libLSS/physics/forwards/lpt/borg_fwd_lpt.cpp
Normal file
|
@ -0,0 +1,518 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/lpt/borg_fwd_lpt.cpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2018 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
|
||||
#include "../pm/plane_xchg.hpp"
|
||||
|
||||
template <typename CIC>
|
||||
void BorgLptModel<CIC>::lpt_ic(
|
||||
CArrayRef &deltao, PhaseArrayRef &pos, PhaseArrayRef &vel,
|
||||
PhaseArrayRef &lctim) {
|
||||
///set cosmological parameters
|
||||
///Initial density is scaled to initial redshift!!!
|
||||
Cosmology cosmo(cosmo_params);
|
||||
ConsoleContext<LOG_DEBUG> ctx("lpt_ic");
|
||||
|
||||
double an = af; ///set position ics at r_{0}, calculate till present epoch
|
||||
double anh =
|
||||
af; ///velocities are created at v_{0-1/2}, calculate till present epoch
|
||||
size_t endN0 = c_startN0 + c_localN0;
|
||||
double inv_volume = 1 / (L0 * L1 * L2);
|
||||
double Hubble = cosmo.Hubble(anh) / cosmo_params.h; ///km /sec /(Mpc/h)
|
||||
|
||||
typedef UninitializedArray<
|
||||
FFTW_Complex_Array, FFTW_Allocator<std::complex<double>>>
|
||||
U_CArray;
|
||||
typedef U_CArray::array_type Ref_CArray;
|
||||
|
||||
U_CArray tmp_p(mgr->extents_complex(), mgr->allocator_complex);
|
||||
Ref_CArray &tmp = tmp_p.get_array();
|
||||
|
||||
size_t Ns[3] = {size_t(c_N0) / 2, size_t(c_N1) / 2, size_t(c_N2) / 2};
|
||||
for (int axis = 0; axis < 3; axis++) {
|
||||
#pragma omp parallel for collapse(3) schedule(static)
|
||||
for (size_t i = c_startN0; i < endN0; i++)
|
||||
for (size_t j = 0; j < c_N1; j++)
|
||||
for (size_t k = 0; k < c_N2_HC; k++) {
|
||||
double kk[3] = {
|
||||
kmode(i, c_N0, L0), kmode(j, c_N1, L1), kmode(k, c_N2, L2)};
|
||||
size_t ijk[3] = {i, j, k};
|
||||
|
||||
double ksquared = kk[0] * kk[0] + kk[1] * kk[1] + kk[2] * kk[2];
|
||||
double fac = -kk[axis] / ksquared * inv_volume;
|
||||
|
||||
std::complex<double> &in_delta = deltao[i][j][k];
|
||||
|
||||
if (ijk[axis] == Ns[axis]) {
|
||||
tmp[i][j][k] = 0;
|
||||
} else {
|
||||
tmp[i][j][k] = std::complex<double>(
|
||||
-in_delta.imag() * fac, in_delta.real() * fac);
|
||||
}
|
||||
}
|
||||
if (c_startN0 == 0 && c_localN0 > 0) {
|
||||
tmp[0][0][0] = 0;
|
||||
tmp[0][0][c_N2_HC - 1] = 0;
|
||||
tmp[0][c_N1 / 2][0] = 0;
|
||||
tmp[0][c_N1 / 2][c_N2_HC - 1] = 0;
|
||||
}
|
||||
|
||||
if (c_startN0 <= c_N0 / 2 && c_startN0 + c_localN0 > c_N0 / 2) {
|
||||
tmp[c_N0 / 2][0][0] = 0;
|
||||
tmp[c_N0 / 2][0][c_N2_HC - 1] = 0;
|
||||
tmp[c_N0 / 2][c_N1 / 2][0] = 0;
|
||||
tmp[c_N0 / 2][c_N1 / 2][c_N2_HC - 1] = 0;
|
||||
}
|
||||
|
||||
/// FFT to Realspace
|
||||
mgr->execute_c2r(c_synthesis_plan, tmp.data(), c_tmp_real_field->data());
|
||||
|
||||
#pragma omp parallel for collapse(3) schedule(static)
|
||||
for (size_t l = c_startN0; l < endN0; l++)
|
||||
for (size_t m = 0; m < c_N1; m++)
|
||||
for (size_t n = 0; n < c_N2; n++) {
|
||||
size_t idx = n + c_N2 * m + c_N2 * c_N1 * (l - c_startN0);
|
||||
vel[idx][axis] = (*c_tmp_real_field)[l][m][n];
|
||||
}
|
||||
}
|
||||
|
||||
auto &ids = *lagrangian_id;
|
||||
size_t base_id = c_N2 * c_N1 * c_startN0;
|
||||
#pragma omp parallel for collapse(3) schedule(static)
|
||||
for (size_t l = c_startN0; l < endN0; l++)
|
||||
for (size_t m = 0; m < c_N1; m++)
|
||||
for (size_t n = 0; n < c_N2; n++) {
|
||||
/// sort particles on equidistant grid
|
||||
double q0 = L0 / double(c_N0) * double(l);
|
||||
double q1 = L1 / double(c_N1) * double(m);
|
||||
double q2 = L2 / double(c_N2) * double(n);
|
||||
|
||||
size_t idx = n + c_N2 * m + c_N2 * c_N1 * (l - c_startN0);
|
||||
|
||||
double DD1 = lctim[idx][0];
|
||||
|
||||
double x = q0 - DD1 * vel[idx][0];
|
||||
double y = q1 - DD1 * vel[idx][1];
|
||||
double z = q2 - DD1 * vel[idx][2];
|
||||
|
||||
///enforce periodic boundary conditions
|
||||
pos[idx][0] = periodic_fix(x, 0., L0);
|
||||
pos[idx][1] = periodic_fix(y, 0., L1);
|
||||
pos[idx][2] = periodic_fix(z, 0., L2);
|
||||
|
||||
ids[idx] = idx + base_id;
|
||||
///NOTE: displacements are already stored in the velocity vectors. Only need to multiply by prefactor
|
||||
|
||||
///store velocities in km/sec
|
||||
///note we multiply by a^2 to get the correct momentum variable for the particle mesh code
|
||||
///and normalize to code units
|
||||
double V_SCALING = lctim[idx][1];
|
||||
vel[idx][0] *= V_SCALING;
|
||||
vel[idx][1] *= V_SCALING;
|
||||
vel[idx][2] *= V_SCALING;
|
||||
}
|
||||
|
||||
realInfo.localNumParticlesAfter = realInfo.localNumParticlesBefore =
|
||||
c_localN0 * c_N1 * c_N2;
|
||||
// This is for noting down which particles to copy. This
|
||||
// is innocuous as long as redshift load balancing is properly called
|
||||
// at the right time.
|
||||
redshiftInfo.localNumParticlesAfter = realInfo.localNumParticlesAfter;
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
void BorgLptModel<CIC>::lpt_redshift_pos(
|
||||
PhaseArrayRef &pos, PhaseArrayRef &vel, PhaseArrayRef &s_pos,
|
||||
PhaseArrayRef &lctim) {
|
||||
Cosmology cosmo(cosmo_params);
|
||||
|
||||
//this routine generates particle positions in redshift space
|
||||
double anh = af;
|
||||
double Hubble = cosmo.Hubble(anh) / cosmo_params.h; ///km /sec /(Mpc/h)
|
||||
|
||||
// the code uses particle momenta p=a^2 dx/dt where x is the co-moving position
|
||||
// peculiar velocities are then given by v_pec = p/a
|
||||
|
||||
//NOTE: Check coefficients
|
||||
ArrayType1d::ArrayType &observer = vobs;
|
||||
|
||||
#pragma omp parallel for
|
||||
for (size_t idx = 0; idx < realInfo.localNumParticlesAfter; idx++) {
|
||||
double x0g = pos[idx][0];
|
||||
double x1g = pos[idx][1];
|
||||
double x2g = pos[idx][2];
|
||||
|
||||
double x0 = x0g + xmin0;
|
||||
double x1 = x1g + xmin1;
|
||||
double x2 = x2g + xmin2;
|
||||
|
||||
double v0 = vel[idx][0] + observer[0];
|
||||
double v1 = vel[idx][1] + observer[1];
|
||||
double v2 = vel[idx][2] + observer[2];
|
||||
|
||||
double r2_los = x0 * x0 + x1 * x1 + x2 * x2;
|
||||
double v_los = v0 * x0 + v1 * x1 + v2 * x2;
|
||||
|
||||
double facRSD = lctim[idx][2]; //this factor is 1/(a H) for velocities in [km/sec]; an additional factor arises from momentum conversion
|
||||
|
||||
double A = facRSD * v_los / r2_los;
|
||||
|
||||
double s0 = x0g + A * x0;
|
||||
double s1 = x1g + A * x1;
|
||||
double s2 = x2g + A * x2;
|
||||
|
||||
///enforce periodic boundary conditions
|
||||
s_pos[idx][0] = periodic_fix(s0, 0., L0);
|
||||
s_pos[idx][1] = periodic_fix(s1, 0., L1);
|
||||
s_pos[idx][2] = periodic_fix(s2, 0., L2);
|
||||
}
|
||||
// Update the info for redshift particles
|
||||
redshiftInfo.localNumParticlesAfter = redshiftInfo.localNumParticlesBefore =
|
||||
realInfo.localNumParticlesAfter;
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
void BorgLptModel<CIC>::lpt_density_obs(
|
||||
PhaseArrayRef &pos, ArrayRef &deltao, size_t numParts) {
|
||||
double const nmean = double(c_N0 * c_N1 * c_N2) /
|
||||
(box_output.N0 * box_output.N1 * box_output.N2);
|
||||
|
||||
if (ALWAYS_MPI(comm)) {
|
||||
typedef UninitializedArray<boost::multi_array<double, 3>> U_Array;
|
||||
typedef U_Array::array_type::index_range i_range;
|
||||
U_Array::array_type::index_gen indices;
|
||||
// Here we have to introduce ghost planes.
|
||||
U_Array tmp_delta(out_mgr->extents_real(CIC::MPI_PLANE_LEAKAGE));
|
||||
|
||||
array::fill(tmp_delta.get_array(), 0);
|
||||
Console::instance().format<LOG_DEBUG>("numParts = %d", numParts);
|
||||
CIC::projection(
|
||||
pos, tmp_delta.get_array(), box_output.L0, box_output.L1, box_output.L2,
|
||||
box_output.N0, box_output.N1, box_output.N2,
|
||||
typename CIC::Periodic_MPI(
|
||||
box_output.N0, box_output.N1, box_output.N2, comm),
|
||||
CIC_Tools::DefaultWeight(), numParts);
|
||||
|
||||
// CIC has MPI_PLANE_LEAKAGE extra planes. They have to be sent to the appropriate nodes.
|
||||
density_exchange_planes<true>(
|
||||
comm, tmp_delta.get_array(), out_mgr, CIC::MPI_PLANE_LEAKAGE);
|
||||
|
||||
fwrap(deltao[out_mgr->strict_range()]) =
|
||||
tmp_delta.get_array()[out_mgr->strict_range()];
|
||||
} else {
|
||||
array::fill(deltao, 0);
|
||||
Console::instance().format<LOG_DEBUG>("projection with deltao");
|
||||
CIC::projection(
|
||||
pos, deltao, box_output.L0, box_output.L1, box_output.L2, box_output.N0,
|
||||
box_output.N1, box_output.N2,
|
||||
CIC_Tools::Periodic(box_output.N0, box_output.N1, box_output.N2),
|
||||
CIC_Tools::DefaultWeight(), numParts);
|
||||
}
|
||||
|
||||
array::density_rescale(deltao, nmean);
|
||||
|
||||
if (DUMP_BORG_DENSITY) {
|
||||
std::string fname = str(format("borg_density_%d.h5") % comm->rank());
|
||||
H5::H5File f(fname, H5F_ACC_TRUNC);
|
||||
CosmoTool::hdf5_write_array(f, "density", deltao);
|
||||
}
|
||||
}
|
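For orientation, nmean here is the expected particle count per output cell, (c_N0 c_N1 c_N2)/(N0_out N1_out N2_out), and array::density_rescale presumably turns CIC counts into a density contrast (delta = counts/nmean - 1; the exact libLSS definition should be checked against the tools headers). A one-line standalone sketch of that assumed rescaling:

#include <vector>

// Convert CIC particle counts per cell into a density contrast,
// assuming delta = counts / nmean - 1.
void counts_to_contrast(std::vector<double> &cells, double nmean) {
  for (double &c : cells)
    c = c / nmean - 1.0;
}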
||||
|
||||
template <typename CIC>
|
||||
void BorgLptModel<CIC>::lpt_fwd_model(
|
||||
CArrayRef &deltao, PhaseArrayRef &pos, PhaseArrayRef &vel,
|
||||
PhaseArrayRef &lctim) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("BORG forward model");
|
||||
|
||||
if (false) {
|
||||
static int step = 0;
|
||||
std::string fname = str(format("fwd_ic_%d_%d.h5") % step % comm->rank());
|
||||
H5::H5File f(fname, H5F_ACC_TRUNC);
|
||||
CosmoTool::hdf5_write_array(f, "deltao", deltao);
|
||||
step++;
|
||||
}
|
||||
|
||||
if (c_deltao != 0) {
|
||||
array::fill(*c_deltao, 0);
|
||||
mgr->upgrade_complex(*lo_mgr, deltao, *c_deltao);
|
||||
lpt_ic(*c_deltao, pos, vel, lctim);
|
||||
} else {
|
||||
///NOTE: ICs are generated at ai
|
||||
lpt_ic(deltao, pos, vel, lctim);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
void BorgLptModel<CIC>::forwardModelRsdField(
|
||||
ArrayRef &deltaf, double *vobs_ext) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("BORG forward model rsd density calculation");
|
||||
|
||||
///introduce redshift distortions
|
||||
if (do_rsd) {
|
||||
//ArrayType1d::ArrayType& dummy = vobs;
|
||||
|
||||
//set vobs to input
|
||||
double dummy[3];
|
||||
|
||||
dummy[0] = vobs[0];
|
||||
dummy[1] = vobs[1];
|
||||
dummy[2] = vobs[2];
|
||||
|
||||
vobs[0] = vobs_ext[0];
|
||||
vobs[1] = vobs_ext[1];
|
||||
vobs[2] = vobs_ext[2];
|
||||
//ctx.print(format("old_v = %g,%g,%g vnew = %g,%g,%g") % vobs[0] % vobs[1] % vobs[2] % vobs_ext[0] % vobs_ext[1] % vobs_ext[2]);
|
||||
|
||||
ctx.print("doing redshift space distortions.");
|
||||
lpt_redshift_pos(
|
||||
u_pos->get_array(), u_vel->get_array(), u_s_pos->get_array(),
|
||||
lc_timing->get_array());
|
||||
|
||||
// Reset indexes
|
||||
LibLSS::initIndexes(
|
||||
redshiftInfo.u_idx->get_array(), redshiftInfo.localNumParticlesBefore);
|
||||
particle_redistribute(
|
||||
redshiftInfo, u_s_pos->get_array(),
|
||||
typename CIC::Distribution(lo_mgr, L0, L1, L2));
|
||||
|
||||
lpt_density_obs(
|
||||
u_s_pos->get_array(), deltaf, redshiftInfo.localNumParticlesAfter);
|
||||
|
||||
//reset vobs
|
||||
vobs[0] = dummy[0];
|
||||
vobs[1] = dummy[1];
|
||||
vobs[2] = dummy[2];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
void BorgLptModel<CIC>::test_lpt_velocities(MarkovState &state) {
|
||||
typedef ArrayStateElement<double, 2> PhaseArrayElement;
|
||||
auto s_hat_p = mgr->allocate_complex_array();
|
||||
auto &s_hat = s_hat_p.get_array();
|
||||
IArrayType::ArrayType &key_array = *state.get<IArrayType>("k_keys")->array;
|
||||
ArrayType1d::ArrayType &pspec =
|
||||
*state.get<ArrayType1d>("powerspectrum")->array;
|
||||
int k0_test = 2, k1_test = 5, k2_test = 3;
|
||||
double A_k = pspec[key_array[k0_test][k1_test][k2_test]] * volume;
|
||||
|
||||
PhaseArrayElement *pos_e =
|
||||
new PhaseArrayElement(extents[c_localN0 * c_N1 * c_N2][3]);
|
||||
PhaseArrayElement *vel_e =
|
||||
new PhaseArrayElement(extents[c_localN0 * c_N1 * c_N2][3]);
|
||||
PhaseArrayElement *timing_e =
|
||||
new PhaseArrayElement(extents[c_localN0 * c_N1 * c_N2][2]);
|
||||
|
||||
state.newElement("lpt_pos", pos_e);
|
||||
state.newElement("lpt_vel", vel_e);
|
||||
|
||||
auto &pos = *pos_e->array;
|
||||
auto &vel = *vel_e->array;
|
||||
auto &lc_timing = *timing_e->array;
|
||||
|
||||
gen_light_cone_timing(lc_timing);
|
||||
|
||||
fwrap(s_hat) = 0;
|
||||
s_hat[k0_test][k1_test][k2_test] = std::sqrt(A_k) / volume;
|
||||
|
||||
// Hermiticity_fixup(s_hat);
|
||||
|
||||
state.newScalar<double>("A_k_test", std::sqrt(A_k));
|
||||
ArrayType1d *e_k_pos;
|
||||
state.newElement("k_pos_test", e_k_pos = new ArrayType1d(extents[3]));
|
||||
ArrayType1d::ArrayType &a_k_pos = *(e_k_pos->array);
|
||||
|
||||
a_k_pos[0] = kmode(k0_test, N0, L0);
|
||||
a_k_pos[1] = kmode(k1_test, N1, L1);
|
||||
a_k_pos[2] = kmode(k2_test, N2, L2);
|
||||
|
||||
lpt_ic(s_hat, pos, vel, lc_timing);
|
||||
}
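// Sketch of what this test sets up (the analytic expectation below is an
// assumption up to the FFT normalization used by lpt_ic): a single Fourier
// mode k_test = (kmode(2, N0, L0), kmode(5, N1, L1), kmode(3, N2, L2)) is
// injected with amplitude sqrt(A_k) / volume, where A_k = P(k_test) * volume.
// For a single mode the Zel'dovich displacement is directed along k_test with
// an amplitude proportional to sqrt(A_k) / |k_test|, so the stored
// "A_k_test" and "k_pos_test" state entries let downstream checks compare the
// positions and velocities produced by lpt_ic against that analytic prediction.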
|
||||
|
||||
template <typename CIC>
|
||||
void BorgLptModel<CIC>::gen_light_cone_timing(PhaseArrayRef &lctim) {
|
||||
Cosmology cosmo(cosmo_params);
|
||||
ConsoleContext<LOG_VERBOSE> ctx("lightcone computation");
|
||||
|
||||
cosmo.precompute_d_plus();
|
||||
cosmo.precompute_com2a();
|
||||
|
||||
double an = af; ///set position ics at r_{0}, calculate till present epoch
|
||||
double anh =
|
||||
af; ///velocities are created at v_{0-1/2}, calculate till present epoch
|
||||
double D0 = cosmo.d_plus(a_init);
|
||||
|
||||
double a_lc = af;
|
||||
|
||||
double D1 = cosmo.d_plus(a_lc) / D0;
|
||||
double f1 = cosmo.g_plus(a_lc);
|
||||
double Hubble = cosmo.Hubble(a_lc) / cosmo_params.h; ///km /sec /(Mpc/h)
|
||||
double v_scaling = -D1 * f1 * a_lc * a_lc * Hubble;
|
||||
double facRSD = 1. / a_lc / Hubble;
|
||||
|
||||
if (lctime) {
|
||||
using boost::lambda::_1;
|
||||
|
||||
double r0 = 0.;
|
||||
double r1 = 2. * lcboost *
|
||||
sqrt(
|
||||
(L0 + xmin0) * (L0 + xmin0) + (L1 + xmin1) * (L1 + xmin1) +
|
||||
(L2 + xmin2) * (L2 + xmin2));
|
||||
double step = 2.;
|
||||
|
||||
ctx.print("Tabulating D+");
|
||||
auto auxDplus = build_auto_interpolator(
|
||||
boost::bind(&Cosmology::comph2d_plus, &cosmo, _1), r0, r1, step,
|
||||
cosmo.comph2d_plus(r0), cosmo.comph2d_plus(r1));
|
||||
ctx.print("Tabulating G+");
|
||||
auto auxGplus = build_auto_interpolator(
|
||||
boost::bind(&Cosmology::comph2g_plus, &cosmo, _1), r0, r1, step,
|
||||
cosmo.comph2g_plus(r0), cosmo.comph2g_plus(r1));
|
||||
ctx.print("Tabulating H");
|
||||
auto auxHubble = build_auto_interpolator(
|
||||
boost::bind(&Cosmology::comph2Hubble, &cosmo, _1), r0, r1, step,
|
||||
cosmo.comph2Hubble(r0), cosmo.comph2Hubble(r1));
|
||||
ctx.print("Tabulating a");
|
||||
auto auxa = build_auto_interpolator(
|
||||
boost::bind(&Cosmology::comph2a, &cosmo, _1), r0, r1, step,
|
||||
cosmo.comph2a(r0), cosmo.comph2a(r1));
|
||||
|
||||
ctx.print("Extruding lightcone");
|
||||
///For every particle calculate distance to observer
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (int l = c_startN0; l < c_startN0 + c_localN0; l++)
|
||||
for (int m = 0; m < c_N1; m++)
|
||||
for (int n = 0; n < c_N2; n++) {
|
||||
size_t idx = n + c_N2 * m + c_N2 * c_N1 * (l - c_startN0);
|
||||
|
||||
/// sort particles on equidistant grid
|
||||
double x0 = L0 / double(c_N0) * double(l) + xmin0;
|
||||
double x1 = L1 / double(c_N1) * double(m) + xmin1;
|
||||
double x2 = L2 / double(c_N2) * double(n) + xmin2;
|
||||
double r_obs = sqrt(x0 * x0 + x1 * x1 + x2 * x2) * lcboost;
|
||||
D1 = auxDplus(r_obs) / D0;
|
||||
f1 = auxGplus(r_obs);
|
||||
Hubble = auxHubble(r_obs) / cosmo_params.h; ///km /sec /(Mpc/h)
|
||||
a_lc = auxa(r_obs);
|
||||
v_scaling = -D1 * f1 * a_lc * a_lc * Hubble;
|
||||
facRSD = 1. / a_lc / Hubble;
|
||||
|
||||
lctim[idx][0] = D1;
|
||||
lctim[idx][1] = v_scaling;
|
||||
lctim[idx][2] = facRSD;
|
||||
}
|
||||
} else {
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (int l = c_startN0; l < c_startN0 + c_localN0; l++)
|
||||
for (int m = 0; m < c_N1; m++)
|
||||
for (int n = 0; n < c_N2; n++) {
|
||||
size_t idx = n + c_N2 * m + c_N2 * c_N1 * (l - c_startN0);
|
||||
|
||||
lctim[idx][0] = D1;
|
||||
lctim[idx][1] = v_scaling;
|
||||
lctim[idx][2] = facRSD;
|
||||
}
|
||||
}
|
||||
}
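// Summary of the per-cell coefficients filled above (a minimal sketch; the
// reading of lctim[idx][1] as a displacement-to-velocity conversion is
// inferred from the expressions, not stated elsewhere in this file):
//
//   lctim[idx][0] = D1(a) / D1(a_init)      growth factor applied to the displacement
//   lctim[idx][1] = -D1 * f1 * a^2 * H(a)   velocity scaling of the displacement
//   lctim[idx][2] = 1 / (a * H(a))          velocity -> redshift-space shift factor
//
// With lctime enabled, a is evaluated at the comoving distance of each grid
// cell from the observer (lightcone mode); otherwise every cell uses a = af.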
|
||||
|
||||
template <typename CIC>
|
||||
void BorgLptModel<CIC>::forwardModel_v2(ModelInput<3> delta_init) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("BORG LPT MODEL");
|
||||
|
||||
delta_init.setRequestedIO(PREFERRED_FOURIER);
|
||||
|
||||
size_t partNum = size_t(c_localN0 * c_N1 * c_N2 * partFactor);
|
||||
|
||||
u_pos.reset();
|
||||
u_vel.reset();
|
||||
lagrangian_id.reset();
|
||||
|
||||
lagrangian_id = std::unique_ptr<IdxArray>(new IdxArray(extents[partNum]));
|
||||
u_pos = std::make_shared<U_PArray>(extents[partNum][3]);
|
||||
u_vel = std::make_shared<U_PArray>(extents[partNum][3]);
|
||||
|
||||
realInfo.allocate(comm, partNum);
|
||||
if (do_rsd) {
|
||||
u_s_pos = std::make_shared<U_PArray>(extents[partNum][3]);
|
||||
redshiftInfo.allocate(comm, partNum);
|
||||
}
|
||||
|
||||
delta_init.needDestroyInput();
|
||||
lpt_fwd_model(
|
||||
delta_init.getFourier(), u_pos->get_array(), u_vel->get_array(),
|
||||
lc_timing->get_array());
|
||||
|
||||
try {
|
||||
///introduce redshift distortions
|
||||
if (do_rsd) {
|
||||
|
||||
ctx.print("doing redshift space distortions.");
|
||||
// Particle redistribution in real space; this step could probably be avoided (i.e. the call below could simply be removed)
|
||||
lpt_redshift_pos(
|
||||
u_pos->get_array(), u_vel->get_array(), u_s_pos->get_array(),
|
||||
lc_timing->get_array());
|
||||
|
||||
particle_redistribute(
|
||||
redshiftInfo, u_s_pos->get_array(),
|
||||
typename CIC::Distribution(out_mgr, L0, L1, L2),
|
||||
make_attribute_helper(Particles::scalar(*lagrangian_id)));
|
||||
} else {
|
||||
particle_redistribute(
|
||||
realInfo, u_pos->get_array(),
|
||||
typename CIC::Distribution(out_mgr, L0, L1, L2),
|
||||
make_attribute_helper(
|
||||
Particles::vector(u_vel->get_array()),
|
||||
Particles::scalar(*lagrangian_id)));
|
||||
redshiftInfo.localNumParticlesAfter = realInfo.localNumParticlesAfter;
|
||||
}
|
||||
} catch (const ErrorLoadBalance &) {
|
||||
// If load balancing fails, it means our sample is deeply wrong. Free resources and inform the caller.
|
||||
releaseParticles();
|
||||
forwardModelHold = false;
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
void BorgLptModel<CIC>::getDensityFinal(ModelOutput<3> delta_output) {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
this->invalidCache = false;
|
||||
delta_output.setRequestedIO(PREFERRED_REAL);
|
||||
|
||||
ctx.format(
|
||||
"output shape is %dx%dx%d", delta_output.getRealOutput().shape()[0],
|
||||
delta_output.getRealOutput().shape()[1],
|
||||
delta_output.getRealOutput().shape()[2]);
|
||||
|
||||
if (do_rsd) {
|
||||
// Now we can project
|
||||
lpt_density_obs(
|
||||
u_s_pos->get_array(), delta_output.getRealOutput(),
|
||||
redshiftInfo.localNumParticlesAfter);
|
||||
} else {
|
||||
lpt_density_obs(
|
||||
u_pos->get_array(), delta_output.getRealOutput(),
|
||||
realInfo.localNumParticlesAfter);
|
||||
}
|
||||
|
||||
/* if (!forwardModelHold && !adjointRequired) {
|
||||
releaseParticles();
|
||||
}*/
|
||||
forwardModelHold = false;
|
||||
}
|
||||
// ARES TAG: authors_num = 2
|
||||
// ARES TAG: name(0) = Guilhem Lavaux
|
||||
// ARES TAG: email(0) = guilhem.lavaux@iap.fr
|
||||
// ARES TAG: year(0) = 2014-2020
|
||||
// ARES TAG: name(1) = Jens Jasche
|
||||
// ARES TAG: email(1) = jens.jasche@fysik.su.se
|
||||
// ARES TAG: year(1) = 2009-2018
|
||||
|
9
extra/borg/libLSS/physics/forwards/lpt/borg_fwd_lpt.hpp
Normal file
|
@@ -0,0 +1,9 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/lpt/borg_fwd_lpt.hpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
302
extra/borg/libLSS/physics/forwards/lpt/borg_fwd_lpt_adj.cpp
Normal file
|
@@ -0,0 +1,302 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/lpt/borg_fwd_lpt_adj.cpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2018 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
|
||||
template <typename CIC>
|
||||
void BorgLptModel<CIC>::lpt_ic_ag(
|
||||
PhaseArrayRef &pos_ag, PhaseArrayRef &vel_ag, PhaseArrayRef &lctim) {
|
||||
///set cosmological parameters
|
||||
///Initial density is scaled to initial redshift!!!
|
||||
|
||||
ConsoleContext<LOG_DEBUG> ctx("LPT-IC adjoint");
|
||||
Cosmology cosmo(cosmo_params);
|
||||
|
||||
double an = af; ///set position ics at r_{0}, calculate till present epoch
|
||||
double anh =
|
||||
af; ///velocities are created at v_{0-1/2}, calculate till present epoch
|
||||
double Hubble = cosmo.Hubble(anh) / cosmo_params.h; ///km /sec /(Mpc/h)
|
||||
double c_volNorm = 1 / volume;
|
||||
|
||||
///allocate auxiliary Fourier array
|
||||
auto &AUX1 = *AUX1_p;
|
||||
auto &aux = *aux_p;
|
||||
auto &AUX0 = *AUX0_p;
|
||||
|
||||
array::fill(AUX1, 0);
|
||||
|
||||
for (int axis = 0; axis < 3; axis++) {
|
||||
|
||||
///1. Do position derivative
|
||||
///------------------------------------------------------------------------------
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (int l = c_startN0; l < c_startN0 + c_localN0; l++)
|
||||
for (int m = 0; m < c_N1; m++)
|
||||
for (int n = 0; n < c_N2; n++) {
|
||||
size_t idx = n + c_N2 * m + c_N2 * c_N1 * (l - c_startN0);
|
||||
double DD1 = lctim[idx][0];
|
||||
double V_SCALING = lctim[idx][1];
|
||||
aux[l][m][n] =
|
||||
-DD1 * pos_ag[idx][axis] + V_SCALING * vel_ag[idx][axis];
|
||||
}
|
||||
|
||||
/// FFT to F-space
|
||||
mgr->execute_r2c(c_analysis_plan, aux.data(), AUX0.data());
|
||||
|
||||
size_t Ns[3] = {size_t(c_N0) / 2, size_t(c_N1) / 2, size_t(c_N2) / 2};
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (size_t i = c_startN0; i < c_startN0 + c_localN0; i++)
|
||||
for (size_t j = 0; j < c_N1; j++)
|
||||
for (size_t k = 0; k < c_N2_HC; k++) {
|
||||
double kk[3] = {
|
||||
kmode(i, c_N0, L0), kmode(j, c_N1, L1), kmode(k, c_N2, L2)};
|
||||
size_t ijk[3] = {i, j, k};
|
||||
|
||||
double ksquared = kk[0] * kk[0] + kk[1] * kk[1] + kk[2] * kk[2];
|
||||
double fac = -kk[axis] / ksquared * c_volNorm;
|
||||
|
||||
std::complex<double> &in_delta = AUX0[i][j][k];
|
||||
|
||||
if (ijk[axis] != Ns[axis])
|
||||
AUX1[i][j][k] += std::complex<double>(
|
||||
fac * in_delta.imag(), -fac * in_delta.real());
|
||||
}
|
||||
}
|
||||
|
||||
//fix hermiticity...unclear how to do that
|
||||
if (c_startN0 == 0 && c_localN0 > 0) {
|
||||
AUX1[0][0][0] = 0;
|
||||
AUX1[0][0][c_N2_HC - 1] = 0;
|
||||
AUX1[0][c_N1 / 2][0] = 0;
|
||||
AUX1[0][c_N1 / 2][c_N2_HC - 1] = 0;
|
||||
}
|
||||
|
||||
if (c_startN0 <= c_N0 / 2 && c_startN0 + c_localN0 > c_N0 / 2) {
|
||||
AUX1[c_N0 / 2][0][0] = 0;
|
||||
AUX1[c_N0 / 2][0][c_N2_HC - 1] = 0;
|
||||
AUX1[c_N0 / 2][c_N1 / 2][0] = 0;
|
||||
AUX1[c_N0 / 2][c_N1 / 2][c_N2_HC - 1] = 0;
|
||||
}
|
||||
}
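// Minimal sketch of the adjoint computed above (assuming the forward IC step
// applies a Zel'dovich-type kernel proportional to i k_axis / k^2 in Fourier
// space): per particle, the position and velocity adjoints are first combined
// as
//
//   aux = -D1 * pos_ag + v_scaling * vel_ag
//
// and the (i, j, k) loop then applies the transposed kernel, i.e. a factor
// proportional to +/- i k_axis / k^2 (times the 1/volume FFT normalization);
// this is what the (fac * imag, -fac * real) update of AUX1 implements. The
// Nyquist planes along each axis are skipped and the zero/Nyquist corner
// modes are zeroed afterwards to keep AUX1 consistent with a Hermitian field.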
|
||||
|
||||
///===============================
|
||||
template <typename CIC>
|
||||
void BorgLptModel<CIC>::lpt_redshift_pos_ag(
|
||||
PhaseArrayRef &pos, PhaseArrayRef &vel, PhaseArrayRef &pos_ag,
|
||||
PhaseArrayRef &vel_ag, PhaseArrayRef &lctim) {
|
||||
Cosmology cosmo(cosmo_params);
|
||||
|
||||
//this routine applies the adjoint of the redshift-space position mapping
|
||||
double anh = af;
|
||||
double Hubble = cosmo.Hubble(anh) / cosmo_params.h; ///km /sec /(Mpc/h)
|
||||
|
||||
// the code uses particle momenta p=a^2 dx/dt where x is the co-moving position
|
||||
// peculiar velocities are then given by v_pec = p/a
|
||||
|
||||
//NOTE: Check coefficients
|
||||
boost::array<double, 3> observer = {vobs[0], vobs[1], vobs[2]};
|
||||
|
||||
#pragma omp parallel for
|
||||
for (size_t idx = 0; idx < redshiftInfo.localNumParticlesBefore; idx++) {
|
||||
double x0 = pos[idx][0] + xmin0;
|
||||
double x1 = pos[idx][1] + xmin1;
|
||||
double x2 = pos[idx][2] + xmin2;
|
||||
|
||||
double v0 = vel[idx][0] + observer[0];
|
||||
double v1 = vel[idx][1] + observer[1];
|
||||
double v2 = vel[idx][2] + observer[2];
|
||||
|
||||
double s_pos_ag0 = pos_ag[idx][0];
|
||||
double s_pos_ag1 = pos_ag[idx][1];
|
||||
double s_pos_ag2 = pos_ag[idx][2];
|
||||
|
||||
double r2_los = x0 * x0 + x1 * x1 + x2 * x2;
|
||||
double v_los = x0 * v0 + x1 * v1 + x2 * v2;
|
||||
double facRSD = lctim[idx][2]; // this factor is 1/(a H) for velocities in km/s; an additional factor arises from the momentum conversion
|
||||
|
||||
double slos = s_pos_ag0 * x0 + s_pos_ag1 * x1 + s_pos_ag2 * x2;
|
||||
|
||||
double A = facRSD * slos / r2_los;
|
||||
double B = -2 * facRSD * v_los * slos / square(r2_los);
|
||||
double C = facRSD * v_los / r2_los;
|
||||
|
||||
pos_ag[idx][0] = s_pos_ag0 * (1 + C) + B * x0 + A * v0;
|
||||
pos_ag[idx][1] = s_pos_ag1 * (1 + C) + B * x1 + A * v1;
|
||||
pos_ag[idx][2] = s_pos_ag2 * (1 + C) + B * x2 + A * v2;
|
||||
|
||||
vel_ag[idx][0] = A * x0;
|
||||
vel_ag[idx][1] = A * x1;
|
||||
vel_ag[idx][2] = A * x2;
|
||||
}
|
||||
}
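// Derivation sketch for the coefficients above (follows directly from the
// forward mapping, no new assumptions): the forward map is
//
//   s_j = x_j^grid + C * x_j,   C = facRSD * (v . x) / r^2,   r^2 = x . x
//
// so for an incoming adjoint vector s_pos_ag the chain rule gives
//
//   pos_ag_i = s_pos_ag_i * (1 + C) + B * x_i + A * v_i
//   vel_ag_i = A * x_i
//
// with A = facRSD * (s_pos_ag . x) / r^2 and
//      B = -2 * facRSD * (v . x) * (s_pos_ag . x) / r^4,
// which are exactly the A, B, C computed in the loop.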
|
||||
|
||||
///===============================
|
||||
template <typename CIC>
|
||||
template <typename PositionArray>
|
||||
void BorgLptModel<CIC>::lpt_density_obs_ag(
|
||||
PositionArray &pos, PhaseArrayRef &pos_ag, PhaseArrayRef &vel_ag,
|
||||
ArrayRef const &B, size_t numParts) {
|
||||
double nmean = double(c_N0 * c_N1 * c_N2) /
|
||||
(box_output.N0 * box_output.N1 * box_output.N2);
|
||||
typedef UninitializedArray<boost::multi_array<double, 3>> U_Array;
|
||||
|
||||
if (ALWAYS_MPI(comm)) {
|
||||
// Allocate a temporary density field with extra planes for the
|
||||
// projection leakage
|
||||
U_Array tmp_delta(out_mgr->extents_real(CIC::MPI_PLANE_LEAKAGE));
|
||||
|
||||
density_exchange_planes_ag(
|
||||
comm, tmp_delta.get_array(), B, out_mgr, CIC::MPI_PLANE_LEAKAGE);
|
||||
CIC::adjoint(
|
||||
pos, tmp_delta.get_array(), pos_ag, L0, L1, L2, box_output.N0,
|
||||
box_output.N1, box_output.N2,
|
||||
typename CIC::Periodic_MPI(
|
||||
box_output.N0, box_output.N1, box_output.N2, comm),
|
||||
nmean, numParts);
|
||||
} else {
|
||||
// This is simple, no copy, no adjustment
|
||||
CIC::adjoint(
|
||||
pos, B, pos_ag, L0, L1, L2, box_output.N0, box_output.N1, box_output.N2,
|
||||
CIC_Tools::Periodic(box_output.N0, box_output.N1, box_output.N2), nmean,
|
||||
numParts);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
void BorgLptModel<CIC>::lpt_fwd_model_ag(
|
||||
PhaseArrayRef &pos_ag, PhaseArrayRef &vel_ag, PhaseArrayRef &lctime,
|
||||
CArrayRef &out_ag) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("BORG adjoint model (particles)");
|
||||
|
||||
///NOTE: ICs are generated at ai
|
||||
|
||||
//N.) undo ICs
|
||||
lpt_ic_ag(pos_ag, vel_ag, lctime);
|
||||
|
||||
// RESULT is in AUX1
|
||||
|
||||
if (c_deltao != 0) {
|
||||
array::fill(out_ag, 0);
|
||||
lo_mgr->degrade_complex(*mgr, *AUX1_p, out_ag);
|
||||
} else {
|
||||
fwrap(out_ag) = *AUX1_p;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
void BorgLptModel<CIC>::preallocate() {
|
||||
size_t refPartNum = size_t(c_localN0 * c_N1 * c_N2 * partFactor);
|
||||
auto partExt = boost::extents[refPartNum][3];
|
||||
|
||||
if (!u_pos_ag) {
|
||||
u_pos_ag = std::make_shared<U_PArray>(partExt);
|
||||
u_vel_ag = std::make_shared<U_PArray>(partExt);
|
||||
U_PArray::array_type &pos_ag = u_pos_ag->get_array();
|
||||
U_PArray::array_type &vel_ag = u_vel_ag->get_array();
|
||||
|
||||
array::fill(pos_ag, 0);
|
||||
array::fill(vel_ag, 0);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
void BorgLptModel<CIC>::adjointModel_v2(ModelInputAdjoint<3> gradient_delta) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("BORG adjoint model");
|
||||
// This function computes the adjoint gradient in place: the adjoint gradient of the final
// density must be provided, and the adjoint gradient of the initial field is returned in its place.
|
||||
//
|
||||
///introduce adjoint quantities
|
||||
// This must be allocated in two steps to avoid the implicit
|
||||
// zero initialization.
|
||||
|
||||
preallocate();
|
||||
auto &pos_ag = u_pos_ag->get_array();
|
||||
auto &vel_ag = u_vel_ag->get_array();
|
||||
auto &pos = u_pos->get_array();
|
||||
auto &vel = u_vel->get_array();
|
||||
|
||||
///re-evaluate redshift distortions from forward run
|
||||
if (do_rsd) {
|
||||
PhaseArrayRef &s_pos = u_s_pos->get_array();
|
||||
ctx.print("doing redshift space distortions.");
|
||||
|
||||
///work backwards from final to initial conditions
|
||||
//1.) undo CIC
|
||||
if (gradient_delta) {
|
||||
gradient_delta.setRequestedIO(PREFERRED_REAL);
|
||||
lpt_density_obs_ag(
|
||||
s_pos, pos_ag, vel_ag, gradient_delta.getRealConst(),
|
||||
redshiftInfo.localNumParticlesAfter);
|
||||
}
|
||||
|
||||
particle_undistribute(redshiftInfo, pos_ag);
|
||||
|
||||
//2.) undo redshift distortions
|
||||
lpt_redshift_pos_ag(pos, vel, pos_ag, vel_ag, *lc_timing);
|
||||
|
||||
} else {
|
||||
///work backwards from final to initial conditions
|
||||
//1.) undo CIC
|
||||
if (gradient_delta) {
|
||||
gradient_delta.setRequestedIO(PREFERRED_REAL);
|
||||
lpt_density_obs_ag(
|
||||
pos, pos_ag, vel_ag, gradient_delta.getRealConst(),
|
||||
realInfo.localNumParticlesAfter);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
void BorgLptModel<CIC>::getAdjointModelOutput(
|
||||
ModelOutputAdjoint<3> gradient_delta) {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
auto &pos_ag = u_pos_ag->get_array();
|
||||
auto &vel_ag = u_vel_ag->get_array();
|
||||
|
||||
if (!do_rsd)
|
||||
particle_undistribute(
|
||||
realInfo, pos_ag, make_attribute_helper(Particles::vector(vel_ag)));
|
||||
|
||||
gradient_delta.setRequestedIO(PREFERRED_FOURIER);
|
||||
lpt_fwd_model_ag(
|
||||
pos_ag, vel_ag, *lc_timing, gradient_delta.getFourierOutput());
|
||||
|
||||
clearAdjointGradient();
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
void BorgLptModel<CIC>::adjointModelParticles(
|
||||
PhaseArrayRef &grad_pos, PhaseArrayRef &grad_vel) {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
|
||||
if (do_rsd) {
|
||||
error_helper<ErrorBadState>(
|
||||
"RSD and adjointModelParticles do not work together.");
|
||||
}
|
||||
|
||||
preallocate();
|
||||
auto w_p = fwrap(u_pos_ag->get_array());
|
||||
auto w_v = fwrap(u_vel_ag->get_array());
|
||||
w_p = w_p + fwrap(grad_pos);
|
||||
w_v = w_v + fwrap(grad_vel);
|
||||
}
|
||||
|
||||
template <typename CIC>
|
||||
void BorgLptModel<CIC>::clearAdjointGradient() {
|
||||
u_pos_ag.reset();
|
||||
u_vel_ag.reset();
|
||||
}
|
||||
// ARES TAG: authors_num = 2
|
||||
// ARES TAG: name(0) = Guilhem Lavaux
|
||||
// ARES TAG: email(0) = guilhem.lavaux@iap.fr
|
||||
// ARES TAG: year(0) = 2014-2020
|
||||
// ARES TAG: name(1) = Jens Jasche
|
||||
// ARES TAG: email(1) = jens.jasche@fysik.su.se
|
||||
// ARES TAG: year(1) = 2009-2018
|
|
@@ -0,0 +1,191 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/particle_balancer/attributes.hpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_PARTICLE_ATTRIBUTES_HPP
|
||||
#define __LIBLSS_PARTICLE_ATTRIBUTES_HPP
|
||||
|
||||
#include "libLSS/mpi/generic_mpi.hpp"
|
||||
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
#include <tuple>
|
||||
#include <boost/multi_array.hpp>
|
||||
#include "libLSS/physics/forwards/particle_balancer/aux_array.hpp"
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
/* We implement a classical attribute manager.
* At the moment we support a simple scalar and a vector of 3 components.
* More attribute kinds could be added at a modest implementation cost.
*/
|
||||
|
||||
namespace Particles {
|
||||
|
||||
/**
* This handles the addition of arbitrary scalar attributes attached to a particle.
* If "tempScalar" is true, the attribute is a temporary and its storage is
* cleaned up when the object goes out of scope.
*/
|
||||
template <typename ArrayRef, bool tempScalar = false>
|
||||
struct ScalarAttribute {
|
||||
typedef typename std::remove_reference<ArrayRef>::type ArrayRef_bare;
|
||||
typedef typename ArrayRef_bare::reference refType;
|
||||
typedef typename ArrayRef_bare::const_reference crefType;
|
||||
typedef typename ArrayRef_bare::element Type;
|
||||
typedef aux_array::TemporaryArrayStore<Type, 1> TemporaryArray;
|
||||
typedef typename TemporaryArray::Array TemporaryArrayType;
|
||||
typedef ScalarAttribute<TemporaryArrayType, true> TemporaryAttribute;
|
||||
static constexpr bool temporaryHolder = tempScalar;
|
||||
|
||||
// This is a very thin unique-pointer-style owner implementation.
// It supports temporary arrays that clean themselves up when leaving
// the context, while still allowing external arrays to be provided.
|
||||
TemporaryArray temp_array_holder;
|
||||
ArrayRef_bare &vec;
|
||||
|
||||
ScalarAttribute(ArrayRef_bare &_vec) : temp_array_holder(), vec(_vec) {}
|
||||
|
||||
ScalarAttribute(ScalarAttribute<ArrayRef, false> const &a)
|
||||
: temp_array_holder(), vec(a.vec) {
|
||||
static_assert(
|
||||
tempScalar == false,
|
||||
"It is not possible to copy a non-temp ScalarAttribute to a temp "
|
||||
"ScalarAttribute. Fix your code.");
|
||||
}
|
||||
|
||||
ScalarAttribute(ScalarAttribute<ArrayRef, true> &&a)
|
||||
: temp_array_holder(std::move(a.temp_array_holder)),
|
||||
vec(temp_array_holder.array) {}
|
||||
|
||||
ScalarAttribute(TemporaryArray &&temp)
|
||||
: temp_array_holder(std::move(temp)), vec(temp_array_holder.array) {}
|
||||
|
||||
// Attribute swapper.
|
||||
inline void swap(size_t idx0, size_t idx1) {
|
||||
std::swap(vec[idx0], vec[idx1]);
|
||||
}
|
||||
|
||||
// Store a value in the attribute array. This
|
||||
// is completely expanded by the compiler.
|
||||
template <typename Value>
|
||||
inline void store(size_t idx, Value const &value) {
|
||||
vec[idx] = value;
|
||||
}
|
||||
|
||||
inline Type const &get(size_t idx) const { return vec[idx]; }
|
||||
|
||||
// Recover the mpi type of the content of this attribute.
|
||||
inline MPI_Datatype mpi_type() {
|
||||
// A scalar attribute maps directly to the MPI datatype of its element type.
|
||||
return translateMPIType<typename ArrayRef_bare::element>();
|
||||
}
|
||||
|
||||
// Get the pointer to the memory holding the data.
|
||||
inline Type *getArrayData(size_t shift) { return &vec[shift]; }
|
||||
|
||||
// Allocate a new independent, temporary, scalar attribute.
|
||||
// It will self clean at the destruction of the returned object.
|
||||
static inline TemporaryAttribute allocateTemporary(size_t sz) {
|
||||
return TemporaryAttribute(TemporaryArray(boost::extents[sz]));
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* This creates a scalar attribute helper automatically from an array.
|
||||
*/
|
||||
template <typename ArrayRef>
|
||||
ScalarAttribute<ArrayRef> scalar(ArrayRef &a) {
|
||||
return ScalarAttribute<ArrayRef>(a);
|
||||
}
|
||||
|
||||
/**
|
||||
* This handles the addition of an arbitrary 3d vector attribute attached to a particle.
|
||||
*/
|
||||
template <typename ArrayRef, bool tempVector = false>
|
||||
struct VectorAttribute {
|
||||
|
||||
typedef typename std::remove_reference<ArrayRef>::type ArrayRef_bare;
|
||||
typedef typename ArrayRef_bare::reference refType;
|
||||
typedef typename ArrayRef_bare::const_reference crefType;
|
||||
typedef typename ArrayRef_bare::element Type;
|
||||
typedef aux_array::TemporaryArrayStore<Type, 2> TemporaryArray;
|
||||
typedef typename TemporaryArray::Array TemporaryArrayType;
|
||||
typedef VectorAttribute<TemporaryArrayType, true> TemporaryAttribute;
|
||||
static constexpr bool temporaryHolder = tempVector;
|
||||
|
||||
TemporaryArray temp_array_holder;
|
||||
ArrayRef_bare &vec;
|
||||
|
||||
VectorAttribute(ArrayRef_bare &_vec) : temp_array_holder(), vec(_vec) {}
|
||||
|
||||
VectorAttribute(VectorAttribute<ArrayRef, false> const &a)
|
||||
: temp_array_holder(), vec(a.vec) {
|
||||
static_assert(
|
||||
tempVector == false,
|
||||
"It is not possible to copy a non-temp VectorAttribute to a temp "
|
||||
"VectorAttribute. Fix your code.");
|
||||
}
|
||||
|
||||
VectorAttribute(VectorAttribute<ArrayRef, true> &&a)
|
||||
: temp_array_holder(std::move(a.temp_array_holder)),
|
||||
vec(temp_array_holder.array) {}
|
||||
|
||||
// Only activate this constructor if the passed array is compatible with
|
||||
// temporaryarray.
|
||||
VectorAttribute(TemporaryArray &&temp)
|
||||
: temp_array_holder(std::move(temp)), vec(temp_array_holder.array) {}
|
||||
|
||||
inline void swap(size_t idx0, size_t idx1) {
|
||||
refType vec0 = vec[idx0];
|
||||
refType vec1 = vec[idx1];
|
||||
|
||||
std::swap(vec0[0], vec1[0]);
|
||||
std::swap(vec0[1], vec1[1]);
|
||||
std::swap(vec0[2], vec1[2]);
|
||||
}
|
||||
|
||||
template <typename Value>
|
||||
inline void store(size_t idx, Value const &value) {
|
||||
vec[idx][0] = value[0];
|
||||
vec[idx][1] = value[1];
|
||||
vec[idx][2] = value[2];
|
||||
}
|
||||
|
||||
inline auto get(size_t idx) const -> decltype(vec[idx]) {
|
||||
return vec[idx];
|
||||
}
|
||||
|
||||
inline MPI_Datatype mpi_type() {
|
||||
// Use the mpiVector helper. This array is a set of 3 elements packed
// in an array. Create an MPI type for that to help us with I/O.
|
||||
return mpiVectorType<typename ArrayRef_bare::element, 3>::instance()
|
||||
.type();
|
||||
}
|
||||
|
||||
inline Type *getArrayData(size_t shift) { return &vec[shift][0]; }
|
||||
|
||||
inline static VectorAttribute<TemporaryArrayType, true>
|
||||
allocateTemporary(size_t sz) {
|
||||
return VectorAttribute<TemporaryArrayType, true>(
|
||||
TemporaryArray(boost::extents[sz][3]));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename ArrayRef>
|
||||
VectorAttribute<ArrayRef> vector(ArrayRef &a) {
|
||||
return VectorAttribute<ArrayRef>(a);
|
||||
}
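/*
* Minimal usage sketch (illustrative only; "ids" and "vel" are hypothetical
* arrays, not defined in this header):
*
*   boost::multi_array<size_t, 1> ids(boost::extents[N]);
*   boost::multi_array<double, 2> vel(boost::extents[N][3]);
*
*   auto id_attr  = Particles::scalar(ids);  // ScalarAttribute over ids
*   auto vel_attr = Particles::vector(vel);  // VectorAttribute over vel
*   id_attr.swap(0, 1);                      // exchanges ids[0] and ids[1]
*
* Temporary, self-cleaning variants are obtained through allocateTemporary(n)
* and are used to receive attributes shipped from remote ranks.
*/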
|
||||
|
||||
} // namespace Particles
|
||||
|
||||
} // namespace LibLSS
|
||||
|
||||
#endif
|
|
@@ -0,0 +1,88 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/particle_balancer/aux_array.hpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_AUX_ARRAY_HPP
|
||||
#define __LIBLSS_AUX_ARRAY_HPP
|
||||
|
||||
#include <array>
|
||||
#include <boost/multi_array.hpp>
|
||||
#include <boost/container/static_vector.hpp>
|
||||
#include "libLSS/tools/console.hpp"
|
||||
#include "libLSS/tools/log_traits.hpp"
|
||||
#include "libLSS/tools/memusage.hpp"
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
namespace aux_array {
|
||||
template <typename Extent>
|
||||
size_t product_get(Extent ext) {
|
||||
size_t N = 1;
|
||||
for (auto s : ext) {
|
||||
N *= s.size();
|
||||
}
|
||||
return N;
|
||||
}
|
||||
|
||||
template <size_t N>
|
||||
std::array<size_t, N> zero_gen() {
|
||||
std::array<size_t, N> a;
|
||||
a.fill(0);
|
||||
return a;
|
||||
}
|
||||
|
||||
template <typename T, size_t nd>
|
||||
struct TemporaryArrayStore {
|
||||
typedef LibLSS::Console Console;
|
||||
typedef boost::multi_array_ref<T, nd> Array;
|
||||
size_t array_size;
|
||||
T *array_data;
|
||||
Array array;
|
||||
|
||||
TemporaryArrayStore()
|
||||
: array_size(0), array(0, zero_gen<nd>()){
|
||||
array_data = 0;
|
||||
}
|
||||
|
||||
TemporaryArrayStore(typename boost::multi_array_types::extent_gen::gen_type<nd>::type extents)
|
||||
: array_size(product_get(extents.ranges_)),
|
||||
array_data(new T[product_get(extents.ranges_)]),
|
||||
array(array_data, extents)
|
||||
{
|
||||
report_allocation(array_size*sizeof(T), array_data);
|
||||
}
|
||||
|
||||
template <typename Extent>
|
||||
TemporaryArrayStore(Extent extents)
|
||||
: array_size(product_get(extents)),
|
||||
array_data(new T[product_get(extents)]),
|
||||
array(array_data, extents)
|
||||
{
|
||||
report_allocation(array_size*sizeof(T), array_data);
|
||||
}
|
||||
|
||||
TemporaryArrayStore(TemporaryArrayStore&& other)
|
||||
: array_size(other.array_size),
|
||||
array_data(other.array_data),
|
||||
array(array_data, boost::container::static_vector<size_t, nd>(other.array.shape(), other.array.shape()+nd))
|
||||
{
|
||||
other.array_data = 0;
|
||||
}
|
||||
|
||||
~TemporaryArrayStore() {
|
||||
if (array_data != 0) {
|
||||
delete[] array_data;
|
||||
report_free(array_size*sizeof(T), array_data);
|
||||
}
|
||||
}
|
||||
};
|
||||
} // namespace aux_array
|
||||
|
||||
} // namespace LibLSS
|
||||
|
||||
#endif
|
|
@@ -0,0 +1,64 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/particle_balancer/dyn/attributes.hpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_PARTICLE_ABSTRACT_ATTRIBUTES_HPP
|
||||
#define __LIBLSS_PARTICLE_ABSTRACT_ATTRIBUTES_HPP
|
||||
|
||||
#include "libLSS/mpi/generic_mpi.hpp"
|
||||
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
#include <tuple>
|
||||
#include <boost/multi_array.hpp>
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
namespace AbstractParticles {
|
||||
|
||||
/**
|
||||
* @brief Abstract management of temporary memory for an attribute
|
||||
*
|
||||
*/
|
||||
class TemporaryAttribute {
|
||||
protected:
|
||||
std::shared_ptr<void> ptr;
|
||||
size_t sz;
|
||||
|
||||
TemporaryAttribute() = default;
|
||||
|
||||
public:
|
||||
void *getData() { return ptr.get(); }
|
||||
size_t size() const { return sz; }
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Management of a particle attribute
|
||||
*
|
||||
*/
|
||||
class Attribute {
|
||||
public:
|
||||
virtual ~Attribute();
|
||||
virtual std::shared_ptr<TemporaryAttribute>
|
||||
allocateTemporary(size_t sz) = 0;
|
||||
virtual MPI_Datatype mpi_type() = 0;
|
||||
virtual size_t multiplicity() = 0;
|
||||
virtual void *getArrayData(size_t offset) = 0;
|
||||
|
||||
virtual void swap(
|
||||
boost::multi_array_ref<ssize_t, 1> const &permutation,
|
||||
size_t num) = 0;
|
||||
|
||||
virtual void copy_from_tmp_to(
|
||||
std::shared_ptr<TemporaryAttribute> &tmp, size_t offset) = 0;
|
||||
};
|
||||
|
||||
} // namespace AbstractParticles
|
||||
} // namespace LibLSS
|
||||
|
||||
#endif
|
|
@@ -0,0 +1,85 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/particle_balancer/dyn/particle_distribute.cpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#include "libLSS/cconfig.h"
|
||||
#include <boost/multi_array.hpp>
|
||||
#include <boost/format.hpp>
|
||||
#include <functional>
|
||||
#include "libLSS/mpi/generic_mpi.hpp"
|
||||
#include "libLSS/physics/forwards/particle_balancer/dyn/particle_distribute.hpp"
|
||||
|
||||
using namespace LibLSS;
|
||||
using namespace LibLSS::AbstractParticles;
|
||||
|
||||
Attribute::~Attribute() {}
|
||||
|
||||
void LibLSS::dynamic_particle_redistribute(
|
||||
MPI_Communication *comm, BalanceInfo const &info,
|
||||
std::vector<std::shared_ptr<AbstractParticles::Attribute>> attrs) {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
|
||||
auto &part_idx = info.u_idx->get_array();
|
||||
|
||||
// First we need to reorder the attributes to prepare for shipping.
|
||||
ctx.print("Reorder dynamical attributes");
|
||||
for (auto &a : attrs) {
|
||||
a->swap(part_idx, info.localNumParticlesBefore);
|
||||
}
|
||||
|
||||
// Go through each attribute and send the adequate slice to the remote.
|
||||
std::list<MPICC_Request> reqSend, reqRecv;
|
||||
std::vector<std::shared_ptr<TemporaryAttribute>> all_tmp(
|
||||
attrs.size() * comm->size());
|
||||
|
||||
ctx.print("Send each slice of attributes to the remote");
|
||||
for (auto &a : attrs) {
|
||||
for (int r = 0; r < comm->size(); r++) {
|
||||
if (info.numTransfer[r] != 0 && r != comm->rank()) {
|
||||
void *data = a->getArrayData(info.offsetSend[r]);
|
||||
ctx.format(" -> %d, num = %d", r, info.numTransfer[r]);
|
||||
reqSend.push_back(comm->Isend(
|
||||
data, a->multiplicity() * info.numTransfer[r], a->mpi_type(), r,
|
||||
0));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ctx.print("Recv each slice of attributes from the remote");
|
||||
for (size_t a = 0; a < attrs.size(); a++) {
|
||||
for (size_t r = 0; r < comm->size(); r++) {
|
||||
size_t idx = a * comm->size() + r;
|
||||
ctx.format(" <- %d, num = %d", r, info.numReceive[r]);
|
||||
if (info.numReceive[r] != 0 && r != comm->rank()) {
|
||||
auto tmp = attrs[a]->allocateTemporary(info.numReceive[r]);
|
||||
all_tmp[idx] = tmp;
|
||||
void *data = tmp->getData();
|
||||
reqRecv.push_back(comm->Irecv(
|
||||
data, attrs[a]->multiplicity() * info.numReceive[r],
|
||||
attrs[a]->mpi_type(), r, 0));
|
||||
}
|
||||
}
|
||||
}
|
||||
ctx.print("Waiting for transfer to complete");
|
||||
for (auto &w : reqRecv)
|
||||
w.wait();
|
||||
for (auto &w : reqSend)
|
||||
w.wait();
|
||||
// DONE!
|
||||
|
||||
for (size_t a = 0; a < attrs.size(); a++) {
|
||||
for (size_t r = 0; r < comm->size(); r++) {
|
||||
size_t idx = a * comm->size() + r;
|
||||
if (all_tmp[idx])
|
||||
attrs[a]->copy_from_tmp_to(all_tmp[idx], info.offsetReceive[r]);
|
||||
}
|
||||
}
|
||||
|
||||
for (auto &a : all_tmp)
|
||||
a.reset();
|
||||
}
|
|
@@ -0,0 +1,37 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/particle_balancer/dyn/particle_distribute.hpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_PARTICLE_DYN_DISTRIBUTE_HPP
|
||||
#define __LIBLSS_PARTICLE_DYN_DISTRIBUTE_HPP
|
||||
|
||||
#include <boost/multi_array.hpp>
|
||||
#include <boost/format.hpp>
|
||||
#include <functional>
|
||||
#include "libLSS/mpi/generic_mpi.hpp"
|
||||
#include "libLSS/tools/mpi_fftw_helper.hpp"
|
||||
#include "libLSS/tools/uninitialized_type.hpp"
|
||||
#include "libLSS/tools/array_tools.hpp"
|
||||
#include "libLSS/physics/forwards/particle_balancer/particle_distribute.hpp"
|
||||
#include "libLSS/physics/forwards/particle_balancer/dyn/attributes.hpp"
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
/**
|
||||
* @brief Distribute the attributes according to the balance info instructions.
|
||||
*
|
||||
* @param comm MPI communicator
|
||||
* @param info Info for particle balancing
|
||||
* @param attrs Vector of attributes
|
||||
*/
|
||||
void dynamic_particle_redistribute(
|
||||
MPI_Communication *comm, BalanceInfo const &info,
|
||||
std::vector<std::shared_ptr<AbstractParticles::Attribute>> attrs);
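/*
* Minimal usage sketch (illustrative only; "vel", "ids" and "info" are
* assumed to exist and to describe the same local particle set):
*
*   std::vector<std::shared_ptr<AbstractParticles::Attribute>> attrs;
*   attrs.push_back(AbstractParticles::vector(vel));  // see dyn/vector.hpp
*   attrs.push_back(AbstractParticles::scalar(ids));  // see dyn/scalar.hpp
*   dynamic_particle_redistribute(comm, info, attrs);
*
* Each attribute is first reordered in place following info.u_idx and then
* exchanged with the ranks indicated by info.numTransfer / info.numReceive.
*/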
|
||||
} // namespace LibLSS
|
||||
|
||||
#endif
|
|
@@ -0,0 +1,97 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/particle_balancer/dyn/scalar.hpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_PARTICLE_ABSTRACT_SCALAR_ATTRIBUTE_HPP
|
||||
#define __LIBLSS_PARTICLE_ABSTRACT_SCALAR_ATTRIBUTE_HPP
|
||||
|
||||
#include "libLSS/mpi/generic_mpi.hpp"
|
||||
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
#include <tuple>
|
||||
#include <boost/multi_array.hpp>
|
||||
#include "libLSS/physics/forwards/particle_balancer/dyn/attributes.hpp"
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
namespace AbstractParticles {
|
||||
|
||||
template <typename Element>
|
||||
class ScalarTemporary : public TemporaryAttribute {
|
||||
protected:
|
||||
std::shared_ptr<Element> arr;
|
||||
|
||||
ScalarTemporary() = default;
|
||||
|
||||
public:
|
||||
ScalarTemporary(size_t sz_)
|
||||
: arr(std::shared_ptr<Element>(
|
||||
new Element[sz_], [](Element *e) { delete[] e; })) {
|
||||
this->ptr = arr;
|
||||
this->sz = sz_;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename ArrayRef>
|
||||
class ScalarAttribute : public Attribute {
|
||||
public:
|
||||
typedef typename std::remove_reference<ArrayRef>::type ArrayRef_bare;
|
||||
typedef typename ArrayRef_bare::reference refType;
|
||||
typedef typename ArrayRef_bare::const_reference crefType;
|
||||
typedef typename ArrayRef_bare::element Type;
|
||||
|
||||
ArrayRef_bare &vec;
|
||||
|
||||
ScalarAttribute(ArrayRef_bare &_vec) : vec(_vec) {}
|
||||
~ScalarAttribute() override {}
|
||||
|
||||
std::shared_ptr<TemporaryAttribute>
|
||||
allocateTemporary(size_t sz) override {
|
||||
Console::instance().format<LOG_DEBUG>("allocateTemporary(sz=%d)", sz);
|
||||
return std::make_shared<ScalarTemporary<Type>>(sz);
|
||||
}
|
||||
|
||||
size_t multiplicity() override { return 1; }
|
||||
|
||||
MPI_Datatype mpi_type() override { return translateMPIType<Type>(); }
|
||||
|
||||
void *getArrayData(size_t offset) override { return &vec[offset]; }
|
||||
|
||||
void swap(
|
||||
boost::multi_array_ref<ssize_t, 1> const &permutation,
|
||||
size_t num) override {
|
||||
for (size_t i = 0; i < num; ++i) {
|
||||
if (permutation[i] != i)
|
||||
std::swap(vec[i], vec[permutation[i]]);
|
||||
}
|
||||
}
|
||||
|
||||
void copy_from_tmp_to(
|
||||
std::shared_ptr<TemporaryAttribute> &tmp, size_t offset) override {
|
||||
if (!tmp) {
|
||||
error_helper<ErrorBadState>("Invalid array");
|
||||
}
|
||||
auto origin = (Type *)tmp->getData();
|
||||
size_t sz = tmp->size();
|
||||
for (size_t i = 0; i < sz; i++)
|
||||
vec[offset + i] = origin[i];
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* This creates a scalar attribute helper automatically from an array.
|
||||
*/
|
||||
template <typename ArrayRef>
|
||||
std::shared_ptr<AbstractParticles::Attribute> scalar(ArrayRef &a) {
|
||||
return std::make_shared<ScalarAttribute<ArrayRef>>(a);
|
||||
}
|
||||
} // namespace AbstractParticles
|
||||
} // namespace LibLSS
|
||||
|
||||
#endif
|
|
@@ -0,0 +1,116 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/particle_balancer/dyn/vector.hpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_PARTICLE_ABSTRACT_VECTOR_ATTRIBUTE_HPP
|
||||
#define __LIBLSS_PARTICLE_ABSTRACT_VECTOR_ATTRIBUTE_HPP
|
||||
|
||||
#include "libLSS/mpi/generic_mpi.hpp"
|
||||
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
#include <tuple>
|
||||
#include <boost/multi_array.hpp>
|
||||
#include "libLSS/physics/forwards/particle_balancer/dyn/attributes.hpp"
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
namespace AbstractParticles {
|
||||
|
||||
template <typename Element>
|
||||
class VectorTemporary : public TemporaryAttribute {
|
||||
protected:
|
||||
std::shared_ptr<Element> arr;
|
||||
|
||||
VectorTemporary() = default;
|
||||
|
||||
public:
|
||||
VectorTemporary(size_t sz_, size_t d)
|
||||
: arr(std::shared_ptr<Element>(
|
||||
new Element[sz_ * d], [sz_](Element *e) {
|
||||
Console::instance().format<LOG_DEBUG>("Freeing sz=%d", sz_);
|
||||
delete[] e;
|
||||
})) {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
ctx.format("allocated temporary vector sz=%d, d=%d", sz_, d);
|
||||
this->ptr = arr;
|
||||
this->sz = sz_;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename ArrayRef>
|
||||
class VectorAttribute : public Attribute {
|
||||
public:
|
||||
typedef typename std::remove_reference<ArrayRef>::type ArrayRef_bare;
|
||||
typedef typename ArrayRef_bare::reference refType;
|
||||
typedef typename ArrayRef_bare::const_reference crefType;
|
||||
typedef typename ArrayRef_bare::element Type;
|
||||
|
||||
ArrayRef_bare &vec;
|
||||
|
||||
VectorAttribute(ArrayRef_bare &_vec) : vec(_vec) {}
|
||||
|
||||
std::shared_ptr<TemporaryAttribute>
|
||||
allocateTemporary(size_t sz) override {
|
||||
return std::make_shared<VectorTemporary<Type>>(sz, vec.shape()[1]);
|
||||
}
|
||||
|
||||
size_t multiplicity() override { return vec.shape()[1]; }
|
||||
|
||||
MPI_Datatype mpi_type() override { return translateMPIType<Type>(); }
|
||||
|
||||
void *getArrayData(size_t offset) override {
|
||||
return &this->vec[offset][0];
|
||||
}
|
||||
|
||||
void swap(
|
||||
boost::multi_array_ref<ssize_t, 1> const &permutation,
|
||||
size_t num) override {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
const size_t d = vec.shape()[1];
|
||||
|
||||
for (size_t i = 0; i < num; ++i) {
|
||||
if (permutation[i] != i) {
|
||||
for (size_t j = 0; j < d; j++) {
|
||||
std::swap(vec[i][j], vec[permutation[i]][j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void copy_from_tmp_to(
|
||||
std::shared_ptr<TemporaryAttribute> &tmp, size_t offset) override {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
if (!tmp) {
|
||||
error_helper<ErrorBadState>("Invalid array");
|
||||
}
|
||||
auto origin = (Type *)tmp->getData();
|
||||
size_t sz = tmp->size();
|
||||
const size_t d = vec.shape()[1];
|
||||
|
||||
for (size_t i = 0; i < sz; i++)
|
||||
for (size_t j = 0; j < d; j++) {
|
||||
size_t k = i * d + j;
|
||||
ctx.format("i=%d, j=%d, k=%d", i, j, k);
|
||||
vec[offset + i][j] = origin[k];
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* This creates a vector attribute helper automatically from an array.
|
||||
*/
|
||||
template <typename ArrayRef>
|
||||
std::shared_ptr<AbstractParticles::Attribute> vector(ArrayRef &a) {
|
||||
return std::make_shared<VectorAttribute<ArrayRef>>(a);
|
||||
}
|
||||
|
||||
} // namespace AbstractParticles
|
||||
} // namespace LibLSS
|
||||
|
||||
#endif
|
|
@@ -0,0 +1,355 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/particle_balancer/part_swapper.hpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_TOOLS_PART_SWAPPER_HPP
|
||||
#define __LIBLSS_TOOLS_PART_SWAPPER_HPP
|
||||
|
||||
#include <boost/multi_array.hpp>
|
||||
#include "libLSS/tools/console.hpp"
|
||||
#include "libLSS/mpi/generic_mpi.hpp"
|
||||
#include "libLSS/physics/forwards/particle_balancer/aux_array.hpp"
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
namespace internal_swapper {
|
||||
|
||||
template <size_t id>
|
||||
struct storage_impl {
|
||||
template <typename A, typename B, typename T>
|
||||
void exec(A &a, B &b, size_t idx, T const &d) {
|
||||
storage_impl<id - 1>().exec(a.next, a.this_attribute, idx, d);
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct storage_impl<0> {
|
||||
template <typename A, typename B, typename T>
|
||||
void exec(A &a, B &b, size_t idx, T const &d) {
|
||||
b.store(idx, d);
|
||||
}
|
||||
};
|
||||
|
||||
// Particles come as a bunch of arrays of the same size but possibly
// different types. One array is mandatory: the 3d position. However one
// can consider adding other attributes. For vectorization purposes, it is
// slightly better to consider them as a bunch of arrays instead of an array
// of structures. However it also creates a lot of tension in the code.
// Maybe in the future everything will be transposed, which will induce a
// significant rewrite of the PM code. Currently we will consider that a
// simulation state can be modeled as follows:
// struct State {
//   Positions pos;
//   Velocities vel;
//   OtherStuff other;
// }
// But each of those is an array of the same size.
// When doing load balancing these arrays must be swapped synchronously.
// The other reason for keeping this representation is that a lot of the rest
// of the infrastructure relies on an Nx3 array for positions. Doing
// transposes on the fly, or without vectorization, could cost us dearly in
// terms of performance.
|
||||
|
||||
template <typename... Attr>
|
||||
struct AttributeTuple {};
|
||||
|
||||
// If we have at least one attribute then we can proceed with this
|
||||
// implementation.
|
||||
template <typename First, typename... Attr>
|
||||
struct AttributeTuple<First, Attr...> {
|
||||
typedef AttributeTuple<First, Attr...> Self;
|
||||
typedef AttributeTuple<Attr...> Next;
|
||||
typedef AttributeTuple<
|
||||
typename First::TemporaryAttribute,
|
||||
typename Attr::TemporaryAttribute...>
|
||||
TempSelf;
|
||||
typedef typename First::refType refType;
|
||||
First this_attribute;
|
||||
AttributeTuple<Attr...> next;
|
||||
static constexpr size_t numAttributes = 1 + sizeof...(Attr);
|
||||
|
||||
AttributeTuple(Self const &other)
|
||||
: this_attribute(other.this_attribute), next(other.next) {}
|
||||
|
||||
AttributeTuple(Self &&other)
|
||||
: this_attribute(std::move(other.this_attribute)),
|
||||
next(std::move(other.next)) {}
|
||||
|
||||
AttributeTuple(First &&attr, Attr &&... others)
|
||||
: this_attribute(std::move(attr)), next(std::move(others)...) {}
|
||||
|
||||
inline void swap(size_t idx0, size_t idx1) {
|
||||
this_attribute.swap(idx0, idx1);
|
||||
next.swap(idx0, idx1);
|
||||
}
|
||||
|
||||
// This is a classic recurrence for the store<id> function.
// Recurse until id reaches zero.
|
||||
template <size_t id, typename DataType>
|
||||
inline void store(size_t idx, DataType const &data) {
|
||||
storage_impl<id>().exec(this->next, this->this_attribute, idx, data);
|
||||
}
|
||||
|
||||
template <typename Other>
|
||||
struct AcceptedVariant {
|
||||
static constexpr auto value = std::is_same<Other, Self>::value ||
|
||||
std::is_same<Other, TempSelf>::value;
|
||||
};
|
||||
|
||||
// We need this other type to be either ourself or a Variant
|
||||
// corresponding to the temporary attribute. Anything else is
|
||||
// rejected.
|
||||
template <typename Other>
|
||||
inline typename std::enable_if<AcceptedVariant<Other>::value>::type
|
||||
recursive_store(size_t idx, Other const &other, size_t where_from) {
|
||||
this_attribute.store(idx, other.this_attribute.get(where_from));
|
||||
next.recursive_store(idx, other.next, where_from);
|
||||
}
|
||||
|
||||
auto tuple_get(size_t idx) -> decltype(std::tuple_cat(
|
||||
std::make_tuple(this_attribute.get(idx)), next.tuple_get(idx))) {
|
||||
return std::tuple_cat(
|
||||
std::make_tuple(this_attribute.get(idx)), next.tuple_get(idx));
|
||||
}
|
||||
|
||||
// This is a helper function. It allows building a call to a function f
// where all attributes are expanded in a single call. More details in the
// particle swapper.
|
||||
static inline TempSelf allocateTemporary(size_t sz) {
|
||||
return TempSelf(
|
||||
First::allocateTemporary(sz), Attr::allocateTemporary(sz)...);
|
||||
}
|
||||
|
||||
// This is an unusual dynamic accessor for a static tuple.
// Unfortunately it is far easier to implement this with a dynamic
// syntax rather than with template syntax, as it is going to be intertwined
// with calls to MPI functions.
|
||||
inline MPI_Datatype mpi_type(size_t const aid) {
|
||||
if (aid == 0)
|
||||
return this_attribute.mpi_type();
|
||||
else
|
||||
return next.mpi_type(aid - 1);
|
||||
}
|
||||
|
||||
inline void *getArrayData(size_t const aid, size_t const shift) {
|
||||
if (aid == 0)
|
||||
return this_attribute.getArrayData(shift);
|
||||
else
|
||||
return next.getArrayData(aid - 1, shift);
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct AttributeTuple<> {
|
||||
typedef AttributeTuple<> Self;
|
||||
typedef AttributeTuple<> TempSelf;
|
||||
static const size_t numAttributes = 0;
|
||||
|
||||
AttributeTuple(Self const &other) {}
|
||||
AttributeTuple() {}
|
||||
AttributeTuple(Self &&other) {}
|
||||
|
||||
inline void swap(size_t, size_t) {}
|
||||
|
||||
inline void recursive_store(
|
||||
size_t idx, AttributeTuple<> const &other, size_t where_from) {}
|
||||
|
||||
std::tuple<> tuple_get(size_t idx) { return std::tuple<>(); }
|
||||
|
||||
static inline TempSelf allocateTemporary(size_t sz) { return TempSelf(); }
|
||||
|
||||
inline MPI_Datatype mpi_type(size_t aid) {
|
||||
LibLSS::Console &cons = LibLSS::Console::instance();
|
||||
cons.print<LOG_ERROR>("Invalid access in AttributeTuple::mpi_type");
|
||||
MPI_Communication::instance()->abort();
|
||||
return MPI_INTEGER;
|
||||
}
|
||||
|
||||
inline void *getArrayData(size_t aid, size_t shift) {
|
||||
LibLSS::Console &cons = LibLSS::Console::instance();
|
||||
cons.print<LOG_ERROR>("Invalid access in AttributeTuple::getArrayData");
|
||||
MPI_Communication::instance()->abort();
|
||||
return (void *)0;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace internal_swapper
|
||||
|
||||
/**
* Build an attribute tuple which helps access the different arrays through
* the same syntactic interface.
*/
|
||||
template <typename... Attrs>
|
||||
inline internal_swapper::AttributeTuple<Attrs...>
|
||||
make_attribute_helper(Attrs &&... attrs) {
|
||||
return internal_swapper::AttributeTuple<Attrs...>(
|
||||
std::forward<Attrs>(attrs)...);
|
||||
}
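/*
* A minimal sketch of how the tuple is assembled (illustrative only; the
* array names are assumptions):
*
*   auto attrs = make_attribute_helper(
*       Particles::vector(vel), Particles::scalar(ids));
*
* attrs.swap(i, j) then swaps both attributes at once, and
* decltype(attrs)::allocateTemporary(n) builds matching temporary storage
* for n incoming particles.
*/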
|
||||
|
||||
/**
* This class implements helper methods to exchange particles
* and their attributes in place in their arrays.
* It also provides temporary array allocators for auxiliary attributes.
* This is supposed to be an internal class for the particle
* MPI distribution routine. However it is fairly generic for other uses
* requiring the synchronized reorganisation of different arrays
* at the same time, without knowing a priori what those arrays are before
* instantiation.
*
* @tparam ArrayRef reference to the fundamental array corresponding to positions (typically a boost::multi_array or boost::multi_array_ref).
* @tparam AuxiliaryAttributes Attribute class descriptor like LibLSS::internal_swapper::AttributeTuple.
*
* @see LibLSS::make_attribute_helper
*/
|
||||
template <typename ArrayRef, typename AuxiliaryAttributes>
|
||||
class ParticleSwapper {
|
||||
public:
|
||||
typedef typename boost::remove_reference<ArrayRef>::type ArrayRef_bare;
|
||||
typedef typename ArrayRef_bare::reference refType;
|
||||
typedef ParticleSwapper<ArrayRef, AuxiliaryAttributes> Self;
|
||||
|
||||
ArrayRef pos;
|
||||
AuxiliaryAttributes attrs;
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
* @param _pos the array of positions
|
||||
* @param _attrs the attribute tuple obtained through LibLSS::make_attribute_helper
|
||||
*/
|
||||
ParticleSwapper(ArrayRef _pos, AuxiliaryAttributes _attrs)
|
||||
: pos(_pos), attrs(_attrs) {}
|
||||
|
||||
/**
|
||||
* Execute an in-place swap of positions and attributes for particles
|
||||
* at index idx0 and idx1.
|
||||
*
|
||||
* @param idx0 Index of first particle.
|
||||
* @param idx1 Index of second particle.
|
||||
*/
|
||||
inline void do_swap(size_t idx0, size_t idx1) {
|
||||
refType in_pos = pos[idx0];
|
||||
refType out_pos = pos[idx1];
|
||||
|
||||
std::swap(in_pos[0], out_pos[0]);
|
||||
std::swap(in_pos[1], out_pos[1]);
|
||||
std::swap(in_pos[2], out_pos[2]);
|
||||
attrs.swap(idx0, idx1);
|
||||
}
|
||||
|
||||
/**
* This is a helper function. It allows building a call to a function f
* where all attributes are expanded in a single call.
* allocateTemporary calls each attribute's allocateTemporary so that a
* single call creates the new AuxiliaryAttributes object. To finalize the
* construction we rely on allocateTemporaryUnwrapper, which enforces move
* semantics and calls the actual allocateTemporary on each attribute.
*/
|
||||
static inline typename AuxiliaryAttributes::TempSelf
|
||||
allocateTemporary(size_t sz) {
|
||||
return AuxiliaryAttributes::allocateTemporary(sz);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get access to the raw pointer holding the specified attribute,
|
||||
* eventually shifted by an index 'shift'.
|
||||
* @param aid attribute index.
|
||||
* @param shift index of the element for which we request the pointer.
|
||||
*/
|
||||
inline void *getArrayData(size_t const aid, size_t const shift) {
|
||||
return attrs.getArrayData(aid, shift);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the MPI datatype of the attribute "aid"
|
||||
* @param aid Attribute index.
|
||||
*/
|
||||
inline MPI_Datatype mpi_type(size_t const aid) {
|
||||
return attrs.mpi_type(aid);
|
||||
}
|
||||
|
||||
/**
* Copy the provided position (an array-like object with shape [N][3]) at "where_from"
* to the target position "target_idx" in the held array.
* @param target_idx where to copy the position to.
* @param posRecv an array-like object of positions
* @param where_from source position in posRecv.
*/
|
||||
template <typename ArrayRecv>
|
||||
inline void copy_in_pos(
|
||||
size_t target_idx, const ArrayRecv &posRecv, size_t where_from) {
|
||||
typename ArrayRecv::const_reference loc_in_pos = posRecv[where_from];
|
||||
refType loc_out_pos = pos[target_idx];
|
||||
|
||||
loc_out_pos[0] = loc_in_pos[0];
|
||||
loc_out_pos[1] = loc_in_pos[1];
|
||||
loc_out_pos[2] = loc_in_pos[2];
|
||||
}
|
||||
|
||||
/**
* Copy the attribute provided in "attrRecv" at "where_from"
* to the target position "target_idx".
*
* @tparam id the identifier of the attribute.
* @param target_idx the target position.
* @param attrRecv an array-like object holding the new attribute value.
* @param where_from where to copy that attribute from.
*/
template <size_t id, typename ArrayRecv>
|
||||
inline void copy_in_attr(
|
||||
size_t target_idx, const ArrayRecv &attrRecv, size_t where_from) {
|
||||
typename ArrayRecv::const_reference loc_in_attr = attrRecv[where_from];
|
||||
attrs.store<id>(target_idx, loc_in_attr);
|
||||
}
|
||||
|
||||
/**
* Copy all attributes from attrRecv (an AttributeTuple class) to the
* currently held position/attribute array.
*/
template <typename OtherAttributes>
|
||||
inline void copy_in_all_attrs(
|
||||
size_t target_idx, OtherAttributes const &attrRecv, size_t where_from) {
|
||||
attrs.recursive_store(target_idx, attrRecv, where_from);
|
||||
}
|
||||
};
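For illustration, here is a minimal usage sketch of the swapper with no auxiliary attributes (using the NoAuxiliaryAttributes typedef introduced just below); the function name and array sizes are illustrative only and not part of the library:

// Minimal sketch, assuming only the part_swapper.hpp header quoted here and
// that ParticleSwapper is reachable from the LibLSS namespace, as it is in the
// ParticleSwapperTaped compatibility layer below.
#include <boost/multi_array.hpp>
#include "libLSS/physics/forwards/particle_balancer/part_swapper.hpp"

void example_swap() {
  using namespace LibLSS;

  boost::multi_array<double, 2> pos(boost::extents[16][3]);
  pos[0][0] = 1.0;
  pos[3][0] = 2.0;

  // AttributeTuple<> carries no attribute, so only positions are exchanged.
  NoAuxiliaryAttributes attrs;
  ParticleSwapper<decltype(pos) &, NoAuxiliaryAttributes> swapper(pos, attrs);

  // After this call, rows 0 and 3 of pos have been exchanged in place.
  swapper.do_swap(0, 3);
}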
|
||||
|
||||
// Import only the right class template into the LibLSS namespace
|
||||
using internal_swapper::AttributeTuple;
|
||||
typedef AttributeTuple<> NoAuxiliaryAttributes;
|
||||
|
||||
// This is a compatibility layer for the previous mechanisms.
template <bool doVel, typename ArrayRef>
|
||||
class ParticleSwapperTaped {};
|
||||
|
||||
template <typename ArrayRef>
|
||||
class ParticleSwapperTaped<true, ArrayRef>
|
||||
: public ParticleSwapper<
|
||||
typename ArrayRef::reference, NoAuxiliaryAttributes> {
|
||||
public:
|
||||
typedef ParticleSwapper<typename ArrayRef::reference, NoAuxiliaryAttributes>
|
||||
super;
|
||||
|
||||
ParticleSwapperTaped(ArrayRef &_pos, ArrayRef &_vel, int istep)
|
||||
: super(_pos[istep], _vel[istep]) {}
|
||||
};
|
||||
|
||||
template <typename ArrayRef>
|
||||
class ParticleSwapperTaped<false, ArrayRef>
|
||||
: public ParticleSwapper<
|
||||
typename ArrayRef::reference, NoAuxiliaryAttributes> {
|
||||
public:
|
||||
typedef ParticleSwapper<typename ArrayRef::reference, NoAuxiliaryAttributes>
|
||||
super;
|
||||
|
||||
ParticleSwapperTaped(ArrayRef &_pos, ArrayRef &_vel, int istep)
|
||||
: super(_pos[istep]) {}
|
||||
};
|
||||
|
||||
}; // namespace LibLSS
|
||||
|
||||
#endif
|
|
@@ -0,0 +1,554 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/particle_balancer/particle_distribute.hpp
|
||||
Copyright (C) 2017-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_PARTICLE_DISTRIBUTE_HPP
|
||||
#define __LIBLSS_PARTICLE_DISTRIBUTE_HPP
|
||||
|
||||
#include <boost/multi_array.hpp>
|
||||
#include <boost/format.hpp>
|
||||
#include <functional>
|
||||
#include "libLSS/mpi/generic_mpi.hpp"
|
||||
#include "libLSS/tools/mpi_fftw_helper.hpp"
|
||||
#include "libLSS/tools/uninitialized_type.hpp"
|
||||
#include "libLSS/tools/array_tools.hpp"
|
||||
#include "libLSS/physics/forwards/particle_balancer/attributes.hpp"
|
||||
#include "libLSS/physics/forwards/particle_balancer/part_swapper.hpp"
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
static const bool ULTRA_CHECK = false;
|
||||
|
||||
template <typename PartIdxArray>
|
||||
inline void initIndexes(PartIdxArray part_idx, size_t numParts) {
|
||||
typename PartIdxArray::index_gen i_gen;
|
||||
typedef typename PartIdxArray::index_range i_range;
|
||||
// initialize the array with the range 0, 1, 2, ..., numParts-1
copy_array_rv(
|
||||
part_idx[i_gen[i_range(0, numParts)]],
|
||||
b_fused_idx<long, 1>(boost::lambda::_1));
|
||||
}
|
||||
|
||||
// This is an example of an integrated storage for all the ancillary parameters required
|
||||
// by the particle redistribute algorithm
|
||||
struct BalanceInfo {
|
||||
typedef boost::multi_array<ssize_t, 1> IdxArray;
|
||||
|
||||
IdxArray offsetReceive, offsetSend, numTransfer, numReceive;
|
||||
UninitializedArray<IdxArray> *u_idx;
|
||||
size_t localNumParticlesBefore, localNumParticlesAfter;
|
||||
MPI_Communication *comm;
|
||||
|
||||
BalanceInfo() : u_idx(0), comm(0) {}
|
||||
|
||||
void allocate(MPI_Communication *newComm, size_t partNum) {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
auto extComm = boost::extents[newComm->size()];
|
||||
|
||||
comm = newComm;
|
||||
clear();
|
||||
u_idx = new UninitializedArray<IdxArray>(boost::extents[partNum]);
|
||||
initIndexes(u_idx->get_array(), partNum);
|
||||
offsetReceive.resize(extComm);
|
||||
offsetSend.resize(extComm);
|
||||
numTransfer.resize(extComm);
|
||||
numReceive.resize(extComm);
|
||||
}
|
||||
|
||||
void clear() {
|
||||
if (u_idx != 0) {
|
||||
delete u_idx;
|
||||
u_idx = 0;
|
||||
}
|
||||
}
|
||||
};
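A rough usage sketch of this bookkeeping structure, assuming an MPI_Communication pointer is already available and that this header (particle_distribute.hpp) is included; the numbers and the function name are illustrative:

// Sketch: size the per-rank arrays and the particle index tape, then release them.
void example_balance_info(LibLSS::MPI_Communication *comm) {
  LibLSS::BalanceInfo info;

  info.allocate(comm, 1000);          // room for up to 1000 local particle slots
  info.localNumParticlesBefore = 800; // particles currently valid on this rank

  // ... particle_redistribute(info, in_pos, selector) would be called here ...

  info.clear(); // frees the UninitializedArray holding the index tape
}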
|
||||
|
||||
template <
|
||||
typename ParticleArray, typename IndexArray, typename CountArray,
|
||||
typename OffsetArray, typename ParticleSelector,
|
||||
typename AuxiliaryAttributes = NoAuxiliaryAttributes>
|
||||
void particle_redistribute(
|
||||
MPI_Communication *comm, ParticleArray &in_pos, IndexArray &part_idx,
|
||||
size_t inParticles, size_t &outParticles, CountArray &numTransferStep,
|
||||
CountArray &numReceiveStep, OffsetArray &offsetReceiveStep,
|
||||
OffsetArray &offsetSendStep, ParticleSelector selector,
|
||||
AuxiliaryAttributes attrs = AuxiliaryAttributes()) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("particle distribution");
|
||||
typedef boost::multi_array<long, 1> LongArray;
|
||||
|
||||
using boost::extents;
|
||||
using boost::format;
|
||||
using boost::lambda::_1;
|
||||
typedef LongArray::index_range range;
|
||||
LongArray::index_gen indices;
|
||||
typedef typename ParticleArray::reference PosElt;
|
||||
typedef typename IndexArray::reference IdxTapeElt;
|
||||
typedef LongArray::element LongElt;
|
||||
typedef size_t CommT;
|
||||
CommT thisRank = comm->rank();
|
||||
CommT commSize = comm->size();
|
||||
|
||||
|
||||
if (commSize == 1) {
|
||||
outParticles = inParticles;
|
||||
return;
|
||||
}
|
||||
ParticleSwapper<ParticleArray &, AuxiliaryAttributes> swapper(
|
||||
in_pos, attrs);
|
||||
|
||||
LongArray numTransfer(extents[commSize]);
|
||||
LongArray numReceive(extents[commSize]);
|
||||
LongArray offsetTransfer(extents[1 + commSize]);
|
||||
LongArray offsetIncoming(extents[1 + commSize]);
|
||||
LongArray offsetRangeTransfer(extents[1 + commSize]);
|
||||
|
||||
LongElt totalIncoming;
|
||||
LongElt baseOffset;
|
||||
|
||||
RequestArray reqRecv_pos(extents[commSize]), reqSend_pos(extents[commSize]);
|
||||
RequestArray reqRecv_attr(
|
||||
extents[AuxiliaryAttributes::numAttributes * commSize]),
|
||||
reqSend_attr(extents[AuxiliaryAttributes::numAttributes * commSize]);
|
||||
|
||||
ctx.format("Computing particles to be exchanged, (inParticles=%d)", inParticles);
|
||||
// There is something to parallelize here
|
||||
for (size_t i = 0; i < inParticles; i++) {
|
||||
PosElt loc_pos = in_pos[i];
|
||||
numTransfer[selector(loc_pos, attrs.tuple_get(i))]++;
|
||||
}
|
||||
|
||||
// MPI: do particle exchange to re-establish locality
ctx.print("all2all...");
|
||||
comm->all2allT(numTransfer.data(), 1, numReceive.data(), 1);
|
||||
numReceive[thisRank] = 0;
|
||||
ctx.print("Done");
|
||||
|
||||
// Here we do a sanity check: if there is a memory error,
// we must detect it here and have all nodes stop operations
// together.
size_t totalParticles[2] = {inParticles, in_pos.shape()[0]};
|
||||
for (CommT i = 0; i < commSize; i++) {
|
||||
if (i == thisRank)
|
||||
continue;
|
||||
totalParticles[0] += numReceive[i];
|
||||
totalParticles[0] -= numTransfer[i];
|
||||
}
|
||||
ctx.print(boost::format("totalParticles = %ld") % totalParticles[0]);
|
||||
|
||||
boost::multi_array<size_t, 1> totalArray(boost::extents[2 * comm->size()]);
|
||||
comm->all_gather_t(totalParticles, 2, totalArray.data(), 2);
|
||||
|
||||
for (CommT i = 0; i < commSize; i++) {
|
||||
ctx.print(
|
||||
boost::format("Node %d: totalParticles = %ld / %ld") % i %
|
||||
totalArray[2 * i] % totalArray[2 * i + 1]);
|
||||
if (totalArray[2 * i] >= totalArray[2 * i + 1] &&
|
||||
totalArray[2 * i] != 0) {
|
||||
// This is thrown on all nodes
|
||||
error_helper<ErrorLoadBalance>(
|
||||
"Not enough particles per node. Increase particle alloc factor");
|
||||
}
|
||||
}
|
||||
|
||||
for (CommT i = 0; i < commSize; i++)
|
||||
ctx.print(
|
||||
format(" - peer=%d, recv = %ld, send = %ld ") % i % numReceive[i] %
|
||||
numTransfer[i]);
|
||||
|
||||
// Figure out the offsets in the buffers that will be transferred
// to the other nodes
offsetTransfer[0] = 0;
|
||||
offsetIncoming[0] = 0;
|
||||
for (CommT r = 1; r <= commSize; r++) {
|
||||
offsetTransfer[r] = offsetTransfer[r - 1] + numTransfer[r - 1];
|
||||
offsetIncoming[r] = offsetIncoming[r - 1] + numReceive[r - 1];
|
||||
}
|
||||
for (CommT r = 0; r < thisRank; r++) {
|
||||
offsetTransfer[r] += numTransfer[thisRank];
|
||||
}
|
||||
|
||||
// Adjust the amount to transfer for this node.
// The offset for the particles staying on this node is zero, to promote locality.
offsetTransfer[thisRank] = 0;
|
||||
totalIncoming = offsetIncoming[commSize];
|
||||
offsetRangeTransfer = offsetTransfer;
|
||||
|
||||
for (CommT i = 0; i < commSize; i++) {
|
||||
if (numTransfer[i] != 0)
|
||||
ctx.print(
|
||||
format(" - peer=%d, offsetIncoming = %ld, offsetTransfer = "
|
||||
"[%ld,%ld]") %
|
||||
i % offsetIncoming[i] % offsetTransfer[i] %
|
||||
(offsetTransfer[i] + numTransfer[i]));
|
||||
}
|
||||
|
||||
// Now reorder them to push them for transfer
|
||||
{
|
||||
ConsoleContext<LOG_DEBUG> loc_ctx("sorting particles");
|
||||
LongElt i = 0;
|
||||
size_t doneComm = 0;
|
||||
|
||||
for (CommT j = 0; j < commSize; j++) {
|
||||
if (numTransfer[j] == 0)
|
||||
doneComm++;
|
||||
}
|
||||
|
||||
while (doneComm != commSize) {
|
||||
PosElt loc_pos = in_pos[i];
|
||||
|
||||
LongElt node = selector(loc_pos, attrs.tuple_get(i));
|
||||
LongElt minOffsetNode = offsetRangeTransfer[node];
|
||||
LongElt maxOffsetNode = minOffsetNode + numTransfer[node];
|
||||
|
||||
if (minOffsetNode <= i && maxOffsetNode > i) {
|
||||
// Particle is already located where it should be.
|
||||
i++;
|
||||
} else {
|
||||
IdxTapeElt loc_idx = part_idx[i];
|
||||
// Particle is not in place. Move it to its bin
|
||||
LongElt offset = offsetTransfer[node];
|
||||
IdxTapeElt target_idx = part_idx[offset];
|
||||
|
||||
swapper.do_swap(offset, i);
|
||||
std::swap(loc_idx, target_idx);
|
||||
// now the particle in i is different and not yet processed
|
||||
}
|
||||
|
||||
// Increase occupation number
|
||||
offsetTransfer[node]++;
|
||||
|
||||
// If we are full mark the comm as done
|
||||
if (offsetTransfer[node] == maxOffsetNode)
|
||||
doneComm++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
typedef UninitializedArray<boost::multi_array<double, 2>> U_PhaseArray;
|
||||
U_PhaseArray posRecv_p(extents[totalIncoming][3]);
|
||||
U_PhaseArray::array_type &posRecv = posRecv_p.get_array();
|
||||
// Obtain a new set of arrays holding all the attributes.
|
||||
// These are temporary arrays for a max of totalIncoming.
|
||||
auto temp_attrs = swapper.allocateTemporary(totalIncoming);
|
||||
|
||||
ctx.print(format("Exchanging particles (bufsize = %d)") % totalIncoming);
|
||||
|
||||
for (CommT i = 0; i < commSize; i++) {
|
||||
if (i == thisRank || numTransfer[i] == 0)
|
||||
continue;
|
||||
ctx.print(format(" -> %d: %d particles (offset %d)") % i % numTransfer[i] % offsetRangeTransfer[i]);
|
||||
// First send particle positions
|
||||
reqSend_pos[i] = comm->IsendT(
|
||||
&in_pos[offsetRangeTransfer[i]][0], 3 * numTransfer[i], i, 0);
|
||||
// Now send all the other attributes.
|
||||
for (size_t a_id = 0; a_id < AuxiliaryAttributes::numAttributes;
|
||||
a_id++) {
|
||||
ctx.print(format(" -> %d: sending attribute %d") % i % a_id);
|
||||
reqSend_attr[i + a_id * commSize] = comm->Isend(
|
||||
swapper.getArrayData(a_id, offsetRangeTransfer[i]),
|
||||
numTransfer[i],
|
||||
swapper.mpi_type(a_id), // use the proper MPI type here
|
||||
i, a_id + 1);
|
||||
}
|
||||
}
|
||||
for (CommT i = 0; i < commSize; i++) {
|
||||
if (i == thisRank || numReceive[i] == 0)
|
||||
continue;
|
||||
ctx.print(format(" <- %d: %d particles (offset %d)") % i % numReceive[i] % offsetIncoming[i]);
|
||||
reqRecv_pos[i] = comm->IrecvT(
|
||||
posRecv.data() + 3 * offsetIncoming[i], 3 * numReceive[i], i, 0);
|
||||
for (size_t a_id = 0; a_id < AuxiliaryAttributes::numAttributes;
|
||||
a_id++) {
|
||||
ctx.print(
|
||||
format(" <- %d: receiving attribute %d, buffer %p, offset %d") %
|
||||
i % a_id % temp_attrs.getArrayData(a_id, 0) % offsetIncoming[i]);
|
||||
reqRecv_attr[i + a_id * commSize] = comm->Irecv(
|
||||
temp_attrs.getArrayData(a_id, offsetIncoming[i]), numReceive[i],
|
||||
temp_attrs.mpi_type(a_id), // use the proper MPI type here
|
||||
i, a_id + 1);
|
||||
}
|
||||
}
|
||||
|
||||
// Now we ensure that all data in the output queue have been sent so that
|
||||
// we can reuse the buffer
|
||||
for (CommT i = 0; i < commSize; i++) {
|
||||
if (i == thisRank)
|
||||
continue;
|
||||
|
||||
if (numTransfer[i] > 0) {
|
||||
reqSend_pos[i].wait();
|
||||
for (size_t a_id = 0; a_id < AuxiliaryAttributes::numAttributes;
|
||||
a_id++)
|
||||
reqSend_attr[i + a_id * commSize].wait();
|
||||
}
|
||||
}
|
||||
ctx.print("Done sent");
|
||||
|
||||
baseOffset = numTransfer[thisRank];
|
||||
|
||||
for (CommT i = 0; i < commSize; i++) {
|
||||
if (i == thisRank)
|
||||
continue;
|
||||
|
||||
if (numReceive[i] > 0) {
|
||||
reqRecv_pos[i].wait();
|
||||
for (size_t a_id = 0; a_id < AuxiliaryAttributes::numAttributes;
|
||||
a_id++)
|
||||
reqRecv_attr[i + a_id * commSize].wait();
|
||||
|
||||
size_t shape0 = in_pos.shape()[0];
|
||||
// Put the particles in place now that the buffer is unused
|
||||
for (LongElt j = 0; j < numReceive[i]; j++) {
|
||||
|
||||
swapper.copy_in_pos(baseOffset, posRecv, offsetIncoming[i] + j);
|
||||
swapper.copy_in_all_attrs(
|
||||
baseOffset, temp_attrs, offsetIncoming[i] + j);
|
||||
|
||||
baseOffset++;
|
||||
if (baseOffset == shape0) {
|
||||
error_helper<ErrorBadState>(
|
||||
"Invalid state. Not enough particles per node. Increase "
|
||||
"particle alloc factor");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ctx.print("Done recv");
|
||||
|
||||
outParticles = baseOffset;
|
||||
|
||||
if (ULTRA_CHECK) {
|
||||
for (long i = 0; i < baseOffset; i++) {
|
||||
Console::instance().c_assert(
|
||||
selector(in_pos[i], attrs.tuple_get(i)) == comm->rank(), "Incorrect node");
|
||||
}
|
||||
}
|
||||
ctx.print(
|
||||
format("New number of particles on this node: %ld / max = %ld") %
|
||||
outParticles % in_pos.shape()[0]);
|
||||
|
||||
LibLSS::copy_array(numTransferStep, numTransfer);
|
||||
LibLSS::copy_array(numReceiveStep, numReceive);
|
||||
|
||||
LibLSS::copy_array(
|
||||
offsetReceiveStep,
|
||||
b_fused<long>(offsetIncoming, _1 + numTransfer[thisRank]));
|
||||
LibLSS::copy_array(offsetSendStep, offsetRangeTransfer);
|
||||
}
|
||||
}
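To make the ParticleSelector contract concrete, here is a hedged sketch of a minimal selector: it receives a particle position (and its attribute tuple, ignored here) and returns the rank that should own that particle, slicing the x axis uniformly across ranks. The selectors actually used by the libLSS callers may decide differently (e.g. following the FFT slab decomposition):

// Sketch only: L0 is the box size along x, nRanks the communicator size.
struct UniformSlabSelector {
  double L0;
  long nRanks;

  template <typename Position, typename AttrTuple>
  long operator()(Position const &pos, AttrTuple const &) const {
    long r = long(pos[0] / L0 * nRanks);
    if (r < 0)
      r = 0;
    if (r >= nRanks)
      r = nRanks - 1;
    return r;
  }
};

// Typical call, with the count/offset arrays sized to comm->size() and
// part_idx initialized through initIndexes:
//   particle_redistribute(
//       comm, pos, part_idx, inParts, outParts, numTransfer, numReceive,
//       offsetReceive, offsetSend, UniformSlabSelector{L0, comm->size()});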
|
||||
|
||||
template <
|
||||
typename ParticleArray, typename IndexArray, typename CountArray,
|
||||
typename OffsetArray,
|
||||
typename AuxiliaryAttributes = NoAuxiliaryAttributes>
|
||||
void particle_undistribute(
|
||||
MPI_Communication *comm, ParticleArray &pos_ag, IndexArray &part_idx,
|
||||
size_t inParticles, size_t target_usedParticles, CountArray &numTransfer,
|
||||
CountArray &numReceive, OffsetArray &offsetReceive,
|
||||
OffsetArray &offsetSend,
|
||||
AuxiliaryAttributes attrs = AuxiliaryAttributes()) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("distribute_particles_ag");
|
||||
|
||||
using boost::extents;
|
||||
using boost::format;
|
||||
typedef boost::multi_array<long, 1> LongArray;
|
||||
typedef LongArray::element LongT;
|
||||
typedef LongArray::index_range range;
|
||||
using boost::lambda::_1;
|
||||
|
||||
long thisRank = comm->rank();
|
||||
long commSize = comm->size();
|
||||
|
||||
if (commSize == 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
typedef ParticleSwapper<ParticleArray &, AuxiliaryAttributes> Swapper;
|
||||
Swapper swapper(pos_ag, attrs);
|
||||
|
||||
typename IndexArray::index_gen indices;
|
||||
|
||||
// Step == 0 is very special. It has two boundaries: the input to the integrator and the output of IC generation.
// Distribution occurs just after IC generation, so technically this is really the
// input at istep==0, but when istep==0 we want the redistributed particles as they
// were for IC generation.
|
||||
// Reserve some space for request ids
|
||||
RequestArray reqRecv_pos(extents[commSize]), reqSend_pos(extents[commSize]);
|
||||
RequestArray reqSend_attr(
|
||||
extents[AuxiliaryAttributes::numAttributes * commSize]),
|
||||
reqRecv_attr(extents[AuxiliaryAttributes::numAttributes * commSize]);
|
||||
|
||||
// Schedule exchanges with other nodes for AG data
|
||||
for (long i = 0; i < commSize; i++) {
|
||||
if (i == thisRank)
|
||||
continue;
|
||||
|
||||
LongT offs = offsetReceive[i];
|
||||
|
||||
// What was received in the forward pass becomes what is sent in the adjoint (AG) pass
ctx.print(
|
||||
format(
|
||||
"Schedule send with ofs=%d to comm=%d (pos_ag_len=%d len=%d)") %
|
||||
offs % i % pos_ag.shape()[0] % numReceive[i]);
|
||||
if (numReceive[i] > 0) {
|
||||
reqSend_pos[i] =
|
||||
comm->IsendT(&pos_ag[offs][0], 3 * numReceive[i], i, 0);
|
||||
for (size_t a_id = 0; a_id < AuxiliaryAttributes::numAttributes;
|
||||
a_id++) {
|
||||
reqSend_attr[i + a_id * commSize] = comm->Isend(
|
||||
attrs.getArrayData(a_id, offs), numReceive[i],
|
||||
attrs.mpi_type(a_id), // use the proper MPI type here
|
||||
i, a_id + 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ctx.print(
|
||||
format("send scheduled, current_parts = %d, balance = %d") %
|
||||
inParticles % (long)(target_usedParticles - inParticles));
|
||||
|
||||
{
|
||||
typedef UninitializedArray<boost::multi_array<double, 2>> U_PhaseArray;
|
||||
LongT refOffset = numTransfer[thisRank];
|
||||
// Number of particles to be exchanged. We do not allocate too much here.
|
||||
LongT exchange_usedParticles = target_usedParticles - refOffset;
|
||||
|
||||
ctx.print(
|
||||
format("Number of parts to reexchange = %d (refOffset = %d)") %
|
||||
exchange_usedParticles % refOffset);
|
||||
|
||||
// Create temporary buffer for receiving incoming data
|
||||
U_PhaseArray pos_ag_recv_p(extents[exchange_usedParticles][3]);
|
||||
U_PhaseArray::array_type &pos_ag_recv = pos_ag_recv_p.get_array();
|
||||
auto temp_attrs = swapper.allocateTemporary(exchange_usedParticles);
|
||||
|
||||
for (long i = 0; i < commSize; i++) {
|
||||
if (i == thisRank)
|
||||
continue;
|
||||
|
||||
long offs = offsetSend[i] - refOffset;
|
||||
|
||||
ctx.print(
|
||||
format("Schedule recv with ofs=%d from comm=%d (len=%d)") % offs %
|
||||
i % numTransfer[i]);
|
||||
if (numTransfer[i] > 0) {
|
||||
reqRecv_pos[i] =
|
||||
comm->IrecvT(&pos_ag_recv[offs][0], 3 * numTransfer[i], i, 0);
|
||||
for (size_t a_id = 0; a_id < AuxiliaryAttributes::numAttributes;
|
||||
a_id++) {
|
||||
reqRecv_attr[i + a_id * commSize] = comm->Irecv(
|
||||
temp_attrs.getArrayData(a_id, offs), numTransfer[i],
|
||||
temp_attrs.mpi_type(a_id), i, a_id + 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
ctx.print("Scheduled");
|
||||
|
||||
// Now wait for all sends to settle
for (long i = 0; i < commSize; i++) {
|
||||
if (i == thisRank)
|
||||
continue;
|
||||
|
||||
if (numReceive[i] > 0) {
|
||||
reqSend_pos[i].wait();
|
||||
for (size_t a_id = 0; a_id < AuxiliaryAttributes::numAttributes;
|
||||
a_id++)
|
||||
reqSend_attr[i + a_id * commSize].wait();
|
||||
}
|
||||
}
|
||||
|
||||
ctx.print("IO done. Reshuffle");
|
||||
|
||||
// All data sent, it is safe to modify now
|
||||
|
||||
// Now handle recvs
|
||||
for (long i = 0; i < commSize; i++) {
|
||||
if (i == thisRank)
|
||||
continue;
|
||||
|
||||
if (numTransfer[i] == 0)
|
||||
continue;
|
||||
|
||||
reqRecv_pos[i].wait();
|
||||
for (size_t a_id = 0; a_id < AuxiliaryAttributes::numAttributes; a_id++)
|
||||
reqRecv_attr[i + a_id * commSize].wait();
|
||||
|
||||
long offs = offsetSend[i] - refOffset;
|
||||
|
||||
ctx.print(
|
||||
format("(rank=%d) Copying phase info to ofs=%d") % i %
|
||||
offsetSend[i]);
|
||||
|
||||
range range_output =
|
||||
offsetSend[i] <= range() < (offsetSend[i] + numTransfer[i]);
|
||||
range range_input = offs <= range() < (offs + numTransfer[i]);
|
||||
auto indices_input = indices[range_input][range()];
|
||||
auto indices_output = indices[range_output][range()];
|
||||
|
||||
// Reposition incoming data in the original buffer
|
||||
copy_array_rv(
|
||||
// extract the slice for the given node
|
||||
pos_ag[indices_output],
|
||||
// original data in the recv buffer
|
||||
pos_ag_recv[indices_input]);
|
||||
|
||||
#pragma omp parallel for
|
||||
for (size_t k = 0; k < numTransfer[i]; k++) {
|
||||
// This should be extremely simple when optimized by the compiler:
// basically a set of assignments from the arrays in temp_attrs to
// the target arrays in swapper. No loop, nothing else.
swapper.copy_in_all_attrs(k + offsetSend[i], temp_attrs, k + offs);
|
||||
}
|
||||
}
|
||||
|
||||
ctx.print(
|
||||
boost::format("reorder particles (target=%ld)") %
|
||||
target_usedParticles);
|
||||
|
||||
array::reorder(
|
||||
part_idx[indices[range(0, target_usedParticles)]],
|
||||
std::bind(
|
||||
&Swapper::do_swap, &swapper, std::placeholders::_1,
|
||||
std::placeholders::_2));
|
||||
}
|
||||
}
|
||||
|
||||
template <
|
||||
typename ParticleArray, typename ParticleSelector,
|
||||
typename AuxiliaryAttributes = NoAuxiliaryAttributes>
|
||||
void particle_redistribute(
|
||||
BalanceInfo &info, ParticleArray &in_pos, ParticleSelector selector,
|
||||
AuxiliaryAttributes aux = AuxiliaryAttributes()) {
|
||||
particle_redistribute(
|
||||
info.comm, in_pos, info.u_idx->get_array(),
|
||||
info.localNumParticlesBefore, info.localNumParticlesAfter,
|
||||
info.numTransfer, info.numReceive, info.offsetReceive, info.offsetSend,
|
||||
selector, aux);
|
||||
}
|
||||
|
||||
template <
|
||||
typename ParticleArray,
|
||||
typename AuxiliaryAttributes = NoAuxiliaryAttributes>
|
||||
void particle_undistribute(
|
||||
BalanceInfo &info, ParticleArray &pos_ag,
|
||||
AuxiliaryAttributes aux = AuxiliaryAttributes()) {
|
||||
particle_undistribute(
|
||||
info.comm, pos_ag, info.u_idx->get_array(), info.localNumParticlesAfter,
|
||||
info.localNumParticlesBefore, info.numTransfer, info.numReceive,
|
||||
info.offsetReceive, info.offsetSend, aux);
|
||||
}
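The two BalanceInfo-based overloads above are meant to be used as a pair around a forward/adjoint evaluation; a short sketch of that round trip (the function and variable names are illustrative, not library API):

#include <boost/multi_array.hpp>
#include "libLSS/physics/forwards/particle_balancer/particle_distribute.hpp"

// Sketch: move particles to their owning ranks before the forward evaluation,
// then route the adjoint vectors back along the recorded transfer pattern.
// info must have been allocate()d and info.localNumParticlesBefore set beforehand.
template <typename Selector>
void example_round_trip(
    LibLSS::BalanceInfo &info, boost::multi_array<double, 2> &pos,
    boost::multi_array<double, 2> &pos_ag, Selector selector) {
  LibLSS::particle_redistribute(info, pos, selector);
  // ... evaluate the forward model on the locally owned particles ...
  // ... fill pos_ag with the adjoint gradient with respect to the positions ...
  LibLSS::particle_undistribute(info, pos_ag);
}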
|
||||
|
||||
} // namespace LibLSS
|
||||
|
||||
#endif
|
||||
// ARES TAG: authors_num = 1
|
||||
// ARES TAG: name(0) = Guilhem Lavaux
|
||||
// ARES TAG: email(0) = guilhem.lavaux@iap.fr
|
||||
// ARES TAG: year(0) = 2017-2020
|
219
extra/borg/libLSS/physics/forwards/patch_model.cpp
Normal file
@@ -0,0 +1,219 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/patch_model.cpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#include <memory>
|
||||
#include "libLSS/physics/forward_model.hpp"
|
||||
#include "libLSS/physics/chain_forward_model.hpp"
|
||||
#include "libLSS/physics/forwards/patch_model.hpp"
|
||||
#include "libLSS/physics/forwards/transfer.hpp"
|
||||
#include "libLSS/physics/forwards/borg_lpt.hpp"
|
||||
#include "libLSS/physics/forwards/downgrade.hpp"
|
||||
#include "libLSS/physics/hades_pt.hpp"
|
||||
#include "libLSS/physics/hades_log.hpp"
|
||||
#include "libLSS/tools/overload.hpp"
|
||||
#include "libLSS/physics/forwards/transfer_ehu.hpp"
|
||||
#include "libLSS/physics/forwards/primordial.hpp"
|
||||
#include "libLSS/tools/ptree_proxy_map.hpp"
|
||||
|
||||
#include "libLSS/physics/forwards/adapt_generic_bias.hpp"
|
||||
|
||||
using namespace LibLSS;
|
||||
|
||||
namespace {
|
||||
class Scaler : public BORGForwardModel {
|
||||
public:
|
||||
ModelInput<3> hold_input;
|
||||
ModelInputAdjoint<3> hold_ag_input;
|
||||
std::string pName;
|
||||
|
||||
Scaler(
|
||||
MPI_Communication *comm, BoxModel const &box, std::string const &pName_)
|
||||
: BORGForwardModel(comm, box), pName(pName_) {
|
||||
setModelParams({{pName, 0.05}});
|
||||
}
|
||||
|
||||
PreferredIO getPreferredInput() const override { return PREFERRED_REAL; }
|
||||
PreferredIO getPreferredOutput() const override { return PREFERRED_REAL; }
|
||||
|
||||
void forwardModel_v2(ModelInput<3> input) override {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
input.setRequestedIO(PREFERRED_REAL);
|
||||
hold_input = std::move(input);
|
||||
}
|
||||
|
||||
void getDensityFinal(ModelOutput<3> output) override {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
output.setRequestedIO(PREFERRED_REAL);
|
||||
if (params.find(pName) == params.end()) {
|
||||
error_helper<ErrorParams>("Missing " + pName + " parameter.");
|
||||
}
|
||||
double const aux = boost::any_cast<double>(params[pName]);
|
||||
ctx.format("Using scaling %s=%g", pName, aux);
|
||||
|
||||
fwrap(output.getRealOutput()) = aux * fwrap(hold_input.getRealConst());
|
||||
}
|
||||
|
||||
void adjointModel_v2(ModelInputAdjoint<3> input) override {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
input.setRequestedIO(PREFERRED_REAL);
|
||||
hold_ag_input = std::move(input);
|
||||
}
|
||||
|
||||
void getAdjointModelOutput(ModelOutputAdjoint<3> output) override {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
output.setRequestedIO(PREFERRED_REAL);
|
||||
if (params.find(pName) == params.end()) {
|
||||
error_helper<ErrorParams>("Missing " + pName + " parameter.");
|
||||
}
|
||||
double const aux = boost::any_cast<double>(params[pName]);
|
||||
fwrap(output.getRealOutput()) = aux * fwrap(hold_ag_input.getRealConst());
|
||||
}
|
||||
};
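The scaling factor is read back from the model parameter dictionary at evaluation time; a hedged sketch of how the parameter named at construction would be supplied by a caller inside this translation unit (it mirrors the setModelParams call in the constructor; the value 0.1 is illustrative):

// Sketch: a Scaler built with pName == "aux" looks up params["aux"] in
// getDensityFinal, so this overrides the 0.05 default set in the constructor.
static void example_scaler(MPI_Communication *comm, BoxModel const &box) {
  auto scaler = std::make_shared<Scaler>(comm, box, "aux");
  scaler->setModelParams({{"aux", 0.1}});
}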
|
||||
|
||||
class FrozenCache : public BORGForwardModel {
|
||||
public:
|
||||
ModelOutput<3> hold_output;
|
||||
std::shared_ptr<BORGForwardModel> alt_model;
|
||||
bool invalid;
|
||||
|
||||
FrozenCache(
|
||||
MPI_Communication *comm, std::shared_ptr<BORGForwardModel> alt_model_)
|
||||
: BORGForwardModel(
|
||||
comm, alt_model_->get_box_model(),
|
||||
alt_model_->get_box_model_output()),
|
||||
alt_model(alt_model_) {
|
||||
invalid = true;
|
||||
}
|
||||
|
||||
PreferredIO getPreferredInput() const override {
|
||||
return hold_output ? PREFERRED_NONE : alt_model->getPreferredInput();
|
||||
}
|
||||
PreferredIO getPreferredOutput() const override {
|
||||
return hold_output ? hold_output.current
|
||||
: alt_model->getPreferredOutput();
|
||||
}
|
||||
|
||||
bool densityInvalidated() const override { return invalid; }
|
||||
|
||||
void forwardModel_v2(ModelInput<3> input) override {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
if (!hold_output)
|
||||
alt_model->forwardModel_v2(std::move(input));
|
||||
}
|
||||
|
||||
void getDensityFinal(ModelOutput<3> output) override {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
invalid = false;
|
||||
if (!hold_output) {
|
||||
ctx.print("Cache invalid. Recompute");
|
||||
hold_output = output.makeTempLike();
|
||||
alt_model->getDensityFinal(hold_output.shallowClone());
|
||||
} else {
|
||||
ctx.print("Cache valid. Reuse.");
|
||||
output.setRequestedIO(hold_output.current);
|
||||
ctx.format(
|
||||
"output.active = %d, hold.active = %d", output.active,
|
||||
hold_output.current);
|
||||
}
|
||||
output.copyFrom(hold_output);
|
||||
}
|
||||
|
||||
void adjointModel_v2(ModelInputAdjoint<3> input) override {}
|
||||
|
||||
void getAdjointModelOutput(ModelOutputAdjoint<3> output) override {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
ctx.print("Clear adjoint output");
|
||||
boost::apply_visitor(
|
||||
overload([](auto const *v) {}, [](auto *v) { fwrap(*v) = 0.0; }),
|
||||
output.getHolder());
|
||||
}
|
||||
|
||||
void setModelParams(ModelDictionnary const ¶ms) override {
|
||||
alt_model->setModelParams(params);
|
||||
}
|
||||
|
||||
boost::any
|
||||
getModelParam(std::string const &n, std::string const ¶m) override {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
return alt_model->getModelParam(n, param);
|
||||
}
|
||||
|
||||
void updateCosmo() override {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
alt_model->setCosmoParams(cosmo_params);
|
||||
if (alt_model->densityInvalidated()) {
|
||||
ctx.print("Submodel has invalidated its density field.");
|
||||
invalid = true;
|
||||
hold_output = ModelOutput<3>();
|
||||
}
|
||||
}
|
||||
};
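As the trailing comment on the chain1_1->addModel(chain1) line in new_patch_model() below hints, the intended use of this cache is to wrap a sub-chain whose output only changes when the cosmology does:

// Drop-in alternative for the chain1_1->addModel(chain1) call below: the wrapped
// chain is evaluated once and then replayed from hold_output until updateCosmo()
// invalidates the cached density.
chain1_1->addModel(std::make_shared<FrozenCache>(comm, chain1));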
|
||||
} // namespace
|
||||
|
||||
static std::shared_ptr<BORGForwardModel> new_patch_model(
|
||||
MPI_Communication *comm, BoxModel const &box, PropertyProxy const ¶ms) {
|
||||
/* auto transfer_filename = params.get_optional<std::string>("transfer");
|
||||
auto transfer_cic = params.get_optional<bool>("use_invert_cic");
|
||||
auto transfer_sharp = params.get_optional<bool>("use_sharpk");*/
|
||||
auto ai = params.get<double>("ai");
|
||||
auto af = params.get<double>("af");
|
||||
auto k_transition = params.get<double>("k_transition");
|
||||
auto part_factor = params.get<double>("part_factor", 1.2);
|
||||
|
||||
BoxModel box2 = box;
|
||||
box2.N0 *= 2;
|
||||
box2.N1 *= 2;
|
||||
box2.N2 *= 2;
|
||||
|
||||
auto transfer1 = std::make_shared<ForwardTransfer>(comm, box);
|
||||
auto transfer2 = std::make_shared<ForwardTransfer>(comm, box);
|
||||
auto inverse_cic = std::make_shared<ForwardTransfer>(comm, box2);
|
||||
|
||||
auto lpt = std::make_shared<BorgLptModel<>>(
|
||||
comm, box, box, false, 2, part_factor, ai, af, false);
|
||||
auto sum = std::make_shared<SumForwardModel>(comm, box);
|
||||
|
||||
transfer1->setupSharpKcut(k_transition, false);
|
||||
transfer2->setupSharpKcut(k_transition, true);
|
||||
inverse_cic->setupInverseCIC(0.95);
|
||||
|
||||
auto chain1 = std::make_shared<ChainForwardModel>(comm, box);
|
||||
auto chain1_1 = std::make_shared<ChainForwardModel>(comm, box);
|
||||
chain1->addModel(std::make_shared<ForwardPrimordial>(comm, box, ai));
|
||||
chain1->addModel(std::make_shared<ForwardEisensteinHu>(comm, box));
|
||||
chain1->addModel(transfer1);
|
||||
chain1->addModel(lpt);
|
||||
//chain1->addModel(inverse_cic);
|
||||
//chain1->addModel(std::make_shared<ForwardDowngrade>(comm, box2));
|
||||
//chain1->addModel(transfer3);
|
||||
//auto biasModel = ForwardRegistry::instance().get("bias::LinearBias")(
|
||||
// comm, box, PropertyFromMap());
|
||||
// auto biasModel = ForwardRegistry::instance().get("bias::PowerLaw")(
|
||||
// comm, box, PropertyFromMap());
|
||||
auto biasModel = ForwardRegistry::instance().get("bias::ManyPower_1^2")(
|
||||
comm, box, PropertyFromMap());
|
||||
biasModel->setName("bias");
|
||||
chain1_1->addModel(chain1); //std::make_shared<FrozenCache>(comm, chain1));
|
||||
chain1_1->addModel(biasModel);
|
||||
|
||||
auto chain2 = std::make_shared<ChainForwardModel>(comm, box);
|
||||
chain2->addModel(transfer2);
|
||||
chain2->addModel(std::make_shared<Scaler>(comm, box, "aux"));
|
||||
chain2->addModel(std::make_shared<HadesLog>(comm, box, ai, false));
|
||||
chain2->addModel(std::make_shared<Scaler>(comm, box, "aux2"));
|
||||
//chain2->addModel(std::make_shared<HadesLinear>(comm, box, box, ai, af));
|
||||
|
||||
sum->addModel(chain1_1);
|
||||
sum->addModel(chain2);
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
LIBLSS_REGISTER_FORWARD_IMPL(PATCH_MODEL, new_patch_model);
|
||||
|
12
extra/borg/libLSS/physics/forwards/patch_model.hpp
Normal file
@@ -0,0 +1,12 @@
#pragma once
|
||||
#ifndef __LIBLSS_PHYSICS_FORWARDS_PATCH_MODEL_HPP
|
||||
# define __LIBLSS_PHYSICS_FORWARDS_PATCH_MODEL_HPP
|
||||
|
||||
# include "libLSS/physics/sum.hpp"
|
||||
# include "libLSS/physics/forward_model.hpp"
|
||||
|
||||
# include "libLSS/physics/forwards/registry.hpp"
|
||||
|
||||
LIBLSS_REGISTER_FORWARD_DECL(PATCH_MODEL);
|
||||
|
||||
#endif
|
13
extra/borg/libLSS/physics/forwards/pm/part_decision.hpp
Normal file
@@ -0,0 +1,13 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/pm/part_decision.hpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_PARTICLE_DISTRIBUTE_DECISION_HPP
|
||||
#define __LIBLSS_PARTICLE_DISTRIBUTE_DECISION_HPP
|
||||
|
||||
#endif
|
142
extra/borg/libLSS/physics/forwards/pm/plane_xchg.hpp
Normal file
@@ -0,0 +1,142 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/pm/plane_xchg.hpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_TOOLS_PLANE_XCHG_HPP
|
||||
#define __LIBLSS_TOOLS_PLANE_XCHG_HPP
|
||||
|
||||
#include <boost/lambda/lambda.hpp>
|
||||
#include <boost/multi_array.hpp>
|
||||
#include "libLSS/mpi/generic_mpi.hpp"
|
||||
#include "libLSS/tools/console.hpp"
|
||||
#include "libLSS/tools/mpi_fftw_helper.hpp"
|
||||
#include "libLSS/tools/uninitialized_type.hpp"
|
||||
#include "libLSS/tools/array_tools.hpp"
|
||||
#include "libLSS/tools/push_operators.hpp"
|
||||
#include <memory>
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
template <bool accum, typename PlaneArray, typename Mgr_p>
|
||||
void density_exchange_planes(
|
||||
MPI_Communication *comm, PlaneArray &&density, Mgr_p &d_mgr,
|
||||
int extra_planes) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("exchanging nearby planes after projection");
|
||||
|
||||
using boost::extents;
|
||||
using boost::format;
|
||||
using boost::lambda::_1;
|
||||
|
||||
typedef Uninit_FFTW_Real_Array U_Array;
|
||||
MPI_Communication::Request req_send, req_recv;
|
||||
|
||||
long d_N2real = d_mgr->N2real;
|
||||
long d_N1 = d_mgr->N1;
|
||||
long d_N2 = d_mgr->N2;
|
||||
long d_N0 = d_mgr->N0;
|
||||
long d_startN0 = d_mgr->startN0;
|
||||
long d_localN0 = d_mgr->localN0;
|
||||
U_Array tmp_plane_r_p(extents[extra_planes][d_N1][d_N2real]);
|
||||
|
||||
if (d_localN0 == 0)
|
||||
return;
|
||||
|
||||
U_Array::array_type &tmp_plane_r = tmp_plane_r_p.get_array();
|
||||
long high_plane_id = (d_startN0 + d_localN0) % d_N0;
|
||||
long low_plane_id = d_startN0;
|
||||
long low_plane_peer = d_mgr->get_peer((d_startN0 + d_N0 - 1) % d_N0);
|
||||
typedef
|
||||
typename std::remove_reference<PlaneArray>::type::reference plane_ref_t;
|
||||
plane_ref_t plane = density[d_startN0 + d_localN0];
|
||||
plane_ref_t low_plane = density[d_startN0];
|
||||
|
||||
ctx.print(
|
||||
format("high_id=%d -> peer=%d") % high_plane_id %
|
||||
d_mgr->get_peer(high_plane_id));
|
||||
ctx.print(format("low_id=%d -> peer=%d") % low_plane_id % low_plane_peer);
|
||||
|
||||
// Missing some logic here if the planes are scattered on different nodes (happens for extra_planes > 1)
|
||||
req_send = comm->IsendT(
|
||||
&plane[0][0], plane.num_elements(), d_mgr->get_peer(high_plane_id),
|
||||
high_plane_id);
|
||||
req_recv = comm->IrecvT(
|
||||
tmp_plane_r.data(), tmp_plane_r.num_elements(), low_plane_peer,
|
||||
low_plane_id);
|
||||
|
||||
req_recv.wait();
|
||||
|
||||
// This should use the fused infrastructure, but it does not support
// sub-arrays yet.
for (long i = 0; i < d_N1; i++)
|
||||
for (long j = 0; j < d_N2; j++)
|
||||
push_to<accum>::apply(low_plane[i][j], tmp_plane_r[0][i][j]);
|
||||
|
||||
req_send.wait();
|
||||
}
|
||||
|
||||
template <
|
||||
typename OutPlaneArray, typename InPlaneArray, typename Mgr_p>
|
||||
void density_exchange_planes_ag(
|
||||
MPI_Communication *comm, OutPlaneArray &loc_density,
|
||||
InPlaneArray &global_density, Mgr_p &d_mgr, unsigned int extra_planes) {
|
||||
using boost::format;
|
||||
using boost::lambda::_1;
|
||||
typedef Uninit_FFTW_Real_Array U_Array;
|
||||
|
||||
ConsoleContext<LOG_DEBUG> ctx(
|
||||
"exchanging nearby planes before taking adjoint gradient");
|
||||
MPI_Communication::Request req_send, req_recv;
|
||||
|
||||
long d_N2real = d_mgr->N2real;
|
||||
long d_N1 = d_mgr->N1;
|
||||
long d_N2 = d_mgr->N2;
|
||||
long d_N0 = d_mgr->N0;
|
||||
long d_startN0 = d_mgr->startN0;
|
||||
long d_localN0 = d_mgr->localN0;
|
||||
long high_plane_id = (d_startN0 + d_localN0);
|
||||
long high_plane_peer = d_mgr->get_peer(high_plane_id % d_N0);
|
||||
long low_plane_id = d_startN0;
|
||||
long low_plane_peer = d_mgr->get_peer((d_startN0 + d_N0 - 1) % d_N0);
|
||||
typedef typename OutPlaneArray::index_range o_range;
|
||||
typename OutPlaneArray::index_gen o_indices;
|
||||
typedef typename InPlaneArray::index_range i_range;
|
||||
typename InPlaneArray::index_gen i_indices;
|
||||
|
||||
if (d_localN0 == 0)
|
||||
return;
|
||||
|
||||
// Missing some logic here if the planes are scattered on different nodes (happens for extra_planes > 1)
|
||||
auto loc_view =
|
||||
loc_density[o_indices[o_range(d_startN0, d_startN0 + d_localN0)]
|
||||
[o_range()][o_range(0, d_N2)]];
|
||||
auto glob_view =
|
||||
global_density[i_indices[i_range(d_startN0, d_startN0 + d_localN0)]
|
||||
[i_range()][i_range(0, d_N2)]];
|
||||
|
||||
copy_array_rv(loc_view, glob_view, false);
|
||||
|
||||
ctx.print(
|
||||
format("Receiving plane = %d from %d") % high_plane_id %
|
||||
high_plane_peer);
|
||||
req_recv = comm->IrecvT(
|
||||
&loc_density[high_plane_id][0][0],
|
||||
loc_density[high_plane_id].num_elements(), high_plane_peer,
|
||||
high_plane_id % d_N0);
|
||||
ctx.print(
|
||||
format("Sending plane = %d to %d") % low_plane_id % low_plane_peer);
|
||||
req_send = comm->IsendT(
|
||||
&loc_density[low_plane_id][0][0],
|
||||
loc_density[low_plane_id].num_elements(), low_plane_peer, low_plane_id);
|
||||
|
||||
req_recv.wait();
|
||||
req_send.wait();
|
||||
}
|
||||
|
||||
} // namespace LibLSS
|
||||
|
||||
#endif
|
191
extra/borg/libLSS/physics/forwards/qlpt/borg_fwd_qlpt.cpp
Normal file
@@ -0,0 +1,191 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/qlpt/borg_fwd_qlpt.cpp
|
||||
Copyright (C) 2020 Guilhem Lavaux <n.porqueres@imperial.ac.uk>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
|
||||
#include "../pm/plane_xchg.hpp"
|
||||
|
||||
void BorgQLptModel::qlpt_ic(CArrayRef &deltao, PhaseArrayRef &lctim) {
|
||||
///set cosmological parameters
|
||||
///Initial density is scaled to initial redshift!!!
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
Cosmology cosmo(cosmo_params);
|
||||
|
||||
double an = af; ///set position ics at r_{0}, calculate till present epoch
|
||||
double anh =
|
||||
af; ///velocities are created at v_{0-1/2}, calculate till present epoch
|
||||
double Hubble = cosmo.Hubble(anh) / cosmo_params.h; ///km /sec /(Mpc/h)
|
||||
|
||||
double volNorm = 1. / (L0 * L1 * L2);
|
||||
|
||||
typedef UninitializedArray<
|
||||
FFTW_Complex_Array, FFTW_Allocator<std::complex<double>>>
|
||||
U_CArray;
|
||||
typedef U_CArray::array_type Ref_CArray;
|
||||
|
||||
auto &phi0 = potential->get_array();
|
||||
|
||||
U_CArray tmp_p(mgr->extents_complex(), mgr->allocator_complex);
|
||||
Ref_CArray &tmp = tmp_p.get_array();
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (int i = startN0; i < startN0 + localN0; i++)
|
||||
for (int j = 0; j < N1; j++)
|
||||
for (int k = 0; k < N2_HC; k++) {
|
||||
double kk[3] = {kmode(i, N0, L0), kmode(j, N1, L1), kmode(k, N2, L2)};
|
||||
|
||||
double ksquared = kk[0] * kk[0] + kk[1] * kk[1] + kk[2] * kk[2];
|
||||
double fac = -1. / ksquared;
|
||||
std::complex<double> &in_delta = deltao[i][j][k];
|
||||
tmp[i][j][k] = fac * in_delta * volNorm;
|
||||
}
|
||||
|
||||
if (startN0 == 0 && localN0 > 0) {
|
||||
tmp[0][0][0] = 0;
|
||||
tmp[0][0][N2_HC - 1] = 0;
|
||||
tmp[0][N1 / 2][0] = 0;
|
||||
tmp[0][N1 / 2][N2_HC - 1] = 0;
|
||||
}
|
||||
|
||||
if (startN0 <= N0 / 2 && startN0 + localN0 > N0 / 2) {
|
||||
tmp[N0 / 2][0][0] = 0;
|
||||
tmp[N0 / 2][0][N2_HC - 1] = 0;
|
||||
tmp[N0 / 2][N1 / 2][0] = 0;
|
||||
tmp[N0 / 2][N1 / 2][N2_HC - 1] = 0;
|
||||
}
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (int i = startN0; i < startN0 + localN0; i++)
|
||||
for (int j = 0; j < N1; j++)
|
||||
for (int k = 0; k < N2_HC; k++) {
|
||||
(tmp_complex_field->get_array())[i][j][k] = tmp[i][j][k];
|
||||
}
|
||||
|
||||
lo_mgr->execute_c2r(
|
||||
synthesis_plan, tmp_complex_field->get_array().data(), phi0.data());
|
||||
}
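For reference, the loops above solve for the initial potential in Fourier space and transform it back to real space; with \hat\delta the input density modes and V = L_0 L_1 L_2 the box volume, the operation is (a compact restatement of the code, not an extra step):

\[
  \hat\Phi_0(\mathbf{k}) = -\frac{\hat\delta(\mathbf{k})}{k^2\,V},
  \qquad
  \Phi_0(\mathbf{x}) = \mathcal{F}^{-1}\big[\hat\Phi_0\big](\mathbf{x}),
\]

with the DC and Nyquist modes explicitly zeroed before the c2r transform.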
|
||||
|
||||
void BorgQLptModel::qlpt_density_obs(ArrayRef &deltao, size_t numParts) {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
|
||||
bool do_velocities = false;
|
||||
|
||||
Cosmology cosmo(cosmo_params);
|
||||
double an = af; ///set position ics at r_{0}, calculate till present epoch
|
||||
double anh =
|
||||
af; ///velocities are created at v_{0-1/2}, calculate till present epoch
|
||||
|
||||
auto &phi0 = potential->get_array();
|
||||
|
||||
//std::string fname = "phi.h5";
|
||||
//H5::H5File f(fname, H5F_ACC_TRUNC);
|
||||
//CosmoTool::hdf5_write_array(f, "phi", phi0);
|
||||
|
||||
auto array_in_t = lo_mgr->allocate_c2c_array();
|
||||
auto &array_in = array_in_t.get_array();
|
||||
auto array_out_t = lo_mgr->allocate_c2c_array();
|
||||
auto &array_out = array_out_t.get_array();
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (int i = startN0; i < startN0 + localN0; i++)
|
||||
for (int j = 0; j < N1; j++)
|
||||
for (int k = 0; k < N2; k++) {
|
||||
double p = phi0[i][j][k];
|
||||
std::complex<double> exponent(0, -phi0[i][j][k] / hbar);
|
||||
array_in[i][j][k] = exp(exponent);
|
||||
}
|
||||
|
||||
DFT_Manager::Calls::plan_type plan =
|
||||
lo_mgr->create_c2c_plan(array_in.data(), array_out.data(), -1);
|
||||
lo_mgr->execute_c2c(plan, array_in.data(), array_out.data());
|
||||
lo_mgr->destroy_plan(plan);
|
||||
|
||||
std::complex<double> vol(1. / (N0 * N1 * N2), 0);
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (int i = startN0; i < startN0 + localN0; i++)
|
||||
for (int j = 0; j < N1; j++)
|
||||
for (int k = 0; k < N2; k++) {
|
||||
double kk[3] = {kmode(i, N0, L0), kmode(j, N1, L1), kmode(k, N2, L2)};
|
||||
double ksquared = kk[0] * kk[0] + kk[1] * kk[1] + kk[2] * kk[2];
|
||||
std::complex<double> exponent(0, -0.5 * hbar * ksquared * D1);
|
||||
array_in[i][j][k] = exp(exponent) * array_out[i][j][k] * vol;
|
||||
}
|
||||
|
||||
plan = lo_mgr->create_c2c_plan(array_in.data(), array_out.data(), 1);
|
||||
lo_mgr->execute_c2c(plan, array_in.data(), array_out.data());
|
||||
lo_mgr->destroy_plan(plan);
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (int i = startN0; i < startN0 + localN0; i++)
|
||||
for (int j = 0; j < N1; j++)
|
||||
for (int k = 0; k < N2; k++) {
|
||||
std::complex<double> psi = array_out[i][j][k];
|
||||
deltao[i][j][k] = std::real(psi * std::conj(psi)) - 1.;
|
||||
}
|
||||
|
||||
if (do_velocities) {
|
||||
std::string fname = "psi.h5";
|
||||
H5::H5File f(fname, H5F_ACC_TRUNC);
|
||||
CosmoTool::hdf5_write_array(f, "psi", array_out);
|
||||
CosmoTool::hdf5_write_array(f, "density", deltao);
|
||||
}
|
||||
|
||||
if (DUMP_BORG_DENSITY) {
|
||||
std::string fname = str(format("borg_density_%d.h5") % comm->rank());
|
||||
H5::H5File f(fname, H5F_ACC_TRUNC);
|
||||
CosmoTool::hdf5_write_array(f, "density", deltao);
|
||||
}
|
||||
}
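The element-wise loops and c2c transforms above propagate a Madelung-type wave function with the free kernel and read the density off its modulus; compactly, with D_1 presumably the linear growth factor and \hbar the effective Planck constant of the model (again a restatement of the code above, the 1/(N_0 N_1 N_2) factor being the FFT normalization):

\[
  \psi_0 = e^{-i\Phi_0/\hbar},
  \qquad
  \psi = \mathcal{F}^{-1}\!\left[ e^{-\frac{i}{2}\hbar k^2 D_1}\, \mathcal{F}[\psi_0] \right],
  \qquad
  \delta = |\psi|^2 - 1 .
\]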
|
||||
|
||||
void BorgQLptModel::qlpt_fwd_model(CArrayRef &deltao, PhaseArrayRef &lctim) {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
|
||||
if (false) {
|
||||
static int step = 0;
|
||||
std::string fname = str(format("fwd_ic_%d_%d.h5") % step % comm->rank());
|
||||
H5::H5File f(fname, H5F_ACC_TRUNC);
|
||||
CosmoTool::hdf5_write_array(f, "deltao", deltao);
|
||||
step++;
|
||||
}
|
||||
|
||||
qlpt_ic(deltao, lctim);
|
||||
}
|
||||
|
||||
void BorgQLptModel::forwardModelRsdField(ArrayRef &deltaf, double *vobs_ext) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("BORG forward model rsd density calculation");
|
||||
}
|
||||
|
||||
void BorgQLptModel::test_qlpt_velocities(MarkovState &state) {}
|
||||
|
||||
void BorgQLptModel::forwardModel_v2(ModelInput<3> delta_init) {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
|
||||
delta_init.setRequestedIO(PREFERRED_FOURIER);
|
||||
|
||||
delta_init.needDestroyInput();
|
||||
qlpt_fwd_model(delta_init.getFourier(), lc_timing->get_array());
|
||||
}
|
||||
|
||||
void BorgQLptModel::getDensityFinal(ModelOutput<3> delta_output) {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
delta_output.setRequestedIO(PREFERRED_REAL);
|
||||
|
||||
qlpt_density_obs(
|
||||
delta_output.getRealOutput(), realInfo.localNumParticlesAfter);
|
||||
|
||||
/* if (!forwardModelHold && !adjointRequired) {
|
||||
releaseParticles();
|
||||
}*/
|
||||
forwardModelHold = false;
|
||||
}
|
||||
// ARES TAG: num_authors = 1
|
||||
// ARES TAG: author(0) = Natalia Porqueres
|
||||
// ARES TAG: email(0) = n.porqueres@imperial.ac.uk
|
||||
// ARES TAG: year(0) = 2020
|
||||
|
|
@@ -0,0 +1,9 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/qlpt/borg_fwd_qlpt.hpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
206
extra/borg/libLSS/physics/forwards/qlpt/borg_fwd_qlpt_adj.cpp
Normal file
@@ -0,0 +1,206 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/qlpt/borg_fwd_qlpt_adj.cpp
|
||||
Copyright (C) 2020 Guilhem Lavaux <n.porqueres@imperial.ac.uk>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
|
||||
void BorgQLptModel::qlpt_fwd_model_ag(
|
||||
PhaseArrayRef &lctime, ArrayRef &in_ag, ArrayRef &out_ag) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("BORG adjoint model (particles)");
|
||||
|
||||
Cosmology cosmo(cosmo_params);
|
||||
double an = af; ///set position ics at r_{0}, calculate till present epoch
|
||||
double anh =
|
||||
af; ///velocities are created at v_{0-1/2}, calculate till present epoch
|
||||
|
||||
auto &phi0 = potential->get_array();
|
||||
|
||||
auto array_in_t = lo_mgr->allocate_c2c_array();
|
||||
auto &array_in = array_in_t.get_array();
|
||||
auto array_out_t = lo_mgr->allocate_c2c_array();
|
||||
auto &array_out = array_out_t.get_array();
|
||||
|
||||
auto array_in3_t = lo_mgr->allocate_c2c_array();
|
||||
auto &array_in3 = array_in3_t.get_array();
|
||||
auto array_out3_t = lo_mgr->allocate_c2c_array();
|
||||
auto &array_out3 = array_out3_t.get_array();
|
||||
|
||||
auto psi_t = lo_mgr->allocate_c2c_array();
|
||||
auto &psi = psi_t.get_array();
|
||||
|
||||
double volNorm = volume / (N0 * N1 * N2);
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (int i = startN0; i < startN0 + localN0; i++)
|
||||
for (int j = 0; j < N1; j++)
|
||||
for (int k = 0; k < N2; k++) {
|
||||
double p = phi0[i][j][k];
|
||||
std::complex<double> exponent(0, -phi0[i][j][k] / hbar);
|
||||
array_in[i][j][k] = exp(exponent);
|
||||
}
|
||||
|
||||
auto psi0_t = lo_mgr->allocate_c2c_array();
|
||||
auto &psi0 = psi0_t.get_array();
|
||||
psi0 = array_in;
|
||||
|
||||
DFT_Manager::Calls::plan_type plan = lo_mgr->create_c2c_plan(array_in.data(), array_out.data(), -1);
|
||||
lo_mgr->execute_c2c(plan, array_in.data(), array_out.data());
|
||||
lo_mgr->destroy_plan(plan);
|
||||
|
||||
std::complex<double> vol(1. / (N0 * N1 * N2), 0);
|
||||
|
||||
auto propagator_t = lo_mgr->allocate_c2c_array();
|
||||
auto &propagator = propagator_t.get_array();
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (int i = startN0; i < startN0 + localN0; i++)
|
||||
for (int j = 0; j < N1; j++)
|
||||
for (int k = 0; k < N2; k++) {
|
||||
double kk[3] = {kmode(i, N0, L0), kmode(j, N1, L1), kmode(k, N2, L2)};
|
||||
double ksquared = kk[0] * kk[0] + kk[1] * kk[1] + kk[2] * kk[2];
|
||||
std::complex<double> exponent(0, -0.5 * hbar * ksquared * D1);
|
||||
propagator[i][j][k] = exp(exponent);
|
||||
array_in[i][j][k] = exp(exponent) * array_out[i][j][k] * vol;
|
||||
}
|
||||
|
||||
plan = lo_mgr->create_c2c_plan(array_in.data(), psi.data(), 1);
|
||||
lo_mgr->execute_c2c(plan, array_in.data(), psi.data());
|
||||
lo_mgr->destroy_plan(plan);
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (int i = startN0; i < startN0 + localN0; i++)
|
||||
for (int j = 0; j < N1; j++)
|
||||
for (int k = 0; k < N2; k++) {
|
||||
array_in[i][j][k] = in_ag[i][j][k] * std::conj(psi[i][j][k]);
|
||||
array_in3[i][j][k] = std::conj(array_in[i][j][k]);
|
||||
}
|
||||
|
||||
plan = lo_mgr->create_c2c_plan(array_in.data(), array_out.data(), -1);
|
||||
lo_mgr->execute_c2c(plan, array_in.data(), array_out.data());
|
||||
lo_mgr->destroy_plan(plan);
|
||||
|
||||
plan = lo_mgr->create_c2c_plan(array_in3.data(), array_out3.data(), -1);
|
||||
lo_mgr->execute_c2c(plan, array_in3.data(), array_out3.data());
|
||||
lo_mgr->destroy_plan(plan);
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (int i = startN0; i < startN0 + localN0; i++)
|
||||
for (int j = 0; j < N1; j++)
|
||||
for (int k = 0; k < N2; k++) {
|
||||
array_in[i][j][k] = propagator[i][j][k] * array_out[i][j][k] * vol;
|
||||
array_in3[i][j][k] =
|
||||
std::conj(propagator[i][j][k]) * array_out3[i][j][k] * vol;
|
||||
}
|
||||
|
||||
plan = lo_mgr->create_c2c_plan(array_in.data(), array_out.data(), 1);
|
||||
lo_mgr->execute_c2c(plan, array_in.data(), array_out.data());
|
||||
lo_mgr->destroy_plan(plan);
|
||||
|
||||
plan = lo_mgr->create_c2c_plan(array_in3.data(), array_out3.data(), 1);
|
||||
lo_mgr->execute_c2c(plan, array_in3.data(), array_out3.data());
|
||||
lo_mgr->destroy_plan(plan);
|
||||
|
||||
for (int i = startN0; i < startN0 + localN0; i++)
|
||||
for (int j = 0; j < N1; j++)
|
||||
for (int k = 0; k < N2; k++) {
|
||||
std::complex<double> iunit(0, 1 / hbar);
|
||||
std::complex<double> element = psi0[i][j][k];
|
||||
array_in[i][j][k] = -iunit * element * array_out[i][j][k] +
|
||||
iunit * std::conj(element) * array_out3[i][j][k];
|
||||
}
|
||||
|
||||
plan = lo_mgr->create_c2c_plan(array_in.data(), array_out.data(), -1);
|
||||
lo_mgr->execute_c2c(plan, array_in.data(), array_out.data());
|
||||
lo_mgr->destroy_plan(plan);
|
||||
|
||||
for (int i = startN0; i < startN0 + localN0; i++)
|
||||
for (int j = 0; j < N1; j++)
|
||||
for (int k = 0; k < N2; k++) {
|
||||
double kk[3] = {kmode(i, N0, L0), kmode(j, N1, L1), kmode(k, N2, L2)};
|
||||
double ksquared = kk[0] * kk[0] + kk[1] * kk[1] + kk[2] * kk[2];
|
||||
double fac = -1. / ksquared / (N0 * N1 * N2);
|
||||
|
||||
if (std::isinf(fac))
|
||||
fac = 0.;
|
||||
|
||||
if (startN0 == 0 && localN0 > 0) {
|
||||
if (i == 0) {
|
||||
if (j == 0) {
|
||||
if (k == 0 or k == N2_HC - 1) {
|
||||
fac = 0.;
|
||||
}
|
||||
}
|
||||
if (j == N1 / 2) {
|
||||
if (k == 0 or k == N2_HC - 1) {
|
||||
fac = 0.;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (startN0 <= N0 / 2 && startN0 + localN0 > N0 / 2) {
|
||||
|
||||
if (i == N0 / 2) {
|
||||
if (j == 0) {
|
||||
if (k == 0 or k == N2_HC - 1) {
|
||||
fac = 0.;
|
||||
}
|
||||
}
|
||||
if (j == N1 / 2) {
|
||||
if (k == 0 or k == N2_HC - 1) {
|
||||
fac = 0.;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
array_in[i][j][k] = fac * array_out[i][j][k];
|
||||
}
|
||||
|
||||
plan = lo_mgr->create_c2c_plan(array_in.data(), array_out.data(), 1);
|
||||
lo_mgr->execute_c2c(plan, array_in.data(), array_out.data());
|
||||
lo_mgr->destroy_plan(plan);
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (int i = startN0; i < startN0 + localN0; i++)
|
||||
for (int j = 0; j < N1; j++)
|
||||
for (int k = 0; k < N2; k++) {
|
||||
out_ag[i][j][k] = std::real(array_out[i][j][k]);
|
||||
}
|
||||
}
|
||||
|
||||
void BorgQLptModel::preallocate() {}
|
||||
|
||||
void BorgQLptModel::adjointModel_v2(ModelInputAdjoint<3> gradient_delta) {
|
||||
|
||||
preallocate();
|
||||
|
||||
///re-evaluate redshift distortions from forward run
|
||||
|
||||
///work backwards from final to initial conditions
|
||||
|
||||
if (gradient_delta) {
|
||||
gradient_delta.setRequestedIO(PREFERRED_REAL);
|
||||
hold_in_gradient = std::move(gradient_delta);
|
||||
}
|
||||
}
|
||||
|
||||
void BorgQLptModel::getAdjointModelOutput(
|
||||
ModelOutputAdjoint<3> gradient_delta) {
|
||||
|
||||
gradient_delta.setRequestedIO(PREFERRED_REAL);
|
||||
qlpt_fwd_model_ag(
|
||||
*lc_timing, hold_in_gradient.getReal(), gradient_delta.getRealOutput());
|
||||
|
||||
clearAdjointGradient();
|
||||
}
|
||||
|
||||
void BorgQLptModel::clearAdjointGradient() {}
|
||||
// ARES TAG: num_authors = 1
|
||||
// ARES TAG: author(0) = Natalia Porqueres
|
||||
// ARES TAG: email(0) = n.porqueres@imperial.ac.uk
|
||||
// ARES TAG: year(0) = 2020
|
|
@@ -0,0 +1,239 @@
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/qlpt_rsd/borg_fwd_qlpt_rsd.cpp
|
||||
Copyright (C) 2020 Guilhem Lavaux <n.porqueres@imperial.ac.uk>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
|
||||
#include "../pm/plane_xchg.hpp"
|
||||
|
||||
|
||||
void BorgQLptRsdModel::qlpt_rsd_ic(CArrayRef &deltao, PhaseArrayRef &lctim) {
|
||||
///set cosmological parameters
|
||||
///Initial density is scaled to initial redshift!!!
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
Cosmology cosmo(cosmo_params);
|
||||
|
||||
double an = af; ///set position ics at r_{0}, calculate till present epoch
|
||||
double anh =
|
||||
af; ///velocities are created at v_{0-1/2}, calculate till present epoch
|
||||
double Hubble = cosmo.Hubble(anh) / cosmo_params.h; ///km /sec /(Mpc/h)
|
||||
|
||||
double volNorm = 1. / (L0 * L1 * L2);
|
||||
|
||||
typedef UninitializedArray<
|
||||
FFTW_Complex_Array, FFTW_Allocator<std::complex<double>>>
|
||||
U_CArray;
|
||||
typedef U_CArray::array_type Ref_CArray;
|
||||
|
||||
auto &phi0 = potential->get_array();
|
||||
|
||||
U_CArray tmp_p(mgr->extents_complex(), mgr->allocator_complex);
|
||||
Ref_CArray &tmp = tmp_p.get_array();
|
||||
|
||||
#pragma omp parallel for
|
||||
for (int i = startN0; i < startN0 + localN0; i++)
|
||||
for (int j = 0; j < N1; j++)
|
||||
for (int k = 0; k < N2_HC; k++) {
|
||||
double kk[3] = {kmode(i, N0, L0), kmode(j, N1, L1),
|
||||
kmode(k, N2, L2)};
|
||||
|
||||
double ksquared = kk[0] * kk[0] + kk[1] * kk[1] + kk[2] * kk[2];
|
||||
double fac = -1. / ksquared;
|
||||
std::complex<double> &in_delta = deltao[i][j][k];
|
||||
tmp[i][j][k] = fac * in_delta * volNorm;
|
||||
}
|
||||
|
||||
if (startN0 == 0 && localN0 > 0) {
|
||||
tmp[0][0][0] = 0;
|
||||
tmp[0][0][N2_HC - 1] = 0;
|
||||
tmp[0][N1 / 2][0] = 0;
|
||||
tmp[0][N1 / 2][N2_HC - 1] = 0;
|
||||
}
|
||||
|
||||
if (startN0 <= N0 / 2 && startN0 + localN0 > N0 / 2) {
|
||||
tmp[N0 / 2][0][0] = 0;
|
||||
tmp[N0 / 2][0][N2_HC - 1] = 0;
|
||||
tmp[N0 / 2][N1 / 2][0] = 0;
|
||||
tmp[N0 / 2][N1 / 2][N2_HC - 1] = 0;
|
||||
}
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (int i = startN0; i < startN0 + localN0; i++)
|
||||
for (int j = 0; j < N1; j++)
|
||||
for (int k = 0; k < N2_HC; k++) {
|
||||
(tmp_complex_field->get_array())[i][j][k] = tmp[i][j][k];
|
||||
}
|
||||
|
||||
lo_mgr->execute_c2r(
|
||||
synthesis_plan, tmp_complex_field->get_array().data(), phi0.data());
|
||||
|
||||
}
|
||||
|
||||
void BorgQLptRsdModel::qlpt_rsd_density_obs(ArrayRef &deltao, size_t numParts) {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
|
||||
Cosmology cosmo(cosmo_params);
|
||||
double an = af; ///set position ics at r_{0}, calculate till present epoch
|
||||
double anh =
|
||||
af; ///velocities are created at v_{0-1/2}, calculate till present epoch
|
||||
|
||||
double A = 0.35;
|
||||
double beta = 1.58;
|
||||
double ff = pow(cosmo_params.omega_m, 0.55);
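// ff is the standard Omega_m^0.55 approximation to the linear growth rate
// f = dlnD/dlna (evaluated with the present-day Omega_m). A and beta are
// hard-coded amplitude and slope parameters of the rho^((beta-1)/2) weighting
// applied to the wavefunction below (the field labelled chi0); their physical
// interpretation is not documented in this file.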
|
||||
|
||||
auto& phi0 = potential->get_array();
|
||||
|
||||
auto array_in_t = lo_mgr->allocate_c2c_array();
|
||||
auto& array_in = array_in_t.get_array();
|
||||
auto array_out_t = lo_mgr->allocate_c2c_array();
|
||||
auto& array_out = array_out_t.get_array();
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (int i = startN0; i < startN0 + localN0; i++)
|
||||
for (int j = 0; j < N1; j++)
|
||||
for (int k = 0; k < N2; k++) {
|
||||
std::complex<double> exponent(0, -phi0[i][j][k]/hbar);
|
||||
array_in[i][j][k] = exp(exponent);
|
||||
}
|
||||
|
||||
DFT_Manager::Calls::plan_type plan = lo_mgr->create_c2c_plan(array_in.data(), array_out.data(), -1);
|
||||
lo_mgr->execute_c2c(plan, array_in.data(), array_out.data());
|
||||
lo_mgr->destroy_plan(plan);
|
||||
|
||||
std::complex<double> vol(1./(N0*N1*N2), 0);
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (int i = startN0; i < startN0 + localN0; i++)
|
||||
for (int j = 0; j < N1; j++)
|
||||
for (int k = 0; k < N2; k++) {
|
||||
double kk[3] = {kmode(i, N0, L0), kmode(j, N1, L1),
|
||||
kmode(k, N2, L2)};
|
||||
double ksquared = kk[0] * kk[0] + kk[1] * kk[1] + kk[2] * kk[2];
|
||||
std::complex<double> exponent(0., -0.5 * hbar * D1 * ksquared);
|
||||
array_in[i][j][k] = exp(exponent) * array_out[i][j][k] * vol;
|
||||
}
|
||||
|
||||
plan = lo_mgr->create_c2c_plan(array_in.data(), array_out.data(), 1);
|
||||
lo_mgr->execute_c2c(plan, array_in.data(), array_out.data());
|
||||
lo_mgr->destroy_plan(plan);
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (int i = startN0; i < startN0 + localN0; i++)
|
||||
for (int j = 0; j < N1; j++)
|
||||
for (int k = 0; k < N2; k++) {
|
||||
std::complex<double> psi = array_out[i][j][k];
|
||||
|
||||
double rho = std::real(psi * std::conj(psi));
|
||||
array_in[i][j][k] = pow(A,0.5) * pow(rho, (beta-1.)/2.) * psi; //chi0
|
||||
|
||||
deltao[i][j][k] = std::real(psi * std::conj(psi)) - 1.;
|
||||
}
|
||||
|
||||
//std::string fname = str(format("borg_density_%d.h5") % step);
|
||||
//H5::H5File f(fname, H5F_ACC_TRUNC);
|
||||
//CosmoTool::hdf5_write_array(f, "density", deltao);
|
||||
//step += 1;
|
||||
|
||||
plan = lo_mgr->create_c2c_plan(array_in.data(), array_out.data(), -1);
|
||||
lo_mgr->execute_c2c(plan, array_in.data(), array_out.data()); //array_out is FFT(chi_0)
|
||||
lo_mgr->destroy_plan(plan);
|
||||
|
||||
double ee[3] = {0.,0.,1.}; //FIXME: hardcoded for los parallel to z-axis.
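// The redshift-space propagator applied below extends the free propagator with a
// line-of-sight term, exp(-0.5i * hbar * D1 * (k^2 + ff * (k.e)^2)); with
// e = (0,0,1) only the k_z component contributes to the RSD part.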
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (int i = startN0; i < startN0 + localN0; i++)
|
||||
for (int j = 0; j < N1; j++)
|
||||
for (int k = 0; k < N2; k++) {
|
||||
double kk[3] = {kmode(i, N0, L0), kmode(j, N1, L1),
|
||||
kmode(k, N2, L2)};
|
||||
double ksquared = kk[0] * kk[0] + kk[1] * kk[1] + kk[2] * kk[2];
|
||||
|
||||
double prod = (kk[0] * ee[0] + kk[1] * ee[1] + kk[2] * ee[2]);
|
||||
std::complex<double> exponent(0, -0.5 * hbar * D1 * (ksquared + ff * prod * prod));
|
||||
|
||||
array_in[i][j][k] = exp(exponent) * array_out[i][j][k] * vol;
|
||||
|
||||
}
|
||||
|
||||
|
||||
plan = lo_mgr->create_c2c_plan(array_in.data(), array_out.data(), 1);
|
||||
lo_mgr->execute_c2c(plan, array_in.data(), array_out.data());
|
||||
lo_mgr->destroy_plan(plan);
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (int i = startN0; i < startN0 + localN0; i++)
|
||||
for (int j = 0; j < N1; j++)
|
||||
for (int k = 0; k < N2; k++) {
|
||||
std::complex<double> chi = array_out[i][j][k];
|
||||
|
||||
deltao[i][j][k] = std::real(chi * std::conj(chi));
|
||||
}
|
||||
|
||||
|
||||
|
||||
//array::density_rescale(deltao, nmean);
|
||||
|
||||
if (DUMP_BORG_DENSITY) {
|
||||
std::string fname = str(format("borg_density_%d.h5") % comm->rank());
|
||||
H5::H5File f(fname, H5F_ACC_TRUNC);
|
||||
CosmoTool::hdf5_write_array(f, "density", deltao);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
void BorgQLptRsdModel::qlpt_rsd_fwd_model(CArrayRef &deltao, PhaseArrayRef &lctim) {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
|
||||
if (false) {
|
||||
static int step = 0;
|
||||
std::string fname = str(format("fwd_ic_%d_%d.h5") % step % comm->rank());
|
||||
H5::H5File f(fname, H5F_ACC_TRUNC);
|
||||
CosmoTool::hdf5_write_array(f, "deltao", deltao);
|
||||
step++;
|
||||
}
|
||||
|
||||
qlpt_rsd_ic(deltao, lctim);
|
||||
}
|
||||
|
||||
|
||||
void BorgQLptRsdModel::forwardModelRsdField(ArrayRef &deltaf, double *vobs_ext) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("BORG forward model rsd density calculation");
|
||||
}
|
||||
|
||||
|
||||
void BorgQLptRsdModel::test_qlpt_rsd_velocities(MarkovState &state) {}
|
||||
|
||||
|
||||
void BorgQLptRsdModel::forwardModel_v2(ModelInput<3> delta_init) {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
|
||||
delta_init.setRequestedIO(PREFERRED_FOURIER);
|
||||
|
||||
delta_init.needDestroyInput();
|
||||
qlpt_rsd_fwd_model(delta_init.getFourier(), lc_timing->get_array());
|
||||
|
||||
}
|
||||
|
||||
|
||||
void BorgQLptRsdModel::getDensityFinal(ModelOutput<3> delta_output) {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
delta_output.setRequestedIO(PREFERRED_REAL);
|
||||
|
||||
qlpt_rsd_density_obs(delta_output.getRealOutput(), realInfo.localNumParticlesAfter);
|
||||
|
||||
/* if (!forwardModelHold && !adjointRequired) {
|
||||
releaseParticles();
|
||||
}*/
|
||||
forwardModelHold = false;
|
||||
}
|
||||
// ARES TAG: num_authors = 1
|
||||
// ARES TAG: author(0) = Natalia Porqueres
|
||||
// ARES TAG: email(0) = n.porqueres@imperial.ac.uk
|
||||
// ARES TAG: year(0) = 2020
|
||||
|
|
@@ -0,0 +1,9 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/qlpt_rsd/borg_fwd_qlpt_rsd.hpp
|
||||
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
|
@@ -0,0 +1,296 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/qlpt_rsd/borg_fwd_qlpt_rsd_adj.cpp
|
||||
Copyright (C) 2020 Natalia Porqueres <n.porqueres@imperial.ac.uk>
|
||||
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
|
||||
void BorgQLptRsdModel::qlpt_rsd_fwd_model_ag(
|
||||
PhaseArrayRef &lctime, ArrayRef &in_ag, ArrayRef &out_ag) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("BORG adjoint model (particles)");
|
||||
|
||||
Cosmology cosmo(cosmo_params);
|
||||
double an = af; ///set position ics at r_{0}, calculate till present epoch
|
||||
double anh =
|
||||
af; ///velocities are created at v_{0-1/2}, calculate till present epoch
|
||||
|
||||
auto &phi0 = potential->get_array();
|
||||
|
||||
auto array_in_t = lo_mgr->allocate_c2c_array();
|
||||
auto &array_in = array_in_t.get_array();
|
||||
auto array_out_t = lo_mgr->allocate_c2c_array();
|
||||
auto &array_out = array_out_t.get_array();
|
||||
|
||||
auto psi0_t = lo_mgr->allocate_c2c_array();
|
||||
auto &psi0 = psi0_t.get_array();
|
||||
auto psi_t = lo_mgr->allocate_c2c_array();
|
||||
auto &psi = psi_t.get_array();
|
||||
auto propagator_t = lo_mgr->allocate_c2c_array();
|
||||
auto &propagator = propagator_t.get_array();
|
||||
auto propagator_rsd_t = lo_mgr->allocate_c2c_array();
|
||||
auto &propagator_rsd = propagator_rsd_t.get_array();
|
||||
auto chi_t = lo_mgr->allocate_c2c_array();
|
||||
auto &chi = chi_t.get_array();
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (int i = startN0; i < startN0 + localN0; i++)
|
||||
for (int j = 0; j < N1; j++)
|
||||
for (int k = 0; k < N2; k++) {
|
||||
std::complex<double> exponent(0, -phi0[i][j][k] / hbar);
|
||||
psi0[i][j][k] = exp(exponent);
|
||||
}
|
||||
|
||||
DFT_Manager::Calls::plan_type plan =
|
||||
lo_mgr->create_c2c_plan(psi0.data(), array_out.data(), -1);
|
||||
lo_mgr->execute_c2c(plan, psi0.data(), array_out.data());
|
||||
lo_mgr->destroy_plan(plan);
|
||||
|
||||
std::complex<double> vol(1. / (N0 * N1 * N2), 0);
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (int i = startN0; i < startN0 + localN0; i++)
|
||||
for (int j = 0; j < N1; j++)
|
||||
for (int k = 0; k < N2; k++) {
|
||||
double kk[3] = {kmode(i, N0, L0), kmode(j, N1, L1), kmode(k, N2, L2)};
|
||||
double ksquared = kk[0] * kk[0] + kk[1] * kk[1] + kk[2] * kk[2];
|
||||
std::complex<double> exponent(0, -0.5 * hbar * D1 * ksquared);
|
||||
propagator[i][j][k] = exp(exponent);
|
||||
array_in[i][j][k] = propagator[i][j][k] * array_out[i][j][k] * vol;
|
||||
}
|
||||
|
||||
plan = lo_mgr->create_c2c_plan(array_in.data(), psi.data(), 1);
|
||||
lo_mgr->execute_c2c(plan, array_in.data(), psi.data());
|
||||
lo_mgr->destroy_plan(plan);
|
||||
|
||||
double A = 0.35;
|
||||
double beta = 1.58;
|
||||
double ff = pow(cosmo_params.omega_m, 0.55);
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (int i = startN0; i < startN0 + localN0; i++)
|
||||
for (int j = 0; j < N1; j++)
|
||||
for (int k = 0; k < N2; k++) {
|
||||
double rho = std::real(psi[i][j][k] * std::conj(psi[i][j][k]));
|
||||
array_in[i][j][k] =
|
||||
pow(A, 0.5) * pow(rho, (beta - 1.) / 2.) * psi[i][j][k]; //chi0
|
||||
}
|
||||
|
||||
plan = lo_mgr->create_c2c_plan(array_in.data(), array_out.data(), -1);
|
||||
lo_mgr->execute_c2c(
|
||||
plan, array_in.data(), array_out.data()); //array_out is FFT(chi_0)
|
||||
lo_mgr->destroy_plan(plan);
|
||||
|
||||
double ee[3] = {0., 0., 1.}; //FIXME: hardcoded for los parallel to z-axis.
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (int i = startN0; i < startN0 + localN0; i++)
|
||||
for (int j = 0; j < N1; j++)
|
||||
for (int k = 0; k < N2; k++) {
|
||||
double kk[3] = {kmode(i, N0, L0), kmode(j, N1, L1), kmode(k, N2, L2)};
|
||||
double ksquared = kk[0] * kk[0] + kk[1] * kk[1] + kk[2] * kk[2];
|
||||
|
||||
double prod = (kk[0] * ee[0] + kk[1] * ee[1] + kk[2] * ee[2]);
|
||||
std::complex<double> exponent(
|
||||
0, -0.5 * hbar * D1 * (ksquared + ff * prod * prod));
|
||||
propagator_rsd[i][j][k] = exp(exponent);
|
||||
array_in[i][j][k] = propagator_rsd[i][j][k] * array_out[i][j][k] * vol;
|
||||
}
|
||||
|
||||
plan = lo_mgr->create_c2c_plan(array_in.data(), chi.data(), 1);
|
||||
lo_mgr->execute_c2c(plan, array_in.data(), chi.data());
|
||||
lo_mgr->destroy_plan(plan);
|
||||
|
||||
// all elements ready. start to compute gradient.
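// Starting from in_ag = dL/d(density), the gradient is pushed back through the
// forward chain in reverse order (RSD propagator, rho^((beta-1)/2) weighting,
// free propagator, exp(-i phi0/hbar) map, inverse Laplacian), each time with the
// complex-conjugated kernel. The (array_in, array_out) pair carries the gradient
// with respect to the psi-like field and (array_in3, array_out3) the gradient
// with respect to its complex conjugate, so the final output can be taken real.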
|
||||
|
||||
auto array_in2_t = lo_mgr->allocate_c2c_array();
|
||||
auto &array_in2 = array_in2_t.get_array();
|
||||
auto array_out2_t = lo_mgr->allocate_c2c_array();
|
||||
auto &array_out2 = array_out2_t.get_array();
|
||||
|
||||
auto array_in3_t = lo_mgr->allocate_c2c_array();
|
||||
auto &array_in3 = array_in3_t.get_array();
|
||||
auto array_out3_t = lo_mgr->allocate_c2c_array();
|
||||
auto &array_out3 = array_out3_t.get_array();
|
||||
|
||||
auto array_in4_t = lo_mgr->allocate_c2c_array();
|
||||
auto &array_in4 = array_in4_t.get_array();
|
||||
auto array_out4_t = lo_mgr->allocate_c2c_array();
|
||||
auto &array_out4 = array_out4_t.get_array();
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (int i = startN0; i < startN0 + localN0; i++)
|
||||
for (int j = 0; j < N1; j++)
|
||||
for (int k = 0; k < N2; k++) {
|
||||
array_in[i][j][k] = in_ag[i][j][k] * std::conj(chi[i][j][k]);
|
||||
array_in3[i][j][k] = std::conj(array_in[i][j][k]);
|
||||
}
|
||||
|
||||
plan = lo_mgr->create_c2c_plan(array_in.data(), array_out.data(), -1);
|
||||
lo_mgr->execute_c2c(plan, array_in.data(), array_out.data());
|
||||
lo_mgr->destroy_plan(plan);
|
||||
|
||||
plan = lo_mgr->create_c2c_plan(array_in3.data(), array_out3.data(), -1);
|
||||
lo_mgr->execute_c2c(plan, array_in3.data(), array_out3.data());
|
||||
lo_mgr->destroy_plan(plan);
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (int i = startN0; i < startN0 + localN0; i++)
|
||||
for (int j = 0; j < N1; j++)
|
||||
for (int k = 0; k < N2; k++) {
|
||||
array_in[i][j][k] = propagator_rsd[i][j][k] * array_out[i][j][k] * vol;
|
||||
array_in3[i][j][k] =
|
||||
std::conj(propagator_rsd[i][j][k]) * array_out3[i][j][k] * vol;
|
||||
}
|
||||
|
||||
plan = lo_mgr->create_c2c_plan(array_in.data(), array_out.data(), 1);
|
||||
lo_mgr->execute_c2c(plan, array_in.data(), array_out.data());
|
||||
lo_mgr->destroy_plan(plan);
|
||||
|
||||
plan = lo_mgr->create_c2c_plan(array_in3.data(), array_out3.data(), 1);
|
||||
lo_mgr->execute_c2c(plan, array_in3.data(), array_out3.data());
|
||||
lo_mgr->destroy_plan(plan);
|
||||
|
||||
for (int i = startN0; i < startN0 + localN0; i++)
|
||||
for (int j = 0; j < N1; j++)
|
||||
for (int k = 0; k < N2; k++) {
|
||||
double rho = std::real(psi[i][j][k] * std::conj(psi[i][j][k]));
|
||||
array_in[i][j][k] =
|
||||
pow(A, 0.5) * 0.5 * (beta + 1.) * pow(rho, 0.5 * (beta - 1)) *
|
||||
array_out[i][j][k] +
|
||||
pow(A, 0.5) * 0.5 * (beta - 1) * pow(rho, 0.5 * (beta - 3)) *
|
||||
std::conj(psi[i][j][k]) * std::conj(psi[i][j][k]) *
|
||||
array_out3[i][j][k];
|
||||
array_in3[i][j][k] =
|
||||
pow(A, 0.5) * 0.5 * (beta - 1) * pow(rho, 0.5 * (beta - 3)) *
|
||||
psi[i][j][k] * psi[i][j][k] * array_out[i][j][k] +
|
||||
pow(A, 0.5) * 0.5 * (beta + 1.) * pow(rho, 0.5 * (beta - 1)) *
|
||||
array_out3[i][j][k];
|
||||
}
|
||||
|
||||
plan = lo_mgr->create_c2c_plan(array_in.data(), array_out.data(), -1);
|
||||
lo_mgr->execute_c2c(plan, array_in.data(), array_out.data());
|
||||
lo_mgr->destroy_plan(plan);
|
||||
|
||||
plan = lo_mgr->create_c2c_plan(array_in3.data(), array_out3.data(), -1);
|
||||
lo_mgr->execute_c2c(plan, array_in3.data(), array_out3.data());
|
||||
lo_mgr->destroy_plan(plan);
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (int i = startN0; i < startN0 + localN0; i++)
|
||||
for (int j = 0; j < N1; j++)
|
||||
for (int k = 0; k < N2; k++) {
|
||||
array_in[i][j][k] = propagator[i][j][k] * array_out[i][j][k] * vol;
|
||||
array_in3[i][j][k] =
|
||||
std::conj(propagator[i][j][k]) * array_out3[i][j][k] * vol;
|
||||
}
|
||||
|
||||
plan = lo_mgr->create_c2c_plan(array_in.data(), array_out.data(), 1);
|
||||
lo_mgr->execute_c2c(plan, array_in.data(), array_out.data());
|
||||
lo_mgr->destroy_plan(plan);
|
||||
|
||||
plan = lo_mgr->create_c2c_plan(array_in3.data(), array_out3.data(), 1);
|
||||
lo_mgr->execute_c2c(plan, array_in3.data(), array_out3.data());
|
||||
lo_mgr->destroy_plan(plan);
|
||||
|
||||
for (int i = startN0; i < startN0 + localN0; i++)
|
||||
for (int j = 0; j < N1; j++)
|
||||
for (int k = 0; k < N2; k++) {
|
||||
std::complex<double> iunit(0, 1 / hbar);
|
||||
std::complex<double> element = psi0[i][j][k];
|
||||
array_in[i][j][k] = -iunit * element * array_out[i][j][k] +
iunit * std::conj(element) * array_out3[i][j][k];
|
||||
}
|
||||
|
||||
plan = lo_mgr->create_c2c_plan(array_in.data(), array_out.data(), -1);
|
||||
lo_mgr->execute_c2c(plan, array_in.data(), array_out.data());
|
||||
lo_mgr->destroy_plan(plan);
|
||||
|
||||
for (int i = startN0; i < startN0 + localN0; i++)
|
||||
for (int j = 0; j < N1; j++)
|
||||
for (int k = 0; k < N2; k++) {
|
||||
double kk[3] = {kmode(i, N0, L0), kmode(j, N1, L1), kmode(k, N2, L2)};
|
||||
double ksquared = kk[0] * kk[0] + kk[1] * kk[1] + kk[2] * kk[2];
|
||||
double fac = -1. / ksquared;
|
||||
|
||||
if (std::isinf(fac))
|
||||
fac = 0.;
|
||||
|
||||
if (startN0 == 0 && localN0 > 0) {
|
||||
if (i == 0) {
|
||||
if (j == 0) {
|
||||
if (k == 0 or k == N2_HC - 1) {
|
||||
fac = 0.;
|
||||
}
|
||||
}
|
||||
if (j == N1 / 2) {
|
||||
if (k == 0 or k == N2_HC - 1) {
|
||||
fac = 0.;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (startN0 <= N0 / 2 && startN0 + localN0 > N0 / 2) {
|
||||
|
||||
if (i == N0 / 2) {
|
||||
if (j == 0) {
|
||||
if (k == 0 or k == N2_HC - 1) {
|
||||
fac = 0.;
|
||||
}
|
||||
}
|
||||
if (j == N1 / 2) {
|
||||
if (k == 0 or k == N2_HC - 1) {
|
||||
fac = 0.;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
array_in[i][j][k] = fac * array_out[i][j][k] * vol;
|
||||
}
|
||||
|
||||
plan = lo_mgr->create_c2c_plan(array_in.data(), array_out.data(), 1);
|
||||
lo_mgr->execute_c2c(plan, array_in.data(), array_out.data());
|
||||
lo_mgr->destroy_plan(plan);
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (int i = startN0; i < startN0 + localN0; i++)
|
||||
for (int j = 0; j < N1; j++)
|
||||
for (int k = 0; k < N2; k++) {
|
||||
out_ag[i][j][k] = std::real(array_out[i][j][k]);
|
||||
}
|
||||
}
|
||||
|
||||
void BorgQLptRsdModel::preallocate() {}
|
||||
|
||||
void BorgQLptRsdModel::adjointModel_v2(ModelInputAdjoint<3> gradient_delta) {
|
||||
|
||||
preallocate();
|
||||
|
||||
///re-evaluate redshift distortions from forward run
|
||||
|
||||
///work backwards from final to initial conditions
|
||||
|
||||
if (gradient_delta) {
|
||||
gradient_delta.setRequestedIO(PREFERRED_REAL);
|
||||
hold_in_gradient = std::move(gradient_delta);
|
||||
}
|
||||
}
|
||||
|
||||
void BorgQLptRsdModel::getAdjointModelOutput(
|
||||
ModelOutputAdjoint<3> gradient_delta) {
|
||||
|
||||
gradient_delta.setRequestedIO(PREFERRED_REAL);
|
||||
qlpt_rsd_fwd_model_ag(
|
||||
*lc_timing, hold_in_gradient.getReal(), gradient_delta.getRealOutput());
|
||||
|
||||
clearAdjointGradient();
|
||||
}
|
||||
|
||||
void BorgQLptRsdModel::clearAdjointGradient() {}
|
||||
|
||||
// ARES TAG: num_authors = 1
|
||||
// ARES TAG: author(0) = Natalia Porqueres
|
||||
// ARES TAG: email(0) = n.porqueres@imperial.ac.uk
|
||||
// ARES TAG: year(0) = 2020
|
89
extra/borg/libLSS/physics/forwards/softplus.cpp
Normal file
|
@@ -0,0 +1,89 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/softplus.cpp
|
||||
Copyright (C) 2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#include "libLSS/cconfig.h"
|
||||
#include <string>
|
||||
#include "libLSS/tools/console.hpp"
|
||||
#include "libLSS/physics/model_io.hpp"
|
||||
#include "libLSS/physics/forwards/softplus.hpp"
|
||||
#include "libLSS/tools/fusewrapper.hpp"
|
||||
#include "libLSS/physics/forwards/registry.hpp"
|
||||
#include "libLSS/tools/ptree_proxy.hpp"
|
||||
|
||||
using namespace LibLSS;
|
||||
|
||||
void ForwardSoftPlus::forwardModel_v2(ModelInput<3> delta_init) {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
|
||||
// Setup the IO basis that is required for this forward.
|
||||
delta_init.setRequestedIO(PREFERRED_REAL);
|
||||
|
||||
hold_input = std::move(delta_init);
|
||||
}
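// getDensityFinal applies the scaled softplus
//     s(x) = log(1 + exp(hardness * x)) / hardness
// to x = delta + bias_value and shifts the result back by -bias_value. For
// hardness * x > 10 the asymptote s(x) ~= x is used instead, which avoids
// overflow in exp() at negligible accuracy cost; the adjoint below uses the
// matching derivative s'(x) = 1 / (1 + exp(-hardness * x)).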
|
||||
|
||||
void ForwardSoftPlus::getDensityFinal(ModelOutput<3> delta_output) {
|
||||
delta_output.setRequestedIO(PREFERRED_REAL);
|
||||
auto w_delta_init = fwrap(hold_input.getRealConst()) + bias_value;
|
||||
auto w_delta_output = fwrap(delta_output.getRealOutput());
|
||||
|
||||
auto basic_softplus =
|
||||
std::log(1.0 + std::exp(hardness * w_delta_init)) / hardness;
|
||||
|
||||
w_delta_output =
|
||||
mask((hardness * w_delta_init) > 10.0, w_delta_init, basic_softplus) - bias_value;
|
||||
}
|
||||
|
||||
void ForwardSoftPlus::setHardness(double h) { hardness = h; }
|
||||
|
||||
void ForwardSoftPlus::setBiasValue(double b) { bias_value = b; }
|
||||
|
||||
void ForwardSoftPlus::adjointModel_v2(ModelInputAdjoint<3> in_gradient_delta) {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
// Build strict range views (we do not want to see the FFTW padding);
// ensure that the view objects live until the end of this function.
|
||||
|
||||
in_gradient_delta.setRequestedIO(PREFERRED_REAL);
|
||||
hold_ag_input = std::move(in_gradient_delta);
|
||||
}
|
||||
|
||||
void ForwardSoftPlus::getAdjointModelOutput(
|
||||
ModelOutputAdjoint<3> out_gradient_delta) {
|
||||
out_gradient_delta.setRequestedIO(PREFERRED_REAL);
|
||||
auto w_in_gradient = fwrap(hold_ag_input.getRealConst());
|
||||
auto w_out_gradient = fwrap(out_gradient_delta.getRealOutput());
|
||||
auto w_delta_init = fwrap(hold_input.getRealConst()) + bias_value;
|
||||
|
||||
// FIXME: Being lazy and abusing the autowrap API here.
|
||||
auto constantGradient =
|
||||
fwrap(b_fused_idx<double, 3>([](auto... x) { return 1.0; }));
|
||||
auto basic_gradient = 1.0 / (1.0 + std::exp(-hardness * w_delta_init));
|
||||
|
||||
w_out_gradient =
|
||||
mask((hardness * w_delta_init) > 10.0, constantGradient, basic_gradient) *
|
||||
w_in_gradient;
|
||||
}
|
||||
|
||||
static std::shared_ptr<BORGForwardModel> build_softplus(
|
||||
MPI_Communication *comm, BoxModel const &box, PropertyProxy const ¶ms) {
|
||||
auto hardness = params.get<double>("hardness");
|
||||
auto bias_value = params.get<double>("bias_value", 1.0);
|
||||
|
||||
// TODO: Setup transfer function
|
||||
auto model = std::make_shared<ForwardSoftPlus>(comm, box);
|
||||
|
||||
model->setBiasValue(bias_value);
|
||||
model->setHardness(hardness);
|
||||
return model;
|
||||
}
|
||||
|
||||
LIBLSS_REGISTER_FORWARD_IMPL(Softplus, build_softplus);
|
||||
|
||||
// ARES TAG: authors_num = 1
|
||||
// ARES TAG: name(0) = Guilhem Lavaux
|
||||
// ARES TAG: year(0) = 2020
|
||||
// ARES TAG: email(0) = guilhem.lavaux@iap.fr
|
70
extra/borg/libLSS/physics/forwards/softplus.hpp
Normal file
|
@@ -0,0 +1,70 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/softplus.hpp
|
||||
Copyright (C) 2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_HADES_FORWARD_SOFTPLUS_HPP
|
||||
# define __LIBLSS_HADES_FORWARD_SOFTPLUS_HPP
|
||||
# pragma once
|
||||
|
||||
# include "libLSS/physics/forward_model.hpp"
|
||||
# include "libLSS/tools/fusewrapper.hpp"
|
||||
# include "libLSS/tools/errors.hpp"
|
||||
# include "libLSS/physics/forwards/registry.hpp"
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
/**
|
||||
* This class applies a smooth softplus transform, element by element, to a real-space density field.
|
||||
*/
|
||||
class ForwardSoftPlus : public BORGForwardModel {
|
||||
public:
|
||||
using BORGForwardModel::CArrayRef;
|
||||
|
||||
protected:
|
||||
double hardness;
|
||||
double bias_value;
|
||||
|
||||
ModelInput<3> hold_input;
|
||||
ModelInputAdjoint<3> hold_ag_input;
|
||||
|
||||
public:
|
||||
/**
|
||||
* Constructor.
|
||||
*/
|
||||
explicit ForwardSoftPlus(MPI_Communication *comm, const BoxModel &box)
|
||||
: BORGForwardModel(comm, box), hardness(1.0), bias_value(1.0) {
|
||||
ensureInputEqualOutput();
|
||||
}
|
||||
|
||||
virtual PreferredIO getPreferredInput() const { return PREFERRED_REAL; }
|
||||
virtual PreferredIO getPreferredOutput() const { return PREFERRED_REAL; }
|
||||
|
||||
virtual void forwardModel_v2(ModelInput<3> delta_init);
|
||||
virtual void getDensityFinal(ModelOutput<3> delta_output);
|
||||
|
||||
void setBiasValue(double b);
|
||||
void setHardness(double h);
|
||||
|
||||
virtual void adjointModel_v2(ModelInputAdjoint<3> in_gradient_delta);
|
||||
virtual void
|
||||
getAdjointModelOutput(ModelOutputAdjoint<3> out_gradient_delta);
|
||||
|
||||
virtual void forwardModelRsdField(ArrayRef &, double *) {}
|
||||
|
||||
virtual void releaseParticles() {}
|
||||
}; // namespace LibLSS
|
||||
|
||||
} // namespace LibLSS
|
||||
|
||||
LIBLSS_REGISTER_FORWARD_DECL(Softplus);
|
||||
|
||||
#endif
|
||||
|
||||
// ARES TAG: authors_num = 1
|
||||
// ARES TAG: name(0) = Guilhem Lavaux
|
||||
// ARES TAG: year(0) = 2020
|
||||
// ARES TAG: email(0) = guilhem.lavaux@iap.fr
|
161
extra/borg/libLSS/physics/forwards/transfer.cpp
Normal file
|
@@ -0,0 +1,161 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/transfer.cpp
|
||||
Copyright (C) 2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#include "libLSS/cconfig.h"
|
||||
#include <string>
|
||||
#include "libLSS/tools/console.hpp"
|
||||
#include "libLSS/physics/model_io.hpp"
|
||||
#include "libLSS/physics/forwards/transfer.hpp"
|
||||
#include "libLSS/tools/fusewrapper.hpp"
|
||||
#include "libLSS/physics/forwards/registry.hpp"
|
||||
#include "libLSS/tools/ptree_proxy.hpp"
|
||||
|
||||
using namespace LibLSS;
|
||||
|
||||
void ForwardTransfer::forwardModel_v2(ModelInput<3> delta_init) {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
|
||||
// Setup the IO basis that is required for this forward.
|
||||
delta_init.setRequestedIO(PREFERRED_FOURIER);
|
||||
|
||||
hold_input = std::move(delta_init);
|
||||
}
|
||||
|
||||
void ForwardTransfer::getDensityFinal(ModelOutput<3> delta_output) {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
delta_output.setRequestedIO(PREFERRED_FOURIER);
|
||||
auto w_delta_init = fwrap(hold_input.getFourierConst());
|
||||
auto w_delta_output = fwrap(delta_output.getFourierOutput());
|
||||
|
||||
w_delta_output = w_delta_init * fwrap(*Tk);
|
||||
}
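// setupInverseCIC builds a Fourier kernel that deconvolves a cloud-in-cell (CIC)
// mass assignment: CIC suppresses mode (i,j,k) by sinc^2(pi i/N0) * sinc^2(pi j/N1)
// * sinc^2(pi k/N2), so Tk stores the inverse of that product. The 'smoother'
// parameter freezes the sinc beyond |x| = smoother, so the correction is capped
// near the Nyquist frequency.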
|
||||
|
||||
void ForwardTransfer::setupInverseCIC(double smoother) {
|
||||
size_t N0 = lo_mgr->N0, N1 = lo_mgr->N1, N2 = lo_mgr->N2;
|
||||
size_t hN0 = N0 / 2;
|
||||
size_t hN1 = N1 / 2;
|
||||
size_t hN2 = N2 / 2;
|
||||
double const regul = std::sin(M_PI * smoother) / (M_PI * smoother);
|
||||
auto sinc = [regul, smoother](double x) {
|
||||
return (std::abs(x) < smoother) ? (std::sin(M_PI * x) / (M_PI * x)) : regul;
|
||||
};
|
||||
auto cic_Kernel =
|
||||
b_fused_idx<double, 3>([&](ssize_t i, ssize_t j, ssize_t k) {
|
||||
if (i > hN0)
|
||||
i -= N0;
|
||||
if (j > hN1)
|
||||
j -= N1;
|
||||
if (k > hN2)
|
||||
k -= N2;
|
||||
double r = 1.0;
|
||||
if (i != 0)
|
||||
r *= sinc(double(i) / N0);
|
||||
if (j != 0)
|
||||
r *= sinc(double(j) / N1);
|
||||
if (k != 0)
|
||||
r *= sinc(double(k) / N2);
|
||||
assert(r != 0);
|
||||
return 1.0 / (r * r);
|
||||
});
|
||||
|
||||
Tk = std::move(lo_mgr->allocate_ptr_complex_array());
|
||||
|
||||
fwrap(*Tk) = cic_Kernel;
|
||||
}
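// setupSharpKcut builds an isotropic low-pass kernel: Tk = 1 for |k| < cut and 0
// otherwise, with k_i = 2 pi i / L_i. When 'reversed' is true the complement
// 1 - Tk is stored instead, turning the model into a high-pass filter.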
|
||||
|
||||
void ForwardTransfer::setupSharpKcut(double cut, bool reversed) {
|
||||
size_t N0 = lo_mgr->N0, N1 = lo_mgr->N1, N2 = lo_mgr->N2;
|
||||
double const cut2 = cut * cut;
|
||||
size_t hN0 = N0 / 2;
|
||||
size_t hN1 = N1 / 2;
|
||||
size_t hN2 = N2 / 2;
|
||||
auto sharp_Kernel =
|
||||
b_fused_idx<double, 3>([&](ssize_t i, ssize_t j, ssize_t k) {
|
||||
if (i > hN0)
|
||||
i -= N0;
|
||||
if (j > hN1)
|
||||
j -= N1;
|
||||
if (k > hN2)
|
||||
k -= N2;
|
||||
double r = 0.0;
|
||||
if (i != 0)
|
||||
r += CosmoTool::square(2*M_PI/L0*double(i));
|
||||
if (j != 0)
|
||||
r += CosmoTool::square(2*M_PI/L1*double(j));
|
||||
if (k != 0)
|
||||
r += CosmoTool::square(2*M_PI/L2*double(k));
|
||||
return r < cut2 ? 1.0 : 0.0;
|
||||
});
|
||||
|
||||
Tk = std::move(lo_mgr->allocate_ptr_complex_array());
|
||||
|
||||
if (reversed)
|
||||
fwrap(*Tk) = 1.0 - fwrap(sharp_Kernel);
|
||||
else
|
||||
fwrap(*Tk) = sharp_Kernel;
|
||||
}
|
||||
|
||||
void ForwardTransfer::setTransfer(
|
||||
std::shared_ptr<DFT_Manager::U_ArrayFourier> Tk_) {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
Tk = Tk_;
|
||||
}
|
||||
|
||||
void ForwardTransfer::adjointModel_v2(ModelInputAdjoint<3> in_gradient_delta) {
|
||||
LIBLSS_AUTO_DEBUG_CONTEXT(ctx);
|
||||
// Build strict range views (we do not want to see the FFTW padding);
// ensure that the view objects live until the end of this function.
|
||||
|
||||
in_gradient_delta.setRequestedIO(PREFERRED_FOURIER);
|
||||
hold_ag_input = std::move(in_gradient_delta);
|
||||
}
|
||||
|
||||
void ForwardTransfer::getAdjointModelOutput(
|
||||
ModelOutputAdjoint<3> out_gradient_delta) {
|
||||
out_gradient_delta.setRequestedIO(PREFERRED_FOURIER);
|
||||
auto w_in_gradient = fwrap(hold_ag_input.getFourierConst());
|
||||
auto w_out_gradient = fwrap(out_gradient_delta.getFourierOutput());
|
||||
|
||||
w_out_gradient = w_in_gradient * fwrap(*Tk);
|
||||
}
|
||||
|
||||
static std::shared_ptr<BORGForwardModel> build_transfer(
|
||||
MPI_Communication *comm, BoxModel const &box, PropertyProxy const ¶ms) {
|
||||
auto transfer_filename = params.get_optional<std::string>("transfer");
|
||||
auto transfer_cic = params.get_optional<bool>("use_invert_cic");
|
||||
auto transfer_sharp = params.get_optional<bool>("use_sharpk");
|
||||
|
||||
// TODO: Setup transfer function
|
||||
auto model = std::make_shared<ForwardTransfer>(comm, box);
|
||||
|
||||
if (transfer_filename) {
|
||||
auto Tk = model->lo_mgr->allocate_ptr_complex_array();
|
||||
H5::H5File f(*transfer_filename, H5F_ACC_RDONLY);
|
||||
try {
|
||||
CosmoTool::hdf5_read_array(f, "transfer", Tk->get_array(), false, true);
|
||||
} catch (CosmoTool::InvalidDimensions const &) {
|
||||
error_helper<ErrorParams>(
|
||||
"Provided transfer function does not have the correct shape.");
|
||||
}
|
||||
model->setTransfer(std::move(Tk));
|
||||
} else if (transfer_cic && *transfer_cic) {
|
||||
model->setupInverseCIC(params.get<double>("smoother"));
|
||||
} else if (transfer_sharp && *transfer_sharp) {
|
||||
model->setupSharpKcut(params.get<double>("k_max"));
|
||||
} else {
|
||||
error_helper<ErrorParams>("Transfer function non specified");
|
||||
}
|
||||
return model;
|
||||
}
|
||||
|
||||
LIBLSS_REGISTER_FORWARD_IMPL(Transfer, build_transfer);
|
||||
|
||||
// ARES TAG: authors_num = 1
|
||||
// ARES TAG: name(0) = Guilhem Lavaux
|
||||
// ARES TAG: year(0) = 2020
|
||||
// ARES TAG: email(0) = guilhem.lavaux@iap.fr
|
80
extra/borg/libLSS/physics/forwards/transfer.hpp
Normal file
|
@@ -0,0 +1,80 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/forwards/transfer.hpp
|
||||
Copyright (C) 2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_HADES_FORWARD_TRANSFER_HPP
|
||||
# define __LIBLSS_HADES_FORWARD_TRANSFER_HPP
|
||||
# pragma once
|
||||
|
||||
# include "libLSS/physics/forward_model.hpp"
|
||||
# include "libLSS/tools/fusewrapper.hpp"
|
||||
# include "libLSS/tools/errors.hpp"
|
||||
# include "libLSS/physics/forwards/registry.hpp"
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
/**
|
||||
* This class handles the convolution of a real field by some Fourier kernel.
|
||||
*/
|
||||
class ForwardTransfer : public BORGForwardModel {
|
||||
public:
|
||||
using BORGForwardModel::CArrayRef;
|
||||
|
||||
protected:
|
||||
std::shared_ptr<DFT_Manager::U_ArrayFourier> Tk;
|
||||
|
||||
ModelInput<3> hold_input;
|
||||
ModelInputAdjoint<3> hold_ag_input;
|
||||
|
||||
public:
|
||||
/**
|
||||
* Constructor.
|
||||
*/
|
||||
explicit ForwardTransfer(
|
||||
MPI_Communication *comm, const BoxModel &box)
|
||||
: BORGForwardModel(comm, box) {
|
||||
ensureInputEqualOutput();
|
||||
}
|
||||
|
||||
PreferredIO getPreferredInput() const override { return PREFERRED_FOURIER; }
|
||||
PreferredIO getPreferredOutput() const override { return PREFERRED_FOURIER; }
|
||||
|
||||
bool densityInvalidated() const override { return false; }
|
||||
|
||||
void forwardModel_v2(ModelInput<3> delta_init) override;
|
||||
void getDensityFinal(ModelOutput<3> delta_output) override;
|
||||
|
||||
/**
|
||||
* Set the transfer function, it is copied inside the
|
||||
* object such that the initial array can be discarded.
|
||||
*
|
||||
* @param Tk 3d transfer function
|
||||
*/
|
||||
void setTransfer(std::shared_ptr<DFT_Manager::U_ArrayFourier> Tk_);
|
||||
|
||||
void setupInverseCIC(double smoother);
|
||||
void setupSharpKcut(double cut, bool reversed = false);
|
||||
|
||||
void adjointModel_v2(ModelInputAdjoint<3> in_gradient_delta) override;
|
||||
void
|
||||
getAdjointModelOutput(ModelOutputAdjoint<3> out_gradient_delta) override;
|
||||
|
||||
void forwardModelRsdField(ArrayRef &, double *) override {}
|
||||
|
||||
void releaseParticles() override {}
|
||||
}; // namespace LibLSS
|
||||
|
||||
} // namespace LibLSS
|
||||
|
||||
LIBLSS_REGISTER_FORWARD_DECL(Transfer);
|
||||
|
||||
#endif
|
||||
|
||||
// ARES TAG: authors_num = 1
|
||||
// ARES TAG: name(0) = Guilhem Lavaux
|
||||
// ARES TAG: year(0) = 2020
|
||||
// ARES TAG: email(0) = guilhem.lavaux@iap.fr
|
343
extra/borg/libLSS/physics/likelihoods/eft.hpp
Normal file
|
@@ -0,0 +1,343 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/likelihoods/eft.hpp
|
||||
Copyright (C) 2018-2019 Franz Elsner <>
|
||||
Copyright (C) 2019-2021 Fabian Schmidt <fabians@mpa-garching.mpg.de>
|
||||
Copyright (C) 2019-2021 Martin Reinecke <martin@mpa-garching.mpg.de>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
#ifndef __LIBLSS_EFT_LIKELIHOOD_HPP
|
||||
#define __LIBLSS_EFT_LIKELIHOOD_HPP
|
||||
|
||||
#include <gsl/gsl_sf_gamma.h>
|
||||
#include "libLSS/tools/fused_array.hpp"
|
||||
#include "libLSS/tools/console.hpp"
|
||||
#include "libLSS/tools/fused_reduce.hpp"
|
||||
#include <CosmoTool/algo.hpp>
|
||||
#include "libLSS/physics/likelihoods/base.hpp"
|
||||
#include "libLSS/tools/mpi_fftw_helper.hpp"
|
||||
#include "libLSS/samplers/core/powerspec_tools.hpp"
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
namespace detail_EFT {
|
||||
|
||||
namespace ph = std::placeholders;
|
||||
using DFT_Manager = FFTW_Manager_3d<double>;
|
||||
using U_Array = Uninit_FFTW_Real_Array;
|
||||
using U_CArray = Uninit_FFTW_Complex_Array;
|
||||
using U_ArrayRef = U_Array::array_type;
|
||||
using U_CArrayRef = U_CArray::array_type;
|
||||
|
||||
template <typename T>
|
||||
struct myarr {
|
||||
T arr;
|
||||
typename T::array_type ref;
|
||||
template <typename T1, typename T2>
|
||||
myarr(const T1 &extent, const T2 &alloc)
|
||||
: arr(extent, alloc), ref(arr.get_array()) {}
|
||||
};
|
||||
|
||||
class EFTLikelihood {
|
||||
protected:
|
||||
bool have_sharpk_data;
|
||||
long N0, N1, N2, N2_HC, startN0, localN0;
|
||||
double L0, L1, L2;
|
||||
double lambda;
|
||||
double ctmpmean; // \vec k=0 value obtained in sharpk_filter
|
||||
|
||||
struct Arrs {
|
||||
MPI_Communication &comm;
|
||||
DFT_Manager mgr;
|
||||
myarr<U_Array> tmp;
|
||||
myarr<U_CArray> ctmp;
|
||||
myarr<U_Array> sharpk_data, sharpk_mu;
|
||||
FCalls::plan_type analysis_plan;
|
||||
FCalls::plan_type synthesis_plan;
|
||||
|
||||
Arrs(MPI_Communication &comm_, size_t N0, size_t N1, size_t N2)
|
||||
: comm(comm_), mgr(N0, N1, N2, &comm),
|
||||
tmp(mgr.extents_real(), mgr.allocator_real),
|
||||
ctmp(mgr.extents_complex(), mgr.allocator_complex),
|
||||
sharpk_data(mgr.extents_real(), mgr.allocator_real),
|
||||
sharpk_mu(mgr.extents_real(), mgr.allocator_real) {
|
||||
myarr<U_Array> tmp(mgr.extents_real(), mgr.allocator_real);
|
||||
analysis_plan = mgr.create_r2c_plan(tmp.ref.data(), ctmp.ref.data());
|
||||
synthesis_plan = mgr.create_c2r_plan(ctmp.ref.data(), tmp.ref.data());
|
||||
}
|
||||
};
|
||||
std::unique_ptr<Arrs> arrs;
|
||||
static constexpr size_t numberLikelihoodParams =
|
||||
3; // density, sigma_0, lnPprior
|
||||
|
||||
// apply sharp-k cut to field: modes with k > limit are set to zero
|
||||
// - also sets to zero \vec k==0 mode -> subtract mean
|
||||
size_t sharpk_filter(U_ArrayRef &in, U_ArrayRef &out, double limit) {
|
||||
auto ctmp = arrs->ctmp.ref;
|
||||
double norm = 1.0 / (N0 * N1 * N2);
|
||||
arrs->mgr.execute_r2c(arrs->analysis_plan, in.data(), ctmp.data());
|
||||
|
||||
// save mean of field, i.e. \vec k=0 value divided by N^3, needed
|
||||
// for data
|
||||
if (startN0 == 0 && localN0 > 0) {
|
||||
ctmpmean = ctmp[0][0][0].real() * norm;
|
||||
}
|
||||
|
||||
size_t cnt = 0;
|
||||
#pragma omp parallel for collapse(3) reduction(+ : cnt)
|
||||
for (size_t i = startN0; i < startN0 + localN0; i++)
|
||||
for (size_t j = 0; j < N1; j++)
|
||||
for (size_t k = 0; k < N2_HC; k++) {
|
||||
double kk[3] = {
|
||||
kmode(i, N0, L0), kmode(j, N1, L1), kmode(k, N2, L2)};
|
||||
|
||||
double ksquared = kk[0] * kk[0] + kk[1] * kk[1] + kk[2] * kk[2];
|
||||
if (ksquared > limit * limit || ksquared < 1.e-15)
|
||||
ctmp[i][j][k] = 0.;
|
||||
else {
|
||||
cnt += ((k == 0) || (k == N2_HC - 1)) ? 1 : 2;
|
||||
ctmp[i][j][k] *= norm;
|
||||
}
|
||||
}
|
||||
|
||||
arrs->mgr.execute_c2r(arrs->synthesis_plan, ctmp.data(), out.data());
|
||||
|
||||
// return _local_ mode count
|
||||
return cnt;
|
||||
}
|
||||
|
||||
// copied from gaussian.hpp: equally applicable here
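// log_proba returns the squared, sigma-normalised residual ((d - rho)/sigma_0)^2
// of one voxel; the overall -1/2 of the Gaussian log-likelihood is applied when
// the sum is assembled in log_probability. diff_log_proba returns
// (d - rho)/sigma_0^2, i.e. the derivative of -log_proba/2 with respect to rho,
// zeroed outside the mask.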
|
||||
static inline double log_proba(double d, double rho, double sigma_0) {
|
||||
return CosmoTool::square((d - rho) / sigma_0);
|
||||
}
|
||||
|
||||
static inline double
|
||||
diff_log_proba(double d, double rho, double sigma_0, bool mask) {
|
||||
return mask ? ((d - rho) / (sigma_0 * sigma_0)) : 0;
|
||||
}
|
||||
|
||||
template <typename RandomGen>
|
||||
static inline double
|
||||
gen_sample(RandomGen &rgen, double rho, double sigma_0) {
|
||||
// add a trivial mean of 1, which the data preparation step divides out again
|
||||
return 1. + rgen.gaussian() * sigma_0 + rho;
|
||||
}
|
||||
|
||||
public:
|
||||
EFTLikelihood(LikelihoodInfo const &info = LikelihoodInfo())
|
||||
: have_sharpk_data(false) {
|
||||
auto comm(Likelihood::getMPI(info));
|
||||
auto grid =
|
||||
Likelihood::query<Likelihood::GridSize>(info, Likelihood::GRID);
|
||||
N0 = grid[0];
|
||||
N1 = grid[1];
|
||||
N2 = grid[2];
|
||||
N2_HC = N2 / 2 + 1;
|
||||
arrs.reset(new Arrs(*comm, N0, N1, N2));
|
||||
startN0 = arrs->mgr.startN0;
|
||||
localN0 = arrs->mgr.localN0;
|
||||
L0 = Likelihood::gridSide(info)[0];
|
||||
L1 = Likelihood::gridSide(info)[1];
|
||||
L2 = Likelihood::gridSide(info)[2];
|
||||
lambda = Likelihood::query<double>(info, "EFT_Lambda");
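// lambda is the sharp-k cutoff Lambda of the EFT likelihood: both the data and
// the model prediction are low-pass filtered to |k| < lambda (see sharpk_filter)
// before the pointwise Gaussian comparison in log_probability.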
|
||||
}
|
||||
|
||||
template <typename RandomGen, typename TupleLike>
|
||||
auto sample(RandomGen &rgen, TupleLike tuple_data) {
|
||||
static_assert(
|
||||
std::tuple_size<TupleLike>::value == numberLikelihoodParams);
|
||||
|
||||
// same procedure as to get 'sharpk_mu' in log_probability:
|
||||
// take tuple_data<0>, apply sharp-k filter in Fourier space and IFT
|
||||
auto mu = std::get<0>(
|
||||
tuple_data); // is delta_lambda of EFTbias, multiplied by mask array
|
||||
auto tmp = arrs->tmp.ref;
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (size_t i = startN0; i < startN0 + localN0; i++)
|
||||
for (size_t j = 0; j < N1; j++)
|
||||
for (size_t k = 0; k < N2; k++)
|
||||
tmp[i][j][k] = mu[i][j][k];
|
||||
sharpk_filter(tmp, arrs->sharpk_mu.ref, lambda);
|
||||
|
||||
// this is a 3D array now.
|
||||
auto sigma_0 = std::get<1>(tuple_data);
|
||||
|
||||
// only OK if sigma_0 has constant value everywhere
|
||||
// - take care of zero-width slabs
|
||||
double sigma0_value = localN0 ? sigma_0[startN0][0][0] : 1.;
|
||||
// debug output
|
||||
Console::instance().format<LOG_DEBUG>(
|
||||
"Sampling with sigma_0 = %g.", sigma0_value);
|
||||
|
||||
return b_va_fused<double>(
|
||||
std::bind(gen_sample<RandomGen>, std::ref(rgen), ph::_1, ph::_2),
|
||||
arrs->sharpk_mu.ref, std::move(sigma_0));
|
||||
}
|
||||
|
||||
template <typename DataArray, typename TupleLike, typename MaskArray>
|
||||
double log_probability(
|
||||
const DataArray &data, TupleLike tuple_data, MaskArray &&mask) {
|
||||
static_assert(
|
||||
std::tuple_size<TupleLike>::value == numberLikelihoodParams);
|
||||
|
||||
// The tuple from the bias model currently holds
|
||||
// mu=delta_lambda, sigma_0, prior(bias)
|
||||
auto mu = std::get<0>(
|
||||
tuple_data); // is delta_lambda of EFTbias, multiplied by mask array
|
||||
auto sigma_0 = std::get<1>(tuple_data);
|
||||
// this is prior on bias parameters:
|
||||
auto lnPprior = std::get<2>(tuple_data);
|
||||
|
||||
// obtain sharp-k filtered data, if we don't have them already
|
||||
if (!have_sharpk_data) {
|
||||
auto tmp = arrs->tmp.ref;
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (size_t i = startN0; i < startN0 + localN0; i++)
|
||||
for (size_t j = 0; j < N1; j++)
|
||||
for (size_t k = 0; k < N2; k++) {
|
||||
tmp[i][j][k] = data[i][j][k];
|
||||
}
|
||||
|
||||
sharpk_filter(tmp, arrs->sharpk_data.ref, lambda);
|
||||
|
||||
// divide data by mean to obtain \delta_g
|
||||
// - first, broadcast ctmpmean computed in sharpk_filter to all threads
|
||||
arrs->comm.broadcast_t(&ctmpmean, 1, 0);
|
||||
Console::instance().format<LOG_DEBUG>(
|
||||
"Mean of data: %.6e.", ctmpmean);
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (size_t i = startN0; i < startN0 + localN0; i++)
|
||||
for (size_t j = 0; j < N1; j++)
|
||||
for (size_t k = 0; k < N2; k++) {
|
||||
arrs->sharpk_data.ref[i][j][k] /= ctmpmean;
|
||||
}
|
||||
|
||||
have_sharpk_data = true;
|
||||
}
|
||||
|
||||
// Fourier-transform mu, apply sharp-k filter and Fourier-transform back
|
||||
// result is in sharpk_mu
|
||||
auto tmp = arrs->tmp.ref;
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (size_t i = startN0; i < startN0 + localN0; i++)
|
||||
for (size_t j = 0; j < N1; j++)
|
||||
for (size_t k = 0; k < N2; k++)
|
||||
tmp[i][j][k] = mu[i][j][k];
|
||||
// Nmodes is the number of modes that pass sharp-k filter;
|
||||
// needed for correct normalization
|
||||
size_t Nmodes_local = sharpk_filter(tmp, arrs->sharpk_mu.ref, lambda);
|
||||
|
||||
// Now evaluate likelihood, following gaussian.hpp
|
||||
|
||||
// I. Normalization
|
||||
// - note that this form of likelihood ONLY works for trivial mask
|
||||
// - desired _global_ normalization is
|
||||
// Nmodes * ln( N_g^3 sigma_0^2 )
|
||||
// - hence take Nmodes = N_modes(slab) here
|
||||
|
||||
// only OK if sigma_0 has constant value everywhere
|
||||
// - take care of zero-width slabs
|
||||
double sigma0_value = localN0 ? sigma_0[startN0][0][0] : 1.;
|
||||
double norm = -0.5 * double(Nmodes_local) *
|
||||
std::log(
|
||||
double(N0) * double(N1) * double(N2) * sigma0_value *
|
||||
sigma0_value);
|
||||
|
||||
// II. Likelihood proper
|
||||
// - compute chi2
|
||||
double chi2 = 0;
|
||||
size_t cntmask = 0; // count pixels after mask (for check)
|
||||
#pragma omp parallel for collapse(3) reduction(+ : chi2, cntmask)
|
||||
for (size_t i = startN0; i < startN0 + localN0; i++)
|
||||
for (size_t j = 0; j < N1; j++)
|
||||
for (size_t k = 0; k < N2; k++) {
|
||||
if (mask[i][j][k]) {
|
||||
chi2 += log_proba(
|
||||
arrs->sharpk_data.ref[i][j][k],
|
||||
arrs->sharpk_mu.ref[i][j][k], sigma_0[i][j][k]);
|
||||
cntmask++;
|
||||
}
|
||||
}
|
||||
|
||||
// convert from chi2 to ln(likelihood)
|
||||
chi2 *= -0.5;
|
||||
|
||||
// if in thread 0, add prior
|
||||
double lpp = 0.;
|
||||
if (arrs->comm.rank() == 0)
|
||||
lpp = lnPprior[0][0][0];
|
||||
|
||||
Console::instance().format<LOG_DEBUG>(
|
||||
"chi2 = %.7e, norm = %g, lnPprior = %.7e (sigma_0 = %g; Nmodes = "
|
||||
"%zu, startN0 = %zu, localN0 = %zu, cntmask = %zu)",
|
||||
chi2, norm, lpp, sigma0_value, Nmodes_local, startN0, localN0,
|
||||
cntmask);
|
||||
|
||||
return chi2 + norm + lpp;
|
||||
}
|
||||
|
||||
// Compute the gradient of the log probability, convention is that this function
|
||||
// accepts a tuple, the first element being the poisson intensity.
|
||||
// Other elements are discarded.
|
||||
// The gradient is written in the output array, which must have the same shape
|
||||
// as the input virtual arrays.
|
||||
// L(b_0, b_1, ...)
|
||||
// param is the i index in b_i.
|
||||
// tuple_data must have the adequate tuple size to account for all "b_i".
|
||||
template <typename DataArray, typename TupleLike, typename Mask>
|
||||
auto diff_log_probability(
|
||||
const DataArray &data, TupleLike tuple_data, const Mask &mask) {
|
||||
static_assert(
|
||||
std::tuple_size<TupleLike>::value == numberLikelihoodParams);
|
||||
|
||||
// The tuple from the bias model currently holds
|
||||
// mu=delta_lambda, sigma_0
|
||||
auto mu = std::get<0>(tuple_data); // is delta_lambda of EFTbias
|
||||
auto sigma_0 = std::get<1>(tuple_data);
|
||||
|
||||
auto tmp = arrs->tmp.ref;
|
||||
// apply sharp-k cut as in log_probability
|
||||
if (!have_sharpk_data) {
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (size_t i = startN0; i < startN0 + localN0; i++)
|
||||
for (size_t j = 0; j < N1; j++)
|
||||
for (size_t k = 0; k < N2; k++)
|
||||
tmp[i][j][k] = data[i][j][k];
|
||||
sharpk_filter(tmp, arrs->sharpk_data.ref, lambda);
|
||||
have_sharpk_data = true;
|
||||
}
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (size_t i = startN0; i < startN0 + localN0; i++)
|
||||
for (size_t j = 0; j < N1; j++)
|
||||
for (size_t k = 0; k < N2; k++)
|
||||
tmp[i][j][k] = mu[i][j][k];
|
||||
sharpk_filter(tmp, arrs->sharpk_mu.ref, lambda);
|
||||
|
||||
return std::make_tuple(b_va_fused<double>(
|
||||
&diff_log_proba, arrs->sharpk_data.ref, arrs->sharpk_mu.ref,
|
||||
std::move(sigma_0), mask));
|
||||
|
||||
// FS: notice that in general (in the presence of non-binary mask) we would also have
|
||||
// sharp-k filter the resulting array before returning; we will skip this for now
|
||||
}
|
||||
};
|
||||
} // namespace detail_EFT
|
||||
|
||||
using detail_EFT::EFTLikelihood;
|
||||
|
||||
} // namespace LibLSS
|
||||
|
||||
#endif
|
||||
// ARES TAG: authors_num = 3
|
||||
// ARES TAG: name(0) = Franz Elsner
|
||||
// ARES TAG: year(0) = 2018-2019
|
||||
// ARES TAG: email(0) =
|
||||
// ARES TAG: name(1) = Fabian Schmidt
|
||||
// ARES TAG: year(1) = 2019-2021
|
||||
// ARES TAG: email(1) = fabians@mpa-garching.mpg.de
|
||||
// ARES TAG: name(2) = Martin Reinecke
|
||||
// ARES TAG: year(2) = 2019-2021
|
||||
// ARES TAG: email(2) = martin@mpa-garching.mpg.de
|
536
extra/borg/libLSS/physics/likelihoods/eftmarg.hpp
Normal file
|
@@ -0,0 +1,536 @@
|
|||
/*+
|
||||
ARES/HADES/BORG Package -- ./extra/borg/libLSS/physics/likelihoods/eftmarg.hpp
|
||||
Copyright (C) 2019-2021 Fabian Schmidt <fabians@mpa-garching.mpg.de>
|
||||
Copyright (C) 2019-2021 Martin Reinecke <martin@mpa-garching.mpg.de>
|
||||
|
||||
Additional contributions from:
|
||||
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
|
||||
|
||||
+*/
|
||||
/*
|
||||
=================================================================================
|
||||
|
||||
Implements the /marginalized/ EFT likelihood as described in
|
||||
F. Elsner, F. Schmidt, J. Jasche, G. Lavaux and N.-M. Nguyen [1906.07143].
|
||||
G. Cabass and F. Schmidt, [1909.04022],
|
||||
however without higher-derivative stochasticity. This allows us to evaluate the
|
||||
likelihood in real space.
|
||||
|
||||
Preparation: Fourier transform (FT) data, apply sharp-k cut, inverse FT (IFT) back
|
||||
|
||||
Likelihood:
|
||||
1) Obtain individual bias fields from eft_bias_marg (in real space, AFTER sharp-k cut)
|
||||
2) Compute scalar products of these weighted by sigma (distributed memory)
|
||||
-- currently, sigma is constant; future: mask
|
||||
3) Evaluate marginalized Gaussian likelihood (only on root thread)
|
||||
-- Gaussian priors on bias parameters are included; obtained from
|
||||
"bias_prior_mean", "bias_prior_sigma" in "likelihood" section of ini
|
||||
file, as comma-separated float values
|
||||
|
||||
Gradient: same procedure as in likelihood.
|
||||
|
||||
Sampling: use EFTLikelihood version with hard-coded dummy bias parameter values
|
||||
|
||||
=================================================================================
|
||||
|
||||
This program is free software; you can redistribute it and/or modify it
|
||||
under the terms of either the CeCILL license or the GNU General Public
|
||||
license, as included with the software package.
|
||||
|
||||
The text of the license is located in Licence_CeCILL_V2.1-en.txt
|
||||
and GPL.txt in the root directory of the source package.
|
||||
|
||||
*/
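/*
  For reference, the quantities assembled in log_probability below implement the
  marginalization in closed form. Writing the bias operator fields as O_i(x), the
  data as delta_g(x) and

      A_ij = sum_x O_i(x) O_j(x) / sigma^2(x) + prior inverse variance (diagonal),
      B_i  = sum_x delta_g(x) O_i(x) / sigma^2(x) + prior mean * inverse variance,
      C    = sum_x delta_g(x)^2 / sigma^2(x),

  the marginalized pseudo-chi^2 is C - B^T A^{-1} B, to which the log-determinant
  of A and the sharp-k mode-count normalization are added.
*/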
|
||||
#ifndef __LIBLSS_EFT_LIKELIHOOD_MARG_HPP
|
||||
# define __LIBLSS_EFT_LIKELIHOOD_MARG_HPP
|
||||
|
||||
# include <gsl/gsl_sf_gamma.h>
|
||||
# include "libLSS/tools/fused_array.hpp"
|
||||
# include "libLSS/tools/console.hpp"
|
||||
# include "libLSS/tools/fused_reduce.hpp"
|
||||
# include "libLSS/tools/string_tools.hpp"
|
||||
# include "libLSS/tools/ptree_vectors.hpp"
|
||||
# include "libLSS/tools/mpi_fftw_helper.hpp"
|
||||
# include <CosmoTool/algo.hpp>
|
||||
# include "libLSS/physics/likelihoods/base.hpp"
|
||||
# include "libLSS/samplers/core/powerspec_tools.hpp"
|
||||
# include "eft.hpp"
|
||||
# include "Eigen/Dense"
|
||||
|
||||
namespace LibLSS {
|
||||
|
||||
namespace detail_EFT {
|
||||
|
||||
typedef size_t st;
|
||||
using Eigen::MatrixXd;
|
||||
using Eigen::VectorXd;
|
||||
// define which matrix decomposition to use:
|
||||
// Householder rank-revealing QR decomposition of a matrix with column-pivoting.
|
||||
// - need to replace function call 'determinant' with 'absDeterminant'
|
||||
typedef Eigen::ColPivHouseholderQR<MatrixXd> matrixDecomp;
|
||||
|
||||
class EFTMargLikelihood : public EFTLikelihood {
|
||||
// prior parameters as obtained from ini file (could be empty)
|
||||
std::vector<double> priormean, priorsigma;
|
||||
// bias prior mean, inverse variance, and normalization
|
||||
// - prepared in prepare_like (since we only know size of bias vector during runtime)
|
||||
std::vector<double> priorB, priorIC;
|
||||
double priornorm;
|
||||
double sigmaprior_mean, sigmaprior_IC;
|
||||
|
||||
std::vector<std::shared_ptr<myarr<U_Array>>> gradvec;
|
||||
|
||||
template <typename DataArray>
|
||||
void prepare_like(const DataArray &data, const st Nbias) {
|
||||
// obtain sharp-k filtered data, if we don't have them already
|
||||
if (!have_sharpk_data) {
|
||||
auto tmp = arrs->tmp.ref;
|
||||
# pragma omp parallel for collapse(3)
|
||||
for (size_t i = startN0; i < startN0 + localN0; i++)
|
||||
for (size_t j = 0; j < N1; j++)
|
||||
for (size_t k = 0; k < N2; k++)
|
||||
tmp[i][j][k] = data[i][j][k];
|
||||
sharpk_filter(tmp, arrs->sharpk_data.ref, lambda);
|
||||
|
||||
// divide data by mean to obtain \delta_g
|
||||
// - first, broadcast ctmpmean computed in sharpk_filter to all threads
|
||||
arrs->comm.broadcast_t(&ctmpmean, 1, 0);
|
||||
Console::instance().format<LOG_DEBUG>(
|
||||
"Mean of data: %.6e.", ctmpmean);
|
||||
# pragma omp parallel for collapse(3)
|
||||
for (size_t i = startN0; i < startN0 + localN0; i++)
|
||||
for (size_t j = 0; j < N1; j++)
|
||||
for (size_t k = 0; k < N2; k++) {
|
||||
arrs->sharpk_data.ref[i][j][k] /= ctmpmean;
|
||||
}
|
||||
|
||||
have_sharpk_data = true;
|
||||
}
|
||||
|
||||
// generate vectors with prior mean and inverse variance
|
||||
priorB.resize(Nbias);
|
||||
priorIC.resize(Nbias);
|
||||
priornorm = 0.;
|
||||
for (st i = 0; i < Nbias; ++i) {
|
||||
// inverse variance; 0 <==> uniform prior
|
||||
st ip = i + 1; // ignore prior on nmean
|
||||
double IC = (ip < priorsigma.size() && priorsigma[ip] > 0.)
|
||||
? 1. / (priorsigma[ip] * priorsigma[ip])
|
||||
: 0.;
|
||||
priorB[i] = ip < priormean.size() ? priormean[ip] * IC : 0.;
|
||||
priorIC[i] = IC;
|
||||
priornorm -= IC > 0. ? log(IC) : 0.;
|
||||
}
|
||||
|
||||
// in case there is a prior on sigma_0, assign values
|
||||
st ip = Nbias + 1;
|
||||
sigmaprior_mean = ip < priormean.size() ? priormean[ip] : 0.;
|
||||
sigmaprior_IC = (ip < priorsigma.size() && priorsigma[ip] > 0.)
|
||||
? 1. / (priorsigma[ip] * priorsigma[ip])
|
||||
: 0.;
|
||||
priornorm -= sigmaprior_IC > 0. ? log(sigmaprior_IC) : 0.;
|
||||
}
|
||||
|
||||
protected:
|
||||
// number of likelihood parameters: 2
|
||||
// - sigma0 field and vector of bias fields
|
||||
static constexpr size_t numberLikelihoodParams = 2;
|
||||
|
||||
// computes and returns \sum_x g(x) h(x) / sigma^2(x)
|
||||
// - this does an MPI reduce over all threads
|
||||
// - implement mask here in the future
|
||||
template <typename SigmaArray>
|
||||
double scalar_product(
|
||||
const U_ArrayRef &g, const U_ArrayRef &h,
|
||||
SigmaArray const &sigma) const {
|
||||
double S_local = 0.;
|
||||
# pragma omp parallel for collapse(3) reduction(+ : S_local)
|
||||
for (size_t i = startN0; i < startN0 + localN0; i++)
|
||||
for (size_t j = 0; j < N1; j++)
|
||||
for (size_t k = 0; k < N2; k++) {
|
||||
double s = sigma[i][j][k];
|
||||
if (!(s > 0.))
|
||||
continue;
|
||||
S_local += g[i][j][k] * h[i][j][k] / (s * s);
|
||||
}
|
||||
|
||||
// now MPI reduce (adjoint needs scalar product results in all threads,
|
||||
// hence all_reduce)
|
||||
double S = 0.;
|
||||
arrs->comm.all_reduce_t(&S_local, &S, 1, MPI_SUM);
|
||||
return S;
|
||||
}
|
||||
|
||||
// locally count modes that pass sharp-k cut limit, for normalization
|
||||
size_t mode_count(double limit) const {
|
||||
size_t cnt = 0;
|
||||
# pragma omp parallel for collapse(3) reduction(+ : cnt)
|
||||
for (size_t i = startN0; i < startN0 + localN0; i++)
|
||||
for (size_t j = 0; j < N1; j++)
|
||||
for (size_t k = 0; k < N2_HC; k++) {
|
||||
double kk[3] = {
|
||||
kmode(i, N0, L0), kmode(j, N1, L1), kmode(k, N2, L2)};
|
||||
|
||||
double ksquared = kk[0] * kk[0] + kk[1] * kk[1] + kk[2] * kk[2];
|
||||
if (ksquared > limit * limit || ksquared < 1.e-15)
|
||||
continue;
|
||||
|
||||
cnt += ((k == 0) || (k == N2_HC - 1)) ? 1 : 2;
|
||||
}
|
||||
|
||||
// return _local_ mode count
|
||||
return cnt;
|
||||
}
|
||||
|
||||
public:
|
||||
EFTMargLikelihood(LikelihoodInfo const &info = LikelihoodInfo())
|
||||
: EFTLikelihood(info) {
|
||||
ConsoleContext<LOG_DEBUG> ctx("EFTMargLikelihood constructor");
|
||||
|
||||
// set likelihood Lambda cut to kmax (if kmax > 0 and < lambda)
|
||||
double kmax = Likelihood::query<double>(info, "EFT_kmax");
|
||||
if (kmax > 0. && kmax < lambda)
|
||||
lambda = kmax;
|
||||
|
||||
// get prior mean, sigma as set of doubles (priormean, priorsigma),
|
||||
// if available
|
||||
std::string smean =
|
||||
Likelihood::query<std::string>(info, "bias_prior_mean");
|
||||
if (smean.length()) {
|
||||
auto bias_double = string_as_vector<double>(smean, ", ");
|
||||
priormean.resize(bias_double.size());
|
||||
std::copy(bias_double.begin(), bias_double.end(), priormean.begin());
|
||||
ctx.print(
|
||||
"EFTMargLikelihood: Set the bias prior mean to [" +
|
||||
to_string(priormean) + "]");
|
||||
}
|
||||
std::string ssigma =
|
||||
Likelihood::query<std::string>(info, "bias_prior_sigma");
|
||||
if (ssigma.length()) {
|
||||
auto bias_double = string_as_vector<double>(ssigma, ", ");
|
||||
priorsigma.resize(bias_double.size());
|
||||
std::copy(bias_double.begin(), bias_double.end(), priorsigma.begin());
|
||||
ctx.print(
|
||||
"EFTMargLikelihood: Set the bias prior sigma to [" +
|
||||
to_string(priorsigma) + "]");
|
||||
}
|
||||
}

    // sample: use return tuple from EFTBiasMarg, and assemble mean density field using
    // hard-coded default values
    template <typename RandomGen, typename TupleLike>
    auto sample(RandomGen &rgen, TupleLike tuple_data) {
      static_assert(
          std::tuple_size<TupleLike>::value == numberLikelihoodParams);

      // get data from EFTBiasMarg
      auto &gbias = std::get<1>(tuple_data);
      auto &gsigma = std::get<0>(tuple_data);

      // get sigma_0; only for debug output here
      // - take care of zero-width slabs
      double sigma0_value = localN0 ? gsigma[startN0][0][0] : 1.;

      // assemble bias field into tmp
      // - already sharp-k filtered
      // - we do not have access to the params of the bias model here,
      //   so we have to make do with our own numbers...

      Console::instance().format<LOG_DEBUG>(
          "INFO in EFTMargLikelihood::sample: sampling with "
          "fixed bias parameters; sigma_0 = %g.",
          sigma0_value);

      auto mu = arrs->sharpk_mu.ref;
# pragma omp parallel for collapse(3)
      for (size_t i = startN0; i < startN0 + localN0; i++)
        for (size_t j = 0; j < N1; j++)
          for (size_t k = 0; k < N2; k++) {
            mu[i][j][k] = 1.5 * (gbias[0])[i][j][k]    // delta
                          + 0.5 * (gbias[1])[i][j][k]  // delta^2
                          + 0.3 * (gbias[2])[i][j][k]  // K^2
                          + 20. * (gbias[3])[i][j][k]; // lapl delta
          }

      return b_va_fused<double>(
          std::bind(
              EFTLikelihood::gen_sample<RandomGen>, std::ref(rgen), ph::_1,
              ph::_2),
          arrs->sharpk_mu.ref, std::move(gsigma));
    }
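
    // Added clarification (assumption, not an original comment): the fused
    // virtual array returned above evaluates, voxel by voxel, roughly
    //   sample(x) = gen_sample(rgen, mu(x), sigma(x)),
    // i.e. a random draw around the assembled mean field mu with the local
    // noise amplitude sigma from EFTBiasMarg; the exact distribution is
    // whatever EFTLikelihood::gen_sample implements (presumably Gaussian).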

    template <typename DataArray, typename TupleLike, typename MaskArray>
    double log_probability(
        const DataArray &data, TupleLike tuple_data, MaskArray &&mask) {
      static_assert(
          std::tuple_size<TupleLike>::value == numberLikelihoodParams);

      // The return vector (tuple element 1) from EFTBiasMarg contains
      // 0: delta
      // 1: delta^2
      // 2: K^2
      // 3: lapl delta
      // 4: sigma
      // -- all in real space after sharp-k filter
      // -- worry about mask later

      // get data
      auto &gbias = std::get<1>(tuple_data);
      auto &gsigma = std::get<0>(tuple_data);
      const st Nbias = gbias.size();

      // prepare sharp-k data and priors
      prepare_like(data, Nbias);

      // now compute scalar products

      // construct A
      MatrixXd A(Nbias, Nbias);
      for (st i = 0; i < Nbias; i++) {
        for (st j = 0; j <= i; j++) {
          A(i, j) = scalar_product(gbias[i], gbias[j], gsigma);

          // add prior inverse variance to diagonal
          if (j == i)
            A(i, j) += priorIC[j];
        }
      }
      // other components through symmetry
      for (st i = 0; i < Nbias; i++) {
        for (st j = i + 1; j < Nbias; j++) {
          A(i, j) = A(j, i);
        }
      }

      // construct B (note no unmarginalized bias fields here)
      VectorXd B(Nbias);
      for (st i = 0; i < Nbias; i++) {
        B(i) = scalar_product(arrs->sharpk_data.ref, gbias[i], gsigma);
        // add prior
        B(i) += priorB[i];
      }

      // construct C (note no unmarginalized bias fields here)
      double C = scalar_product(
          arrs->sharpk_data.ref, arrs->sharpk_data.ref, gsigma);

      Console::instance().format<LOG_DEBUG>(
          "[%d] Done with scalar products; marginalizing over "
          "%zu bias parameters (A(0,0) = %.5e).",
          arrs->comm.rank(), Nbias, A(0, 0));
      // compute normalization
      // -- assume constant value for sigma_0 here; can be easily generalized to sum over
      //    sigma field
      st Nmodes_local = mode_count(lambda);
      double sigma0_value = 0., norm = 0.;
      if (Nmodes_local) { // take care of zero-width slabs
        sigma0_value = gsigma[startN0][0][0];
        norm = -0.5 * double(Nmodes_local) *
               (std::log(sigma0_value * sigma0_value) +
                std::log(double(N0) * double(N1) * double(N2)));
      }

      // the rest should only run on MPI rank 0 to avoid duplication over MPI tasks
      if (arrs->comm.rank() != 0) {
        Console::instance().format<LOG_DEBUG>(
            "[%d] (no chi2) norm = %g (sigma_0 = %g; Nmodes = %zu, "
            "startN0 = %zu, localN0 = %zu, Lambda(like) = %.4f)",
            arrs->comm.rank(), norm, sigma0_value, Nmodes_local,
            startN0, localN0, lambda);
        return norm;
      }

      // main part of pseudo-chi^2
      double chi2 = C;
      // - more efficient (and accurate?) solution via matrix decomposition
      // -- see above for selection of decomposition
      matrixDecomp AD(A);
      VectorXd X(Nbias);
      X = AD.solve(B);
      double QT = B.transpose() * X;
      chi2 -= QT;

      // add log-determinant of A
      // - using Cholesky decomposition;
      //   see https://gist.github.com/redpony/fc8a0db6b20f7b1a3f23
      // Eigen::LLT<MatrixXd> chol(A);
      // chi2 += 2. * chol.matrixL().toDenseMatrix().diagonal().array().log().sum();
      // - using the matrix decomposition itself (this seems to be more accurate numerically):
      // chi2 += log(fabs(AD.determinant()));
      chi2 += log(AD.absDeterminant());

      // add prior on sigma_0
      if (sigmaprior_IC > 0.) {
        double D = sigma0_value - sigmaprior_mean;
        chi2 += sigmaprior_IC * D * D;
      }

      // to conclude, add prior normalization (usually least relevant term)
      chi2 += priornorm;

      Console::instance().format<LOG_DEBUG>(
          "[%d] chi2 = %.7e, norm = %g, priornorm = %g (sigma_0 = %g; "
          "Nmodes = %zu, startN0 = %zu, localN0 = %zu)",
          arrs->comm.rank(), chi2, norm, priornorm, sigma0_value,
          Nmodes_local, startN0, localN0);

      // convert from chi^2 to ln P and add normalization
      return -0.5 * chi2 + norm;
    }
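
    // Added summary of the marginalization above (notation introduced here,
    // not in the original source): writing O_i for the sharp-k filtered bias
    // operators, d for the sharp-k filtered data, and taking scalar_product
    // as the 1/sigma^2-weighted inner product, the quantities computed are
    //   A_ij = <O_i, O_j> + prior inverse variance (diagonal)
    //   B_i  = <d, O_i>   + prior term
    //   C    = <d, d>
    // Marginalizing the Gaussian likelihood over the bias coefficients with
    // Gaussian priors then gives, up to constant terms,
    //   -2 ln P = C - B^T A^{-1} B + ln|det A| + sigma_0 prior + priornorm
    //             - 2 * norm,
    // which is exactly the combination returned as -0.5 * chi2 + norm.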

    // Compute the gradient of the log probability. The convention is that this
    // function accepts a tuple, the first element being the poisson intensity;
    // other elements are discarded.
    // The gradient is written in the output array, which must have the same shape
    // as the input virtual arrays.
    // L(b_0, b_1, ...)
    // param is the i index in b_i.
    // tuple_data must have the adequate tuple size to account for all "b_i".
    //
    // returns tuple which contains
    // 0: dlogL/ddelta
    // 1: dlogL/ddelta^2
    // 2: dlogL/dK^2
    // 3: dlogL/d(lapl delta)
    //
    // while debugging gradient:
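    //
    // Added derivation sketch (notation as in the note after log_probability,
    // not part of the original source): with X = A^{-1} B, the gradient of
    // ln P with respect to bias operator field O_ip at grid point x reads
    //   dlnP/dO_ip(x) = [ X_ip d(x)
    //                     - sum_j ( X_ip X_j + (A^{-1})_{ip,j} ) O_j(x) ]
    //                   / sigma(x)^2,
    // where the X_ip d term comes from B^T A^{-1} B, the X_ip X_j piece from
    // the dependence of A on O_ip, and the (A^{-1})_{ip,j} piece from the
    // log-determinant. "part 1" and "part 2" below implement exactly this.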
    template <typename DataArray, typename TupleLike, typename Mask>
    auto diff_log_probability(
        const DataArray &data, TupleLike tuple_data, const Mask &mask) {

      static_assert(
          std::tuple_size<TupleLike>::value == numberLikelihoodParams);

      // this is the return vector
      std::vector<U_ArrayRef> gradref;

      // =======
      // PREPARATION

      // The return tuple from EFTBiasMarg contains
      // 0: delta
      // 1: delta^2
      // 2: K^2
      // 3: lapl delta
      // 4: sigma
      // -- all in real space after sharp-k filter
      // -- worry about mask later

      // get data
      auto &gbias = std::get<1>(tuple_data);
      auto &gsigma = std::get<0>(tuple_data);
      const st Nbias = gbias.size();

      // prepare sharp-k data and priors
      prepare_like(data, Nbias);

      // allocate gradient arrays, if we don't have them already
      if (gradvec.size() < Nbias) {
        gradvec.clear();
        Console::instance().format<LOG_DEBUG>(
            "[%d] EFTMargLikelihood gradient: allocating %zu "
            "gradient arrays.",
            arrs->comm.rank(), Nbias);
        for (st I = 0; I < Nbias; I++) {
          gradvec.push_back(std::make_shared<myarr<U_Array>>(
              arrs->mgr.extents_real(), arrs->mgr.allocator_real));
        }
      }

      // now compute scalar products

      // construct A
      MatrixXd A(Nbias, Nbias);
      for (st i = 0; i < Nbias; i++) {
        for (st j = 0; j <= i; j++) {
          A(i, j) = scalar_product(gbias[i], gbias[j], gsigma);

          // add prior inverse variance to diagonal
          if (j == i)
            A(i, j) += priorIC[j];
        }
      }
      // other components through symmetry
      for (st i = 0; i < Nbias; i++) {
        for (st j = i + 1; j < Nbias; j++) {
          A(i, j) = A(j, i);
        }
      }

      // construct B
      VectorXd B(Nbias);
      for (st i = 0; i < Nbias; i++) {
        B(i) = scalar_product(arrs->sharpk_data.ref, gbias[i], gsigma);
        // add prior
        B(i) += priorB[i];
      }

      // no need for 'C' in gradient

      Console::instance().format<LOG_DEBUG>(
          "[%d] EFTMargLikelihood gradient: Done with scalar "
          "products; marginalizing over %zu bias parameters.",
          arrs->comm.rank(), Nbias);

      // now compute the constant matrix components we need
      // - note these need to be available on all MPI tasks
      matrixDecomp AD(A);
      VectorXd X(Nbias);
      X = AD.solve(B);
      MatrixXd Ainv(A.inverse()); // need this explicitly this time

      // GRADIENT PROPER

      // loop over bias fields for which to compute gradient
      for (st ip = 0; ip < Nbias; ip++) {
        auto gradp = gradvec[ip]->ref;

        // part 1 (assign propto data)
# pragma omp parallel for collapse(3)
        for (size_t i = startN0; i < startN0 + localN0; i++)
          for (size_t j = 0; j < N1; j++)
            for (size_t k = 0; k < N2; k++) {
              double s = gsigma[i][j][k];
              if (!(s > 0.))
                continue;
              double is2 = 1. / (s * s);
              gradp[i][j][k] = is2 * X(ip) * arrs->sharpk_data.ref[i][j][k];
            }

        // part 2 (add bias operators)
        for (st iO = 0; iO < Nbias; iO++) {
# pragma omp parallel for collapse(3)
          for (size_t i = startN0; i < startN0 + localN0; i++)
            for (size_t j = 0; j < N1; j++)
              for (size_t k = 0; k < N2; k++) {
                double s = gsigma[i][j][k];
                if (!(s > 0.))
                  continue;
                double is2 = 1. / (s * s);
                gradp[i][j][k] -= is2 * (X(ip) * X(iO) + Ainv(ip, iO)) *
                                  (gbias[iO])[i][j][k];
              }
        }

        gradref.push_back(gradp);
      } // end loop over bias

      // return tuple (first element actually not used; it is multiplied by gradient of
      // selector)
      return std::make_tuple(gsigma, gradref);
    }
  };
} // namespace detail_EFT

using detail_EFT::EFTMargLikelihood;

} // namespace LibLSS

#endif
// ARES TAG: authors_num = 2
// ARES TAG: name(0) = Fabian Schmidt
// ARES TAG: year(0) = 2019-2021
// ARES TAG: email(0) = fabians@mpa-garching.mpg.de
// ARES TAG: name(1) = Martin Reinecke
// ARES TAG: year(1) = 2019-2021
// ARES TAG: email(1) = martin@mpa-garching.mpg.de