Update cosmotool 2nd part

Guilhem Lavaux 2018-07-19 15:11:23 +03:00
parent 64e05fc180
commit 003bc39d4a
70 changed files with 8708 additions and 0 deletions

external/cosmotool/python/CMakeLists.txt vendored Normal file

@@ -0,0 +1,111 @@
set(CMAKE_SHARED_MODULE_PREFIX)
set(PYTHON_INCLUDES ${NUMPY_INCLUDE_DIRS} ${PYTHON_INCLUDE_PATH} ${CMAKE_SOURCE_DIR}/python)
include_directories(${CMAKE_SOURCE_DIR}/src ${CMAKE_BINARY_DIR}/src)
IF(CYTHON)
add_custom_command(
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/_cosmotool.cpp
COMMAND ${CYTHON} --cplus -o ${CMAKE_CURRENT_BINARY_DIR}/_cosmotool.cpp ${CMAKE_CURRENT_SOURCE_DIR}/_cosmotool.pyx
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/_cosmotool.pyx)
add_custom_command(
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/_cosmo_power.cpp
COMMAND ${CYTHON} --cplus -o ${CMAKE_CURRENT_BINARY_DIR}/_cosmo_power.cpp ${CMAKE_CURRENT_SOURCE_DIR}/_cosmo_power.pyx
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/_cosmo_power.pyx)
add_custom_command(
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/_fast_interp.cpp
COMMAND ${CYTHON} --cplus -o ${CMAKE_CURRENT_BINARY_DIR}/_fast_interp.cpp ${CMAKE_CURRENT_SOURCE_DIR}/_fast_interp.pyx
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/_fast_interp.pyx)
add_custom_command(
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/_cosmo_cic.cpp
COMMAND ${CYTHON} --cplus -o ${CMAKE_CURRENT_BINARY_DIR}/_cosmo_cic.cpp ${CMAKE_CURRENT_SOURCE_DIR}/_cosmo_cic.pyx
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/_cosmo_cic.pyx)
add_custom_command(
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/_project.cpp
COMMAND ${CYTHON} --cplus -o ${CMAKE_CURRENT_BINARY_DIR}/_project.cpp ${CMAKE_CURRENT_SOURCE_DIR}/_project.pyx
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/_project.pyx ${CMAKE_CURRENT_SOURCE_DIR}/project_tool.hpp )
ENDIF(CYTHON)
add_library(_cosmotool MODULE ${CMAKE_CURRENT_BINARY_DIR}/_cosmotool.cpp)
add_library(_cosmo_power MODULE ${CMAKE_CURRENT_BINARY_DIR}/_cosmo_power.cpp)
add_library(_cosmo_cic MODULE ${CMAKE_CURRENT_BINARY_DIR}/_cosmo_cic.cpp)
add_library(_fast_interp MODULE ${CMAKE_CURRENT_BINARY_DIR}/_fast_interp.cpp)
add_library(_project MODULE ${CMAKE_CURRENT_BINARY_DIR}/_project.cpp)
target_include_directories(_cosmotool PRIVATE ${PYTHON_INCLUDES})
target_include_directories(_cosmo_power PRIVATE ${PYTHON_INCLUDES})
target_include_directories(_cosmo_cic PRIVATE ${PYTHON_INCLUDES})
target_include_directories(_fast_interp PRIVATE ${PYTHON_INCLUDES})
target_include_directories(_project PRIVATE ${PYTHON_INCLUDES})
SET(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -Bsymbolic-functions")
if(APPLE)
set(CMAKE_MODULE_LINKER_FLAGS "-undefined dynamic_lookup")
endif()
target_link_libraries(_cosmotool ${CosmoTool_local} ${PYTHON_LIBRARIES} ${GSL_LIBRARIES})
target_link_libraries(_cosmo_power ${CosmoTool_local} ${PYTHON_LIBRARIES} ${GSL_LIBRARIES})
target_link_libraries(_cosmo_cic ${CosmoTool_local} ${PYTHON_LIBRARIES} ${GSL_LIBRARIES})
target_link_libraries(_project ${PYTHON_LIBRARIES})
target_link_libraries(_fast_interp ${CosmoTool_local} ${PYTHON_LIBRARIES})
SET(ct_TARGETS _cosmotool _project _cosmo_power _cosmo_cic _fast_interp )
if (Boost_FOUND)
message(STATUS "Building bispectrum support (path = ${Boost_INCLUDE_DIRS})")
include_directories(${Boost_INCLUDE_DIRS})
add_library(_cosmo_bispectrum MODULE _cosmo_bispectrum.cpp)
target_link_libraries(_cosmo_bispectrum ${MATH_LIBRARY})
if(ENABLE_OPENMP)
set_target_properties(_cosmo_bispectrum PROPERTIES COMPILE_FLAGS "${OpenMP_CXX_FLAGS}" LINK_FLAGS "${OpenMP_CXX_FLAGS}")
endif()
if (Boost_DEP)
add_dependencies(_cosmo_bispectrum ${Boost_DEP})
endif()
SET(ct_TARGETS ${ct_TARGETS} _cosmo_bispectrum)
endif()
# Discover where to put packages
if (NOT PYTHON_SITE_PACKAGES)
execute_process (COMMAND ${PYTHON_EXECUTABLE} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())" OUTPUT_VARIABLE internal_PYTHON_SITE_PACKAGES OUTPUT_STRIP_TRAILING_WHITESPACE)
SET(SYSTEM_PYTHON_SITE_PACKAGES ${internal_PYTHON_SITE_PACKAGES} CACHE PATH "Path to the target system-wide site-package where to install python modules")
execute_process (COMMAND ${PYTHON_EXECUTABLE} -c "from site import USER_SITE; print(USER_SITE)" OUTPUT_VARIABLE internal_PYTHON_SITE_PACKAGES OUTPUT_STRIP_TRAILING_WHITESPACE)
SET(USER_PYTHON_SITE_PACKAGES ${internal_PYTHON_SITE_PACKAGES} CACHE PATH "Path to the target user site-package where to install python modules")
mark_as_advanced(USER_PYTHON_SITE_PACKAGES SYSTEM_PYTHON_SITE_PACKAGES)
endif (NOT PYTHON_SITE_PACKAGES)
message(STATUS "System python site: ${SYSTEM_PYTHON_SITE_PACKAGES}")
message(STATUS "User python site: ${USER_PYTHON_SITE_PACKAGES}")
OPTION(INSTALL_PYTHON_LOCAL "Install python modules in the user site-packages instead of the system site-packages" OFF)
IF (NOT INSTALL_PYTHON_LOCAL)
SET(PYTHON_SITE_PACKAGES ${SYSTEM_PYTHON_SITE_PACKAGES})
ELSE (NOT INSTALL_PYTHON_LOCAL)
SET(PYTHON_SITE_PACKAGES ${USER_PYTHON_SITE_PACKAGES})
ENDIF(NOT INSTALL_PYTHON_LOCAL)
cmessage(STATUS "Python install location: ${PYTHON_SITE_PACKAGES}")
if (WIN32 AND NOT CYGWIN)
SET_TARGET_PROPERTIES(_cosmotool PROPERTIES SUFFIX ".pyd")
endif (WIN32 AND NOT CYGWIN)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cosmotool/config.py.in ${CMAKE_CURRENT_BINARY_DIR}/cosmotool/config.py @ONLY)
INSTALL(TARGETS
${ct_TARGETS}
LIBRARY DESTINATION ${PYTHON_SITE_PACKAGES}/cosmotool
)
INSTALL(DIRECTORY cosmotool ${CMAKE_CURRENT_BINARY_DIR}/cosmotool DESTINATION ${PYTHON_SITE_PACKAGES}
FILES_MATCHING PATTERN "*.py")


@@ -0,0 +1,278 @@
#ifdef _OPENMP
#include <omp.h>
#endif
#include <iostream>
#include <boost/format.hpp>
#include <boost/multi_array.hpp>
#include <sys/types.h>
#include <cmath>
#include "symbol_visible.hpp"
#include "algo.hpp"
using std::cout;
using std::endl;
using boost::format;
using CosmoTool::square;
struct ModeSet
{
ssize_t N1, N2, N3;
bool half_copy;
struct TriangleIterator
{
ssize_t i1, i2, i3;
ssize_t N1, N2, N3;
ssize_t first_iteration;
TriangleIterator& operator++() {
i3++;
if (i3==(N3/2+1)) { i3 = first_iteration; i2++; }
if (i2==(N2/2+1)) { i2 = -N2/2; i1++; }
return *this;
}
bool operator!=(const TriangleIterator& t) const {
return i1!=t.i1 || i2!=t.i2 || i3 != t.i3;
}
bool in_box() const {
ssize_t hN1 = N1/2, hN2 = N2/2, hN3 = N3/2;
return (i1 >= -hN1) && (i1 <= hN1) && (i2 >= -hN2) && (i2 <= hN2) && (i3 >= -hN3) && (i3 <= hN3);
}
TriangleIterator operator+(const TriangleIterator& other_t) const {
TriangleIterator t = *this;
t.i1 = (t.i1+other_t.i1);
t.i2 = (t.i2+other_t.i2);
t.i3 = (t.i3+other_t.i3);
return t;
}
TriangleIterator& inp_array() {
if (i1 < 0)
i1 += N1;
if (i2 < 0)
i2 += N2;
if (i3 < 0)
i3 += N3;
return *this;
}
TriangleIterator array() const {
TriangleIterator t = *this;
t.inp_array();
return t;
}
TriangleIterator real() const {
TriangleIterator t = *this;
if (t.i1 >= N1/2)
t.i1 -= N1;
if (t.i2 >= N2/2)
t.i2 -= N2;
if (t.i3 >= N3/2)
t.i3 -= N3;
return t;
}
double norm() const {
double r1 = i1, r2 = i2, r3 = i3;
return std::sqrt(r1*r1 + r2*r2 + r3*r3);
}
void reverse() { i1=-i1; i2=-i2; i3=-i3; }
TriangleIterator& operator*() { return *this; }
};
ModeSet(size_t N1_, size_t N2_, size_t N3_, bool _half_copy = false)
: N1(N1_), N2(N2_), N3(N3_),half_copy(_half_copy) {
}
TriangleIterator begin() const {
TriangleIterator t;
t.i1 = -N1/2;
t.i2 = -N2/2;
if (half_copy)
t.first_iteration = t.i3 = 0;
else
t.first_iteration = t.i3 = -N3/2;
t.N1 = N1;
t.N2 = N2;
t.N3 = N3;
return t;
}
TriangleIterator end() const {
TriangleIterator t;
t.first_iteration = (half_copy ? 0 : (-N3/2));
t.i3 = t.first_iteration;
t.i2 = -N2/2;
t.i1 = N1/2+1;
t.N1 = N1;
t.N2 = N2;
t.N3 = N3;
return t;
}
};
std::ostream& operator<<(std::ostream& o, const ModeSet::TriangleIterator& t)
{
o << t.i1 << "," << t.i2 << "," << t.i3;
return o;
}
template<typename T>
static T no_conj(const T& a) { return a; }
template<typename SubArrayB,typename SubArrayCnt,typename Delta>
static inline void accum_bispec(const Delta& delta_mirror, SubArrayB b_Nt, SubArrayCnt b_B,
const typename Delta::element& v1, const typename Delta::element& v2,
const ModeSet::TriangleIterator& rm1,
const ModeSet::TriangleIterator& rm2,
const ModeSet::TriangleIterator& rm3,
double delta_k,
size_t Nk)
{
typedef std::complex<double> CType;
size_t q1 = std::floor(rm1.norm()/delta_k);
if (q1 >= Nk)
return;
size_t q2 = std::floor(rm2.norm()/delta_k);
if (q2 >= Nk)
return;
size_t q3 = std::floor(rm3.norm()/delta_k);
if (q3 >= Nk)
return;
CType prod = v1*v2;
ModeSet::TriangleIterator m3 = rm3;
// We use hermitian symmetry to get -m3: it is just the mode m3, conjugated.
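// For a real density field, delta_hat(-k) = conj(delta_hat(k)); delta_mirror holds the
// hermitian-extended cube, so reading the reversed index yields the conjugated mode directly.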
m3.reverse();
m3.inp_array();
prod *= delta_mirror[m3.i1][m3.i2][m3.i3];
b_Nt[q1][q2][q3] ++;
b_B[q1][q2][q3] += prod;
}
extern "C" CTOOL_DLL_PUBLIC
void CosmoTool_compute_bispectrum(
double *delta_hat, size_t Nx, size_t Ny, size_t Nz,
size_t *Ntriangles,
double* B, double delta_k, size_t Nk )
{
// First remap to multi_array for easy access
size_t kNz = Nz/2+1;
#ifdef _OPENMP
int Ntasks = omp_get_max_threads();
#else
int Ntasks = 1;
#endif
boost::multi_array_ref<std::complex<double>, 3> a_delta(reinterpret_cast<std::complex<double>*>(delta_hat), boost::extents[Nx][Ny][kNz]);
boost::multi_array_ref<size_t, 3> a_Nt(Ntriangles, boost::extents[Nk][Nk][Nk]);
boost::multi_array_ref<std::complex<double>, 3> a_B(reinterpret_cast<std::complex<double>*>(B), boost::extents[Nk][Nk][Nk]);
boost::multi_array<std::complex<double>, 4> b_B(boost::extents[Ntasks][Nk][Nk][Nk]);
boost::multi_array<size_t, 4> b_Nt(boost::extents[Ntasks][Nk][Nk][Nk]);
typedef std::complex<double> CType;
boost::multi_array<std::complex<double>, 3> delta_mirror(boost::extents[Nx][Ny][Nz]);
// Add hermiticity
for (auto m : ModeSet(Nx, Ny, Nz, true)) {
auto n1 = m;
auto n2 = m.array();
n1.reverse();
n1.inp_array();
delta_mirror[n2.i1][n2.i2][n2.i3] = (a_delta[n2.i1][n2.i2][n2.i3]);
delta_mirror[n1.i1][n1.i2][n1.i3] = std::conj(delta_mirror[n2.i1][n2.i2][n2.i3]);
}
#ifdef _OPENMP
// First loop over m1
#pragma omp parallel
{
#pragma omp single
{
for (auto m1 : ModeSet(Nx, Ny, Nz)) {
auto am1 = m1.array();
CType v1 = delta_mirror[am1.i1][am1.i2][am1.i3];
int tid = omp_get_thread_num();
#pragma omp task
{
auto rm1 = m1.real();
// Second mode m2
for (auto m2 : ModeSet(Nx, Ny, Nz)) {
// Now derive m3
auto am2 = m2.array();
auto m3 = (m1+m2);
CType v2 = delta_mirror[am2.i1][am2.i2][am2.i3];
// Not in Fourier box, stop here
if (!m3.in_box())
continue;
accum_bispec(delta_mirror, b_Nt[tid], b_B[tid], v1, v2, m1, m2, m3, delta_k, Nk);
}
}
}
}
}
#pragma omp taskwait
for (int tid = 0; tid < Ntasks; tid++) {
size_t *b_p = b_Nt[tid].origin();
size_t *a_p = a_Nt.data();
std::complex<double> *b_B_p = b_B[tid].origin();
std::complex<double> *a_B_p = a_B.origin();
//#pragma omp simd
#pragma omp parallel for
for (size_t q = 0; q < Nk*Nk*Nk; q++) {
a_p[q] += b_p[q];
a_B_p[q] += b_B_p[q];
}
}
#else
#warning Serial version not implemented
#endif
}
extern "C" CTOOL_DLL_PUBLIC
void CosmoTool_compute_powerspectrum(
double *delta_hat, size_t Nx, size_t Ny, size_t Nz,
size_t *Ncounts,
double* P, double delta_k, size_t Nk )
{
// First remap to multi_array for easy access
size_t kNz = Nz/2+1;
boost::multi_array_ref<std::complex<double>, 3> a_delta(reinterpret_cast<std::complex<double>*>(delta_hat), boost::extents[Nx][Ny][kNz]);
boost::multi_array_ref<size_t, 1> a_Nc(Ncounts, boost::extents[Nk]);
boost::multi_array_ref<double, 1> a_P(reinterpret_cast<double*>(P), boost::extents[Nk]);
typedef std::complex<double> CType;
// First loop over m1
for (auto m : ModeSet(Nx, Ny, kNz)) {
auto m1 = m.array();
CType& v1 = a_delta[m1.i1][m1.i2][m1.i3];
size_t q1 = std::floor(m.norm()/delta_k);
if (q1 >= Nk)
continue;
a_Nc[q1] ++;
a_P[q1] += std::norm(v1);
}
}


@@ -0,0 +1,76 @@
from libcpp cimport bool
from libcpp cimport string as cppstring
from libcpp cimport vector as cppvector
import numpy as np
cimport numpy as np
from cpython cimport PyObject, Py_INCREF
cimport cython
np.import_array()
cdef extern from "cic.hpp" namespace "CosmoTool":
ctypedef float CICType
ctypedef float Coordinates[3]
cdef cppclass CICParticles:
float mass
Coordinates coords
cdef cppclass CICFilter:
CICFilter(np.uint32_t resolution, double L) nogil
void resetMesh() nogil
void putParticles(CICParticles* particles, np.uint32_t N) nogil
void getDensityField(CICType *& field, np.uint32_t& res) nogil
@cython.boundscheck(False)
@cython.cdivision(True)
@cython.wraparound(False)
def leanCic(float[:,:] particles, float L, int Resolution):
cdef cppvector.vector[CICParticles] *p = NULL
cdef CICFilter *cic
cdef np.uint64_t i
cdef CICType *field
cdef np.uint32_t dummyRes
cdef np.ndarray[np.float64_t, ndim=3] out_field
cdef np.ndarray[np.float64_t, ndim=1] out_field0
cdef np.float64_t[:] out_field_buf
cdef np.uint64_t j
cic = new CICFilter(Resolution, L)
print("Reset mesh")
cic.resetMesh()
if particles.shape[1] != 3:
raise ValueError("Particles must be Nx3 array")
print("Inserting particles")
# p = new cppvector.vector[CICParticles](particles.shape[0])
# for i in xrange(particles.shape[0]):
# *p[i].mass = 1
# *p[i].coords[0] = particles[i,0]
# *p[i].coords[1] = particles[i,1]
# *p[i].coords[2] = particles[i,2]
# cic.putParticles(&p[0], particles.shape[0])
del p
print("Done")
field = <CICType*>0
dummyRes = 0
cic.getDensityField(field, dummyRes)
print("Got to allocate a numpy %dx%dx%d" % (dummyRes, dummyRes,dummyRes))
out_field = np.empty((dummyRes, dummyRes, dummyRes), dtype=np.float64)
out_field0 = out_field.reshape(out_field.size)
out_field_buf = out_field0
print("Copy")
for j in xrange(out_field_buf.size):
out_field_buf[j] = field[j]
del cic
return out_field


@@ -0,0 +1,153 @@
from libcpp cimport bool
from libcpp cimport string as cppstring
import numpy as np
cimport numpy as np
from cpython cimport PyObject, Py_INCREF
cimport cython
np.import_array()
cdef extern from "cosmopower.hpp" namespace "CosmoTool":
cdef enum CosmoFunction "CosmoTool::CosmoPower::CosmoFunction":
POWER_EFSTATHIOU "CosmoTool::CosmoPower::POWER_EFSTATHIOU",
HU_WIGGLES "CosmoTool::CosmoPower::HU_WIGGLES",
HU_BARYON "CosmoTool::CosmoPower::HU_BARYON",
OLD_POWERSPECTRUM,
POWER_BARDEEN "CosmoTool::CosmoPower::POWER_BARDEEN",
POWER_SUGIYAMA "CosmoTool::CosmoPower::POWER_SUGIYAMA",
POWER_BDM,
POWER_TEST,
HU_WIGGLES_ORIGINAL "CosmoTool::CosmoPower::HU_WIGGLES_ORIGINAL"
cdef cppclass CosmoPower:
double n
double K0
double V_LG_CMB
double CMB_VECTOR[3]
double h
double SIGMA8
double OMEGA_B
double OMEGA_C
double omega_B
double omega_C
double Theta_27
double OMEGA_0
double Omega
double beta
double OmegaEff
double Gamma0
double normPower
CosmoPower()
void setFunction(CosmoFunction)
void updateCosmology()
void updatePhysicalCosmology()
void normalize(double,double)
void setNormalization(double)
double power(double)
cdef class CosmologyPower:
"""CosmologyPower(**cosmo)
CosmologyPower manages and computes power spectra according to different
approximations given in the literature.
Keyword arguments:
omega_B_0 (float): relative baryon density
omega_M_0 (float): relative matter density
h (float): Hubble constant relative to 100 km/s/Mpc
ns (float): power law of the large scale inflation spectrum
"""
cdef CosmoPower power
def __init__(self,**cosmo):
self.power = CosmoPower()
self.power.OMEGA_B = cosmo['omega_B_0']
self.power.OMEGA_C = cosmo['omega_M_0']-cosmo['omega_B_0']
self.power.h = cosmo['h']
if 'ns' in cosmo:
self.power.n = cosmo['ns']
assert self.power.OMEGA_C > 0
self.power.updateCosmology()
def setNormalization(self,A):
self.power.setNormalization(A)
def normalize(self,s8,k_min=-1,k_max=-1):
"""normalize(self, sigma8)
Compute the normalization of the power spectrum using sigma8.
Arguments:
sigma8 (float): standard deviation of density field smoothed at 8 Mpc/h
"""
self.power.SIGMA8 = s8
self.power.normalize(k_min, k_max)
def setFunction(self,funcname):
"""setFunction(self, funcname)
Choose an approximation to use for the computation of the power spectrum
Arguments:
funcname (str): the name of the approximation. It can be either
EFSTATHIOU, HU_WIGGLES, HU_BARYON, BARDEEN or SUGIYAMA.
"""
cdef CosmoFunction f
f = POWER_EFSTATHIOU
if funcname=='EFSTATHIOU':
f = POWER_EFSTATHIOU
elif funcname=='HU_WIGGLES':
f = HU_WIGGLES
elif funcname=='HU_BARYON':
f = HU_BARYON
elif funcname=='BARDEEN':
f = POWER_BARDEEN
elif funcname=='SUGIYAMA':
f = POWER_SUGIYAMA
elif funcname=='HU_WIGGLES_ORIGINAL':
f = HU_WIGGLES_ORIGINAL
else:
raise ValueError("Unknown function name " + funcname)
self.power.setFunction(f)
cdef double _compute(self, double k):
k *= self.power.h
return self.power.power(k) * self.power.h**3
def compute(self, k):
"""compute(self, k)
Compute the power spectrum for the mode of norm k.
Arguments:
k (float): Mode for which to evaluate the power spectrum.
It can be a scalar or a numpy array.
The units must be in 'h Mpc^{-1}'.
Returns:
a scalar or a numpy array depending on the type of the k argument
"""
cdef np.ndarray out
cdef double kval
cdef tuple i
if isinstance(k, np.ndarray):
out = np.empty(k.shape, dtype=np.float64)
for i,kval in np.ndenumerate(k):
out[i] = self._compute(kval)
return out
else:
return self._compute(k)
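# Usage sketch (hypothetical cosmology values; assumes the compiled module is importable,
# e.g. through the cosmotool package which re-exports CosmologyPower):
import numpy as np
cosmo = CosmologyPower(omega_B_0=0.049, omega_M_0=0.31, h=0.68, ns=0.96)
cosmo.setFunction('HU_WIGGLES')
cosmo.normalize(0.83)                 # sigma8
k = np.logspace(-3, 1, 100)           # modes in h/Mpc
Pk = cosmo.compute(k)                 # P(k) evaluated at each mode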

external/cosmotool/python/_cosmotool.pyx vendored Normal file

@@ -0,0 +1,560 @@
from libcpp cimport bool
from libcpp cimport string as cppstring
from libcpp.vector cimport vector as cppvector
from cython.parallel cimport prange
import numpy as np
cimport numpy as np
from cpython cimport PyObject, Py_INCREF
cimport cython
np.import_array()
cdef extern from "sys/types.h":
ctypedef np.int64_t int64_t
cdef extern from "loadSimu.hpp" namespace "CosmoTool":
cdef cppclass SimuData:
np.float_t BoxSize
np.float_t time
np.float_t Hubble
np.float_t Omega_M
np.float_t Omega_Lambda
np.int64_t TotalNumPart
np.int64_t NumPart
int64_t *Id
float *Pos[3]
float *Vel[3]
int *type
float *Mass
bool noAuto
cdef const int NEED_GADGET_ID
cdef const int NEED_POSITION
cdef const int NEED_VELOCITY
cdef const int NEED_TYPE
cdef const int NEED_MASS
cdef extern from "loadGadget.hpp" namespace "CosmoTool":
SimuData *loadGadgetMulti(const char *fname, int id, int flags, int gformat) nogil except +
void cxx_writeGadget "CosmoTool::writeGadget" (const char * s, SimuData *data) except +
cdef extern from "safe_gadget.hpp":
SimuData *loadGadgetMulti_safe(cppstring.string s, int flags, int gformat) nogil
SimuData **alloc_simudata(int num) nogil
void del_simudata(SimuData **d) nogil
cdef extern from "cppHelper.hpp":
int customCosmotoolHandler() nogil
cdef extern from "loadRamses.hpp" namespace "CosmoTool":
SimuData *loadRamsesSimu(const char *basename, int id, int cpuid, bool dp, int flags) except +customCosmotoolHandler
class PySimulationBase(object):
"""
This is the base class used to represent a simulation in CosmoTool/python.
"""
def getPositions(self):
"""
getPositions(self)
Returns:
A list of three arrays holding the positions of the particles.
The i-th element is the i-th coordinate of each particle.
It may be None if the positions were not requested.
"""
raise NotImplementedError("getPositions is not implemented")
def getVelocities(self):
"""
getVelocities(self)
Returns:
A list of three arrays holding the velocities of the particles.
The i-th element is the i-th coordinate of each particle.
It may be None if the velocities were not requested.
"""
raise NotImplementedError("getVelocities is not implemented")
def getIdentifiers(self):
"""
getIdentifiers(self)
Returns:
Returns an integer array that holds the unique identifiers of
each particle.
It may be None if the identifiers were not requested.
"""
raise NotImplementedError("getIdentifiers is not implemented")
def getTypes(self):
"""
getTypes(self)
Returns:
Returns an integer array that holds the type of
each particle.
It may be None if the types were not requested.
"""
raise NotImplementedError("getTypes is not implemented")
def getOmega_M(self):
"""
getOmega_M(self)
Returns:
the mean matter density in the simulation, with respect
to the critical density.
"""
raise NotImplementedError("getOmega_M is not implemented")
def getOmega_Lambda(self):
"""
getOmega_Lambda(self)
Returns:
the mean dark energy density in the simulation, with respect
to the critical density.
"""
raise NotImplementedError("getOmega_Lambda is not implemented")
def getTime(self):
"""
getTime(self)
Returns:
the time the snapshot was taken in the simulation. It can
have various units depending on the file format.
"""
raise NotImplementedError("getTime is not implemented")
def getHubble(self):
"""
getHubble(self)
Returns:
the Hubble constant in units of 100 km/s/Mpc
"""
raise NotImplementedError("getHubble is not implemented")
def getBoxsize(self):
"""
getBoxsize(self)
Returns:
the size of the simulation box. The length unit is not fixed,
though it is customary to have it in Mpc/h if the loader has
access to the unit normalization.
"""
raise NotImplementedError("getBoxsize is not implemented")
def getMasses(self):
"""
getMasses(self)
Returns:
an array with the mass of each particle, in an unspecified unit that
depends on the loader.
"""
raise NotImplementedError("getMasses is not implemented")
cdef class Simulation:
"""
Simulation()
Class that directly manages internal loaded data obtained from a loader
"""
cdef list positions
cdef list velocities
cdef object identifiers
cdef object types
cdef object masses
cdef SimuData *data
property BoxSize:
def __get__(Simulation self):
return self.data.BoxSize
property time:
def __get__(Simulation self):
return self.data.time
property Hubble:
def __get__(Simulation self):
return self.data.Hubble
property Omega_M:
def __get__(Simulation self):
return self.data.Omega_M
property Omega_Lambda:
def __get__(Simulation self):
return self.data.Omega_Lambda
property positions:
def __get__(Simulation self):
return self.positions
property velocities:
def __get__(Simulation self):
return self.velocities
property identifiers:
def __get__(Simulation self):
return self.identifiers
property types:
def __get__(Simulation self):
return self.types
property masses:
def __get__(Simulation self):
return self.masses
property numParticles:
def __get__(Simulation self):
return self.data.NumPart
property totalNumParticles:
def __get__(Simulation self):
return self.data.TotalNumPart
def __cinit__(Simulation self):
self.data = <SimuData *>0
def __dealloc__(Simulation self):
if self.data != <SimuData *>0:
del self.data
class PySimulationAdaptor(PySimulationBase):
"""
PySimulationAdaptor(PySimulationBase_)
This class is an adaptor for an internal type to the loader. It defines
all the methods of PySimulationBase.
Attributes:
simu: a Simulation_ object
"""
def __init__(self,sim):
self.simu = sim
def getBoxsize(self):
return self.simu.BoxSize
def getPositions(self):
return self.simu.positions
def getTypes(self):
return self.simu.types
def getVelocities(self):
return self.simu.velocities
def getIdentifiers(self):
return self.simu.identifiers
def getTime(self):
return self.simu.time
def getHubble(self):
return self.simu.Hubble
def getOmega_M(self):
return self.simu.Omega_M
def getOmega_Lambda(self):
return self.simu.Omega_Lambda
def getMasses(self):
return self.simu.masses
cdef class ArrayWrapper:
cdef void* data_ptr
cdef np.uint64_t size
cdef int type_array
cdef set_data(self, np.uint64_t size, int type_array, void* data_ptr):
""" Set the data of the array
This cannot be done in the constructor as it must receive C-level
arguments.
Args:
size (int): Length of the array.
data_ptr (void*): Pointer to the data
"""
self.data_ptr = data_ptr
self.size = size
self.type_array = type_array
def __array__(self):
""" Here we use the __array__ method, that is called when numpy
tries to get an array from the object."""
cdef np.npy_intp shape[1]
shape[0] = <np.npy_intp> self.size
# Create a 1D array, of length 'size'
ndarray = np.PyArray_SimpleNewFromData(1, shape, self.type_array, self.data_ptr)
return ndarray
def __dealloc__(self):
""" Frees the array. This is called by Python when all the
references to the object are gone. """
pass
cdef object wrap_array(void *p, np.uint64_t s, int typ):
cdef np.ndarray ndarray
cdef ArrayWrapper wrapper
wrapper = ArrayWrapper()
wrapper.set_data(s, typ, p)
ndarray = np.array(wrapper, copy=False)
ndarray.base = <PyObject*> wrapper
Py_INCREF(wrapper)
return ndarray
cdef object wrap_float_array(float *p, np.uint64_t s):
return wrap_array(<void *>p, s, np.NPY_FLOAT32)
cdef object wrap_int64_array(int64_t* p, np.uint64_t s):
return wrap_array(<void *>p, s, np.NPY_INT64)
cdef object wrap_int_array(int* p, np.uint64_t s):
return wrap_array(<void *>p, s, np.NPY_INT)
cdef object wrap_simudata(SimuData *data, int flags):
cdef Simulation simu
simu = Simulation()
simu.data = data
if flags & NEED_POSITION:
simu.positions = [wrap_float_array(data.Pos[i], data.NumPart) for i in xrange(3)]
else:
simu.positions = None
if flags & NEED_VELOCITY:
simu.velocities = [wrap_float_array(data.Vel[i], data.NumPart) for i in xrange(3)]
else:
simu.velocities = None
if flags & NEED_GADGET_ID:
simu.identifiers = wrap_int64_array(data.Id, data.NumPart)
else:
simu.identifiers = None
if flags & NEED_TYPE:
simu.types = wrap_int_array(data.type, data.NumPart)
else:
simu.types = None
if flags & NEED_MASS:
simu.masses = wrap_float_array(data.Mass, data.NumPart)
else:
simu.masses = None
return simu
def loadGadget(str filename, int snapshot_id, int gadgetFormat = 1, bool loadPosition = True, bool loadVelocity = False, bool loadId = False, bool loadType = False, bool loadMass=False):
"""loadGadget(filename, snapshot_id, gadgetFormat = 1, loadPosition=True, loadVelocity=False, loadId=False, loadType=False)
This function loads Gadget-1 snapshot format.
If snapshot_id is negative then the snapshot is considered not to be part of
a set of snapshots written by different CPUs. Otherwise the filename is modified
to reflect the indicated snapshot_id.
Arguments:
filename (str): input filename
snapshot_id (int): identifier of the gadget file if it is a multi-file snapshot
Keyword arguments:
loadPosition (bool): whether to load positions
loadVelocity (bool): whether to load velocities
loadId (bool): whether to load unique identifiers
loadType (bool): whether to set types to particles
loadMass (bool): whether to set the mass of particles
Returns:
a PySimulationAdaptor instance.
"""
cdef int flags
cdef SimuData *data
cdef Simulation simu
cdef const char *filename_bs
flags = 0
if loadPosition:
flags |= NEED_POSITION
if loadVelocity:
flags |= NEED_VELOCITY
if loadId:
flags |= NEED_GADGET_ID
if loadType:
flags |= NEED_TYPE
if loadMass:
flags |= NEED_MASS
filename_b = bytes(filename, 'utf-8')
filename_bs = filename_b
with nogil:
data = loadGadgetMulti(filename_bs, snapshot_id, flags, gadgetFormat)
if data == <SimuData*>0:
return None
return PySimulationAdaptor(wrap_simudata(data, flags))
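# Usage sketch (hypothetical snapshot name; a negative snapshot_id means a single-file snapshot):
sim = loadGadget("snapshot_000", -1, loadVelocity=True, loadId=True)
x, y, z = sim.getPositions()          # one array per coordinate
print(sim.getBoxsize(), sim.getOmega_M(), x.size)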
def loadParallelGadget(object filename_list, int gadgetFormat = 1, bool loadPosition = True, bool loadVelocity = False, bool loadId = False, bool loadType = False, bool loadMass=False):
"""loadParallelGadget(filename list, gadgetFormat=1, loadPosition=True, loadVelocity=False, loadId=False, loadType=False)
Arguments:
filename (list): a list or tuple of filenames to load in parallel
Keyword arguments:
loadPosition (bool): indicate to load positions
loadVelocity (bool): indicate to load velocities
loadId (bool): indicate to load id
loadType (bool): indicate to load particle types
Returns:
a list of PySimulationAdaptor objects, one per input Gadget-1 file; entries are None for files that could not be loaded.
"""
cdef int flags, i, num_files
cdef list out_arrays
cdef SimuData ** data
cdef SimuData * local_data
cdef Simulation simu
cdef cppvector[cppstring.string] filenames
flags = 0
if loadPosition:
flags |= NEED_POSITION
if loadVelocity:
flags |= NEED_VELOCITY
if loadId:
flags |= NEED_GADGET_ID
if loadType:
flags |= NEED_TYPE
if loadMass:
flags |= NEED_MASS
num_files = len(filename_list)
filenames.resize(num_files)
data = alloc_simudata(num_files)
for i,l in enumerate(filename_list):
filenames[i] = l.encode('utf-8')
with nogil:
for i in prange(num_files):
local_data = loadGadgetMulti_safe(filenames[i], flags, gadgetFormat)
data[i] = local_data
# data[i] = loadGadgetMulti(filenames[i].c_str(), -1, flags)
out_arrays = []
for i in xrange(num_files):
if data[i] == <SimuData*>0:
out_arrays.append(None)
else:
out_arrays.append(PySimulationAdaptor(wrap_simudata(data[i], flags)))
del_simudata(data)
return out_arrays
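# Usage sketch (hypothetical file names of a snapshot split over two CPUs):
sims = loadParallelGadget(["snapshot_000.0", "snapshot_000.1"], loadVelocity=True)
for s in sims:
    if s is not None:                 # None marks files that failed to load
        print(s.getBoxsize(), s.getPositions()[0].size)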
def writeGadget(str filename, object simulation):
"""writeGadget(filename, simulation)
This function attempts to write the content of the simulation object into
a file named `filename` using a Gadget-1 file format.
Arguments:
filename (str): output filename
simulation (PySimulationBase): a simulation object
"""
cdef SimuData simdata
cdef np.ndarray[np.float32_t, ndim=1] pos, vel
cdef np.ndarray[np.int64_t, ndim=1] ids
cdef np.int64_t NumPart
cdef int j
if not isinstance(simulation,PySimulationBase):
raise TypeError("Second argument must be of type SimulationBase")
NumPart = simulation.positions[0].size
simdata.noAuto = True
for j in xrange(3):
pos = simulation.getPositions()[j]
vel = simulation.getVelocities()[j]
if pos.size != NumPart or vel.size != NumPart:
raise ValueError("Invalid number of particles")
simdata.Pos[j] = <float *>pos.data
simdata.Vel[j] = <float *>vel.data
ids = simulation.getIdentifiers()
simdata.Id = <int64_t *>ids.data
simdata.BoxSize = simulation.getBoxsize()
simdata.time = simulation.getTime()
simdata.Hubble = simulation.getHubble()
simdata.Omega_M = simulation.getOmega_M()
simdata.Omega_Lambda = simulation.getOmega_Lambda()
simdata.TotalNumPart = NumPart
simdata.NumPart = NumPart
cxx_writeGadget(filename, &simdata)
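# Usage sketch (hypothetical paths): positions, velocities and identifiers must all be
# loaded for writeGadget to succeed.
sim = loadGadget("snapshot_000", -1, loadVelocity=True, loadId=True)
writeGadget("snapshot_000_copy", sim)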
def loadRamses(str basepath, int snapshot_id, int cpu_id, bool doublePrecision = False, bool loadPosition = True, bool loadVelocity = False, bool loadId = False, bool loadMass = False):
""" loadRamses(basepath, snapshot_id, cpu_id, doublePrecision = False, loadPosition = True, loadVelocity = False)
Loads the indicated snapshot based on the cpu id, snapshot id and basepath. It is important to specify the correct precision in doublePrecision otherwise the loading will fail. There is no way of auto-detecting properly the precision of the snapshot file.
Args:
basepath (str): the base directory of the snapshot
snapshot_id (int): the snapshot id
cpu_id (int): the cpu id of the file to load
Keyword args:
doublePrecision (bool): By default it is False, thus singlePrecision
loadPosition (bool): Whether to load positions
loadVelocity (bool): Whether to load velocities
loadId (bool): Whether to load identifiers
loadMass (bool): Whether to load mass value
Returns:
An object derived from PySimulationBase_.
"""
cdef int flags
cdef SimuData *data
cdef Simulation simu
flags = 0
if loadPosition:
flags |= NEED_POSITION
if loadVelocity:
flags |= NEED_VELOCITY
if loadId:
flags |= NEED_GADGET_ID
if loadMass:
flags |= NEED_MASS
encpath = basepath.encode('utf-8')
try:
data = loadRamsesSimu(encpath, snapshot_id, cpu_id, doublePrecision, flags)
if data == <SimuData*>0:
return None
except RuntimeError as e:
raise RuntimeError(str(e) + ' (check the float precision in snapshot)')
return PySimulationAdaptor(wrap_simudata(data, flags))
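# Usage sketch (hypothetical RAMSES output directory and ids; the precision flag must match the files):
sim = loadRamses("output_00010/", 10, 1, doublePrecision=False, loadVelocity=True)
if sim is not None:
    print(sim.getBoxsize(), sim.getTime())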


@@ -0,0 +1,43 @@
from cpython cimport bool
from cython cimport view
from cython.parallel import prange, parallel
from libc.math cimport sin, cos, abs, floor, round, sqrt
import numpy as np
cimport numpy as npx
cimport cython
__all__=["fast_interp"]
@cython.boundscheck(False)
@cython.cdivision(True)
def fast_interp(xmin0, dx0, A0, y0, out0, beyond_val=np.nan):
cdef double rq, q
cdef int iq
cdef long i, Asize, ysize
cdef npx.float64_t xmin, dx
cdef npx.float64_t[:] out
cdef npx.float64_t[:] A, y
cdef npx.float64_t beyond
beyond=beyond_val
xmin = xmin0
dx = dx0
A = A0
y = y0
ysize = y.size
out = out0
Asize = A.size
if out.size != ysize:
raise ValueError("out and y must have the same size")
with nogil:
for i in prange(ysize):
q = (y[i] - xmin) / dx
iq = int(floor(q))
rq = (q-iq)
if iq+1 >= Asize or iq < 0:
out[i] = beyond
else:
out[i] = rq * A[iq+1] + (1-rq)*A[iq]
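# Usage sketch (hypothetical table): interpolate A, sampled at x = xmin + i*dx, onto the points y.
import numpy as np
A = np.sin(np.linspace(0.0, 6.0, 61))    # table values, xmin = 0, dx = 0.1
y = np.random.uniform(-0.5, 6.5, 1000)   # query points (float64)
out = np.empty_like(y)
fast_interp(0.0, 0.1, A, y, out, beyond_val=np.nan)   # out-of-range points become NaN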

external/cosmotool/python/_project.pyx vendored Normal file

@@ -0,0 +1,890 @@
from cpython cimport bool
from cython cimport view
from cython.parallel import prange, parallel
from libc.math cimport sin, cos, abs, floor, round, sqrt
import numpy as np
cimport numpy as npx
cimport cython
from copy cimport *
ctypedef npx.float64_t DTYPE_t
DTYPE=np.float64
FORMAT_DTYPE="d"
__all__=["project_cic","line_of_sight_projection","spherical_projection","DTYPE","interp3d","interp2d"]
cdef extern from "project_tool.hpp" namespace "":
DTYPE_t compute_projection(DTYPE_t *vertex_value, DTYPE_t *u, DTYPE_t *u0, DTYPE_t rho) nogil
cdef extern from "openmp.hpp" namespace "CosmoTool":
int smp_get_max_threads() nogil
int smp_get_thread_id() nogil
@cython.boundscheck(False)
@cython.cdivision(True)
@cython.wraparound(False)
cdef void interp3d_INTERNAL_periodic(DTYPE_t x, DTYPE_t y,
DTYPE_t z,
DTYPE_t[:,:,:] d, DTYPE_t Lbox, DTYPE_t *retval) nogil:
cdef int Ngrid = d.shape[0]
cdef DTYPE_t inv_delta = Ngrid/Lbox
cdef int ix, iy, iz
cdef DTYPE_t f[2][2][2]
cdef DTYPE_t rx, ry, rz
cdef int jx, jy, jz
rx = (inv_delta*x)
ry = (inv_delta*y)
rz = (inv_delta*z)
ix = int(floor(rx))
iy = int(floor(ry))
iz = int(floor(rz))
rx -= ix
ry -= iy
rz -= iz
ix = ix % Ngrid
iy = iy % Ngrid
iz = iz % Ngrid
jx = (ix+1)%Ngrid
jy = (iy+1)%Ngrid
jz = (iz+1)%Ngrid
ix = ix%Ngrid
iy = iy%Ngrid
iz = iz%Ngrid
f[0][0][0] = (1-rx)*(1-ry)*(1-rz)
f[1][0][0] = ( rx)*(1-ry)*(1-rz)
f[0][1][0] = (1-rx)*( ry)*(1-rz)
f[1][1][0] = ( rx)*( ry)*(1-rz)
f[0][0][1] = (1-rx)*(1-ry)*( rz)
f[1][0][1] = ( rx)*(1-ry)*( rz)
f[0][1][1] = (1-rx)*( ry)*( rz)
f[1][1][1] = ( rx)*( ry)*( rz)
retval[0] = \
d[ix ,iy ,iz ] * f[0][0][0] + \
d[jx ,iy ,iz ] * f[1][0][0] + \
d[ix ,jy ,iz ] * f[0][1][0] + \
d[jx ,jy ,iz ] * f[1][1][0] + \
d[ix ,iy ,jz ] * f[0][0][1] + \
d[jx ,iy ,jz ] * f[1][0][1] + \
d[ix ,jy ,jz ] * f[0][1][1] + \
d[jx ,jy ,jz ] * f[1][1][1]
@cython.boundscheck(False)
@cython.cdivision(True)
@cython.wraparound(False)
cdef void ngp3d_INTERNAL_periodic(DTYPE_t x, DTYPE_t y,
DTYPE_t z,
DTYPE_t[:,:,:] d, DTYPE_t Lbox, DTYPE_t *retval) nogil:
cdef int Ngrid = d.shape[0]
cdef DTYPE_t inv_delta = Ngrid/Lbox
cdef int ix, iy, iz
cdef DTYPE_t f[2][2][2]
cdef DTYPE_t rx, ry, rz
cdef int jx, jy, jz
rx = (inv_delta*x)
ry = (inv_delta*y)
rz = (inv_delta*z)
ix = int(round(rx))
iy = int(round(ry))
iz = int(round(rz))
ix = ix%Ngrid
iy = iy%Ngrid
iz = iz%Ngrid
retval[0] = d[ix ,iy ,iz ]
@cython.boundscheck(False)
@cython.cdivision(True)
@cython.wraparound(False)
cdef void ngp3d_INTERNAL(DTYPE_t x, DTYPE_t y,
DTYPE_t z,
DTYPE_t[:,:,:] d, DTYPE_t Lbox, DTYPE_t *retval, DTYPE_t inval) nogil:
cdef int Ngrid = d.shape[0]
cdef DTYPE_t inv_delta = Ngrid/Lbox
cdef int ix, iy, iz
cdef DTYPE_t f[2][2][2]
cdef DTYPE_t rx, ry, rz
cdef int jx, jy, jz
rx = (inv_delta*x)
ry = (inv_delta*y)
rz = (inv_delta*z)
ix = int(round(rx))
iy = int(round(ry))
iz = int(round(rz))
if ((ix < 0) or (ix+1) >= Ngrid or (iy < 0) or (iy+1) >= Ngrid or (iz < 0) or (iz+1) >= Ngrid):
retval[0] = inval
return
retval[0] = d[ix ,iy ,iz ]
@cython.boundscheck(False)
@cython.cdivision(True)
@cython.wraparound(False)
cdef void interp3d_INTERNAL(DTYPE_t x, DTYPE_t y,
DTYPE_t z,
DTYPE_t[:,:,:] d, DTYPE_t Lbox, DTYPE_t *retval, DTYPE_t inval) nogil:
cdef int Ngrid = d.shape[0]
cdef DTYPE_t inv_delta = Ngrid/Lbox
cdef int ix, iy, iz
cdef DTYPE_t f[2][2][2]
cdef DTYPE_t rx, ry, rz
rx = (inv_delta*x)
ry = (inv_delta*y)
rz = (inv_delta*z)
ix = int(floor(rx))
iy = int(floor(ry))
iz = int(floor(rz))
rx -= ix
ry -= iy
rz -= iz
if ((ix < 0) or (ix+1) >= Ngrid or (iy < 0) or (iy+1) >= Ngrid or (iz < 0) or (iz+1) >= Ngrid):
retval[0] = inval
return
# assert ((ix >= 0) and ((ix+1) < Ngrid))
# assert ((iy >= 0) and ((iy+1) < Ngrid))
# assert ((iz >= 0) and ((iz+1) < Ngrid))
f[0][0][0] = (1-rx)*(1-ry)*(1-rz)
f[1][0][0] = ( rx)*(1-ry)*(1-rz)
f[0][1][0] = (1-rx)*( ry)*(1-rz)
f[1][1][0] = ( rx)*( ry)*(1-rz)
f[0][0][1] = (1-rx)*(1-ry)*( rz)
f[1][0][1] = ( rx)*(1-ry)*( rz)
f[0][1][1] = (1-rx)*( ry)*( rz)
f[1][1][1] = ( rx)*( ry)*( rz)
retval[0] = \
d[ix ,iy ,iz ] * f[0][0][0] + \
d[ix+1,iy ,iz ] * f[1][0][0] + \
d[ix ,iy+1,iz ] * f[0][1][0] + \
d[ix+1,iy+1,iz ] * f[1][1][0] + \
d[ix ,iy ,iz+1] * f[0][0][1] + \
d[ix+1,iy ,iz+1] * f[1][0][1] + \
d[ix ,iy+1,iz+1] * f[0][1][1] + \
d[ix+1,iy+1,iz+1] * f[1][1][1]
@cython.boundscheck(False)
def interp3d(x not None, y not None,
z not None,
npx.ndarray[DTYPE_t, ndim=3] d not None, DTYPE_t Lbox,
bool periodic=False, bool centered=True, bool ngp=False, DTYPE_t inval = 0):
""" interp3d(x,y,z,d,Lbox,periodic=False,centered=True,ngp=False) -> interpolated values
Compute the tri-linear interpolation of the given field (d) at the given position (x,y,z). It assumes that they are box-centered coordinates. So (x,y,z) == (0,0,0) is equivalent to the pixel at (Nx/2,Ny/2,Nz/2) with Nx,Ny,Nz = d.shape. If periodic is set, it assumes the box is periodic
"""
cdef npx.ndarray[DTYPE_t] out
cdef DTYPE_t[:] out_slice
cdef DTYPE_t[:] ax, ay, az
cdef DTYPE_t[:,:,:] in_slice
cdef DTYPE_t retval
cdef long i
cdef long Nelt
cdef int myperiodic, myngp
cdef DTYPE_t shifter
myperiodic = periodic
myngp = ngp
if centered:
shifter = Lbox/2
else:
shifter = 0
if d.shape[0] != d.shape[1] or d.shape[0] != d.shape[2]:
raise ValueError("Grid must have a cubic shape")
ierror = IndexError("Interpolating outside range")
if type(x) == np.ndarray or type(y) == np.ndarray or type(z) == np.ndarray:
if type(x) != np.ndarray or type(y) != np.ndarray or type(z) != np.ndarray:
raise ValueError("All or no array. No partial arguments")
ax = x
ay = y
az = z
assert ax.size == ay.size and ax.size == az.size
out = np.empty(x.shape, dtype=DTYPE)
out_slice = out
in_slice = d
Nelt = ax.size
with nogil:
if not myngp:
if myperiodic:
for i in prange(Nelt):
interp3d_INTERNAL_periodic(shifter+ax[i], shifter+ay[i], shifter+az[i], in_slice, Lbox, &out_slice[i])
else:
for i in prange(Nelt):
interp3d_INTERNAL(shifter+ax[i], shifter+ay[i], shifter+az[i], in_slice, Lbox, &out_slice[i], inval)
else:
if myperiodic:
for i in prange(Nelt):
ngp3d_INTERNAL_periodic(shifter+ax[i], shifter+ay[i], shifter+az[i], in_slice, Lbox, &out_slice[i])
else:
for i in prange(Nelt):
ngp3d_INTERNAL(shifter+ax[i], shifter+ay[i], shifter+az[i], in_slice, Lbox, &out_slice[i], inval)
return out
else:
if not myngp:
if periodic:
interp3d_INTERNAL_periodic(shifter+x, shifter+y, shifter+z, d, Lbox, &retval)
else:
interp3d_INTERNAL(shifter+x, shifter+y, shifter+z, d, Lbox, &retval, inval)
else:
if periodic:
ngp3d_INTERNAL_periodic(shifter+x, shifter+y, shifter+z, d, Lbox, &retval)
else:
ngp3d_INTERNAL(shifter+x, shifter+y, shifter+z, d, Lbox, &retval, inval)
return retval
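# Usage sketch (hypothetical 64^3 float64 field in a box of side 100, box-centered coordinates):
import numpy as np
d = np.random.rand(64, 64, 64)
x = y = np.zeros(16)
z = np.linspace(-40.0, 40.0, 16)
vals = interp3d(x, y, z, d, 100.0, periodic=True)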
@cython.boundscheck(False)
@cython.cdivision(True)
cdef DTYPE_t interp2d_INTERNAL_periodic(DTYPE_t x, DTYPE_t y,
npx.ndarray[DTYPE_t, ndim=2] d, DTYPE_t Lbox) except? 0:
cdef int Ngrid = d.shape[0]
cdef DTYPE_t inv_delta = Ngrid/Lbox
cdef int ix, iy
cdef DTYPE_t f[2][2]
cdef DTYPE_t rx, ry
cdef int jx, jy
rx = (inv_delta*x + Ngrid/2)
ry = (inv_delta*y + Ngrid/2)
ix = int(floor(rx))
iy = int(floor(ry))
rx -= ix
ry -= iy
while ix < 0:
ix += Ngrid
while iy < 0:
iy += Ngrid
jx = (ix+1)%Ngrid
jy = (iy+1)%Ngrid
assert ((ix >= 0) and ((jx) < Ngrid))
assert ((iy >= 0) and ((jy) < Ngrid))
f[0][0] = (1-rx)*(1-ry)
f[1][0] = ( rx)*(1-ry)
f[0][1] = (1-rx)*( ry)
f[1][1] = ( rx)*( ry)
return \
d[ix ,iy ] * f[0][0] + \
d[jx ,iy ] * f[1][0] + \
d[ix ,jy ] * f[0][1] + \
d[jx ,jy ] * f[1][1]
@cython.boundscheck(False)
@cython.cdivision(True)
cdef DTYPE_t interp2d_INTERNAL(DTYPE_t x, DTYPE_t y,
npx.ndarray[DTYPE_t, ndim=2] d, DTYPE_t Lbox) except? 0:
cdef int Ngrid = d.shape[0]
cdef DTYPE_t inv_delta = Ngrid/Lbox
cdef int ix, iy
cdef DTYPE_t f[2][2]
cdef DTYPE_t rx, ry
rx = (inv_delta*x + Ngrid/2)
ry = (inv_delta*y + Ngrid/2)
ix = int(floor(rx))
iy = int(floor(ry))
rx -= ix
ry -= iy
if ((ix < 0) or (ix+1) >= Ngrid):
raise IndexError("X coord out of bound (ix=%d, x=%g)" % (ix,x))
if ((iy < 0) or (iy+1) >= Ngrid):
raise IndexError("Y coord out of bound (iy=%d, y=%g)" % (iy,y))
# assert ((ix >= 0) and ((ix+1) < Ngrid))
# assert ((iy >= 0) and ((iy+1) < Ngrid))
# assert ((iz >= 0) and ((iz+1) < Ngrid))
f[0][0] = (1-rx)*(1-ry)
f[1][0] = ( rx)*(1-ry)
f[0][1] = (1-rx)*( ry)
f[1][1] = ( rx)*( ry)
return \
d[ix ,iy ] * f[0][0] + \
d[ix+1,iy ] * f[1][0] + \
d[ix ,iy+1] * f[0][1] + \
d[ix+1,iy+1] * f[1][1]
def interp2d(x not None, y not None,
npx.ndarray[DTYPE_t, ndim=2] d not None, DTYPE_t Lbox,
bool periodic=False):
cdef npx.ndarray[DTYPE_t] out
cdef npx.ndarray[DTYPE_t] ax, ay
cdef int i
if d.shape[0] != d.shape[1]:
raise ValueError("Grid must have a square shape")
if type(x) == np.ndarray or type(y) == np.ndarray:
if type(x) != np.ndarray or type(y) != np.ndarray:
raise ValueError("All or no array. No partial arguments")
ax = x
ay = y
assert ax.size == ay.size
out = np.empty(x.shape, dtype=DTYPE)
if periodic:
for i in range(ax.size):
out[i] = interp2d_INTERNAL_periodic(ax[i], ay[i], d, Lbox)
else:
for i in range(ax.size):
out[i] = interp2d_INTERNAL(ax[i], ay[i], d, Lbox)
return out
else:
if periodic:
return interp2d_INTERNAL_periodic(x, y, d, Lbox)
else:
return interp2d_INTERNAL(x, y, d, Lbox)
@cython.boundscheck(False)
@cython.cdivision(True)
cdef void INTERNAL_project_cic_no_mass(DTYPE_t[:,:,:] g,
DTYPE_t[:,:] x, int Ngrid, double Lbox, double shifter) nogil:
cdef double delta_Box = Ngrid/Lbox
cdef int i
cdef double a[3]
cdef double c[3]
cdef int b[3]
cdef int do_not_put
for i in range(x.shape[0]):
do_not_put = 0
for j in range(3):
a[j] = (x[i,j]+shifter)*delta_Box
b[j] = int(floor(a[j]))
a[j] -= b[j]
c[j] = 1-a[j]
if b[j] < 0 or b[j]+1 >= Ngrid:
do_not_put = True
if not do_not_put:
g[b[0],b[1],b[2]] += c[0]*c[1]*c[2]
g[b[0]+1,b[1],b[2]] += a[0]*c[1]*c[2]
g[b[0],b[1]+1,b[2]] += c[0]*a[1]*c[2]
g[b[0]+1,b[1]+1,b[2]] += a[0]*a[1]*c[2]
g[b[0],b[1],b[2]+1] += c[0]*c[1]*a[2]
g[b[0]+1,b[1],b[2]+1] += a[0]*c[1]*a[2]
g[b[0],b[1]+1,b[2]+1] += c[0]*a[1]*a[2]
g[b[0]+1,b[1]+1,b[2]+1] += a[0]*a[1]*a[2]
@cython.boundscheck(False)
@cython.cdivision(True)
cdef void INTERNAL_project_cic_no_mass_periodic(DTYPE_t[:,:,:] g,
DTYPE_t[:,:] x, int Ngrid, double Lbox, double shifter) nogil:
cdef double delta_Box = Ngrid/Lbox
cdef int i
cdef double a[3]
cdef double c[3]
cdef int b[3]
cdef int b1[3]
cdef int do_not_put
cdef DTYPE_t[:,:] ax
cdef DTYPE_t[:,:,:] ag
ax = x
ag = g
for i in range(x.shape[0]):
do_not_put = 0
for j in range(3):
a[j] = (ax[i,j]+shifter)*delta_Box
b[j] = int(floor(a[j]))
b1[j] = (b[j]+1) % Ngrid
a[j] -= b[j]
c[j] = 1-a[j]
b[j] %= Ngrid
ag[b[0],b[1],b[2]] += c[0]*c[1]*c[2]
ag[b1[0],b[1],b[2]] += a[0]*c[1]*c[2]
ag[b[0],b1[1],b[2]] += c[0]*a[1]*c[2]
ag[b1[0],b1[1],b[2]] += a[0]*a[1]*c[2]
ag[b[0],b[1],b1[2]] += c[0]*c[1]*a[2]
ag[b1[0],b[1],b1[2]] += a[0]*c[1]*a[2]
ag[b[0],b1[1],b1[2]] += c[0]*a[1]*a[2]
ag[b1[0],b1[1],b1[2]] += a[0]*a[1]*a[2]
@cython.boundscheck(False)
@cython.cdivision(True)
cdef void INTERNAL_project_cic_with_mass(DTYPE_t[:,:,:] g,
DTYPE_t[:,:] x,
DTYPE_t[:] mass,
int Ngrid, double Lbox, double shifter) nogil:
cdef double delta_Box = Ngrid/Lbox
cdef int i
cdef double a[3]
cdef double c[3]
cdef DTYPE_t m0
cdef int b[3]
for i in range(x.shape[0]):
do_not_put = False
for j in range(3):
a[j] = (x[i,j]+shifter)*delta_Box
b[j] = int(a[j])
a[j] -= b[j]
c[j] = 1-a[j]
if b[j] < 0 or b[j]+1 >= Ngrid:
do_not_put = True
if not do_not_put:
m0 = mass[i]
g[b[0],b[1],b[2]] += c[0]*c[1]*c[2]*m0
g[b[0]+1,b[1],b[2]] += a[0]*c[1]*c[2]*m0
g[b[0],b[1]+1,b[2]] += c[0]*a[1]*c[2]*m0
g[b[0]+1,b[1]+1,b[2]] += a[0]*a[1]*c[2]*m0
g[b[0],b[1],b[2]+1] += c[0]*c[1]*a[2]*m0
g[b[0]+1,b[1],b[2]+1] += a[0]*c[1]*a[2]*m0
g[b[0],b[1]+1,b[2]+1] += c[0]*a[1]*a[2]*m0
g[b[0]+1,b[1]+1,b[2]+1] += a[0]*a[1]*a[2]*m0
@cython.boundscheck(False)
@cython.cdivision(True)
cdef void INTERNAL_project_cic_with_mass_periodic(DTYPE_t[:,:,:] g,
DTYPE_t[:,:] x,
DTYPE_t[:] mass,
int Ngrid, double Lbox, double shifter) nogil:
cdef double half_Box = 0.5*Lbox, m0
cdef double delta_Box = Ngrid/Lbox
cdef int i
cdef double a[3]
cdef double c[3]
cdef int b[3]
cdef int b1[3]
for i in range(x.shape[0]):
for j in range(3):
a[j] = (x[i,j]+shifter)*delta_Box
b[j] = int(floor(a[j]))
b1[j] = b[j]+1
while b1[j] < 0:
b1[j] += Ngrid
while b1[j] >= Ngrid:
b1[j] -= Ngrid
a[j] -= b[j]
c[j] = 1-a[j]
m0 = mass[i]
g[b[0],b[1],b[2]] += c[0]*c[1]*c[2]*m0
g[b1[0],b[1],b[2]] += a[0]*c[1]*c[2]*m0
g[b[0],b1[1],b[2]] += c[0]*a[1]*c[2]*m0
g[b1[0],b1[1],b[2]] += a[0]*a[1]*c[2]*m0
g[b[0],b[1],b1[2]] += c[0]*c[1]*a[2]*m0
g[b1[0],b[1],b1[2]] += a[0]*c[1]*a[2]*m0
g[b[0],b1[1],b1[2]] += c[0]*a[1]*a[2]*m0
g[b1[0],b1[1],b1[2]] += a[0]*a[1]*a[2]*m0
def project_cic(npx.ndarray[DTYPE_t, ndim=2] x not None, npx.ndarray[DTYPE_t, ndim=1] mass, int Ngrid,
double Lbox, bool periodic = False, centered=True):
"""
project_cic(x array (N,3), mass (may be None), Ngrid, Lbox, periodic=False, centered=True)
This function does a Cloud-In-Cell projection of a 3d unstructured dataset. The first argument is a Nx3 array of coordinates.
The second argument is an optional mass. Ngrid is the size of the output grid and Lbox is the physical size of the grid.
"""
cdef npx.ndarray[DTYPE_t, ndim=3] g
cdef double shifter
cdef bool local_periodic
local_periodic = periodic
if centered:
shifter = 0.5*Lbox
else:
shifter = 0
if x.shape[1] != 3:
raise ValueError("Invalid shape for x array")
if mass is not None and mass.shape[0] != x.shape[0]:
raise ValueError("Mass array and coordinate array must have the same number of elements")
g = np.zeros((Ngrid,Ngrid,Ngrid),dtype=DTYPE)
if not local_periodic:
if mass is None:
with nogil:
INTERNAL_project_cic_no_mass(g, x, Ngrid, Lbox, shifter)
else:
with nogil:
INTERNAL_project_cic_with_mass(g, x, mass, Ngrid, Lbox, shifter)
else:
if mass is None:
with nogil:
INTERNAL_project_cic_no_mass_periodic(g, x, Ngrid, Lbox, shifter)
else:
with nogil:
INTERNAL_project_cic_with_mass_periodic(g, x, mass, Ngrid, Lbox, shifter)
return g
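# Usage sketch (hypothetical particles): deposit 10000 float64, box-centered positions onto a
# 64^3 grid of physical size 100; with periodic deposition the result sums to the particle count.
import numpy as np
pos = np.random.uniform(-50.0, 50.0, size=(10000, 3))
rho = project_cic(pos, None, 64, 100.0, periodic=True)
print(rho.sum())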
def tophat_fourier_internal(npx.ndarray[DTYPE_t, ndim=1] x not None):
cdef int i
cdef npx.ndarray[DTYPE_t] y
cdef DTYPE_t x0
y = np.empty(x.size, dtype=DTYPE)
for i in range(x.size):
x0 = x[i]
if abs(x0)<1e-5:
y[i] = 1
else:
y[i] = (3*(sin(x0) - x0 * cos(x0))/(x0**3))
return y
def tophat_fourier(x not None):
cdef npx.ndarray[DTYPE_t, ndim=1] b
if type(x) != np.ndarray:
raise ValueError("x must be a Numpy array")
b = np.array(x, dtype=DTYPE).ravel()
b = tophat_fourier_internal(b)
return b.reshape(x.shape)
@cython.boundscheck(False)
@cython.cdivision(True)
cdef DTYPE_t cube_integral(DTYPE_t u[3], DTYPE_t u0[3], int r[1], DTYPE_t alpha_max) nogil:
cdef DTYPE_t tmp_a
cdef DTYPE_t v[3]
cdef int i, j
for i in xrange(3):
if u[i] == 0.:
continue
if u[i] < 0:
tmp_a = -u0[i]/u[i]
else:
tmp_a = (1-u0[i])/u[i]
if tmp_a < alpha_max:
alpha_max = tmp_a
j = i
for i in range(3):
u0[i] += u[i]*alpha_max
r[0] = j
return alpha_max
@cython.boundscheck(False)
@cython.cdivision(True)
cdef DTYPE_t cube_integral_trilin(DTYPE_t u[3], DTYPE_t u0[3], int r[1], DTYPE_t vertex_value[8], DTYPE_t alpha_max) nogil:
cdef DTYPE_t I, tmp_a
cdef DTYPE_t v[3]
cdef DTYPE_t term[4]
cdef int i, j, q
j = 0
for i in range(3):
if u[i] == 0.:
continue
if u[i] < 0:
tmp_a = -u0[i]/u[i]
else:
tmp_a = (1-u0[i])/u[i]
if tmp_a < alpha_max:
alpha_max = tmp_a
j = i
I = compute_projection(vertex_value, u, u0, alpha_max)
for i in xrange(3):
u0[i] += u[i]*alpha_max
# alpha_max is the integration length
# we integrate between 0 and alpha_max (curvilinear coordinates)
r[0] = j
return I
@cython.boundscheck(False)
cdef DTYPE_t integrator0(DTYPE_t[:,:,:] density,
DTYPE_t u[3], DTYPE_t u0[3], int u_delta[3], int iu0[3], int jumper[1], DTYPE_t alpha_max) nogil:
cdef DTYPE_t d
d = density[iu0[0], iu0[1], iu0[2]]
return cube_integral(u, u0, jumper, alpha_max)*d
@cython.boundscheck(False)
cdef DTYPE_t integrator1(DTYPE_t[:,:,:] density,
DTYPE_t u[3], DTYPE_t u0[3], int u_delta[3], int iu0[3], int jumper[1], DTYPE_t alpha_max) nogil:
cdef DTYPE_t vertex_value[8]
cdef DTYPE_t d
cdef int a[3][2]
cdef int i
for i in xrange(3):
a[i][0] = iu0[i]
a[i][1] = iu0[i]+1
vertex_value[0 + 2*0 + 4*0] = density[a[0][0], a[1][0], a[2][0]]
vertex_value[1 + 2*0 + 4*0] = density[a[0][1], a[1][0], a[2][0]]
vertex_value[0 + 2*1 + 4*0] = density[a[0][0], a[1][1], a[2][0]]
vertex_value[1 + 2*1 + 4*0] = density[a[0][1], a[1][1], a[2][0]]
vertex_value[0 + 2*0 + 4*1] = density[a[0][0], a[1][0], a[2][1]]
vertex_value[1 + 2*0 + 4*1] = density[a[0][1], a[1][0], a[2][1]]
vertex_value[0 + 2*1 + 4*1] = density[a[0][0], a[1][1], a[2][1]]
vertex_value[1 + 2*1 + 4*1] = density[a[0][1], a[1][1], a[2][1]]
return cube_integral_trilin(u, u0, jumper, vertex_value, alpha_max)
@cython.boundscheck(False)
cdef DTYPE_t C_line_of_sight_projection(DTYPE_t[:,:,:] density,
DTYPE_t a_u[3],
DTYPE_t min_distance,
DTYPE_t max_distance, DTYPE_t[:] shifter, int integrator_id) nogil except? 0:
cdef DTYPE_t u[3]
cdef DTYPE_t ifu0[3]
cdef DTYPE_t u0[3]
cdef DTYPE_t utot[3]
cdef int u_delta[3]
cdef int iu0[3]
cdef int i
cdef int N = density.shape[0]
cdef int half_N = density.shape[0]/2
cdef int completed
cdef DTYPE_t I0, d, dist2, delta, s, max_distance2
cdef int jumper[1]
cdef DTYPE_t (*integrator)(DTYPE_t[:,:,:],
DTYPE_t u[3], DTYPE_t u0[3], int u_delta[3], int iu0[3], int jumper[1], DTYPE_t alpha_max) nogil
if integrator_id == 0:
integrator = integrator0
else:
integrator = integrator1
max_distance2 = max_distance**2
for i in range(3):
u[i] = a_u[i]
u0[i] = a_u[i]*min_distance
ifu0[i] = half_N+u0[i]+shifter[i]
if (ifu0[i] <= 0 or ifu0[i] >= N):
return 0
iu0[i] = int(floor(ifu0[i]))
u0[i] = ifu0[i]-iu0[i]
u_delta[i] = 1 if iu0[i] > 0 else -1
if (not ((iu0[i]>= 0) and (iu0[i] < N))):
with gil:
raise RuntimeError("iu0[%d] = %d !!" % (i,iu0[i]))
if (not (u0[i]>=0 and u0[i]<=1)):
with gil:
raise RuntimeError("u0[%d] = %g !" % (i,u0[i]))
completed = 0
if ((iu0[0] >= N-1) or (iu0[0] <= 0) or
(iu0[1] >= N-1) or (iu0[1] <= 0) or
(iu0[2] >= N-1) or (iu0[2] <= 0)):
completed = 1
I0 = 0
jumper[0] = 0
dist2 = 0
while completed == 0:
I0 += integrator(density, u, u0, u_delta, iu0, jumper, max_distance-sqrt(dist2))
if u[jumper[0]] < 0:
iu0[jumper[0]] -= 1
u0[jumper[0]] = 1
else:
iu0[jumper[0]] += 1
u0[jumper[0]] = 0
if ((iu0[0] >= N-1) or (iu0[0] <= 0) or
(iu0[1] >= N-1) or (iu0[1] <= 0) or
(iu0[2] >= N-1) or (iu0[2] <= 0)):
completed = 1
else:
dist2 = 0
for i in range(3):
delta = iu0[i]+u0[i]-half_N-shifter[i]
dist2 += delta*delta
if (dist2 > max_distance2):
# Remove the last portion of the integral
#delta = sqrt(dist2) - max_distance
#I0 -= d*delta
completed = 1
return I0
def line_of_sight_projection(DTYPE_t[:,:,:] density not None,
DTYPE_t[:] a_u not None,
DTYPE_t min_distance,
DTYPE_t max_distance, DTYPE_t[:] shifter not None, int integrator_id=0):
cdef DTYPE_t u[3]
u[0] = a_u[0]
u[1] = a_u[1]
u[2] = a_u[2]
return C_line_of_sight_projection(density,
u,
min_distance,
max_distance, shifter, integrator_id)
cdef double _spherical_projloop(double theta, double phi, DTYPE_t[:,:,:] density,
double min_distance, double max_distance,
DTYPE_t[:] shifter, int integrator_id) nogil:
cdef DTYPE_t u0[3]
stheta = sin(theta)
u0[0] = cos(phi)*stheta
u0[1] = sin(phi)*stheta
u0[2] = cos(theta)
return C_line_of_sight_projection(density, u0, min_distance, max_distance, shifter, integrator_id)
@cython.boundscheck(False)
def spherical_projection(int Nside,
npx.ndarray[DTYPE_t, ndim=3] density not None,
DTYPE_t min_distance,
DTYPE_t max_distance, int progress=1, int integrator_id=0, DTYPE_t[:] shifter = None, int booster=-1):
"""
spherical_projection(Nside, density, min_distance, max_distance, progress=1, integrator_id=0, shifter=None, booster=-1)
Keyword arguments:
progress (int): show progress if it is equal to 1
integrator_id (int): specify the order of integration along the line of sight
shifter (DTYPE_t array): this is an array of size 3. It specifies the amount of shift to apply to the center, in units of voxels
booster (int): how often the progress bar is refreshed. A small number decreases performance by locking the GIL more often.
Arguments:
Nside (int): Nside of the returned map
density (NxNxN array): this is the density field, expressed as a cubic array
min_distance (float): lower bound of the integration
max_distance (float): upper bound of the integration
Returns:
a HEALPix map, as a 1-dimensional array.
"""
import healpy as hp
import progressbar as pb
cdef int i
cdef DTYPE_t[:] theta,phi
cdef DTYPE_t[:,:,:] density_view
cdef DTYPE_t[:] outm
cdef int[:] job_done
cdef npx.ndarray[DTYPE_t, ndim=1] outm_array
cdef long N, N0
cdef double stheta
cdef int tid
if shifter is None:
shifter = view.array(shape=(3,), format=FORMAT_DTYPE, itemsize=sizeof(DTYPE_t))
shifter[:] = 0
print("allocating map")
outm_array = np.empty(hp.nside2npix(Nside),dtype=DTYPE)
print("initializing views")
outm = outm_array
density_view = density
print("progress?")
if progress != 0:
p = pb.ProgressBar(maxval=outm.size,widgets=[pb.Bar(), pb.ETA()]).start()
N = smp_get_max_threads()
N0 = outm.size
if booster < 0:
booster = 1#000
job_done = view.array(shape=(N,), format="i", itemsize=sizeof(int))
job_done[:] = 0
theta,phi = hp.pix2ang(Nside, np.arange(N0))
with nogil, parallel():
tid = smp_get_thread_id()
for i in prange(N0,schedule='dynamic',chunksize=256):
if progress != 0 and (i%booster) == 0:
with gil:
p.update(_mysum(job_done))
outm[i] = _spherical_projloop(theta[i], phi[i], density_view, min_distance, max_distance, shifter, integrator_id)
job_done[tid] += 1
if progress:
p.finish()
return outm_array
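# Usage sketch (hypothetical cube; healpy and progressbar must be installed): integrate a 128^3
# density field along each line of sight out to 60 grid units, onto an Nside=64 HEALPix map.
import numpy as np
dens = np.random.rand(128, 128, 128)
m = spherical_projection(64, dens, 0.0, 60.0, progress=0, integrator_id=1)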

external/cosmotool/python/copy.pxd vendored Normal file

@@ -0,0 +1,22 @@
cimport cython
cimport numpy as npx
ctypedef fused sum_type:
cython.int
cython.float
npx.uint64_t
npx.uint32_t
@cython.boundscheck(False)
cdef inline sum_type _mysum(sum_type[:] jobs) nogil:
cdef sum_type s
cdef npx.uint64_t N
cdef int i
s = 0
N = jobs.shape[0]
for i in xrange(N):
s += jobs[i]
return s


@@ -0,0 +1,23 @@
from ._cosmotool import *
from ._project import *
from ._cosmo_power import *
from ._cosmo_cic import *
from ._fast_interp import *
from .grafic import writeGrafic, writeWhitePhase, readGrafic, readWhitePhase
from .borg import read_borg_vol
from .cic import cicParticles
try:
import pyopencl
from .cl_cic import cl_CIC_Density
except:
print("No opencl support")
from .simu import loadRamsesAll, simpleWriteGadget, SimulationBare
from .timing import time_block, timeit, timeit_quiet
from .bispectrum import bispectrum, powerspectrum
from .smooth import smooth_particle_density
try:
from .fftw import CubeFT
except ImportError:
print("No FFTW support")


@@ -0,0 +1,127 @@
import numpy as np
try:
import cffi
import os
_ffi = cffi.FFI()
_ffi.cdef("""
void CosmoTool_compute_bispectrum(
double *delta_hat, size_t Nx, size_t Ny, size_t Nz,
size_t *Ntriangles,
double* B, double delta_k, size_t Nk ) ;
void CosmoTool_compute_powerspectrum(
double *delta_hat, size_t Nx, size_t Ny, size_t Nz,
size_t *Ncounts,
double* P, double delta_k, size_t Nk );
""");
_pathlib = os.path.dirname(os.path.abspath(__file__))
_lib = _ffi.dlopen(os.path.join(_pathlib,"_cosmo_bispectrum.so"))
except Exception as e:
print(repr(e))
raise RuntimeError("Failed to initialize _cosmo_bispectrum module")
def bispectrum(delta, delta_k, Nk, fourier=True):
"""bispectrum(delta, fourier=True)
Args:
* delta: a 3d density field, can be Fourier modes if fourier set to True
Return:
* A 3d array of the binned bispectrum
"""
if len(delta.shape) != 3:
raise ValueError("Invalid shape for delta")
try:
delta_k = float(delta_k)
Nk = int(Nk)
except:
raise ValueError()
if not fourier:
delta = np.fft.rfftn(delta)
N1,N2,N3 = delta.shape
rN3 = (N3-1)*2
delta_hat_buf = np.empty((N1*N2*N3*2),dtype=np.double)
delta_hat_buf[::2] = delta.real.ravel()
delta_hat_buf[1::2] = delta.imag.ravel()
size_size = _ffi.sizeof("size_t")
if size_size == 4:
triangle_buf = np.zeros((Nk,Nk,Nk),dtype=np.int32)
elif size_size == 8:
triangle_buf = np.zeros((Nk,Nk,Nk),dtype=np.int64)
else:
raise RuntimeError("Internal error, do not know how to map size_t")
B_buf = np.zeros((Nk*Nk*Nk*2), dtype=np.double)
_lib.CosmoTool_compute_bispectrum( \
_ffi.cast("double *", delta_hat_buf.ctypes.data), \
N1, N2, rN3, \
_ffi.cast("size_t *", triangle_buf.ctypes.data), \
_ffi.cast("double *", B_buf.ctypes.data), \
delta_k, \
Nk)
B_buf = B_buf.reshape((Nk,Nk,Nk,2))
return triangle_buf, B_buf[...,0]+1j*B_buf[...,1]
def powerspectrum(delta, delta_k, Nk, fourier=True):
"""powerspectrum(delta, fourier=True)
Args:
* delta: a 3d density field, can be Fourier modes if fourier set to True
Return:
* A 3d array of the binned bispectrum
"""
if len(delta.shape) != 3:
raise ValueError("Invalid shape for delta")
try:
delta_k = float(delta_k)
Nk = int(Nk)
except:
raise ValueError()
if not fourier:
delta = np.fft.rfftn(delta)
N1,N2,N3 = delta.shape
delta_hat_buf = np.empty((N1*N2*N3*2),dtype=np.double)
delta_hat_buf[::2] = delta.real.ravel()
delta_hat_buf[1::2] = delta.imag.ravel()
size_size = _ffi.sizeof("size_t")
if size_size == 4:
count_buf = np.zeros((Nk,),dtype=np.int32)
elif size_size == 8:
count_buf = np.zeros((Nk,),dtype=np.int64)
else:
raise RuntimeError("Internal error, do not know how to map size_t")
B_buf = np.zeros((Nk,), dtype=np.double)
_lib.CosmoTool_compute_powerspectrum( \
_ffi.cast("double *", delta_hat_buf.ctypes.data), \
N1, N2, N3, \
_ffi.cast("size_t *", count_buf.ctypes.data), \
_ffi.cast("double *", B_buf.ctypes.data), \
delta_k, \
Nk)
return count_buf, B_buf[...]
if __name__=="__main__":
delta=np.zeros((16,16,16))
delta[0,0,0]=1
delta[3,2,1]=1
b = powerspectrum(delta, 1, 16, fourier=False)
a = bispectrum(delta, 1, 16, fourier=False)
print(a[0].max())
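A hedged note on interpreting the outputs, not part of the file above: assuming the C kernel bins modes by |k| in steps of delta_k and accumulates per-bin sums, a per-mode average spectrum could be formed as follows (field and bin settings are arbitrary).

import numpy as np

delta = np.random.standard_normal((32, 32, 32))
delta_k, Nk = 0.1, 16
counts, pk = powerspectrum(delta, delta_k, Nk, fourier=False)

# Assumed convention: bin i collects modes with i*delta_k <= |k| < (i+1)*delta_k.
k_centers = (np.arange(Nk) + 0.5) * delta_k
pk_mean = np.where(counts > 0, pk / np.maximum(counts, 1), 0.0)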

View file

@ -0,0 +1,265 @@
###
### BORG code is from J. Jasche
###
import io
import numpy as np
from numpy import *
import os.path
import array
import glob
class BorgVolume(object):
def __init__(self, density, ranges):
self.density = density
self.ranges = ranges
def build_filelist(fdir):
#builds list of all borg density fields which may be distributed over several directories
fname_0=glob.glob(fdir[0]+'initial_density_*')
fname_1=glob.glob(fdir[0]+'final_density_*')
fdir=fdir[1:] #eliminate first element
for fd in fdir:
fname_0=fname_0+glob.glob(fd+'initial_density_*')
fname_1=fname_1+glob.glob(fd+'final_density_*')
return fname_0, fname_1
def read_borg_vol(BORGFILE):
""" Reading routine for BORG data
"""
openfile=open(BORGFILE,'rb')
period=0
N0=0
N1=0
N2=0
xmin=0
xmax=0
ymin=0
ymax=0
zmin=0
zmax=0
nlines=0
while True:
line=openfile.readline()
s=line.decode('latin1').rstrip('\n')
r=s.rsplit(' ')
if size(r)==5 :
if r[0] =="define":
if r[1]=="Lattice" :
N0=int(r[2])
N1=int(r[3])
N2=int(r[4])
if size(r)==11 :
if r[4] =="BoundingBox":
xmin=float(r[5])
xmax=float(r[6])
ymin=float(r[7])
ymax=float(r[8])
zmin=float(r[9])
zmax=float(r[10].rstrip(','))
if r[0]=='@1': break
ranges=[]
ranges.append(xmin)
ranges.append(xmax)
ranges.append(ymin)
ranges.append(ymax)
ranges.append(zmin)
ranges.append(zmax)
#now read data
data=np.fromfile(openfile, '>f4')
data=data.reshape(N2,N0,N1)
return BorgVolume(data,ranges)
def read_spec( fname ):
""" Reading routine for ARES spectrum samples
"""
x,y=np.loadtxt( fname ,usecols=(0,1),unpack=True)
return x , y
def read_bias_nmean( fname ):
""" Reading routine for ARES bias data
"""
x,b0,b1,nmean=np.loadtxt( fname ,usecols=(0,1,2,3),unpack=True)
return x , b0, b1, nmean
def read_nmean( fname ):
""" Reading routine for BORG bias data
"""
x,nmean=np.loadtxt( fname ,usecols=(0,1),unpack=True)
return x, nmean
def get_grid_values(xx,data, ranges):
""" return values at grid positions
"""
xmin=ranges[0]
xmax=ranges[1]
ymin=ranges[2]
ymax=ranges[3]
zmin=ranges[4]
zmax=ranges[5]
Lx= xmax-xmin
Ly= ymax-ymin
Lz= zmax-zmin
Nx=shape(data)[0]
Ny=shape(data)[1]
Nz=shape(data)[2]
dx=Lx/float(Nx)
dy=Ly/float(Ny)
dz=Lz/float(Nz)
idx=(xx[:,0]-xmin)/dx
idy=(xx[:,1]-ymin)/dy
idz=(xx[:,2]-zmin)/dz
idx=idx.astype(int)
idy=idy.astype(int)
idz=idz.astype(int)
idflag=np.where( (idx>-1)*(idx<Nx)*(idy>-1)*(idy<Ny)*(idz>-1)*(idz<Nz) )
flag=[False]*len(xx)
vals=[-999.]*len(xx)
flag=np.array(flag)
vals=np.array(vals)
flag[idflag]=True
vals[idflag]=data[idx[idflag],idy[idflag],idz[idflag]]
return vals,flag
def get_mean_density(fdir, smin, step):
""" estimate ensemble mean
"""
import progressbar as pb
print('-'*60)
print('Get 3D ensemble mean density field')
print('-'*60)
fname0 = fdir + 'initial_density_'+str(0)+'.dat'
fname1 = fdir + 'final_density_'+str(0)+'.dat'
borg0=read_borg_vol(fname0)
MEAN0,ranges=borg0.density,borg0.ranges
MEAN0=MEAN0*0.
VAR0=copy(MEAN0)
MEAN1=copy(MEAN0)
VAR1=copy(MEAN0)
norm=0.
idat=smin
fname0 = fdir + 'initial_density_'+str(idat)+'.dat'
fname1 = fdir + 'final_density_'+str(idat)+'.dat'
#and (idat<smin+1000)
while((os.path.exists(fname0))):
auxborg0=read_borg_vol(fname0)
auxborg1=read_borg_vol(fname1)
auxdata0,auxranges0=auxborg0.density,auxborg0.ranges
auxdata1,auxranges1=auxborg1.density,auxborg1.ranges
auxx0=auxdata0
auxx1=auxdata1
MEAN0+=auxx0
VAR0+=auxx0**2
MEAN1+=auxx1
VAR1+=auxx1**2
norm+=1
idat+=step
fname0 = fdir + 'initial_density_'+str(idat)+'.dat'
fname1 = fdir + 'final_density_'+str(idat)+'.dat'
del auxranges0
del auxdata0
del auxranges1
del auxdata1
MEAN0/=norm
VAR0/=norm
VAR0-=MEAN0**2
VAR0=sqrt(fabs(VAR0))
MEAN1/=norm
VAR1/=norm
VAR1-=MEAN1**2
VAR1=sqrt(fabs(VAR1))
return MEAN0,VAR0,MEAN1,VAR1,ranges
def get_mean_density_fdir(fdir,init,steps):
""" estimate ensemble mean
"""
import progressbar as pb
print('-'*60)
print('Get 3D ensemble mean density field')
print('-'*60)
fname0,fname1=build_filelist(fdir)
fname0=fname0[init::steps]
fname1=fname1[init::steps]
borg=read_borg_vol(fname0[0])
MEAN0 = borg.density
RANGES0 = borg.ranges
MEAN0=MEAN0*0.;
VAR0=copy(MEAN0)
MEAN1=copy(MEAN0)
VAR1=copy(MEAN0)
norm0=0.
norm1=0.
for fn in pb.ProgressBar(len(fname0))(fname0):
auxborg=read_borg_vol(fn)
auxdata0 = auxborg.density
MEAN0+=auxdata0
VAR0+=auxdata0**2.
norm0+=1.
del auxdata0
del auxborg
for fn in pb.ProgressBar(len(fname1))(fname1):
auxborg1=read_borg_vol(fn)
auxdata1 = auxborg1.density
MEAN1+=auxdata1
VAR1+=auxdata1**2.
norm1+=1.
del auxdata1
del auxborg1
MEAN0/=norm0
VAR0/=norm0
VAR0-=MEAN0**2
VAR0=sqrt(fabs(VAR0))
MEAN1/=norm1
VAR1/=norm1
VAR1-=MEAN1**2
VAR1=sqrt(fabs(VAR1))
return MEAN0,VAR0,MEAN1,VAR1,RANGES0
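A short usage sketch for the reading helpers above (the sample file name is hypothetical):

import numpy as np

vol = read_borg_vol("final_density_100.dat")
print(vol.density.shape, vol.ranges)

# Sample the field at a few positions expressed in bounding-box units.
xx = np.array([[10.0, 20.0, 30.0],
               [50.0, 50.0, 50.0]])
vals, flag = get_grid_values(xx, vol.density, vol.ranges)
print(vals[flag])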

View file

@ -0,0 +1,42 @@
import numexpr as ne
import numpy as np
def cicParticles(particles, L, N):
if not isinstance(N, int):
raise TypeError("N must be a numeric type")
def shifted(i, t):
a = np.empty(i[0].size, dtype=np.int64)
return ne.evaluate('(i2+t2)%N + N*((i1+t1)%N + N*((i0+t0)%N) )', local_dict={'i2':i[2], 't2':t[2], 'i1':i[1], 't1':t[1], 'i0':i[0], 't0':t[0], 'N':N}, out=a)
i =[]
r = []
for d in range(3):
q = ne.evaluate('(p%L)*N/L', local_dict={'p':particles[d], 'L':L, 'N':N })
o = np.empty(q.size, dtype=np.int64)
o[:] = np.floor(q)
i.append(o)
r.append(ne.evaluate('q-o'))
D = {'a':r[0],'b':r[1],'c':r[2]}
N3 = N*N*N
def accum(density, ss, op):
d0 = np.bincount(shifted(i, ss), weights=ne.evaluate(op, local_dict=D), minlength=N3)
ne.evaluate('d + d0', local_dict={'d':density, 'd0':d0}, out=density)
density = np.empty(N3, dtype=np.float64)
accum(density, (1,1,1), 'a * b * c ')
accum(density, (1,1,0), 'a * b * (1-c)')
accum(density, (1,0,1), 'a * (1-b) * c ')
accum(density, (1,0,0), 'a * (1-b) * (1-c)')
accum(density, (0,1,1), '(1-a) * b * c ')
accum(density, (0,1,0), '(1-a) * b * (1-c)')
accum(density, (0,0,1), '(1-a) * (1-b) * c ')
accum(density, (0,0,0), '(1-a) * (1-b) * (1-c)')
return density.reshape((N,N,N))
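A quick sanity check of the CIC deposit, as a sketch only: each particle carries unit weight, so the deposited grid should sum to the particle count up to round-off.

import numpy as np

L, N, npart = 100.0, 64, 100000
pos = [np.random.uniform(0, L, size=npart) for _ in range(3)]   # x, y, z arrays

rho = cicParticles(pos, L, N)
print(rho.shape, rho.sum(), npart)   # (64, 64, 64), ~100000, 100000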

View file

@ -0,0 +1,239 @@
from .timing import time_block as time_block_orig
import numpy as np
import pyopencl as cl
import pyopencl.array as cl_array
from contextlib import contextmanager
TIMER_ACTIVE=False
@contextmanager
def time_block_dummy(*args):
yield
if TIMER_ACTIVE:
time_block=time_block_orig
else:
time_block=time_block_dummy
CIC_PREKERNEL='''
#define NDIM {ndim}
#define CENTERED {centered}
typedef {cicType} BASIC_TYPE;
'''
CIC_KERNEL='''///CL///
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
__kernel void init_pcell(__global int *p_cell, const int value)
{
int i = get_global_id(0);
p_cell[i] = value;
}
__kernel void build_indices(__global const BASIC_TYPE *pos,
__global int *part_mesh, __global int *part_list, const int N, const BASIC_TYPE delta, const BASIC_TYPE shift_pos)
{
int i_part = get_global_id(0);
long shifter = 1;
long idx = 0;
int d;
for (d = 0; d < NDIM; d++) {
BASIC_TYPE x;
if (CENTERED)
x = pos[i_part*NDIM + d] - shift_pos;
else
x = pos[i_part*NDIM + d];
int m = (int)floor(x*delta) %% N;
idx += shifter * m;
shifter *= N;
}
// Head of the list
int initial_elt = atom_xchg(&part_mesh[idx], i_part);
if (initial_elt == -1) {
return;
}
// Point the next pointer of old_end to i_part
part_list[i_part] = initial_elt;
}
__kernel void reverse_list(__global int *part_mesh, __global int *part_list)
{
int mid = get_global_id(0);
int current_part = part_mesh[mid];
if (current_part >= 0) {
int next_part = part_list[current_part];
part_list[current_part] = -1;
while (next_part != -1) {
int p = part_list[next_part];
part_list[next_part] = current_part;
current_part = next_part;
next_part = p;
}
part_mesh[mid] = current_part;
}
}
__kernel void dance(__global const BASIC_TYPE *pos,
__global BASIC_TYPE *density,
__global int *part_mesh, __global int *part_list, const int N, const BASIC_TYPE delta, const BASIC_TYPE shift_pos)
{
int m[NDIM];
int shifter = 1;
int i;
int first, i_part;
int idx = 0;
for (i = 0; i < NDIM; i++) {
m[i] = get_global_id(i);
idx += shifter * m[i];
shifter *= N;
}
first = 1;
//BEGIN LOOPER
%(looperFor)s
//END LOOPER
int idx_dance = 0;
BASIC_TYPE w = 0;
//LOOPER INDEX
int r[NDIM] = { %(looperVariables)s };
//END LOOPER
i_part = part_mesh[idx];
while (i_part != -1) {
BASIC_TYPE w0 = 1;
for (int d = 0; d < NDIM; d++) {
BASIC_TYPE x;
BASIC_TYPE q;
BASIC_TYPE dx;
if (CENTERED)
x = pos[i_part*NDIM + d]*delta - shift_pos;
else
x = pos[i_part*NDIM + d]*delta;
q = floor(x);
dx = x - q;
w0 *= (r[d] == 1) ? dx : ((BASIC_TYPE)1-dx);
}
i_part = part_list[i_part];
w += w0;
}
shifter = 1;
for (i = 0; i < NDIM; i++) {
idx_dance += shifter * ((m[i]+r[i])%%N);
shifter *= N;
}
density[idx_dance] += w;
// One dance done. Wait for everybody for the next iteration
barrier(CLK_GLOBAL_MEM_FENCE);
%(looperForEnd)s
}
'''
class CIC_CL(object):
def __init__(self, context, ndim=2, ktype=np.float32, centered=False):
global CIC_PREKERNEL, CIC_KERNEL
translator = {}
if ktype == np.float32:
translator['cicType'] = 'float'
pragmas = ''
elif ktype == np.float64:
translator['cicType'] = 'double'
pragmas = '#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n'
else:
raise ValueError("Invalid ktype")
# 2 dimensions
translator['ndim'] = ndim
translator['centered'] = '1' if centered else '0'
looperVariables = ','.join(['id%d' % d for d in range(ndim)])
looperFor = '\n'.join(['for (int id{dim}=0; id{dim} < 2; id{dim}++) {{'.format(dim=d) for d in range(ndim)])
looperForEnd = '}' * ndim
kern = pragmas + CIC_PREKERNEL.format(**translator) + (CIC_KERNEL % {'looperVariables': looperVariables, 'looperFor': looperFor, 'looperForEnd':looperForEnd})
self.kern_code = kern
self.ctx = context
self.queue = cl.CommandQueue(context)#, properties=cl.OUT_OF_ORDER_EXEC_MODE_ENABLE)
self.ktype = ktype
self.ndim = ndim
self.prog = cl.Program(self.ctx, kern).build()
self.centered = centered
def run(self, particles, Ng, L):
assert particles.strides[1] == self.ktype().itemsize # This is C-ordering
assert particles.shape[1] == self.ndim
print("Start again")
ndim = self.ndim
part_pos = cl_array.to_device(self.queue, particles)
part_mesh = cl_array.empty(self.queue, (Ng,)*ndim, np.int32, order='C')
density = cl_array.zeros(self.queue, (Ng,)*ndim, self.ktype, order='C')
part_list = cl_array.empty(self.queue, (particles.shape[0],), np.int32, order='C')
shift_pos = 0.5*L if self.centered else 0
if True:
delta = Ng/L
with time_block("Init pcell array"):
e = self.prog.init_pcell(self.queue, (Ng**ndim,), None, part_mesh.data, np.int32(-1))
e.wait()
with time_block("Init idx array"):
e=self.prog.init_pcell(self.queue, (particles.shape[0],), None, part_list.data, np.int32(-1))
e.wait()
with time_block("Build indices"):
self.prog.build_indices(self.queue, (particles.shape[0],), None,
part_pos.data, part_mesh.data, part_list.data, np.int32(Ng), self.ktype(delta), self.ktype(shift_pos))
if True:
with time_block("Reverse list"):
lastevt = self.prog.reverse_list(self.queue, (Ng**ndim,), None, part_mesh.data, part_list.data)
# Particles are now chained per mesh cell; run the CIC accumulation (dance) kernel
with time_block("dance"):
self.prog.dance(self.queue, (Ng,)*ndim, None, part_pos.data, density.data, part_mesh.data, part_list.data, np.int32(Ng), self.ktype(delta), self.ktype(shift_pos))
self.queue.finish()
del part_pos
del part_mesh
del part_list
with time_block("download"):
return density.get()
def cl_CIC_Density(particles, Ngrid, Lbox, context=None, periodic=True, centered=False):
"""
cl_CIC_Density(particles (Nx3), Ngrid, Lbox, context=None, periodic=True, centered=False)
"""
if context is None:
context = cl.create_some_context()
ktype = particles.dtype
if ktype != np.float32 and ktype != np.float64:
raise ValueError("particles may only be float32 or float64")
if len(particles.shape) != 2 or particles.shape[1] != 3:
raise ValueError("particles may only be a Nx3 array")
cic = CIC_CL(context, ndim=3, centered=centered, ktype=ktype)
return cic.run(particles, Ngrid, Lbox)
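A hedged usage sketch; it needs a working OpenCL platform and float32/float64 positions of shape (N, 3):

import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
particles = np.random.uniform(0.0, 100.0, size=(100000, 3)).astype(np.float32)

rho = cl_CIC_Density(particles, 128, 100.0, context=ctx)
print(rho.shape)   # (128, 128, 128)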

View file

@ -0,0 +1 @@
install_prefix="@CMAKE_INSTALL_PREFIX@"

View file

@ -0,0 +1,211 @@
import numpy as np
from contextlib import contextmanager
class ProgrammableParticleLoad(object):
@staticmethod
def main_script(source, particles, aname="default", aux=None):
import vtk
from vtk.util import numpy_support as ns
out = source.GetOutput()
vv = vtk.vtkPoints()
assert len(particles.shape) == 2
assert particles.shape[1] == 3
vv.SetData(ns.numpy_to_vtk(np.ascontiguousarray(particles.astype(np.float64)), deep=1))
vv.SetDataTypeToDouble()
out.Allocate(1,1)
out.SetPoints(vv)
if aux is not None:
for n,a in aux:
a_vtk = ns.numpy_to_vtk(
np.ascontiguousarray(a.astype(np.float64)
),
deep=1)
a_vtk.SetName(n)
out.GetPointData().AddArray(a_vtk)
out.InsertNextCell(vtk.VTK_VERTEX, particles.shape[0], range(particles.shape[0]))
@staticmethod
def request_information(source):
pass
class ProgrammableParticleHistoryLoad(object):
@staticmethod
def main_script(source, particles, velocities=None, aname="default",addtime=False):
import vtk
from vtk.util import numpy_support as ns
out = source.GetOutput()
vv = vtk.vtkPoints()
assert len(particles.shape) == 3
assert particles.shape[2] == 3
if not velocities is None:
for i,j in zip(velocities.shape,particles.shape):
assert i==j
Ntime,Npart,_ = particles.shape
vv.SetData(ns.numpy_to_vtk(np.ascontiguousarray(particles.reshape((Ntime*Npart,3)).astype(np.float64)), deep=1))
vv.SetDataTypeToDouble()
out.Allocate(1,1)
out.SetPoints(vv)
if not velocities is None:
print("Adding velocities")
vel_vtk = ns.numpy_to_vtk(np.ascontiguousarray(velocities.reshape((Ntime*Npart,3)).astype(np.float64)), deep=1)
vel_vtk.SetName("velocities")
out.GetPointData().AddArray(vel_vtk)
if addtime:
timearray = np.arange(Ntime)[:,None].repeat(Npart, axis=1).reshape(Ntime*Npart)
timearray = ns.numpy_to_vtk(np.ascontiguousarray(timearray.astype(np.float64)), deep=1)
timearray.SetName("timearray")
out.GetPointData().AddArray(timearray)
out.InsertNextCell(vtk.VTK_VERTEX, particles.shape[0], range(particles.shape[0]))
for p in range(Npart):
out.InsertNextCell(vtk.VTK_LINE, Ntime, range(p, p + Npart*Ntime, Npart) )
@staticmethod
def request_information(source):
pass
class ProgrammableDensityLoad(object):
@staticmethod
def main_script(source, density, extents=None, aname="default"):
import vtk
from vtk.util import numpy_support
if len(density.shape) > 3:
_, Nx, Ny, Nz = density.shape
else:
Nx, Ny, Nz = density.shape
ido = source.GetOutput()
ido.SetDimensions(Nx, Ny, Nz)
if not extents is None:
origin = extents[:6:2]
spacing = (extents[1]-extents[0])/Nx, (extents[3]-extents[2])/Ny, (extents[5]-extents[4])/Nz
else:
origin = (-1, -1, -1)
spacing = 2.0 / Nx, 2.0/Ny, 2.0/Nz
ido.SetOrigin(*origin)
ido.SetSpacing(*spacing)
ido.SetExtent([0,Nx-1,0,Ny-1,0,Nz-1])
if len(density.shape) > 3 and density.shape[0] == 3:
N = Nx*Ny*Nz
density = density.transpose().astype(np.float64).reshape((N,3))
arr = numpy_support.numpy_to_vtk(density, deep=1)
else:
arr = numpy_support.numpy_to_vtk(density.transpose().astype(np.float64).ravel(), deep=1)
arr.SetName(aname)
ido.GetPointData().AddArray(arr)
@staticmethod
def request_information(source, density=None, dims=None):
import vtk
Nx = Ny = Nz = None
if not density is None:
Nx, Ny, Nz = density.shape
elif not dims is None:
Nx, Ny, Nz = dims
else:
raise ValueError("Need at least a density or dims")
source.GetExecutive().GetOutputInformation(0).Set(vtk.vtkStreamingDemandDrivenPipeline.WHOLE_EXTENT(), 0, Nx-1, 0, Ny-1, 0, Nz-1)
@staticmethod
def prepare_timesteps_info(algorithm, timesteps):
def SetOutputTimesteps(algorithm, timesteps):
executive = algorithm.GetExecutive()
outInfo = executive.GetOutputInformation(0)
outInfo.Remove(executive.TIME_STEPS())
for timestep in timesteps:
outInfo.Append(executive.TIME_STEPS(), timestep)
outInfo.Remove(executive.TIME_RANGE())
outInfo.Append(executive.TIME_RANGE(), timesteps[0])
outInfo.Append(executive.TIME_RANGE(), timesteps[-1])
SetOutputTimesteps(algorithm, timesteps)
@staticmethod
@contextmanager
def get_timestep(algorithm):
def GetUpdateTimestep(algorithm):
"""Returns the requested time value, or None if not present"""
executive = algorithm.GetExecutive()
outInfo = executive.GetOutputInformation(0)
if not outInfo.Has(executive.UPDATE_TIME_STEP()):
return None
return outInfo.Get(executive.UPDATE_TIME_STEP())
# This is the requested time-step. This may not be exactly equal to the
# timesteps published in RequestInformation(). Your code must handle that
# correctly
req_time = GetUpdateTimestep(algorithm)
output = algorithm.GetOutput()
yield req_time
# Now mark the timestep produced.
output.GetInformation().Set(output.DATA_TIME_STEP(), req_time)
def load_borg(pdo, restart_name, mcmc_name, info=False, aname="BORG"):
import h5py as h5
with h5.File(restart_name) as f:
N0 = f["/info/scalars/N0"][:]
N1 = f["/info/scalars/N1"][:]
N2 = f["/info/scalars/N2"][:]
L0 = f["/info/scalars/L0"][:]
L1 = f["/info/scalars/L1"][:]
L2 = f["/info/scalars/L2"][:]
c0 = f["/info/scalars/corner0"][:]
c1 = f["/info/scalars/corner1"][:]
c2 = f["/info/scalars/corner2"][:]
if not info:
with h5.File(mcmc_name) as f:
d = f["/scalars/BORG_final_density"][:]+1
ProgrammableDensityLoad.main_script(pdo, d, extents=[c0,c0+L0,c1,c1+L1,c2,c2+L2], aname=aname)
else:
ProgrammableDensityLoad.request_information(pdo, dims=[N0,N1,N2])
def load_borg_galaxies(pdo, restart_name, cid=0, info=False, aname="Galaxies"):
import h5py as h5
with h5.File(restart_name) as f:
gals = f['/info/galaxy_catalog_%d/galaxies' % cid]
ra = gals['phi'][:]
dec = gals['theta'][:]
r = gals['r'][:]
if not info:
x = r * np.cos(ra)*np.cos(dec)
y = r * np.sin(ra)*np.cos(dec)
z = r * np.sin(dec)
parts = np.array([x,y,z]).transpose()
ProgrammableParticleLoad.main_script(pdo, parts)
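These loaders are meant to run inside ParaView programmable sources; as a rough, hypothetical sketch of a Programmable Source script body (ParaView supplies self, and an unstructured-grid output type is assumed):

import numpy as np

parts = np.random.uniform(-1.0, 1.0, size=(1000, 3))
ProgrammableParticleLoad.main_script(self, parts,
                                     aux=[("mass", np.ones(len(parts)))])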

View file

@ -0,0 +1,45 @@
import pyfftw
import multiprocessing
import numpy as np
import numexpr as ne
class CubeFT(object):
def __init__(self, L, N, max_cpu=-1, width=32):
if width==32:
fourier_type='complex64'
real_type='float32'
elif width==64:
fourier_type='complex128'
real_type='float64'
else:
raise ValueError("Invalid bitwidth (must be 32 or 64)")
self.N = N
self.align = pyfftw.simd_alignment
self.L = float(L)
self.max_cpu = multiprocessing.cpu_count() if max_cpu < 0 else max_cpu
self._dhat = pyfftw.n_byte_align_empty((self.N,self.N,self.N//2+1), self.align, dtype=fourier_type)
self._density = pyfftw.n_byte_align_empty((self.N,self.N,self.N), self.align, dtype=real_type)
self._irfft = pyfftw.FFTW(self._dhat, self._density, axes=(0,1,2), direction='FFTW_BACKWARD', threads=self.max_cpu)#, normalize_idft=False)
self._rfft = pyfftw.FFTW(self._density, self._dhat, axes=(0,1,2), threads=self.max_cpu) #, normalize_idft=False)
def rfft(self):
return ne.evaluate('c*a', out=self._dhat, local_dict={'c':self._rfft(normalise_idft=False),'a':(self.L/self.N)**3}, casting='unsafe')
def irfft(self):
return ne.evaluate('c*a', out=self._density, local_dict={'c':self._irfft(normalise_idft=False),'a':(1/self.L)**3}, casting='unsafe')
def get_dhat(self):
return self._dhat
def set_dhat(self, in_dhat):
self._dhat[:] = in_dhat
dhat = property(get_dhat, set_dhat, None)
def get_density(self):
return self._density
def set_density(self, d):
self._density[:] = d
density = property(get_density, set_density, None)
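A hedged round-trip sketch for CubeFT: with the (L/N)**3 forward and (1/L)**3 inverse scalings used above, transforming forward and back should recover the original field up to round-off.

import numpy as np

cube = CubeFT(L=100.0, N=32)
original = np.random.standard_normal((32, 32, 32)).astype(np.float32)
cube.density = original

cube.dhat = cube.rfft()    # forward transform, scaled by (L/N)**3
rho = cube.irfft()         # inverse transform, scaled by (1/L)**3
print(np.allclose(rho, original))   # expected to be True up to round-off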

View file

@ -0,0 +1,108 @@
import struct
import numpy as np
def readGrafic(filename):
"""This function reads a grafic file.
Arguments:
filename (str): the path to the grafic file
Returns:
a tuple containing:
* the array held in the grafic file
* the size of the box
* the scale factor
* the mean matter density :math:`\Omega_\mathrm{m}`
* the dark energy density :math:`\Omega_\Lambda`
* the hubble constant, relative to 100 km/s/Mpc
* xoffset
* yoffset
* zoffset
"""
with open(filename, mode="rb") as f:
p = struct.unpack("IIIIffffffffI", f.read(4*11 + 2*4))
checkPoint0, Nx, Ny, Nz, delta, xoff, yoff, zoff, scalefac, omega0, omegalambda0, h, checkPoint1 = p
if checkPoint0 != checkPoint1 or checkPoint0 != 4*11:
raise ValueError("Invalid unformatted access")
a = np.empty((Nx,Ny,Nz), dtype=np.float32)
BoxSize = delta * Nx * h
xoff *= h
yoff *= h
zoff *= h
checkPoint = 4*Ny*Nz
for i in range(Nx):
checkPoint = struct.unpack("I", f.read(4))[0]
if checkPoint != 4*Ny*Nz:
raise ValueError("Invalid unformatted access")
a[i, :, :] = np.fromfile(f, dtype=np.float32, count=Ny*Nz).reshape((Ny, Nz))
checkPoint = struct.unpack("I", f.read(4))[0]
if checkPoint != 4*Ny*Nz:
raise ValueError("Invalid unformatted access")
return a, BoxSize, scalefac, omega0, omegalambda0, h, xoff, yoff,zoff
def writeGrafic(filename, field, BoxSize, scalefac, **cosmo):
with open(filename, mode="wb") as f:
checkPoint = 4*11
Nx,Ny,Nz = field.shape
delta = BoxSize/Nx/cosmo['h']
bad = 0.0
f.write(struct.pack("IIIIffffffffI", checkPoint,
Nx, Ny, Nz,
delta,
bad, bad, bad,
scalefac,
cosmo['omega_M_0'], cosmo['omega_lambda_0'], 100*cosmo['h'], checkPoint))
checkPoint = 4*Ny*Nz
field = field.reshape(field.shape, order='F')
for i in range(Nx):
f.write(struct.pack("I", checkPoint))
f.write(field[i].astype(np.float32).tostring())
f.write(struct.pack("I", checkPoint))
def writeWhitePhase(filename, field):
with open(filename, mode="wb") as f:
Nx,Ny,Nz = field.shape
checkPoint = 4*4
f.write(struct.pack("IIIIII", checkPoint, Nx, Ny, Nz, 0, checkPoint))
field = field.reshape(field.shape, order='F')
checkPoint = struct.pack("I", 4*Ny*Nz)
for i in range(Nx):
f.write(checkPoint)
f.write(field[i].astype(np.float32).tostring())
f.write(checkPoint)
def readWhitePhase(filename):
with open(filename, mode="rb") as f:
_, Nx, Ny, Nz, _, _ = struct.unpack("IIIIII", f.read(4*4+2*4))
a = np.empty((Nx,Ny,Nz), dtype=np.float32)
checkPoint_ref = 4*Ny*Nz
for i in range(Nx):
if struct.unpack("I", f.read(4))[0] != checkPoint_ref:
raise ValueError("Invalid unformatted access")
b = np.fromfile(f, dtype=np.float32, count=Ny*Nz).reshape((Ny, Nz))
if i==0:
print(b)
a[i, : ,:] = b
if struct.unpack("I", f.read(4))[0] != checkPoint_ref:
raise ValueError("Invalid unformatted access")
return a
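A write/read round-trip sketch for the grafic helpers (the file name is hypothetical, and the cosmology keywords follow the names expected by writeGrafic):

import numpy as np

field = np.random.standard_normal((32, 32, 32)).astype(np.float32)
writeGrafic("ic_deltab", field, 100.0, 0.01,
            omega_M_0=0.30, omega_lambda_0=0.70, h=0.68)

a, BoxSize, scalefac, om, ol, h, xoff, yoff, zoff = readGrafic("ic_deltab")
print(a.shape, scalefac, om, ol)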

View file

@ -0,0 +1,154 @@
import warnings
import numpy as np
from ._cosmotool import *
class SimulationBare(PySimulationBase):
def __init__(self, *args):
if len(args) == 0:
return
if not isinstance(args[0], PySimulationBase):
raise TypeError("Simulation object to mirror must be a PySimulationBase")
s = args[0]
self.positions = [q.copy() for q in s.getPositions()] if s.getPositions() is not None else None
self.velocities = [q.copy() for q in s.getVelocities()] if s.getVelocities() is not None else None
self.identifiers = s.getIdentifiers().copy() if s.getIdentifiers() is not None else None
self.types = s.getTypes().copy() if s.getTypes() is not None else None
self.boxsize = s.getBoxsize()
self.time = s.getTime()
self.Hubble = s.getHubble()
self.Omega_M = s.getOmega_M()
self.Omega_Lambda = s.getOmega_Lambda()
try:
self.masses = s.getMasses().copy() if s.getMasses() is not None else None
except Exception as e:
warnings.warn("Unexpected exception: " + repr(e))
def merge(self, other):
def _safe_merge(a, b):
if b is not None:
if a is not None:
a = [np.append(q, r) for q,r in zip(a,b)]
else:
a = b
return a
def _safe_merge0(a, b):
if b is not None:
if a is not None:
a = np.append(a, b)
else:
a = b
return a
assert self.time == other.getTime()
assert self.Hubble == other.getHubble()
assert self.boxsize == other.getBoxsize()
assert self.Omega_M == other.getOmega_M()
assert self.Omega_Lambda == other.getOmega_Lambda()
self.positions = _safe_merge(self.positions, other.getPositions())
self.velocities = _safe_merge(self.velocities, other.getVelocities())
self.identifiers = _safe_merge0(self.identifiers, other.getIdentifiers())
self.types = _safe_merge0(self.types, other.getTypes())
try:
self.masses = _safe_merge0(self.masses, other.getMasses())
except Exception as e:
warnings.warn("Unexpected exception: " + repr(e));
self.masses = None
def getTypes(self):
return self.types
def getPositions(self):
return self.positions
def getVelocities(self):
return self.velocities
def getIdentifiers(self):
return self.identifiers
def getMasses(self):
return self.masses
def getTime(self):
return self.time
def getHubble(self):
return self.Hubble
def getBoxsize(self):
return self.boxsize
def getOmega_M(self):
return self.Omega_M
def getOmega_Lambda(self):
return self.Omega_Lambda
def simpleWriteGadget(filename, positions, boxsize=1.0, Hubble=100, Omega_M=0.30, time=1, velocities=None, identifiers=None):
s = SimulationBare()
s.positions = positions
if velocities:
s.velocities = velocities
else:
s.velocities = [np.zeros(positions[0].size,dtype=np.float32)]*3
if identifiers:
s.identifiers = identifiers
else:
s.identifiers = np.arange(positions[0].size, dtype=np.int64)
s.Hubble = Hubble
s.time = time
s.Omega_M = Omega_M
s.Omega_Lambda = 1-Omega_M
s.boxsize = boxsize
writeGadget(filename, s)
def loadRamsesAll(basepath, snapshot_id, **kwargs):
"""This function loads an entire ramses snapshot in memory. The keyword arguments are the one accepted
by cosmotool.loadRamses
Args:
basepath (str): The base path of the snapshot (i.e. the directory holding the output_XXXXX directories)
snapshot_id (int): The identifier of the snapshot to load.
Keyword args:
verboseMulti (bool): If true, print some progress information on loading multiple files
See Also:
cosmotool.loadRamses
"""
cpu_id = 0
output = None
verbose = kwargs.get('verboseMulti',False)
new_kw = dict(kwargs)
if 'verboseMulti' in new_kw:
del new_kw['verboseMulti']
while True:
base = "%s/output_%05d" % (basepath,snapshot_id)
if verbose:
print("Loading sub-snapshot %s (cpu_id=%d)" % (base,cpu_id))
s = loadRamses(base, snapshot_id, cpu_id, **new_kw)
if s is None:
break
if output is None:
output = SimulationBare(s)
else:
output.merge(s)
cpu_id += 1
return output
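A hedged sketch of the two entry points above (paths are hypothetical; writeGadget and loadRamses are provided by the compiled _cosmotool module):

import numpy as np

npart = 1000
pos = [np.random.uniform(0, 100.0, size=npart).astype(np.float32) for _ in range(3)]
simpleWriteGadget("toy_snapshot", pos, boxsize=100.0, Hubble=70.0, Omega_M=0.30)

# Merge every per-CPU file of output_00002 under /data/run into one SimulationBare.
sim = loadRamsesAll("/data/run", 2, verboseMulti=True)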

View file

@ -0,0 +1,109 @@
from .config import install_prefix
import subprocess
import os
try:
from tempfile import TemporaryDirectory
except ImportError:
from backports.tempfile import TemporaryDirectory
import h5py as h5
import numpy as np
import weakref
def smooth_particle_density(
position,
velocities=None,
radius=1e6,
boxsize=None,
resolution=128,
center=None, tmpprefix=None ):
"""Use adaptive smoothing to produce density and momentum fields.
The algorithm is originally described in [1].
Parameters:
position : numpy array NxQ
the particle positions
if Q==3, only positions; Q==6 means full phase-space (positions then velocities)
velocities : Optional numpy array Nx3.
It is only optional if the above Q is 6.
radius : float
Maximum radius to which we need to compute fields
boxsize : float
Size of the box for the generated fields
resolution : int
Resolution of the output boxes
center : list of 3 floats
Center of the new box. It depends on the convention
for particles. If those are between [0, L], then [0,0,0]
is correct. If those are [-L/2,L/2] then you should set
[L/2,L/2,L/2].
tmpprefix : string
prefix of the temporary directory that will be used.
It needs to have a lot of space available. By default
'/tmp/' will typically be used.
Returns
-------
dictionary
The dict has two entries: 'rho' for the density, and 'p' for the momenta.
Once the dictionary is garbage collected all temporary files and directories
will be cleared automatically.
Raises
------
ValueError
if arguments are invalid
.. [1] S. Colombi, M. Chodorowski,
"Cosmic velocity-gravity in redshift space", MNRAS, 2007, 375, 1
"""
if len(position.shape) != 2:
raise ValueError("Invalid position array shape")
if velocities is None:
if position.shape[1] != 6:
raise ValueError("Position must be phase space if no velocities are given")
if boxsize is None:
raise ValueError("Need a boxsize")
cx,cy,cz=center
tmpdir = TemporaryDirectory(prefix=tmpprefix)
h5_file = os.path.join(tmpdir.name, 'particles.h5')
with h5.File(h5_file, mode="w") as f:
data = f.create_dataset('particles', shape=(position.shape[0],7), dtype=np.float32)
data[:,:3] = position[:,:3]
if velocities is not None:
data[:,3:6] = velocities[:,:3]
else:
data[:,3:6] = position[:,3:]
data[:,6] = 1
ret = \
subprocess.run([
os.path.join(install_prefix,'bin','simple3DFilter'),
h5_file,
str(radius),
str(boxsize),
str(resolution),
str(cx), str(cy), str(cz)
], cwd=tmpdir.name)
f0 = h5.File(os.path.join(tmpdir.name,'fields.h5'), mode="r")
def cleanup_f0():
f0.close()
tmpdir.cleanup()
class Dict(dict):
pass
t = Dict(rho=f0['density'], p=[f0['p0'], f0['p1'], f0['p2']])
t._tmpdir_=tmpdir
weakref.finalize(t, cleanup_f0)
return t
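A hedged call sketch with phase-space input packed as Nx6 (positions then velocities); it requires the simple3DFilter binary installed under the configured install prefix:

import numpy as np

N, L = 100000, 100.0
phase = np.concatenate([np.random.uniform(0, L, size=(N, 3)),
                        np.random.normal(0, 300.0, size=(N, 3))], axis=1)

fields = smooth_particle_density(phase, radius=L, boxsize=L,
                                 resolution=64, center=[0., 0., 0.])
rho = np.array(fields['rho'])            # density grid
p = [np.array(q) for q in fields['p']]   # momentum components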

View file

@ -0,0 +1,53 @@
import time
from contextlib import contextmanager
@contextmanager
def time_block(name):
"""
This context manager measures the time taken by a step and prints the result
in seconds to the console.
Arguments:
name (str): prefix to print
"""
ts = time.time()
yield
te = time.time()
print('%s %2.2f sec' % (name, te-ts))
def timeit(method):
"""This decorator add a timing request for each call to the decorated function.
Arguments:
method (function): the method to decorate
"""
def timed(*args, **kw):
ts = time.time()
result = method(*args, **kw)
te = time.time()
print('%r (%r, %r) %2.2f sec' % (method.__name__, args, kw, te-ts))
return result
return timed
def timeit_quiet(method):
"""This decorator add a timing request for each call to the decorated function.
Same as cosmotool.timeit_ but is quieter by not printing the values of the arguments.
Arguments:
method (function): the method to decorate
"""
def timed(*args, **kw):
ts = time.time()
result = method(*args, **kw)
te = time.time()
print('%r %2.2f sec' % (method.__name__, te-ts))
return result
return timed
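A short combined example of the two styles of instrumentation defined above:

import numpy as np

@timeit_quiet
def fft_work(n):
    return np.fft.fftn(np.random.standard_normal((n, n, n)))

with time_block("two transforms"):
    fft_work(64)
    fft_work(64)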

45
external/cosmotool/python/cppHelper.hpp vendored Normal file
View file

@ -0,0 +1,45 @@
#include "config.hpp"
#include "fortran.hpp"
static void customCosmotoolHandler()
{
try {
if (PyErr_Occurred())
;
else
throw;
} catch (const CosmoTool::InvalidArgumentException& e) {
PyErr_SetString(PyExc_ValueError, "Invalid argument");
} catch (const CosmoTool::NoSuchFileException& e) {
PyErr_SetString(PyExc_IOError, "No such file");
} catch (const CosmoTool::InvalidUnformattedAccess& e) {
PyErr_SetString(PyExc_RuntimeError, "Invalid fortran unformatted access");
} catch (const CosmoTool::Exception& e) {
PyErr_SetString(PyExc_RuntimeError, e.what());
} catch (const std::bad_alloc& exn) {
PyErr_SetString(PyExc_MemoryError, exn.what());
} catch (const std::bad_cast& exn) {
PyErr_SetString(PyExc_TypeError, exn.what());
} catch (const std::domain_error& exn) {
PyErr_SetString(PyExc_ValueError, exn.what());
} catch (const std::invalid_argument& exn) {
PyErr_SetString(PyExc_ValueError, exn.what());
} catch (const std::ios_base::failure& exn) {
// Unfortunately, in standard C++ we have no way of distinguishing EOF
// from other errors here; be careful with the exception mask
PyErr_SetString(PyExc_IOError, exn.what());
} catch (const std::out_of_range& exn) {
// Change out_of_range to IndexError
PyErr_SetString(PyExc_IndexError, exn.what());
} catch (const std::overflow_error& exn) {
PyErr_SetString(PyExc_OverflowError, exn.what());
} catch (const std::range_error& exn) {
PyErr_SetString(PyExc_ArithmeticError, exn.what());
} catch (const std::underflow_error& exn) {
PyErr_SetString(PyExc_ArithmeticError, exn.what());
} catch (const std::exception& exn) {
PyErr_SetString(PyExc_RuntimeError, exn.what());
} catch(...) {
PyErr_SetString(PyExc_RuntimeError, "Unknown exception");
}
}

View file

@ -0,0 +1,120 @@
// Only in 3d
template<typename T, typename ProdType>
static T project_tool(T *vertex_value, T *u, T *u0)
{
T ret0 = 0;
for (unsigned int i = 0; i < 8; i++)
{
unsigned int c[3] = { i & 1, (i>>1)&1, (i>>2)&1 };
int epsilon[3];
T ret = 0;
for (int q = 0; q < 3; q++)
epsilon[q] = 2*c[q]-1;
for (int q = 0; q < ProdType::numProducts; q++)
ret += ProdType::product(u, u0, epsilon, q);
ret *= vertex_value[i];
ret0 += ret;
}
return ret0;
}
template<typename T>
static inline T get_u0(const T& u0, int epsilon)
{
return (1-epsilon)/2 + epsilon*u0;
// return (epsilon > 0) ? u0 : (1-u0);
}
template<typename T>
struct ProductTerm0
{
static const int numProducts = 1;
static inline T product(T *u, T *u0, int *epsilon, int q)
{
T a = 1;
for (unsigned int r = 0; r < 3; r++)
a *= get_u0(u0[r], epsilon[r]);
return a;
}
};
template<typename T>
struct ProductTerm1
{
static const int numProducts = 3;
static T product(T *u, T *u0, int *epsilon, int q)
{
T a = 1;
T G[3];
for (unsigned int r = 0; r < 3; r++)
{
G[r] = get_u0(u0[r], epsilon[r]);
}
T F[3] = { G[1]*G[2], G[0]*G[2], G[0]*G[1] };
return F[q] * u[q] * epsilon[q];
}
};
template<typename T>
struct ProductTerm2
{
static const int numProducts = 3;
static inline T product(T *u, T *u0, int *epsilon, int q)
{
T a = 1;
T G[3];
for (unsigned int r = 0; r < 3; r++)
{
G[r] = get_u0(u0[r], epsilon[r]);
}
T F[3] = { epsilon[1]*epsilon[2]*u[1]*u[2], epsilon[0]*epsilon[2]*u[0]*u[2], epsilon[0]*epsilon[1]*u[0]*u[1] };
return F[q] * G[q];
}
};
template<typename T>
struct ProductTerm3
{
static const int numProducts = 1;
static inline T product(T *u, T *u0, int *epsilon, int q)
{
return epsilon[0]*epsilon[1]*epsilon[2]*u[0]*u[1]*u[2];
}
};
template<typename T>
T compute_projection(T *vertex_value, T *u, T *u0, T rho)
{
T ret;
ret = project_tool<T, ProductTerm0<T> >(vertex_value, u, u0) * rho;
ret += project_tool<T, ProductTerm1<T> >(vertex_value, u, u0) * rho * rho / 2;
ret += project_tool<T, ProductTerm2<T> >(vertex_value, u, u0) * rho * rho * rho / 3;
ret += project_tool<T, ProductTerm3<T> >(vertex_value, u, u0) * rho * rho * rho * rho / 4;
return ret;
}
template
double compute_projection(double *vertex_value, double *u, double *u0, double rho);

View file

@ -0,0 +1,29 @@
#include "config.hpp"
#include "loadGadget.hpp"
#include <string>
static inline
CosmoTool::SimuData *loadGadgetMulti_safe(const std::string& fname, int flags, int gadgetFormat)
{
try
{
return CosmoTool::loadGadgetMulti(fname.c_str(), -1, flags, gadgetFormat);
}
catch (const CosmoTool::Exception& e)
{
return 0;
}
}
static inline
CosmoTool::SimuData **alloc_simudata(int n)
{
return new CosmoTool::SimuData *[n];
}
static inline
void del_simudata(CosmoTool::SimuData **s)
{
delete[] s;
}