Correct fitting script for both clumps and halos (#46)

* Minor typos

* fix minor bugs

* pep8

* formatting

* pep8

* Fix minor bugs

* New path & pep8

* add split

* Updates

* Improve calculation within radius

* pep8

* pep8

* get the script working

* Add matter overdensity

* Add m200m to the script

* Fix looping bug

* add parents support

* add import

* Optionally concatenate velocities

* Make optional masking

* Ignore the error message

* Start reading in raw data

* Fix cat reading

* Additional units conversions

* Add clump reading

* Fix indexing

* Remove old comment

* Remove old comment

* set npart to 0 instead of overflow from NaN

* fix docs

* rm boring stuff

* Remove old stuff

* Remove old stuff

* Remove old comment

* Update nb
Richard Stiskalek 2023-04-19 16:39:35 +02:00 committed by GitHub
parent c2fde1566b
commit 39b3498621
17 changed files with 709 additions and 357 deletions

View file

@ -14,3 +14,4 @@
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
from .halo import Clump, Halo # noqa
from .haloprofile import NFWPosterior, NFWProfile # noqa
from .utils import split_jobs # noqa

View file

@ -22,6 +22,7 @@ class BaseStructure(ABC):
"""
Basic structure object for handling operations on its particles.
"""
_particles = None
_info = None
_box = None
@ -39,7 +40,7 @@ class BaseStructure(ABC):
@particles.setter
def particles(self, particles):
pars = ['x', 'y', 'z', 'vx', 'vy', 'vz', 'M']
pars = ["x", "y", "z", "vx", "vy", "vz", "M"]
assert all(p in particles.dtype.names for p in pars)
self._particles = particles
@ -87,20 +88,9 @@ class BaseStructure(ABC):
-------
pos : 2-dimensional array of shape `(n_particles, 3)`.
"""
ps = ('x', 'y', 'z')
ps = ("x", "y", "z")
return numpy.vstack([self[p] - self.info[p] for p in ps]).T
@property
def r(self):
"""
Radial separation of the particles from the centre of the object.
Returns
-------
r : 1-dimensional array of shape `(n_particles, )`.
"""
return numpy.linalg.norm(self.pos, axis=1)
@property
def vel(self):
"""
@ -112,34 +102,61 @@ class BaseStructure(ABC):
"""
return numpy.vstack([self[p] for p in ("vx", "vy", "vz")]).T
@property
def cmass(self):
def r(self):
"""
Cartesian position components of the object's centre of mass. Note that
this is already in a frame centered at the clump's potential minimum,
so its distance from origin indicates the separation of the centre of
mass and potential minimum.
Calculate the radial separation of the particles from the centre of the
object.
Returns
-------
r : 1-dimensional array of shape `(n_particles, )`.
"""
return numpy.linalg.norm(self.pos, axis=1)
def cmass(self, rmax, rmin):
"""
Calculate Cartesian position components of the object's centre of mass.
Note that this is already in a frame centered at the clump's potential
minimum, so its distance from origin indicates the separation of the
centre of mass and potential minimum.
Parameters
----------
rmax : float
Maximum radius for particles to be included in the calculation.
rmin : float
Minimum radius for particles to be included in the calculation.
Returns
-------
cm : 1-dimensional array of shape `(3, )`
"""
return numpy.average(self.pos, axis=0, weights=self['M'])
r = self.r()
mask = (r >= rmin) & (r <= rmax)
return numpy.average(self.pos[mask], axis=0, weights=self["M"][mask])
@property
def angular_momentum(self):
def angular_momentum(self, rmax, rmin=0):
"""
Angular momentum in the box coordinates.
Calculate angular momentum in the box coordinates.
NOTE: here also change velocities to the CM and appropriately edit the
docs.
Parameters
----------
rmax : float
Maximum radius for particles to be included in the calculation.
rmin : float
Minimum radius for particles to be included in the calculation.
Returns
-------
J : 1-dimensional array of shape `(3, )`
"""
J = numpy.cross(self.pos - self.cmass, self.vel)
return numpy.einsum("i,ij->j", self.m, J)
r = self.r()
mask = (r >= rmin) & (r <= rmax)
pos = self.pos[mask] - self.cmass(rmax, rmin)
# Velocities in the object CM frame
vel = self.vel[mask]
vel -= numpy.average(self.vel[mask], axis=0, weights=self["M"][mask])
return numpy.einsum("i,ij->j", self["M"][mask], numpy.cross(pos, vel))
def enclosed_mass(self, rmax, rmin=0):
"""
@ -156,8 +173,8 @@ class BaseStructure(ABC):
-------
enclosed_mass : float
"""
r = self.r
return numpy.sum(self['M'][(r >= rmin) & (r <= rmax)])
r = self.r()
return numpy.sum(self["M"][(r >= rmin) & (r <= rmax)])
def lambda_bullock(self, radius, npart_min=10):
r"""
@ -182,19 +199,21 @@ class BaseStructure(ABC):
Bullock, J. S.; Dekel, A.; Kolatt, T. S.; Kravtsov, A. V.;
Klypin, A. A.; Porciani, C.; Primack, J. R.
"""
mask = self.r <= radius
mask = self.r() <= radius
if numpy.sum(mask) < npart_min:
return numpy.nan
mass = self.enclosed_mass(radius)
V = numpy.sqrt(self.box.box_G * mass / radius)
return (numpy.linalg.norm(self.angular_momentum[mask])
/ (numpy.sqrt(2) * mass * V * radius))
return numpy.linalg.norm(self.angular_momentum(radius)) / (
numpy.sqrt(2) * mass * V * radius
)
def spherical_overdensity_mass(self, delta_mult, npart_min=10):
def spherical_overdensity_mass(self, delta_mult, npart_min=10, kind="crit"):
r"""
Calculate spherical overdensity mass and radius. The mass is defined as
the enclosed mass within an outermost radius where the mean enclosed
spherical density reaches a multiple of the critical density `delta`.
spherical density reaches a multiple of the critical density `delta`
(or of the mean matter density if `kind` is `matter`).
Parameters
----------
@ -203,6 +222,8 @@ class BaseStructure(ABC):
npart_min : int
Minimum number of enclosed particles for a radius to be
considered trustworthy.
kind : str
Either `crit` or `matter`, for critical or matter overdensity, respectively.
Returns
-------
@ -211,17 +232,25 @@ class BaseStructure(ABC):
mx : float
Corresponding spherical enclosed mass.
"""
# Quick check of inputs
assert kind in ["crit", "matter"]
# We first sort the particles in an increasing separation
rs = self.r
rs = self.r()
order = numpy.argsort(rs)
rs = rs[order]
cmass = numpy.cumsum(self['M']) # Cumulative mass
cmass = numpy.cumsum(self["M"][order]) # Cumulative mass
# We calculate the enclosed volume and indices where it is above target
vol = 4 * numpy.pi / 3 * (rs**3 - rs[0] ** 3)
ks = numpy.where([cmass / vol > delta_mult * self.box.rhoc])[0]
target_density = delta_mult * self.box.box_rhoc
if kind == "matter":
target_density *= self.box.cosmo.Om0
with numpy.errstate(divide="ignore"):
ks = numpy.where(cmass / vol > target_density)[0]
if ks.size == 0: # Never above the threshold?
return numpy.nan, numpy.nan
k = numpy.maximum(ks)
k = numpy.max(ks)
if k < npart_min: # Too few particles?
return numpy.nan, numpy.nan
return rs[k], cmass[k]
@ -235,7 +264,7 @@ class BaseStructure(ABC):
-------
key : list of str
"""
return self.data.dtype.names
return self.particles.dtype.names
def __getitem__(self, key):
if key not in self.keys:
@ -259,6 +288,7 @@ class Clump(BaseStructure):
box : :py:class:`csiborgtools.read.BoxUnits`
Box units object.
"""
def __init__(self, particles, info, box):
self.particles = particles
self.info = info
@ -279,6 +309,7 @@ class Halo(BaseStructure):
box : :py:class:`csiborgtools.read.BoxUnits`
Box units object.
"""
def __init__(self, particles, info, box):
self.particles = particles
self.info = info
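For orientation, a minimal standalone sketch of what the new radius-bounded `cmass`, `angular_momentum` and `spherical_overdensity_mass` compute, on synthetic particles rather than a real `Clump` (the random data, `rmax` and `rho_ref` are assumptions for illustration only):

import numpy

rng = numpy.random.default_rng(42)
n = 1000
pos = rng.normal(size=(n, 3))        # positions relative to potential minimum
vel = rng.normal(size=(n, 3))        # velocities in box coordinates
m = rng.uniform(0.5, 1.5, size=n)    # particle masses

r = numpy.linalg.norm(pos, axis=1)
rmin, rmax = 0.0, 2.0
mask = (r >= rmin) & (r <= rmax)

# Centre of mass of the particles within [rmin, rmax]
cm = numpy.average(pos[mask], axis=0, weights=m[mask])

# Angular momentum in the CM frame, mirroring `angular_momentum`
dpos = pos[mask] - cm
dvel = vel[mask] - numpy.average(vel[mask], axis=0, weights=m[mask])
J = numpy.einsum("i,ij->j", m[mask], numpy.cross(dpos, dvel))

# Spherical overdensity: outermost radius where the mean enclosed density
# exceeds `delta_mult` times a reference density (arbitrary units here)
delta_mult, rho_ref = 200, 0.1
order = numpy.argsort(r)
rs, cmass_enc = r[order], numpy.cumsum(m[order])
vol = 4 * numpy.pi / 3 * (rs**3 - rs[0] ** 3)
with numpy.errstate(divide="ignore"):
    ks = numpy.where(cmass_enc / vol > delta_mult * rho_ref)[0]
rx, mx = (rs[ks.max()], cmass_enc[ks.max()]) if ks.size > 0 else (numpy.nan,) * 2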

View file

@ -187,9 +187,6 @@ class NFWPosterior(NFWProfile):
Clump object containing the particles and clump information.
"""
def __init__(self):
super().__init__()
@property
def clump(self):
"""
@ -228,7 +225,7 @@ class NFWPosterior(NFWProfile):
-------
rho0: float
"""
return mass / self.bounded_enclosed_mass(rmin, rmax, Rs, 1)
return mass / self.bounded_mass(rmin, rmax, Rs, 1)
def initlogRs(self, r, rmin, rmax, binsguess=10):
r"""
@ -255,7 +252,6 @@ class NFWPosterior(NFWProfile):
counts, edges = numpy.histogram(r, bins)
return numpy.log10(edges[numpy.argmax(counts)])
def logprior(self, logRs, rmin, rmax):
r"""
Logarithmic uniform prior on :math:`\log R_{\rm s}`. Unnormalised but
@ -276,7 +272,7 @@ class NFWPosterior(NFWProfile):
"""
if not rmin < 10**logRs < rmax:
return -numpy.infty
return 0.
return 0.0
def loglikelihood(self, logRs, r, rmin, rmax, npart):
"""
@ -303,7 +299,6 @@ class NFWPosterior(NFWProfile):
mnfw = self.bounded_mass(rmin, rmax, Rs, 1)
return numpy.sum(self._logprofile(r, Rs, 1)) - npart * numpy.log(mnfw)
def __call__(self, logRs, r, rmin, rmax, npart):
"""
Logarithmic posterior. Sum of the logarithmic prior and likelihood.
@ -353,18 +348,20 @@ class NFWPosterior(NFWProfile):
Best fit NFW central density.
"""
assert isinstance(clump, Clump)
r = clump.r
r = clump.r()
rmin = numpy.min(r)
rmax, mtot = clump.spherical_overdensity_mass(200)
npart = numpy.sum((rmin <= r) & (r <= rmax))
mask = (rmin <= r) & (r <= rmax)
npart = numpy.sum(mask)
r = r[mask]
# Loss function to optimize
def loss(logRs):
return -self(logRs, r, rmin, rmax, npart)
res = minimize_scalar(
loss, bounds=(numpy.log10(rmin), numpy.log10(rmax)),
method='bounded')
loss, bounds=(numpy.log10(rmin), numpy.log10(rmax)), method="bounded"
)
if numpy.log10(rmax) - res.x < eps:
res.success = False
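A minimal sketch of the bounded 1-D optimisation pattern used in `fit` above, with a hypothetical smooth stand-in for the negative log-posterior (the real one combines `logprior` and `loglikelihood`):

import numpy
from scipy.optimize import minimize_scalar

def neg_logpost(logRs):
    # Hypothetical stand-in for -(logprior + loglikelihood).
    return (logRs - 0.5) ** 2

rmin, rmax, eps = 0.1, 10.0, 1e-4
res = minimize_scalar(
    neg_logpost, bounds=(numpy.log10(rmin), numpy.log10(rmax)), method="bounded"
)
# Flag fits that pile up against the outer boundary, as the diff does.
if numpy.log10(rmax) - res.x < eps:
    res.success = False
print(res.x, res.success)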

View file

@ -0,0 +1,43 @@
# Copyright (C) 2022 Richard Stiskalek, Deaglan Bartlett
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""Fitting utility functions."""
import numpy
def split_jobs(njobs, ncpu):
"""
Split `njobs` amongst `ncpu`.
Parameters
----------
njobs : int
Number of jobs.
ncpu : int
Number of CPUs.
Returns
-------
jobs : list of lists of integers
Outer list of each CPU and inner lists for CPU's jobs.
"""
njobs_per_cpu, njobs_remainder = divmod(njobs, ncpu)
jobs = numpy.arange(njobs_per_cpu * ncpu).reshape((njobs_per_cpu, ncpu)).T
jobs = jobs.tolist()
for i in range(njobs_remainder):
jobs[i].append(njobs_per_cpu * ncpu + i)
return jobs
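For reference, how the new `split_jobs` distributes a remainder (importable per the `fits/__init__.py` change above); the output below was worked out by hand from the code:

from csiborgtools.fits import split_jobs

print(split_jobs(10, 4))  # [[0, 4, 8], [1, 5, 9], [2, 6], [3, 7]]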

View file

@ -12,7 +12,7 @@
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
from csiborgtools.match.match import ( # noqa
from .match import ( # noqa
ParticleOverlap,
RealisationsMatcher,
calculate_overlap,
@ -24,4 +24,5 @@ from csiborgtools.match.match import ( # noqa
fill_delta_indxs,
get_clumplims,
)
from csiborgtools.match.num_density import binned_counts, number_density # noqa
from .num_density import binned_counts, number_density # noqa
from .utils import concatenate_clumps # noqa

View file

@ -16,7 +16,7 @@
import numpy
def concatenate_clumps(clumps):
def concatenate_clumps(clumps, include_velocities=False):
"""
Concatenate an array of clumps to a single array containing all particles.
@ -24,6 +24,8 @@ def concatenate_clumps(clumps):
----------
clumps : list of structured arrays
List of clumps. Each clump must be a structured array with keys
include_velocities : bool, optional
Whether to include velocities in the output array.
Returns
-------
@ -34,21 +36,31 @@ def concatenate_clumps(clumps):
for clump, __ in clumps:
N += clump.size
# Infer dtype of positions
if clumps[0][0]['x'].dtype.char in numpy.typecodes["AllInteger"]:
if clumps[0][0]["x"].dtype.char in numpy.typecodes["AllInteger"]:
posdtype = numpy.int32
else:
posdtype = numpy.float32
# Pre-allocate array
dtype = {"names": ['x', 'y', 'z', 'M'],
"formats": [posdtype] * 3 + [numpy.float32]}
# We pre-allocate an empty array. By default we include just particle positions,
# which may be specified by cell IDs if integers, and masses. Optionally,
# velocities are included too.
if include_velocities:
dtype = {
"names": ["x", "y", "z", "vx", "vy", "vz", "M"],
"formats": [posdtype] * 3 + [numpy.float32] * 4,
}
else:
dtype = {
"names": ["x", "y", "z", "M"],
"formats": [posdtype] * 3 + [numpy.float32],
}
particles = numpy.full(N, numpy.nan, dtype)
# Fill it one clump by another
start = 0
for clump, __ in clumps:
end = start + clump.size
for p in ('x', 'y', 'z', 'M'):
for p in dtype["names"]:
particles[p][start:end] = clump[p]
start = end
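The extended `concatenate_clumps` can be exercised on toy data; a sketch, assuming the package imports as set up in this diff (`csiborgtools.match`):

import numpy
from csiborgtools.match import concatenate_clumps

dtype = {"names": ["x", "y", "z", "vx", "vy", "vz", "M"],
         "formats": [numpy.float32] * 7}
clump_a = numpy.zeros(3, dtype=dtype)
clump_b = numpy.ones(2, dtype=dtype)
# Each entry pairs a particle array with its clump ID (unused here).
clumps = [(clump_a, 0), (clump_b, 1)]
parts = concatenate_clumps(clumps, include_velocities=True)
print(parts.size, parts.dtype.names)  # 5 ('x', 'y', 'z', 'vx', 'vy', 'vz', 'M')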

View file

@ -12,6 +12,7 @@
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
from .box_units import BoxUnits # noqa
from .halo_cat import ClumpsCatalogue, HaloCatalogue # noqa
from .knn_summary import kNNCDFReader # noqa
from .obs import ( # noqa
@ -26,4 +27,4 @@ from .paths import CSiBORGPaths # noqa
from .pk_summary import PKReader # noqa
from .readsim import MmainReader, ParticleReader, halfwidth_select, read_initcm # noqa
from .tpcf_summary import TPCFReader # noqa
from .utils import cartesian_to_radec, radec_to_cartesian
from .utils import cartesian_to_radec, cols_to_structured, radec_to_cartesian # noqa

View file

@ -24,10 +24,26 @@ from .readsim import ParticleReader
# Map of unit conversions
CONV_NAME = {
"length": ['x', 'y', 'z', "peak_x", "peak_y", "peak_z", "Rs", "rmin",
"rmax", "r200", "r500", "x0", "y0", "z0", "lagpatch"],
"mass": ["mass_cl", "totpartmass", "m200", "m500", "mass_mmain", 'M'],
"density": ["rho0"]
"length": [
"x",
"y",
"z",
"peak_x",
"peak_y",
"peak_z",
"Rs",
"rmin",
"rmax",
"r200c",
"r500c",
"r200m",
"x0",
"y0",
"z0",
"lagpatch",
],
"mass": ["mass_cl", "totpartmass", "m200c", "m500c", "mass_mmain", "M", "m200m"],
"density": ["rho0"],
}
@ -53,15 +69,29 @@ class BoxUnits:
"""
partreader = ParticleReader(paths)
info = partreader.read_info(nsnap, nsim)
pars = ["boxlen", "time", "aexp", "H0",
"omega_m", "omega_l", "omega_k", "omega_b",
"unit_l", "unit_d", "unit_t"]
pars = [
"boxlen",
"time",
"aexp",
"H0",
"omega_m",
"omega_l",
"omega_k",
"omega_b",
"unit_l",
"unit_d",
"unit_t",
]
for par in pars:
setattr(self, "_" + par, float(info[par]))
self._cosmo = LambdaCDM(H0=self._H0, Om0=self._omega_m,
Ode0=self._omega_l, Tcmb0=2.725 * units.K,
Ob0=self._omega_b)
self._cosmo = LambdaCDM(
H0=self._H0,
Om0=self._omega_m,
Ode0=self._omega_l,
Tcmb0=2.725 * units.K,
Ob0=self._omega_b,
)
self._Msuncgs = constants.M_sun.cgs.value # Solar mass in grams
@property
@ -227,7 +257,7 @@ class BoxUnits:
cosmo_redshift : float
Cosmological redshift.
"""
x = numpy.linspace(0., 1., 5001)
x = numpy.linspace(0.0, 1.0, 5001)
y = self.cosmo.comoving_distance(x)
return interp1d(y, x)(self.box2mpc(dist))
@ -337,8 +367,7 @@ class BoxUnits:
density : float
Density in :math:`M_\odot / \mathrm{Mpc}^3`.
"""
return (density * self._unit_d / self._Msuncgs
* (units.Mpc.to(units.cm))**3)
return density * self._unit_d / self._Msuncgs * (units.Mpc.to(units.cm)) ** 3
def dens2box(self, density):
r"""
@ -355,8 +384,7 @@ class BoxUnits:
density : float
Density in box units.
"""
return (density / self._unit_d * self._Msuncgs
/ (units.Mpc.to(units.cm))**3)
return density / self._unit_d * self._Msuncgs / (units.Mpc.to(units.cm)) ** 3
def convert_from_boxunits(self, data, names):
r"""
@ -389,7 +417,7 @@ class BoxUnits:
transforms = {
"length": self.box2mpc,
"mass": self.box2solarmass,
"density": self.box2dens
"density": self.box2dens,
}
for name in names:
@ -407,7 +435,8 @@ class BoxUnits:
# If nothing found
if not found:
raise NotImplementedError(
"Conversion of `{}` is not defined.".format(name))
"Conversion of `{}` is not defined.".format(name)
)
# Center at the observer
if name in ["peak_x", "peak_y", "peak_z", "x0", "y0", "z0"]:

View file

@ -21,13 +21,14 @@ from sklearn.neighbors import NearestNeighbors
from .box_units import BoxUnits
from .paths import CSiBORGPaths
from .readsim import ParticleReader
from .utils import cartesian_to_radec, flip_cols, radec_to_cartesian
from .utils import add_columns, cartesian_to_radec, flip_cols, radec_to_cartesian
class BaseCatalogue(ABC):
"""
Base (sub)halo catalogue.
"""
_data = None
_paths = None
_nsim = None
@ -132,9 +133,9 @@ class BaseCatalogue(ABC):
pos : 2-dimensional array of shape `(nobjects, 3)`
"""
if in_initial:
ps = ['x0', 'y0', 'z0']
ps = ["x0", "y0", "z0"]
else:
ps = ['x', 'y', 'z']
ps = ["x", "y", "z"]
pos = numpy.vstack([self[p] for p in ps]).T
if not cartesian:
pos = cartesian_to_radec(pos)
@ -248,8 +249,7 @@ class BaseCatalogue(ABC):
knn.fit(pos)
# Convert angular radius to cosine difference.
metric_maxdist = 1 - numpy.cos(numpy.deg2rad(ang_radius))
dist, ind = knn.radius_neighbors(X, radius=metric_maxdist,
sort_results=True)
dist, ind = knn.radius_neighbors(X, radius=metric_maxdist, sort_results=True)
# And the cosine difference to angular distance.
for i in range(X.shape[0]):
dist[i] = numpy.rad2deg(numpy.arccos(1 - dist[i]))
@ -282,13 +282,9 @@ class ClumpsCatalogue(BaseCatalogue):
r"""
Clumps catalogue, defined in the final snapshot.
TODO:
Add fitted quantities.
Add threshold on number of particles
Parameters
----------
nsim : int
sim : int
IC realisation index.
paths : py:class`csiborgtools.read.CSiBORGPaths`
CSiBORG paths object.
@ -299,28 +295,68 @@ class ClumpsCatalogue(BaseCatalogue):
minmass : len-2 tuple, optional
Minimum mass. The first element is the catalogue key and the second is
the value.
load_fitted : bool, optional
Whether to load fitted quantities.
rawdata : bool, optional
Whether to return the raw data. In this case applies no cuts and
transformations.
"""
def __init__(self, nsim, paths, maxdist=155.5 / 0.705,
minmass=("mass_cl", 1e12)):
def __init__(
self,
nsim,
paths,
maxdist=155.5 / 0.705,
minmass=("mass_cl", 1e12),
load_fitted=True,
rawdata=False,
):
self.nsim = nsim
self.paths = paths
# Read in the clumps from the final snapshot
partreader = ParticleReader(self.paths)
cols = ["index", "parent", 'x', 'y', 'z', "mass_cl"]
data = partreader.read_clumps(self.nsnap, self.nsim, cols=cols)
cols = ["index", "parent", "x", "y", "z", "mass_cl"]
self._data = partreader.read_clumps(self.nsnap, self.nsim, cols=cols)
# Overwrite the parent with the ultimate parent
mmain = numpy.load(self.paths.mmain_path(self.nsnap, self.nsim))
data["parent"] = mmain["ultimate_parent"]
self._data["parent"] = mmain["ultimate_parent"]
# Flip positions and convert from code units to cMpc. Convert M too
flip_cols(data, "x", "z")
for p in ('x', 'y', 'z'):
data[p] -= 0.5
data = self.box.convert_from_boxunits(data, ['x', 'y', 'z', "mass_cl"])
if load_fitted:
fits = numpy.load(paths.structfit_path(self.nsnap, nsim, "clumps"))
cols = [col for col in fits.dtype.names if col != "index"]
X = [fits[col] for col in cols]
self._data = add_columns(self._data, X, cols)
mask = numpy.sqrt(data['x']**2 + data['y']**2 + data['z']**2) < maxdist
mask &= data[minmass[0]] > minmass[1]
self._data = data[mask]
# If the raw data is not required, then start applying transformations
# and cuts.
if not rawdata:
flip_cols(self._data, "x", "z")
for p in ("x", "y", "z"):
self._data[p] -= 0.5
self._data = self.box.convert_from_boxunits(
self._data,
[
"x",
"y",
"z",
"mass_cl",
"totpartmass",
"rho0",
"r200c",
"r500c",
"m200c",
"m500c",
"r200m",
"m200m",
],
)
if maxdist is not None:
dist = numpy.sqrt(
self._data["x"] ** 2 + self._data["y"] ** 2 + self._data["z"] ** 2
)
self._data = self._data[dist < maxdist]
if minmass is not None:
self._data = self._data[self._data[minmass[0]] > minmass[1]]
@property
def ismain(self):
@ -341,7 +377,6 @@ class HaloCatalogue(BaseCatalogue):
TODO:
Add the fitted quantities
Add threshold on number of particles
Parameters
----------
@ -356,20 +391,32 @@ class HaloCatalogue(BaseCatalogue):
minmass : len-2 tuple
Minimum mass. The first element is the catalogue key and the second is
the value.
rawdata : bool, optional
Whether to return the raw data. In this case applies no cuts and
transformations.
"""
def __init__(self, nsim, paths, maxdist=155.5 / 0.705,
minmass=('M', 1e12)):
def __init__(
self, nsim, paths, maxdist=155.5 / 0.705, minmass=("M", 1e12), rawdata=False
):
self.nsim = nsim
self.paths = paths
# Read in the mmain catalogue of summed substructure
mmain = numpy.load(self.paths.mmain_path(self.nsnap, self.nsim))
data = mmain["mmain"]
self._data = mmain["mmain"]
if not rawdata:
# Flip positions and convert from code units to cMpc. Convert M too
flip_cols(data, "x", "z")
for p in ('x', 'y', 'z'):
data[p] -= 0.5
data = self.box.convert_from_boxunits(data, ['x', 'y', 'z', 'M'])
flip_cols(self._data, "x", "z")
for p in ("x", "y", "z"):
self._data[p] -= 0.5
self._data = self.box.convert_from_boxunits(
self._data, ["x", "y", "z", "M"]
)
mask = numpy.sqrt(data['x']**2 + data['y']**2 + data['z']**2) < maxdist
mask &= data[minmass[0]] > minmass[1]
self._data = data[mask]
if maxdist is not None:
dist = numpy.sqrt(
self._data["x"] ** 2 + self._data["y"] ** 2 + self._data["z"] ** 2
)
self._data = self._data[dist < maxdist]
if minmass is not None:
self._data = self._data[self._data[minmass[0]] > minmass[1]]
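The `rawdata=False` cuts now applied by both catalogues reduce to plain structured-array masking; a toy sketch (values are made up):

import numpy

dtype = {"names": ["x", "y", "z", "M"], "formats": [numpy.float32] * 4}
data = numpy.zeros(4, dtype=dtype)
data["x"] = [10.0, 200.0, 50.0, 90.0]
data["M"] = [1e13, 1e13, 1e11, 5e12]

maxdist, minmass = 155.5 / 0.705, ("M", 1e12)
dist = numpy.sqrt(data["x"]**2 + data["y"]**2 + data["z"]**2)
data = data[dist < maxdist]
data = data[data[minmass[0]] > minmass[1]]
print(data["x"])  # [10. 90.]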

View file

@ -32,6 +32,7 @@ class CSiBORGPaths:
postdir: str
Path to the folder where post-processed files are stored.
"""
_srcdir = None
_postdir = None
@ -96,8 +97,7 @@ class CSiBORGPaths:
fpath = join(self.postdir, "temp")
if not isdir(fpath):
mkdir(fpath)
warn("Created directory `{}`.".format(fpath), UserWarning,
stacklevel=1)
warn("Created directory `{}`.".format(fpath), UserWarning, stacklevel=1)
return fpath
def mmain_path(self, nsnap, nsim):
@ -118,11 +118,9 @@ class CSiBORGPaths:
fdir = join(self.postdir, "mmain")
if not isdir(fdir):
mkdir(fdir)
warn("Created directory `{}`.".format(fdir), UserWarning,
stacklevel=1)
warn("Created directory `{}`.".format(fdir), UserWarning, stacklevel=1)
return join(
fdir,
"mmain_{}_{}.npz".format(str(nsim).zfill(5), str(nsnap).zfill(5))
fdir, "mmain_{}_{}.npz".format(str(nsim).zfill(5), str(nsnap).zfill(5))
)
def initmatch_path(self, nsim, kind):
@ -145,8 +143,7 @@ class CSiBORGPaths:
fdir = join(self.postdir, "initmatch")
if not isdir(fdir):
mkdir(fdir)
warn("Created directory `{}`.".format(fdir), UserWarning,
stacklevel=1)
warn("Created directory `{}`.".format(fdir), UserWarning, stacklevel=1)
return join(fdir, "{}_{}.npy".format(kind, str(nsim).zfill(5)))
def split_path(self, nsnap, nsim):
@ -167,10 +164,10 @@ class CSiBORGPaths:
fdir = join(self.postdir, "split")
if not isdir(fdir):
mkdir(fdir)
warn("Created directory `{}`.".format(fdir), UserWarning,
stacklevel=1)
return join(fdir, "clumps_{}_{}.npz"
.format(str(nsim).zfill(5), str(nsnap).zfill(5)))
warn("Created directory `{}`.".format(fdir), UserWarning, stacklevel=1)
return join(
fdir, "clumps_{}_{}.npz".format(str(nsim).zfill(5), str(nsnap).zfill(5))
)
def get_ics(self, tonew):
"""
@ -239,7 +236,7 @@ class CSiBORGPaths:
# Get all files in simpath that start with output_
snaps = glob(join(simpath, "output_*"))
# Take just the last _00XXXX from each file and strip zeros
snaps = [int(snap.split('_')[-1].lstrip('0')) for snap in snaps]
snaps = [int(snap.split("_")[-1].lstrip("0")) for snap in snaps]
return numpy.sort(snaps)
def snapshot_path(self, nsnap, nsim):
@ -261,22 +258,31 @@ class CSiBORGPaths:
simpath = self.ic_path(nsim, tonew=tonew)
return join(simpath, "output_{}".format(str(nsnap).zfill(5)))
def hcat_path(self, nsim):
def structfit_path(self, nsnap, nsim, kind):
"""
Path to the final snapshot halo catalogue from `fit_halos.py`.
Path to the clump or halo catalogue from `fit_halos.py`.
Parameters
----------
nsnap : int
Snapshot index.
nsim : int
IC realisation index.
kind : str
Type of catalogue. Can be either `clumps` or `halos`.
Returns
-------
path : str
"""
nsnap = str(max(self.get_snapshots(nsim))).zfill(5)
fname = "ramses_out_{}_{}.npy".format(str(self.nsim).zfill(5), nsnap)
return join(self.postdir, fname)
assert kind in ["clumps", "halos"]
fdir = join(self.postdir, "structfit")
if not isdir(fdir):
mkdir(fdir)
warn("Created directory `{}`.".format(fdir), UserWarning, stacklevel=1)
fname = "{}_out_{}_{}.npy".format(kind, str(nsim).zfill(5), str(nsnap).zfill(5))
return join(fdir, fname)
def knnauto_path(self, run, nsim=None):
"""
@ -297,8 +303,7 @@ class CSiBORGPaths:
fdir = join(self.postdir, "knn", "auto")
if not isdir(fdir):
makedirs(fdir)
warn("Created directory `{}`.".format(fdir), UserWarning,
stacklevel=1)
warn("Created directory `{}`.".format(fdir), UserWarning, stacklevel=1)
if nsim is not None:
return join(fdir, "knncdf_{}_{}.p".format(str(nsim).zfill(5), run))
@ -325,8 +330,7 @@ class CSiBORGPaths:
fdir = join(self.postdir, "knn", "cross")
if not isdir(fdir):
makedirs(fdir)
warn("Created directory `{}`.".format(fdir), UserWarning,
stacklevel=1)
warn("Created directory `{}`.".format(fdir), UserWarning, stacklevel=1)
if nsims is not None:
assert isinstance(nsims, (list, tuple)) and len(nsims) == 2
nsim0 = str(nsims[0]).zfill(5)
@ -356,8 +360,7 @@ class CSiBORGPaths:
fdir = join(self.postdir, "tpcf", "auto")
if not isdir(fdir):
makedirs(fdir)
warn("Created directory `{}`.".format(fdir), UserWarning,
stacklevel=1)
warn("Created directory `{}`.".format(fdir), UserWarning, stacklevel=1)
if nsim is not None:
return join(fdir, "tpcf{}_{}.p".format(str(nsim).zfill(5), run))

View file

@ -101,8 +101,7 @@ def cols_to_structured(N, cols):
if not (isinstance(cols, list) and all(isinstance(c, tuple) for c in cols)):
raise TypeError("`cols` must be a list of tuples.")
dtype = {"names": [col[0] for col in cols],
"formats": [col[1] for col in cols]}
dtype = {"names": [col[0] for col in cols], "formats": [col[1] for col in cols]}
return numpy.full(N, numpy.nan, dtype=dtype)
@ -237,8 +236,9 @@ def array_to_structured(arr, cols):
"""
cols = [cols] if isinstance(cols, str) else cols
if arr.ndim != 2 or arr.shape[1] != len(cols):
raise TypeError("`arr` must be a 2-dimensional array of "
"shape `(n_samples, n_cols)`.")
raise TypeError(
"`arr` must be a 2-dimensional array of " "shape `(n_samples, n_cols)`."
)
dtype = {"names": cols, "formats": [arr.dtype] * len(cols)}
out = numpy.full(arr.shape[0], numpy.nan, dtype=dtype)
@ -299,5 +299,7 @@ def extract_from_structured(arr, cols):
out[:, i] = arr[col]
# Optionally flatten
if len(cols) == 1:
return out.reshape(-1,)
return out.reshape(
-1,
)
return out
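A quick illustration of `cols_to_structured` (exported via `read/__init__.py` above) and of the NaN-to-integer pitfall the fitting script below guards against: the array is NaN-initialised, so unset integer fields come out as -2147483648.

import numpy
from csiborgtools.read import cols_to_structured

out = cols_to_structured(3, [("index", numpy.int32), ("m200c", numpy.float32)])
print(out["index"][0])  # -2147483648, i.e. NaN coerced to int32
print(out["m200c"][0])  # nan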

notebooks/fitting.ipynb — new file, 140 additions

File diff suppressed because one or more lines are too long

View file

@ -29,6 +29,7 @@ try:
import csiborgtools
except ModuleNotFoundError:
import sys
sys.path.append("../")
import csiborgtools
@ -43,23 +44,13 @@ nproc = comm.Get_size()
parser = ArgumentParser()
parser.add_argument("--runs", type=str, nargs="+")
args = parser.parse_args()
with open('../scripts/knn_auto.yml', 'r') as file:
with open("../scripts/knn_auto.yml", "r") as file:
config = yaml.safe_load(file)
Rmax = 155 / 0.705 # Mpc (h = 0.705) high resolution region radius
totvol = 4 * numpy.pi * Rmax**3 / 3
minmass = 1e12
ics = [7444, 7468, 7492, 7516, 7540, 7564, 7588, 7612, 7636, 7660, 7684,
7708, 7732, 7756, 7780, 7804, 7828, 7852, 7876, 7900, 7924, 7948,
7972, 7996, 8020, 8044, 8068, 8092, 8116, 8140, 8164, 8188, 8212,
8236, 8260, 8284, 8308, 8332, 8356, 8380, 8404, 8428, 8452, 8476,
8500, 8524, 8548, 8572, 8596, 8620, 8644, 8668, 8692, 8716, 8740,
8764, 8788, 8812, 8836, 8860, 8884, 8908, 8932, 8956, 8980, 9004,
9028, 9052, 9076, 9100, 9124, 9148, 9172, 9196, 9220, 9244, 9268,
9292, 9316, 9340, 9364, 9388, 9412, 9436, 9460, 9484, 9508, 9532,
9556, 9580, 9604, 9628, 9652, 9676, 9700, 9724, 9748, 9772, 9796,
9820, 9844]
paths = csiborgtools.read.CSiBORGPaths(**csiborgtools.paths_glamdring)
ics = paths.get_ics(False)
knncdf = csiborgtools.clustering.kNN_CDF()
###############################################################################
@ -75,9 +66,9 @@ def read_single(selection, cat):
psel = selection["primary"]
pmin, pmax = psel.get("min", None), psel.get("max", None)
if pmin is not None:
mmask &= (cat[psel["name"]] >= pmin)
mmask &= cat[psel["name"]] >= pmin
if pmax is not None:
mmask &= (cat[psel["name"]] < pmax)
mmask &= cat[psel["name"]] < pmax
pos = pos[mmask, ...]
# Secondary selection
@ -92,12 +83,13 @@ def read_single(selection, cat):
if ssel.get("marked", True):
x = cat[psel["name"]][mmask]
prop = csiborgtools.clustering.normalised_marks(
x, prop, nbins=config["nbins_marks"])
x, prop, nbins=config["nbins_marks"]
)
if smin is not None:
smask &= (prop >= smin)
smask &= prop >= smin
if smax is not None:
smask &= (prop < smax)
smask &= prop < smax
return pos[smask, ...]
@ -106,8 +98,7 @@ def do_auto(run, cat, ic):
"""Calculate the kNN-CDF single catalgoue autocorrelation."""
_config = config.get(run, None)
if _config is None:
warn("No configuration for run {}.".format(run), UserWarning,
stacklevel=1)
warn("No configuration for run {}.".format(run), UserWarning, stacklevel=1)
return
rvs_gen = csiborgtools.clustering.RVSinsphere(Rmax)
@ -115,21 +106,28 @@ def do_auto(run, cat, ic):
knn = NearestNeighbors()
knn.fit(pos)
rs, cdf = knncdf(
knn, rvs_gen=rvs_gen, nneighbours=config["nneighbours"],
rmin=config["rmin"], rmax=config["rmax"],
nsamples=int(config["nsamples"]), neval=int(config["neval"]),
batch_size=int(config["batch_size"]), random_state=config["seed"])
knn,
rvs_gen=rvs_gen,
nneighbours=config["nneighbours"],
rmin=config["rmin"],
rmax=config["rmax"],
nsamples=int(config["nsamples"]),
neval=int(config["neval"]),
batch_size=int(config["batch_size"]),
random_state=config["seed"],
)
joblib.dump({"rs": rs, "cdf": cdf, "ndensity": pos.shape[0] / totvol},
paths.knnauto_path(run, ic))
joblib.dump(
{"rs": rs, "cdf": cdf, "ndensity": pos.shape[0] / totvol},
paths.knnauto_path(run, ic),
)
def do_cross_rand(run, cat, ic):
"""Calculate the kNN-CDF cross catalogue random correlation."""
_config = config.get(run, None)
if _config is None:
warn("No configuration for run {}.".format(run), UserWarning,
stacklevel=1)
warn("No configuration for run {}.".format(run), UserWarning, stacklevel=1)
return
rvs_gen = csiborgtools.clustering.RVSinsphere(Rmax)
@ -142,10 +140,17 @@ def do_cross_rand(run, cat, ic):
knn2.fit(pos2)
rs, cdf0, cdf1, joint_cdf = knncdf.joint(
knn1, knn2, rvs_gen=rvs_gen, nneighbours=int(config["nneighbours"]),
rmin=config["rmin"], rmax=config["rmax"],
nsamples=int(config["nsamples"]), neval=int(config["neval"]),
batch_size=int(config["batch_size"]), random_state=config["seed"])
knn1,
knn2,
rvs_gen=rvs_gen,
nneighbours=int(config["nneighbours"]),
rmin=config["rmin"],
rmax=config["rmax"],
nsamples=int(config["nsamples"]),
neval=int(config["neval"]),
batch_size=int(config["batch_size"]),
random_state=config["seed"],
)
corr = knncdf.joint_to_corr(cdf0, cdf1, joint_cdf)
joblib.dump({"rs": rs, "corr": corr}, paths.knnauto_path(run, ic))

View file

@ -16,7 +16,6 @@
from argparse import ArgumentParser
from datetime import datetime
from itertools import combinations
from os.path import join
from warnings import warn
import joblib
@ -30,6 +29,7 @@ try:
import csiborgtools
except ModuleNotFoundError:
import sys
sys.path.append("../")
import csiborgtools
@ -44,24 +44,12 @@ nproc = comm.Get_size()
parser = ArgumentParser()
parser.add_argument("--runs", type=str, nargs="+")
args = parser.parse_args()
with open('../scripts/knn_cross.yml', 'r') as file:
with open("../scripts/knn_cross.yml", "r") as file:
config = yaml.safe_load(file)
Rmax = 155 / 0.705 # Mpc (h = 0.705) high resolution region radius
minmass = 1e12
ics = [7444, 7468, 7492, 7516, 7540, 7564, 7588, 7612, 7636, 7660, 7684,
7708, 7732, 7756, 7780, 7804, 7828, 7852, 7876, 7900, 7924, 7948,
7972, 7996, 8020, 8044, 8068, 8092, 8116, 8140, 8164, 8188, 8212,
8236, 8260, 8284, 8308, 8332, 8356, 8380, 8404, 8428, 8452, 8476,
8500, 8524, 8548, 8572, 8596, 8620, 8644, 8668, 8692, 8716, 8740,
8764, 8788, 8812, 8836, 8860, 8884, 8908, 8932, 8956, 8980, 9004,
9028, 9052, 9076, 9100, 9124, 9148, 9172, 9196, 9220, 9244, 9268,
9292, 9316, 9340, 9364, 9388, 9412, 9436, 9460, 9484, 9508, 9532,
9556, 9580, 9604, 9628, 9652, 9676, 9700, 9724, 9748, 9772, 9796,
9820, 9844]
paths = csiborgtools.read.CSiBORGPaths(**csiborgtools.paths_glamdring)
dumpdir = "/mnt/extraspace/rstiskalek/csiborg/knn"
fout = join(dumpdir, "cross", "knncdf_{}_{}_{}.p")
ics = paths.get_ics(False)
knncdf = csiborgtools.clustering.kNN_CDF()
###############################################################################
@ -76,9 +64,9 @@ def read_single(selection, cat):
psel = selection["primary"]
pmin, pmax = psel.get("min", None), psel.get("max", None)
if pmin is not None:
mmask &= (cat[psel["name"]] >= pmin)
mmask &= cat[psel["name"]] >= pmin
if pmax is not None:
mmask &= (cat[psel["name"]] < pmax)
mmask &= cat[psel["name"]] < pmax
return pos[mmask, ...]
@ -99,10 +87,17 @@ def do_cross(run, ics):
knn2.fit(pos2)
rs, cdf0, cdf1, joint_cdf = knncdf.joint(
knn1, knn2, rvs_gen=rvs_gen, nneighbours=int(config["nneighbours"]),
rmin=config["rmin"], rmax=config["rmax"],
nsamples=int(config["nsamples"]), neval=int(config["neval"]),
batch_size=int(config["batch_size"]), random_state=config["seed"])
knn1,
knn2,
rvs_gen=rvs_gen,
nneighbours=int(config["nneighbours"]),
rmin=config["rmin"],
rmax=config["rmax"],
nsamples=int(config["nsamples"]),
neval=int(config["neval"]),
batch_size=int(config["batch_size"]),
random_state=config["seed"],
)
corr = knncdf.joint_to_corr(cdf0, cdf1, joint_cdf)
joblib.dump({"rs": rs, "corr": corr}, paths.knncross_path(run, ics))

View file

@ -16,7 +16,6 @@
from argparse import ArgumentParser
from copy import deepcopy
from datetime import datetime
from os.path import join
from warnings import warn
import joblib
@ -29,6 +28,7 @@ try:
import csiborgtools
except ModuleNotFoundError:
import sys
sys.path.append("../")
import csiborgtools
@ -43,24 +43,12 @@ nproc = comm.Get_size()
parser = ArgumentParser()
parser.add_argument("--runs", type=str, nargs="+")
args = parser.parse_args()
with open('../scripts/tpcf_auto.yml', 'r') as file:
with open("../scripts/tpcf_auto.yml", "r") as file:
config = yaml.safe_load(file)
Rmax = 155 / 0.705 # Mpc (h = 0.705) high resolution region radius
minmass = 1e12
ics = [7444, 7468, 7492, 7516, 7540, 7564, 7588, 7612, 7636, 7660, 7684,
7708, 7732, 7756, 7780, 7804, 7828, 7852, 7876, 7900, 7924, 7948,
7972, 7996, 8020, 8044, 8068, 8092, 8116, 8140, 8164, 8188, 8212,
8236, 8260, 8284, 8308, 8332, 8356, 8380, 8404, 8428, 8452, 8476,
8500, 8524, 8548, 8572, 8596, 8620, 8644, 8668, 8692, 8716, 8740,
8764, 8788, 8812, 8836, 8860, 8884, 8908, 8932, 8956, 8980, 9004,
9028, 9052, 9076, 9100, 9124, 9148, 9172, 9196, 9220, 9244, 9268,
9292, 9316, 9340, 9364, 9388, 9412, 9436, 9460, 9484, 9508, 9532,
9556, 9580, 9604, 9628, 9652, 9676, 9700, 9724, 9748, 9772, 9796,
9820, 9844]
dumpdir = "/mnt/extraspace/rstiskalek/csiborg/tpcf"
fout = join(dumpdir, "auto", "tpcf_{}_{}.p")
paths = csiborgtools.read.CSiBORGPaths()
ics = paths.get_ics(False)
tpcf = csiborgtools.clustering.Mock2PCF()
###############################################################################
@ -76,9 +64,9 @@ def read_single(selection, cat):
psel = selection["primary"]
pmin, pmax = psel.get("min", None), psel.get("max", None)
if pmin is not None:
mmask &= (cat[psel["name"]] >= pmin)
mmask &= cat[psel["name"]] >= pmin
if pmax is not None:
mmask &= (cat[psel["name"]] < pmax)
mmask &= cat[psel["name"]] < pmax
pos = pos[mmask, ...]
# Secondary selection
@ -93,12 +81,13 @@ def read_single(selection, cat):
if ssel.get("marked", True):
x = cat[psel["name"]][mmask]
prop = csiborgtools.clustering.normalised_marks(
x, prop, nbins=config["nbins_marks"])
x, prop, nbins=config["nbins_marks"]
)
if smin is not None:
smask &= (prop >= smin)
smask &= prop >= smin
if smax is not None:
smask &= (prop < smax)
smask &= prop < smax
return pos[smask, ...]
@ -110,8 +99,11 @@ def do_auto(run, cat, ic):
return
rvs_gen = csiborgtools.clustering.RVSinsphere(Rmax)
bins = numpy.logspace(numpy.log10(config["rpmin"]),
numpy.log10(config["rpmax"]), config["nrpbins"] + 1)
bins = numpy.logspace(
numpy.log10(config["rpmin"]),
numpy.log10(config["rpmax"]),
config["nrpbins"] + 1,
)
pos = read_single(_config, cat)
nrandom = int(config["randmult"] * pos.shape[0])
rp, wp = tpcf(pos, rvs_gen, nrandom, bins)

View file

@ -16,18 +16,26 @@
A script to fit halos (concentration, ...). The particle array of each CSiBORG
realisation must have been split in advance by `runsplit_halos`.
"""
from argparse import ArgumentParser
from datetime import datetime
from os.path import join
import numpy
from mpi4py import MPI
from tqdm import tqdm
try:
import csiborgtools
except ModuleNotFoundError:
import sys
sys.path.append("../")
import csiborgtools
parser = ArgumentParser()
parser.add_argument("--kind", type=str, choices=["halos", "clumps"])
args = parser.parse_args()
# Get MPI things
comm = MPI.COMM_WORLD
@ -36,127 +44,169 @@ nproc = comm.Get_size()
paths = csiborgtools.read.CSiBORGPaths(**csiborgtools.paths_glamdring)
partreader = csiborgtools.read.ParticleReader(paths)
cols_collect = [("npart", numpy.int64), ("totpartmass", numpy.float64),
("Rs", numpy.float64), ("vx", numpy.float64),
("vy", numpy.float64), ("vz", numpy.float64),
("Lx", numpy.float64), ("Ly", numpy.float64),
("Lz", numpy.float64), ("rho0", numpy.float64),
("conc", numpy.float64), ("rmin", numpy.float64),
("rmax", numpy.float64), ("r200", numpy.float64),
("r500", numpy.float64), ("m200", numpy.float64),
("m500", numpy.float64), ("lambda200c", numpy.float64)]
def fit_clump(particles, clump, box):
nfwpost = csiborgtools.fits.NFWPosterior()
ftemp = join(paths.temp_dumpdir, "fit_clump_{}_{}_{}.npy")
cols_collect = [
("index", numpy.int32),
("npart", numpy.int32),
("totpartmass", numpy.float32),
("vx", numpy.float32),
("vy", numpy.float32),
("vz", numpy.float32),
("conc", numpy.float32),
("rho0", numpy.float32),
("r200c", numpy.float32),
("r500c", numpy.float32),
("m200c", numpy.float32),
("m500c", numpy.float32),
("lambda200c", numpy.float32),
("r200m", numpy.float32),
("m200m", numpy.float32),
]
def fit_clump(particles, clump_info, box):
"""
Fit an object. Can be either a clump or a parent halo.
"""
obj = csiborgtools.fits.Clump(particles, clump_info, box)
out = {}
if numpy.isnan(clump_info["index"]):
print("Why am I NaN?", flush=True)
out["index"] = clump_info["index"]
out["npart"] = len(obj)
out["totpartmass"] = numpy.sum(obj["M"])
for i, v in enumerate(["vx", "vy", "vz"]):
out[v] = numpy.average(obj.vel[:, i], weights=obj["M"])
# Overdensity masses
out["r200c"], out["m200c"] = obj.spherical_overdensity_mass(200, kind="crit")
out["r500c"], out["m500c"] = obj.spherical_overdensity_mass(500, kind="crit")
out["r200m"], out["m200m"] = obj.spherical_overdensity_mass(200, kind="matter")
# NFW fit
if out["npart"] > 10 and numpy.isfinite(out["r200c"]):
Rs, rho0 = nfwpost.fit(obj)
out["conc"] = Rs / out["r200c"]
out["rho0"] = rho0
# Spin within R200c
if numpy.isfinite(out["r200c"]):
out["lambda200c"] = obj.lambda_bullock(out["r200c"])
return out
out["npart"][n] = clump.Npart
out["rmin"][n] = clump.rmin
out["rmax"][n] = clump.rmax
out["totpartmass"][n] = clump.total_particle_mass
out["vx"][n] = numpy.average(clump.vel[:, 0], weights=clump.m)
out["vy"][n] = numpy.average(clump.vel[:, 1], weights=clump.m)
out["vz"][n] = numpy.average(clump.vel[:, 2], weights=clump.m)
out["Lx"][n], out["Ly"][n], out["Lz"][n] = clump.angular_momentum
def load_clump_particles(clumpid, particle_archive):
"""
Load a clump's particles from the particle archive. If it is not there, i.e
clump has no associated particles, return `None`.
"""
try:
part = particle_archive[str(clumpid)]
except KeyError:
part = None
return part
def load_parent_particles(clumpid, particle_archive, clumps_cat):
"""
Load a parent halo's particles.
"""
indxs = clumps_cat["index"][clumps_cat["parent"] == clumpid]
# We first load the particles of each clump belonging to this parent and then
# concatenate them for further analysis.
clumps = []
for ind in indxs:
parts = load_clump_particles(ind, particle_archive)
if parts is not None:
clumps.append([parts, None])
if len(clumps) == 0:
return None
return csiborgtools.match.concatenate_clumps(clumps, include_velocities=True)
# We now start looping over all simulations
for i, nsim in enumerate(paths.get_ics(tonew=False)):
if rank == 0:
print("{}: calculating {}th simulation `{}`."
.format(datetime.now(), i, nsim), flush=True)
print(
"{}: calculating {}th simulation `{}`.".format(datetime.now(), i, nsim),
flush=True,
)
nsnap = max(paths.get_snapshots(nsim))
box = csiborgtools.read.BoxUnits(nsnap, nsim, paths)
# Archive of clumps, keywords are their clump IDs
particle_archive = paths.split_path(nsnap, nsim)
clumpsarr = partreader.read_clumps(nsnap, nsim,
cols=["index", 'x', 'y', 'z'])
clumpid2arrpos = {ind: ii for ii, ind in enumerate(clumpsarr["index"])}
particle_archive = numpy.load(paths.split_path(nsnap, nsim))
clumps_cat = csiborgtools.read.ClumpsCatalogue(
nsim, paths, maxdist=None, minmass=None, rawdata=True, load_fitted=False
)
# We check whether we fit halos or clumps, since we will be indexing over
# different iterators.
if args.kind == "halos":
ismain = clumps_cat.ismain
else:
ismain = numpy.ones(len(clumps_cat), dtype=bool)
ntasks = len(clumps_cat)
# We split the clumps among the processes. Each CPU calculates a fraction
# of them and dumps the results in a structured array. Even if we are
# calculating parent halos, this index runs over all clumps.
jobs = csiborgtools.fits.split_jobs(ntasks, nproc)[rank]
out = csiborgtools.read.cols_to_structured(len(jobs), cols_collect)
for i, j in enumerate(tqdm(jobs)) if nproc == 1 else enumerate(jobs):
# If we are fitting halos and this clump is not a main, then continue.
if args.kind == "halos" and not ismain[j]:
continue
clumpid = clumps_cat["index"][j]
if args.kind == "halos":
part = load_parent_particles(clumpid, particle_archive, clumps_cat)
else:
part = load_clump_particles(clumpid, particle_archive)
nclumps = len(particle_archive.files)
# Fit 5000 clumps at a time, then dump results
batchsize = 5000
# We fit the particles if there are any. If not we assign the index,
# otherwise it would be NaN converted to integers (-2147483648) and
# yield an error further down.
if part is not None:
_out = fit_clump(part, clumps_cat[j], box)
for key in _out.keys():
out[key][i] = _out[key]
else:
out["index"][i] = clumpid
out["npart"][i] = 0
# This rank does these `batchsize` clumps/halos
jobs = csiborgtools.utils.split_jobs(nclumps, nclumps // batchsize)[rank]
for clumpid in jobs:
... = fit_clump(particle_archive[str(clumpid)], clumpsarr[clumpid2arrpos[clumpid]])
jobs = csiborgtools.utils.split_jobs(nclumps, nproc)[rank]
for nsplit in jobs:
parts, part_clumps, clumps = csiborgtools.fits.load_split_particles(
nsplit, nsnap, nsim, paths, remove_split=False)
N = clumps.size
cols = [("index", numpy.int64), ("npart", numpy.int64),
("totpartmass", numpy.float64), ("Rs", numpy.float64),
("rho0", numpy.float64), ("conc", numpy.float64),
("lambda200c", numpy.float64), ("vx", numpy.float64),
("vy", numpy.float64), ("vz", numpy.float64),
("Lx", numpy.float64), ("Ly", numpy.float64),
("Lz", numpy.float64), ("rmin", numpy.float64),
("rmax", numpy.float64), ("r200", numpy.float64),
("r500", numpy.float64), ("m200", numpy.float64),
("m500", numpy.float64)]
out = csiborgtools.utils.cols_to_structured(N, cols)
out["index"] = clumps["index"]
for n in range(N):
# Pick clump and its particles
xs = csiborgtools.fits.pick_single_clump(n, parts, part_clumps,
clumps)
clump = csiborgtools.fits.Clump.from_arrays(
*xs, rhoc=box.box_rhoc, G=box.box_G)
out["npart"][n] = clump.Npart
out["rmin"][n] = clump.rmin
out["rmax"][n] = clump.rmax
out["totpartmass"][n] = clump.total_particle_mass
out["vx"][n] = numpy.average(clump.vel[:, 0], weights=clump.m)
out["vy"][n] = numpy.average(clump.vel[:, 1], weights=clump.m)
out["vz"][n] = numpy.average(clump.vel[:, 2], weights=clump.m)
out["Lx"][n], out["Ly"][n], out["Lz"][n] = clump.angular_momentum
# Spherical overdensity radii and masses
rs, ms = clump.spherical_overdensity_mass([200, 500])
out["r200"][n] = rs[0]
out["r500"][n] = rs[1]
out["m200"][n] = ms[0]
out["m500"][n] = ms[1]
out["lambda200c"][n] = clump.lambda200c
# NFW profile fit
if clump.Npart > 10 and numpy.isfinite(out["r200"][n]):
nfwpost = csiborgtools.fits.NFWPosterior(clump)
logRs, __ = nfwpost.maxpost_logRs()
Rs = 10**logRs
if not numpy.isnan(logRs):
out["Rs"][n] = Rs
out["rho0"][n] = nfwpost.rho0_from_Rs(Rs)
out["conc"][n] = out["r200"][n] / Rs
csiborgtools.read.dump_split(out, nsplit, nsnap, nsim, paths)
# Wait until all jobs finished before moving to another simulation
fout = ftemp.format(str(nsim).zfill(5), str(nsnap).zfill(5), rank)
if nproc == 1:
print(
"{}: rank {} saving to `{}`.".format(datetime.now(), rank, fout), flush=True
)
numpy.save(fout, out)
# We saved this CPU's results in a temporary file. Wait now for the other
# CPUs and then collect results from the 0th rank and save them.
comm.Barrier()
# # Use the rank 0 to combine outputs for this CSiBORG realisation
# if rank == 0:
# print("Collecting results!")
# partreader = csiborgtools.read.ParticleReader(paths)
# out_collected = csiborgtools.read.combine_splits(
# utils.Nsplits, nsnap, nsim, partreader, cols_collect,
# remove_splits=True, verbose=False)
# fname = paths.hcat_path(nsim)
# print("Saving results to `{}`.".format(fname))
# numpy.save(fname, out_collected)
#
# comm.Barrier()
#
# if rank == 0:
# print("All finished! See ya!")
if rank == 0:
print(
"{}: collecting results for simulation `{}`.".format(datetime.now(), nsim),
flush=True,
)
# We write to the output array. Load data from each CPU and append to
# the output array.
out = csiborgtools.read.cols_to_structured(ntasks, cols_collect)
clumpid2outpos = {indx: i for i, indx in enumerate(clumps_cat["index"])}
for i in range(nproc):
inp = numpy.load(ftemp.format(str(nsim).zfill(5), str(nsnap).zfill(5), i))
for j, clumpid in enumerate(inp["index"]):
k = clumpid2outpos[clumpid]
for key in inp.dtype.names:
out[key][k] = inp[key][j]
# If we were analysing main halos, then remove array indices that do
# not correspond to parent halos.
if args.kind == "halos":
out = out[ismain]
fout = paths.structfit_path(nsnap, nsim, "clumps")
print("Saving to `{}`.".format(fout), flush=True)
numpy.save(fout, out)
# We now wait before moving on to another simulation.
comm.Barrier()
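The scatter / temp-file / gather pattern the script now follows, as a minimal self-contained MPI sketch (file names are hypothetical and a simple round-robin split stands in for `split_jobs`; run with e.g. `mpiexec -n 4`):

import numpy
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank, nproc = comm.Get_rank(), comm.Get_size()

ntasks = 10
jobs = list(range(ntasks))[rank::nproc]           # this rank's tasks
out = numpy.array([j**2 for j in jobs])           # stand-in for the fits
numpy.save("temp_rank_{}.npy".format(rank), out)  # per-rank temporary dump

comm.Barrier()  # wait for every rank before collecting
if rank == 0:
    collected = numpy.concatenate(
        [numpy.load("temp_rank_{}.npy".format(i)) for i in range(nproc)])
    print(collected.size)  # ntasks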

View file

@ -12,7 +12,10 @@
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""Script to split particles to indivudual files according to their clump."""
"""
Script to split particles to individual files according to their clump. This is
useful for calculating the halo properties directly from the particles.
"""
from datetime import datetime
from gc import collect
from glob import glob
@ -28,6 +31,7 @@ try:
import csiborgtools
except ModuleNotFoundError:
import sys
sys.path.append("../")
import csiborgtools
@ -38,7 +42,7 @@ nproc = comm.Get_size()
paths = csiborgtools.read.CSiBORGPaths(**csiborgtools.paths_glamdring)
verbose = nproc == 1
partcols = ['x', 'y', 'z', "vx", "vy", "vz", 'M']
partcols = ["x", "y", "z", "vx", "vy", "vz", "M"]
def do_split(nsim):
@ -46,7 +50,7 @@ def do_split(nsim):
reader = csiborgtools.read.ParticleReader(paths)
ftemp_base = join(
paths.temp_dumpdir,
"split_{}_{}".format(str(nsim).zfill(5), str(nsnap).zfill(5))
"split_{}_{}".format(str(nsim).zfill(5), str(nsnap).zfill(5)),
)
ftemp = ftemp_base + "_{}.npz"
@ -85,7 +89,7 @@ def do_split(nsim):
# Now load back in every temporary file, combine them into a single
# dictionary and save as a single .npz file.
out = {}
for file in glob(ftemp_base + '*'):
for file in glob(ftemp_base + "*"):
inp = numpy.load(file)
for key in inp.files:
out.update({key: inp[key]})
@ -107,7 +111,6 @@ if nproc > 1:
worker_process(do_split, comm, verbose=False)
else:
tasks = paths.get_ics(tonew=False)
tasks = [tasks[0]] # REMOVE
for task in tasks:
print("{}: completing task `{}`.".format(datetime.now(), task))
do_split(task)
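The temporary-file merge in `do_split` boils down to combining per-rank `.npz` archives into one dictionary; a sketch with hypothetical file names:

from glob import glob
import numpy

out = {}
for file in glob("split_07444_00951_*.npz"):
    inp = numpy.load(file)
    for key in inp.files:
        out[key] = inp[key]
numpy.savez("clumps_07444_00951.npz", **out)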