Mirror of https://github.com/Richard-Sti/csiborgtools.git (synced 2024-12-22 11:58:02 +00:00)
New filesystem, basic neighbours calculation (#10)
* simplify Planck catalogue
* add MCXC and base survey
* Add 2MPP classes
* move match to MCXC to Planck
* add halo catalogue
* rm comment
* rm unused imports
* Move conversion to box
* add min mass
* Run on all simulations
* rm old function
* add combined catalogue
* add halo positions
* add knn neighbors
* set to 5 sims for testing
* add docstring
* Switch to neighbours in a catalogue
* rename io to read
* fix indentation
* rename to read
* io -> read
* add import
* add RealisationMatcher
* io -> read
* add docstrings
* add search_sim_indices
* update todo
* keep make_cat at 10 for now
* update nb
This commit is contained in:
parent f5bf04c46e
commit c748c87e45

17 changed files with 3515 additions and 549 deletions
@@ -1,10 +1,9 @@
# CSiBORG tools

## :scroll: Short-term TODO
- [x] Add the X-ray clusters
- [x] Match the X-ray clusters to Planck
- [ ] Calculate catalogues for all realisations.
- [ ] Find the distribution of particles in the first snapshot
- [ ] Implement Max's halo matching
- [ ] Implement a custom model for matching galaxies to halos.


## :hourglass: Long-term TODO
@@ -13,4 +13,4 @@
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

from csiborgtools import (io, match, utils, units, fits)  # noqa
from csiborgtools import (read, match, utils, units, fits)  # noqa
@@ -22,7 +22,7 @@ from os import remove
from warnings import warn
from os.path import join
from tqdm import trange
from ..io import nparts_to_start_ind
from ..read import nparts_to_start_ind


def clump_with_particles(particle_clumps, clumps):
@@ -1,263 +0,0 @@
# Copyright (C) 2022 Richard Stiskalek
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""
Scripts to read in observations.
"""

import numpy
from astropy.io import fits
from astropy.coordinates import SkyCoord
from astropy import units as u
from ..utils import (add_columns, cols_to_structured)

F64 = numpy.float64


def read_planck2015(fpath, cosmo, max_comdist=None):
    r"""
    Read the Planck 2nd Sunyaev-Zeldovich source catalogue [1]. The following
    is performed:
        - removes clusters without a redshift estimate,
        - calculates the comoving distance with the provided cosmology,
        - converts `MSZ` from units of :math:`10^{14} M_\odot` to
          :math:`M_\odot`.

    Parameters
    ----------
    fpath : str
        Path to the source catalogue.
    cosmo : `astropy.cosmology` object
        The cosmology to calculate cluster comoving distance from redshift and
        convert their mass.
    max_comdist : float, optional
        Maximum comoving distance threshold in units of :math:`\mathrm{Mpc}`.
        By default `None` and no threshold is applied.

    Returns
    -------
    out : structured array
        The catalogue structured array.

    References
    ----------
    [1] https://heasarc.gsfc.nasa.gov/W3Browse/all/plancksz2.html
    """
    data = fits.open(fpath)[1].data
    hdata = 0.7  # Little h of the catalogue
    # Convert FITS to a structured array
    out = numpy.full(data.size, numpy.nan, dtype=data.dtype.descr)
    for name in out.dtype.names:
        out[name] = data[name]
    # Take only clusters with redshifts
    out = out[out["REDSHIFT"] >= 0]
    # Add comoving distance
    dist = cosmo.comoving_distance(out["REDSHIFT"]).value
    out = add_columns(out, dist, "COMDIST")
    # Convert masses
    for par in ("MSZ", "MSZ_ERR_UP", "MSZ_ERR_LOW"):
        out[par] *= 1e14
        out[par] *= (hdata / cosmo.h)**2
    # Distance threshold
    if max_comdist is not None:
        out = out[out["COMDIST"] < max_comdist]

    return out


def read_mcxc(fpath, cosmo, max_comdist=None):
    r"""
    Read the MCXC Meta-Catalog of X-Ray Detected Clusters of Galaxies
    catalogue [1], with data description at [2] and download at [3].

    Note
    ----
    The exact mass conversion has a non-trivial dependence on :math:`H(z)`,
    see [1] for more details. However, this should be negligible.

    Parameters
    ----------
    fpath : str
        Path to the source catalogue obtained from [3]. Expected to be the
        FITS file.
    cosmo : `astropy.cosmology` object
        The cosmology to calculate cluster comoving distance from redshift and
        convert their mass.
    max_comdist : float, optional
        Maximum comoving distance threshold in units of :math:`\mathrm{Mpc}`.
        By default `None` and no threshold is applied.

    Returns
    -------
    out : structured array
        The catalogue structured array.

    References
    ----------
    [1] The MCXC: a meta-catalogue of X-ray detected clusters of galaxies
        (2011); Piffaretti, R.; Arnaud, M.; Pratt, G. W.; Pointecouteau, E.;
        Melin, J.-B.
    [2] https://heasarc.gsfc.nasa.gov/W3Browse/rosat/mcxc.html
    [3] https://cdsarc.cds.unistra.fr/viz-bin/cat/J/A+A/534/A109#/article
    """
    data = fits.open(fpath)[1].data
    hdata = 0.7  # Little h of the catalogue

    cols = [("RAdeg", F64), ("DEdeg", F64), ("z", F64),
            ("L500", F64), ("M500", F64), ("R500", F64)]
    out = cols_to_structured(data.size, cols)
    for col in cols:
        par = col[0]
        out[par] = data[par]
    # Get little h units to match the cosmology
    out["L500"] *= (hdata / cosmo.h)**2
    out["M500"] *= (hdata / cosmo.h)**2
    # Get the 10s back in
    out["L500"] *= 1e44  # ergs/s
    out["M500"] *= 1e14  # Msun

    dist = cosmo.comoving_distance(data["z"]).value
    out = add_columns(out, dist, "COMDIST")
    out = add_columns(out, data["MCXC"], "name")

    if max_comdist is not None:
        out = out[out["COMDIST"] < max_comdist]

    return out


def read_2mpp(fpath, dist_cosmo):
    """
    Read in the 2M++ galaxy redshift catalogue [1], with the catalogue at [2].
    Removes fake galaxies used to fill the zone of avoidance. Note that in
    principle additional care should be taken for calculating the distance
    to objects [3]. It is currently calculated from the CMB redshift, so some
    distance estimates may be negative.

    Parameters
    ----------
    fpath : str
        File path to the catalogue.
    dist_cosmo : `astropy.cosmology` object
        The cosmology to calculate distance from redshift.

    Returns
    -------
    out : structured array
        The catalogue.

    References
    ----------
    [1] The 2M++ galaxy redshift catalogue; Lavaux, Guilhem, Hudson, Michael J.
    [2] https://cdsarc.cds.unistra.fr/viz-bin/cat/J/MNRAS/416/2840#/article
    [3] Improving NASA/IPAC Extragalactic Database Redshift Calculations
        (2021); Anthony Carr and Tamara Davis
    """
    from scipy.constants import c
    # Read the catalogue and select non-fake galaxies
    cat = numpy.genfromtxt(fpath, delimiter="|")
    cat = cat[cat[:, 12] == 0, :]

    cols = [("RA", F64), ("DEC", F64), ("Ksmag", F64), ("ZCMB", F64),
            ("DIST", F64)]
    out = cols_to_structured(cat.shape[0], cols)
    out["RA"] = cat[:, 1]
    out["DEC"] = cat[:, 2]
    out["Ksmag"] = cat[:, 5]
    out["ZCMB"] = cat[:, 7] / (c * 1e-3)
    out["DIST"] = cat[:, 7] / dist_cosmo.H0
    return out


def read_2mpp_groups(fpath, dist_cosmo):
    """
    Read in the 2M++ galaxy group catalogue [1], with the catalogue at [2].
    Note that the same caveats apply to the distance calculation as in
    :py:func:`read_2mpp`. See that function for more details.

    Parameters
    ----------
    fpath : str
        File path to the catalogue.
    dist_cosmo : `astropy.cosmology` object
        The cosmology to calculate distance from redshift.

    Returns
    -------
    out : structured array
        The catalogue.

    References
    ----------
    [1] The 2M++ galaxy redshift catalogue; Lavaux, Guilhem, Hudson, Michael J.
    [2] https://cdsarc.cds.unistra.fr/viz-bin/cat/J/MNRAS/416/2840#/article
    [3] Improving NASA/IPAC Extragalactic Database Redshift Calculations
        (2021); Anthony Carr and Tamara Davis
    """
    cat = numpy.genfromtxt(fpath, delimiter="|")

    cols = [("RA", F64), ("DEC", F64), ("K2mag", F64), ("Rich", numpy.int64),
            ("dist", F64), ("sigma", F64)]
    out = cols_to_structured(cat.shape[0], cols)

    out["K2mag"] = cat[:, 3]
    out["Rich"] = cat[:, 4]
    out["sigma"] = cat[:, 7]
    out["dist"] = cat[:, 6] / dist_cosmo.H0

    # Convert galactic coordinates to RA, dec
    glon = cat[:, 1]
    glat = cat[:, 2]
    coords = SkyCoord(l=glon*u.degree, b=glat*u.degree, frame='galactic')
    coords = coords.transform_to("icrs")
    out["RA"] = coords.ra
    out["DEC"] = coords.dec

    return out


def match_planck_to_mcxc(planck, mcxc):
    """
    Return the MCXC catalogue indices of the Planck catalogue detections.
    Finds the index of the quoted Planck MCXC counterpart in the MCXC array.
    If not found, throws an error. For this reason it may be better to make
    sure the MCXC catalogue reaches further.

    Parameters
    ----------
    planck : structured array
        The Planck cluster array.
    mcxc : structured array
        The MCXC cluster array.

    Returns
    -------
    indxs : 1-dimensional array
        The array of MCXC indices to match the Planck array. If no counterpart
        is found returns `numpy.nan`.
    """
    # Planck MCXC names need to be decoded to str
    planck_names = [name.decode() for name in planck["MCXC"]]
    mcxc_names = [name for name in mcxc["name"]]

    indxs = [numpy.nan] * len(planck_names)
    for i, name in enumerate(planck_names):
        if name == "":
            continue
        if name in mcxc_names:
            indxs[i] = mcxc_names.index(name)
        else:
            raise ValueError("Planck MCXC identifier `{}` not found in the "
                             "MCXC catalogue.".format(name))
    return indxs
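Both the removed readers above and the classes added later in this diff lean on `cols_to_structured` and `add_columns` from `csiborgtools.utils`, which are not part of this diff. A minimal sketch of what they presumably do, consistent with the call sites in this commit (the actual implementations may differ):

import numpy


def cols_to_structured(size, cols):
    # Sketch: pre-allocate a structured array from (name, dtype) pairs.
    return numpy.zeros(size, dtype=numpy.dtype(cols))


def add_columns(arr, X, names):
    # Sketch: return a copy of `arr` with extra named columns taken from `X`.
    names = [names] if isinstance(names, str) else names
    X = numpy.vstack(X) if isinstance(X, (list, tuple)) else numpy.asarray(X)
    if X.ndim == 1:
        X = X.reshape(1, -1)
    elif X.shape[0] != len(names):  # columns stored along the second axis
        X = X.T
    descr = arr.dtype.descr + [(name, X.dtype.str) for name in names]
    out = numpy.zeros(arr.size, dtype=descr)
    for name in arr.dtype.names:      # copy the existing columns
        out[name] = arr[name]
    for i, name in enumerate(names):  # append the new ones
        out[name] = X[i]
    return out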
@@ -13,6 +13,6 @@
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

from .match import brute_spatial_separation  # noqa
from .match import (brute_spatial_separation, RealisationsMatcher)  # noqa
from .num_density import (binned_counts, number_density)  # noqa
# from .correlation import (get_randoms_sphere, sphere_angular_tpcf)  # noqa
@@ -16,6 +16,7 @@
import numpy
from tqdm import tqdm
from astropy.coordinates import SkyCoord
from ..read import CombinedHaloCatalogue


def brute_spatial_separation(c1, c2, angular=False, N=None, verbose=False):

@@ -65,3 +66,130 @@ def brute_spatial_separation(c1, c2, angular=False, N=None, verbose=False):
        sep[i, :] = dist[sort]

    return sep, indxs


class RealisationsMatcher:
    """
    A tool to match halos between IC realisations. For each halo it looks for
    3D spatial neighbours in all remaining IC realisations that are within
    some mass range of it.

    Parameters
    ----------
    cats : :py:class:`csiborgtools.read.CombinedHaloCatalogue`
        Combined halo catalogue to search.
    """
    _cats = None

    def __init__(self, cats):
        self.cats = cats

    @property
    def cats(self):
        """
        Combined catalogues.

        Returns
        -------
        cats : :py:class:`csiborgtools.read.CombinedHaloCatalogue`
            Combined halo catalogue.
        """
        return self._cats

    @cats.setter
    def cats(self, cats):
        """
        Sets `cats`, ensuring that it is a `CombinedHaloCatalogue` object.
        """
        if not isinstance(cats, CombinedHaloCatalogue):
            raise TypeError("`cats` must be of type `CombinedHaloCatalogue`.")
        self._cats = cats

    def search_sim_indices(self, n_sim):
        """
        Return indices of all IC realisations except `n_sim`.

        Parameters
        ----------
        n_sim : int
            IC realisation index of `self.cats` to be skipped.

        Returns
        -------
        indxs : list of int
            The remaining IC indices.
        """
        return [i for i in range(self.cats.N) if i != n_sim]

    def cross_knn_position_single(self, n_sim, nmult=5, dlogmass=2):
        r"""
        Find all neighbours within :math:`n_{\rm mult} R_{200c}` of halos in
        the `n_sim`th simulation. Also enforces that the neighbours'
        :math:`\log M_{200c}` be within `dlogmass` dex.

        Parameters
        ----------
        n_sim : int
            Index of the IC realisation in `self.cats` whose halos' neighbours
            in the remaining simulations are searched for.
        nmult : float or int, optional
            Multiple of :math:`R_{200c}` within which to return neighbours. By
            default 5.
        dlogmass : float, optional
            Tolerance on the logarithmic mass difference. By default 2 dex.

        Returns
        -------
        matches : composite array
            Array whose axes are `(n_sims - 1, 2, n_halos, n_matches)`. The
            2nd axis is `index` of the neighbouring halo in its catalogue and
            `dist`, which is the 3D distance to the halo whose neighbours are
            searched.
        """
        # R200c, M200c and positions of halos in the `n_sim` IC realisation
        r200 = self.cats[n_sim]["r200"]
        logm200 = numpy.log10(self.cats[n_sim]["m200"])
        pos = self.cats[n_sim].positions

        matches = [None] * (self.cats.N - 1)
        # Search for neighbours in the other simulations
        for count, i in enumerate(self.search_sim_indices(n_sim)):
            dist, indxs = self.cats[i].radius_neigbours(pos, r200 * nmult)
            # Get rid of neighbours whose mass is too far off
            for j, indx in enumerate(indxs):
                match_logm200 = numpy.log10(self.cats[i][indx]["m200"])
                mask = numpy.abs(match_logm200 - logm200[j]) < dlogmass
                dist[j] = dist[j][mask]
                indxs[j] = indx[mask]
            # Append as a composite array
            matches[count] = numpy.asarray([indxs, dist], dtype=object)

        return numpy.asarray(matches, dtype=object)

    def cross_knn_position_all(self, nmult=5, dlogmass=2):
        r"""
        Find all neighbours within :math:`n_{\rm mult} R_{200c}` of halos in
        all simulations listed in `self.cats`. Also enforces that the
        neighbours' :math:`\log M_{200c}` be within `dlogmass` dex.

        Parameters
        ----------
        nmult : float or int, optional
            Multiple of :math:`R_{200c}` within which to return neighbours. By
            default 5.
        dlogmass : float, optional
            Tolerance on the logarithmic mass difference. By default 2 dex.

        Returns
        -------
        matches : list of composite arrays
            List whose length is `n_sims`. For a description of its elements
            see `self.cross_knn_position_single(...)`.
        """
        N = self.cats.N  # Number of catalogues
        matches = [None] * N
        # Loop over each catalogue
        for i in range(N):
            matches[i] = self.cross_knn_position_single(i, nmult, dlogmass)
        return matches
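A usage sketch of the new matcher, assuming the processed catalogues on the default paths above exist (the parameter values are illustrative):

import csiborgtools

# Combined catalogue; at this stage restricted to the first 10 realisations
cats = csiborgtools.read.CombinedHaloCatalogue(minimum_m500=1e13)
matcher = csiborgtools.match.RealisationsMatcher(cats)

# Neighbours of the 0th realisation's halos in all other realisations,
# within 5 R200c and 2 dex in log M200c
matches = matcher.cross_knn_position_single(0, nmult=5, dlogmass=2)
indxs, dist = matches[0]  # neighbours in the first of the other catalogues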
@@ -17,7 +17,7 @@ from .readsim import (get_csiborg_ids, get_sim_path, get_snapshots,  # noqa
                      get_snapshot_path, get_maximum_snapshot, read_info, nparts_to_start_ind,  # noqa
                      open_particle, open_unbinding, read_particle,  # noqa
                      drop_zero_indx,  # noqa
                      read_clumpid, read_clumps, read_mmain,  # noqa
                      merge_mmain_to_clumps)  # noqa
from .readobs import (read_planck2015, read_mcxc, read_2mpp, read_2mpp_groups, match_planck_to_mcxc)  # noqa
                      read_clumpid, read_clumps, read_mmain)  # noqa
from .make_cat import (HaloCatalogue, CombinedHaloCatalogue)  # noqa
from .readobs import (PlanckClusters, MCXCClusters, TwoMPPGalaxies, TwoMPPGroups)  # noqa
from .outsim import (dump_split, combine_splits)  # noqa
csiborgtools/read/make_cat.py (new file, 331 lines)
@@ -0,0 +1,331 @@
# Copyright (C) 2022 Richard Stiskalek, Harry Desmond
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""
Functions to read in the particle and clump files.
"""

import numpy
from os.path import join
from tqdm import trange
from sklearn.neighbors import NearestNeighbors
from .readsim import (get_sim_path, read_mmain, get_csiborg_ids,
                      get_maximum_snapshot)
from ..utils import (flip_cols, add_columns)
from ..units import (BoxUnits, cartesian_to_radec)


class HaloCatalogue:
    r"""
    Processed halo catalogue, whose data should be calculated in
    `run_fit_halos`.

    Parameters
    ----------
    n_sim : int
        Initial condition index.
    n_snap : int
        Snapshot index.
    minimum_m500 : float, optional
        The minimum :math:`M_{\rm 500c} / M_\odot` mass. By default no
        threshold.
    dumpdir : str, optional
        Path to where files from `run_fit_halos` are stored. By default
        `/mnt/extraspace/rstiskalek/csiborg/`.
    mmain_path : str, optional
        Path to where mmain files are stored. By default
        `/mnt/zfsusers/hdesmond/Mmain`.
    """
    _box = None
    _n_sim = None
    _n_snap = None
    _data = None
    _knn = None
    _positions = None

    def __init__(self, n_sim, n_snap, minimum_m500=None,
                 dumpdir="/mnt/extraspace/rstiskalek/csiborg/",
                 mmain_path="/mnt/zfsusers/hdesmond/Mmain"):
        self._box = BoxUnits(n_snap, get_sim_path(n_sim))
        minimum_m500 = 0 if minimum_m500 is None else minimum_m500
        self._set_data(n_sim, n_snap, dumpdir, mmain_path, minimum_m500)
        self._n_sim = n_sim
        self._n_snap = n_snap
        # Initialise the KNN
        knn = NearestNeighbors()
        knn.fit(self.positions)
        self._knn = knn

    @property
    def data(self):
        """
        Halo catalogue.

        Returns
        -------
        cat : structured array
            Catalogue.
        """
        if self._data is None:
            raise ValueError("`data` is not set!")
        return self._data

    @property
    def box(self):
        """
        Box object, useful for change of units.

        Returns
        -------
        box : :py:class:`csiborgtools.units.BoxUnits`
            The box object.
        """
        return self._box

    @property
    def cosmo(self):
        """
        The box cosmology.

        Returns
        -------
        cosmo : `astropy` cosmology object
            Box cosmology.
        """
        return self.box.cosmo

    @property
    def n_snap(self):
        """
        The snapshot ID.

        Returns
        -------
        n_snap : int
            Snapshot ID.
        """
        return self._n_snap

    @property
    def n_sim(self):
        """
        The initial condition (IC) realisation ID.

        Returns
        -------
        n_sim : int
            The IC ID.
        """
        return self._n_sim

    def _set_data(self, n_sim, n_snap, dumpdir, mmain_path, minimum_m500):
        """
        Loads the data, merges it with mmain and applies various coordinate
        transforms.
        """
        # Load the processed data
        fname = "ramses_out_{}_{}.npy".format(
            str(n_sim).zfill(5), str(n_snap).zfill(5))
        data = numpy.load(join(dumpdir, fname))

        # Load the mmain file and add it to the data
        mmain = read_mmain(n_sim, mmain_path)
        data = self.merge_mmain_to_clumps(data, mmain)
        flip_cols(data, "peak_x", "peak_z")

        # Cut on the number of particles and finite m200
        data = data[(data["npart"] > 100) & numpy.isfinite(data["m200"])]

        # Unit conversion
        convert_cols = ["m200", "m500", "totpartmass", "mass_mmain",
                        "r200", "r500", "Rs", "rho0",
                        "peak_x", "peak_y", "peak_z"]
        data = self.box.convert_from_boxunits(data, convert_cols)

        # Cut on mass. Note that this is in Msun
        data = data[data["m500"] > minimum_m500]

        # Now calculate spherical coordinates
        d, ra, dec = cartesian_to_radec(data)
        data = add_columns(data, [d, ra, dec], ["dist", "ra", "dec"])

        # Pre-allocate the positions array
        self._positions = numpy.vstack(
            [data["peak_{}".format(p)] for p in ("x", "y", "z")]).T

        self._data = data

    def merge_mmain_to_clumps(self, clumps, mmain):
        """
        Merge columns from the `mmain` files to the `clump` file, matching
        them by their halo index and assuming that the indices `index` in both
        arrays are sorted.

        Parameters
        ----------
        clumps : structured array
            Clumps structured array.
        mmain : structured array
            Parent halo array whose information is to be merged into `clumps`.

        Returns
        -------
        out : structured array
            Array with added columns.
        """
        X = numpy.full((clumps.size, 2), numpy.nan)
        # Mask of which clumps have a mmain index
        mask = numpy.isin(clumps["index"], mmain["index"])

        X[mask, 0] = mmain["mass_cl"]
        X[mask, 1] = mmain["sub_frac"]
        return add_columns(clumps, X, ["mass_mmain", "sub_frac"])

    @property
    def positions(self):
        """
        3D positions of halos.

        Returns
        -------
        X : 2-dimensional array
            Array of shape `(n_halos, 3)`, where the latter axis represents
            `x`, `y` and `z`.
        """
        return self._positions

    def radius_neigbours(self, X, radius):
        """
        Return sorted nearest neighbours within `radius` of `X`.

        Parameters
        ----------
        X : 2-dimensional array
            Array of shape `(n_queries, 3)`, where the latter axis represents
            `x`, `y` and `z`.
        radius : float
            Limiting distance of neighbours.

        Returns
        -------
        dist : list of 1-dimensional arrays
            List of length `n_queries` whose elements are arrays of distances
            to the nearest neighbours.
        knns : list of 1-dimensional arrays
            List of length `n_queries` whose elements are arrays of indices of
            nearest neighbours in this catalogue.
        """
        if not (X.ndim == 2 and X.shape[1] == 3):
            raise TypeError("`X` must be an array of shape `(n_samples, 3)`.")
        # Query the KNN
        return self._knn.radius_neighbors(X, radius, sort_results=True)

    @property
    def keys(self):
        """Catalogue keys."""
        return self.data.dtype.names

    def __getitem__(self, key):
        return self._data[key]


class CombinedHaloCatalogue:
    r"""
    A combined halo catalogue, containing `HaloCatalogue` for each IC
    realisation at the latest redshift.

    Parameters
    ----------
    minimum_m500 : float, optional
        The minimum :math:`M_{\rm 500c} / M_\odot` mass. By default no
        threshold.
    dumpdir : str, optional
        Path to where files from `run_fit_halos` are stored. By default
        `/mnt/extraspace/rstiskalek/csiborg/`.
    mmain_path : str, optional
        Path to where mmain files are stored. By default
        `/mnt/zfsusers/hdesmond/Mmain`.
    verbose : bool, optional
        Verbosity flag for reading the catalogues.
    """
    _n_sims = None
    _n_snaps = None
    _cats = None

    def __init__(self, minimum_m500=None,
                 dumpdir="/mnt/extraspace/rstiskalek/csiborg/",
                 mmain_path="/mnt/zfsusers/hdesmond/Mmain", verbose=True):
        # Read simulations and their maximum snapshots
        # NOTE remove this later and take all cats
        self._n_sims = get_csiborg_ids("/mnt/extraspace/hdesmond")[:10]
        n_snaps = [get_maximum_snapshot(get_sim_path(i)) for i in self._n_sims]
        self._n_snaps = numpy.asanyarray(n_snaps)

        cats = [None] * self.N
        for i in trange(self.N) if verbose else range(self.N):
            cats[i] = HaloCatalogue(self._n_sims[i], self._n_snaps[i],
                                    minimum_m500, dumpdir, mmain_path)
        self._cats = cats

    @property
    def N(self):
        """
        Number of IC realisations in this combined catalogue.

        Returns
        -------
        N : int
            Number of catalogues.
        """
        return len(self.n_sims)

    @property
    def n_sims(self):
        """
        CSiBORG identifiers of the IC realisations.

        Returns
        -------
        ids : 1-dimensional array
            Array of IDs.
        """
        return self._n_sims

    @property
    def n_snaps(self):
        """
        Snapshot numbers corresponding to `self.n_sims`.

        Returns
        -------
        n_snaps : 1-dimensional array
            Array of snapshot numbers.
        """
        return self._n_snaps

    @property
    def cats(self):
        """
        Catalogues associated with this object.

        Returns
        -------
        cats : list of `HaloCatalogue`
            Catalogues.
        """
        return self._cats

    def __getitem__(self, n):
        if n >= self.N:
            raise ValueError("Catalogue count is {}, requested catalogue {}."
                             .format(self.N, n))
        return self.cats[n]
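A short usage sketch of the catalogue classes (the IC and snapshot indices are illustrative; note that the method name keeps the repository's spelling `radius_neigbours`):

import csiborgtools

# Single processed catalogue and its keys
cat = csiborgtools.read.HaloCatalogue(n_sim=7444, n_snap=951)
print(cat.keys)

# Sorted neighbours within 10 Mpc of the first two halo positions
dist, indxs = cat.radius_neigbours(cat.positions[:2], radius=10.)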
csiborgtools/read/readobs.py (new file, 300 lines)
@@ -0,0 +1,300 @@
# Copyright (C) 2022 Richard Stiskalek
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""
Scripts to read in observations.
"""

import numpy
from astropy.io import fits
from astropy.coordinates import SkyCoord
from astropy.cosmology import FlatLambdaCDM
from astropy import units as u
from ..utils import (add_columns, cols_to_structured)

F64 = numpy.float64


class BaseSurvey:
    """
    Base survey class with some methods that are common to all survey classes.
    """
    _data = None
    _cosmo = None

    @property
    def data(self):
        """
        Survey catalogue.

        Returns
        -------
        cat : structured array
            Catalogue.
        """
        if self._data is None:
            raise ValueError("`data` is not set!")
        return self._data

    @property
    def cosmo(self):
        """Desired cosmology."""
        if self._cosmo is None:
            raise ValueError("`cosmo` is not set!")
        return self._cosmo

    @property
    def keys(self):
        """Catalogue keys."""
        return self.data.dtype.names

    def __getitem__(self, key):
        return self._data[key]


class PlanckClusters(BaseSurvey):
    r"""
    Planck 2nd Sunyaev-Zeldovich source catalogue [1]. Automatically removes
    clusters without a redshift estimate.

    Parameters
    ----------
    fpath : str
        Path to the source catalogue.
    cosmo : `astropy.cosmology` object, optional
        Cosmology to convert masses (particularly :math:`H_0`). By default
        `FlatLambdaCDM(H0=70.5, Om0=0.307, Tcmb0=2.728)`.
    max_redshift : float, optional
        Maximum cluster redshift. By default `None` and no selection is
        performed.

    References
    ----------
    [1] https://heasarc.gsfc.nasa.gov/W3Browse/all/plancksz2.html
    """
    _hdata = 0.7  # Little h value of the data

    def __init__(self, fpath, cosmo=None, max_redshift=None):
        if cosmo is None:
            self._cosmo = FlatLambdaCDM(H0=70.5, Om0=0.307, Tcmb0=2.728)
        else:
            self._cosmo = cosmo
        self.set_data(fpath, max_redshift)

    def set_data(self, fpath, max_redshift=None):
        """
        Set the catalogue: load it and apply a maximum redshift cut.
        """
        cat = fits.open(fpath)[1].data
        # Convert FITS to a structured array
        data = numpy.full(cat.size, numpy.nan, dtype=cat.dtype.descr)
        for name in cat.dtype.names:
            data[name] = cat[name]
        # Take only clusters with redshifts
        data = data[data["REDSHIFT"] >= 0]
        # Convert masses
        for par in ("MSZ", "MSZ_ERR_UP", "MSZ_ERR_LOW"):
            data[par] *= 1e14
            data[par] *= (self._hdata / self.cosmo.h)**2
        # Redshift cut
        if max_redshift is not None:
            data = data[data["REDSHIFT"] <= max_redshift]
        self._data = data

    def match_to_mcxc(self, mcxc):
        """
        Return the MCXC catalogue indices of the Planck catalogue detections.
        Finds the index of the quoted Planck MCXC counterpart in the MCXC
        array. If not found, throws an error. For this reason it may be better
        to make sure the MCXC catalogue reaches further.

        Parameters
        ----------
        mcxc : :py:class:`MCXCClusters`
            MCXC cluster object.

        Returns
        -------
        indxs : list of int
            Array of MCXC indices to match the Planck array. If no counterpart
            is found returns `numpy.nan`.
        """
        if not isinstance(mcxc, MCXCClusters):
            raise TypeError("`mcxc` must be `MCXCClusters` type.")

        # Planck MCXC names need to be decoded to str
        planck_names = [name.decode() for name in self["MCXC"]]
        mcxc_names = [name for name in mcxc["name"]]

        indxs = [numpy.nan] * len(planck_names)
        for i, name in enumerate(planck_names):
            if name == "":
                continue
            if name in mcxc_names:
                indxs[i] = mcxc_names.index(name)
            else:
                raise ValueError("Planck MCXC identifier `{}` not found in "
                                 "the MCXC catalogue.".format(name))
        return indxs


class MCXCClusters(BaseSurvey):
    r"""
    MCXC Meta-Catalog of X-Ray Detected Clusters of Galaxies catalogue [1],
    with data description at [2] and download at [3].

    Note
    ----
    The exact mass conversion has a non-trivial dependence on :math:`H(z)`,
    see [1] for more details. However, this should be negligible.

    Parameters
    ----------
    fpath : str
        Path to the source catalogue obtained from [3]. Expected to be the
        FITS file.
    cosmo : `astropy.cosmology` object, optional
        The cosmology to convert cluster masses (to first order). By default
        `FlatLambdaCDM(H0=70.5, Om0=0.307, Tcmb0=2.728)`.
    max_redshift : float, optional
        Maximum cluster redshift. By default `None` and no selection is
        performed.

    References
    ----------
    [1] The MCXC: a meta-catalogue of X-ray detected clusters of galaxies
        (2011); Piffaretti, R.; Arnaud, M.; Pratt, G. W.; Pointecouteau, E.;
        Melin, J.-B.
    [2] https://heasarc.gsfc.nasa.gov/W3Browse/rosat/mcxc.html
    [3] https://cdsarc.cds.unistra.fr/viz-bin/cat/J/A+A/534/A109#/article
    """
    _hdata = 0.7  # Little h of the catalogue

    def __init__(self, fpath, cosmo=None, max_redshift=None):
        if cosmo is None:
            self._cosmo = FlatLambdaCDM(H0=70.5, Om0=0.307, Tcmb0=2.728)
        else:
            self._cosmo = cosmo
        self._set_data(fpath, max_redshift)

    def _set_data(self, fpath, max_redshift):
        """
        Set the catalogue: load it and apply a maximum redshift cut.
        """
        cat = fits.open(fpath)[1].data
        # Pre-allocate array and extract selected variables
        cols = [("RAdeg", F64), ("DEdeg", F64), ("z", F64),
                ("L500", F64), ("M500", F64), ("R500", F64)]
        data = cols_to_structured(cat.size, cols)
        for col in cols:
            par = col[0]
            data[par] = cat[par]
        # Add the cluster names
        data = add_columns(data, cat["MCXC"], "name")

        # Get little h units to match the cosmology
        data["L500"] *= (self._hdata / self.cosmo.h)**2
        data["M500"] *= (self._hdata / self.cosmo.h)**2
        # Get the 10s back in
        data["L500"] *= 1e44  # ergs/s
        data["M500"] *= 1e14  # Msun

        if max_redshift is not None:
            data = data[data["z"] <= max_redshift]

        self._data = data


class TwoMPPGalaxies(BaseSurvey):
    """
    The 2M++ galaxy redshift catalogue [1], with the catalogue at [2].
    Removes fake galaxies used to fill the zone of avoidance. Note that the
    stated redshift is in the CMB frame.

    Parameters
    ----------
    fpath : str
        File path to the catalogue.

    References
    ----------
    [1] The 2M++ galaxy redshift catalogue; Lavaux, Guilhem, Hudson, Michael J.
    [2] https://cdsarc.cds.unistra.fr/viz-bin/cat/J/MNRAS/416/2840#/article
    [3] Improving NASA/IPAC Extragalactic Database Redshift Calculations
        (2021); Anthony Carr and Tamara Davis
    """

    def __init__(self, fpath):
        self._set_data(fpath)

    def _set_data(self, fpath):
        """
        Set the catalogue.
        """
        from scipy.constants import c
        # Read the catalogue and select non-fake galaxies
        cat = numpy.genfromtxt(fpath, delimiter="|")
        cat = cat[cat[:, 12] == 0, :]
        # Pre-allocate array and fill it. Note that `DIST` is left unset
        # (NaN), as no cosmology is attached to this survey.
        cols = [("RA", F64), ("DEC", F64), ("Ksmag", F64), ("ZCMB", F64),
                ("DIST", F64)]
        data = cols_to_structured(cat.shape[0], cols)
        data["RA"] = cat[:, 1]
        data["DEC"] = cat[:, 2]
        data["Ksmag"] = cat[:, 5]
        data["ZCMB"] = cat[:, 7] / (c * 1e-3)
        self._data = data


class TwoMPPGroups(BaseSurvey):
    """
    The 2M++ galaxy group catalogue [1], with the catalogue at [2].

    Parameters
    ----------
    fpath : str
        File path to the catalogue.

    References
    ----------
    [1] The 2M++ galaxy redshift catalogue; Lavaux, Guilhem, Hudson, Michael J.
    [2] https://cdsarc.cds.unistra.fr/viz-bin/cat/J/MNRAS/416/2840#/article
    [3] Improving NASA/IPAC Extragalactic Database Redshift Calculations
        (2021); Anthony Carr and Tamara Davis
    """

    def __init__(self, fpath):
        self._set_data(fpath)

    def _set_data(self, fpath):
        """
        Set the catalogue.
        """
        cat = numpy.genfromtxt(fpath, delimiter="|")
        # Pre-allocate and fill the array
        cols = [("RA", F64), ("DEC", F64), ("K2mag", F64),
                ("Rich", numpy.int64), ("sigma", F64)]
        data = cols_to_structured(cat.shape[0], cols)
        data["K2mag"] = cat[:, 3]
        data["Rich"] = cat[:, 4]
        data["sigma"] = cat[:, 7]

        # Convert galactic coordinates to RA, dec
        glon = cat[:, 1]
        glat = cat[:, 2]
        coords = SkyCoord(l=glon*u.degree, b=glat*u.degree, frame='galactic')
        coords = coords.transform_to("icrs")
        data["RA"] = coords.ra
        data["DEC"] = coords.dec
        self._data = data
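A usage sketch of the new survey classes, reusing the catalogue paths that appear in the notebook utilities further down in this diff:

from astropy.cosmology import FlatLambdaCDM

import csiborgtools

cosmo = FlatLambdaCDM(H0=70.5, Om0=0.307, Tcmb0=2.728)
planck = csiborgtools.read.PlanckClusters(
    "../data/HFI_PCCS_SZ-union_R2.08.fits", cosmo=cosmo)
mcxc = csiborgtools.read.MCXCClusters("../data/mcxc.fits", cosmo=cosmo)

# MCXC indices of the Planck detections (numpy.nan where no counterpart)
indxs = planck.match_to_mcxc(mcxc)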
@@ -22,8 +22,7 @@ from os import listdir
from os.path import (join, isfile)
from glob import glob
from tqdm import tqdm

from ..utils import (cols_to_structured, add_columns)
from ..utils import cols_to_structured


F16 = numpy.float16
@@ -512,30 +511,3 @@ def read_mmain(n, srcdir, fname="Mmain_{}.npy"):
        out[name] = arr[:, i]

    return out


def merge_mmain_to_clumps(clumps, mmain):
    """
    Merge columns from the `mmain` files to the `clump` file, matching them
    by their halo index and assuming that the indices `index` in both arrays
    are sorted.

    Parameters
    ----------
    clumps : structured array
        Clumps structured array.
    mmain : structured array
        Parent halo array whose information is to be merged into `clumps`.

    Returns
    -------
    out : structured array
        Array with added columns.
    """
    X = numpy.full((clumps.size, 2), numpy.nan)
    # Mask of which clumps have a mmain index
    mask = numpy.isin(clumps["index"], mmain["index"])

    X[mask, 0] = mmain["mass_cl"]
    X[mask, 1] = mmain["sub_frac"]
    return add_columns(clumps, X, ["mass_mmain", "sub_frac"])
@@ -14,4 +14,4 @@
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

from .transforms import cartesian_to_radec  # noqa
from .box_units import (BoxUnits, convert_from_boxunits)  # noqa
from .box_units import (BoxUnits)  # noqa
@@ -20,7 +20,7 @@ import numpy
from scipy.interpolate import interp1d
from astropy.cosmology import LambdaCDM
from astropy import (constants, units)
from ..io import read_info
from ..read import read_info


# Map of unit conversions
@@ -329,65 +329,59 @@ class BoxUnits:
        return (density / self._unit_d * self._Msuncgs
                / (units.Mpc.to(units.cm))**3)

    def convert_from_boxunits(self, data, names):
        r"""
        Convert columns named `names` in array `data` from box units to
        physical units, such that
            - length -> :math:`\mathrm{Mpc}`,
            - mass -> :math:`M_\odot`,
            - density -> :math:`M_\odot / \mathrm{Mpc}^3`.

def convert_from_boxunits(data, names, boxunits):
    r"""
    Convert columns named `names` in array `data` from box units to physical
    units, such that
        - length -> :math:`\mathrm{Mpc}`,
        - mass -> :math:`M_\odot`,
        - density -> :math:`M_\odot / \mathrm{Mpc}^3`.
    Any other conversions are currently not implemented. Note that the array
    is passed by reference and directly modified, even though it is also
    explicitly returned. Additionally centres the box coordinates on the
    observer, if they are being transformed.
        Any other conversions are currently not implemented. Note that the
        array is passed by reference and directly modified, even though it is
        also explicitly returned. Additionally centres the box coordinates on
        the observer, if they are being transformed.

    Parameters
    ----------
    data : structured array
        Input array.
    names : list of str
        Columns to be converted.
    boxunits : `BoxUnits`
        Box units class of the simulation and snapshot.
        Parameters
        ----------
        data : structured array
            Input array.
        names : list of str
            Columns to be converted.

    Returns
    -------
    data : structured array
        Input array with converted columns.
    """
    if not isinstance(boxunits, BoxUnits):
        raise TypeError("`boxunits` must be of type `{}`. Currently `{}`."
                        .format(BoxUnits, type(boxunits)))
    names = [names] if isinstance(names, str) else names
        Returns
        -------
        data : structured array
            Input array with converted columns.
        """
        names = [names] if isinstance(names, str) else names

    # Shortcut for the transform functions
    transforms = {
        "length": boxunits.box2mpc,
        "mass": boxunits.box2solarmass,
        "density": boxunits.box2dens
    }
        # Shortcut for the transform functions
        transforms = {
            "length": self.box2mpc,
            "mass": self.box2solarmass,
            "density": self.box2dens
        }

    for name in names:
        # Check that the name is even in the array
        if name not in data.dtype.names:
            raise ValueError("Name `{}` is not in `data` array.".format(name))
        for name in names:
            # Check that the name is even in the array
            if name not in data.dtype.names:
                raise ValueError("Name `{}` not in `data` array.".format(name))

        # Convert
        found = False
        for unittype, suppnames in CONV_NAME.items():
            if name in suppnames:
                data[name] = transforms[unittype](data[name])
                found = True
                continue
        # If nothing found
        if not found:
            raise NotImplementedError(
                "Conversion of `{}` is not defined.".format(name))
            # Convert
            found = False
            for unittype, suppnames in CONV_NAME.items():
                if name in suppnames:
                    data[name] = transforms[unittype](data[name])
                    found = True
                    continue
            # If nothing found
            if not found:
                raise NotImplementedError(
                    "Conversion of `{}` is not defined.".format(name))

        # Center at the observer
        if name in ["peak_x", "peak_y", "peak_z"]:
            data[name] -= transforms["length"](0.5)
            data[name] -= (0.5)
            # Center at the observer
            if name in ["peak_x", "peak_y", "peak_z"]:
                data[name] -= transforms["length"](0.5)

    return data
        return data
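For orientation, a sketch of how the call site changes: the conversion moves from the removed module-level function onto the box object itself. Here `data` is assumed to be a structured array produced by `run_fit_halos`, and the IC/snapshot indices are illustrative:

import csiborgtools

simpath = csiborgtools.read.get_sim_path(7444)
box = csiborgtools.units.BoxUnits(951, simpath)

# New interface: conversion is a method of the box object
data = box.convert_from_boxunits(data, ["m200", "r200", "peak_x"])

# Old interface, removed by this commit:
# data = csiborgtools.units.convert_from_boxunits(data, ["m200"], box)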
File diff suppressed because one or more lines are too long
@@ -32,9 +32,6 @@ import utils
F64 = numpy.float64
I64 = numpy.int64

# Simulations and their snapshot to analyze
Nsims = [9844]
Nsnap = 1016

# Get MPI things
comm = MPI.COMM_WORLD

@@ -50,68 +47,76 @@ cols_collect = [("npart", I64), ("totpartmass", F64), ("Rs", F64),
                ("rmax", F64), ("r200", F64), ("r500", F64),
                ("m200", F64), ("m500", F64)]

# NOTE later loop over sims too
Nsim = Nsims[0]
simpath = csiborgtools.io.get_sim_path(Nsim)
box = csiborgtools.units.BoxUnits(Nsnap, simpath)
Nsims = csiborgtools.read.get_csiborg_ids("/mnt/extraspace/hdesmond")
for i, Nsim in enumerate(Nsims):
    if rank == 0:
        print("{}: calculating {}th simulation.".format(datetime.now(), i))

jobs = csiborgtools.fits.split_jobs(utils.Nsplits, nproc)[rank]
for icount, Nsplit in enumerate(jobs):
    print("{}: rank {} working {} / {} jobs.".format(datetime.now(), rank,
                                                     icount + 1, len(jobs)))
    parts, part_clumps, clumps = csiborgtools.fits.load_split_particles(
        Nsplit, loaddir, Nsim, Nsnap, remove_split=False)
    simpath = csiborgtools.read.get_sim_path(Nsim)
    Nsnap = csiborgtools.read.get_maximum_snapshot(simpath)
    box = csiborgtools.units.BoxUnits(Nsnap, simpath)

    N = clumps.size
    cols = [("index", I64), ("npart", I64), ("totpartmass", F64),
            ("Rs", F64), ("rho0", F64), ("conc", F64),
            ("vx", F64), ("vy", F64), ("vz", F64),
            ("rmin", F64), ("rmax", F64),
            ("r200", F64), ("r500", F64), ("m200", F64), ("m500", F64)]
    out = csiborgtools.utils.cols_to_structured(N, cols)
    out["index"] = clumps["index"]
    jobs = csiborgtools.fits.split_jobs(utils.Nsplits, nproc)[rank]
    for Nsplit in jobs:
        parts, part_clumps, clumps = csiborgtools.fits.load_split_particles(
            Nsplit, loaddir, Nsim, Nsnap, remove_split=False)

    for n in range(N):
        # Pick clump and its particles
        xs = csiborgtools.fits.pick_single_clump(n, parts, part_clumps, clumps)
        clump = csiborgtools.fits.Clump.from_arrays(*xs, rhoc=box.box_rhoc)
        out["npart"][n] = clump.Npart
        out["rmin"][n] = clump.rmin
        out["rmax"][n] = clump.rmax
        out["totpartmass"][n] = clump.total_particle_mass
        out["vx"] = numpy.average(clump.vel[:, 0], weights=clump.m)
        out["vy"] = numpy.average(clump.vel[:, 1], weights=clump.m)
        out["vz"] = numpy.average(clump.vel[:, 2], weights=clump.m)
        N = clumps.size
        cols = [("index", I64), ("npart", I64), ("totpartmass", F64),
                ("Rs", F64), ("rho0", F64), ("conc", F64),
                ("vx", F64), ("vy", F64), ("vz", F64),
                ("rmin", F64), ("rmax", F64),
                ("r200", F64), ("r500", F64), ("m200", F64), ("m500", F64)]
        out = csiborgtools.utils.cols_to_structured(N, cols)
        out["index"] = clumps["index"]

        # Spherical overdensity radii and masses
        rs, ms = clump.spherical_overdensity_mass([200, 500])
        out["r200"][n] = rs[0]
        out["r500"][n] = rs[1]
        out["m200"][n] = ms[0]
        out["m500"][n] = ms[1]
        for n in range(N):
            # Pick clump and its particles
            xs = csiborgtools.fits.pick_single_clump(n, parts, part_clumps,
                                                     clumps)
            clump = csiborgtools.fits.Clump.from_arrays(*xs, rhoc=box.box_rhoc)
            out["npart"][n] = clump.Npart
            out["rmin"][n] = clump.rmin
            out["rmax"][n] = clump.rmax
            out["totpartmass"][n] = clump.total_particle_mass
            out["vx"] = numpy.average(clump.vel[:, 0], weights=clump.m)
            out["vy"] = numpy.average(clump.vel[:, 1], weights=clump.m)
            out["vz"] = numpy.average(clump.vel[:, 2], weights=clump.m)

        # NFW profile fit
        if clump.Npart > 10 and numpy.isfinite(out["r200"][n]):
            nfwpost = csiborgtools.fits.NFWPosterior(clump)
            logRs, __ = nfwpost.maxpost_logRs()
            Rs = 10**logRs
            if not numpy.isnan(logRs):
                out["Rs"][n] = Rs
                out["rho0"][n] = nfwpost.rho0_from_Rs(Rs)
                out["conc"][n] = out["r200"][n] / Rs
            # Spherical overdensity radii and masses
            rs, ms = clump.spherical_overdensity_mass([200, 500])
            out["r200"][n] = rs[0]
            out["r500"][n] = rs[1]
            out["m200"][n] = ms[0]
            out["m500"][n] = ms[1]

    csiborgtools.io.dump_split(out, Nsplit, Nsim, Nsnap, dumpdir)
            # NFW profile fit
            if clump.Npart > 10 and numpy.isfinite(out["r200"][n]):
                nfwpost = csiborgtools.fits.NFWPosterior(clump)
                logRs, __ = nfwpost.maxpost_logRs()
                Rs = 10**logRs
                if not numpy.isnan(logRs):
                    out["Rs"][n] = Rs
                    out["rho0"][n] = nfwpost.rho0_from_Rs(Rs)
                    out["conc"][n] = out["r200"][n] / Rs

        csiborgtools.read.dump_split(out, Nsplit, Nsim, Nsnap, dumpdir)

    # Wait until all jobs finished before moving to another simulation
    comm.Barrier()

    # Use the rank 0 to combine outputs for this CSiBORG realisation
    if rank == 0:
        print("Collecting results!")
        out_collected = csiborgtools.read.combine_splits(
            utils.Nsplits, Nsim, Nsnap, utils.dumpdir, cols_collect,
            remove_splits=True, verbose=False)
        fname = join(utils.dumpdir, "ramses_out_{}_{}.npy"
                     .format(str(Nsim).zfill(5), str(Nsnap).zfill(5)))
        print("Saving results to `{}`.".format(fname))
        numpy.save(fname, out_collected)

    comm.Barrier()

# Force all ranks to wait
comm.Barrier()
# Use the rank 0 to combine outputs for this CSiBORG realisation
if rank == 0:
    print("Collecting results!")
    out_collected = csiborgtools.io.combine_splits(
        utils.Nsplits, Nsim, Nsnap, utils.dumpdir, cols_collect,
        remove_splits=True, verbose=False)
    fname = join(utils.dumpdir, "ramses_out_{}_{}.npy"
                 .format(str(Nsim).zfill(5), str(Nsnap).zfill(5)))
    print("Saving results to `{}`.".format(fname))
    numpy.save(fname, out_collected)
    print("All finished! See ya!")
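The MPI pattern above leans on `csiborgtools.fits.split_jobs`, which is not shown in this diff. A minimal sketch of the round-robin splitting it presumably performs (the actual implementation may differ):

def split_jobs(njobs, nproc):
    # Sketch: assign job indices 0..njobs-1 to `nproc` processes in turn.
    jobs = [[] for _ in range(nproc)]
    for i in range(njobs):
        jobs[i % nproc].append(i)
    return jobs

# Each MPI rank then takes its own share:
# jobs = split_jobs(utils.Nsplits, nproc)[rank]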
@@ -34,7 +34,7 @@ comm = MPI.COMM_WORLD
rank = comm.Get_rank()
nproc = comm.Get_size()

Nsims = csiborgtools.io.get_csiborg_ids("/mnt/extraspace/hdesmond")
Nsims = csiborgtools.read.get_csiborg_ids("/mnt/extraspace/hdesmond")
partcols = ["x", "y", "z", "vx", "vy", "vz", "M", "level"]
dumpdir = join(utils.dumpdir, "temp")

@@ -43,16 +43,16 @@ for icount, sim_index in enumerate(jobs):
    print("{}: rank {} working {} / {} jobs.".format(datetime.now(), rank,
                                                     icount + 1, len(jobs)))
    Nsim = Nsims[sim_index]
    simpath = csiborgtools.io.get_sim_path(Nsim)
    Nsnap = csiborgtools.io.get_maximum_snapshot(simpath)
    simpath = csiborgtools.read.get_sim_path(Nsim)
    Nsnap = csiborgtools.read.get_maximum_snapshot(simpath)
    # Load the clumps, particles' clump IDs and particles.
    clumps = csiborgtools.io.read_clumps(Nsnap, simpath)
    particle_clumps = csiborgtools.io.read_clumpid(Nsnap, simpath,
                                                   verbose=False)
    particles = csiborgtools.io.read_particle(partcols, Nsnap, simpath,
                                              verbose=False)
    clumps = csiborgtools.read.read_clumps(Nsnap, simpath)
    particle_clumps = csiborgtools.read.read_clumpid(Nsnap, simpath,
                                                     verbose=False)
    particles = csiborgtools.read.read_particle(partcols, Nsnap, simpath,
                                                verbose=False)
    # Drop all particles whose clump index is 0 (not assigned to any halo)
    particle_clumps, particles = csiborgtools.io.drop_zero_indx(
    particle_clumps, particles = csiborgtools.read.drop_zero_indx(
        particle_clumps, particles)
    # Dump it!
    csiborgtools.fits.dump_split_particles(particles, particle_clumps, clumps,
@@ -16,16 +16,14 @@
Notebook utility functions.
"""

# import numpy
# from os.path import join

import numpy
from os.path import join
from astropy.cosmology import FlatLambdaCDM

try:
    import csiborgtools
except ModuleNotFoundError:
    import sys
    sys.path.append("../")
# try:
#     import csiborgtools
# except ModuleNotFoundError:
#     import sys
#     sys.path.append("../")


Nsplits = 200
@@ -42,52 +40,3 @@ _virgo = {"RA": (12 + 27 / 60) * 15,
          "COMDIST": 16.5}

specific_clusters = {"Coma": _coma, "Virgo": _virgo}


def load_processed(Nsim, Nsnap):
    simpath = csiborgtools.io.get_sim_path(Nsim)
    outfname = join(
        dumpdir, "ramses_out_{}_{}.npy".format(str(Nsim).zfill(5),
                                               str(Nsnap).zfill(5)))
    data = numpy.load(outfname)
    # Add mmain
    mmain = csiborgtools.io.read_mmain(Nsim, "/mnt/zfsusers/hdesmond/Mmain")
    data = csiborgtools.io.merge_mmain_to_clumps(data, mmain)
    csiborgtools.utils.flip_cols(data, "peak_x", "peak_z")
    # Cut on number of particles and finite m200
    data = data[(data["npart"] > 100) & numpy.isfinite(data["m200"])]

    # Do unit conversion
    boxunits = csiborgtools.units.BoxUnits(Nsnap, simpath)
    convert_cols = ["m200", "m500", "totpartmass", "mass_mmain",
                    "r200", "r500", "Rs", "rho0", "peak_x", "peak_y", "peak_z"]
    data = csiborgtools.units.convert_from_boxunits(
        data, convert_cols, boxunits)
    # Now calculate spherical coordinates
    d, ra, dec = csiborgtools.units.cartesian_to_radec(data)
    data = csiborgtools.utils.add_columns(
        data, [d, ra, dec], ["dist", "ra", "dec"])
    return data, boxunits


def load_planck2015(max_comdist=214):
    cosmo = FlatLambdaCDM(H0=70.5, Om0=0.307, Tcmb0=2.728)
    fpath = "../data/HFI_PCCS_SZ-union_R2.08.fits"
    return csiborgtools.io.read_planck2015(fpath, cosmo, max_comdist)


def load_mcxc(max_comdist=214):
    cosmo = FlatLambdaCDM(H0=70.5, Om0=0.307, Tcmb0=2.728)
    fpath = "../data/mcxc.fits"
    return csiborgtools.io.read_mcxc(fpath, cosmo, max_comdist)


def load_2mpp():
    cosmo = FlatLambdaCDM(H0=70.5, Om0=0.307, Tcmb0=2.728)
    return csiborgtools.io.read_2mpp("../data/2M++_galaxy_catalog.dat", cosmo)


def load_2mpp_groups():
    cosmo = FlatLambdaCDM(H0=70.5, Om0=0.307, Tcmb0=2.728)
    return csiborgtools.io.read_2mpp_groups(
        "../data/../data/2M++_group_catalog.dat", cosmo)