Correct fitting script for both clumps and halos (#46)

* Minor typos

* fix minor bugs

* pep8

* formatting

* pep8

* Fix minor bugs

* New path & pep8

* add split

* Updates

* Improve calculation within radius

* pep8

* pep8

* get the script working

* Add matter overdensity

* Add m200m to the script

* Fix looping bug

* add parents support

* add import

* Optionally concatenate velocities

* Make optional masking

* Ignore the error message

* Start reading in raw data

* Fix cat reading

* Additional units conversions

* Add clump reading

* Fix indexing

* Remove old comment

* Remove old comment

* set npart to 0 instead of overflow from NaN

* fix docs

* rm boring stuff

* Remove old stuff

* Remove old stuff

* Remove old comment

* Update nb
Richard Stiskalek 2023-04-19 16:39:35 +02:00 committed by GitHub
parent c2fde1566b
commit 39b3498621
17 changed files with 709 additions and 357 deletions

View file

@@ -29,6 +29,7 @@ try:
import csiborgtools
except ModuleNotFoundError:
import sys
sys.path.append("../")
import csiborgtools
@@ -43,23 +44,13 @@ nproc = comm.Get_size()
parser = ArgumentParser()
parser.add_argument("--runs", type=str, nargs="+")
args = parser.parse_args()
with open('../scripts/knn_auto.yml', 'r') as file:
with open("../scripts/knn_auto.yml", "r") as file:
config = yaml.safe_load(file)
Rmax = 155 / 0.705 # Mpc (h = 0.705) high resolution region radius
totvol = 4 * numpy.pi * Rmax**3 / 3
minmass = 1e12
ics = [7444, 7468, 7492, 7516, 7540, 7564, 7588, 7612, 7636, 7660, 7684,
7708, 7732, 7756, 7780, 7804, 7828, 7852, 7876, 7900, 7924, 7948,
7972, 7996, 8020, 8044, 8068, 8092, 8116, 8140, 8164, 8188, 8212,
8236, 8260, 8284, 8308, 8332, 8356, 8380, 8404, 8428, 8452, 8476,
8500, 8524, 8548, 8572, 8596, 8620, 8644, 8668, 8692, 8716, 8740,
8764, 8788, 8812, 8836, 8860, 8884, 8908, 8932, 8956, 8980, 9004,
9028, 9052, 9076, 9100, 9124, 9148, 9172, 9196, 9220, 9244, 9268,
9292, 9316, 9340, 9364, 9388, 9412, 9436, 9460, 9484, 9508, 9532,
9556, 9580, 9604, 9628, 9652, 9676, 9700, 9724, 9748, 9772, 9796,
9820, 9844]
paths = csiborgtools.read.CSiBORGPaths(**csiborgtools.paths_glamdring)
ics = paths.get_ics(False)
knncdf = csiborgtools.clustering.kNN_CDF()
###############################################################################
@@ -75,9 +66,9 @@ def read_single(selection, cat):
psel = selection["primary"]
pmin, pmax = psel.get("min", None), psel.get("max", None)
if pmin is not None:
mmask &= (cat[psel["name"]] >= pmin)
mmask &= cat[psel["name"]] >= pmin
if pmax is not None:
mmask &= (cat[psel["name"]] < pmax)
mmask &= cat[psel["name"]] < pmax
pos = pos[mmask, ...]
# Secondary selection
@@ -92,12 +83,13 @@ def read_single(selection, cat):
if ssel.get("marked", True):
x = cat[psel["name"]][mmask]
prop = csiborgtools.clustering.normalised_marks(
x, prop, nbins=config["nbins_marks"])
x, prop, nbins=config["nbins_marks"]
)
if smin is not None:
smask &= (prop >= smin)
smask &= prop >= smin
if smax is not None:
smask &= (prop < smax)
smask &= prop < smax
return pos[smask, ...]
@@ -106,8 +98,7 @@ def do_auto(run, cat, ic):
"""Calculate the kNN-CDF single catalgoue autocorrelation."""
_config = config.get(run, None)
if _config is None:
warn("No configuration for run {}.".format(run), UserWarning,
stacklevel=1)
warn("No configuration for run {}.".format(run), UserWarning, stacklevel=1)
return
rvs_gen = csiborgtools.clustering.RVSinsphere(Rmax)
@@ -115,21 +106,28 @@ def do_auto(run, cat, ic):
knn = NearestNeighbors()
knn.fit(pos)
rs, cdf = knncdf(
knn, rvs_gen=rvs_gen, nneighbours=config["nneighbours"],
rmin=config["rmin"], rmax=config["rmax"],
nsamples=int(config["nsamples"]), neval=int(config["neval"]),
batch_size=int(config["batch_size"]), random_state=config["seed"])
knn,
rvs_gen=rvs_gen,
nneighbours=config["nneighbours"],
rmin=config["rmin"],
rmax=config["rmax"],
nsamples=int(config["nsamples"]),
neval=int(config["neval"]),
batch_size=int(config["batch_size"]),
random_state=config["seed"],
)
joblib.dump({"rs": rs, "cdf": cdf, "ndensity": pos.shape[0] / totvol},
paths.knnauto_path(run, ic))
joblib.dump(
{"rs": rs, "cdf": cdf, "ndensity": pos.shape[0] / totvol},
paths.knnauto_path(run, ic),
)
def do_cross_rand(run, cat, ic):
"""Calculate the kNN-CDF cross catalogue random correlation."""
_config = config.get(run, None)
if _config is None:
warn("No configuration for run {}.".format(run), UserWarning,
stacklevel=1)
warn("No configuration for run {}.".format(run), UserWarning, stacklevel=1)
return
rvs_gen = csiborgtools.clustering.RVSinsphere(Rmax)
@@ -142,10 +140,17 @@ def do_cross_rand(run, cat, ic):
knn2.fit(pos2)
rs, cdf0, cdf1, joint_cdf = knncdf.joint(
knn1, knn2, rvs_gen=rvs_gen, nneighbours=int(config["nneighbours"]),
rmin=config["rmin"], rmax=config["rmax"],
nsamples=int(config["nsamples"]), neval=int(config["neval"]),
batch_size=int(config["batch_size"]), random_state=config["seed"])
knn1,
knn2,
rvs_gen=rvs_gen,
nneighbours=int(config["nneighbours"]),
rmin=config["rmin"],
rmax=config["rmax"],
nsamples=int(config["nsamples"]),
neval=int(config["neval"]),
batch_size=int(config["batch_size"]),
random_state=config["seed"],
)
corr = knncdf.joint_to_corr(cdf0, cdf1, joint_cdf)
joblib.dump({"rs": rs, "corr": corr}, paths.knnauto_path(run, ic))
@@ -180,4 +185,4 @@ comm.Barrier()
if rank == 0:
print("{}: all finished.".format(datetime.now()))
quit() # Force quit the script
quit() # Force quit the script
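For context on what these runs compute: the kNN-CDF statistic characterises clustering through the distribution of distances from volume-filling random query points to their k-th nearest tracer. Below is a minimal sketch of that idea using only `numpy` and `scikit-learn`; the function name, radius grid, and defaults are illustrative assumptions, not the `csiborgtools.clustering.kNN_CDF` API that the script actually calls.

```python
import numpy
from sklearn.neighbors import NearestNeighbors

def knn_cdf_sketch(pos, rmax, k=1, nsamples=10000, neval=64, seed=42):
    """Empirical CDF of distances from random points inside a sphere of
    radius `rmax` to their k-th nearest tracer in `pos` (illustrative)."""
    rng = numpy.random.default_rng(seed)
    # Uniform points in the sphere: radii sampled as r ~ r^2 by CDF
    # inversion, isotropic directions from normalised Gaussian draws
    # (cf. `RVSinsphere` above).
    r = rmax * rng.random(nsamples) ** (1 / 3)
    vec = rng.normal(size=(nsamples, 3))
    vec /= numpy.linalg.norm(vec, axis=1, keepdims=True)
    queries = r[:, None] * vec

    knn = NearestNeighbors().fit(pos)
    dist, _ = knn.kneighbors(queries, n_neighbors=k)
    dist = numpy.sort(dist[:, k - 1])  # distance to the k-th neighbour

    rs = numpy.logspace(-1, numpy.log10(rmax), neval)  # evaluation radii
    cdf = numpy.searchsorted(dist, rs) / nsamples
    return rs, cdf
```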

View file

@@ -16,7 +16,6 @@
from argparse import ArgumentParser
from datetime import datetime
from itertools import combinations
from os.path import join
from warnings import warn
import joblib
@@ -30,6 +29,7 @@ try:
import csiborgtools
except ModuleNotFoundError:
import sys
sys.path.append("../")
import csiborgtools
@@ -44,24 +44,12 @@ nproc = comm.Get_size()
parser = ArgumentParser()
parser.add_argument("--runs", type=str, nargs="+")
args = parser.parse_args()
with open('../scripts/knn_cross.yml', 'r') as file:
with open("../scripts/knn_cross.yml", "r") as file:
config = yaml.safe_load(file)
Rmax = 155 / 0.705 # Mpc (h = 0.705) high resolution region radius
minmass = 1e12
ics = [7444, 7468, 7492, 7516, 7540, 7564, 7588, 7612, 7636, 7660, 7684,
7708, 7732, 7756, 7780, 7804, 7828, 7852, 7876, 7900, 7924, 7948,
7972, 7996, 8020, 8044, 8068, 8092, 8116, 8140, 8164, 8188, 8212,
8236, 8260, 8284, 8308, 8332, 8356, 8380, 8404, 8428, 8452, 8476,
8500, 8524, 8548, 8572, 8596, 8620, 8644, 8668, 8692, 8716, 8740,
8764, 8788, 8812, 8836, 8860, 8884, 8908, 8932, 8956, 8980, 9004,
9028, 9052, 9076, 9100, 9124, 9148, 9172, 9196, 9220, 9244, 9268,
9292, 9316, 9340, 9364, 9388, 9412, 9436, 9460, 9484, 9508, 9532,
9556, 9580, 9604, 9628, 9652, 9676, 9700, 9724, 9748, 9772, 9796,
9820, 9844]
paths = csiborgtools.read.CSiBORGPaths(**csiborgtools.paths_glamdring)
dumpdir = "/mnt/extraspace/rstiskalek/csiborg/knn"
fout = join(dumpdir, "cross", "knncdf_{}_{}_{}.p")
ics = paths.get_ics(False)
knncdf = csiborgtools.clustering.kNN_CDF()
###############################################################################
@@ -76,9 +64,9 @@ def read_single(selection, cat):
psel = selection["primary"]
pmin, pmax = psel.get("min", None), psel.get("max", None)
if pmin is not None:
mmask &= (cat[psel["name"]] >= pmin)
mmask &= cat[psel["name"]] >= pmin
if pmax is not None:
mmask &= (cat[psel["name"]] < pmax)
mmask &= cat[psel["name"]] < pmax
return pos[mmask, ...]
@@ -99,10 +87,17 @@ def do_cross(run, ics):
knn2.fit(pos2)
rs, cdf0, cdf1, joint_cdf = knncdf.joint(
knn1, knn2, rvs_gen=rvs_gen, nneighbours=int(config["nneighbours"]),
rmin=config["rmin"], rmax=config["rmax"],
nsamples=int(config["nsamples"]), neval=int(config["neval"]),
batch_size=int(config["batch_size"]), random_state=config["seed"])
knn1,
knn2,
rvs_gen=rvs_gen,
nneighbours=int(config["nneighbours"]),
rmin=config["rmin"],
rmax=config["rmax"],
nsamples=int(config["nsamples"]),
neval=int(config["neval"]),
batch_size=int(config["batch_size"]),
random_state=config["seed"],
)
corr = knncdf.joint_to_corr(cdf0, cdf1, joint_cdf)
joblib.dump({"rs": rs, "corr": corr}, paths.knncross_path(run, ics))
@@ -135,4 +130,4 @@ comm.Barrier()
if rank == 0:
print("{}: all finished.".format(datetime.now()))
quit() # Force quit the script
quit() # Force quit the script
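The cross script pairs up IC realisations with `itertools.combinations` and spreads the pairs over MPI ranks. A hedged sketch of one simple way to do that split, assuming `mpi4py`; the round-robin slice is an illustration, not necessarily the scheduling this script or `csiborgtools` uses.

```python
from itertools import combinations
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank, nproc = comm.Get_rank(), comm.Get_size()

ics = [7444, 7468, 7492, 7516]  # illustrative subset of the IC realisations
pairs = list(combinations(ics, 2))  # all unique (ic1, ic2) pairs

# Round-robin assignment: rank r handles pairs r, r + nproc, r + 2 * nproc, ...
for ic1, ic2 in pairs[rank::nproc]:
    print("Rank {} correlating ICs {} and {}.".format(rank, ic1, ic2))

comm.Barrier()  # all ranks finish before anything downstream runs
```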

View file

@@ -16,7 +16,6 @@
from argparse import ArgumentParser
from copy import deepcopy
from datetime import datetime
from os.path import join
from warnings import warn
import joblib
@@ -29,6 +28,7 @@ try:
import csiborgtools
except ModuleNotFoundError:
import sys
sys.path.append("../")
import csiborgtools
@@ -43,24 +43,12 @@ nproc = comm.Get_size()
parser = ArgumentParser()
parser.add_argument("--runs", type=str, nargs="+")
args = parser.parse_args()
with open('../scripts/tpcf_auto.yml', 'r') as file:
with open("../scripts/tpcf_auto.yml", "r") as file:
config = yaml.safe_load(file)
Rmax = 155 / 0.705 # Mpc (h = 0.705) high resolution region radius
minmass = 1e12
ics = [7444, 7468, 7492, 7516, 7540, 7564, 7588, 7612, 7636, 7660, 7684,
7708, 7732, 7756, 7780, 7804, 7828, 7852, 7876, 7900, 7924, 7948,
7972, 7996, 8020, 8044, 8068, 8092, 8116, 8140, 8164, 8188, 8212,
8236, 8260, 8284, 8308, 8332, 8356, 8380, 8404, 8428, 8452, 8476,
8500, 8524, 8548, 8572, 8596, 8620, 8644, 8668, 8692, 8716, 8740,
8764, 8788, 8812, 8836, 8860, 8884, 8908, 8932, 8956, 8980, 9004,
9028, 9052, 9076, 9100, 9124, 9148, 9172, 9196, 9220, 9244, 9268,
9292, 9316, 9340, 9364, 9388, 9412, 9436, 9460, 9484, 9508, 9532,
9556, 9580, 9604, 9628, 9652, 9676, 9700, 9724, 9748, 9772, 9796,
9820, 9844]
dumpdir = "/mnt/extraspace/rstiskalek/csiborg/tpcf"
fout = join(dumpdir, "auto", "tpcf_{}_{}.p")
paths = csiborgtools.read.CSiBORGPaths()
ics = paths.get_ics(False)
tpcf = csiborgtools.clustering.Mock2PCF()
###############################################################################
@@ -76,9 +64,9 @@ def read_single(selection, cat):
psel = selection["primary"]
pmin, pmax = psel.get("min", None), psel.get("max", None)
if pmin is not None:
mmask &= (cat[psel["name"]] >= pmin)
mmask &= cat[psel["name"]] >= pmin
if pmax is not None:
mmask &= (cat[psel["name"]] < pmax)
mmask &= cat[psel["name"]] < pmax
pos = pos[mmask, ...]
# Secondary selection
@@ -93,12 +81,13 @@ def read_single(selection, cat):
if ssel.get("marked", True):
x = cat[psel["name"]][mmask]
prop = csiborgtools.clustering.normalised_marks(
x, prop, nbins=config["nbins_marks"])
x, prop, nbins=config["nbins_marks"]
)
if smin is not None:
smask &= (prop >= smin)
smask &= prop >= smin
if smax is not None:
smask &= (prop < smax)
smask &= prop < smax
return pos[smask, ...]
@@ -110,8 +99,11 @@ def do_auto(run, cat, ic):
return
rvs_gen = csiborgtools.clustering.RVSinsphere(Rmax)
bins = numpy.logspace(numpy.log10(config["rpmin"]),
numpy.log10(config["rpmax"]), config["nrpbins"] + 1)
bins = numpy.logspace(
numpy.log10(config["rpmin"]),
numpy.log10(config["rpmax"]),
config["nrpbins"] + 1,
)
pos = read_single(_config, cat)
nrandom = int(config["randmult"] * pos.shape[0])
rp, wp = tpcf(pos, rvs_gen, nrandom, bins)
@@ -146,4 +138,4 @@ comm.Barrier()
if rank == 0:
print("{}: all finished.".format(datetime.now()))
quit() # Force quit the script
quit() # Force quit the script
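`Mock2PCF` above estimates the two-point correlation function by comparing data pair counts against uniform randoms drawn in the same sphere. A minimal sketch of the natural (Peebles-Hauser) estimator with the script's logarithmic binning, assuming `scipy`; the actual `csiborgtools` estimator may differ (e.g. Landy-Szalay), so treat this as illustration only.

```python
import numpy
from scipy.spatial import cKDTree

def tpcf_sketch(pos, randoms, rpmin, rpmax, nrpbins):
    """Natural estimator xi = (Nr / Nd)^2 * DD / RR - 1 in log-spaced bins
    (illustrative stand-in for `Mock2PCF`)."""
    bins = numpy.logspace(numpy.log10(rpmin), numpy.log10(rpmax), nrpbins + 1)
    dtree, rtree = cKDTree(pos), cKDTree(randoms)
    # Cumulative pair counts at every bin edge, differenced into per-bin
    # counts; the constant self-pair contribution cancels in the difference.
    dd = numpy.diff(dtree.count_neighbors(dtree, bins))
    rr = numpy.diff(rtree.count_neighbors(rtree, bins))
    norm = (randoms.shape[0] / pos.shape[0]) ** 2
    xi = norm * dd / rr - 1
    rp = numpy.sqrt(bins[1:] * bins[:-1])  # geometric bin centres
    return rp, xi
```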

View file

@@ -16,18 +16,26 @@
A script to fit halos (concentration, ...). The particle array of each CSiBORG
realisation must have been split in advance by `runsplit_halos`.
"""
from argparse import ArgumentParser
from datetime import datetime
from os.path import join
import numpy
from mpi4py import MPI
from tqdm import tqdm
try:
import csiborgtools
except ModuleNotFoundError:
import sys
sys.path.append("../")
import csiborgtools
parser = ArgumentParser()
parser.add_argument("--kind", type=str, choices=["halos", "clumps"])
args = parser.parse_args()
# Get MPI things
comm = MPI.COMM_WORLD
@@ -35,128 +43,170 @@ rank = comm.Get_rank()
nproc = comm.Get_size()
paths = csiborgtools.read.CSiBORGPaths(**csiborgtools.paths_glamdring)
partreader =csiborgtools.read.ParticleReader(paths)
cols_collect = [("npart", numpy.int64), ("totpartmass", numpy.float64),
("Rs", numpy.float64), ("vx", numpy.float64),
("vy", numpy.float64), ("vz", numpy.float64),
("Lx", numpy.float64), ("Ly", numpy.float64),
("Lz", numpy.float64), ("rho0", numpy.float64),
("conc", numpy.float64), ("rmin", numpy.float64),
("rmax", numpy.float64), ("r200", numpy.float64),
("r500", numpy.float64), ("m200", numpy.float64),
("m500", numpy.float64), ("lambda200c", numpy.float64)]
def fit_clump(particles, clump, box):
partreader = csiborgtools.read.ParticleReader(paths)
nfwpost = csiborgtools.fits.NFWPosterior()
ftemp = join(paths.temp_dumpdir, "fit_clump_{}_{}_{}.npy")
cols_collect = [
("index", numpy.int32),
("npart", numpy.int32),
("totpartmass", numpy.float32),
("vx", numpy.float32),
("vy", numpy.float32),
("vz", numpy.float32),
("conc", numpy.float32),
("rho0", numpy.float32),
("r200c", numpy.float32),
("r500c", numpy.float32),
("m200c", numpy.float32),
("m500c", numpy.float32),
("lambda200c", numpy.float32),
("r200m", numpy.float32),
("m200m", numpy.float32),
]
def fit_clump(particles, clump_info, box):
"""
Fit an object. Can be either a clump or a parent halo.
"""
obj = csiborgtools.fits.Clump(particles, clump_info, box)
out = {}
if numpy.isnan(clump_info["index"]):
print("Why am I NaN?", flush=True)
out["index"] = clump_info["index"]
out["npart"] = len(obj)
out["totpartmass"] = numpy.sum(obj["M"])
for i, v in enumerate(["vx", "vy", "vz"]):
out[v] = numpy.average(obj.vel[:, i], weights=obj["M"])
# Overdensity masses
out["r200c"], out["m200c"] = obj.spherical_overdensity_mass(200, kind="crit")
out["r500c"], out["m500c"] = obj.spherical_overdensity_mass(500, kind="crit")
out["r200m"], out["m200m"] = obj.spherical_overdensity_mass(200, kind="matter")
# NFW fit
if out["npart"] > 10 and numpy.isfinite(out["r200c"]):
Rs, rho0 = nfwpost.fit(obj)
out["conc"] = Rs / out["r200c"]
out["rho0"] = rho0
# Spin within R200c
if numpy.isfinite(out["r200c"]):
out["lambda200c"] = obj.lambda_bullock(out["r200c"])
return out
out["npart"][n] = clump.Npart
out["rmin"][n] = clump.rmin
out["rmax"][n] = clump.rmax
out["totpartmass"][n] = clump.total_particle_mass
out["vx"][n] = numpy.average(clump.vel[:, 0], weights=clump.m)
out["vy"][n] = numpy.average(clump.vel[:, 1], weights=clump.m)
out["vz"][n] = numpy.average(clump.vel[:, 2], weights=clump.m)
out["Lx"][n], out["Ly"][n], out["Lz"][n] = clump.angular_momentum
def load_clump_particles(clumpid, particle_archive):
"""
Load a clump's particles from the particle archive. If it is not there,
i.e. the clump has no associated particles, return `None`.
"""
try:
part = particle_archive[str(clumpid)]
except KeyError:
part = None
return part
def load_parent_particles(clumpid, particle_archive, clumps_cat):
"""
Load a parent halo's particles.
"""
indxs = clumps_cat["index"][clumps_cat["parent"] == clumpid]
# We first load the particles of each clump belonging to this parent and then
# concatenate them for further analysis.
clumps = []
for ind in indxs:
parts = load_clump_particles(ind, particle_archive)
if parts is not None:
clumps.append([parts, None])
if len(clumps) == 0:
return None
return csiborgtools.match.concatenate_clumps(clumps, include_velocities=True)
# We now start looping over all simulations
for i, nsim in enumerate(paths.get_ics(tonew=False)):
if rank == 0:
print("{}: calculating {}th simulation `{}`."
.format(datetime.now(), i, nsim), flush=True)
print(
"{}: calculating {}th simulation `{}`.".format(datetime.now(), i, nsim),
flush=True,
)
nsnap = max(paths.get_snapshots(nsim))
box = csiborgtools.read.BoxUnits(nsnap, nsim, paths)
# Archive of clumps, keywords are their clump IDs
particle_archive = paths.split_path(nsnap, nsim)
clumpsarr = partreader.read_clumps(nsnap, nsim,
cols=["index", 'x', 'y', 'z'])
clumpid2arrpos = {ind: ii for ii, ind in enumerate(clumpsarr["index"])}
particle_archive = numpy.load(paths.split_path(nsnap, nsim))
clumps_cat = csiborgtools.read.ClumpsCatalogue(
nsim, paths, maxdist=None, minmass=None, rawdata=True, load_fitted=False
)
# We check whether we fit halos or clumps, since we will be indexing over
# different iterators.
if args.kind == "halos":
ismain = clumps_cat.ismain
else:
ismain = numpy.ones(len(clumps_cat), dtype=bool)
ntasks = len(clumps_cat)
# We split the clumps among the processes. Each CPU calculates a fraction
# of them and dumps the results in a structured array. Even if we are
# calculating parent halos, this index runs over all clumps.
jobs = csiborgtools.fits.split_jobs(ntasks, nproc)[rank]
out = csiborgtools.read.cols_to_structured(len(jobs), cols_collect)
for i, j in enumerate(tqdm(jobs)) if nproc == 1 else enumerate(jobs):
# If we are fitting halos and this clump is not a main, then continue.
if args.kind == "halos" and not ismain[j]:
continue
clumpid = clumps_cat["index"][j]
if args.kind == "halos":
part = load_parent_particles(clumpid, particle_archive, clumps_cat)
else:
part = load_clump_particles(clumpid, particle_archive)
nclumps = len(particle_archive.files)
# Fit 5000 clumps at a time, then dump results
batchsize = 5000
# We fit the particles if there are any. If not, we still assign the
# index; otherwise it would be NaN converted to an integer (-2147483648)
# and yield an error further down.
if part is not None:
_out = fit_clump(part, clumps_cat[j], box)
for key in _out.keys():
out[key][i] = _out[key]
else:
out["index"][i] = clumpid
out["npart"][i] = 0
# This rank does these `batchsize` clumps/halos
jobs = csiborgtools.utils.split_jobs(nclumps, nclumps // batchsize)[rank]
for clumpid in jobs:
... = fit_clump(particle_archive[str(clumpid)], clumpsarr[clumpid2arrpos[clumpid]])
jobs = csiborgtools.utils.split_jobs(nclumps, nproc)[rank]
for nsplit in jobs:
parts, part_clumps, clumps = csiborgtools.fits.load_split_particles(
nsplit, nsnap, nsim, paths, remove_split=False)
N = clumps.size
cols = [("index", numpy.int64), ("npart", numpy.int64),
("totpartmass", numpy.float64), ("Rs", numpy.float64),
("rho0", numpy.float64), ("conc", numpy.float64),
("lambda200c", numpy.float64), ("vx", numpy.float64),
("vy", numpy.float64), ("vz", numpy.float64),
("Lx", numpy.float64), ("Ly", numpy.float64),
("Lz", numpy.float64), ("rmin", numpy.float64),
("rmax", numpy.float64), ("r200", numpy.float64),
("r500", numpy.float64), ("m200", numpy.float64),
("m500", numpy.float64)]
out = csiborgtools.utils.cols_to_structured(N, cols)
out["index"] = clumps["index"]
for n in range(N):
# Pick clump and its particles
xs = csiborgtools.fits.pick_single_clump(n, parts, part_clumps,
clumps)
clump = csiborgtools.fits.Clump.from_arrays(
*xs, rhoc=box.box_rhoc, G=box.box_G)
out["npart"][n] = clump.Npart
out["rmin"][n] = clump.rmin
out["rmax"][n] = clump.rmax
out["totpartmass"][n] = clump.total_particle_mass
out["vx"][n] = numpy.average(clump.vel[:, 0], weights=clump.m)
out["vy"][n] = numpy.average(clump.vel[:, 1], weights=clump.m)
out["vz"][n] = numpy.average(clump.vel[:, 2], weights=clump.m)
out["Lx"][n], out["Ly"][n], out["Lz"][n] = clump.angular_momentum
# Spherical overdensity radii and masses
rs, ms = clump.spherical_overdensity_mass([200, 500])
out["r200"][n] = rs[0]
out["r500"][n] = rs[1]
out["m200"][n] = ms[0]
out["m500"][n] = ms[1]
out["lambda200c"][n] = clump.lambda200c
# NFW profile fit
if clump.Npart > 10 and numpy.isfinite(out["r200"][n]):
nfwpost = csiborgtools.fits.NFWPosterior(clump)
logRs, __ = nfwpost.maxpost_logRs()
Rs = 10**logRs
if not numpy.isnan(logRs):
out["Rs"][n] = Rs
out["rho0"][n] = nfwpost.rho0_from_Rs(Rs)
out["conc"][n] = out["r200"][n] / Rs
csiborgtools.read.dump_split(out, nsplit, nsnap, nsim, paths)
# Wait until all jobs finished before moving to another simulation
fout = ftemp.format(str(nsim).zfill(5), str(nsnap).zfill(5), rank)
if nproc == 1:
print(
"{}: rank {} saving to `{}`.".format(datetime.now(), rank, fout), flush=True
)
numpy.save(fout, out)
# We saved this CPU's results in a temporary file. Wait now for the other
# CPUs and then collect results from the 0th rank and save them.
comm.Barrier()
# # Use the rank 0 to combine outputs for this CSiBORG realisation
# if rank == 0:
# print("Collecting results!")
# partreader = csiborgtools.read.ParticleReader(paths)
# out_collected = csiborgtools.read.combine_splits(
# utils.Nsplits, nsnap, nsim, partreader, cols_collect,
# remove_splits=True, verbose=False)
# fname = paths.hcat_path(nsim)
# print("Saving results to `{}`.".format(fname))
# numpy.save(fname, out_collected)
#
# comm.Barrier()
#
# if rank == 0:
# print("All finished! See ya!")
if rank == 0:
print(
"{}: collecting results for simulation `{}`.".format(datetime.now(), nsim),
flush=True,
)
# Load the results from each CPU and write them into the single output
# array.
out = csiborgtools.read.cols_to_structured(ntasks, cols_collect)
clumpid2outpos = {indx: i for i, indx in enumerate(clumps_cat["index"])}
for i in range(nproc):
inp = numpy.load(ftemp.format(str(nsim).zfill(5), str(nsnap).zfill(5), i))
for j, clumpid in enumerate(inp["index"]):
k = clumpid2outpos[clumpid]
for key in inp.dtype.names:
out[key][k] = inp[key][j]
# If we were analysing main halos, then remove array indices that do
# not correspond to parent halos.
if args.kind == "halos":
out = out[ismain]
fout = paths.structfit_path(nsnap, nsim, "clumps")
print("Saving to `{}`.".format(fout), flush=True)
numpy.save(fout, out)
# We now wait before moving on to another simulation.
comm.Barrier()
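The spherical overdensity quantities that `fit_clump` collects (`r200c`, `m200c`, `r200m`, ...) follow the standard definition: the radius inside which the mean enclosed density equals the overdensity multiple of a reference density, critical or matter. A hedged sketch of that calculation from raw particle radii and masses; the function, its units, and its NaN convention are illustrative assumptions, and `Clump.spherical_overdensity_mass` may differ in detail.

```python
import numpy

def spherical_overdensity_mass(r, m, delta=200, rho_ref=1.0):
    """Radius and enclosed mass where the mean enclosed density first drops
    below `delta * rho_ref`. `r` are particle radii and `m` particle masses
    in consistent (box) units; returns (NaN, NaN) if there is no crossing."""
    order = numpy.argsort(r)
    r, menc = r[order], numpy.cumsum(m[order])  # enclosed mass profile
    with numpy.errstate(divide="ignore", invalid="ignore"):
        rho_mean = menc / (4 / 3 * numpy.pi * r**3)
    below = numpy.where(rho_mean < delta * rho_ref)[0]
    if below.size == 0 or below[0] == 0:
        return numpy.nan, numpy.nan  # threshold never crossed from above
    k = below[0]
    return r[k], menc[k]
```

With `rho_ref` set to the critical density in box units this corresponds to `r200c`/`m200c`; with the mean matter density, to `r200m`/`m200m`.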

View file

@@ -12,7 +12,10 @@
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""Script to split particles to indivudual files according to their clump."""
"""
Script to split particles to individual files according to their clump. This is
useful for calculating the halo properties directly from the particles.
"""
from datetime import datetime
from gc import collect
from glob import glob
@@ -28,6 +31,7 @@ try:
import csiborgtools
except ModuleNotFoundError:
import sys
sys.path.append("../")
import csiborgtools
@@ -38,7 +42,7 @@ nproc = comm.Get_size()
paths = csiborgtools.read.CSiBORGPaths(**csiborgtools.paths_glamdring)
verbose = nproc == 1
partcols = ['x', 'y', 'z', "vx", "vy", "vz", 'M']
partcols = ["x", "y", "z", "vx", "vy", "vz", "M"]
def do_split(nsim):
@@ -46,8 +50,8 @@ def do_split(nsim):
reader = csiborgtools.read.ParticleReader(paths)
ftemp_base = join(
paths.temp_dumpdir,
"split_{}_{}".format(str(nsim).zfill(5), str(nsnap).zfill(5))
)
"split_{}_{}".format(str(nsim).zfill(5), str(nsnap).zfill(5)),
)
ftemp = ftemp_base + "_{}.npz"
# Load the particles and their clump IDs
@@ -85,7 +89,7 @@ def do_split(nsim):
# Now load back in every temporary file, combine them into a single
# dictionary and save as a single .npz file.
out = {}
for file in glob(ftemp_base + '*'):
for file in glob(ftemp_base + "*"):
inp = numpy.load(file)
for key in inp.files:
out.update({key: inp[key]})
@@ -107,9 +111,8 @@ if nproc > 1:
worker_process(do_split, comm, verbose=False)
else:
tasks = paths.get_ics(tonew=False)
tasks = [tasks[0]] # REMOVE
for task in tasks:
print("{}: completing task `{}`.".format(datetime.now(), task))
do_split(task)
comm.Barrier()
comm.Barrier()
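End to end, this split script produces the single `.npz` archive, keyed by stringified clump ID, that the fitting script later opens and reads through `particle_archive[str(clumpid)]`. A minimal sketch of the grouping step, assuming a structured particle array with the `partcols` fields, a matching vector of clump IDs, and the convention that ID 0 means unassigned (the helper name and that convention are assumptions):

```python
import numpy

def split_particles_by_clump(particles, clump_ids, fout):
    """Group a particle array by clump ID and save one entry per clump in a
    single .npz archive keyed by the stringified ID (illustrative)."""
    keep = clump_ids != 0  # assumed convention: ID 0 means no clump
    particles, clump_ids = particles[keep], clump_ids[keep]
    order = numpy.argsort(clump_ids)
    particles, clump_ids = particles[order], clump_ids[order]
    # After sorting, each clump occupies a contiguous run; split on the
    # first occurrence of every unique ID.
    uniq, start = numpy.unique(clump_ids, return_index=True)
    out = {str(cid): arr
           for cid, arr in zip(uniq, numpy.split(particles, start[1:]))}
    numpy.savez(fout, **out)
```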