Add mmain and other major updates (#44)

* Move paths to a separate file

* Add mmain reader

* Add a verbosity flag

* Fix imports

* Fix bug

* Rename files

* Return ultimate parents

* Add script to generate mmain

* Remove mmain path

* edit path

* Add mmain path

* Change function name

* Rename function

* Turn off verbose

* Fix list requirement

* Edit init match paths

* Fix init pathing

* Edit paths docs

* Edit dumpdir name

* Rename path

* Fix split paths

* Remove unused import

* Add comment

* Update readme

* remove read mmain

* Rename halo catalogue

* Fix minor bugs

* Update nbs

* Add create directory option

* Move split jobs

* Move split jobs

* Remove splitting

* Add import

* Edit script

* Deeper split folder

* Fix paths bug

* Rename catalogue

* Rename Catalogue

* Add new clumpread

* Edit paths

* add knn paths

* Update commenting

* Update imports

* Add more conversions

* Update temp file

* Add a note

* Add catalogue

* Comment

* Update TODO

* Update script

* add nb

* Update

* pep8

* edit paths & pep8

* Fix knn auto paths

* add paths docs

* Add auto and cross knn paths

* Add new paths

* Simplify tpcf reading

* pep8 patch

* update readme

* Update progress

* pep8

* pep8

* pep8

* pep8

* pep8

* pep8

* pep8

* pep8

* pep8

* pep8

* pep8

* pep8

* pep8

* pep8

* pep8

* Pep 8 and restructure

* add lambda spin

* add clump and halo

* add checks

* Edit halo profile fit

* Update gitignore

* backup script
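
For reference, the net effect of the mmain-related changes: catalogue construction now goes through the centralized `CSiBORGPaths` object and the new `MmainReader`. A minimal usage sketch distilled from `scripts/pre_mmain.py` below (all names come from this PR; exact signatures may differ):

```python
import numpy
import csiborgtools

# Centralized path handling replaces the ad-hoc join(dumpdir, ...) calls.
paths = csiborgtools.read.CSiBORGPaths(**csiborgtools.paths_glamdring)
mmain_reader = csiborgtools.read.MmainReader(paths)

# IC realisation IDs are now fetched via `get_ics` (formerly `ic_ids`).
for nsim in paths.get_ics(tonew=False):
    nsnap = max(paths.get_snapshots(nsim))
    # Sum up the substructure of children, record each clump's ultimate
    # parent, and dump both to the canonical mmain path.
    mmain, ultimate_parent = mmain_reader.make_mmain(nsim, verbose=False)
    numpy.savez(paths.mmain_path(nsnap, nsim),
                mmain=mmain, ultimate_parent=ultimate_parent)
```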
Richard Stiskalek 2023-04-18 11:02:36 +02:00 committed by GitHub
parent e0d3854277
commit fdb0df8d4c
50 changed files with 2152 additions and 1844 deletions


@@ -16,16 +16,18 @@
MPI script to calculate the matter cross power spectrum between CSiBORG
IC realisations. Units are Mpc/h.
"""
from gc import collect
from argparse import ArgumentParser
from datetime import datetime
from gc import collect
from itertools import combinations
from os import remove
from os.path import join
from itertools import combinations
from datetime import datetime
import numpy
import joblib
from mpi4py import MPI
import numpy
import Pk_library as PKL
from mpi4py import MPI
try:
import csiborgtools
except ModuleNotFoundError:
@@ -47,9 +49,9 @@ nproc = comm.Get_size()
MAS = "CIC" # mass asignment scheme
paths = csiborgtools.read.CSiBORGPaths(**csiborgtools.paths_glamdring)
box = csiborgtools.units.BoxUnits(paths)
box = csiborgtools.read.BoxUnits(paths)
reader = csiborgtools.read.ParticleReader(paths)
ics = paths.ic_ids(tonew=False)
ics = paths.get_ics(tonew=False)
nsims = len(ics)
# File paths
@@ -59,7 +61,7 @@ fout = join(dumpdir, "crosspk",
"out_{}_{}" + "_{}.p".format(args.halfwidth))
jobs = csiborgtools.fits.split_jobs(nsims, nproc)[rank]
jobs = csiborgtools.utils.split_jobs(nsims, nproc)[rank]
for n in jobs:
print("Rank {}@{}: saving {}th delta.".format(rank, datetime.now(), n))
nsim = ics[n]
@@ -99,7 +101,7 @@ for i in range(nsims):
combs.append((i, i))
prev_delta = [-1, None, None, None] # i, delta, aexp, length
jobs = csiborgtools.fits.split_jobs(len(combs), nproc)[rank]
jobs = csiborgtools.utils.split_jobs(len(combs), nproc)[rank]
for n in jobs:
i, j = combs[n]
print("Rank {}@{}: combination {}.".format(rank, datetime.now(), (i, j)))
@@ -153,4 +155,4 @@ if rank == 0:
remove(ftemp.format(ic, "delta") + ".npy")
remove(ftemp.format(ic, "lengths") + ".p")
print("All finished!")
print("All finished!")


@@ -13,17 +13,18 @@
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""A script to calculate the KNN-CDF for a set of CSiBORG halo catalogues."""
from os.path import join
from warnings import warn
from argparse import ArgumentParser
from copy import deepcopy
from datetime import datetime
from mpi4py import MPI
from TaskmasterMPI import master_process, worker_process
import numpy
from sklearn.neighbors import NearestNeighbors
from warnings import warn
import joblib
import numpy
import yaml
from mpi4py import MPI
from sklearn.neighbors import NearestNeighbors
from TaskmasterMPI import master_process, worker_process
try:
import csiborgtools
except ModuleNotFoundError:
@@ -58,8 +59,6 @@ ics = [7444, 7468, 7492, 7516, 7540, 7564, 7588, 7612, 7636, 7660, 7684,
9292, 9316, 9340, 9364, 9388, 9412, 9436, 9460, 9484, 9508, 9532,
9556, 9580, 9604, 9628, 9652, 9676, 9700, 9724, 9748, 9772, 9796,
9820, 9844]
dumpdir = "/mnt/extraspace/rstiskalek/csiborg/knn"
fout = join(dumpdir, "auto", "knncdf_{}_{}.p")
paths = csiborgtools.read.CSiBORGPaths(**csiborgtools.paths_glamdring)
knncdf = csiborgtools.clustering.kNN_CDF()
@@ -67,6 +66,7 @@ knncdf = csiborgtools.clustering.kNN_CDF()
# Analysis #
###############################################################################
def read_single(selection, cat):
"""Positions for single catalogue auto-correlation."""
mmask = numpy.ones(len(cat), dtype=bool)
@@ -101,11 +101,13 @@ def read_single(selection, cat):
return pos[smask, ...]
def do_auto(run, cat, ic):
"""Calculate the kNN-CDF single catalgoue autocorrelation."""
_config = config.get(run, None)
if _config is None:
warn("No configuration for run {}.".format(run))
warn("No configuration for run {}.".format(run), UserWarning,
stacklevel=1)
return
rvs_gen = csiborgtools.clustering.RVSinsphere(Rmax)
@@ -119,13 +121,15 @@ def do_auto(run, cat, ic):
batch_size=int(config["batch_size"]), random_state=config["seed"])
joblib.dump({"rs": rs, "cdf": cdf, "ndensity": pos.shape[0] / totvol},
fout.format(str(ic).zfill(5), run))
paths.knnauto_path(run, ic))
def do_cross_rand(run, cat, ic):
"""Calculate the kNN-CDF cross catalogue random correlation."""
_config = config.get(run, None)
if _config is None:
warn("No configuration for run {}.".format(run))
warn("No configuration for run {}.".format(run), UserWarning,
stacklevel=1)
return
rvs_gen = csiborgtools.clustering.RVSinsphere(Rmax)
@@ -143,14 +147,11 @@ def do_cross_rand(run, cat, ic):
nsamples=int(config["nsamples"]), neval=int(config["neval"]),
batch_size=int(config["batch_size"]), random_state=config["seed"])
corr = knncdf.joint_to_corr(cdf0, cdf1, joint_cdf)
joblib.dump({"rs": rs, "corr": corr}, fout.format(str(ic).zfill(5), run))
joblib.dump({"rs": rs, "corr": corr}, paths.knnauto_path(run, ic))
def do_runs(ic):
cat = csiborgtools.read.HaloCatalogue(ic, paths, max_dist=Rmax,
min_mass=minmass)
cat = csiborgtools.read.ClumpsCatalogue(ic, paths, maxdist=Rmax)
for run in args.runs:
if "random" in run:
do_cross_rand(run, cat, ic)
@@ -179,4 +180,4 @@ comm.Barrier()
if rank == 0:
print("{}: all finished.".format(datetime.now()))
quit() # Force quit the script
quit() # Force quit the script


@@ -13,18 +13,19 @@
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""A script to calculate the KNN-CDF for a set of CSiBORG halo catalogues."""
from warnings import warn
from os.path import join
from argparse import ArgumentParser
from copy import deepcopy
from datetime import datetime
from itertools import combinations
from mpi4py import MPI
from TaskmasterMPI import master_process, worker_process
import numpy
from sklearn.neighbors import NearestNeighbors
from os.path import join
from warnings import warn
import joblib
import numpy
import yaml
from mpi4py import MPI
from sklearn.neighbors import NearestNeighbors
from TaskmasterMPI import master_process, worker_process
try:
import csiborgtools
except ModuleNotFoundError:
@@ -67,6 +68,7 @@ knncdf = csiborgtools.clustering.kNN_CDF()
# Analysis #
###############################################################################
def read_single(selection, cat):
mmask = numpy.ones(len(cat), dtype=bool)
pos = cat.positions(False)
@@ -79,19 +81,20 @@ def read_single(selection, cat):
mmask &= (cat[psel["name"]] < pmax)
return pos[mmask, ...]
def do_cross(run, ics):
_config = config.get(run, None)
if _config is None:
warn("No configuration for run {}.".format(run))
warn("No configuration for run {}.".format(run), stacklevel=1)
return
rvs_gen = csiborgtools.clustering.RVSinsphere(Rmax)
knn1, knn2 = NearestNeighbors(), NearestNeighbors()
cat1 = csiborgtools.read.HaloCatalogue(ics[0], paths, max_dist=Rmax)
cat1 = csiborgtools.read.ClumpsCatalogue(ics[0], paths, max_dist=Rmax)
pos1 = read_single(_config, cat1)
knn1.fit(pos1)
cat2 = csiborgtools.read.HaloCatalogue(ics[1], paths, max_dist=Rmax)
cat2 = csiborgtools.read.ClumpsCatalogue(ics[1], paths, max_dist=Rmax)
pos2 = read_single(_config, cat2)
knn2.fit(pos2)
@@ -102,9 +105,8 @@ def do_cross(run, ics):
batch_size=int(config["batch_size"]), random_state=config["seed"])
corr = knncdf.joint_to_corr(cdf0, cdf1, joint_cdf)
joblib.dump({"rs": rs, "corr": corr}, paths.knncross_path(run, ics))
joblib.dump({"rs": rs, "corr": corr},
fout.format(str(ics[0]).zfill(5), str(ics[1]).zfill(5), run))
def do_runs(ics):
print(ics)
@@ -133,4 +135,4 @@ comm.Barrier()
if rank == 0:
print("{}: all finished.".format(datetime.now()))
quit() # Force quit the script
quit() # Force quit the script


@@ -13,16 +13,18 @@
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""A script to calculate the auto-2PCF of CSiBORG catalogues."""
from os.path import join
from warnings import warn
from argparse import ArgumentParser
from copy import deepcopy
from datetime import datetime
from os.path import join
from warnings import warn
import joblib
import numpy
import yaml
from mpi4py import MPI
from TaskmasterMPI import master_process, worker_process
import numpy
import joblib
import yaml
try:
import csiborgtools
except ModuleNotFoundError:
@@ -65,6 +67,7 @@ tpcf = csiborgtools.clustering.Mock2PCF()
# Analysis #
###############################################################################
def read_single(selection, cat):
"""Positions for single catalogue auto-correlation."""
mmask = numpy.ones(len(cat), dtype=bool)
@@ -99,10 +102,11 @@ def read_single(selection, cat):
return pos[smask, ...]
def do_auto(run, cat, ic):
_config = config.get(run, None)
if _config is None:
warn("No configuration for run {}.".format(run))
warn("No configuration for run {}.".format(run), stacklevel=1)
return
rvs_gen = csiborgtools.clustering.RVSinsphere(Rmax)
@@ -112,12 +116,11 @@ def do_auto(run, cat, ic):
nrandom = int(config["randmult"] * pos.shape[0])
rp, wp = tpcf(pos, rvs_gen, nrandom, bins)
joblib.dump({"rp": rp, "wp": wp}, fout.format(str(ic).zfill(5), run))
joblib.dump({"rp": rp, "wp": wp}, paths.tpcfauto_path(run, ic))
def do_runs(ic):
cat = csiborgtools.read.HaloCatalogue(ic, paths, max_dist=Rmax,
min_mass=minmass)
cat = csiborgtools.read.ClumpsCatalogue(ic, paths, maxdist=Rmax)
for run in args.runs:
do_auto(run, cat, ic)
@@ -143,4 +146,4 @@ comm.Barrier()
if rank == 0:
print("{}: all finished.".format(datetime.now()))
quit() # Force quit the script
quit() # Force quit the script


@@ -16,17 +16,20 @@
MPI script to evaluate field properties at the galaxy positions.
"""
from argparse import ArgumentParser
from os.path import join
from os import remove
from datetime import datetime
from os import remove
from os.path import join
import numpy
from mpi4py import MPI
try:
import csiborgtools
except ModuleNotFoundError:
import sys
sys.path.append("../")
import csiborgtools
import utils
dumpdir = "/mnt/extraspace/rstiskalek/csiborg/"
@@ -61,16 +64,16 @@ dtype = {"names": ["delta", "phi"], "formats": [numpy.float32] * 2}
# CSiBORG simulation paths
paths = csiborgtools.read.CSiBORGPaths(**csiborgtools.paths_glamdring)
ics = paths.ic_ids(tonew=False)
ics = paths.get_ics(tonew=False)
nsims = len(ics)
for n in csiborgtools.fits.split_jobs(nsims, nproc)[rank]:
for n in csiborgtools.utils.split_jobs(nsims, nproc)[rank]:
print("Rank {}@{}: working on {}th IC.".format(rank, datetime.now(), n),
flush=True)
nsim = ics[n]
nsnap = max(paths.get_snapshots(nsim))
reader = csiborgtools.read.ParticleReader(paths)
box = csiborgtools.units.BoxUnits(nsnap, nsim, paths)
box = csiborgtools.read.BoxUnits(nsnap, nsim, paths)
# Read particles and select a subset of them
particles = reader.read_particle(nsnap, nsim, ["x", "y", "z", "M"],
@@ -121,4 +124,4 @@ if rank == 0:
print("Saving results to `{}`.".format(fperm), flush=True)
with open(fperm, "wb") as f:
numpy.save(f, out)
numpy.save(f, out)


@@ -13,17 +13,20 @@
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""A script to calculate overlap between two CSiBORG realisations."""
from os.path import join
from argparse import ArgumentParser
from datetime import datetime
from os.path import join
import numpy
from scipy.ndimage import gaussian_filter
try:
import csiborgtools
except ModuleNotFoundError:
import sys
sys.path.append("../")
import csiborgtools
import utils
# Argument parser
@@ -44,18 +47,19 @@ overlapper = csiborgtools.match.ParticleOverlap()
# Load catalogues
print("{}: loading catalogues {} and {}."
.format(datetime.now(), args.nsim0, args.nsimx), flush=True)
cat0 = csiborgtools.read.HaloCatalogue(args.nsim0, paths)
catx = csiborgtools.read.HaloCatalogue(args.nsimx, paths)
cat0 = csiborgtools.read.ClumpsCatalogue(args.nsim0, paths)
catx = csiborgtools.read.ClumpsCatalogue(args.nsimx, paths)
print("{}: loading simulation {} and converting positions to cell numbers."
.format(datetime.now(), args.nsim0), flush=True)
with open(paths.clump0_path(args.nsim0), "rb") as f:
with open(paths.initmatch_path(args.nsim0, "particles"), "rb") as f:
clumps0 = numpy.load(f, allow_pickle=True)
overlapper.clumps_pos2cell(clumps0)
print("{}: loading simulation {} and converting positions to cell numbers."
.format(datetime.now(), args.nsimx), flush=True)
with open(paths.clump0_path(args.nsimx), 'rb') as f:
with open(paths.initmatch_path(args.nsimx, "particles"), 'rb') as f:
clumpsx = numpy.load(f, allow_pickle=True)
overlapper.clumps_pos2cell(clumpsx)
@@ -87,4 +91,4 @@ with open(fout, "wb") as f:
numpy.savez(fout, ref_indxs=ref_indxs, cross_indxs=cross_indxs,
match_indxs=match_indxs, ngp_overlap=ngp_overlap,
smoothed_overlap=smoothed_overlap, sigma=args.sigma)
print("All finished.", flush=True)
print("All finished.", flush=True)


@@ -16,17 +16,17 @@
A script to fit halos (concentration, ...). The particle array of each CSiBORG
realisation must have been split in advance by `runsplit_halos`.
"""
from os.path import join
from datetime import datetime
import numpy
from mpi4py import MPI
try:
import csiborgtools
except ModuleNotFoundError:
import sys
sys.path.append("../")
import csiborgtools
import utils
# Get MPI things
@@ -35,8 +35,8 @@ rank = comm.Get_rank()
nproc = comm.Get_size()
paths = csiborgtools.read.CSiBORGPaths(**csiborgtools.paths_glamdring)
dumpdir = "/mnt/extraspace/rstiskalek/csiborg/"
loaddir = join(dumpdir, "temp")
partreader = csiborgtools.read.ParticleReader(paths)
cols_collect = [("npart", numpy.int64), ("totpartmass", numpy.float64),
("Rs", numpy.float64), ("vx", numpy.float64),
("vy", numpy.float64), ("vz", numpy.float64),
@@ -47,14 +47,48 @@ cols_collect = [("npart", numpy.int64), ("totpartmass", numpy.float64),
("r500", numpy.float64), ("m200", numpy.float64),
("m500", numpy.float64), ("lambda200c", numpy.float64)]
def fit_clump(particles, clump, box):
for i, nsim in enumerate(paths.ic_ids(tonew=False)):
out["npart"][n] = clump.Npart
out["rmin"][n] = clump.rmin
out["rmax"][n] = clump.rmax
out["totpartmass"][n] = clump.total_particle_mass
out["vx"][n] = numpy.average(clump.vel[:, 0], weights=clump.m)
out["vy"][n] = numpy.average(clump.vel[:, 1], weights=clump.m)
out["vz"][n] = numpy.average(clump.vel[:, 2], weights=clump.m)
out["Lx"][n], out["Ly"][n], out["Lz"][n] = clump.angular_momentum
for i, nsim in enumerate(paths.get_ics(tonew=False)):
if rank == 0:
print("{}: calculating {}th simulation.".format(datetime.now(), i))
print("{}: calculating {}th simulation `{}`."
.format(datetime.now(), i, nsim), flush=True)
nsnap = max(paths.get_snapshots(nsim))
box = csiborgtools.units.BoxUnits(nsnap, nsim, paths)
box = csiborgtools.read.BoxUnits(nsnap, nsim, paths)
jobs = csiborgtools.fits.split_jobs(utils.Nsplits, nproc)[rank]
# Archive of clumps, keywords are their clump IDs
particle_archive = paths.split_path(nsnap, nsim)
clumpsarr = partreader.read_clumps(nsnap, nsim,
cols=["index", 'x', 'y', 'z'])
clumpid2arrpos = {ind: ii for ii, ind in enumerate(clumpsarr["index"])}
nclumps = len(particle_archive.files)
# Fit 5000 clumps at a time, then dump results
batchsize = 5000
# This rank does these `batchsize` clumps/halos
jobs = csiborgtools.utils.split_jobs(nclumps, nclumps // batchsize)[rank]
for clumpid in jobs:
... = fit_clump(particle_archive[str(clumpid)], clumpsarr[clumpid2arrpos[clumpid]])
jobs = csiborgtools.utils.split_jobs(nclumps, nproc)[rank]
for nsplit in jobs:
parts, part_clumps, clumps = csiborgtools.fits.load_split_particles(
nsplit, nsnap, nsim, paths, remove_split=False)
@@ -111,18 +145,18 @@ for i, nsim in enumerate(paths.ic_ids(tonew=False)):
# Wait until all jobs finished before moving to another simulation
comm.Barrier()
# Use the rank 0 to combine outputs for this CSiBORG realisation
if rank == 0:
print("Collecting results!")
partreader = csiborgtools.read.ParticleReader(paths)
out_collected = csiborgtools.read.combine_splits(
utils.Nsplits, nsnap, nsim, partreader, cols_collect,
remove_splits=True, verbose=False)
fname = paths.hcat_path(nsim)
print("Saving results to `{}`.".format(fname))
numpy.save(fname, out_collected)
comm.Barrier()
if rank == 0:
print("All finished! See ya!")
# # Use the rank 0 to combine outputs for this CSiBORG realisation
# if rank == 0:
# print("Collecting results!")
# partreader = csiborgtools.read.ParticleReader(paths)
# out_collected = csiborgtools.read.combine_splits(
# utils.Nsplits, nsnap, nsim, partreader, cols_collect,
# remove_splits=True, verbose=False)
# fname = paths.hcat_path(nsim)
# print("Saving results to `{}`.".format(fname))
# numpy.save(fname, out_collected)
#
# comm.Barrier()
#
# if rank == 0:
# print("All finished! See ya!")


@@ -19,14 +19,16 @@ are grouped in a clump at present redshift.
Optionally also dumps the clumps information; however, watch out, as this will
eat up a lot of memory.
"""
from gc import collect
from os.path import join
from os import remove
from argparse import ArgumentParser
from datetime import datetime
from distutils.util import strtobool
from gc import collect
from os import remove
from os.path import join
import numpy
from mpi4py import MPI
try:
import csiborgtools
except ModuleNotFoundError:
@@ -45,12 +47,10 @@ parser.add_argument("--dump_clumps", type=lambda x: bool(strtobool(x)))
args = parser.parse_args()
paths = csiborgtools.read.CSiBORGPaths(**csiborgtools.paths_glamdring)
nsims = paths.ic_ids(tonew=True)
nsims = paths.get_ics(tonew=True)
# Output files
ftemp = join(paths.dumpdir, "temp_initmatch", "temp_{}_{}_{}.npy")
fpermcm = join(paths.dumpdir, "initmatch", "clump_{}_cm.npy")
fpermpart = join(paths.dumpdir, "initmatch", "clump_{}_particles.npy")
# Temporary output file
ftemp = join(paths.dumpdir, "temp", "initmatch_{}_{}_{}.npy")
for nsim in nsims:
if rank == 0:
@@ -87,7 +87,7 @@ for nsim in nsims:
unique_clumpids = numpy.unique(clump_ids)
njobs = unique_clumpids.size
jobs = csiborgtools.fits.split_jobs(njobs, nproc)[rank]
jobs = csiborgtools.utils.split_jobs(njobs, nproc)[rank]
for i in jobs:
n = unique_clumpids[i]
x0 = part0[clump_ids == n]
@@ -139,8 +139,8 @@ for nsim in nsims:
out["ID"][i] = n
print("{}: dumping to .. `{}`.".format(
datetime.now(), fpermcm.format(nsim)), flush=True)
with open(fpermcm.format(nsim), 'wb') as f:
datetime.now(), paths.initmatch_path(nsim, "cm")), flush=True)
with open(paths.initmatch_path(nsim, "cm"), 'wb') as f:
numpy.save(f, out)
if args.dump_clumps:
@@ -157,10 +157,12 @@ for nsim in nsims:
out["clump"][i] = fin
out["ID"][i] = n
remove(fpath)
print("{}: dumping to .. `{}`.".format(
datetime.now(), fpermpart.format(nsim)), flush=True)
with open(fpermpart.format(nsim), "wb") as f:
fout = paths.initmatch_path(nsim, "particles")
print("{}: dumping to .. `{}`.".format(datetime.now(), fout),
flush=True)
with open(fout, "wb") as f:
numpy.save(f, out)
del out
collect()
collect()

scripts/pre_mmain.py (new file, +64 lines)

@@ -0,0 +1,64 @@
# Copyright (C) 2022 Richard Stiskalek
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""
Script to generate the mmain files, i.e. sums up the substructure of children.
"""
from datetime import datetime
import numpy
from mpi4py import MPI
from TaskmasterMPI import master_process, worker_process
try:
import csiborgtools
except ModuleNotFoundError:
import sys
sys.path.append("../")
import csiborgtools
# Get MPI things
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
nproc = comm.Get_size()
paths = csiborgtools.read.CSiBORGPaths(**csiborgtools.paths_glamdring)
mmain_reader = csiborgtools.read.MmainReader(paths)
def do_mmain(nsim):
nsnap = max(paths.get_snapshots(nsim))
# NOTE: currently works for highest snapshot anyway
mmain, ultimate_parent = mmain_reader.make_mmain(nsim, verbose=False)
numpy.savez(paths.mmain_path(nsnap, nsim),
mmain=mmain, ultimate_parent=ultimate_parent)
###############################################################################
# MPI task delegation #
###############################################################################
if nproc > 1:
if rank == 0:
tasks = list(paths.get_ics(tonew=False))
master_process(tasks, comm, verbose=True)
else:
worker_process(do_mmain, comm, verbose=False)
else:
tasks = paths.get_ics(tonew=False)
for task in tasks:
print("{}: completing task `{}`.".format(datetime.now(), task))
do_mmain(task)
comm.Barrier()
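
Downstream readers can load the generated file back with numpy; a sketch, assuming `mmain_path` resolves to the `.npz` archive written by `numpy.savez` above (`paths`, `nsnap` and `nsim` as in `do_mmain`):

```python
import numpy

# Hypothetical read-back of one generated mmain file.
with numpy.load(paths.mmain_path(nsnap, nsim)) as f:
    mmain = f["mmain"]                      # summed substructure of children
    ultimate_parent = f["ultimate_parent"]  # ultimate parent of each clump
```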

scripts/pre_splithalos.py (new file, +115 lines)

@@ -0,0 +1,115 @@
# Copyright (C) 2022 Richard Stiskalek
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""Script to split particles to indivudual files according to their clump."""
from datetime import datetime
from gc import collect
from glob import glob
from os import remove
from os.path import join
import numpy
from mpi4py import MPI
from TaskmasterMPI import master_process, worker_process
from tqdm import tqdm
try:
import csiborgtools
except ModuleNotFoundError:
import sys
sys.path.append("../")
import csiborgtools
# Get MPI things
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
nproc = comm.Get_size()
paths = csiborgtools.read.CSiBORGPaths(**csiborgtools.paths_glamdring)
verbose = nproc == 1
partcols = ['x', 'y', 'z', "vx", "vy", "vz", 'M']
def do_split(nsim):
nsnap = max(paths.get_snapshots(nsim))
reader = csiborgtools.read.ParticleReader(paths)
ftemp_base = join(
paths.temp_dumpdir,
"split_{}_{}".format(str(nsim).zfill(5), str(nsnap).zfill(5))
)
ftemp = ftemp_base + "_{}.npz"
# Load the particles and their clump IDs
particles = reader.read_particle(nsnap, nsim, partcols, verbose=verbose)
particle_clumps = reader.read_clumpid(nsnap, nsim, verbose=verbose)
# Drop all particles whose clump index is 0 (not assigned to any clump)
assigned_mask = particle_clumps != 0
particle_clumps = particle_clumps[assigned_mask]
particles = particles[assigned_mask]
del assigned_mask
collect()
# Load the clump indices
clumpinds = reader.read_clumps(nsnap, nsim, cols="index")["index"]
# Some of the clumps have no particles, so we do not loop over them
clumpinds = clumpinds[numpy.isin(clumpinds, particle_clumps)]
# Loop over the clump indices and save the particles to a temporary file
# every 10000 clumps. We will later read this back and combine into a
# single file.
out = {}
for i, clind in enumerate(tqdm(clumpinds) if verbose else clumpinds):
key = str(clind)
out.update({str(clind): particles[particle_clumps == clind]})
# REMOVE bump this back up
if i % 10000 == 0 or i == clumpinds.size - 1:
numpy.savez(ftemp.format(i), **out)
out = {}
# Clear up memory because we will be loading everything back
del particles, particle_clumps, clumpinds
collect()
# Now load back in every temporary file, combine them into a single
# dictionary and save as a single .npz file.
out = {}
for file in glob(ftemp_base + '*'):
inp = numpy.load(file)
for key in inp.files:
out.update({key: inp[key]})
remove(file)
numpy.savez(paths.split_path(nsnap, nsim), **out)
###############################################################################
# MPI task delegation #
###############################################################################
if nproc > 1:
if rank == 0:
tasks = list(paths.get_ics(tonew=False))
master_process(tasks, comm, verbose=True)
else:
worker_process(do_split, comm, verbose=False)
else:
tasks = paths.get_ics(tonew=False)
tasks = [tasks[0]] # REMOVE
for task in tasks:
print("{}: completing task `{}`.".format(datetime.now(), task))
do_split(task)
comm.Barrier()
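
The split archive written by `do_split` is a single `.npz` keyed by clump-ID strings, which is what the fitting script accesses via `particle_archive.files`. A read-back sketch under the same assumptions (`paths`, `nsnap` and `nsim` as in `do_split`):

```python
import numpy

# Hypothetical: load the per-clump particle archive for one realisation.
particle_archive = numpy.load(paths.split_path(nsnap, nsim))
for clumpid in particle_archive.files:   # keys are str(clump index)
    parts = particle_archive[clumpid]    # structured array with partcols fields
    print(clumpid, parts.size)
```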


@@ -1,58 +0,0 @@
# Copyright (C) 2022 Richard Stiskalek
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""
Script to split particles into smaller files according to their clump
membership for faster manipulation. Currently does this for the maximum
snapshot of each simulation. Running this requires a lot of memory.
"""
from mpi4py import MPI
from datetime import datetime
try:
import csiborgtools
except ModuleNotFoundError:
import sys
sys.path.append("../")
import csiborgtools
import utils
# Get MPI things
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
nproc = comm.Get_size()
paths = csiborgtools.read.CSiBORGPaths(**csiborgtools.paths_glamdring)
sims = paths.ic_ids(False)
partcols = ["x", "y", "z", "vx", "vy", "vz", "M", "level"]
jobs = csiborgtools.fits.split_jobs(len(sims), nproc)[rank]
for icount, sim_index in enumerate(jobs):
print("{}: rank {} working {} / {} jobs."
.format(datetime.now(), rank, icount + 1, len(jobs)), flush=True)
nsim = sims[sim_index]
nsnap = max(paths.get_snapshots(nsim))
partreader = csiborgtools.read.ParticleReader(paths)
# Load the clumps, particles' clump IDs and particles.
clumps = partreader.read_clumps(nsnap, nsim)
particle_clumps = partreader.read_clumpid(nsnap, nsim, verbose=False)
particles = partreader.read_particle(nsnap, nsim, partcols, verbose=False)
# Drop all particles whose clump index is 0 (not assigned to any halo)
particle_clumps, particles = partreader.drop_zero_indx(
particle_clumps, particles)
# Dump it!
csiborgtools.fits.dump_split_particles(particles, particle_clumps, clumps,
utils.Nsplits, nsnap, nsim, paths,
verbose=False)
print("All finished!", flush=True)