Update initial matching & overlaps (#47)

* pep8 * fix convention * Update script * enforce optimisation boundaries to be finite * Update TODO * Remove sky matching * FIx a small bug * fix bug * Remove import * Add halo fitted quantities * Update nbs * update README * Add load_initial comments * Rename nbs * Delete nb * Update imports * Rename function * Update matcher * Add overlap paths * Update the matching script * Update verbosity * Add verbosity flags * Simplify make_bckg_delta * bug fix * fix bug
2025-07-18 19:53:03 +00:00 · 2023-04-21 01:35:06 +02:00 · 2023-04-21 01:35:06 +02:00 · 04119a5314
commit 04119a5314
parent 39b3498621
14 changed files with 527 additions and 2836 deletions
--- a/scripts/match_singlematch.py
+++ b/scripts/match_singlematch.py
@ -1,4 +1,3 @@
-# Copyright (C) 2022 Richard Stiskalek
 # This program is free software; you can redistribute it and/or modify it
 # under the terms of the GNU General Public License as published by the
 # Free Software Foundation; either version 3 of the License, or (at your
@ -15,7 +14,7 @@
 """A script to calculate overlap between two CSiBORG realisations."""
 from argparse import ArgumentParser
 from datetime import datetime
-from os.path import join
+from distutils.util import strtobool

 import numpy
 from scipy.ndimage import gaussian_filter
@ -24,71 +23,76 @@ try:
    import csiborgtools
 except ModuleNotFoundError:
    import sys
+
    sys.path.append("../")
    import csiborgtools

-import utils
-
 # Argument parser
 parser = ArgumentParser()
 parser.add_argument("--nsim0", type=int)
 parser.add_argument("--nsimx", type=int)
 parser.add_argument("--nmult", type=float)
 parser.add_argument("--sigma", type=float)
+parser.add_argument("--verbose", type=lambda x: bool(strtobool(x)), default=False)
 args = parser.parse_args()
-
-# File paths
 paths = csiborgtools.read.CSiBORGPaths(**csiborgtools.paths_glamdring)
-fout = join(utils.dumpdir, "overlap",
-            "cross_{}_{}.npz".format(args.nsim0, args.nsimx))
 smooth_kwargs = {"sigma": args.sigma, "mode": "constant", "cval": 0.0}
 overlapper = csiborgtools.match.ParticleOverlap()
-
-# Load catalogues
-print("{}: loading catalogues {} and {}."
-      .format(datetime.now(), args.nsim0, args.nsimx), flush=True)
-cat0 = csiborgtools.read.ClumpsCatalogue(args.nsim0, paths)
-catx = csiborgtools.read.ClumpsCatalogue(args.nsimx, paths)
-
-
-print("{}: loading simulation {} and converting positions to cell numbers."
-      .format(datetime.now(), args.nsim0), flush=True)
-
-with open(paths.initmatch_path(args.nsim0, "particles"), "rb") as f:
-    clumps0 = numpy.load(f, allow_pickle=True)
-    overlapper.clumps_pos2cell(clumps0)
-print("{}: loading simulation {} and converting positions to cell numbers."
-      .format(datetime.now(), args.nsimx), flush=True)
-with open(paths.initmatch_path(args.nsimx, "particles"), 'rb') as f:
-    clumpsx = numpy.load(f, allow_pickle=True)
-    overlapper.clumps_pos2cell(clumpsx)
-
-
-print("{}: generating the background density fields.".format(datetime.now()),
-      flush=True)
-delta_bckg = overlapper.make_bckg_delta(clumps0)
-delta_bckg = overlapper.make_bckg_delta(clumpsx, delta=delta_bckg)
-
-
-print("{}: crossing the simulations.".format(datetime.now()), flush=True)
 matcher = csiborgtools.match.RealisationsMatcher()
-ref_indxs, cross_indxs, match_indxs, ngp_overlap = matcher.cross(
-    cat0, catx, clumps0, clumpsx, delta_bckg)

+# Load the raw catalogues (i.e. no selection) including the initial CM positions
+# and the particle archives.
+cat0 = csiborgtools.read.HaloCatalogue(
+    args.nsim0, paths, load_initial=True, rawdata=True
+)
+catx = csiborgtools.read.HaloCatalogue(
+    args.nsimx, paths, load_initial=True, rawdata=True
+)
+halos0_archive = paths.initmatch_path(args.nsim0, "particles")
+halosx_archive = paths.initmatch_path(args.nsimx, "particles")

-print("{}: smoothing the background field.".format(datetime.now()), flush=True)
+# Generate the background density fields: load each halo's particles one by one
+# from the archive, concatenate them and calculate the NGP density field.
+args.verbose and print(
+    "{}: generating the background density fields.".format(datetime.now()), flush=True
+)
+delta_bckg = overlapper.make_bckg_delta(halos0_archive, verbose=args.verbose)
+delta_bckg = overlapper.make_bckg_delta(
+    halosx_archive, delta=delta_bckg, verbose=args.verbose
+)
+
+# We calculate the overlap between the NGP fields.
+args.verbose and print(
+    "{}: crossing the simulations.".format(datetime.now()), flush=True
+)
+match_indxs, ngp_overlap = matcher.cross(
+    cat0, catx, halos0_archive, halosx_archive, delta_bckg
+)
+
+# We now smooth the background density field for the smoothed overlap calculation.
+args.verbose and print(
+    "{}: smoothing the background field.".format(datetime.now()), flush=True
+)
 gaussian_filter(delta_bckg, output=delta_bckg, **smooth_kwargs)

+# We calculate the smoothed overlap for the pairs whose NGP overlap is > 0.
+args.verbose and print(
+    "{}: calculating smoothed overlaps.".format(datetime.now()), flush=True
+)
+smoothed_overlap = matcher.smoothed_cross(
+    cat0, catx, halos0_archive, halosx_archive, delta_bckg, match_indxs, smooth_kwargs
+)

-print("{}: calculating smoothed overlaps.".format(datetime.now()), flush=True)
-smoothed_overlap = matcher.smoothed_cross(clumps0, clumpsx, delta_bckg,
-                                          ref_indxs, cross_indxs, match_indxs,
-                                          smooth_kwargs)
-
-# Dump the result
-print("Saving results to `{}`.".format(fout), flush=True)
-with open(fout, "wb") as f:
-    numpy.savez(fout, ref_indxs=ref_indxs, cross_indxs=cross_indxs,
-                match_indxs=match_indxs, ngp_overlap=ngp_overlap,
-                smoothed_overlap=smoothed_overlap, sigma=args.sigma)
-print("All finished.", flush=True)
+# We save the results at long last.
+fout = paths.overlap_path(args.nsim0, args.nsimx)
+args.verbose and print(
+    "{}: saving results to `{}`.".format(datetime.now(), fout), flush=True
+)
+numpy.savez(
+    fout,
+    match_indxs=match_indxs,
+    ngp_overlap=ngp_overlap,
+    smoothed_overlap=smoothed_overlap,
+    sigma=args.sigma,
+)
+print("{}: all finished.".format(datetime.now()), flush=True)
--- a/scripts/pre_fithalos.py
+++ b/scripts/pre_fithalos.py
@ -72,9 +72,6 @@ def fit_clump(particles, clump_info, box):
    obj = csiborgtools.fits.Clump(particles, clump_info, box)

    out = {}
-    if numpy.isnan(clump_info["index"]):
-        print("Why am I NaN?", flush=True)
-    out["index"] = clump_info["index"]
    out["npart"] = len(obj)
    out["totpartmass"] = numpy.sum(obj["M"])
    for i, v in enumerate(["vx", "vy", "vz"]):
@ -121,7 +118,7 @@ def load_parent_particles(clumpid, particle_archive, clumps_cat):

    if len(clumps) == 0:
        return None
-    return csiborgtools.match.concatenate_clumps(clumps, include_velocities=True)
+    return csiborgtools.match.concatenate_parts(clumps, include_velocities=True)


 # We now start looping over all simulations
@ -152,11 +149,13 @@ for i, nsim in enumerate(paths.get_ics(tonew=False)):
    jobs = csiborgtools.fits.split_jobs(ntasks, nproc)[rank]
    out = csiborgtools.read.cols_to_structured(len(jobs), cols_collect)
    for i, j in enumerate(tqdm(jobs)) if nproc == 1 else enumerate(jobs):
+        clumpid = clumps_cat["index"][j]
+        out["index"][i] = clumpid
+
        # If we are fitting halos and this clump is not a main, then continue.
        if args.kind == "halos" and not ismain[j]:
            continue

-        clumpid = clumps_cat["index"][j]
        if args.kind == "halos":
            part = load_parent_particles(clumpid, particle_archive, clumps_cat)
        else:
@ -169,9 +168,6 @@ for i, nsim in enumerate(paths.get_ics(tonew=False)):
            _out = fit_clump(part, clumps_cat[j], box)
            for key in _out.keys():
                out[key][i] = _out[key]
-        else:
-            out["index"][i] = clumpid
-            out["npart"][i] = 0

    fout = ftemp.format(str(nsim).zfill(5), str(nsnap).zfill(5), rank)
    if nproc == 0:
@ -204,7 +200,7 @@ for i, nsim in enumerate(paths.get_ics(tonew=False)):
        if args.kind == "halos":
            out = out[ismain]

-        fout = paths.structfit_path(nsnap, nsim, "clumps")
+        fout = paths.structfit_path(nsnap, nsim, args.kind)
        print("Saving to `{}`.".format(fout), flush=True)
        numpy.save(fout, out)

--- a/scripts/pre_initmatch.py
+++ b/scripts/pre_initmatch.py
@ -13,11 +13,8 @@
 # with this program; if not, write to the Free Software Foundation, Inc.,
 # 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 """
-A script to calculate the centre of mass of particles at redshift 70 that
-are grouped in a clump at present redshift.
-
-Optionally also dumps the clumps information, however watch out as this will
-eat up a lot of memory.
+Script to calculate the particle centre of mass and Lagrangian patch size in the initial
+snapshot. Optionally dumps the particle files, however this requires a lot of memory.
 """
 from argparse import ArgumentParser
 from datetime import datetime
@ -28,141 +25,143 @@ from os.path import join

 import numpy
 from mpi4py import MPI
+from tqdm import tqdm

 try:
    import csiborgtools
 except ModuleNotFoundError:
    import sys
+
    sys.path.append("../")
    import csiborgtools

+
 # Get MPI things
 comm = MPI.COMM_WORLD
 rank = comm.Get_rank()
 nproc = comm.Get_size()
+verbose = nproc == 1

 # Argument parser
 parser = ArgumentParser()
-parser.add_argument("--dump_clumps", type=lambda x: bool(strtobool(x)))
+parser.add_argument("--dump", type=lambda x: bool(strtobool(x)))
 args = parser.parse_args()
-
 paths = csiborgtools.read.CSiBORGPaths(**csiborgtools.paths_glamdring)
-nsims = paths.get_ics(tonew=True)
+partreader = csiborgtools.read.ParticleReader(paths)
+ftemp = join(paths.temp_dumpdir, "initmatch_{}_{}_{}.npy")

-# Temporary output file
-ftemp = join(paths.dumpdir, "temp", "initmatch_{}_{}_{}.npy")
-
-for nsim in nsims:
+# We loop over all simulations and then use MPI when matching halos to the
+# initial snapshot and dumping them.
+for i, nsim in enumerate(paths.get_ics(tonew=True)):
    if rank == 0:
-        print("{}: reading simulation {}.".format(datetime.now(), nsim),
-              flush=True)
-    nsnap_max = max(paths.get_snapshots(nsim))
-    reader = csiborgtools.read.ParticleReader(paths)
+        print("{}: reading simulation {}.".format(datetime.now(), nsim), flush=True)
+    nsnap = max(paths.get_snapshots(nsim))

-    # Read and sort the initial particle files by their particle IDs
-    part0 = reader.read_particle(1, nsim, ["x", "y", "z", "M", "ID"],
-                                 verbose=False)
+    # We first load particles in the initial and final snapshots and sort them
+    # by their particle IDs so that we can match them by array position.
+    # `clump_ids` are the clump IDs of particles.
+    part0 = partreader.read_particle(
+        1, nsim, ["x", "y", "z", "M", "ID"], verbose=verbose
+    )
    part0 = part0[numpy.argsort(part0["ID"])]

-    # Order the final snapshot clump IDs by the particle IDs
-    pid = reader.read_particle(nsnap_max, nsim, ["ID"], verbose=False)["ID"]
-    clump_ids = reader.read_clumpid(nsnap_max, nsim, verbose=False)
+    pid = partreader.read_particle(nsnap, nsim, ["ID"], verbose=verbose)["ID"]
+    clump_ids = partreader.read_clumpid(nsnap, nsim, verbose=verbose)
    clump_ids = clump_ids[numpy.argsort(pid)]
-
+    # Release the particle IDs, we will not need them anymore now that both
+    # particle arrays are matched in ordering.
    del pid
    collect()

-    # Get rid of the clumps whose index is 0 -- those are unassigned
+    # Particles whose clump ID is 0 are unassigned to a clump, so we can get
+    # rid of them to speed up subsequent operations. Again we release the mask.
    mask = clump_ids > 0
    clump_ids = clump_ids[mask]
    part0 = part0[mask]
    del mask
    collect()

+    # Calculate the centre of mass of each parent halo, the Lagrangian patch
+    # size and optionally the initial snapshot particles belonging to this
+    # parent halo. Dumping the particles will take the majority of the time.
    if rank == 0:
-        print("{}: dumping intermediate files.".format(datetime.now()),
-              flush=True)
+        print(
+            "{}: calculating {}th simulation {}.".format(datetime.now(), i, nsim),
+            flush=True,
+        )
+    # We load up the clump catalogue which contains information about the
+    # ultimate parent halos of each clump. We will loop only over the clump
+    # IDs of ultimate parent halos and add their substructure particles and at
+    # the end save these.
+    cat = csiborgtools.read.ClumpsCatalogue(
+        nsim, paths, load_fitted=False, rawdata=True
+    )
+    parent_ids = cat["index"][cat.ismain][:500]
+    jobs = csiborgtools.fits.split_jobs(parent_ids.size, nproc)[rank]
+    for i in tqdm(jobs) if verbose else jobs:
+        clid = parent_ids[i]
+        mmain_indxs = cat["index"][cat["parent"] == clid]

-    # Grab unique clump IDs and loop over them
-    unique_clumpids = numpy.unique(clump_ids)
+        mmain_mask = numpy.isin(clump_ids, mmain_indxs, assume_unique=True)
+        mmain_particles = part0[mmain_mask]

-    njobs = unique_clumpids.size
-    jobs = csiborgtools.utils.split_jobs(njobs, nproc)[rank]
-    for i in jobs:
-        n = unique_clumpids[i]
-        x0 = part0[clump_ids == n]
+        raddist, cmpos = csiborgtools.match.dist_centmass(mmain_particles)
+        patchsize = csiborgtools.match.dist_percentile(raddist, [99], distmax=0.075)
+        with open(ftemp.format(nsim, clid, "fit"), "wb") as f:
+            numpy.savez(f, cmpos=cmpos, patchsize=patchsize)

-        # Center of mass and Lagrangian patch size
-        dist, cm = csiborgtools.match.dist_centmass(x0)
-        patch = csiborgtools.match.dist_percentile(dist, [99], distmax=0.075)
-
-        # Dump the center of mass
-        with open(ftemp.format(nsim, n, "cm"), 'wb') as f:
-            numpy.save(f, cm)
-        # Dump the Lagrangian patch size
-        with open(ftemp.format(nsim, n, "lagpatch"), 'wb') as f:
-            numpy.save(f, patch)
-        # Dump the entire clump
-        if args.dump_clumps:
-            with open(ftemp.format(nsim, n, "clump"), "wb") as f:
-                numpy.save(f, x0)
+        if args.dump:
+            with open(ftemp.format(nsim, clid, "particles"), "wb") as f:
+                numpy.save(f, mmain_particles)

+    # We force clean up the memory before continuing.
    del part0, clump_ids
    collect()

+    # We now wait for all processes and then use the 0th process to collect the results.
+    # We first collect just the Lagrangian patch size information.
    comm.Barrier()
    if rank == 0:
-        print("{}: collecting summary files...".format(datetime.now()),
-              flush=True)
-        # Collect the centre of masses, patch size, etc. and dump them
-        dtype = {"names": ['x', 'y', 'z', "lagpatch", "ID"],
-                 "formats": [numpy.float32] * 4 + [numpy.int32]}
-        out = numpy.full(njobs, numpy.nan, dtype=dtype)
-
-        for i, n in enumerate(unique_clumpids):
-            # Load in CM vector
-            fpath = ftemp.format(nsim, n, "cm")
+        print("{}: collecting fits...".format(datetime.now()), flush=True)
+        dtype = {
+            "names": ["index", "x", "y", "z", "lagpatch"],
+            "formats": [numpy.int32] + [numpy.float32] * 4,
+        }
+        out = numpy.full(parent_ids.size, numpy.nan, dtype=dtype)
+        for i, clid in enumerate(parent_ids):
+            fpath = ftemp.format(nsim, clid, "fit")
            with open(fpath, "rb") as f:
-                fin = numpy.load(f)
-                out['x'][i] = fin[0]
-                out['y'][i] = fin[1]
-                out['z'][i] = fin[2]
+                inp = numpy.load(f)
+                out["index"][i] = clid
+                out["x"][i] = inp["cmpos"][0]
+                out["y"][i] = inp["cmpos"][1]
+                out["z"][i] = inp["cmpos"][2]
+                out["lagpatch"][i] = inp["patchsize"]
            remove(fpath)

-            # Load in the patch size
-            fpath = ftemp.format(nsim, n, "lagpatch")
-            with open(fpath, "rb") as f:
-                out["lagpatch"][i] = numpy.load(f)
-            remove(fpath)
-
-            # Store the halo ID
-            out["ID"][i] = n
-
-        print("{}: dumping to .. `{}`.".format(
-            datetime.now(), paths.initmatch_path(nsim, "cm")), flush=True)
-        with open(paths.initmatch_path(nsim, "cm"), 'wb') as f:
+        fout = paths.initmatch_path(nsim, "fit")
+        print("{}: dumping fits to .. `{}`.".format(datetime.now(), fout), flush=True)
+        with open(fout, "wb") as f:
            numpy.save(f, out)

-        if args.dump_clumps:
-            print("{}: collecting particle files...".format(datetime.now()),
-                  flush=True)
-            out = [None] * unique_clumpids.size
-            dtype = {"names": ["clump", "ID"],
-                     "formats": [object, numpy.int32]}
-            out = numpy.full(unique_clumpids.size, numpy.nan, dtype=dtype)
-            for i, n in enumerate(unique_clumpids):
-                fpath = ftemp.format(nsim, n, "clump")
-                with open(fpath, 'rb') as f:
-                    fin = numpy.load(f)
-                out["clump"][i] = fin
-                out["ID"][i] = n
-                remove(fpath)
+        # We now optionally collect the individual clumps and store them in an archive,
+        # which has the benefit of being a single file that can be easily read in.
+        if args.dump:
+            print("{}: collecting particles...".format(datetime.now()), flush=True)
+            out = {}
+            for clid in parent_ids:
+                fpath = ftemp.format(nsim, clid, "particles")
+                with open(fpath, "rb") as f:
+                    out.update({str(clid): numpy.load(f)})

            fout = paths.initmatch_path(nsim, "particles")
-            print("{}: dumping to .. `{}`.".format(datetime.now(), fout),
-                  flush=True)
+            print(
+                "{}: dumping particles to .. `{}`.".format(datetime.now(), fout),
+                flush=True,
+            )
            with open(fout, "wb") as f:
-                numpy.save(f, out)
+                numpy.savez(f, **out)

+            # Again we force clean up the memory before continuing.
            del out
-            collect()
+            collect()