Overlapper improvements (#53)

* Store indices as f32

* Fix init sorting

* Organise imports

* Rename pathing

* Add particle loading

* Improve particle reading

* Add h5py reader

* Edit particle path

* Update particles loading

* Update particles loading

* Fix particle dumping

* Add init fitting

* Fix bug due to insufficient precision

* Add comment

* Add comment

* Add clumps catalogue to halo cat

* Add comment

* Make sure PIDs are never forced to float32

* Fix PID reading

* Fix PID reading

* Update matching to work with new arrays

* Stop using cubical sub-boxes; turn off nshift if no smoothing

* Improve caching

* Move function definitions

* Simplify calculation

* Add import

* Small updates to the halo

* Simplify calculation

* Simplify looping calculation

* Fix tonew

* Add initial data

* Add skip condition

* Add unit conversion

* Add loading background in batches

* Rename mmain index

* Switch overlaps to h5

* Add finite lagpatch check

* Fix column name

* Add verbosity flags

* Save halo IDs instead.

* Switch back to npz

* Delete nbs

* Reduce size of the box

* Load correct background of halos being matched

* Remove verbosity

* Verbosity edits

* Change lower thresholds

Richard Stiskalek, 2023-05-06 16:52:48 +01:00, committed by GitHub
parent 1c9dacfde5
commit 56e39a8b1d
20 changed files with 864 additions and 3816 deletions

@@ -12,18 +12,20 @@
 # with this program; if not, write to the Free Software Foundation, Inc.,
 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 """
-Script to load in the simulation particles and dump them to a HDF5 file.
-Creates a mapping to access directly particles of a single clump.
+Script to load in the simulation particles, sort them by their clump ID and
+dump them into a HDF5 file. Stores the first and last index of each clump in
+the particle array; this can be used for fast slicing of the array to access
+the particles of a single clump.
 """
 from datetime import datetime
-from distutils.util import strtobool
 from gc import collect

 import h5py
+import numba
 import numpy
 from mpi4py import MPI
-from tqdm import tqdm
+from tqdm import trange

 try:
     import csiborgtools
@@ -44,75 +46,109 @@ nproc = comm.Get_size()

 parser = ArgumentParser()
 parser.add_argument("--ics", type=int, nargs="+", default=None,
                     help="IC realisations. If `-1` processes all simulations.")
-parser.add_argument("--pos_only", type=lambda x: bool(strtobool(x)),
-                    help="Do we only dump positions?")
-parser.add_argument("--dtype", type=str, choices=["float32", "float64"],
-                    default="float32",)
 args = parser.parse_args()
 verbose = nproc == 1
 paths = csiborgtools.read.CSiBORGPaths(**csiborgtools.paths_glamdring)
 partreader = csiborgtools.read.ParticleReader(paths)
-if args.pos_only:
-    pars_extract = ['x', 'y', 'z', 'M']
-else:
-    pars_extract = ['x', 'y', 'z', 'vx', 'vy', 'vz', 'M']
+# Keep "ID" as the last column!
+pars_extract = ['x', 'y', 'z', 'vx', 'vy', 'vz', 'M', "ID"]
 if args.ics is None or args.ics[0] == -1:
     ics = paths.get_ics(tonew=False)
 else:
     ics = args.ics
+@numba.jit(nopython=True)
+def minmax_clump(clid, clump_ids, start_loop=0):
+    """
+    Find the start and end index of a clump in a sorted array of clump IDs.
+    This is much faster than using `numpy.where` followed by `numpy.min` and
+    `numpy.max`, since it scans the sorted array only once.
+    """
+    start = None
+    end = None
+    for i in range(start_loop, clump_ids.size):
+        n = clump_ids[i]
+        if n == clid:
+            if start is None:
+                start = i
+            end = i
+        elif n > clid:
+            break
+    return start, end
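
As a quick sanity check (a minimal sketch, not part of the commit), the linear scan agrees with the `numpy.where` approach that the docstring says it replaces:

import numpy

clump_ids = numpy.array([1, 1, 2, 2, 2, 5, 7, 7])  # already sorted
k0, kf = minmax_clump(2, clump_ids)                 # first/last index of clump 2 -> (2, 4)
# The replaced approach scans the whole array and allocates an index array:
ks = numpy.where(clump_ids == 2)[0]
assert (k0, kf) == (ks.min(), ks.max())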

 # MPI loop over individual simulations. We read in the particles from RAMSES
 # files and dump them to a HDF5 file.
 jobs = csiborgtools.fits.split_jobs(len(ics), nproc)[rank]
 for i in jobs:
     nsim = ics[i]
     nsnap = max(paths.get_snapshots(nsim))
-    print(f"{datetime.now()}: Rank {rank} loading particles {nsim}.",
+    fname = paths.particles_path(nsim)
+    # We first read in the clump IDs of the particles and infer the sorting.
+    # Right away we dump the clump IDs to a HDF5 file and clear up memory.
+    print(f"{datetime.now()}: rank {rank} loading particles {nsim}.",
           flush=True)
+    part_cids = partreader.read_clumpid(nsnap, nsim, verbose=verbose)
+    sort_indxs = numpy.argsort(part_cids).astype(numpy.int32)
+    part_cids = part_cids[sort_indxs]
+    with h5py.File(fname, "w") as f:
+        f.create_dataset("clump_ids", data=part_cids)
+    del part_cids
+    collect()
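
This argsort pattern co-sorts several arrays by a single key; a small illustration with hypothetical values:

import numpy

clump_ids = numpy.array([3, 1, 2, 1])
masses = numpy.array([10., 20., 30., 40.])
order = numpy.argsort(clump_ids)  # indices that sort by clump ID
clump_ids = clump_ids[order]      # -> [1, 1, 2, 3]
masses = masses[order]            # stays aligned -> [20., 40., 30., 10.]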
-    parts = partreader.read_particle(nsnap, nsim, pars_extract,
-                                     return_structured=False, verbose=verbose)
-    if args.dtype == "float64":
-        parts = parts.astype(numpy.float64)
-    kind = "pos" if args.pos_only else None
-    print(f"{datetime.now()}: Rank {rank} dumping particles from {nsim}.",
+    # Next we read in the particles and sort them by their clump ID.
+    # The particle IDs cannot go into the same unstructured float array,
+    # since float32 precision is insufficient to represent them exactly.
+    parts, pids = partreader.read_particle(
+        nsnap, nsim, pars_extract, return_structured=False, verbose=verbose)
+    # Now we save the particles and particle IDs in two steps.
+    print(f"{datetime.now()}: rank {rank} dumping particles from {nsim}.",
           flush=True)
+    parts = parts[sort_indxs]
+    pids = pids[sort_indxs]
+    del sort_indxs
+    collect()
-    with h5py.File(paths.particle_h5py_path(nsim, kind, args.dtype), "w") as f:
+    with h5py.File(fname, "r+") as f:
+        f.create_dataset("particle_ids", data=pids)
+    del pids
+    collect()
+    with h5py.File(fname, "r+") as f:
         f.create_dataset("particles", data=parts)
     del parts
     collect()
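
The precision remark in the comment is easy to verify: float32 has a 24-bit significand, so integer IDs above 2**24 are no longer exactly representable (standalone sketch):

import numpy

pid = 2**24 + 1                   # 16777217, a plausible particle ID
print(numpy.float32(pid) == pid)  # False: rounds to 16777216.0
print(numpy.float64(pid) == pid)  # True: float64 is exact up to 2**53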
print(f"{datetime.now()}: Rank {rank} finished dumping of {nsim}.",
flush=True)
# If we are dumping only particle positions, then we are done.
if args.pos_only:
continue
print(f"{datetime.now()}: Rank {rank} mapping particles from {nsim}.",
print(f"{datetime.now()}: rank {rank} creating clump mapping for {nsim}.",
flush=True)
# If not, then load the clump IDs and prepare the memory mapping. We find
# which array positions correspond to which clump IDs and save it. With
# this we can then lazily load into memory the particles for each clump.
part_cids = partreader.read_clumpid(nsnap, nsim, verbose=verbose)
cat = csiborgtools.read.ClumpsCatalogue(nsim, paths, load_fitted=False,
rawdata=True)
clumpinds = cat["index"]
# Some of the clumps have no particles, so we do not loop over them
clumpinds = clumpinds[numpy.isin(clumpinds, part_cids)]
out = {}
for i, cid in enumerate(tqdm(clumpinds) if verbose else clumpinds):
out.update({str(cid): numpy.where(part_cids == cid)[0]})
# Load clump IDs back to memory
with h5py.File(fname, "r") as f:
part_cids = f["clump_ids"][:]
# We loop over the unique clump IDs.
unique_clump_ids = numpy.unique(part_cids)
clump_map = numpy.full((unique_clump_ids.size, 3), numpy.nan,
dtype=numpy.int32)
start_loop = 0
niters = unique_clump_ids.size
for i in trange(niters) if verbose else range(niters):
clid = unique_clump_ids[i]
k0, kf = minmax_clump(clid, part_cids, start_loop=start_loop)
clump_map[i, 0] = clid
clump_map[i, 1] = k0
clump_map[i, 2] = kf
start_loop = kf
# We save the mapping to a HDF5 file
with h5py.File(paths.particle_h5py_path(nsim, "clumpmap"), "w") as f:
for cid, indxs in out.items():
f.create_dataset(cid, data=indxs)
with h5py.File(paths.particles_path(nsim), "r+") as f:
f.create_dataset("clumpmap", data=clump_map)
f.close()
del part_cids, cat, clumpinds, out
del part_cids
collect()
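
Downstream, the stored `clumpmap` dataset enables the fast slicing promised in the docstring; a minimal consumer sketch (the file path and clump ID are hypothetical):

import h5py

fname = "/path/to/particles_7444.h5"  # hypothetical output of this script
cid = 1234                            # hypothetical clump ID
with h5py.File(fname, "r") as f:
    clump_map = f["clumpmap"][:]      # rows of (clump ID, first index, last index)
    row = clump_map[clump_map[:, 0] == cid][0]
    k0, kf = row[1], row[2]
    # Contiguous read of this clump's particles only; columns follow
    # `pars_extract` (x, y, z, vx, vy, vz, M).
    parts = f["particles"][k0:kf + 1]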