From 153f1c00021c013c3c9d65677c5205f176874e42 Mon Sep 17 00:00:00 2001 From: Richard Stiskalek Date: Wed, 15 Mar 2023 20:55:50 +0000 Subject: [PATCH] Overlap reader thresholds (#31) * Improve data * Add comment * Update how KNN is called * Bring back indices * New function output * return catx["index"] * Remove unnecessary arguments * Remove useless arguments * Rename output * thin up catalogues * Add thresholding * Update README --- README.md | 9 +- csiborgtools/match/match.py | 25 ++-- csiborgtools/read/make_cat.py | 117 ++++++----------- csiborgtools/read/summaries.py | 227 ++++++++++++++++++--------------- scripts/run_singlematch.py | 7 +- 5 files changed, 183 insertions(+), 202 deletions(-) diff --git a/README.md b/README.md index 91b5435..d1317c6 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,14 @@ -# CSiBORG tools +# CSiBORGTools -## CSiBORG Matching - -### TODO -- [ ] Modify the call to tN ### Questions -- What scaling of the search region? No reason for it to be a multiple of $R_{200c}$. - How well can observed clusters be matched to CSiBORG? Do their masses agree? - Is the number of clusters in CSiBORG consistent? - ## CSiBORG Galaxy Environmental Dependence ### TODO - [ ] Add gradient and Hessian of the overdensity field. -- [x] Write a script to smoothen an overdensity field, calculate the derived fields and evaluate them at the galaxy positions. ### Questions diff --git a/csiborgtools/match/match.py b/csiborgtools/match/match.py index 60e50b4..1e20f09 100644 --- a/csiborgtools/match/match.py +++ b/csiborgtools/match/match.py @@ -215,7 +215,7 @@ class RealisationsMatcher: mapping[ind2] = ind1 return mapping - def cross(self, nsim0, nsimx, cat0, catx, overlap=False, verbose=True): + def cross(self, cat0, catx, overlap=False, verbose=True): r""" Find all neighbours whose CM separation is less than `nmult` times the sum of their initial Lagrangian patch sizes. Enforces that the @@ -223,10 +223,9 @@ class RealisationsMatcher: Parameters ---------- - nsim0, nsimx : int - The reference and cross simulation IDs. cat0, catx: :py:class:`csiborgtools.read.HaloCatalogue` - Halo catalogues corresponding to `nsim0` and `nsimx`, respectively. + Halo catalogues corresponding to the reference and cross + simulations. overlap : bool, optional whether to calculate overlap between clumps in the initial snapshot. by default `false`. this operation is slow. @@ -235,22 +234,22 @@ class RealisationsMatcher: Returns ------- - indxs : 1-dimensional array of shape `(nhalos, )` + ref_indxs : 1-dimensional array Indices of halos in the reference catalogue. + cross_indxs : 1-dimensional array + Indices of halos in the cross catalogue. match_indxs : 1-dimensional array of arrays Indices of halo counterparts in the cross catalogue. overlaps : 1-dimensional array of arrays Overlaps with the cross catalogue. """ - assert (nsim0 == cat0.paths.n_sim) & (nsimx == catx.paths.n_sim) - # Query the KNN if verbose: print("{}: querying the KNN.".format(datetime.now()), flush=True) match_indxs = radius_neighbours( - catx.knn0, cat0.positions0, radiusX=cat0["lagpatch"], - radiusKNN=catx["lagpatch"], nmult=self.nmult, enforce_in32=True, - verbose=verbose) + catx.knn(select_initial=True), cat0.positions0, + radiusX=cat0["lagpatch"], radiusKNN=catx["lagpatch"], + nmult=self.nmult, enforce_in32=True, verbose=verbose) # Remove neighbours whose mass is too large/small if self.dlogmass is not None: @@ -265,9 +264,9 @@ class RealisationsMatcher: if overlap: if verbose: print("Loading the clump particles", flush=True) - with open(cat0.paths.clump0_path(nsim0), "rb") as f: + with open(cat0.paths.clump0_path(cat0.n_sim), "rb") as f: clumps0 = numpy.load(f, allow_pickle=True) - with open(catx.paths.clump0_path(nsimx), 'rb') as f: + with open(catx.paths.clump0_path(catx.n_sim), 'rb') as f: clumpsx = numpy.load(f, allow_pickle=True) # Convert 3D positions to particle IDs @@ -320,7 +319,7 @@ class RealisationsMatcher: match_indxs[k] = match_indxs[k][mask] cross[k] = cross[k][mask] - return cat0["index"], match_indxs, cross + return cat0["index"], catx["index"], match_indxs, cross ############################################################################### diff --git a/csiborgtools/read/make_cat.py b/csiborgtools/read/make_cat.py index b590c5f..ca3c7dd 100644 --- a/csiborgtools/read/make_cat.py +++ b/csiborgtools/read/make_cat.py @@ -31,38 +31,24 @@ class HaloCatalogue: ---------- paths : py:class:`csiborgtools.read.CSiBORGPaths` CSiBORG paths-handling object with set `n_sim` and `n_snap`. - min_m500 : float, optional - The minimum :math:`M_{rm 500c} / M_\odot` mass. By default no - threshold. + min_mass : float, optional + The minimum :math:`M_{rm tot} / M_\odot` mass. By default no threshold. max_dist : float, optional The maximum comoving distance of a halo. By default no upper limit. """ _box = None _paths = None _data = None - _knn = None - _knn0 = None - _positions = None - _positions0 = None + _selmask = None - def __init__(self, nsim, min_m500=None, max_dist=None): + def __init__(self, nsim, min_mass=None, max_dist=None): # Set up paths paths = CSiBORGPaths(n_sim=nsim) paths.n_snap = paths.get_maximum_snapshot() self._paths = paths self._box = BoxUnits(paths) - min_m500 = 0 if min_m500 is None else min_m500 - max_dist = numpy.infty if max_dist is None else max_dist self._paths = paths - self._set_data(min_m500, max_dist) - # Initialise the KNN at z = 0 and at z = 70 - knn = NearestNeighbors() - knn.fit(self.positions) - self._knn = knn - - knn0 = NearestNeighbors() - knn0.fit(self.positions0) - self._knn0 = knn0 + self._set_data(min_mass, max_dist) @property def data(self): @@ -88,17 +74,6 @@ class HaloCatalogue: """ return self._box - @property - def cosmo(self): - """ - The box cosmology. - - Returns - ------- - cosmo : `astropy` cosmology object - """ - return self.box.cosmo - @property def paths(self): """ @@ -132,29 +107,23 @@ class HaloCatalogue: """ return self.paths.n_sim - @property - def knn(self): + def knn(self, select_initial): """ The final snapshot k-nearest neighbour object. - Returns - ------- - knn : :py:class:`sklearn.neighbors.NearestNeighbors` - """ - return self._knn - - @property - def knn0(self): - """ - The initial snapshot k-nearest neighbour object. + Parameters + ---------- + select_initial : bool + Whether to define the KNN on the initial or final snapshot. Returns ------- knn : :py:class:`sklearn.neighbors.NearestNeighbors` """ - return self._knn0 + knn = NearestNeighbors() + return knn.fit(self.positions0 if select_initial else self.positions) - def _set_data(self, min_m500, max_dist): + def _set_data(self, min_mass, max_dist): """ Loads the data, merges with mmain, does various coordinate transforms. """ @@ -168,7 +137,7 @@ class HaloCatalogue: data = self.merge_mmain_to_clumps(data, mmain) flip_cols(data, "peak_x", "peak_z") - # Cut on number of particles and finite m200 + # Cut on number of particles and finite m200. Do not change! Hardcoded data = data[(data["npart"] > 100) & numpy.isfinite(data["m200"])] # Now also load the initial positions @@ -177,16 +146,15 @@ class HaloCatalogue: data = self.merge_initmatch_to_clumps(data, initcm) flip_cols(data, "x0", "z0") - # Calculate redshift - pos = [data["peak_{}".format(p)] - 0.5 for p in ("x", "y", "z")] - vel = [data["v{}".format(p)] for p in ("x", "y", "z")] - zpec = self.box.box2pecredshift(*vel, *pos) - zobs = self.box.box2obsredshift(*vel, *pos) - zcosmo = self.box.box2cosmoredshift( - sum(pos[i]**2 for i in range(3))**0.5) - - data = add_columns(data, [zpec, zobs, zcosmo], - ["zpec", "zobs", "zcosmo"]) +# # Calculate redshift +# pos = [data["peak_{}".format(p)] - 0.5 for p in ("x", "y", "z")] +# vel = [data["v{}".format(p)] for p in ("x", "y", "z")] +# zpec = self.box.box2pecredshift(*vel, *pos) +# zobs = self.box.box2obsredshift(*vel, *pos) +# zcosmo = self.box.box2cosmoredshift( +# sum(pos[i]**2 for i in range(3))**0.5) +# data = add_columns(data, [zpec, zobs, zcosmo], +# ["zpec", "zobs", "zcosmo"]) # Unit conversion convert_cols = ["m200", "m500", "totpartmass", "mass_mmain", @@ -194,29 +162,15 @@ class HaloCatalogue: "peak_x", "peak_y", "peak_z"] data = self.box.convert_from_boxunits(data, convert_cols) - # Cut on mass. Note that this is in Msun - data = data[data["m500"] > min_m500] - # Now calculate spherical coordinates d, ra, dec = cartesian_to_radec( data["peak_x"], data["peak_y"], data["peak_z"]) - data = add_columns(data, [d, ra, dec], ["dist", "ra", "dec"]) - # Cut on separation - data = data[data["dist"] < max_dist] - - # Pre-allocate the positions arrays - self._positions = numpy.vstack( - [data["peak_{}".format(p)] for p in ("x", "y", "z")]).T - self._positions = self._positions.astype(numpy.float32) # And do the unit transform if initcm is not None: data = self.box.convert_from_boxunits( data, ["x0", "y0", "z0", "lagpatch"]) - self._positions0 = numpy.vstack( - [data["{}0".format(p)] for p in ("x", "y", "z")]).T - self._positions0 = self._positions0.astype(numpy.float32) # Convert all that is not an integer to float32 names = list(data.dtype.names) @@ -227,9 +181,13 @@ class HaloCatalogue: else: formats.append(numpy.float32) dtype = numpy.dtype({"names": names, "formats": formats}) - data = data.astype(dtype) - self._data = data + # Apply cuts on distance and min500 if any + data = data[data["dist"] < max_dist] if max_dist is not None else data + data = (data[data["totpartmass"] > min_mass] + if min_mass is not None else data) + + self._data = data.astype(dtype) def merge_mmain_to_clumps(self, clumps, mmain): """ @@ -288,8 +246,8 @@ class HaloCatalogue: @property def positions(self): - """ - 3D positions of halos in comoving units of Mpc. + r""" + 3D positions of halos in :math:`\mathrm{cMpc}`. Returns ------- @@ -297,12 +255,13 @@ class HaloCatalogue: Array of shape `(n_halos, 3)`, where the latter axis represents `x`, `y` and `z`. """ - return self._positions + return numpy.vstack( + [self._data["peak_{}".format(p)] for p in ("x", "y", "z")]).T @property def positions0(self): r""" - 3D positions of halos in the initial snapshot in comoving units of Mpc. + 3D positions of halos in the initial snapshot in :math:`\mathrm{cMpc}`. Returns ------- @@ -310,9 +269,11 @@ class HaloCatalogue: Array of shape `(n_halos, 3)`, where the latter axis represents `x`, `y` and `z`. """ - if self._positions0 is None: + try: + return numpy.vstack( + [self._data["{}".format(p)] for p in ("x0", "y0", "z0")]).T + except KeyError: raise RuntimeError("Initial positions are not set!") - return self._positions0 @property def velocities(self): @@ -365,7 +326,7 @@ class HaloCatalogue: """ if not (X.ndim == 2 and X.shape[1] == 3): raise TypeError("`X` must be an array of shape `(n_samples, 3)`.") - knn = self.knn0 if select_initial else self.knn # Pick the right KNN + knn = self.knn(select_initial) return knn.radius_neighbors(X, radius, sort_results=True) @property diff --git a/csiborgtools/read/summaries.py b/csiborgtools/read/summaries.py index 897e548..20e1507 100644 --- a/csiborgtools/read/summaries.py +++ b/csiborgtools/read/summaries.py @@ -17,9 +17,8 @@ Tools for summarising various results. """ import numpy import joblib -from os.path import isfile +from os.path import (join, isfile) from tqdm import tqdm -from .make_cat import HaloCatalogue class PKReader: @@ -171,73 +170,64 @@ class PKReader: class OverlapReader: - """ + r""" A shortcut object for reading in the results of matching two simulations. Parameters ---------- - nsim0 : int - The reference simulation ID. - nsimx : int - The cross simulation ID. + cat0, catx: :py:class:`csiborgtools.read.HaloCatalogue` + Halo catalogues corresponding to the reference and cross + simulations. fskel : str, optional Path to the overlap. By default `None`, i.e. `/mnt/extraspace/rstiskalek/csiborg/overlap/cross_{}_{}.npz`. + min_mass : float, optional + The minimum :math:`M_{\rm tot} / M_\odot` mass. By default no + threshold. + max_dist : float, optional + The maximum comoving distance of a halo. By default no upper limit. """ - def __init__(self, nsim0, nsimx, fskel=None): - if fskel is None: - fskel = "/mnt/extraspace/rstiskalek/csiborg/overlap/" - fskel += "cross_{}_{}.npz" + _cat0 = None + _catx = None + _refmask = None - fpath = fskel.format(nsim0, nsimx) - fpath_inv = fskel.format(nsimx, nsim0) - is_inverted = False + def __init__(self, cat0, catx, fskel=None, min_mass=None, max_dist=None): + self._cat0 = cat0 + self._catx = catx + + if fskel is None: + fskel = join("/mnt/extraspace/rstiskalek/csiborg/overlap", + "cross_{}_{}.npz") + + fpath = fskel.format(cat0.n_sim, catx.n_sim) + fpath_inv = fskel.format(catx.n_sim, cat0.n_sim) if isfile(fpath): - pass + is_inverted = False elif isfile(fpath_inv): fpath = fpath_inv is_inverted = True - nsim0, nsimx = nsimx, nsim0 else: raise FileNotFoundError( "No overlap file found for combination `{}` and `{}`." - .format(nsim0, nsimx)) + .format(cat0.n_sim, catx.n_sim)) # We can set catalogues already now even if inverted - self._set_cats(nsim0, nsimx) - print(is_inverted) data = numpy.load(fpath, allow_pickle=True) if is_inverted: inv_match_indxs, inv_overlap = self._invert_match( - data["match_indxs"], data["cross"], self.cat0["index"].size,) - # Overwrite the original file and store as a dictionary - data = {"indxs": self.cat0["index"], - "match_indxs": inv_match_indxs, - "cross": inv_overlap, - } - self._data = data + data["match_indxs"], data["overlap"], + data["cross_indxs"].size,) + self._data = { + "index": data["cross_indxs"], + "match_indxs": inv_match_indxs, + "overlap": inv_overlap} + else: + self._data = { + "index": data["ref_indxs"], + "match_indxs": data["match_indxs"], + "overlap": data["overlap"]} - @property - def nsim0(self): - """ - The reference simulation ID. - - Returns - ------- - nsim0 : int - """ - return self._nsim0 - - @property - def nsimx(self): - """ - The cross simulation ID. - - Returns - ------- - nsimx : int - """ - return self._nsimx + self._make_refmask(min_mass, max_dist) @property def cat0(self): @@ -261,24 +251,6 @@ class OverlapReader: """ return self._catx - def _set_cats(self, nsim0, nsimx): - """ - Set the simulation IDs and catalogues. - - Parameters - ---------- - nsim0, nsimx : int - The reference and cross simulation IDs. - - Returns - ------- - None - """ - self._nsim0 = nsim0 - self._nsimx = nsimx - self._cat0 = HaloCatalogue(nsim0) - self._catx = HaloCatalogue(nsimx) - @staticmethod def _invert_match(match_indxs, overlap, cross_size): """ @@ -304,8 +276,8 @@ class OverlapReader: The corresponding overlaps to `inv_match_indxs`. """ # 1. Invert the match. Each reference halo has a list of counterparts - # so loop over those to each counterpart assign a reference halo. - # Add the same time also add the overlaps + # so loop over those to each counterpart assign a reference halo + # and at the same time also add the overlaps inv_match_indxs = [[] for __ in range(cross_size)] inv_overlap = [[] for __ in range(cross_size)] for ref_id in range(match_indxs.size): @@ -330,6 +302,35 @@ class OverlapReader: return inv_match_indxs, inv_overlap + def _make_refmask(self, min_mass, max_dist): + r""" + Create a mask for the reference catalogue that accounts for the mass + and distance cuts. Note that *no* masking is applied to the cross + catalogue. + + Parameters + ---------- + min_mass : float, optional + The minimum :math:`M_{rm tot} / M_\odot` mass. + max_dist : float, optional + The maximum comoving distance of a halo. + + Returns + ------- + None + """ + # Enforce a cut on the reference catalogue + min_mass = 0 if min_mass is None else min_mass + max_dist = numpy.infty if max_dist is None else max_dist + m = ((self.cat0["totpartmass"] > min_mass) + & (self.cat0["dist"] < max_dist)) + # Now remove indices that are below this cut + self._data["index"] = self._data["index"][m] + self._data["match_indxs"] = self._data["match_indxs"][m] + self._data["overlap"] = self._data["overlap"][m] + + self._refmask = m + @property def indxs(self): """ @@ -339,7 +340,7 @@ class OverlapReader: ------- indxs : 1-dimensional array """ - return self._data["indxs"] + return self._data["index"] @property def match_indxs(self): @@ -361,47 +362,73 @@ class OverlapReader: ------- overlap : array of 1-dimensional arrays of shape `(nhalos, )` """ - return self._data["cross"] + return self._data["overlap"] - def dist(self, in_initial, norm=None): + @property + def refmask(self): """ - Final snapshot pair distances. + Mask of the reference catalogue to match the calculated overlaps. + + Returns + ------- + refmask : 1-dimensional boolean array + """ + return self._refmask + + def dist(self, in_initial, norm_kind=None): + """ + Pair distances of matched halos between the reference and cross + simulations. Parameters ---------- in_initial : bool Whether to calculate separation in the initial or final snapshot. + norm_kind : str, optional + The kind of normalisation to apply to the distances. Can be `r200`, + `ref_patch` or `sum_patch`. Returns ------- dist : array of 1-dimensional arrays of shape `(nhalos, )` """ - assert norm is None or norm in ("r200", "ref_patch", "sum_patch") - # Positions either in the initial or final snapshot + assert (norm_kind is None + or norm_kind in ("r200", "ref_patch", "sum_patch")) + # Get positions either in the initial or final snapshot if in_initial: - pos0 = self.cat0.positions0 - posx = self.catx.positions0 + pos0, posx = self.cat0.positions0, self.catx.positions0 else: - pos0 = self.cat0.positions - posx = self.catx.positions + pos0, posx = self.cat0.positions, self.catx.positions + pos0 = pos0[self.refmask, :] # Apply the reference catalogue mask + # Get the normalisation array if applicable + if norm_kind == "r200": + norm = self.cat0["r200"][self.refmask] + if norm_kind == "ref_patch": + norm = self.cat0["lagpatch"][self.refmask] + if norm_kind == "sum_patch": + patch0 = self.cat0["lagpatch"][self.refmask] + patchx = self.catx["lagpatch"] + norm = [None] * self.indxs.size + for i, ind in enumerate(self.match_indxs): + norm[i] = patch0[i] + patchx[ind] + norm = numpy.array(norm, dtype=object) + + # Now calculate distances dist = [None] * self.indxs.size - for n, ind in enumerate(self.match_indxs): - dist[n] = numpy.linalg.norm(pos0[n, :] - posx[ind, :], axis=1) + for i, ind in enumerate(self.match_indxs): + # n refers to the reference halo catalogue position + dist[i] = numpy.linalg.norm(pos0[i, :] - posx[ind, :], axis=1) + + if norm_kind is not None: + dist[i] /= norm[i] - # Normalisation - if norm == "r200": - dist[n] /= self.cat0["r200"][n] - if norm == "ref_patch": - dist[n] /= self.cat0["lagpatch"][n] - if norm == "sum_patch": - dist[n] /= (self.cat0["lagpatch"][n] - + self.catx["lagpatch"][ind]) return numpy.array(dist, dtype=object) def mass_ratio(self, mass_kind="totpartmass", in_log=True, in_abs=True): """ - Pair mass ratio. + Pair mass ratio of matched halos between the reference and cross + simulations. Parameters ---------- @@ -418,16 +445,16 @@ class OverlapReader: ------- ratio : array of 1-dimensional arrays of shape `(nhalos, )` """ - mass0 = self.cat0[mass_kind] + mass0 = self.cat0[mass_kind][self.refmask] massx = self.catx[mass_kind] ratio = [None] * self.indxs.size - for n, ind in enumerate(self.match_indxs): - ratio[n] = mass0[n] / massx[ind] + for i, ind in enumerate(self.match_indxs): + ratio[i] = mass0[i] / massx[ind] if in_log: - ratio[n] = numpy.log10(ratio[n]) + ratio[i] = numpy.log10(ratio[i]) if in_abs: - ratio[n] = numpy.abs(ratio[n]) + ratio[i] = numpy.abs(ratio[i]) return numpy.array(ratio, dtype=object) def summed_overlap(self): @@ -456,9 +483,10 @@ class OverlapReader: ------- out : 1-dimensional array of shape `(nhalos, )` """ + vals = self.cat0[par][self.refmask] out = [None] * self.indxs.size - for n, ind in enumerate(self.match_indxs): - out[n] = numpy.ones(ind.size) * self.cat0[par][n] + for i, ind in enumerate(self.match_indxs): + out[i] = numpy.ones(ind.size) * vals[i] return numpy.array(out, dtype=object) def prob_nomatch(self): @@ -504,14 +532,13 @@ class OverlapReader: massx = self.catx[mass_kind] # Create references to the arrays here overlap = self.overlap # to speed up the loop below. - # Is the iterator verbose? - for n, match_ind in enumerate((self.match_indxs)): + for i, match_ind in enumerate(self.match_indxs): # Skip if no match if match_ind.size == 0: continue massx_ = massx[match_ind] # Again just create references - overlap_ = overlap[n] # to the appropriate elements + overlap_ = overlap[i] # to the appropriate elements # Optionally apply overlap threshold if overlap_threshold > 0.: @@ -530,8 +557,8 @@ class OverlapReader: mean_ = 10**mean_ if in_log else mean_ std_ = mean_ * std_ * numpy.log(10) if in_log else std_ - mean[n] = mean_ - std[n] = std_ + mean[i] = mean_ + std[i] = std_ return mean, std diff --git a/scripts/run_singlematch.py b/scripts/run_singlematch.py index 32c1996..44f1a8b 100644 --- a/scripts/run_singlematch.py +++ b/scripts/run_singlematch.py @@ -46,12 +46,13 @@ catx = csiborgtools.read.HaloCatalogue(args.nsimx) matcher = csiborgtools.match.RealisationsMatcher() print("{}: crossing the simulations.".format(datetime.now()), flush=True) -indxs, match_indxs, cross = matcher.cross( - args.nsim0, args.nsimx, cat0, catx, overlap=args.overlap) +ref_indxs, cross_indxs, match_indxs, overlap = matcher.cross( + cat0, catx, overlap=args.overlap) # Dump the result print("Saving results to `{}`.".format(fout), flush=True) with open(fout, "wb") as f: - numpy.savez(fout, indxs=indxs, match_indxs=match_indxs, cross=cross) + numpy.savez(fout, ref_indxs=ref_indxs, cross_indxs=cross_indxs, + match_indxs=match_indxs, overlap=overlap) print("All finished.", flush=True)