Overlap calculation (#18)

* Move cosine similarity out * Basic overlap calculation * add overlap import * Add clump0 dict path * Update README * fix path bug * Save as dict instead * Change format to array of arrays * Update paths * Take fewer ICs for now * Change to structured array * Add overlap calculation * Start saving IDs * Add a blank space * Update TODO
2025-06-08 01:41:12 +00:00 · 2022-12-19 11:58:22 +01:00 · 2022-12-19 11:58:22 +01:00 · 18f09767f4
commit 18f09767f4
parent 13a9d11afe
6 changed files with 297 additions and 65 deletions
--- a/README.md
+++ b/README.md
@ -7,14 +7,11 @@
 ## Short-term TODO
- [x] Add code to calculate the cross-correlation for resolved region only.
+- [ ] Implement the CIC binning.
- [ ] Calculate the spectra for all 101 boxes and visualise them.
+- [ ] Write a script to perform the matching on a node.
 - [ ] See about the $z=70$ particles.
 ## Long-term TODO
 - [ ] Calculate the cross-correlation in CSiBORG. Should see the scale of the constraints?
 - [ ] Find the distribution of particles in the first snapshot
 - [ ] Implement a custom model for matchin galaxies to halos.
--- a/csiborgtools/match/init.py
+++ b/csiborgtools/match/init.py
@ -13,6 +13,6 @@
 # with this program; if not, write to the Free Software Foundation, Inc.,
 # 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
-from .match import (brute_spatial_separation, RealisationsMatcher)  # noqa
+from .match import (brute_spatial_separation, RealisationsMatcher, cosine_similarity, ParticleOverlap)  # noqa
 from .num_density import (binned_counts, number_density)  # noqa
 # from .correlation import (get_randoms_sphere, sphere_angular_tpcf) # noqa
--- a/csiborgtools/match/match.py
+++ b/csiborgtools/match/match.py
@ -79,6 +79,9 @@ class RealisationsMatcher:
    ----------
    cats : :py:class`csiborgtools.read.CombinedHaloCatalogue`
        Combined halo catalogue to search.
    # NOTE add later
 #    dtype : dtype, optional
 #        Output precision. By default `numpy.float32`.
    """
    _cats = None
@ -122,42 +125,42 @@ class RealisationsMatcher:
        """
        return [i for i in range(self.cats.N) if i != n_sim]
-    def cosine_similarity(self, x, y):
+    def _check_masskind(self, mass_kind):
-        r"""
+        """Check that `mass_kind` is a valid key."""
-        Calculate the cosine similarity between two Cartesian vectors. Defined
+        if mass_kind not in self.cats[0].keys:
-        as :math:`\Sum_{i} x_i y_{i} / (|x| * |y|)`.
+            raise ValueError("Invalid mass kind `{}`.".format(mass_kind))
    @staticmethod
    def _cat2clump_mapping(cat_indxs, clump_indxs):
        """
        Create a mapping from a catalogue array index to a clump array index.
        Parameters
        ----------
-        x : 1-dimensional array
+        cat_indxs : 1-dimensional array
-            The first vector.
+            Clump indices in the catalogue array.
-        y : 1- or 2-dimensional array
+        clump_indxs : 1-dimensional array
-            The second vector. Can be 2-dimensional of shape `(n_samples, 3)`,
+            Clump indices in the clump array.
            in which case the calculation is broadcasted.
        Returns
        -------
-        out : float or 1-dimensional array
+        mapping : 1-dimensional array
-            The cosine similarity. If y is 1-dimensinal returns only a float.
+            Mapping. The array indices match catalogue array and values are
            array positions in the clump array.
        """
-        # Quick check of dimensions
+        mapping = numpy.full(cat_indxs.size, numpy.nan, dtype=int)
-        if x.ndim != 1:
+        __, ind1, ind2 = numpy.intersect1d(clump_indxs, cat_indxs,
-            raise ValueError("`x` must be a 1-dimensional array.")
+                                           return_indices=True)
-        y = y.reshape(-1, 3) if y.ndim == 1 else y
+        mapping[ind2] = ind1
-
+        return mapping
        out = numpy.sum(x * y, axis=1)
        out /= numpy.linalg.norm(x) * numpy.linalg.norm(y, axis=1)
        if out.size == 1:
            return out[0]
        return out
    def cross_knn_position_single(self, n_sim, nmult=5, dlogmass=None,
-                                  init_dist=False, verbose=True):
+                                  mass_kind="totpartmass", init_dist=False,
                                  overlap=False, verbose=True):
        r"""
        Find all neighbours within :math:`n_{\rm mult} R_{200c}` of halos in
        the `nsim`th simulation. Also enforces that the neighbours'
-        :math:`\log M_{200c}` be within `dlogmass` dex.
+        :math:`\log M / M_\odot` be within `dlogmass` dex.
        Parameters
        ----------
@ -169,43 +172,71 @@ class RealisationsMatcher:
            default 5.
        dlogmass : float, optional
            Tolerance on mass logarithmic mass difference. By default `None`.
        mass_kind : str, optional
            The mass kind whose similarity is to be checked. Must be a valid
            catalogue key. By default `totpartmass`, i.e. the total particle
            mass associated with a halo.
        init_dist : bool, optional
            Whether to calculate separation of the initial CMs. By default
            `False`.
        overlap : bool, optional
            Whether to calculate overlap between clumps in the initial
            snapshot. By default `False`. Note that this operation is
            substantially slower.
        verbose : bool, optional
            Iterator verbosity flag. By default `True`.
        Returns
        -------
        matches : composite array
-            Array, indices are `(n_sims - 1, 3, n_halos, n_matches)`. The
+            Array, indices are `(n_sims - 1, 4, n_halos, n_matches)`. The
            2nd axis is `index` of the neighbouring halo in its catalogue,
-            `dist`, which is the 3D distance to the halo whose neighbours are
+            `dist` is the 3D distance to the halo whose neighbours are
-            searched, and `dist0` which is the separation of the initial CMs.
+            searched, `dist0` is the separation of the initial CMs and
-            The latter is calculated only if `init_dist` is `True`.
+            `overlap` is the overlap over the initial clumps, all respectively.
            The latter two are calculated only if `init_dist` or `overlap` is
            `True`.
        TODO:
        - [ ] Precalculate the mapping from halo index to clump array position
        """
-        # Radius, M200c and positions of halos in `n_sim` IC realisation
+        self._check_masskind(mass_kind)
-        logm200 = numpy.log10(self.cats[n_sim]["m200"])
+        # Radius, mass and positions of halos in `n_sim` IC realisation
        logmass = numpy.log10(self.cats[n_sim][mass_kind])
        R = self.cats[n_sim]["r200"]
        pos = self.cats[n_sim].positions
        if init_dist:
            pos0 = self.cats[n_sim].positions0  # These are CM positions
        if overlap:
            if verbose:
                print("Loading initial clump particles for `n_sim = {}`."
                      .format(n_sim))
            # Grab a paths object. What it is set to is unimportant
            paths = self.cats[0].paths
            with open(paths.clump0_path(self.cats.n_sims[n_sim]), "rb") as f:
                clumps0 = numpy.load(f, allow_pickle=True)
            overlapper = ParticleOverlap()
            cat2clumps0 = self._cat2clump_mapping(self.cats[n_sim]["index"],
                                                  clumps0["ID"])
        matches = [None] * (self.cats.N - 1)
        # Verbose iterator
        if verbose:
            iters = enumerate(tqdm(self.search_sim_indices(n_sim)))
        else:
            iters = enumerate(self.search_sim_indices(n_sim))
        iters = enumerate(self.search_sim_indices(n_sim))
        # Search for neighbours in the other simulations
        for count, i in iters:
            dist, indxs = self.cats[i].radius_neigbours(pos, R * nmult)
            # Get rid of neighbors whose mass is too off
            if dlogmass is not None:
                for j, indx in enumerate(indxs):
-                    match_logm200 = numpy.log10(self.cats[i]["m200"][indx])
+                    match_logmass = numpy.log10(self.cats[i][mass_kind][indx])
-                    mask = numpy.abs(match_logm200 - logm200[j]) < dlogmass
+                    mask = numpy.abs(match_logmass - logmass[j]) < dlogmass
                    dist[j] = dist[j][mask]
                    indxs[j] = indx[mask]
            # Find distance to the between the initial CM
            dist0 = [numpy.asanyarray([], dtype=numpy.float64)] * dist.size
            if init_dist:
@ -215,13 +246,47 @@ class RealisationsMatcher:
                    dist0[k] = numpy.linalg.norm(
                        pos0[k] - self.cats[i].positions0[indxs[k]], axis=1)
            # Calculate the initial snapshot overlap
            cross = [numpy.asanyarray([], dtype=numpy.float64)] * dist.size
            if overlap:
                if verbose:
                    print("Loading initial clump particles for `n_sim = {}` "
                          "to compare against `n_sim = {}`.".format(i, n_sim))
                with open(paths.clump0_path(self.cats.n_sims[i]), 'rb') as f:
                    clumpsx = numpy.load(f, allow_pickle=True)
                cat2clumpsx = self._cat2clump_mapping(self.cats[i]["index"],
                                                      clumpsx["ID"])
                # Loop only over halos that have neighbours
                with_neigbours = numpy.where([ii.size > 0 for ii in indxs])[0]
                for k in tqdm(with_neigbours) if verbose else with_neigbours:
                    # Find which clump matches index of this halo from cat
                    match0 = cat2clumps0[k]
                    # Get the clump and pre-calculate its cell assignment
                    cl0 = clumps0["clump"][match0]
                    cl0_cells = overlapper.assign_to_cell(
                        *(cl0[p] for p in ('x', 'y', 'z')))
                    dint = numpy.full(indxs[k].size, numpy.nan, numpy.float64)
                    # Loop over the ones we cross-correlate with
                    for ii, ind in enumerate(indxs[k]):
                        # Again which cross clump to this index
                        matchx = cat2clumpsx[ind]
                        dint[ii] = overlapper.mass_overlap(
                            cl0, clumpsx["clump"][matchx], cl0_cells)
                    cross[k] = dint
            # Append as a composite array
-            matches[count] = numpy.asarray([indxs, dist, dist0], dtype=object)
+            matches[count] = numpy.asarray(
                [indxs, dist, dist0, cross], dtype=object)
        return numpy.asarray(matches, dtype=object)
    def cross_knn_position_all(self, nmult=5, dlogmass=None,
-                               init_dist=False, verbose=True):
+                               mass_kind="totpartmass", init_dist=False,
                               overlap=False, verbose=True):
        r"""
        Find all neighbours within :math:`n_{\rm mult} R_{200c}` of halos in
        all simulations listed in `self.cats`. Also enforces that the
@ -234,9 +299,17 @@ class RealisationsMatcher:
            default 5.
        dlogmass : float, optional
            Tolerance on mass logarithmic mass difference. By default `None`.
        mass_kind : str, optional
            The mass kind whose similarity is to be checked. Must be a valid
            catalogue key. By default `totpartmass`, i.e. the total particle
            mass associated with a halo.
        init_dist : bool, optional
            Whether to calculate separation of the initial CMs. By default
            `False`.
        overlap : bool, optional
            Whether to calculate overlap between clumps in the initial
            snapshot. By default `False`. Note that this operation is
            substantially slower.
        verbose : bool, optional
            Iterator verbosity flag. By default `True`.
@ -251,5 +324,136 @@ class RealisationsMatcher:
        # Loop over each catalogue
        for i in trange(N) if verbose else range(N):
            matches[i] = self.cross_knn_position_single(
-                i, nmult, dlogmass, init_dist)
+                i, nmult, dlogmass, mass_kind=mass_kind, init_dist=init_dist,
                overlap=overlap, verbose=verbose)
        return matches
 ###############################################################################
 #                           Matching statistics                               #
 ###############################################################################
 def cosine_similarity(x, y):
    r"""
    Calculate the cosine similarity between two Cartesian vectors. Defined
    as :math:`\Sum_{i} x_i y_{i} / (|x| * |y|)`.
    Parameters
    ----------
    x : 1-dimensional array
        The first vector.
    y : 1- or 2-dimensional array
        The second vector. Can be 2-dimensional of shape `(n_samples, 3)`,
        in which case the calculation is broadcasted.
    Returns
    -------
    out : float or 1-dimensional array
        The cosine similarity. If y is 1-dimensinal returns only a float.
    """
    # Quick check of dimensions
    if x.ndim != 1:
        raise ValueError("`x` must be a 1-dimensional array.")
    y = y.reshape(-1, 3) if y.ndim == 1 else y
    out = numpy.sum(x * y, axis=1)
    out /= numpy.linalg.norm(x) * numpy.linalg.norm(y, axis=1)
    if out.size == 1:
        return out[0]
    return out
 class ParticleOverlap:
    """
    TODO:
    - [ ] Class documentation
    """
    _bins = None
    def __init__(self, bins=None):
        if bins is None:
            dx = 1 / 2**11
            bins = numpy.arange(0, 1 + dx, dx)
        self.bins = bins
    @property
    def bins(self):
        """
        The grid spacing. Assumed to be equal for all three dimensions. Units
        ought to match the requested coordinates.
        Returns
        -------
        bins : 1-dimensional array
        """
        return self._bins
    @bins.setter
    def bins(self, bins):
        """Sets `bins`."""
        bins = numpy.asarray(bins) if isinstance(bins, list) else bins
        assert bins.ndim == 1, "`bins` must be a 1-dimensional array."
        self._bins = bins
    def assign_to_cell(self, x, y, z):
        """
        Assign particles specified by coordinates `x`, `y`, and `z` to grid
        cells.
        Parameters
        ----------
        x, y, z : 1-dimensional arrays
            Positions of particles in the box.
        Returns
        -------
        cells : 1-dimensional array
            Cell ID of each particle.
        """
        assert x.ndim == 1 and x.size == y.size == z.size
        xbin = numpy.digitize(x, self.bins)
        ybin = numpy.digitize(y, self.bins)
        zbin = numpy.digitize(z, self.bins)
        N = self.bins.size
        return xbin + ybin * N + zbin * N**2
    def mass_overlap(self, clump1, clump2, cells1=None):
        r"""
        Calculate the particle, mass-weighted overlap between two halos.
        Defined as
        ..math::
            (M_{u,1} + M_{u,2}) / (M_1 + M_2),
        where :math:`M_{u, 1}` is the mass of particles of the first halo in
        cells that are also present in the second halo and :math:`M_1` is the
        total particle mass of the first halo.
        Parameters
        ----------
        clump1, clump2 : structured arrays
            Structured arrays corresponding to the two clumps. Should contain
            keys `x`, `y`, `z` and `M`.
        cells1 : 1-dimensional array, optional
            Optionlaly precomputed cells of `clump1`. Be careful when using
            this to ensure it matches `clump1`.
        Returns
        -------
        overlap : float
        """
        # 1-dimensional cell ID of each particle in clump1 and clump2
        if cells1 is None:
            cells1 = self.assign_to_cell(*[clump1[p] for p in ('x', 'y', 'z')])
        cells2 = self.assign_to_cell(*[clump2[p] for p in ('x', 'y', 'z')])
        # Elementwise cells1 in cells2 and vice versa
        m1 = numpy.isin(cells1, cells2)
        m2 = numpy.isin(cells2, cells1)
        # Summed shared mass and the total
        interp = numpy.sum(clump1["M"][m1]) + numpy.sum(clump2["M"][m2])
        mtot = numpy.sum(clump1["M"]) + numpy.sum(clump2["M"])
        return interp / mtot
--- a/csiborgtools/read/make_cat.py
+++ b/csiborgtools/read/make_cat.py
@ -378,7 +378,8 @@ class CombinedHaloCatalogue:
    def __init__(self, paths, min_m500=None, max_dist=None, verbose=True):
        # Read simulations and their maximum snapshots
        # NOTE later change this back to all simulations
-        self._n_sims = [7468, 7588, 8020, 8452, 8836]
+        # self._n_sims = [7468, 7588, 8020, 8452, 8836]
        self._n_sims = [7468, 7588]
 #        self._n_sims = paths.ic_ids
        n_snaps = [paths.get_maximum_snapshot(i) for i in self._n_sims]
        self._n_snaps = numpy.asanyarray(n_snaps)
--- a/csiborgtools/read/readsim.py
+++ b/csiborgtools/read/readsim.py
@ -72,7 +72,6 @@ class CSiBORGPaths:
    _initmatch_path = None
    _to_new = None
    # NOTE deuglify this stuff
    def __init__(self, n_sim=None, n_snap=None, srcdir=None, dumpdir=None,
                 mmain_path=None, initmatch_path=None, to_new=False):
        if srcdir is None:
@ -425,6 +424,24 @@ class CSiBORGPaths:
        n_sim = self.get_n_sim(n_sim)
        return min(self.get_snapshots(n_sim))
    def clump0_path(self, nsim):
        """
        Path to a single dumped clump's particles. This is expected to point
        to a dictonary whose keys are the clump indices and items structured
        arrays with the clump's particles in the initial snapshot.
        Parameters
        ----------
        nsim : int
            Index of the initial conditions (IC) realisation.
        Returns
        -------
        path : str
        """
        cdir = join(self.dumpdir, "initmatch")
        return join(cdir, "clump_{}_{}.npy".format(nsim, "particles"))
    def snapshot_path(self, n_snap=None, n_sim=None):
        """
        Path to a CSiBORG IC realisation snapshot.
@ -839,7 +856,7 @@ def read_mmain(n, srcdir, fname="Mmain_{}.npy"):
    return out
-def read_initcm(n, srcdir, fname="clump_cm_{}.npy"):
+def read_initcm(n, srcdir, fname="clump_{}_cm.npy"):
    """
    Read `clump_cm`, i.e. the center of mass of a clump at redshift z = 70.
    If the file does not exist returns `None`.
--- a/scripts/run_initmatch.py
+++ b/scripts/run_initmatch.py
@ -24,8 +24,7 @@ import numpy
 from datetime import datetime
 from mpi4py import MPI
 from distutils.util import strtobool
-from os.path import join, isdir
+from os.path import join
 from os import mkdir
 from os import remove
 from sys import stdout
 from gc import collect
@ -51,21 +50,15 @@ fin_paths = csiborgtools.read.CSiBORGPaths(to_new=False)
 nsims = init_paths.ic_ids
 # Output files
-dumpdir = "/mnt/extraspace/rstiskalek/csiborg/initmatch"
+dumpdir = "/mnt/extraspace/rstiskalek/csiborg/"
-ftemp = join(dumpdir, "temp", "temp_{}_{}.npy")
+ftemp = join(dumpdir, "temp_initmatch", "temp_{}_{}_{}.npy")
-fperm = join(dumpdir, "clump_cm_{}.npy")
+fpermcm = join(dumpdir, "initmatch", "clump_{}_cm.npy")
 fpermpart = join(dumpdir, "initmatch", "clump_{}_particles.npy")
 for nsim in nsims:
    if rank == 0:
        print("{}: reading simulation {}.".format(datetime.now(), nsim))
        stdout.flush()
    # Check that the output folder for this sim exists
    clumpdumpdir = join(dumpdir, "out_{}".format(nsim))
    if args.dump_clumps and rank == 0 and not isdir(clumpdumpdir):
        mkdir(clumpdumpdir)
    # Barrier to make sure we created the directory with the rank 0
    comm.Barrier()
    # Set the snapshot numbers
    init_paths.set_info(nsim, init_paths.get_minimum_snapshot(nsim))
@ -111,15 +104,16 @@ for nsim in nsims:
        cm = numpy.asanyarray(
            [numpy.average(x0[p], weights=x0["M"]) for p in ('x', 'y', 'z')])
        # Dump the center of mass
-        with open(ftemp.format(nsim, n), 'wb') as f:
+        with open(ftemp.format(nsim, n, "cm"), 'wb') as f:
            numpy.save(f, cm)
        # Optionally dump the entire clump
        if args.dump_clumps:
-            fout = join(clumpdumpdir, "clump_{}.npy".format(n))
+            with open(ftemp.format(nsim, n, "clump"), "wb") as f:
            stdout.flush()
            with open(fout, "wb") as f:
                numpy.save(f, x0)
    del part0, clump_ids
    collect()
    comm.Barrier()
    if rank == 0:
        print("Collecting CM files...")
@ -130,14 +124,33 @@ for nsim in nsims:
        out = numpy.full(njobs, numpy.nan, dtype=dtype)
        for i, n in enumerate(unique_clumpids):
-            with open(ftemp.format(nsim, n), 'rb') as f:
+            fpath = ftemp.format(nsim, n, "cm")
            with open(fpath, 'rb') as f:
                fin = numpy.load(f)
            out['x'][i] = fin[0]
            out['y'][i] = fin[1]
            out['z'][i] = fin[2]
            out["ID"][i] = n
-            remove(ftemp.format(nsim, n))
+            remove(fpath)
-
+        print("Dumping CM files to .. `{}`.".format(fpermcm.format(nsim)))
-        print("Dumping CM files to .. `{}`.".format(fperm.format(nsim)))
+        with open(fpermcm.format(nsim), 'wb') as f:
        with open(fperm.format(nsim), 'wb') as f:
            numpy.save(f, out)
        print("Collecting clump files...")
        stdout.flush()
        out = [None] * unique_clumpids.size
        dtype = {"names": ["clump", "ID"], "formats": [object, numpy.int32]}
        out = numpy.full(unique_clumpids.size, numpy.nan, dtype=dtype)
        for i, n in enumerate(unique_clumpids):
            fpath = ftemp.format(nsim, n, "clump")
            with open(fpath, 'rb') as f:
                fin = numpy.load(f)
            out["clump"][i] = fin
            out["ID"][i] = n
            remove(fpath)
        print("Dumping clump files to .. `{}`.".format(fpermpart.format(nsim)))
        with open(fpermpart.format(nsim), "wb") as f:
            numpy.save(f, out)
        del out
        collect()