Overlap calculation (#18)

* Move cosine similarity out * Basic overlap calculation * add overlap import * Add clump0 dict path * Update README * fix path bug * Save as dict instead * Change format to array of arrays * Update paths * Take fewer ICs for now * Change to structured array * Add overlap calculation * Start saving IDs * Add a blank space * Update TODO
2025-04-18 04:20:55 +00:00 · 2022-12-19 11:58:22 +01:00 · 2022-12-19 11:58:22 +01:00 · 18f09767f4
commit 18f09767f4
parent 13a9d11afe
6 changed files with 297 additions and 65 deletions
--- a/README.md
+++ b/README.md
@ -7,14 +7,11 @@


 ## Short-term TODO
- [x] Add code to calculate the cross-correlation for resolved region only.
- [ ] Calculate the spectra for all 101 boxes and visualise them.
- [ ] See about the $z=70$ particles.
+- [ ] Implement the CIC binning.
+- [ ] Write a script to perform the matching on a node.


 ## Long-term TODO
- [ ] Calculate the cross-correlation in CSiBORG. Should see the scale of the constraints?
- [ ] Find the distribution of particles in the first snapshot
 - [ ] Implement a custom model for matchin galaxies to halos.


--- a/csiborgtools/match/init.py
+++ b/csiborgtools/match/init.py
@ -13,6 +13,6 @@
 # with this program; if not, write to the Free Software Foundation, Inc.,
 # 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

-from .match import (brute_spatial_separation, RealisationsMatcher)  # noqa
+from .match import (brute_spatial_separation, RealisationsMatcher, cosine_similarity, ParticleOverlap)  # noqa
 from .num_density import (binned_counts, number_density)  # noqa
 # from .correlation import (get_randoms_sphere, sphere_angular_tpcf) # noqa
--- a/csiborgtools/match/match.py
+++ b/csiborgtools/match/match.py
@ -79,6 +79,9 @@ class RealisationsMatcher:
    ----------
    cats : :py:class`csiborgtools.read.CombinedHaloCatalogue`
        Combined halo catalogue to search.
+    # NOTE add later
+#    dtype : dtype, optional
+#        Output precision. By default `numpy.float32`.
    """
    _cats = None

@ -122,42 +125,42 @@ class RealisationsMatcher:
        """
        return [i for i in range(self.cats.N) if i != n_sim]

-    def cosine_similarity(self, x, y):
-        r"""
-        Calculate the cosine similarity between two Cartesian vectors. Defined
-        as :math:`\Sum_{i} x_i y_{i} / (|x| * |y|)`.
+    def _check_masskind(self, mass_kind):
+        """Check that `mass_kind` is a valid key."""
+        if mass_kind not in self.cats[0].keys:
+            raise ValueError("Invalid mass kind `{}`.".format(mass_kind))
+
+    @staticmethod
+    def _cat2clump_mapping(cat_indxs, clump_indxs):
+        """
+        Create a mapping from a catalogue array index to a clump array index.

        Parameters
        ----------
-        x : 1-dimensional array
-            The first vector.
-        y : 1- or 2-dimensional array
-            The second vector. Can be 2-dimensional of shape `(n_samples, 3)`,
-            in which case the calculation is broadcasted.
+        cat_indxs : 1-dimensional array
+            Clump indices in the catalogue array.
+        clump_indxs : 1-dimensional array
+            Clump indices in the clump array.

        Returns
        -------
-        out : float or 1-dimensional array
-            The cosine similarity. If y is 1-dimensinal returns only a float.
+        mapping : 1-dimensional array
+            Mapping. The array indices match catalogue array and values are
+            array positions in the clump array.
        """
-        # Quick check of dimensions
-        if x.ndim != 1:
-            raise ValueError("`x` must be a 1-dimensional array.")
-        y = y.reshape(-1, 3) if y.ndim == 1 else y
-
-        out = numpy.sum(x * y, axis=1)
-        out /= numpy.linalg.norm(x) * numpy.linalg.norm(y, axis=1)
-
-        if out.size == 1:
-            return out[0]
-        return out
+        mapping = numpy.full(cat_indxs.size, numpy.nan, dtype=int)
+        __, ind1, ind2 = numpy.intersect1d(clump_indxs, cat_indxs,
+                                           return_indices=True)
+        mapping[ind2] = ind1
+        return mapping

    def cross_knn_position_single(self, n_sim, nmult=5, dlogmass=None,
-                                  init_dist=False, verbose=True):
+                                  mass_kind="totpartmass", init_dist=False,
+                                  overlap=False, verbose=True):
        r"""
        Find all neighbours within :math:`n_{\rm mult} R_{200c}` of halos in
        the `nsim`th simulation. Also enforces that the neighbours'
-        :math:`\log M_{200c}` be within `dlogmass` dex.
+        :math:`\log M / M_\odot` be within `dlogmass` dex.

        Parameters
        ----------
@ -169,43 +172,71 @@ class RealisationsMatcher:
            default 5.
        dlogmass : float, optional
            Tolerance on mass logarithmic mass difference. By default `None`.
+        mass_kind : str, optional
+            The mass kind whose similarity is to be checked. Must be a valid
+            catalogue key. By default `totpartmass`, i.e. the total particle
+            mass associated with a halo.
        init_dist : bool, optional
            Whether to calculate separation of the initial CMs. By default
            `False`.
+        overlap : bool, optional
+            Whether to calculate overlap between clumps in the initial
+            snapshot. By default `False`. Note that this operation is
+            substantially slower.
        verbose : bool, optional
            Iterator verbosity flag. By default `True`.

        Returns
        -------
        matches : composite array
-            Array, indices are `(n_sims - 1, 3, n_halos, n_matches)`. The
+            Array, indices are `(n_sims - 1, 4, n_halos, n_matches)`. The
            2nd axis is `index` of the neighbouring halo in its catalogue,
-            `dist`, which is the 3D distance to the halo whose neighbours are
-            searched, and `dist0` which is the separation of the initial CMs.
-            The latter is calculated only if `init_dist` is `True`.
+            `dist` is the 3D distance to the halo whose neighbours are
+            searched, `dist0` is the separation of the initial CMs and
+            `overlap` is the overlap of the initial clumps, in that order.
+            The latter two are calculated only if `init_dist` and `overlap`,
+            respectively, are `True`.
+
+        TODO:
+        - [ ] Precalculate the mapping from halo index to clump array position
        """
-        # Radius, M200c and positions of halos in `n_sim` IC realisation
-        logm200 = numpy.log10(self.cats[n_sim]["m200"])
+        self._check_masskind(mass_kind)
+        # Radius, mass and positions of halos in `n_sim` IC realisation
+        logmass = numpy.log10(self.cats[n_sim][mass_kind])
        R = self.cats[n_sim]["r200"]
        pos = self.cats[n_sim].positions
        if init_dist:
            pos0 = self.cats[n_sim].positions0  # These are CM positions
+        if overlap:
+            if verbose:
+                print("Loading initial clump particles for `n_sim = {}`."
+                      .format(n_sim))
+            # Grab a paths object. What it is set to is unimportant
+            paths = self.cats[0].paths
+            with open(paths.clump0_path(self.cats.n_sims[n_sim]), "rb") as f:
+                clumps0 = numpy.load(f, allow_pickle=True)
+            overlapper = ParticleOverlap()
+            cat2clumps0 = self._cat2clump_mapping(self.cats[n_sim]["index"],
+                                                  clumps0["ID"])
+
        matches = [None] * (self.cats.N - 1)
        # Verbose iterator
        if verbose:
            iters = enumerate(tqdm(self.search_sim_indices(n_sim)))
        else:
            iters = enumerate(self.search_sim_indices(n_sim))
+        iters = enumerate(self.search_sim_indices(n_sim))
        # Search for neighbours in the other simulations
        for count, i in iters:
            dist, indxs = self.cats[i].radius_neigbours(pos, R * nmult)
            # Get rid of neighbors whose mass is too off
            if dlogmass is not None:
                for j, indx in enumerate(indxs):
-                    match_logm200 = numpy.log10(self.cats[i]["m200"][indx])
-                    mask = numpy.abs(match_logm200 - logm200[j]) < dlogmass
+                    match_logmass = numpy.log10(self.cats[i][mass_kind][indx])
+                    mask = numpy.abs(match_logmass - logmass[j]) < dlogmass
                    dist[j] = dist[j][mask]
                    indxs[j] = indx[mask]
+
            # Find distance to the between the initial CM
            dist0 = [numpy.asanyarray([], dtype=numpy.float64)] * dist.size
            if init_dist:
@ -215,13 +246,47 @@ class RealisationsMatcher:
                    dist0[k] = numpy.linalg.norm(
                        pos0[k] - self.cats[i].positions0[indxs[k]], axis=1)

+            # Calculate the initial snapshot overlap
+            cross = [numpy.asanyarray([], dtype=numpy.float64)] * dist.size
+            if overlap:
+                if verbose:
+                    print("Loading initial clump particles for `n_sim = {}` "
+                          "to compare against `n_sim = {}`.".format(i, n_sim))
+                with open(paths.clump0_path(self.cats.n_sims[i]), 'rb') as f:
+                    clumpsx = numpy.load(f, allow_pickle=True)
+                cat2clumpsx = self._cat2clump_mapping(self.cats[i]["index"],
+                                                      clumpsx["ID"])
+
+                # Loop only over halos that have neighbours
+                with_neigbours = numpy.where([ii.size > 0 for ii in indxs])[0]
+                for k in tqdm(with_neigbours) if verbose else with_neigbours:
+                    # Find which clump matches index of this halo from cat
+                    match0 = cat2clumps0[k]
+
+                    # Get the clump and pre-calculate its cell assignment
+                    cl0 = clumps0["clump"][match0]
+                    cl0_cells = overlapper.assign_to_cell(
+                        *(cl0[p] for p in ('x', 'y', 'z')))
+                    dint = numpy.full(indxs[k].size, numpy.nan, numpy.float64)
+
+                    # Loop over the ones we cross-correlate with
+                    for ii, ind in enumerate(indxs[k]):
+                        # Again, find which cross clump matches this index
+                        matchx = cat2clumpsx[ind]
+                        dint[ii] = overlapper.mass_overlap(
+                            cl0, clumpsx["clump"][matchx], cl0_cells)
+
+                    cross[k] = dint
+
            # Append as a composite array
-            matches[count] = numpy.asarray([indxs, dist, dist0], dtype=object)
+            matches[count] = numpy.asarray(
+                [indxs, dist, dist0, cross], dtype=object)

        return numpy.asarray(matches, dtype=object)

    def cross_knn_position_all(self, nmult=5, dlogmass=None,
-                               init_dist=False, verbose=True):
+                               mass_kind="totpartmass", init_dist=False,
+                               overlap=False, verbose=True):
        r"""
        Find all neighbours within :math:`n_{\rm mult} R_{200c}` of halos in
        all simulations listed in `self.cats`. Also enforces that the
@ -234,9 +299,17 @@ class RealisationsMatcher:
            default 5.
        dlogmass : float, optional
            Tolerance on mass logarithmic mass difference. By default `None`.
+        mass_kind : str, optional
+            The mass kind whose similarity is to be checked. Must be a valid
+            catalogue key. By default `totpartmass`, i.e. the total particle
+            mass associated with a halo.
        init_dist : bool, optional
            Whether to calculate separation of the initial CMs. By default
            `False`.
+        overlap : bool, optional
+            Whether to calculate overlap between clumps in the initial
+            snapshot. By default `False`. Note that this operation is
+            substantially slower.
        verbose : bool, optional
            Iterator verbosity flag. By default `True`.

@ -251,5 +324,136 @@ class RealisationsMatcher:
        # Loop over each catalogue
        for i in trange(N) if verbose else range(N):
            matches[i] = self.cross_knn_position_single(
-                i, nmult, dlogmass, init_dist)
+                i, nmult, dlogmass, mass_kind=mass_kind, init_dist=init_dist,
+                overlap=overlap, verbose=verbose)
        return matches
+
+
+###############################################################################
+#                           Matching statistics                               #
+###############################################################################
+
+
+def cosine_similarity(x, y):
+    r"""
+    Calculate the cosine similarity between two Cartesian vectors. Defined
+    as :math:`\sum_{i} x_i y_{i} / (|x| * |y|)`.
+
+    Parameters
+    ----------
+    x : 1-dimensional array
+        The first vector.
+    y : 1- or 2-dimensional array
+        The second vector. Can be 2-dimensional of shape `(n_samples, 3)`,
+        in which case the calculation is broadcasted.
+
+    Returns
+    -------
+    out : float or 1-dimensional array
+        The cosine similarity. If `y` is 1-dimensional, returns only a float.
+    """
+    # Quick check of dimensions
+    if x.ndim != 1:
+        raise ValueError("`x` must be a 1-dimensional array.")
+    y = y.reshape(-1, 3) if y.ndim == 1 else y
+
+    out = numpy.sum(x * y, axis=1)
+    out /= numpy.linalg.norm(x) * numpy.linalg.norm(y, axis=1)
+
+    if out.size == 1:
+        return out[0]
+    return out
+
+
+class ParticleOverlap:
+    """
+    TODO:
+    - [ ] Class documentation
+    """
+    _bins = None
+
+    def __init__(self, bins=None):
+        if bins is None:
+            dx = 1 / 2**11
+            bins = numpy.arange(0, 1 + dx, dx)
+        self.bins = bins
+
+    @property
+    def bins(self):
+        """
+        The grid bin edges. Assumed to be equal for all three dimensions. Units
+        ought to match the requested coordinates.
+
+        Returns
+        -------
+        bins : 1-dimensional array
+        """
+        return self._bins
+
+    @bins.setter
+    def bins(self, bins):
+        """Sets `bins`."""
+        bins = numpy.asarray(bins) if isinstance(bins, list) else bins
+        assert bins.ndim == 1, "`bins` must be a 1-dimensional array."
+        self._bins = bins
+
+    def assign_to_cell(self, x, y, z):
+        """
+        Assign particles specified by coordinates `x`, `y`, and `z` to grid
+        cells.
+
+        Parameters
+        ----------
+        x, y, z : 1-dimensional arrays
+            Positions of particles in the box.
+
+        Returns
+        -------
+        cells : 1-dimensional array
+            Cell ID of each particle.
+        """
+        assert x.ndim == 1 and x.size == y.size == z.size
+        xbin = numpy.digitize(x, self.bins)
+        ybin = numpy.digitize(y, self.bins)
+        zbin = numpy.digitize(z, self.bins)
+        N = self.bins.size
+
+        return xbin + ybin * N + zbin * N**2
+
+    def mass_overlap(self, clump1, clump2, cells1=None):
+        r"""
+        Calculate the particle, mass-weighted overlap between two halos.
+        Defined as
+
+        .. math::
+            (M_{u,1} + M_{u,2}) / (M_1 + M_2),
+
+        where :math:`M_{u, 1}` is the mass of particles of the first halo in
+        cells that are also present in the second halo and :math:`M_1` is the
+        total particle mass of the first halo.
+
+        Parameters
+        ----------
+        clump1, clump2 : structured arrays
+            Structured arrays corresponding to the two clumps. Should contain
+            keys `x`, `y`, `z` and `M`.
+        cells1 : 1-dimensional array, optional
+            Optionally precomputed cells of `clump1`. Be careful when using
+            this to ensure it matches `clump1`.
+
+        Returns
+        -------
+        overlap : float
+        """
+        # 1-dimensional cell ID of each particle in clump1 and clump2
+        if cells1 is None:
+            cells1 = self.assign_to_cell(*[clump1[p] for p in ('x', 'y', 'z')])
+        cells2 = self.assign_to_cell(*[clump2[p] for p in ('x', 'y', 'z')])
+        # Elementwise cells1 in cells2 and vice versa
+        m1 = numpy.isin(cells1, cells2)
+        m2 = numpy.isin(cells2, cells1)
+        # Summed shared mass and the total
+        interp = numpy.sum(clump1["M"][m1]) + numpy.sum(clump2["M"][m2])
+        mtot = numpy.sum(clump1["M"]) + numpy.sum(clump2["M"])
+
+        return interp / mtot
--- a/csiborgtools/read/make_cat.py
+++ b/csiborgtools/read/make_cat.py
@ -378,7 +378,8 @@ class CombinedHaloCatalogue:
    def __init__(self, paths, min_m500=None, max_dist=None, verbose=True):
        # Read simulations and their maximum snapshots
        # NOTE later change this back to all simulations
-        self._n_sims = [7468, 7588, 8020, 8452, 8836]
+        # self._n_sims = [7468, 7588, 8020, 8452, 8836]
+        self._n_sims = [7468, 7588]
 #        self._n_sims = paths.ic_ids
        n_snaps = [paths.get_maximum_snapshot(i) for i in self._n_sims]
        self._n_snaps = numpy.asanyarray(n_snaps)
--- a/csiborgtools/read/readsim.py
+++ b/csiborgtools/read/readsim.py
@ -72,7 +72,6 @@ class CSiBORGPaths:
    _initmatch_path = None
    _to_new = None

-    # NOTE deuglify this stuff
    def __init__(self, n_sim=None, n_snap=None, srcdir=None, dumpdir=None,
                 mmain_path=None, initmatch_path=None, to_new=False):
        if srcdir is None:
@ -425,6 +424,24 @@ class CSiBORGPaths:
        n_sim = self.get_n_sim(n_sim)
        return min(self.get_snapshots(n_sim))

+    def clump0_path(self, nsim):
+        """
+        Path to the dumped clump particles of an IC realisation. This is
+        expected to point to a structured array with fields `ID` (the clump
+        indices) and `clump` (each clump's particles in the initial snapshot).
+
+        Parameters
+        ----------
+        nsim : int
+            Index of the initial conditions (IC) realisation.
+
+        Returns
+        -------
+        path : str
+        """
+        cdir = join(self.dumpdir, "initmatch")
+        return join(cdir, "clump_{}_{}.npy".format(nsim, "particles"))
+
    def snapshot_path(self, n_snap=None, n_sim=None):
        """
        Path to a CSiBORG IC realisation snapshot.
@ -839,7 +856,7 @@ def read_mmain(n, srcdir, fname="Mmain_{}.npy"):
    return out


-def read_initcm(n, srcdir, fname="clump_cm_{}.npy"):
+def read_initcm(n, srcdir, fname="clump_{}_cm.npy"):
    """
    Read `clump_cm`, i.e. the center of mass of a clump at redshift z = 70.
    If the file does not exist returns `None`.
--- a/scripts/run_initmatch.py
+++ b/scripts/run_initmatch.py
@ -24,8 +24,7 @@ import numpy
 from datetime import datetime
 from mpi4py import MPI
 from distutils.util import strtobool
-from os.path import join, isdir
-from os import mkdir
+from os.path import join
 from os import remove
 from sys import stdout
 from gc import collect
@ -51,21 +50,15 @@ fin_paths = csiborgtools.read.CSiBORGPaths(to_new=False)
 nsims = init_paths.ic_ids

 # Output files
-dumpdir = "/mnt/extraspace/rstiskalek/csiborg/initmatch"
-ftemp = join(dumpdir, "temp", "temp_{}_{}.npy")
-fperm = join(dumpdir, "clump_cm_{}.npy")
+dumpdir = "/mnt/extraspace/rstiskalek/csiborg/"
+ftemp = join(dumpdir, "temp_initmatch", "temp_{}_{}_{}.npy")
+fpermcm = join(dumpdir, "initmatch", "clump_{}_cm.npy")
+fpermpart = join(dumpdir, "initmatch", "clump_{}_particles.npy")

 for nsim in nsims:
    if rank == 0:
        print("{}: reading simulation {}.".format(datetime.now(), nsim))
        stdout.flush()
-    # Check that the output folder for this sim exists
-    clumpdumpdir = join(dumpdir, "out_{}".format(nsim))
-    if args.dump_clumps and rank == 0 and not isdir(clumpdumpdir):
-        mkdir(clumpdumpdir)
-
-    # Barrier to make sure we created the directory with the rank 0
-    comm.Barrier()

    # Set the snapshot numbers
    init_paths.set_info(nsim, init_paths.get_minimum_snapshot(nsim))
@ -111,15 +104,16 @@ for nsim in nsims:
        cm = numpy.asanyarray(
            [numpy.average(x0[p], weights=x0["M"]) for p in ('x', 'y', 'z')])
        # Dump the center of mass
-        with open(ftemp.format(nsim, n), 'wb') as f:
+        with open(ftemp.format(nsim, n, "cm"), 'wb') as f:
            numpy.save(f, cm)
        # Optionally dump the entire clump
        if args.dump_clumps:
-            fout = join(clumpdumpdir, "clump_{}.npy".format(n))
-            stdout.flush()
-            with open(fout, "wb") as f:
+            with open(ftemp.format(nsim, n, "clump"), "wb") as f:
                numpy.save(f, x0)

+    del part0, clump_ids
+    collect()
+
    comm.Barrier()
    if rank == 0:
        print("Collecting CM files...")
@ -130,14 +124,33 @@ for nsim in nsims:
        out = numpy.full(njobs, numpy.nan, dtype=dtype)

        for i, n in enumerate(unique_clumpids):
-            with open(ftemp.format(nsim, n), 'rb') as f:
+            fpath = ftemp.format(nsim, n, "cm")
+            with open(fpath, 'rb') as f:
                fin = numpy.load(f)
            out['x'][i] = fin[0]
            out['y'][i] = fin[1]
            out['z'][i] = fin[2]
            out["ID"][i] = n
-            remove(ftemp.format(nsim, n))
-
-        print("Dumping CM files to .. `{}`.".format(fperm.format(nsim)))
-        with open(fperm.format(nsim), 'wb') as f:
+            remove(fpath)
+        print("Dumping CM files to .. `{}`.".format(fpermcm.format(nsim)))
+        with open(fpermcm.format(nsim), 'wb') as f:
            numpy.save(f, out)
+
+        print("Collecting clump files...")
+        stdout.flush()
+        out = [None] * unique_clumpids.size
+        dtype = {"names": ["clump", "ID"], "formats": [object, numpy.int32]}
+        out = numpy.full(unique_clumpids.size, numpy.nan, dtype=dtype)
+        for i, n in enumerate(unique_clumpids):
+            fpath = ftemp.format(nsim, n, "clump")
+            with open(fpath, 'rb') as f:
+                fin = numpy.load(f)
+            out["clump"][i] = fin
+            out["ID"][i] = n
+            remove(fpath)
+        print("Dumping clump files to .. `{}`.".format(fpermpart.format(nsim)))
+        with open(fpermpart.format(nsim), "wb") as f:
+            numpy.save(f, out)
+
+        del out
+        collect()