Speed up overlap (#27)

* Edit improt * Simplify patch size calculation * Add patch size percentiles * Add various percentiles * Remove comment * Update TODO * Change to 95th percentile * Add import * Add KNN properties * Add new matching initial condition * Add import * Remove import * Add fast neighbours option * Further edits to fast neighbours * add imports * add new overlap calculation and non-zero things * Remove print * Clean up code * Fix small bug * Remove comment * Add run single cross match * change values * Edit hyperparams * Add comment * Add the argument parser * Add new lagpatch calc * New lagpatch calc * Delete old patch definitions * Make clump dumping once again optional * Add lagpatch to the catalogue * Edit print statement * Fix small bug * Remove init radius * Change to lagpatch key * Fix a small bug * Fix little bug
2025-06-08 09:51:12 +00:00 · 2023-02-05 11:46:19 +00:00 · 2023-02-05 11:46:19 +00:00 · 8dea3da4de
commit 8dea3da4de
parent beb811e84c
9 changed files with 418 additions and 193 deletions
--- a/README.md
+++ b/README.md
@ -3,10 +3,7 @@
 ## CSiBORG Matching
 ### TODO
- [x] Implement CIC binning or an alternative scheme for nearby objects.
+- [ ] Modify the call to tN
 - [x] Consistently locate region spanned by a single halo.
 - [x] Write a script to perform the matching on a node.
 - [x] Make a coarser grid for halos outside of the well resolved region.
 ### Questions
 - What scaling of the search region? No reason for it to be a multiple of $R_{200c}$.
--- a/csiborgtools/match/init.py
+++ b/csiborgtools/match/init.py
@ -14,6 +14,8 @@
 # 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 from .match import (brute_spatial_separation, RealisationsMatcher, cosine_similarity,  # noqa
-                    ParticleOverlap, get_clumplims, lagpatch_size)  # noqa
+                    ParticleOverlap, get_clumplims, fill_delta, fill_delta_indxs,  # noqa
                    calculate_overlap, calculate_overlap_indxs,  # noqa
                    dist_centmass, dist_percentile)  # noqa
 from .num_density import (binned_counts, number_density)  # noqa
 # from .correlation import (get_randoms_sphere, sphere_angular_tpcf) # noqa
--- a/csiborgtools/match/match.py
+++ b/csiborgtools/match/match.py
@ -14,7 +14,6 @@
 # 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 import numpy
 from scipy.interpolate import interp1d
 from scipy.ndimage import gaussian_filter
 from tqdm import (tqdm, trange)
 from astropy.coordinates import SkyCoord
@ -155,15 +154,16 @@ class RealisationsMatcher:
        mapping[ind2] = ind1
        return mapping
-    def cross_knn_position_single(self, n_sim, nmult=5, dlogmass=None,
+    def cross_knn_position_single(self, n_sim, nmult=1, dlogmass=None,
                                  mass_kind="totpartmass", overlap=False,
                                  overlapper_kwargs={}, select_initial=True,
-                                  remove_nooverlap=True, verbose=True):
+                                  remove_nooverlap=True, fast_neighbours=False,
                                  verbose=True):
        r"""
-        Find all neighbours within a multiple of either :math:`R_{\rm init}`
+        Find all neighbours within a multiple of the sum of either the initial
-        (distance at :math:`z = 70`) or :math:`R_{200c}` (distance at
+        Lagrangian patch sizes (distance at :math:`z = 70`) or :math:`R_{200c}`
-        :math:`z = 0`) of halos in the `nsim`th simulation. Enforces that the
+        (distance at :math:`z = 0`). Enforces that the neighbours' are similar
-        neighbours' are similar in mass up to `dlogmass` dex.
+        in mass up to `dlogmass` dex and optionally calculates their overlap.
        Parameters
        ----------
@ -171,8 +171,8 @@ class RealisationsMatcher:
            Index of an IC realisation in `self.cats` whose halos' neighbours
            in the remaining simulations to search for.
        nmult : float or int, optional
-            Multiple of :math:`R_{\rm init}` or :math:`R_{200c}` within which
+            Multiple of the sum of pair Lagrangian patch sizes or
-            to return neighbours. By default 5.
+            :math:`R_{200c}` within which to return neighbours. By default 1.
        dlogmass : float, optional
            Tolerance on mass logarithmic mass difference. By default `None`.
        mass_kind : str, optional
@ -190,6 +190,11 @@ class RealisationsMatcher:
        remove_nooverlap : bool, optional
            Whether to remove pairs with exactly zero overlap. By default
            `True`.
        fast_neighbours : bool, optional
            Whether to calculate neighbours within a fixed radius of each
            clump. Note that this will result in missing some matches. If
            `True` then `nmult` is a multiple of either the initial patch size
            of :math:`R_{200c}`.
        verbose : bool, optional
            Iterator verbosity flag. By default `True`.
@ -208,7 +213,7 @@ class RealisationsMatcher:
        pos = self.cats[n_sim].positions      # Grav potential minimum
        pos0 = self.cats[n_sim].positions0    # CM positions
        if select_initial:
-            R = self.cats[n_sim]["patch_size"]  # Initial Lagrangian patch size
+            R = self.cats[n_sim]["lagpatch"]  # Initial Lagrangian patch size
        else:
            R = self.cats[n_sim]["r200"]      # R200c at z = 0
@ -238,13 +243,29 @@ class RealisationsMatcher:
        iters = enumerate(self.search_sim_indices(n_sim))
        # Search for neighbours in the other simulations at z = 70
        for count, i in iters:
            if verbose:
                print("Querying the KNN for `n_sim = {}`.".format(n_sim),
                      flush=True)
            # Query the KNN either fast (miss some) or slow (get all)
            if select_initial:
-                dist0, indxs = self.cats[i].radius_initial_neigbours(
+                if fast_neighbours:
-                    pos0, R * nmult)
+                    dist0, indxs = self.cats[i].radius_neigbours(
                        pos0, R * nmult, select_initial=True)
                else:
                    dist0, indxs = radius_neighbours(
                        self.cats[i].knn0, pos0, radiusX=R,
                        radiusKNN=self.cats[i]["lagpatch"], nmult=nmult,
                        verbose=verbose)
            else:
                # Will switch dist0 <-> dist at the end
                if fast_neighbours:
                    dist0, indxs = self.cats[i].radius_neigbours(
-                    pos, R * nmult)
+                        pos, R * nmult, select_initial=False)
                else:
                    dist0, indxs = radius_neighbours(
                        self.cats[i].knn, pos, radiusX=R,
                        radiusKNN=self.cats[i]["r200"], nmult=nmult,
                        verbose=verbose)
            # Enforce int32 and float32
            for n in range(dist0.size):
                dist0[n] = dist0[n].astype(numpy.float32)
@ -303,8 +324,9 @@ class RealisationsMatcher:
                    # Find which clump matches index of this halo from cat
                    match0 = cat2clumps0[k]
-                    # Unpack this clum and its mins and maxs
+                    # Unpack this clum, its mamss and mins and maxs
                    cl0 = clumps0["clump"][match0]
                    mass0 = numpy.sum(cl0['M'])
                    mins0_current, maxs0_current = mins0[match0], maxs0[match0]
                    # Preallocate this array.
                    crosses = numpy.full(indxs[k].size, numpy.nan,
@ -314,10 +336,11 @@ class RealisationsMatcher:
                    for ii, ind in enumerate(indxs[k]):
                        # Again which cross clump to this index
                        matchx = cat2clumpsx[ind]
                        clx = clumpsx["clump"][matchx]
                        crosses[ii] = overlapper(
-                            cl0, clumpsx["clump"][matchx], delta,
+                            cl0, clx, delta, mins0_current, maxs0_current,
-                            mins0_current, maxs0_current,
+                            minsx[matchx], maxsx[matchx],
-                            minsx[matchx], maxsx[matchx])
+                            mass1=mass0, mass2=numpy.sum(clx['M']))
                    cross[k] = crosses
                    # Optionally remove points whose overlap is exaclt zero
@ -338,30 +361,26 @@ class RealisationsMatcher:
        return numpy.asarray(matches, dtype=object)
-    def cross_knn_position_all(self, nmult=5, dlogmass=None,
+    def cross_knn_position_all(self, nmult=1, dlogmass=None,
-                               mass_kind="totpartmass", init_dist=False,
+                               mass_kind="totpartmass", overlap=False,
-                               overlap=False, overlapper_kwargs={},
+                               overlapper_kwargs={},
                               select_initial=True, remove_nooverlap=True,
                               verbose=True):
        r"""
-        Find all neighbours within :math:`n_{\rm mult} R_{200c}` of halos in
+        Find all counterparts of halos in all simulations listed in
-        all simulations listed in `self.cats`. Also enforces that the
+        `self.cats`. See `self.cross_knn_position_single` for more details.
        neighbours' :math:`\log M_{200c}` be within `dlogmass` dex.
        Parameters
        ----------
        nmult : float or int, optional
-            Multiple of :math:`R_{200c}` within which to return neighbours. By
+            Multiple of the sum of pair Lagrangian patch sizes or
-            default 5.
+            :math:`R_{200c}` within which to return neighbours. By default 1.
        dlogmass : float, optional
            Tolerance on mass logarithmic mass difference. By default `None`.
        mass_kind : str, optional
            The mass kind whose similarity is to be checked. Must be a valid
            catalogue key. By default `totpartmass`, i.e. the total particle
            mass associated with a halo.
        init_dist : bool, optional
            Whether to calculate separation of the initial CMs. By default
            `False`.
        overlap : bool, optional
            Whether to calculate overlap between clumps in the initial
            snapshot. By default `False`. Note that this operation is
@ -388,8 +407,7 @@ class RealisationsMatcher:
        # Loop over each catalogue
        for i in trange(N) if verbose else range(N):
            matches[i] = self.cross_knn_position_single(
-                i, nmult, dlogmass, mass_kind=mass_kind,
+                i, nmult, dlogmass, mass_kind=mass_kind, overlap=overlap,
                init_dist=init_dist, overlap=overlap,
                overlapper_kwargs=overlapper_kwargs,
                select_initial=select_initial,
                remove_nooverlap=remove_nooverlap, verbose=verbose)
@ -599,7 +617,7 @@ class ParticleOverlap:
        return delta
    def make_deltas(self, clump1, clump2, mins1=None, maxs1=None,
-                    mins2=None, maxs2=None):
+                    mins2=None, maxs2=None, return_nonzero1=False):
        """
        Calculate a NGP density fields of two halos on a grid that encloses
        them both.
@ -622,6 +640,9 @@ class ParticleOverlap:
            Density arrays of `clump1` and `clump2`, respectively.
        cellmins : len-3 tuple
            Tuple of left-most cell ID in the full box.
        nonzero1 : 2-dimensional array
            Indices where `delta1` has a non-zero density. If `return_nonzero1`
            is `False` return `None` instead.
        """
        xc1, yc1, zc1 = (self.pos2cell(clump1[p]) for p in ('x', 'y', 'z'))
        xc2, yc2, zc2 = (self.pos2cell(clump2[p]) for p in ('x', 'y', 'z'))
@ -648,16 +669,22 @@ class ParticleOverlap:
        cellmins = (xmin, ymin, zmin, )  # Cell minima
        ncells = max(xmax - xmin, ymax - ymin, zmax - zmin) + 1  # Num cells
-        # Preallocate and fill the array
+        # Preallocate and fill the arrays
        delta1 = numpy.zeros((ncells,)*3, dtype=numpy.float32)
        fill_delta(delta1, xc1, yc1, zc1, *cellmins, clump1['M'])
        delta2 = numpy.zeros((ncells,)*3, dtype=numpy.float32)
        if return_nonzero1:
            nonzero1 = fill_delta_indxs(
                delta1, xc1, yc1, zc1, *cellmins, clump1['M'])
        else:
            fill_delta(delta1, xc1, yc1, zc1, *cellmins, clump1['M'])
            nonzero1 = None
        fill_delta(delta2, xc2, yc2, zc2, *cellmins, clump2['M'])
        if self.smooth_scale is not None:
            gaussian_filter(delta1, self.smooth_scale, output=delta1)
            gaussian_filter(delta2, self.smooth_scale, output=delta2)
-        return delta1, delta2, cellmins
+
        return delta1, delta2, cellmins, nonzero1
    @staticmethod
    def overlap(delta1, delta2, cellmins, delta2_full):
@ -679,10 +706,11 @@ class ParticleOverlap:
        -------
        overlap : float
        """
-        return _calculate_overlap(delta1, delta2, cellmins, delta2_full)
+        return calculate_overlap(delta1, delta2, cellmins, delta2_full)
    def __call__(self, clump1, clump2, delta2_full, mins1=None, maxs1=None,
-                 mins2=None, maxs2=None):
+                 mins2=None, maxs2=None, mass1=None, mass2=None,
                 loop_nonzero=True):
        """
        Calculate overlap between `clump1` and `clump2`. See
        `self.overlap(...)` and `self.make_deltas(...)` for further
@ -704,21 +732,33 @@ class ParticleOverlap:
        mins2, maxs2 : 1-dimensional arrays of shape `(3,)`
            Minimun and maximum cell numbers along each dimension of `clump2`.
            Optional.
        mass1, mass2 : floats, optional
            Total mass of `clump1` and `clump2`, respectively. Must be provided
            if `loop_nonzero` is `True`.
        loop_nonzer : bool, optional
            Whether to only loop over cells where `clump1` has non-zero
            density. By default `True`.
        Returns
        -------
        overlap : float
        """
-        delta1, delta2, cellmins = self.make_deltas(
+        delta1, delta2, cellmins, nonzero1 = self.make_deltas(
-            clump1, clump2, mins1, maxs1, mins2, maxs2)
+            clump1, clump2, mins1, maxs1, mins2, maxs2,
-        return _calculate_overlap(delta1, delta2, cellmins, delta2_full)
+            return_nonzero1=loop_nonzero)
        if not loop_nonzero:
            return calculate_overlap(delta1, delta2, cellmins, delta2_full)
        return calculate_overlap_indxs(delta1, delta2, cellmins, delta2_full,
                                       nonzero1, mass1, mass2)
@jit(nopython=True)
 def fill_delta(delta, xcell, ycell, zcell, xmin, ymin, zmin, weights):
    """
-    Fill array delta at the specified indices with their weights. This is a JIT
+    Fill array `delta` at the specified indices with their weights. This is a
-    implementation.
+    JIT implementation.
    Parameters
    ----------
@ -735,8 +775,45 @@ def fill_delta(delta, xcell, ycell, zcell, xmin, ymin, zmin, weights):
    -------
    None
    """
-    for i in range(xcell.size):
+    for n in range(xcell.size):
-        delta[xcell[i] - xmin, ycell[i] - ymin, zcell[i] - zmin] += weights[i]
+        delta[xcell[n] - xmin, ycell[n] - ymin, zcell[n] - zmin] += weights[n]
@jit(nopython=True)
 def fill_delta_indxs(delta, xcell, ycell, zcell, xmin, ymin, zmin, weights):
    """
    Fill array `delta` at the specified indices with their weights and return
    indices where `delta` was assigned a value. This is a JIT implementation.
    Parameters
    ----------
    delta : 3-dimensional array
        Grid to be filled with weights.
    xcell, ycell, zcell : 1-dimensional arrays
        Indices where to assign `weights`.
    xmin, ymin, zmin : ints
        Minimum cell IDs of particles.
    weights : 1-dimensional arrays
        Particle mass.
    Returns
    -------
    cells : 1-dimensional array
        Indices where `delta` was assigned a value.
    """
    # Array to count non-zero cells
    cells = numpy.full((xcell.size, 3), numpy.nan, numpy.int32)
    count_nonzero = 0
    for n in range(xcell.size):
        i, j, k = xcell[n] - xmin, ycell[n] - ymin, zcell[n] - zmin
        # If a cell is zero add it
        if delta[i, j, k] == 0:
            cells[count_nonzero, :] = i, j, k
            count_nonzero += 1
        delta[i, j, k] += weights[n]
    return cells[:count_nonzero, :]  # Cutoff unassigned places
 def get_clumplims(clumps, ncells, nshift=None):
@ -776,7 +853,7 @@ def get_clumplims(clumps, ncells, nshift=None):
@jit(nopython=True)
-def _calculate_overlap(delta1, delta2, cellmins, delta2_full):
+def calculate_overlap(delta1, delta2, cellmins, delta2_full):
    r"""
    Overlap between two clumps whose density fields are evaluated on the
    same grid. This is a JIT implementation, hence it is outside of the main
@ -796,14 +873,13 @@ def _calculate_overlap(delta1, delta2, cellmins, delta2_full):
    -------
    overlap : float
    """
    imax, jmax, kmax = delta1.shape
    totmass = 0.           # Total mass of clump 1 and clump 2
    intersect = 0.         # Mass of pixels that are non-zero in both clumps
    weight = 0.            # Weight to account for other halos
    count = 0              # Total number of pixels that are both non-zero
    i0, j0, k0 = cellmins  # Unpack things
    imax, jmax, kmax = delta1.shape
    for i in range(imax):
        ii = i0 + i
        for j in range(jmax):
@ -826,62 +902,153 @@ def _calculate_overlap(delta1, delta2, cellmins, delta2_full):
    return weight * intersect / (totmass - intersect)
-def lagpatch_size(x, y, z, M, dr=0.0025, dqperc=1, minperc=75, defperc=95,
+@jit(nopython=True)
-                  rmax=0.075):
+def calculate_overlap_indxs(delta1, delta2, cellmins, delta2_full, nonzero1,
-    """
+                            mass1, mass2):
-    Calculate an approximate Lagrangian patch size in the initial conditions.
+    r"""
-    Returned as the first bin whose percentile drops by less than `dqperc` and
+    Overlap between two clumps whose density fields are evaluated on the
-    is above `minperc`. Note that all distances must be in box units.
+    same grid and `nonzero1` enumerates the non-zero cells of `delta1.  This is
    a JIT implementation, hence it is outside of the main class.
    Parameters
    ----------
-    x, y, z : 1-dimensional arrays
+    delta1, delta2 : 3-dimensional arrays
-        Particle coordinates.
+        Clumps density fields.
-    M : 1-dimensional array
+    cellmins : len-3 tuple
-        Particle masses.
+        Tuple of left-most cell ID in the full box.
-    dr : float, optional
+    delta2_full : 3-dimensional array
-        Separation spacing to evaluate q-th percentile change. Optional, by
+        Density field of the whole box calculated with particles assigned
-        default 0.0025
+        to halos at zero redshift.
-    dqperc : int or float, optional
+    nonzero1 : 2-dimensional array of shape `(n_cells, 3)`
-        Change of q-th percentile in a bin to find a threshold separation.
+        Indices of cells that are non-zero in `delta1`. Expected to be
-        Optional, by default 1.
+        precomputed from `fill_delta_indxs`.
-    minperc : int or float, optional
+    mass1, mass2 : floats, optional
-        Minimum q-th percentile of separation to be considered a patch size.
+        Total masses of the two clumps, respectively. Optional. If not provided
-        Optional, by default 75.
+        calculcated directly from the density field.
    defperc : int or float, optional
        Default q-th percentile if reduction by `minperc` is not satisfied in
        any bin. Optional. By default 95.
    rmax : float, optional
        The maximum allowed patch size. Optional, by default 0.075.
    Returns
    -------
-    size : float
+    overlap : float
    """
    totmass = mass1 + mass2  # Total mass of clump 1 and clump 2
    intersect = 0.           # Mass of pixels that are non-zero in both clumps
    weight = 0.              # Weight to account for other halos
    count = 0                # Total number of pixels that are both non-zero
    i0, j0, k0 = cellmins    # Unpack cell minimas
    ncells = nonzero1.shape[0]
    for n in range(ncells):
        i, j, k = nonzero1[n, :]
        cell1, cell2 = delta1[i, j, k], delta2[i, j, k]
        if cell2 > 0:  # We already know that cell1 is non-zero
            intersect += cell1 + cell2
            weight += cell2 / delta2_full[i0 + i, j0 + j, k0 + k]
            count += 1
    # Normalise the intersect and weights
    intersect *= 0.5
    weight = weight / count if count > 0 else 0.
    return weight * intersect / (totmass - intersect)
 def dist_centmass(clump):
    """
    Calculate the clump particles' distance from the centre of mass.
    Parameters
    ----------
    clump : structurered arrays
        Clump structured array. Keyes must include `x`, `y`, `z` and `M`.
    Returns
    -------
    dist : 1-dimensional array of shape `(n_particles, )`
        Particle distance from the centre of mass.
    cm : 1-dimensional array of shape `(3,)`
        Center of mass coordinates.
    """
    # CM along each dimension
-    cmx, cmy, cmz = [numpy.average(p, weights=M) for p in (x, y, z)]
+    cmx, cmy, cmz = [numpy.average(clump[p], weights=clump['M'])
                     for p in ('x', 'y', 'z')]
    # Particle distance from the CM
-    sep = numpy.sqrt(numpy.square(x - cmx)
+    dist = numpy.sqrt(numpy.square(clump['x'] - cmx)
-                     + numpy.square(y - cmy)
+                      + numpy.square(clump['y'] - cmy)
-                     + numpy.square(z - cmz))
+                      + numpy.square(clump['z'] - cmz))
-    qs = numpy.linspace(0, 100, 100)  # Percentile: where to evaluate
+    return dist, numpy.asarray([cmx, cmy, cmz])
    per = numpy.percentile(sep, qs)   # Percentile: evaluated
    sep2qs = interp1d(per, qs)        # Separation to q-th percentile
    # Evaluate in q-th percentile in separation bins
    sep_bin = numpy.arange(per[0], per[-1], dr)
    q_bin = sep2qs(sep_bin)            # Evaluate for everyhing
    dq_bin = (q_bin[1:] - q_bin[:-1])  # Take the difference
    # Indices when q-th percentile changes below tolerance and is above limit
    k = numpy.where((dq_bin < dqperc) & (q_bin[1:] > minperc))[0]
-    if k.size == 0:
+def dist_percentile(dist, qs, distmax=0.075):
-        return per[defperc]  # Nothing found, so default percentile
+    """
-    else:
+    Calculate q-th percentiles of `dist`, with an upper limit of `distmax`.
        k = k[0]  # Take the first one that satisfies the cut.
-    size = 0.5 * (sep_bin[k + 1] + sep_bin[k])  # Bin centre
+    Parameters
-    size = rmax if size > rmax else size        # Enforce maximum size
+    ----------
    dist : 1-dimensional array
        Array of distances.
    qs : 1-dimensional array
        Percentiles to compute.
    distmax : float, optional
        The maximum distance. By default 0.075.
-    return size
+    Returns
    -------
    x : 1-dimensional array
    """
    x = numpy.percentile(dist, qs)
    x[x > distmax] = distmax  # Enforce the upper limit
    return x
 def radius_neighbours(knn, X, radiusX, radiusKNN, nmult=1., verbose=True):
    """
    Find all neigbours of a trained KNN model whose center of mass separation
    is less than `nmult` times the sum of their respective radii.
    Parameters
    ----------
    knn : :py:class:`sklearn.neighbors.NearestNeighbors`
        Fitted nearest neighbour search.
    X : 2-dimensional array
        Array of shape `(n_samples, 3)`, where the latter axis represents
        `x`, `y` and `z`.
    radiusX: 1-dimensional array of shape `(n_samples, )`
        Patch radii corresponding to clumps in `X`.
    radiusKNN : 1-dimensional array
        Patch radii corresponding to clumps used to train `knn`.
    nmult : float, optional
        Multiple of the sum of two radii below which to consider a match.
    verbose : bool, optional
        Verbosity flag.
    Returns
    -------
    dists : 1-dimensional array `(n_samples,)` of arrays
        Distance from `X` to matches from `knn`.
    indxs : 1-dimensional array `(n_samples,)` of arrays
        Matches to `X` from `knn`.
    """
    assert X.ndim == 2 and X.shape[1] == 3       # shape of X ok?
    assert X.shape[0] == radiusX.size            # patchX matches X?
    assert radiusKNN.size == knn.n_samples_fit_  # patchknn matches the knn?
    nsamples = X.shape[0]
    dists = [None] * nsamples            # Initiate lists
    indxs = [None] * nsamples
    patchknn_max = numpy.max(radiusKNN)  # Maximum for completeness
    for i in trange(nsamples) if verbose else range(nsamples):
        dist, indx = knn.radius_neighbors(X[i, :].reshape(-1, 3),
                                          radiusX[i] + patchknn_max,
                                          sort_results=True)
        # Note that `dist` and `indx` are wrapped in 1-element arrays
        # so we take the first item where appropriate
        mask = (dist[0] / (radiusX[i] + radiusKNN[indx[0]])) < nmult
        dists[i] = dist[0][mask]
        indxs[i] = indx[0][mask]
    dists = numpy.asarray(dists, dtype=object)  # Turn into array of arrays
    indxs = numpy.asarray(indxs, dtype=object)
    return dists, indxs
--- a/csiborgtools/read/init.py
+++ b/csiborgtools/read/init.py
@ -14,7 +14,8 @@
 # 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 from .readsim import (CSiBORGPaths, ParticleReader, read_mmain, read_initcm, halfwidth_select)  # noqa
-from .make_cat import (HaloCatalogue, CombinedHaloCatalogue, concatenate_clumps, clumps_pos2cell)  # noqa
+from .make_cat import (HaloCatalogue, CombinedHaloCatalogue, concatenate_clumps,  # noqa
                       clumps_pos2cell)  # noqa
 from .readobs import (PlanckClusters, MCXCClusters, TwoMPPGalaxies,  # noqa
                      TwoMPPGroups, SDSS)  # noqa
 from .outsim import (dump_split, combine_splits, make_ascii_powmes)  # noqa
--- a/csiborgtools/read/make_cat.py
+++ b/csiborgtools/read/make_cat.py
@ -130,6 +130,28 @@ class HaloCatalogue:
        """
        return self.paths.n_sim
    @property
    def knn(self):
        """
        The final snapshot k-nearest neighbour object.
        Returns
        -------
        knn : :py:class:`sklearn.neighbors.NearestNeighbors`
        """
        return self._knn
    @property
    def knn0(self):
        """
        The initial snapshot k-nearest neighbour object.
        Returns
        -------
        knn : :py:class:`sklearn.neighbors.NearestNeighbors`
        """
        return self._knn0
    def _set_data(self, min_m500, max_dist):
        """
        Loads the data, merges with mmain, does various coordinate transforms.
@ -189,7 +211,7 @@ class HaloCatalogue:
        # And do the unit transform
        if initcm is not None:
            data = self.box.convert_from_boxunits(
-                data, ["x0", "y0", "z0", "patch_size"])
+                data, ["x0", "y0", "z0", "lagpatch"])
            self._positions0 = numpy.vstack(
                [data["{}0".format(p)] for p in ("x", "y", "z")]).T
            self._positions0 = self._positions0.astype(numpy.float32)
@ -258,9 +280,9 @@ class HaloCatalogue:
                "Ordering of `initcat` and `clumps` is inconsistent.")
        X = numpy.full((clumps.size, 4), numpy.nan)
-        for i, p in enumerate(['x', 'y', 'z', "patch_size"]):
+        for i, p in enumerate(['x', 'y', 'z', "lagpatch"]):
            X[:, i] = initcat[p]
-        return add_columns(clumps, X, ["x0", "y0", "z0", "patch_size"])
+        return add_columns(clumps, X, ["x0", "y0", "z0", "lagpatch"])
    @property
    def positions(self):
@ -314,30 +336,10 @@ class HaloCatalogue:
        """
        return numpy.vstack([self["L{}".format(p)] for p in ("x", "y", "z")]).T
-    @property
+    def radius_neigbours(self, X, radius, select_initial=True):
    def init_radius(self):
        r"""
-        A fiducial initial radius of particles that are identified as a single
+        Return sorted nearest neigbours within `radius` of `X` in the initial
-        halo in the final snapshot. Estimated to be
+        or final snapshot.
        ..math:
            R = (3 N / 4 \pi)^{1 / 3} * \Delta
        where :math:`N` is the number of particles and `Delta` is the initial
        inter-particular distance :math:`Delta = 1 / 2^{11}` in box units. The
        output fiducial radius is in comoving units of Mpc.
        Returns
        -------
        R : float
        """
        delta = self.box.box2mpc(1 / 2**11)
        return (3 * self["npart"] / (4 * numpy.pi))**(1/3) * delta
    def radius_neigbours(self, X, radius):
        """
        Return sorted nearest neigbours within `radius` of `X` in the final
        snapshot.
        Parameters
        ----------
@ -346,6 +348,9 @@ class HaloCatalogue:
            `x`, `y` and `z`.
        radius : float
            Limiting distance of neighbours.
        select_initial : bool, optional
            Whether to search for neighbours in the initial or final snapshot.
            By default `True`, i.e. the final snapshot.
        Returns
        -------
@ -358,35 +363,8 @@ class HaloCatalogue:
        """
        if not (X.ndim == 2 and X.shape[1] == 3):
            raise TypeError("`X` must be an array of shape `(n_samples, 3)`.")
-        # Query the KNN
+        knn = self.knn0 if select_initial else self.knn  # Pick the right KNN
-        return self._knn.radius_neighbors(X, radius, sort_results=True)
+        return knn.radius_neighbors(X, radius, sort_results=True)
    def radius_initial_neigbours(self, X, radius):
        r"""
        Return sorted nearest neigbours within `radius` or `X` in the initial
        snapshot.
        Parameters
        ----------
        X : 2-dimensional array
            Array of shape `(n_queries, 3)`, where the latter axis represents
            `x`, `y` and `z`.
        radius : float
            Limiting distance of neighbours.
        Returns
        -------
        dist : list of 1-dimensional arrays
            List of length `n_queries` whose elements are arrays of distances
            to the nearest neighbours.
        knns : list of 1-dimensional arrays
            List of length `n_queries` whose elements are arrays of indices of
            nearest neighbours in this catalogue.
        """
        if not (X.ndim == 2 and X.shape[1] == 3):
            raise TypeError("`X` must be an array of shape `(n_samples, 3)`.")
        # Query the KNN
        return self._knn0.radius_neighbors(X, radius, sort_results=True)
    @property
    def keys(self):
--- a/csiborgtools/units/box_units.py
+++ b/csiborgtools/units/box_units.py
@ -26,7 +26,7 @@ from ..read import ParticleReader
 # Map of unit conversions
 CONV_NAME = {
    "length": ["peak_x", "peak_y", "peak_z", "Rs", "rmin", "rmax", "r200",
-               "r500", "x0", "y0", "z0", "patch_size"],
+               "r500", "x0", "y0", "z0", "lagpatch"],
    "mass": ["mass_cl", "totpartmass", "m200", "m500", "mass_mmain"],
    "density": ["rho0"]
    }
--- a/scripts/run_crossmatch.py
+++ b/scripts/run_crossmatch.py
@ -14,6 +14,10 @@
 # 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 """
 MPI script to run the CSiBORG realisations matcher.
 TODO
 ----
    - [ ] Update this script
 """
 import numpy
 from datetime import datetime
--- a/scripts/run_initmatch.py
+++ b/scripts/run_initmatch.py
@ -20,6 +20,8 @@ Optionally also dumps the clumps information, however watch out as this will
 eat up a lot of memory.
 """
 import numpy
 from argparse import ArgumentParser
 from distutils.util import strtobool
 from datetime import datetime
 from mpi4py import MPI
 from os.path import join
@ -37,6 +39,11 @@ comm = MPI.COMM_WORLD
 rank = comm.Get_rank()
 nproc = comm.Get_size()
 # Argument parser
 parser = ArgumentParser()
 parser.add_argument("--dump_clumps", type=lambda x: bool(strtobool(x)))
 args = parser.parse_args()
 init_paths = csiborgtools.read.CSiBORGPaths(to_new=True)
 fin_paths = csiborgtools.read.CSiBORGPaths(to_new=False)
 nsims = init_paths.ic_ids
@ -80,7 +87,7 @@ for nsim in nsims:
    collect()
    if rank == 0:
-        print("{}: dumping clumps for simulation.".format(datetime.now()),
+        print("{}: dumping intermediate files.".format(datetime.now()),
              flush=True)
    # Grab unique clump IDs and loop over them
@ -93,18 +100,17 @@ for nsim in nsims:
        x0 = part0[clump_ids == n]
        # Center of mass and Lagrangian patch size
-        pos = numpy.vstack([x0[p] for p in ('x', 'y', 'z')]).T
+        dist, cm = csiborgtools.match.dist_centmass(x0)
-        cm = numpy.average(pos, axis=0, weights=x0['M'])
+        patch = csiborgtools.match.dist_percentile(dist, [99], distmax=0.075)
        patch_size = csiborgtools.match.lagpatch_size(
            *(x0[p] for p in ('x', 'y', 'z', 'M')))
        # Dump the center of mass
        with open(ftemp.format(nsim, n, "cm"), 'wb') as f:
            numpy.save(f, cm)
        # Dump the Lagrangian patch size
-        with open(ftemp.format(nsim, n, "patch_size"), 'wb') as f:
+        with open(ftemp.format(nsim, n, "lagpatch"), 'wb') as f:
-            numpy.save(f, patch_size)
+            numpy.save(f, patch)
        # Dump the entire clump
        if args.dump_clumps:
            with open(ftemp.format(nsim, n, "clump"), "wb") as f:
                numpy.save(f, x0)
@ -113,9 +119,10 @@ for nsim in nsims:
    comm.Barrier()
    if rank == 0:
-        print("Collecting CM files...", flush=True)
+        print("{}: collecting summary files...".format(datetime.now()),
              flush=True)
        # Collect the centre of masses, patch size, etc. and dump them
-        dtype = {"names": ['x', 'y', 'z', "patch_size", "ID"],
+        dtype = {"names": ['x', 'y', 'z', "lagpatch", "ID"],
                 "formats": [numpy.float32] * 4 + [numpy.int32]}
        out = numpy.full(njobs, numpy.nan, dtype=dtype)
@ -130,22 +137,25 @@ for nsim in nsims:
            remove(fpath)
            # Load in the patch size
-            fpath = ftemp.format(nsim, n, "patch_size")
+            fpath = ftemp.format(nsim, n, "lagpatch")
            with open(fpath, "rb") as f:
-                out["patch_size"][i] = numpy.load(f)
+                out["lagpatch"][i] = numpy.load(f)
            remove(fpath)
            # Store the halo ID
            out["ID"][i] = n
-        print("Dumping CM files to .. `{}`.".format(fpermcm.format(nsim)),
+        print("{}: dumping to .. `{}`.".format(
-              flush=True)
+            datetime.now(), fpermcm.format(nsim)), flush=True)
        with open(fpermcm.format(nsim), 'wb') as f:
            numpy.save(f, out)
-        print("Collecting clump files...", flush=True)
+        if args.dump_clumps:
            print("{}: collecting particle files...".format(datetime.now()),
                  flush=True)
            out = [None] * unique_clumpids.size
-        dtype = {"names": ["clump", "ID"], "formats": [object, numpy.int32]}
+            dtype = {"names": ["clump", "ID"],
                     "formats": [object, numpy.int32]}
            out = numpy.full(unique_clumpids.size, numpy.nan, dtype=dtype)
            for i, n in enumerate(unique_clumpids):
                fpath = ftemp.format(nsim, n, "clump")
@ -154,8 +164,8 @@ for nsim in nsims:
                out["clump"][i] = fin
                out["ID"][i] = n
                remove(fpath)
-        print("Dumping clump files to .. `{}`.".format(fpermpart.format(nsim)),
+            print("{}: dumping to .. `{}`.".format(
-              flush=True)
+                datetime.now(), fpermpart.format(nsim)), flush=True)
            with open(fpermpart.format(nsim), "wb") as f:
                numpy.save(f, out)
--- a/scripts/run_singlematch.py
+++ b/scripts/run_singlematch.py
@ -0,0 +1,66 @@
 # Copyright (C) 2022 Richard Stiskalek
 # This program is free software; you can redistribute it and/or modify it
 # under the terms of the GNU General Public License as published by the
 # Free Software Foundation; either version 3 of the License, or (at your
 # option) any later version.
 #
 # This program is distributed in the hope that it will be useful, but
 # WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
 # Public License for more details.
 #
 # You should have received a copy of the GNU General Public License along
 # with this program; if not, write to the Free Software Foundation, Inc.,
 # 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 """
 Script to test running the CSiBORG realisations matcher.
 """
 import numpy
 from argparse import ArgumentParser
 from distutils.util import strtobool
 from datetime import datetime
 from os.path import join
 try:
    import csiborgtools
 except ModuleNotFoundError:
    import sys
    sys.path.append("../")
    import csiborgtools
 import utils
 # Argument parser
 parser = ArgumentParser()
 parser.add_argument("--nmult", type=float)
 parser.add_argument("--overlap", type=lambda x: bool(strtobool(x)))
 parser.add_argument("--select_initial", type=lambda x: bool(strtobool(x)))
 parser.add_argument("--fast_neighbours", type=lambda x: bool(strtobool(x)))
 args = parser.parse_args()
 # File paths
 ic = 7468
 fperm = join(utils.dumpdir, "overlap", "cross_{}.npy")
 paths = csiborgtools.read.CSiBORGPaths(to_new=False)
 paths.set_info(ic, paths.get_maximum_snapshot(ic))
 print("{}: loading catalogues.".format(datetime.now()), flush=True)
 cat = csiborgtools.read.CombinedHaloCatalogue(paths)
 matcher = csiborgtools.match.RealisationsMatcher(cat)
 nsim0 = cat.n_sims[0]
 nsimx = cat.n_sims[1]
 print("{}: crossing the simulations.".format(datetime.now()), flush=True)
 out = matcher.cross_knn_position_single(
    0, nmult=args.nmult, dlogmass=2., overlap=args.overlap,
    select_initial=args.select_initial, fast_neighbours=args.fast_neighbours)
 # Dump the result
 fout = fperm.format(nsim0)
 print("Saving results to `{}`.".format(fout), flush=True)
 with open(fout, "wb") as f:
    numpy.save(fout, out)
 print("All finished.", flush=True)