Matching paper plots (#91)

* Fix calculations of expected mass

* Add paper plots

* Edits to plots

* Add overlap summary

* Add imports

* Add import

* Add binned stat

* Add fit

* Add more plots

* Add basic env

* Add histogram mode

* Edit expected mass

* Improve expected plots

* Clean up plot

* Improve separation plot

* Update plots

* Edit expected calculation

* Update plotting

* Update plots

* Update plots

* Update plots

* Add conc fraction

* Add halo maker sorting

* Renaming

* Add import

* Add NaN treatment

* add import

* Move cosine sim

* Update plots

* Move similarity

* Fix little bugs

* Shorten documentation

* Update plots
Richard Stiskalek 2023-10-17 12:11:15 +01:00 committed by GitHub
parent 136c552369
commit 5500fbd2b9
11 changed files with 2193 additions and 294 deletions

View File

@@ -12,10 +12,12 @@
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
from csiborgtools import clustering, field, match, read, summary # noqa
from .utils import (center_of_mass, delta2ncells, number_counts, # noqa
periodic_distance, periodic_distance_two_points, # noqa
binned_statistic, cosine_similarity) # noqa
from .utils import (center_of_mass, delta2ncells, number_counts, # noqa
periodic_distance, periodic_distance_two_points) # noqa
# Arguments to csiborgtools.read.Paths.
paths_glamdring = {"srcdir": "/mnt/extraspace/hdesmond/",

View File

@@ -14,5 +14,5 @@
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
from .match import (ParticleOverlap, RealisationsMatcher, # noqa
calculate_overlap, calculate_overlap_indxs, pos2cell, # noqa
cosine_similarity, find_neighbour, get_halo_cell_limits, # noqa
find_neighbour, get_halo_cell_limits, # noqa
matching_max) # noqa

View File

@@ -660,19 +660,12 @@ class ParticleOverlap(BaseMatcher):
def pos2cell(pos, ncells):
"""
Convert position to cell number if there are `ncells` cells along the axis.
"""
if pos.dtype.char in numpy.typecodes["AllInteger"]:
return pos
return numpy.floor(pos * ncells).astype(numpy.int32)
def read_nshift(smooth_kwargs):
"""
Determine the number of cells to pad the density field if smoothing is
applied. Defaults to the ceiling of three times the smoothing scale.
"""
return 0 if smooth_kwargs is None else ceil(3 * smooth_kwargs["sigma"])
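For orientation, a minimal sketch of what these two helpers compute (assuming positions are normalised to [0, 1) box units; all numbers invented):

import numpy
from math import ceil

ncells = 512
pos = numpy.array([[0.1, 0.5, 0.999]], dtype=numpy.float32)
numpy.floor(pos * ncells).astype(numpy.int32)  # pos2cell -> [[51, 256, 511]]

smooth_kwargs = {"sigma": 1.5}  # hypothetical smoothing scale in cell units
ceil(3 * smooth_kwargs["sigma"])  # read_nshift -> 5 cells of padding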
@@ -774,33 +767,26 @@ def get_halo_cell_limits(pos, ncells, nshift=0):
return mins, maxs
@jit(nopython=True)
@jit(nopython=True, boundscheck=False)
def calculate_overlap(delta1, delta2, cellmins, delta_bckg, box_size,
bckg_halfsize):
r"""
Overlap between two halos whose density fields are evaluated on the
same grid. This is a JIT implementation, hence it is outside of the main
class.
"""
Calculate overlap between two halos' density fields on the same grid.
Parameters
----------
delta1: 3-dimensional array
Density field of the first halo.
delta2 : 3-dimensional array
Density field of the second halo.
cellmins : len-3 tuple
Tuple of lower cell ID in the full box.
delta_bckg : 3-dimensional array
Summed background density field of the reference and cross simulations
calculated with particles assigned to halos at the final snapshot.
Calculated on a grid determined by `bckg_halfsize`.
delta1, delta2 : 3D array
Density fields of the first and second halos, respectively.
cellmins : tuple (len=3)
Lower cell ID in the full box.
delta_bckg : 3D array
Combined background density field of reference and cross simulations
on `bckg_halfsize` grid.
box_size : int
Number of cells in the box.
Cell count in the box.
bckg_halfsize : int
Background half-size for density field calculation. This is the
grid distance from the center of the box to each side over which to
evaluate the background density field. Must be less than or equal to
half the box size.
Grid distance from box center for background density; must be at
most 0.5 * box_size.
Returns
-------
@@ -834,39 +820,29 @@ def calculate_overlap(delta1, delta2, cellmins, delta_bckg, box_size,
return intersect / (totmass - intersect)
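Note the returned statistic: assuming `totmass` is the combined mass of the two haloes, `intersect / (totmass - intersect)` equals intersect / (mass1 + mass2 - intersect), an intersection-over-union style overlap bounded between 0 and 1.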
@jit(nopython=True)
@jit(nopython=True, boundscheck=False)
def calculate_overlap_indxs(delta1, delta2, cellmins, delta_bckg, nonzero,
mass1, mass2, box_size, bckg_halfsize):
r"""
Overlap between two haloes whose density fields are evaluated on the
same grid, where `nonzero` enumerates the non-zero cells of `delta1`. This is
a JIT implementation, hence it is outside of the main class.
"""
Calculate overlap of two halos' density fields on the same grid.
Parameters
----------
delta1: 3-dimensional array
Density field of the first halo.
delta2 : 3-dimensional array
Density field of the second halo.
cellmins : len-3 tuple
Tuple of lower cell ID in the full box.
delta_bckg : 3-dimensional array
Summed background density field of the reference and cross simulations
calculated with particles assigned to halos at the final snapshot.
Calculated on a grid determined by `bckg_halfsize`.
nonzero : 2-dimensional array of shape `(n_cells, 3)`
Indices of cells that are non-zero of the lower mass halo. Expected to
be precomputed from `fill_delta_indxs`.
mass1, mass2 : floats, optional
Total masses of the two haloes, respectively. If not provided,
calculated directly from the density field.
delta1, delta2 : 3D array
Density fields of the first and second halos, respectively.
cellmins : tuple (len=3)
Lower cell ID in the full box.
delta_bckg : 3D array
Combined background density from reference and cross simulations
on `bckg_halfsize` grid.
nonzero : 2D array (shape: (n_cells, 3))
Non-zero cells for the lower mass halo (from `fill_delta_indxs`).
mass1, mass2 : float, optional
Halos' total masses. Calculated from density if not provided.
box_size : int
Number of cells in the box.
Cell count in the box.
bckg_halfsize : int
Background half-size for density field calculation. This is the
grid distance from the center of the box to each side over which to
evaluate the background density field. Must be less than or equal to
half the box size.
Grid distance from box center for background density; must be at most 0.5 * box_size.
Returns
-------
@@ -1039,35 +1015,6 @@ def find_neighbour(nsim0, cats):
return dists, cross_hindxs
def cosine_similarity(x, y):
r"""
Calculate the cosine similarity between two Cartesian vectors. Defined
as :math:`\sum_i x_i y_i / (|x| |y|)`.
Parameters
----------
x : 1-dimensional array
The first vector.
y : 1- or 2-dimensional array
The second vector. Can be 2-dimensional of shape `(n_samples, 3)`,
in which case the calculation is broadcasted.
Returns
-------
out : float or 1-dimensional array
"""
if x.ndim != 1:
raise ValueError("`x` must be a 1-dimensional array.")
if y.ndim == 1:
y = y.reshape(1, -1)
out = numpy.sum(x * y, axis=1)
out /= numpy.linalg.norm(x) * numpy.linalg.norm(y, axis=1)
return out[0] if out.size == 1 else out
def matching_max(cat0, catx, mass_kind, mult, periodic, overlap=None,
match_indxs=None, verbose=True):
"""

View File

@@ -16,7 +16,11 @@
from .knn_summary import kNNCDFReader # noqa
from .nearest_neighbour_summary import NearestNeighbourReader # noqa
from .overlap_summary import weighted_stats # noqa
from .overlap_summary import NPairsOverlap, PairOverlap, get_cross_sims # noqa
from .overlap_summary import (NPairsOverlap, PairOverlap, get_cross_sims, # noqa
max_overlap_agreement, max_overlap_agreements, # noqa
find_peak) # noqa
from .pk_summary import PKReader # noqa
from .tpcf_summary import TPCFReader # noqa
from .field_interp import read_interpolated_field # noqa
from .field_interp import (read_interpolated_field, # noqa
bayesian_bootstrap_correlation, # noqa
correlate_at_fixed_smoothing) # noqa

View File

@@ -15,6 +15,12 @@
import numpy
from tqdm import tqdm, trange
from numba import jit
###############################################################################
# Read in the field values at the galaxy positions #
###############################################################################
def read_interpolated_field(survey_name, kind, galaxy_index, paths, MAS, grid,
@@ -75,3 +81,142 @@ def read_interpolated_field(survey_name, kind, galaxy_index, paths, MAS, grid,
ks[i] = j
return out[:, ks, :]
###############################################################################
# Calculate the Bayesian bootstrapped correlation #
###############################################################################
@jit(nopython=True, fastmath=True, boundscheck=False)
def dot_product(x, y):
tot = 0.0
for i in range(len(x)):
tot += x[i] * y[i]
return tot
@jit(nopython=True, fastmath=True, boundscheck=False)
def cov(x, y, mean_x, mean_y, weights):
tot = 0.0
for i in range(len(x)):
tot += (x[i] - mean_x) * (y[i] - mean_y) * weights[i]
return tot
@jit(nopython=True, fastmath=True, boundscheck=False)
def var(x, mean_x, weights):
tot = 0.0
for i in range(len(x)):
tot += (x[i] - mean_x)**2 * weights[i]
return tot
@jit(nopython=True, fastmath=True, boundscheck=False)
def weighted_correlation(x, y, weights):
mean_x = dot_product(x, weights)
mean_y = dot_product(y, weights)
cov_xy = cov(x, y, mean_x, mean_y, weights)
var_x = var(x, mean_x, weights)
var_y = var(y, mean_y, weights)
return cov_xy / numpy.sqrt(var_x * var_y)
@jit(nopython=True, fastmath=True, boundscheck=False)
def _bayesian_bootstrap_correlation(x, y, weights):
nweights = len(weights)
bootstrapped_correlations = numpy.full(nweights, numpy.nan, dtype=x.dtype)
for i in range(nweights):
bootstrapped_correlations[i] = weighted_correlation(x, y, weights[i])
return bootstrapped_correlations
@jit(nopython=True, fastmath=True, boundscheck=False)
def rank(x):
order = numpy.argsort(x)
ranks = order.argsort()
return ranks
@jit(nopython=True, fastmath=True, boundscheck=False)
def bayesian_bootstrap_correlation(x, y, kind="spearman", n_bootstrap=10000):
"""
Calculate the Bayesian bootstrapped correlation between two arrays.
Parameters
----------
x, y : 1-dimensional arrays
The two arrays to calculate the correlation between.
kind : str, optional
The type of correlation to calculate. Either `spearman` or `pearson`.
n_bootstrap : int, optional
The number of bootstrap samples to use.
Returns
-------
corr : 1-dimensional array of shape `(n_bootstrap,)`
"""
if len(x) != len(y):
raise ValueError("Input arrays must have the same length")
if kind not in ["spearman", "pearson"]:
raise ValueError("kind must be either `spearman` or `pearson`")
if kind == "spearman":
dtype = x.dtype
x = rank(x).astype(dtype)
y = rank(y).astype(dtype)
alphas = numpy.ones(len(x), dtype=x.dtype)
weights = numpy.random.dirichlet(alphas, size=n_bootstrap)
return _bayesian_bootstrap_correlation(x, y, weights)
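A hypothetical usage sketch (the import path assumes the re-export shown in the `__init__` diff above; data invented):

import numpy
from csiborgtools.summary import bayesian_bootstrap_correlation

rng = numpy.random.default_rng(42)
x = rng.normal(size=500)
y = 0.5 * x + rng.normal(scale=0.5, size=500)
corr = bayesian_bootstrap_correlation(x, y, kind="spearman", n_bootstrap=1000)
corr.mean(), corr.std()  # posterior mean and spread of the rank correlation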
###############################################################################
# Distribution disagreement #
###############################################################################
def distribution_disagreement(x, y):
"""
Think about this more when stacking non-Gaussian distributions.
"""
delta = x - y
return numpy.abs(delta.mean()) / delta.std()
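A toy example of the statistic (numbers invented): for two roughly Gaussian samples with means 0.30 and 0.25 and equal standard deviations of 0.05, `delta = x - y` has mean ~0.05 and std ~0.07, so the disagreement is ~0.7 sigma:

import numpy
rng = numpy.random.default_rng(1)
x = rng.normal(0.30, 0.05, 10000)
y = rng.normal(0.25, 0.05, 10000)
distribution_disagreement(x, y)  # ~ 0.05 / (0.05 * sqrt(2)) ~ 0.7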
"""
field will be of value (nsims, ngal, nsmooth)
Calculate the correlation for each sim and smoothing scale (nsims, nsmooth)
For each of the above stack the distributions?
"""
def correlate_at_fixed_smoothing(field_values, galaxy_property,
kind="spearman", n_bootstrap=1000):
galaxy_property = galaxy_property.astype(field_values.dtype)
nsims = len(field_values)
distributions = numpy.empty((nsims, n_bootstrap), dtype=field_values.dtype)
for i in trange(nsims):
distributions[i] = bayesian_bootstrap_correlation(
field_values[i], galaxy_property, kind=kind, n_bootstrap=n_bootstrap)
return distributions
def do_something(field_values, galaxy_property):
pass

View File

@@ -23,6 +23,30 @@ from tqdm import tqdm, trange
from ..utils import periodic_distance
###############################################################################
# Utility functions #
###############################################################################
def find_peak(x, weights, shrink=0.95, min_obs=5):
"""
Find the peak of a 1D distribution using a shrinking window.
"""
assert shrink <= 1.
xmin, xmax = numpy.min(x), numpy.max(x)
xpos = (xmax + xmin) / 2
rad = (xmax - xmin) / 2
while True:
mask = numpy.abs(x - xpos) < rad
if mask.sum() < min_obs:
return xpos
xpos = numpy.average(x[mask], weights=weights[mask])
rad *= shrink
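A quick sketch of `find_peak` on toy data (values invented); for a roughly symmetric sample the shrinking window settles near the mode:

import numpy
rng = numpy.random.default_rng(0)
x = rng.normal(2.0, 0.5, 2000)
weights = rng.random(2000)
find_peak(x, weights)  # ~ 2.0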
###############################################################################
# Overlap of two simulations #
###############################################################################
@@ -251,42 +275,16 @@ class PairOverlap:
----------
from_smoothed : bool
Whether to use the smoothed overlap or not.
Returns
-------
summed_overlap : 1-dimensional array of shape `(nhalos, )`
"""
overlap = self.overlap(from_smoothed)
out = numpy.full(len(overlap), numpy.nan, dtype=numpy.float32)
out = numpy.zeros(len(overlap), dtype=numpy.float32)
for i in range(len(overlap)):
if len(overlap[i]) > 0:
out[i] = numpy.sum(overlap[i])
else:
out[i] = 0
return out
def prob_nomatch(self, from_smoothed):
"""
Probability of no match for each halo in the reference simulation with
the cross simulation. Defined as a product of 1 - overlap with other
halos.
Parameters
----------
from_smoothed : bool
Whether to use the smoothed overlap or not.
Returns
-------
prob_nomatch : 1-dimensional array of shape `(nhalos, )`
"""
overlap = self.overlap(from_smoothed)
out = numpy.full(len(overlap), numpy.nan, dtype=numpy.float32)
for i in range(len(overlap)):
if len(overlap[i]) > 0:
out[i] = numpy.product(numpy.subtract(1, overlap[i]))
else:
out[i] = 1
return out
def dist(self, in_initial, boxsize, norm_kind=None):
@@ -308,8 +306,7 @@
-------
dist : array of 1-dimensional arrays of shape `(nhalos, )`
"""
assert (norm_kind is None
or norm_kind in ("r200c", "ref_patch", "sum_patch"))
assert (norm_kind is None or norm_kind in ("r200c", "ref_patch", "sum_patch")) # noqa
# Get positions either in the initial or final snapshot
pos0 = self.cat0().position(in_initial=in_initial)
posx = self.catx().position(in_initial=in_initial)
@@ -400,60 +397,6 @@ class PairOverlap:
return out
def counterpart_mass(self, from_smoothed, overlap_threshold=0.,
mass_kind="totpartmass"):
"""
Calculate the expected counterpart mass of each halo in the reference
simulation from the crossed simulation.
Parameters
----------
from_smoothed : bool
Whether to use the smoothed overlap or not.
overlap_threshold : float, optional
Minimum overlap required for a halo to be considered a match. By
default 0.0, i.e. no threshold.
mass_kind : str, optional
The mass kind whose ratio is to be calculated. Must be a valid
catalogue key. By default `totpartmass`, i.e. the total particle
mass associated with a halo.
Returns
-------
mean, std : 1-dimensional arrays of shape `(nhalos, )`
"""
mean = numpy.full(len(self), numpy.nan, dtype=numpy.float32)
std = numpy.full(len(self), numpy.nan, dtype=numpy.float32)
massx = self.catx(mass_kind) # Create references to speed
overlap = self.overlap(from_smoothed) # up the loop below
for i, match_ind in enumerate(self["match_indxs"]):
# Skip if no match
if len(match_ind) == 0:
continue
massx_ = massx[match_ind] # Again just create references
overlap_ = overlap[i] # to the appropriate elements
# Optionally apply overlap threshold
if overlap_threshold > 0.:
mask = overlap_ > overlap_threshold
if numpy.sum(mask) == 0:
continue
massx_ = massx_[mask]
overlap_ = overlap_[mask]
massx_ = numpy.log10(massx_)
# Weighted average and *biased* standard deviation
mean_ = numpy.average(massx_, weights=overlap_)
std_ = numpy.average((massx_ - mean_)**2, weights=overlap_)**0.5
mean[i] = mean_
std[i] = std_
return mean, std
def copy_per_match(self, par):
"""
Make an array like `self.match_indxs` where each of its element is an
@@ -525,6 +468,82 @@ class PairOverlap:
return self["match_indxs"].size
###############################################################################
# Support functions for pair overlaps #
###############################################################################
def max_overlap_agreement(cat0, catx, min_logmass, maxdist, paths):
r"""
For each halo `A` from catalogue `cat0`, determine whether the halo `B`
from catalogue `catx` with which `A` has maximum overlap also has its
maximum overlap with `A`.
Parameters
----------
cat0 : instance of :py:class:`csiborgtools.read.BaseCatalogue`
Halo catalogue corresponding to the reference simulation.
catx : instance of :py:class:`csiborgtools.read.BaseCatalogue`
Halo catalogue corresponding to the cross simulation.
min_logmass : float
Minimum halo mass in :math:`\log_{10} M_\odot / h` to consider.
maxdist : float
Maximum halo distance in :math:`\mathrm{Mpc} / h` from the centre
of the high-resolution region.
paths : :py:class:`csiborgtools.read.Paths`
CSiBORG paths object.
Returns
-------
agreement : 1-dimensional array of shape `(nhalos, )`
"""
kwargs = {"paths": paths, "min_logmass": min_logmass, "maxdist": maxdist}
pair_forward = PairOverlap(cat0, catx, **kwargs)
pair_backward = PairOverlap(catx, cat0, **kwargs)
nhalos = len(pair_forward.cat0())
agreement = numpy.full(nhalos, numpy.nan, dtype=numpy.float32)
for i in range(nhalos):
match_indxs_forward = pair_forward["match_indxs"][i]
if len(match_indxs_forward) == 0:
continue
overlap_forward = pair_forward["smoothed_overlap"][i]
kmax = match_indxs_forward[numpy.argmax(overlap_forward)]
match_indxs_backward = pair_backward["match_indxs"][kmax]
overlap_backward = pair_backward["smoothed_overlap"][kmax]
imatch = match_indxs_backward[numpy.argmax(overlap_backward)]
agreement[i] = imatch == i
return agreement
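The reciprocity test in isolation, on toy match arrays rather than the catalogue machinery (all indices and overlaps invented):

import numpy
# Forward: matches of cat0 halos 0 and 1 in catx, with their overlaps.
match_forward = [numpy.array([3, 7]), numpy.array([2])]
overlap_forward = [numpy.array([0.6, 0.1]), numpy.array([0.9])]
# Backward: matches of catx halos 3 and 2 back in cat0.
match_backward = {3: numpy.array([5, 0]), 2: numpy.array([1])}
overlap_backward = {3: numpy.array([0.7, 0.2]), 2: numpy.array([0.8])}

for i in range(2):
    k = match_forward[i][numpy.argmax(overlap_forward[i])]
    imatch = match_backward[k][numpy.argmax(overlap_backward[k])]
    print(i, imatch == i)  # 0 False (halo 3 prefers halo 5), 1 True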
def max_overlap_agreements(cat0, catxs, min_logmass, maxdist, paths,
verbose=True):
"""
Repeat `max_overlap_agreement` for many cross simulations.
Parameters
----------
...
Returns
-------
agreements : 2-dimensional array of shape `(ncatxs, nhalos)`
"""
agreements = [None] * len(catxs)
desc = "Calculating maximum overlap agreement"
for i, catx in enumerate(tqdm(catxs, desc=desc, disable=not verbose)):
agreements[i] = max_overlap_agreement(cat0, catx, min_logmass,
maxdist, paths)
return numpy.asanyarray(agreements)
def weighted_stats(x, weights, min_weight=0, verbose=False):
"""
Calculate the weighted mean and standard deviation of `x` using `weights`
@@ -544,11 +563,10 @@ def weighted_stats(x, weights, min_weight=0, verbose=False):
Returns
-------
stat : 2-dimensional array of shape `(len(x), 2)`
The first column is the weighted mean and the second column is the
weighted standard deviation.
mu, std : 1-dimensional arrays of shape `(len(x), )`
"""
out = numpy.full((x.size, 2), numpy.nan, dtype=numpy.float32)
mu = numpy.full(x.size, numpy.nan, dtype=numpy.float32)
std = numpy.full(x.size, numpy.nan, dtype=numpy.float32)
for i in trange(len(x), disable=not verbose):
x_, w_ = numpy.asarray(x[i]), numpy.asarray(weights[i])
@@ -557,9 +575,9 @@ def weighted_stats(x, weights, min_weight=0, verbose=False):
w_ = w_[mask]
if len(w_) == 0:
continue
out[i, 0] = numpy.average(x_, weights=w_)
out[i, 1] = numpy.average((x_ - out[i, 0])**2, weights=w_)**0.5
return out
mu[i] = numpy.average(x_, weights=w_)
std[i] = numpy.average((x_ - mu[i])**2, weights=w_)**0.5
return mu, std
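Usage sketch with ragged per-halo arrays (values invented; the inputs are object arrays so that `x.size` counts halos):

import numpy
x = numpy.empty(3, dtype=object)
w = numpy.empty(3, dtype=object)
x[0], w[0] = numpy.array([13.1, 13.3]), numpy.array([0.7, 0.3])
x[1], w[1] = numpy.array([14.0]), numpy.array([0.9])
x[2], w[2] = numpy.array([]), numpy.array([])
mu, std = weighted_stats(x, w)
# mu -> [13.16, 14.0, nan]; halos with no weights above `min_weight` stay NaN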
###############################################################################
@@ -684,92 +702,87 @@ class NPairsOverlap:
out[i] = pair.summed_overlap(from_smoothed)
return numpy.vstack(out).T
def prob_nomatch(self, from_smoothed, verbose=True):
"""
Probability of no match for each halo in the reference simulation with
the cross simulation.
Parameters
----------
from_smoothed : bool
Whether to use the smoothed overlap or not.
verbose : bool, optional
Verbosity flag.
Returns
-------
prob_nomatch : 2-dimensional array of shape `(nhalos, ncatxs)`
"""
iterator = tqdm(self.pairs,
desc="Calculating probability of no match",
disable=not verbose
)
out = [None] * len(self)
for i, pair in enumerate(iterator):
out[i] = pair.prob_nomatch(from_smoothed)
return numpy.vstack(out).T
def expected_property_single(self, k, key, from_smoothed, in_log=True):
ys = [None] * len(self)
overlaps = [None] * len(self)
for i, pair in enumerate(self):
overlap = pair.overlap(from_smoothed)
if len(overlap[k]) == 0:
ys[i] = numpy.nan
overlaps[i] = numpy.nan
continue
match_indxs = pair["match_indxs"]
j = numpy.argmax(overlap[k])
ys[i] = pair.catx(key)[match_indxs[k][j]]
if in_log:
ys[i] = numpy.log10(ys[i])
overlaps[i] = overlap[k][j]
return ys, overlaps
def counterpart_mass(self, from_smoothed, overlap_threshold=0.,
mass_kind="totpartmass", return_full=False,
verbose=True):
def expected_property(self, key, from_smoothed, min_logmass,
in_log=True, mass_kind="totpartmass", verbose=True):
"""
Calculate the expected counterpart mass of each halo in the reference
simulation from the crossed simulation.
Parameters
----------
key : str
Property key.
from_smoothed : bool
Whether to use the smoothed overlap or not.
overlap_threshold : float, optional
Minimum overlap required for a halo to be considered a match. By
default 0.0, i.e. no threshold.
min_logmass : float
Minimum log mass of reference halos to consider.
in_log : bool, optional
Whether to calculate the expected property in log10.
mass_kind : str, optional
The mass kind whose ratio is to be calculated. Must be a valid
catalogue key. By default `totpartmass`, i.e. the total particle
mass associated with a halo.
return_full : bool, optional
Whether to return the full results of matching each pair or
calculate summary statistics by Gaussian averaging.
verbose : bool, optional
Verbosity flag.
Returns
-------
mu, std : 1-dimensional arrays of shape `(nhalos,)`
Summary expected mass and standard deviation from all cross
simulations.
mus, stds : 2-dimensional arrays of shape `(nhalos, ncatx)`, optional
Expected mass and standard deviation from each cross simulation.
Returned only if `return_full` is `True`.
mean_expected : 1-dimensional array of shape `(nhalos, )`
Expected property from all cross simulations.
std_expected : 1-dimensional array of shape `(nhalos, )`
Standard deviation of the expected property.
"""
iterator = tqdm(self.pairs,
desc="Calculating counterpart masses",
disable=not verbose)
mus, stds = [None] * len(self), [None] * len(self)
for i, pair in enumerate(iterator):
mus[i], stds[i] = pair.counterpart_mass(
from_smoothed=from_smoothed,
overlap_threshold=overlap_threshold, mass_kind=mass_kind)
mus, stds = numpy.vstack(mus).T, numpy.vstack(stds).T
log_mass0 = numpy.log10(self.cat0(mass_kind))
ntot = len(log_mass0)
mean_expected = numpy.full(ntot, numpy.nan, dtype=numpy.float32)
std_expected = numpy.full(ntot, numpy.nan, dtype=numpy.float32)
# Prob of > 0 matches
probmatch = 1 - self.prob_nomatch(from_smoothed)
# Normalise it for weighted sums etc.
norm_probmatch = numpy.apply_along_axis(
lambda x: x / numpy.sum(x), axis=1, arr=probmatch)
indxs = numpy.where(log_mass0 > min_logmass)[0]
for i in tqdm(indxs, disable=not verbose,
desc="Calculating expectation"):
ys = numpy.full(len(self), numpy.nan, dtype=numpy.float32)
weights = numpy.full(len(self), numpy.nan, dtype=numpy.float32)
for j, pair in enumerate(self):
overlap = pair.overlap(from_smoothed)
if len(overlap[i]) == 0:
continue
# Mean and standard deviation of weighted stacked Gaussians
mu = numpy.sum((norm_probmatch * mus), axis=1)
std = numpy.sum((norm_probmatch * (mus**2 + stds**2)), axis=1) - mu**2
std **= 0.5
k = numpy.argmax(overlap[i])
ys[j] = pair.catx(key)[pair["match_indxs"][i][k]]
weights[j] = overlap[i][k]
mask = mu <= 0
mu[mask] = numpy.nan
std[mask] = numpy.nan
if in_log:
ys[j] = numpy.log10(ys[j])
if return_full:
return mu, std, mus, stds
return mu, std
mask = numpy.isfinite(ys) & numpy.isfinite(weights)
if numpy.sum(mask) <= 2:
continue
mean_expected[i] = find_peak(ys[mask], weights=weights[mask])
std_expected[i] = numpy.average((ys[mask] - mean_expected[i])**2,
weights=weights[mask])**0.5
print(log_mass0[i], mean_expected[i], std_expected[i])
return mean_expected, std_expected
@property
def pairs(self):

View File

@@ -150,6 +150,35 @@ def radec_to_cartesian(X):
]).T
def cosine_similarity(x, y):
r"""
Calculate the cosine similarity between two Cartesian vectors. Defined
as :math:`\sum_i x_i y_i / (|x| |y|)`.
Parameters
----------
x : 1-dimensional array
The first vector.
y : 1- or 2-dimensional array
The second vector. Can be 2-dimensional of shape `(n_samples, 3)`,
in which case the calculation is broadcasted.
Returns
-------
out : float or 1-dimensional array
"""
if x.ndim != 1:
raise ValueError("`x` must be a 1-dimensional array.")
if y.ndim == 1:
y = y.reshape(1, -1)
out = numpy.sum(x * y, axis=1)
out /= numpy.linalg.norm(x) * numpy.linalg.norm(y, axis=1)
return out[0] if out.size == 1 else out
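Behaviour sketch (toy vectors): broadcasting one reference vector against a batch returns one similarity per row:

import numpy
x = numpy.array([1.0, 0.0, 0.0])
y = numpy.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [1.0, 1.0, 0.0]])
cosine_similarity(x, y)  # -> [1.0, 0.0, 0.7071]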
def real2redshift(pos, vel, observer_location, observer_velocity, box,
periodic_wrap=True, make_copy=True):
r"""
@@ -219,3 +248,17 @@ def number_counts(x, bin_edges):
for i in range(bin_edges.size - 1):
out[i] = numpy.sum((x >= bin_edges[i]) & (x < bin_edges[i + 1]))
return out
def binned_statistic(x, y, left_edges, bin_width, statistic):
"""
Calculate a binned `statistic` of `y` in bins of `x` given by
`left_edges` and a fixed `bin_width`.
"""
out = numpy.full(left_edges.size, numpy.nan, dtype=x.dtype)
for i in range(left_edges.size):
mask = (x >= left_edges[i]) & (x < left_edges[i] + bin_width)
if numpy.any(mask):
out[i] = statistic(y[mask])
return out
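Usage sketch (toy data; any reduction works as `statistic`, e.g. `numpy.median`):

import numpy
x = numpy.array([0.1, 0.4, 0.6, 1.2, 1.3])
y = numpy.array([1.0, 3.0, 2.0, 10.0, 20.0])
left_edges = numpy.arange(0.0, 2.0, 0.5)
binned_statistic(x, y, left_edges, bin_width=0.5, statistic=numpy.median)
# -> [2.0, 2.0, 15.0, nan]  (last bin [1.5, 2.0) is empty)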

scripts/sort_halomaker.py (new file, 100 lines)
View File

@@ -0,0 +1,100 @@
# Copyright (C) 2022 Richard Stiskalek
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""
Script to sort the HaloMaker's `particle_membership` file to match the ordering
of particles in the simulation snapshot.
"""
from argparse import ArgumentParser
from datetime import datetime
from glob import iglob
import h5py
import numpy
import pynbody
from mpi4py import MPI
from taskmaster import work_delegation
from tqdm import trange
import csiborgtools
def sort_particle_membership(nsim, nsnap, method):
"""
Read the HaloMaker particle halo membership and sort the halo IDs to
match the ordering of particles in the simulation snapshot.
Parameters
----------
nsim : int
IC realisation index.
nsnap : int
Snapshot index.
method : str
HaloMaker method.
"""
print(f"{datetime.now()}: starting simulation {nsim}, snapshot {nsnap} and method {method}.") # noqa
paths = csiborgtools.read.Paths(**csiborgtools.paths_glamdring)
fpath = next(iglob(f"/mnt/extraspace/rstiskalek/CSiBORG/halo_maker/ramses_{nsim}/output_{str(nsnap).zfill(5)}/**/*particle_membership*", recursive=True), None) # noqa
print(f"{datetime.now()}: loading particle membership `{fpath}`.")
# Columns are halo ID, particle ID
membership = numpy.genfromtxt(fpath, dtype=int)
print(f"{datetime.now()}: loading particle IDs from the snapshot.")
sim = pynbody.load(paths.snapshot(nsnap, nsim, "csiborg"))
pids = numpy.asanyarray(sim["iord"])
print(f"{datetime.now()}: mapping particle IDs to their indices.")
pids_idx = {pid: i for i, pid in enumerate(pids)}
print(f"{datetime.now()}: mapping HIDs to their array indices.")
# Unassigned particle IDs are assigned a halo ID of 0.
hids = numpy.zeros(pids.size, dtype=numpy.int32)
for i in trange(membership.shape[0]):
hid, pid = membership[i]
hids[pids_idx[pid]] = hid
fout = fpath + "_sorted.hdf5"
print(f"{datetime.now()}: saving the sorted data to ... `{fout}`")
header = """
This dataset represents halo indices for each particle.
- The particles are ordered as they appear in the simulation snapshot.
- Unassigned particles are given a halo index of 0.
"""
with h5py.File(fout, 'w') as hdf:
dset = hdf.create_dataset('hids_dataset', data=hids)
dset.attrs['header'] = header
if __name__ == "__main__":
parser = ArgumentParser()
parser.add_argument("--method", type=str, required=True,
help="HaloMaker method")
parser.add_argument("--nsim", type=int, required=False, default=None,
help="IC index. If not set process all.")
args = parser.parse_args()
paths = csiborgtools.read.Paths(**csiborgtools.paths_glamdring)
if args.nsim is None:
ics = paths.get_ics("csiborg")
else:
ics = [args.nsim]
snaps = numpy.array([max(paths.get_snapshots(nsim, "csiborg"))
for nsim in ics])
def main(n):
sort_particle_membership(ics[n], snaps[n], args.method)
work_delegation(main, list(range(len(ics))), MPI.COMM_WORLD)
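A typical invocation under MPI might look like the following (method name and simulation index are hypothetical):

mpirun -n 4 python sort_halomaker.py --method FOF --nsim 7444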

View File

@@ -29,16 +29,9 @@ import numpy
from mpi4py import MPI
from taskmaster import work_delegation
import csiborgtools
from utils import get_nsims
try:
import csiborgtools
except ModuleNotFoundError:
import sys
sys.path.append("../")
import csiborgtools
def _main(nsim, simname, verbose):
"""
@@ -51,9 +44,7 @@ def _main(nsim, simname, verbose):
else:
partreader = csiborgtools.read.QuijoteReader(paths)
if verbose:
print(f"{datetime.now()}: reading and processing simulation `{nsim}`.",
flush=True)
print(f"{datetime.now()}: processing simulation `{nsim}`.", flush=True)
# We first load the particle IDs in the final snapshot.
pidf = csiborgtools.read.read_h5(paths.particles(nsim, simname))
pidf = pidf["particle_ids"]

scripts_plots/paper_match.py (new file, 1632 lines)

File diff suppressed because it is too large.

View File

@@ -66,6 +66,12 @@ def plot_mass_vs_ncells(nsim, pdf=False):
cat = open_csiborg(nsim)
mpart = 4.38304044e+09
x = numpy.log10(cat["totpartmass"])
y = numpy.log10(cat["lagpatch_ncells"])
p = numpy.polyfit(x, y, 1)
print("Fitted parameters are: ", p)
with plt.style.context(plt_utils.mplstyle):
plt.figure()
plt.scatter(cat["totpartmass"], cat["lagpatch_ncells"], s=0.25,
@@ -105,9 +111,9 @@ def plot_hmf(pdf=False):
csiborg_counts = numpy.full((len(csiborg_nsims), len(bins) - 1),
numpy.nan, dtype=numpy.float32)
csiborg_counts[i, :] = data["counts"]
# csiborg_counts /= numpy.diff(bins).reshape(1, -1)
csiborg_counts /= numpy.diff(bins).reshape(1, -1)
csiborg5511 = numpy.load(paths.halo_counts("csiborg", 5511))["counts"]
# csiborg5511 = numpy.load(paths.halo_counts("csiborg", 5511))["counts"]
# csiborg5511 /= numpy.diff(data["bins"])
print("Loading Quijote halo counts.", flush=True)
@@ -121,73 +127,89 @@
(len(quijote_nsims) * nmax, len(bins) - 1), numpy.nan,
dtype=numpy.float32)
quijote_counts[i * nmax:(i + 1) * nmax, :] = data["counts"]
# quijote_counts /= numpy.diff(bins).reshape(1, -1)
quijote_counts /= numpy.diff(bins).reshape(1, -1)
# vol = 155.5**3
# csiborg_counts /= vol
# quijote_counts /= vol
vol = 4 * numpy.pi / 3 * 155.5**3
csiborg_counts /= vol
quijote_counts /= vol
# csiborg5511 /= vol
x = 10**(0.5 * (bins[1:] + bins[:-1]))
# Edit lower limits
csiborg_counts[:, x < 1e12] = numpy.nan
csiborg_counts[:, x < 10**13.1] = numpy.nan
quijote_counts[:, x < 10**(13.1)] = numpy.nan
# Edit upper limits
csiborg_counts[:, x > 3e15] = numpy.nan
quijote_counts[:, x > 3e15] = numpy.nan
csiborg5511[x > 3e15] = numpy.nan
# csiborg5511[x > 3e15] = numpy.nan
with plt.style.context(plt_utils.mplstyle):
cols = plt.rcParams["axes.prop_cycle"].by_key()["color"]
fig, ax = plt.subplots(nrows=2, sharex=True,
figsize=(3.5, 2.625 * 1.25),
gridspec_kw={"height_ratios": [1, 0.45]})
fig.subplots_adjust(hspace=0, wspace=0)
fig, ax = plt.subplots(nrows=1, sharex=True,
figsize=(3.5, 2.625))
ax = [ax]
# fig, ax = plt.subplots(nrows=2, sharex=True,
# figsize=(3.5, 2.625 * 1.25),
# gridspec_kw={"height_ratios": [1, 0.25]})
# fig.subplots_adjust(hspace=0, wspace=0)
# Upper panel data
mean_csiborg = numpy.mean(csiborg_counts, axis=0)
std_csiborg = numpy.std(csiborg_counts, axis=0)
ax[0].plot(x, mean_csiborg, label="CSiBORG", c=cols[0])
ax[0].fill_between(x, mean_csiborg - std_csiborg,
mean_csiborg + std_csiborg,
alpha=0.5, color=cols[0])
for i in range(len(csiborg_counts)):
ax[0].plot(x, csiborg_counts[i, :], c="cornflowerblue", lw=0.5, zorder=0)
ax[0].plot(x, mean_csiborg, label="CSiBORG", c="mediumblue", zorder=1)
# ax[0].fill_between(x, mean_csiborg - std_csiborg,
# mean_csiborg + std_csiborg,
# alpha=0.5, color=cols[0])
mean_quijote = numpy.mean(quijote_counts, axis=0)
std_quijote = numpy.std(quijote_counts, axis=0)
ax[0].plot(x, mean_quijote, label="Quijote", c=cols[1])
ax[0].fill_between(x, mean_quijote - std_quijote,
mean_quijote + std_quijote, alpha=0.5,
color=cols[1])
ax[0].plot(x, csiborg5511, label="CSiBORG 5511", c="k", ls="--")
std5511 = numpy.sqrt(csiborg5511)
ax[0].fill_between(x, csiborg5511 - std_csiborg, csiborg5511 + std5511,
alpha=0.2, color="k")
# Lower panel data
log_y = numpy.log10(mean_csiborg / mean_quijote)
err = numpy.sqrt((std_csiborg / mean_csiborg / numpy.log(10))**2
+ (std_quijote / mean_quijote / numpy.log(10))**2)
ax[1].plot(x, 10**log_y, c=cols[0])
ax[1].fill_between(x, 10**(log_y - err), 10**(log_y + err), alpha=0.5,
color=cols[0])
for i in range(len(quijote_counts)):
ax[0].plot(x, quijote_counts[i, :], c="palegreen", lw=0.5, zorder=-1)
ax[1].plot(x, csiborg5511 / mean_quijote, c="k", ls="--")
ax[0].plot(x, mean_quijote, label="Quijote", c="darkgreen", zorder=1)
# ax[0].fill_between(x, mean_quijote - std_quijote,
# mean_quijote + std_quijote, alpha=0.5,
# color=cols[1])
# ax[0].plot(x, csiborg5511, label="CSiBORG 5511", c="k", ls="--")
# std5511 = numpy.sqrt(csiborg5511)
# ax[0].fill_between(x, csiborg5511 - std_csiborg, csiborg5511 + std5511,
# alpha=0.2, color="k")
# # Lower panel data
# log_y = numpy.log10(mean_csiborg / mean_quijote)
# err = numpy.sqrt((std_csiborg / mean_csiborg / numpy.log(10))**2
# + (std_quijote / mean_quijote / numpy.log(10))**2)
# ax[1].plot(x, 10**log_y, c=cols[0])
# ax[1].fill_between(x, 10**(log_y - err), 10**(log_y + err), alpha=0.5,
# color="k")
# ax[1].plot(x, csiborg5511 / mean_quijote, c="k", ls="--")
# Labels and accesories
ax[1].axhline(1, color="k", ls="--",
lw=0.5 * plt.rcParams["lines.linewidth"], zorder=0)
# ax[1].axhline(1, color="k", ls="--",
# lw=0.5 * plt.rcParams["lines.linewidth"], zorder=0)
# ax[0].set_ylabel(r"$\frac{\mathrm{d}^2 N}{\mathrm{d} V \mathrm{d}\log M_{\rm tot}}~[\mathrm{dex}^{-1} (\mathrm{Mpc} / h)^{-3}]$", # noqa
# fontsize="small")
ax[0].set_ylabel("Counts in bins")
ax[1].set_xlabel(r"$M_{\rm tot}~[M_\odot / h]$", fontsize="small")
ax[1].set_ylabel(r"$\mathrm{CSiBORG} / \mathrm{Quijote}$",
fontsize="small")
m = numpy.isfinite(mean_quijote)
ax[0].set_xlim(x[m].min(), x[m].max())
ax[0].set_ylabel(r"$\mathrm{HMF}~[\mathrm{dex}^{-1} (\mathrm{Mpc} / h)^{-3}]$")
ax[0].set_xlabel(r"$M_{\rm tot}~[M_\odot / h]$", fontsize="small")
# ax[1].set_ylabel(r"$\mathrm{CSiBORG} / \mathrm{Quijote}$",
# fontsize="small")
ax[0].set_xscale("log")
ax[0].set_yscale("log")
ax[1].set_ylim(0.5, 2.0)
# ax[1].set_ylim(0.5, 1.5)
# ax[1].set_yscale("log")
ax[0].legend(fontsize="small")
ax[0].legend()
fig.tight_layout(h_pad=0, w_pad=0)
for ext in ["png"] if pdf is False else ["png", "pdf"]:
@@ -556,8 +578,8 @@ if __name__ == "__main__":
if False:
plot_mass_vs_ncells(7444, pdf=False)
if False:
plot_hmf(pdf=False)
if True:
plot_hmf(pdf=True)
if False:
plot_hmf_quijote_full(pdf=False)
@@ -569,7 +591,7 @@
plot_groups=False, dmin=45, dmax=60,
plot_halos=5e13, volume_weight=True)
if True:
if False:
kind = "environment"
grid = 512
smooth_scale = 8.0