Matching paper plots (#91)

* Fix calculations of expected mass

* Add paper plots

* Edits to plots

* Add overlap summary

* Add imports

* Add import

* Add binned stat

* Add fit

* Add more plots

* Add basic env

* Add histogram mode

* Edit expected mass

* Improve expected plots

* Clean up plot

* Improve separation plot

* Update plots

* Edit expected calculation

* Update plotting

* Update plots

* Update plots

* Update plots

* Add conc fraction

* Add halo maker sorting

* Renaming

* Add import

* Add NaN treatment

* Add import

* Move cosine similarity

* Update plots

* Move similarity

* Fix little bugs

* Shorten documentation

* Update plots
Richard Stiskalek 2023-10-17 12:11:15 +01:00 committed by GitHub
parent 136c552369
commit 5500fbd2b9
11 changed files with 2193 additions and 294 deletions


@@ -12,10 +12,12 @@
 # You should have received a copy of the GNU General Public License along
 # with this program; if not, write to the Free Software Foundation, Inc.,
 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 from csiborgtools import clustering, field, match, read, summary  # noqa
-from .utils import (center_of_mass, delta2ncells, number_counts,  # noqa
-                    periodic_distance, periodic_distance_two_points)  # noqa
+from .utils import (center_of_mass, delta2ncells, number_counts,  # noqa
+                    periodic_distance, periodic_distance_two_points,  # noqa
+                    binned_statistic, cosine_similarity)  # noqa

 # Arguments to csiborgtools.read.Paths.
 paths_glamdring = {"srcdir": "/mnt/extraspace/hdesmond/",


@@ -14,5 +14,5 @@
 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 from .match import (ParticleOverlap, RealisationsMatcher,  # noqa
                     calculate_overlap, calculate_overlap_indxs, pos2cell,  # noqa
-                    cosine_similarity, find_neighbour, get_halo_cell_limits,  # noqa
+                    find_neighbour, get_halo_cell_limits,  # noqa
                     matching_max)  # noqa


@@ -660,19 +660,12 @@ class ParticleOverlap(BaseMatcher):

 def pos2cell(pos, ncells):
-    """
-    Convert position to cell number if there are `ncells` cells along the
-    axis.
-    """
     if pos.dtype.char in numpy.typecodes["AllInteger"]:
         return pos
     return numpy.floor(pos * ncells).astype(numpy.int32)


 def read_nshift(smooth_kwargs):
-    """
-    Determine the number of cells to pad the density field if smoothing is
-    applied. Defaults to the ceiling of three times the smoothing scale.
-    """
     return 0 if smooth_kwargs is None else ceil(3 * smooth_kwargs["sigma"])
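For orientation, a minimal sketch of what the two helpers above do (toy values; `pos2cell` is exported from `csiborgtools.match`, `read_nshift` is internal):

import numpy
from math import ceil

# Fractional box positions in [0, 1); with ncells = 4 they land in cells 0..3.
pos = numpy.array([[0.10, 0.49, 0.99]], dtype=numpy.float32)
print(numpy.floor(pos * 4).astype(numpy.int32))  # [[0 1 3]]

# Padding chosen by read_nshift for Gaussian smoothing with sigma = 1.5 cells.
print(ceil(3 * 1.5))  # 5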
@@ -774,33 +767,26 @@ def get_halo_cell_limits(pos, ncells, nshift=0):
     return mins, maxs


-@jit(nopython=True)
+@jit(nopython=True, boundscheck=False)
 def calculate_overlap(delta1, delta2, cellmins, delta_bckg, box_size,
                       bckg_halfsize):
-    r"""
-    Overlap between two halos whose density fields are evaluated on the
-    same grid. This is a JIT implementation, hence it is outside of the main
-    class.
+    """
+    Calculate overlap between two halos' density fields on the same grid.

     Parameters
     ----------
-    delta1: 3-dimensional array
-        Density field of the first halo.
-    delta2 : 3-dimensional array
-        Density field of the second halo.
-    cellmins : len-3 tuple
-        Tuple of lower cell ID in the full box.
-    delta_bckg : 3-dimensional array
-        Summed background density field of the reference and cross simulations
-        calculated with particles assigned to halos at the final snapshot.
-        Calculated on a grid determined by `bckg_halfsize`.
+    delta1, delta2 : 3D array
+        Density fields of the first and second halos, respectively.
+    cellmins : tuple (len=3)
+        Lower cell ID in the full box.
+    delta_bckg : 3D array
+        Combined background density field of reference and cross simulations
+        on `bckg_halfsize` grid.
     box_size : int
-        Number of cells in the box.
+        Cell count in the box.
     bckg_halfsize : int
-        Background half-size for density field calculation. This is the
-        grid distance from the center of the box to each side over which to
-        evaluate the background density field. Must be less than or equal to
-        half the box size.
+        Grid distance from box center for background density;
+        0.5 * box_size.

     Returns
     -------
@@ -834,39 +820,29 @@ def calculate_overlap(delta1, delta2, cellmins, delta_bckg, box_size,
     return intersect / (totmass - intersect)


-@jit(nopython=True)
+@jit(nopython=True, boundscheck=False)
 def calculate_overlap_indxs(delta1, delta2, cellmins, delta_bckg, nonzero,
                             mass1, mass2, box_size, bckg_halfsize):
-    r"""
-    Overlap between two haloes whose density fields are evaluated on the
-    same grid and `nonzero1` enumerates the non-zero cells of `delta1. This is
-    a JIT implementation, hence it is outside of the main class.
+    """
+    Calculate overlap of two halos' density fields on the same grid.

     Parameters
     ----------
-    delta1: 3-dimensional array
-        Density field of the first halo.
-    delta2 : 3-dimensional array
-        Density field of the second halo.
-    cellmins : len-3 tuple
-        Tuple of lower cell ID in the full box.
-    delta_bckg : 3-dimensional array
-        Summed background density field of the reference and cross simulations
-        calculated with particles assigned to halos at the final snapshot.
-        Calculated on a grid determined by `bckg_halfsize`.
-    nonzero : 2-dimensional array of shape `(n_cells, 3)`
-        Indices of cells that are non-zero of the lower mass halo. Expected to
-        be precomputed from `fill_delta_indxs`.
-    mass1, mass2 : floats, optional
-        Total masses of the two haloes, respectively. Optional. If not provided
-        calculcated directly from the density field.
+    delta1, delta2 : 3D array
+        Density fields of the first and second halos, respectively.
+    cellmins : tuple (len=3)
+        Lower cell ID in the full box.
+    delta_bckg : 3D array
+        Combined background density from reference and cross simulations
+        on `bckg_halfsize` grid.
+    nonzero : 2D array (shape: (n_cells, 3))
+        Non-zero cells for the lower mass halo (from `fill_delta_indxs`).
+    mass1, mass2 : float, optional
+        Halos' total masses. Calculated from density if not provided.
     box_size : int
-        Number of cells in the box.
+        Cell count in the box.
     bckg_halfsize : int
-        Background half-size for density field calculation. This is the
-        grid distance from the center of the box to each side over which to
-        evaluate the background density field. Must be less than or equal to
-        half the box size.
+        Grid distance from box center for background density; 0.5 * box_size.

     Returns
     -------
@@ -1039,35 +1015,6 @@ def find_neighbour(nsim0, cats):
     return dists, cross_hindxs


-def cosine_similarity(x, y):
-    r"""
-    Calculate the cosine similarity between two Cartesian vectors. Defined
-    as :math:`\Sum_{i} x_i y_{i} / (|x| * |y|)`.
-
-    Parameters
-    ----------
-    x : 1-dimensional array
-        The first vector.
-    y : 1- or 2-dimensional array
-        The second vector. Can be 2-dimensional of shape `(n_samples, 3)`,
-        in which case the calculation is broadcasted.
-
-    Returns
-    -------
-    out : float or 1-dimensional array
-    """
-    if x.ndim != 1:
-        raise ValueError("`x` must be a 1-dimensional array.")
-
-    if y.ndim == 1:
-        y = y.reshape(1, -1)
-
-    out = numpy.sum(x * y, axis=1)
-    out /= numpy.linalg.norm(x) * numpy.linalg.norm(y, axis=1)
-
-    return out[0] if out.size == 1 else out
-
-
 def matching_max(cat0, catx, mass_kind, mult, periodic, overlap=None,
                  match_indxs=None, verbose=True):
     """


@@ -16,7 +16,11 @@
 from .knn_summary import kNNCDFReader  # noqa
 from .nearest_neighbour_summary import NearestNeighbourReader  # noqa
 from .overlap_summary import weighted_stats  # noqa
-from .overlap_summary import NPairsOverlap, PairOverlap, get_cross_sims  # noqa
+from .overlap_summary import (NPairsOverlap, PairOverlap, get_cross_sims,  # noqa
+                              max_overlap_agreement, max_overlap_agreements,  # noqa
+                              find_peak)  # noqa
 from .pk_summary import PKReader  # noqa
 from .tpcf_summary import TPCFReader  # noqa
-from .field_interp import read_interpolated_field  # noqa
+from .field_interp import (read_interpolated_field,  # noqa
+                           bayesian_bootstrap_correlation,  # noqa
+                           correlate_at_fixed_smoothing)  # noqa


@@ -15,6 +15,12 @@
 import numpy
 from tqdm import tqdm
+from numba import jit
+
+
+###############################################################################
+#            Read in the field values at the galaxy positions                 #
+###############################################################################


 def read_interpolated_field(survey_name, kind, galaxy_index, paths, MAS, grid,
@@ -75,3 +81,142 @@ def read_interpolated_field(survey_name, kind, galaxy_index, paths, MAS, grid,
             ks[i] = j

     return out[:, ks, :]
+
+
+###############################################################################
+#             Calculate the Bayesian bootstrapped correlation                 #
+###############################################################################
+
+
+@jit(nopython=True, fastmath=True, boundscheck=False)
+def dot_product(x, y):
+    tot = 0.0
+    for i in range(len(x)):
+        tot += x[i] * y[i]
+    return tot
+
+
+@jit(nopython=True, fastmath=True, boundscheck=False)
+def cov(x, y, mean_x, mean_y, weights):
+    tot = 0.0
+    for i in range(len(x)):
+        tot += (x[i] - mean_x) * (y[i] - mean_y) * weights[i]
+    return tot
+
+
+@jit(nopython=True, fastmath=True, boundscheck=False)
+def var(x, mean_x, weights):
+    tot = 0.0
+    for i in range(len(x)):
+        tot += (x[i] - mean_x)**2 * weights[i]
+    return tot
+
+
+@jit(nopython=True, fastmath=True, boundscheck=False)
+def weighted_correlation(x, y, weights):
+    mean_x = dot_product(x, weights)
+    mean_y = dot_product(y, weights)
+
+    cov_xy = cov(x, y, mean_x, mean_y, weights)
+    var_x = var(x, mean_x, weights)
+    var_y = var(y, mean_y, weights)
+
+    return cov_xy / numpy.sqrt(var_x * var_y)
+
+
+@jit(nopython=True, fastmath=True, boundscheck=False)
+def _bayesian_bootstrap_correlation(x, y, weights):
+    nweights = len(weights)
+    bootstrapped_correlations = numpy.full(nweights, numpy.nan, dtype=x.dtype)
+    for i in range(nweights):
+        bootstrapped_correlations[i] = weighted_correlation(x, y, weights[i])
+    return bootstrapped_correlations
+
+
+@jit(nopython=True, fastmath=True, boundscheck=False)
+def rank(x):
+    order = numpy.argsort(x)
+    ranks = order.argsort()
+    return ranks
+
+
+@jit(nopython=True, fastmath=True, boundscheck=False)
+def bayesian_bootstrap_correlation(x, y, kind="spearman", n_bootstrap=10000):
+    """
+    Calculate the Bayesian bootstrapped correlation between two arrays.
+
+    Parameters
+    ----------
+    x, y : 1-dimensional arrays
+        The two arrays to calculate the correlation between.
+    kind : str, optional
+        The type of correlation to calculate. Either `spearman` or `pearson`.
+    n_bootstrap : int, optional
+        The number of bootstrap samples to use.
+
+    Returns
+    -------
+    corr : 1-dimensional array of shape `(n_bootstrap,)`
+    """
+    if len(x) != len(y):
+        raise ValueError("Input arrays must have the same length")
+
+    if kind not in ["spearman", "pearson"]:
+        raise ValueError("kind must be either `spearman` or `pearson`")
+
+    if kind == "spearman":
+        dtype = x.dtype
+        x = rank(x).astype(dtype)
+        y = rank(y).astype(dtype)
+
+    alphas = numpy.ones(len(x), dtype=x.dtype)
+    weights = numpy.random.dirichlet(alphas, size=n_bootstrap)
+    return _bayesian_bootstrap_correlation(x, y, weights)
+
+
+###############################################################################
+#                       Distribution disagreement                             #
+###############################################################################
+
+
+def distribution_disagreement(x, y):
+    """
+    Think about this more when stacking non-Gaussian distributions.
+    """
+    delta = x - y
+    return numpy.abs(delta.mean()) / delta.std()
+
+
+"""
+field will be of value (nsims, ngal, nsmooth)
+
+Calculate the correlation for each sim and smoothing scale (nsims, nsmooth)
+
+For each of the above stack the distributions?
+"""
+
+
+def correlate_at_fixed_smoothing(field_values, galaxy_property,
+                                 kind="spearman", n_bootstrap=1000):
+    galaxy_property = galaxy_property.astype(field_values.dtype)
+    nsims = len(field_values)
+
+    distributions = numpy.empty((nsims, n_bootstrap),
+                                dtype=field_values.dtype)
+    from tqdm import trange
+    for i in trange(nsims):
+        distributions[i] = bayesian_bootstrap_correlation(
+            field_values[i], galaxy_property, kind=kind,
+            n_bootstrap=n_bootstrap)
+
+    return distributions
+
+
+def do_something(field_values, galaxy_property):
+    pass
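A usage sketch for the new bootstrap helpers (toy data; assumes the installed numba version supports `numpy.random.dirichlet` inside the jitted wrapper):

import numpy
from csiborgtools.summary import bayesian_bootstrap_correlation

x = numpy.random.normal(size=500)
y = x + numpy.random.normal(scale=0.5, size=500)  # toy correlated data

# Each Dirichlet draw re-weights every observation, so the output is a
# posterior-like distribution over the weighted Spearman coefficient.
corr = bayesian_bootstrap_correlation(x, y, kind="spearman", n_bootstrap=2000)
print(corr.mean(), corr.std())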


@@ -23,6 +23,30 @@ from tqdm import tqdm, trange

 from ..utils import periodic_distance

+###############################################################################
+#                           Utility functions                                 #
+###############################################################################
+
+
+def find_peak(x, weights, shrink=0.95, min_obs=5):
+    """
+    Find the peak of a 1D distribution using a shrinking window.
+    """
+    assert shrink <= 1.
+
+    xmin, xmax = numpy.min(x), numpy.max(x)
+    xpos = (xmax + xmin) / 2
+    rad = (xmax - xmin) / 2
+
+    while True:
+        mask = numpy.abs(x - xpos) < rad
+        if mask.sum() < min_obs:
+            return xpos
+
+        xpos = numpy.average(x[mask], weights=weights[mask])
+        rad *= shrink
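A small demonstration of `find_peak` on synthetic bimodal data: the weighted mean is recomputed inside a window whose radius shrinks by `shrink` each iteration, so the estimate walks onto the dominant mode rather than the global mean.

import numpy
from csiborgtools.summary import find_peak

numpy.random.seed(42)
# Heavy mode at 0, light mode at 5.
x = numpy.concatenate([numpy.random.normal(0, 0.5, 800),
                       numpy.random.normal(5, 0.5, 200)])
weights = numpy.ones_like(x)

print(numpy.mean(x))          # ~1.0, dragged toward the light mode
print(find_peak(x, weights))  # ~0.0, settles on the heavy mode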
###############################################################################
#                       Overlap of two simulations                            #
###############################################################################
@@ -251,42 +275,16 @@ class PairOverlap:
         ----------
         from_smoothed : bool
             Whether to use the smoothed overlap or not.

         Returns
         -------
         summed_overlap : 1-dimensional array of shape `(nhalos, )`
         """
         overlap = self.overlap(from_smoothed)
-        out = numpy.full(len(overlap), numpy.nan, dtype=numpy.float32)
+        out = numpy.zeros(len(overlap), dtype=numpy.float32)
         for i in range(len(overlap)):
             if len(overlap[i]) > 0:
                 out[i] = numpy.sum(overlap[i])
-            else:
-                out[i] = 0
-        return out
-
-    def prob_nomatch(self, from_smoothed):
-        """
-        Probability of no match for each halo in the reference simulation with
-        the cross simulation. Defined as a product of 1 - overlap with other
-        halos.
-
-        Parameters
-        ----------
-        from_smoothed : bool
-            Whether to use the smoothed overlap or not.
-
-        Returns
-        -------
-        prob_nomatch : 1-dimensional array of shape `(nhalos, )`
-        """
-        overlap = self.overlap(from_smoothed)
-        out = numpy.full(len(overlap), numpy.nan, dtype=numpy.float32)
-        for i in range(len(overlap)):
-            if len(overlap[i]) > 0:
-                out[i] = numpy.product(numpy.subtract(1, overlap[i]))
-            else:
-                out[i] = 1
         return out

     def dist(self, in_initial, boxsize, norm_kind=None):
@@ -308,8 +306,7 @@ class PairOverlap:
         -------
         dist : array of 1-dimensional arrays of shape `(nhalos, )`
         """
-        assert (norm_kind is None
-                or norm_kind in ("r200c", "ref_patch", "sum_patch"))
+        assert (norm_kind is None or norm_kind in ("r200c", "ref_patch", "sum_patch"))  # noqa
         # Get positions either in the initial or final snapshot
         pos0 = self.cat0().position(in_initial=in_initial)
         posx = self.catx().position(in_initial=in_initial)
@@ -400,60 +397,6 @@ class PairOverlap:

         return out

-    def counterpart_mass(self, from_smoothed, overlap_threshold=0.,
-                         mass_kind="totpartmass"):
-        """
-        Calculate the expected counterpart mass of each halo in the reference
-        simulation from the crossed simulation.
-
-        Parameters
-        ----------
-        from_smoothed : bool
-            Whether to use the smoothed overlap or not.
-        overlap_threshold : float, optional
-            Minimum overlap required for a halo to be considered a match. By
-            default 0.0, i.e. no threshold.
-        mass_kind : str, optional
-            The mass kind whose ratio is to be calculated. Must be a valid
-            catalogue key. By default `totpartmass`, i.e. the total particle
-            mass associated with a halo.
-
-        Returns
-        -------
-        mean, std : 1-dimensional arrays of shape `(nhalos, )`
-        """
-        mean = numpy.full(len(self), numpy.nan, dtype=numpy.float32)
-        std = numpy.full(len(self), numpy.nan, dtype=numpy.float32)
-
-        massx = self.catx(mass_kind)           # Create references to speed
-        overlap = self.overlap(from_smoothed)  # up the loop below
-
-        for i, match_ind in enumerate(self["match_indxs"]):
-            # Skip if no match
-            if len(match_ind) == 0:
-                continue
-
-            massx_ = massx[match_ind]  # Again just create references
-            overlap_ = overlap[i]      # to the appropriate elements
-
-            # Optionally apply overlap threshold
-            if overlap_threshold > 0.:
-                mask = overlap_ > overlap_threshold
-                if numpy.sum(mask) == 0:
-                    continue
-                massx_ = massx_[mask]
-                overlap_ = overlap_[mask]
-
-            massx_ = numpy.log10(massx_)
-            # Weighted average and *biased* standard deviation
-            mean_ = numpy.average(massx_, weights=overlap_)
-            std_ = numpy.average((massx_ - mean_)**2, weights=overlap_)**0.5
-
-            mean[i] = mean_
-            std[i] = std_
-
-        return mean, std
-
     def copy_per_match(self, par):
         """
         Make an array like `self.match_indxs` where each of its element is an
@@ -525,6 +468,82 @@ class PairOverlap:

         return self["match_indxs"].size
+###############################################################################
+#                 Support functions for pair overlaps                         #
+###############################################################################
+
+
+def max_overlap_agreement(cat0, catx, min_logmass, maxdist, paths):
+    r"""
+    Calculate whether for a halo `A` from catalogue `cat0` that has a maximum
+    overlap with halo `B` from catalogue `catx` it is also `B` that has a
+    maximum overlap with `A`.
+
+    Parameters
+    ----------
+    cat0 : instance of :py:class:`csiborgtools.read.BaseCatalogue`
+        Halo catalogue corresponding to the reference simulation.
+    catx : instance of :py:class:`csiborgtools.read.BaseCatalogue`
+        Halo catalogue corresponding to the cross simulation.
+    min_logmass : float
+        Minimum halo mass in :math:`\log_{10} M_\odot / h` to consider.
+    maxdist : float, optional
+        Maximum halo distance in :math:`\mathrm{Mpc} / h` from the centre
+        of the high-resolution region.
+    paths : py:class`csiborgtools.read.Paths`
+        CSiBORG paths object.
+
+    Returns
+    -------
+    agreement : 1-dimensional array of shape `(nhalos, )`
+    """
+    kwargs = {"paths": paths, "min_logmass": min_logmass, "maxdist": maxdist}
+    pair_forward = PairOverlap(cat0, catx, **kwargs)
+    pair_backward = PairOverlap(catx, cat0, **kwargs)
+
+    nhalos = len(pair_forward.cat0())
+    agreement = numpy.full(nhalos, numpy.nan, dtype=numpy.float32)
+
+    for i in range(nhalos):
+        match_indxs_forward = pair_forward["match_indxs"][i]
+
+        if len(match_indxs_forward) == 0:
+            continue
+
+        overlap_forward = pair_forward["smoothed_overlap"][i]
+
+        kmax = match_indxs_forward[numpy.argmax(overlap_forward)]
+        match_indxs_backward = pair_backward["match_indxs"][kmax]
+        overlap_backward = pair_backward["smoothed_overlap"][kmax]
+
+        imatch = match_indxs_backward[numpy.argmax(overlap_backward)]
+        agreement[i] = imatch == i
+
+    return agreement
+
+
+def max_overlap_agreements(cat0, catxs, min_logmass, maxdist, paths,
+                           verbose=True):
+    """
+    Repeat `max_overlap_agreement` for many cross simulations.
+
+    Parameters
+    ----------
+    ...
+
+    Returns
+    -------
+    agreements : 2-dimensional array of shape `(ncatxs, nhalos)`
+    """
+    agreements = [None] * len(catxs)
+    desc = "Calculating maximum overlap agreement"
+    for i, catx in enumerate(tqdm(catxs, desc=desc, disable=not verbose)):
+        agreements[i] = max_overlap_agreement(cat0, catx, min_logmass,
+                                              maxdist, paths)
+    return numpy.asanyarray(agreements)
 def weighted_stats(x, weights, min_weight=0, verbose=False):
     """
     Calculate the weighted mean and standard deviation of `x` using `weights`
@@ -544,11 +563,10 @@ def weighted_stats(x, weights, min_weight=0, verbose=False):

     Returns
     -------
-    stat : 2-dimensional array of shape `(len(x), 2)`
-        The first column is the weighted mean and the second column is the
-        weighted standard deviation.
+    mu, std : 1-dimensional arrays of shape `(len(x), )`
     """
-    out = numpy.full((x.size, 2), numpy.nan, dtype=numpy.float32)
+    mu = numpy.full(x.size, numpy.nan, dtype=numpy.float32)
+    std = numpy.full(x.size, numpy.nan, dtype=numpy.float32)

     for i in trange(len(x), disable=not verbose):
         x_, w_ = numpy.asarray(x[i]), numpy.asarray(weights[i])
@@ -557,9 +575,9 @@ def weighted_stats(x, weights, min_weight=0, verbose=False):
         w_ = w_[mask]
         if len(w_) == 0:
             continue
-        out[i, 0] = numpy.average(x_, weights=w_)
-        out[i, 1] = numpy.average((x_ - out[i, 0])**2, weights=w_)**0.5
-    return out
+        mu[i] = numpy.average(x_, weights=w_)
+        std[i] = numpy.average((x_ - mu[i])**2, weights=w_)**0.5
+    return mu, std
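Call sites must now unpack two arrays instead of slicing a stacked one; a minimal sketch with made-up ragged inputs (entries whose weights all fall at or below `min_weight` stay NaN, as in the loop above):

import numpy
from csiborgtools.summary import weighted_stats

x = [numpy.array([13.1, 13.3, 13.2]), numpy.array([14.0])]
weights = [numpy.array([0.9, 0.2, 0.5]), numpy.array([0.05])]

mu, std = weighted_stats(x, weights, min_weight=0.1)
print(mu, std)  # second entry stays NaN: its only weight is below 0.1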
###############################################################################
@@ -684,92 +702,87 @@ class NPairsOverlap:
             out[i] = pair.summed_overlap(from_smoothed)
         return numpy.vstack(out).T

-    def prob_nomatch(self, from_smoothed, verbose=True):
-        """
-        Probability of no match for each halo in the reference simulation with
-        the cross simulation.
-
-        Parameters
-        ----------
-        from_smoothed : bool
-            Whether to use the smoothed overlap or not.
-        verbose : bool, optional
-            Verbosity flag.
-
-        Returns
-        -------
-        prob_nomatch : 2-dimensional array of shape `(nhalos, ncatxs)`
-        """
-        iterator = tqdm(self.pairs,
-                        desc="Calculating probability of no match",
-                        disable=not verbose
-                        )
-        out = [None] * len(self)
-        for i, pair in enumerate(iterator):
-            out[i] = pair.prob_nomatch(from_smoothed)
-        return numpy.vstack(out).T
+    def expected_property_single(self, k, key, from_smoothed, in_log=True):
+        ys = [None] * len(self)
+        overlaps = [None] * len(self)
+        for i, pair in enumerate(self):
+            overlap = pair.overlap(from_smoothed)
+            if len(overlap[k]) == 0:
+                ys[i] = numpy.nan
+                overlaps[i] = numpy.nan
+                continue
+            match_indxs = pair["match_indxs"]
+            j = numpy.argmax(overlap[k])
+            ys[i] = pair.catx(key)[match_indxs[k][j]]
+            if in_log:
+                ys[i] = numpy.log10(ys[i])
+            overlaps[i] = overlap[k][j]
+
+        return ys, overlaps

-    def counterpart_mass(self, from_smoothed, overlap_threshold=0.,
-                         mass_kind="totpartmass", return_full=False,
-                         verbose=True):
+    def expected_property(self, key, from_smoothed, min_logmass,
+                          in_log=True, mass_kind="totpartmass", verbose=True):
         """
         Calculate the expected counterpart mass of each halo in the reference
         simulation from the crossed simulation.

         Parameters
         ----------
+        key : str
+            Property key.
         from_smoothed : bool
             Whether to use the smoothed overlap or not.
-        overlap_threshold : float, optional
-            Minimum overlap required for a halo to be considered a match. By
-            default 0.0, i.e. no threshold.
+        min_logmass : float
+            Minimum log mass of reference halos to consider.
+        in_log : bool, optional
+            Whether to calculate the expected property in log10.
         mass_kind : str, optional
             The mass kind whose ratio is to be calculated. Must be a valid
             catalogue key. By default `totpartmass`, i.e. the total particle
             mass associated with a halo.
-        return_full : bool, optional
-            Whether to return the full results of matching each pair or
-            calculate summary statistics by Gaussian averaging.
         verbose : bool, optional
             Verbosity flag.

         Returns
         -------
-        mu, std : 1-dimensional arrays of shape `(nhalos,)`
-            Summary expected mass and standard deviation from all cross
-            simulations.
-        mus, stds : 2-dimensional arrays of shape `(nhalos, ncatx)`, optional
-            Expected mass and standard deviation from each cross simulation.
-            Returned only if `return_full` is `True`.
+        mean_expected : 1-dimensional array of shape `(nhalos, )`
+            Expected property from all cross simulations.
+        std_expected : 1-dimensional array of shape `(nhalos, )`
+            Standard deviation of the expected property.
         """
-        iterator = tqdm(self.pairs,
-                        desc="Calculating counterpart masses",
-                        disable=not verbose)
-        mus, stds = [None] * len(self), [None] * len(self)
-        for i, pair in enumerate(iterator):
-            mus[i], stds[i] = pair.counterpart_mass(
-                from_smoothed=from_smoothed,
-                overlap_threshold=overlap_threshold, mass_kind=mass_kind)
-        mus, stds = numpy.vstack(mus).T, numpy.vstack(stds).T
+        log_mass0 = numpy.log10(self.cat0(mass_kind))
+        ntot = len(log_mass0)
+        mean_expected = numpy.full(ntot, numpy.nan, dtype=numpy.float32)
+        std_expected = numpy.full(ntot, numpy.nan, dtype=numpy.float32)

-        # Prob of > 0 matches
-        probmatch = 1 - self.prob_nomatch(from_smoothed)
-        # Normalise it for weighted sums etc.
-        norm_probmatch = numpy.apply_along_axis(
-            lambda x: x / numpy.sum(x), axis=1, arr=probmatch)
+        indxs = numpy.where(log_mass0 > min_logmass)[0]
+        for i in tqdm(indxs, disable=not verbose,
+                      desc="Calculating expectation"):
+            ys = numpy.full(len(self), numpy.nan, dtype=numpy.float32)
+            weights = numpy.full(len(self), numpy.nan, dtype=numpy.float32)
+            for j, pair in enumerate(self):
+                overlap = pair.overlap(from_smoothed)
+                if len(overlap[i]) == 0:
+                    continue

-        # Mean and standard deviation of weighted stacked Gaussians
-        mu = numpy.sum((norm_probmatch * mus), axis=1)
-        std = numpy.sum((norm_probmatch * (mus**2 + stds**2)), axis=1) - mu**2
-        std **= 0.5
+                k = numpy.argmax(overlap[i])
+                ys[j] = pair.catx(key)[pair["match_indxs"][i][k]]
+                weights[j] = overlap[i][k]

-        mask = mu <= 0
-        mu[mask] = numpy.nan
-        std[mask] = numpy.nan
+                if in_log:
+                    ys[j] = numpy.log10(ys[j])

-        if return_full:
-            return mu, std, mus, stds
-        return mu, std
+            mask = numpy.isfinite(ys) & numpy.isfinite(weights)
+            if numpy.sum(mask) <= 2:
+                continue
+
+            mean_expected[i] = find_peak(ys[mask], weights=weights[mask])
+            std_expected[i] = numpy.average((ys[mask] - mean_expected[i])**2,
+                                            weights=weights[mask])**0.5
+            print(log_mass0[i], mean_expected[i], std_expected[i])
+
+        return mean_expected, std_expected

     @property
     def pairs(self):
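The aggregation at the heart of the new `expected_property`: per reference halo, gather the max-overlap counterpart's property from each cross simulation, weight it by that overlap, then summarise with `find_peak` and a weighted scatter. A standalone sketch of the per-halo step (toy values):

import numpy
from csiborgtools.summary import find_peak

# Toy: counterpart log-masses from 5 cross simulations and their overlaps.
ys = numpy.array([13.9, 14.0, 14.1, numpy.nan, 13.8], dtype=numpy.float32)
weights = numpy.array([0.8, 0.9, 0.7, numpy.nan, 0.6], dtype=numpy.float32)

mask = numpy.isfinite(ys) & numpy.isfinite(weights)
if numpy.sum(mask) > 2:  # require more than two valid matches, as above
    mean_expected = find_peak(ys[mask], weights=weights[mask])
    std_expected = numpy.average((ys[mask] - mean_expected)**2,
                                 weights=weights[mask])**0.5
    print(mean_expected, std_expected)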


@@ -150,6 +150,35 @@ def radec_to_cartesian(X):
         ]).T


+def cosine_similarity(x, y):
+    r"""
+    Calculate the cosine similarity between two Cartesian vectors. Defined
+    as :math:`\Sum_{i} x_i y_{i} / (|x| * |y|)`.
+
+    Parameters
+    ----------
+    x : 1-dimensional array
+        The first vector.
+    y : 1- or 2-dimensional array
+        The second vector. Can be 2-dimensional of shape `(n_samples, 3)`,
+        in which case the calculation is broadcasted.
+
+    Returns
+    -------
+    out : float or 1-dimensional array
+    """
+    if x.ndim != 1:
+        raise ValueError("`x` must be a 1-dimensional array.")
+
+    if y.ndim == 1:
+        y = y.reshape(1, -1)
+
+    out = numpy.sum(x * y, axis=1)
+    out /= numpy.linalg.norm(x) * numpy.linalg.norm(y, axis=1)
+
+    return out[0] if out.size == 1 else out
+
+
 def real2redshift(pos, vel, observer_location, observer_velocity, box,
                   periodic_wrap=True, make_copy=True):
     r"""
@@ -219,3 +248,17 @@ def number_counts(x, bin_edges):
     for i in range(bin_edges.size - 1):
         out[i] = numpy.sum((x >= bin_edges[i]) & (x < bin_edges[i + 1]))
     return out
+
+
+def binned_statistic(x, y, left_edges, bin_width, statistic):
+    """
+    Calculate a binned statistic.
+    """
+    out = numpy.full(left_edges.size, numpy.nan, dtype=x.dtype)
+
+    for i in range(left_edges.size):
+        mask = (x >= left_edges[i]) & (x < left_edges[i] + bin_width)
+
+        if numpy.any(mask):
+            out[i] = statistic(y[mask])
+    return out
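Usage sketches for the two helpers now exposed via `csiborgtools` (toy inputs): `cosine_similarity` broadcasts one vector against rows of a second array, and `binned_statistic` applies an arbitrary reduction in fixed-width bins.

import numpy
from csiborgtools import binned_statistic, cosine_similarity

x = numpy.array([1.0, 0.0, 0.0])
y = numpy.array([[1.0, 0.0, 0.0],
                 [0.0, 1.0, 0.0]])
print(cosine_similarity(x, y))  # [1. 0.]

xs = numpy.random.uniform(0, 10, 1000)
left_edges = numpy.arange(0.0, 10.0, 1.0)
print(binned_statistic(xs, xs**2, left_edges, bin_width=1.0,
                       statistic=numpy.median))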

scripts/sort_halomaker.py (new file, 100 lines)

@@ -0,0 +1,100 @@
# Copyright (C) 2022 Richard Stiskalek
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""
Script to sort the HaloMaker's `particle_membership` file to match the
ordering of particles in the simulation snapshot.
"""
from argparse import ArgumentParser
from datetime import datetime
from glob import iglob

import h5py
import numpy
import pynbody
from mpi4py import MPI
from taskmaster import work_delegation
from tqdm import trange

import csiborgtools


def sort_particle_membership(nsim, nsnap, method):
    """
    Read the FoF particle halo membership and sort the halo IDs to the
    ordering of particles in the PHEW clump IDs.

    Parameters
    ----------
    nsim : int
        IC realisation index.
    nsnap : int
        Snapshot index.
    method : str
        HaloMaker method used to identify halos.
    """
    print(f"{datetime.now()}: starting simulation {nsim}, snapshot {nsnap} and method {method}.")  # noqa
    paths = csiborgtools.read.Paths(**csiborgtools.paths_glamdring)

    fpath = next(iglob(f"/mnt/extraspace/rstiskalek/CSiBORG/halo_maker/ramses_{nsim}/output_{str(nsnap).zfill(5)}/**/*particle_membership*", recursive=True), None)  # noqa
    print(f"{datetime.now()}: loading particle membership `{fpath}`.")
    # Columns are halo ID, particle ID.
    membership = numpy.genfromtxt(fpath, dtype=int)

    print(f"{datetime.now()}: loading particle IDs from the snapshot.")
    sim = pynbody.load(paths.snapshot(nsnap, nsim, "csiborg"))
    pids = numpy.asanyarray(sim["iord"])

    print(f"{datetime.now()}: mapping particle IDs to their indices.")
    pids_idx = {pid: i for i, pid in enumerate(pids)}

    print(f"{datetime.now()}: mapping HIDs to their array indices.")
    # Unassigned particle IDs are assigned a halo ID of 0.
    hids = numpy.zeros(pids.size, dtype=numpy.int32)
    for i in trange(membership.shape[0]):
        hid, pid = membership[i]
        hids[pids_idx[pid]] = hid

    fout = fpath + "_sorted.hdf5"
    print(f"{datetime.now()}: saving the sorted data to ... `{fout}`")

    header = """
    This dataset represents halo indices for each particle.
    - The particles are ordered as they appear in the simulation snapshot.
    - Unassigned particles are given a halo index of 0.
    """
    with h5py.File(fout, 'w') as hdf:
        dset = hdf.create_dataset('hids_dataset', data=hids)
        dset.attrs['header'] = header


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("--method", type=str, required=True,
                        help="HaloMaker method")
    parser.add_argument("--nsim", type=int, required=False, default=None,
                        help="IC index. If not set process all.")
    args = parser.parse_args()
    paths = csiborgtools.read.Paths(**csiborgtools.paths_glamdring)

    if args.nsim is None:
        ics = paths.get_ics("csiborg")
    else:
        ics = [args.nsim]

    snaps = numpy.array([max(paths.get_snapshots(nsim, "csiborg"))
                         for nsim in ics])

    def main(n):
        sort_particle_membership(ics[n], snaps[n], args.method)

    work_delegation(main, list(range(len(ics))), MPI.COMM_WORLD)
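The core of the script is the particle-ID inversion: build a dictionary from snapshot particle ID to array index, then scatter each membership row's halo ID into snapshot order. A self-contained sketch with toy IDs:

import numpy

pids = numpy.array([42, 7, 19, 3])            # snapshot ordering of particle IDs
membership = numpy.array([[1, 19], [2, 42]])  # rows of (halo ID, particle ID)

pids_idx = {pid: i for i, pid in enumerate(pids)}
hids = numpy.zeros(pids.size, dtype=numpy.int32)  # 0 marks unassigned
for hid, pid in membership:
    hids[pids_idx[pid]] = hid

print(hids)  # [2 0 1 0]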


@@ -29,16 +29,9 @@
import numpy
from mpi4py import MPI
from taskmaster import work_delegation
+import csiborgtools
from utils import get_nsims

-try:
-    import csiborgtools
-except ModuleNotFoundError:
-    import sys
-
-    sys.path.append("../")
-    import csiborgtools
-

def _main(nsim, simname, verbose):
    """
@@ -51,9 +44,7 @@ def _main(nsim, simname, verbose):
    else:
        partreader = csiborgtools.read.QuijoteReader(paths)

-    if verbose:
-        print(f"{datetime.now()}: reading and processing simulation `{nsim}`.",
-              flush=True)
+    print(f"{datetime.now()}: processing simulation `{nsim}`.", flush=True)

    # We first load the particle IDs in the final snapshot.
    pidf = csiborgtools.read.read_h5(paths.particles(nsim, simname))
    pidf = pidf["particle_ids"]

scripts_plots/paper_match.py (new file, 1632 lines). File diff suppressed because it is too large.


@@ -66,6 +66,12 @@ def plot_mass_vs_ncells(nsim, pdf=False):
    cat = open_csiborg(nsim)
    mpart = 4.38304044e+09

+    x = numpy.log10(cat["totpartmass"])
+    y = numpy.log10(cat["lagpatch_ncells"])
+
+    p = numpy.polyfit(x, y, 1)
+    print("Fitted parameters are: ", p)
+
    with plt.style.context(plt_utils.mplstyle):
        plt.figure()
        plt.scatter(cat["totpartmass"], cat["lagpatch_ncells"], s=0.25,
@@ -105,9 +111,9 @@ def plot_hmf(pdf=False):
            csiborg_counts = numpy.full((len(csiborg_nsims), len(bins) - 1),
                                        numpy.nan, dtype=numpy.float32)
        csiborg_counts[i, :] = data["counts"]
-    # csiborg_counts /= numpy.diff(bins).reshape(1, -1)
+    csiborg_counts /= numpy.diff(bins).reshape(1, -1)

-    csiborg5511 = numpy.load(paths.halo_counts("csiborg", 5511))["counts"]
+    # csiborg5511 = numpy.load(paths.halo_counts("csiborg", 5511))["counts"]
    # csiborg5511 /= numpy.diff(data["bins"])

    print("Loading Quijote halo counts.", flush=True)
@@ -121,73 +127,89 @@ def plot_hmf(pdf=False):
                (len(quijote_nsims) * nmax, len(bins) - 1), numpy.nan,
                dtype=numpy.float32)
        quijote_counts[i * nmax:(i + 1) * nmax, :] = data["counts"]
-    # quijote_counts /= numpy.diff(bins).reshape(1, -1)
+    quijote_counts /= numpy.diff(bins).reshape(1, -1)

-    # vol = 155.5**3
-    # csiborg_counts /= vol
-    # quijote_counts /= vol
+    vol = 4 * numpy.pi / 3 * 155.5**3
+    csiborg_counts /= vol
+    quijote_counts /= vol
    # csiborg5511 /= vol

    x = 10**(0.5 * (bins[1:] + bins[:-1]))

    # Edit lower limits
-    csiborg_counts[:, x < 1e12] = numpy.nan
+    csiborg_counts[:, x < 10**13.1] = numpy.nan
    quijote_counts[:, x < 10**(13.1)] = numpy.nan
    # Edit upper limits
    csiborg_counts[:, x > 3e15] = numpy.nan
    quijote_counts[:, x > 3e15] = numpy.nan
-    csiborg5511[x > 3e15] = numpy.nan
+    # csiborg5511[x > 3e15] = numpy.nan

    with plt.style.context(plt_utils.mplstyle):
        cols = plt.rcParams["axes.prop_cycle"].by_key()["color"]
-        fig, ax = plt.subplots(nrows=2, sharex=True,
-                               figsize=(3.5, 2.625 * 1.25),
-                               gridspec_kw={"height_ratios": [1, 0.45]})
-        fig.subplots_adjust(hspace=0, wspace=0)
+        fig, ax = plt.subplots(nrows=1, sharex=True,
+                               figsize=(3.5, 2.625))
+        ax = [ax]
+        # fig, ax = plt.subplots(nrows=2, sharex=True,
+        #                        figsize=(3.5, 2.625 * 1.25),
+        #                        gridspec_kw={"height_ratios": [1, 0.25]})
+        # fig.subplots_adjust(hspace=0, wspace=0)

        # Upper panel data
        mean_csiborg = numpy.mean(csiborg_counts, axis=0)
        std_csiborg = numpy.std(csiborg_counts, axis=0)
-        ax[0].plot(x, mean_csiborg, label="CSiBORG", c=cols[0])
-        ax[0].fill_between(x, mean_csiborg - std_csiborg,
-                           mean_csiborg + std_csiborg,
-                           alpha=0.5, color=cols[0])
+
+        for i in range(len(csiborg_counts)):
+            ax[0].plot(x, csiborg_counts[i, :], c="cornflowerblue", lw=0.5, zorder=0)
+
+        ax[0].plot(x, mean_csiborg, label="CSiBORG", c="mediumblue", zorder=1)
+        # ax[0].fill_between(x, mean_csiborg - std_csiborg,
+        #                    mean_csiborg + std_csiborg,
+        #                    alpha=0.5, color=cols[0])

        mean_quijote = numpy.mean(quijote_counts, axis=0)
        std_quijote = numpy.std(quijote_counts, axis=0)
-        ax[0].plot(x, mean_quijote, label="Quijote", c=cols[1])
-        ax[0].fill_between(x, mean_quijote - std_quijote,
-                           mean_quijote + std_quijote, alpha=0.5,
-                           color=cols[1])
-
-        ax[0].plot(x, csiborg5511, label="CSiBORG 5511", c="k", ls="--")
-        std5511 = numpy.sqrt(csiborg5511)
-        ax[0].fill_between(x, csiborg5511 - std_csiborg, csiborg5511 + std5511,
-                           alpha=0.2, color="k")
-
-        # Lower panel data
-        log_y = numpy.log10(mean_csiborg / mean_quijote)
-        err = numpy.sqrt((std_csiborg / mean_csiborg / numpy.log(10))**2
-                         + (std_quijote / mean_quijote / numpy.log(10))**2)
-        ax[1].plot(x, 10**log_y, c=cols[0])
-        ax[1].fill_between(x, 10**(log_y - err), 10**(log_y + err), alpha=0.5,
-                           color=cols[0])
-        ax[1].plot(x, csiborg5511 / mean_quijote, c="k", ls="--")
+
+        for i in range(len(quijote_counts)):
+            ax[0].plot(x, quijote_counts[i, :], c="palegreen", lw=0.5, zorder=-1)
+
+        ax[0].plot(x, mean_quijote, label="Quijote", c="darkgreen", zorder=1)
+        # ax[0].fill_between(x, mean_quijote - std_quijote,
+        #                    mean_quijote + std_quijote, alpha=0.5,
+        #                    color=cols[1])
+
+        # ax[0].plot(x, csiborg5511, label="CSiBORG 5511", c="k", ls="--")
+        # std5511 = numpy.sqrt(csiborg5511)
+        # ax[0].fill_between(x, csiborg5511 - std_csiborg, csiborg5511 + std5511,
+        #                    alpha=0.2, color="k")
+
+        # # Lower panel data
+        # log_y = numpy.log10(mean_csiborg / mean_quijote)
+        # err = numpy.sqrt((std_csiborg / mean_csiborg / numpy.log(10))**2
+        #                  + (std_quijote / mean_quijote / numpy.log(10))**2)
+        # ax[1].plot(x, 10**log_y, c=cols[0])
+        # ax[1].fill_between(x, 10**(log_y - err), 10**(log_y + err), alpha=0.5,
+        #                    color="k")
+        # ax[1].plot(x, csiborg5511 / mean_quijote, c="k", ls="--")

        # Labels and accesories
-        ax[1].axhline(1, color="k", ls="--",
-                      lw=0.5 * plt.rcParams["lines.linewidth"], zorder=0)
+        # ax[1].axhline(1, color="k", ls="--",
+        #               lw=0.5 * plt.rcParams["lines.linewidth"], zorder=0)
        # ax[0].set_ylabel(r"$\frac{\mathrm{d}^2 N}{\mathrm{d} V \mathrm{d}\log M_{\rm tot}}~[\mathrm{dex}^{-1} (\mathrm{Mpc} / h)^{-3}]$",  # noqa
        #                  fontsize="small")
-        ax[0].set_ylabel("Counts in bins")
-        ax[1].set_xlabel(r"$M_{\rm tot}~[M_\odot / h]$", fontsize="small")
-        ax[1].set_ylabel(r"$\mathrm{CSiBORG} / \mathrm{Quijote}$",
-                         fontsize="small")
+        m = numpy.isfinite(mean_quijote)
+        ax[0].set_xlim(x[m].min(), x[m].max())
+        ax[0].set_ylabel(r"$\mathrm{HMF}~[\mathrm{dex}^{-1} (\mathrm{Mpc} / h)^{-3}]$")
+        ax[0].set_xlabel(r"$M_{\rm tot}~[M_\odot / h]$", fontsize="small")
+        # ax[1].set_ylabel(r"$\mathrm{CSiBORG} / \mathrm{Quijote}$",
+        #                  fontsize="small")

        ax[0].set_xscale("log")
        ax[0].set_yscale("log")
-        ax[1].set_ylim(0.5, 2.0)
+        # ax[1].set_ylim(0.5, 1.5)
        # ax[1].set_yscale("log")
-        ax[0].legend(fontsize="small")
+        ax[0].legend()

        fig.tight_layout(h_pad=0, w_pad=0)
        for ext in ["png"] if pdf is False else ["png", "pdf"]:
@@ -556,8 +578,8 @@ if __name__ == "__main__":
    if False:
        plot_mass_vs_ncells(7444, pdf=False)

-    if False:
-        plot_hmf(pdf=False)
+    if True:
+        plot_hmf(pdf=True)

    if False:
        plot_hmf_quijote_full(pdf=False)
@@ -569,7 +591,7 @@ if __name__ == "__main__":
                   plot_groups=False, dmin=45, dmax=60,
                   plot_halos=5e13, volume_weight=True)

-    if True:
+    if False:
        kind = "environment"
        grid = 512
        smooth_scale = 8.0
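For reference, the normalisation enabled in `plot_hmf` above turns raw halo counts into the HMF of the new y-label: divide by the log-mass bin width (dex) and by the comoving volume of a sphere of radius 155.5 Mpc/h. A worked sketch with toy counts:

import numpy

bins = numpy.arange(13.0, 15.4, 0.2)   # log10(M) bin edges
counts = numpy.random.poisson(100, bins.size - 1).astype(numpy.float64)

counts /= numpy.diff(bins)             # counts per dex
vol = 4 * numpy.pi / 3 * 155.5**3      # sphere volume in (Mpc / h)^3
counts /= vol                          # counts per dex per (Mpc / h)^3
# `counts` now approximates dN / dlog10(M) / dV, matching the plot's y-axis.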