kNN-CDF secondary halo bias (#40)

* Add separate autoknn script & config file

* edit ics

* Edit submission script

* Add threshold values

* Edit batch sizing

* Remove print

* edit

* Rename files

* Rename

* Update nb

* edit runs

* Edit submit

* Add median threshold

* add new auto reader

* Edit submit

* edit submit

* Edit submit

* Add mean prob_k

* Edit runs

* Remove correlation file

* Move split to clustering

* Add init

* Remove import

* Add the file

* Add correlation reading

* Edit scripts

* Add below and above median permutation for cross

* Update imports

* Move rvs_in_sphere

* Create utils

* Split

* Add import

* Add normalised marks

* Add import

* Edit readme

* Clean up submission file

* Stop tracking submit files

* Update gitignore

* Add poisson field analytical expression

* Add abstract generators

* Add generators

* Pass in the generator

* Add a check for if there are any files

* Start saving average density

* Update nb

* Update readme

* Update units

* Edit jobs

* Update submits

* Update reader

* Add random crossing

* Update crossing script

* Add crossing with random

* Update readme

* Update notebook
Richard Stiskalek 2023-04-09 20:57:05 +01:00 committed by GitHub
parent 826ab61d2d
commit 5784011de0
28 changed files with 2563 additions and 486 deletions


@@ -12,4 +12,4 @@
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
from csiborgtools import (read, match, utils, units, fits, field) # noqa
from csiborgtools import (read, match, utils, units, fits, field, clustering) # noqa


@@ -1,4 +1,4 @@
# Copyright (C) 2022 Richard Stiskalek
# Copyright (C) 2023 Richard Stiskalek
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 3 of the License, or (at your
@@ -12,58 +12,15 @@
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""
2PCF calculation.
NOTE: This is an old script that needs to be updated.
"""
import numpy
from Corrfunc.mocks import DDtheta_mocks
from Corrfunc.utils import convert_3d_counts_to_cf
from warnings import warn
def get_randoms_sphere(N, seed=42):
"""
Generate random points on a sphere.
Parameters
----------
N : int
Number of points.
seed : int
Random seed.
Returns
-------
ra : 1-dimensional array
Right ascension in :math:`[0, 360)` degrees.
dec : 1-dimensional array
Declination in :math:`[-90, 90]` degrees.
"""
gen = numpy.random.default_rng(seed)
ra = gen.random(N) * 360
dec = numpy.rad2deg(numpy.arcsin(2 * (gen.random(N) - 0.5)))
return ra, dec
def wrapRA(ra, degrees=True):
"""
Wrap the right ascension from :math:`[-180, 180)` to :math`[0, 360)`
degrees or equivalently if `degrees=False` in radians.
Paramaters
----------
ra : 1-dimensional array
Right ascension values.
degrees : float, optional
Whether the right ascension is in degrees.
Returns
-------
ra : 1-dimensional array
Wrapped around right ascension.
"""
mask = ra < 0
if numpy.sum(mask) == 0:
warn("No negative right ascension found.")
ra[mask] += 360 if degrees else 2 * numpy.pi
return ra
from .utils import (rvs_on_sphere, wrapRA)
def sphere_angular_tpcf(bins, RA1, DEC1, RA2=None, DEC2=None, nthreads=1,
@@ -113,11 +70,11 @@ def sphere_angular_tpcf(bins, RA1, DEC1, RA2=None, DEC2=None, nthreads=1,
NR1 = ND1 * Nmult
NR2 = ND2 * Nmult
# Generate randoms. Note that these are over the sphere!
randRA1, randDEC1 = get_randoms_sphere(NR1, seed1)
randRA2, randDEC2 = get_randoms_sphere(NR2, seed2)
randRA1, randDEC1 = rvs_on_sphere(NR1, indeg=True, random_state=seed1)
randRA2, randDEC2 = rvs_on_sphere(NR2, indeg=True, random_state=seed2)
# Wrap RA
RA1 = wrapRA(numpy.copy(RA1))
RA2 = wrapRA(numpy.copy(RA2))
RA1 = wrapRA(numpy.copy(RA1), indeg=True)
RA2 = wrapRA(numpy.copy(RA2), indeg=True)
# Calculate pairs
D1D2 = DDtheta_mocks(0, nthreads, bins, RA1, DEC1, RA2=RA2, DEC2=DEC2)
D1R2 = DDtheta_mocks(0, nthreads, bins, RA1, DEC1,
@@ -127,4 +84,4 @@ def sphere_angular_tpcf(bins, RA1, DEC1, RA2=None, DEC2=None, nthreads=1,
R1R2 = DDtheta_mocks(0, nthreads, bins, randRA1, randDEC1,
RA2=randRA2, DEC2=randDEC2)
# Convert to the CF
return convert_3d_counts_to_cf(ND1, ND2, NR1, NR2, D1D2, D1R2, D2R1, R1R2)
return convert_3d_counts_to_cf(ND1, ND2, NR1, NR2, D1D2, D1R2, D2R1, R1R2)


@@ -0,0 +1,16 @@
# Copyright (C) 2022 Richard Stiskalek
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
from .knn import kNN_CDF # noqa
from .utils import (RVSinsphere, RVSinbox, RVSonsphere, BaseRVS, normalised_marks) # noqa


@@ -18,52 +18,16 @@ kNN-CDF calculation
import numpy
from scipy.interpolate import interp1d
from scipy.stats import binned_statistic
from tqdm import tqdm
from .utils import BaseRVS
class kNN_CDF:
"""
Object to calculate the kNN-CDF for a set of CSiBORG halo catalogues from
their kNN objects.
"""
@staticmethod
def rvs_in_sphere(nsamples, R, random_state=42, dtype=numpy.float32):
"""
Generate random samples in a sphere of radius `R` centered at the
origin.
Parameters
----------
nsamples : int
Number of samples to generate.
R : float
Radius of the sphere.
random_state : int, optional
Random state for the random number generator.
dtype : numpy dtype, optional
Data type, by default `numpy.float32`.
Returns
-------
samples : 2-dimensional array of shape `(nsamples, 3)`
"""
gen = numpy.random.default_rng(random_state)
# Sample spherical coordinates
r = gen.uniform(0, 1, nsamples).astype(dtype)**(1/3) * R
theta = 2 * numpy.arcsin(gen.uniform(0, 1, nsamples).astype(dtype))
phi = 2 * numpy.pi * gen.uniform(0, 1, nsamples).astype(dtype)
# Convert to cartesian coordinates
x = r * numpy.sin(theta) * numpy.cos(phi)
y = r * numpy.sin(theta) * numpy.sin(phi)
z = r * numpy.cos(theta)
return numpy.vstack([x, y, z]).T
"""Object to calculate the kNN-CDF statistic."""
@staticmethod
def cdf_from_samples(r, rmin=None, rmax=None, neval=None,
dtype=numpy.float32):
"""
Calculate the CDF from samples.
Calculate the kNN-CDF from a sampled PDF.
Parameters
----------
@@ -128,22 +92,21 @@ class kNN_CDF:
corr[k, :] = joint_cdf[k, :] - cdf0[k, :] * cdf1[k, :]
return corr
def brute_cdf(self, knn, nneighbours, Rmax, nsamples, rmin, rmax, neval,
def brute_cdf(self, knn, rvs_gen, nneighbours, nsamples, rmin, rmax, neval,
random_state=42, dtype=numpy.float32):
"""
Calculate the CDF for a kNN of CSiBORG halo catalogues without batch
sizing. This can become memory intense for large numbers of randoms
and, therefore, is only for testing purposes.
Calculate the kNN-CDF without batch sizing. This can become memory
intensive for large numbers of randoms and, therefore, is primarily
for testing purposes.
Parameters
----------
knns : `sklearn.neighbors.NearestNeighbors`
kNN of CSiBORG halo catalogues.
knn : `sklearn.neighbors.NearestNeighbors`
Catalogue NN object.
rvs_gen : :py:class:`csiborgtools.clustering.BaseRVS`
Uniform RVS generator matching `knn`.
nneighbours : int
Maximum number of neighbours to use for the kNN-CDF calculation.
Rmax : float
Maximum radius of the sphere in which to sample random points for
the knn-CDF calculation. This should match the CSiBORG catalogues.
nsamples : int
Number of random points to sample for the knn-CDF calculation.
rmin : float
@@ -164,7 +127,8 @@ class kNN_CDF:
cdfs : 2-dimensional array
CDFs evaluated at `rs`.
"""
rand = self.rvs_in_sphere(nsamples, Rmax, random_state=random_state)
assert isinstance(rvs_gen, BaseRVS)
rand = rvs_gen(nsamples, random_state=random_state)
dist, __ = knn.kneighbors(rand, nneighbours)
dist = dist.astype(dtype)
@@ -177,18 +141,20 @@ class kNN_CDF:
cdf = numpy.asanyarray(cdf)
return rs, cdf
def joint(self, knn0, knn1, nneighbours, Rmax, nsamples, rmin, rmax,
def joint(self, knn0, knn1, rvs_gen, nneighbours, nsamples, rmin, rmax,
neval, batch_size=None, random_state=42,
dtype=numpy.float32):
"""
Calculate the joint CDF for two kNNs of CSiBORG halo catalogues.
Calculate the joint knn-CDF.
Parameters
----------
knn0 : `sklearn.neighbors.NearestNeighbors` instance
kNN of the first CSiBORG halo catalogue.
NN object of the first catalogue.
knn1 : `sklearn.neighbors.NearestNeighbors` instance
kNN of the second CSiBORG halo catalogue.
NN object of the second catalogue.
rvs_gen : :py:class:`csiborgtools.clustering.BaseRVS`
Uniform RVS generator matching `knn0` and `knn1`.
nneighbours : int
Maximum number of neighbours to use for the kNN-CDF calculation.
Rmax : float
@@ -222,6 +188,7 @@ class kNN_CDF:
joint_cdf : 2-dimensional array
Joint CDF evaluated at `rs`.
"""
assert isinstance(rvs_gen, BaseRVS)
batch_size = nsamples if batch_size is None else batch_size
assert nsamples >= batch_size
nbatches = nsamples // batch_size
@@ -233,8 +200,7 @@ class kNN_CDF:
jointdist = numpy.zeros((batch_size, 2), dtype=dtype)
for j in range(nbatches):
rand = self.rvs_in_sphere(batch_size, Rmax,
random_state=random_state + j)
rand = rvs_gen(batch_size, random_state=random_state + j)
dist0, __ = knn0.kneighbors(rand, nneighbours)
dist1, __ = knn1.kneighbors(rand, nneighbours)
@@ -269,21 +235,19 @@ class kNN_CDF:
rs = (bins[1:] + bins[:-1]) / 2 # Bin centers
return rs, cdf0, cdf1, joint_cdf
def __call__(self, *knns, nneighbours, Rmax, nsamples, rmin, rmax, neval,
batch_size=None, verbose=True, random_state=42,
dtype=numpy.float32):
def __call__(self, knn, rvs_gen, nneighbours, nsamples, rmin, rmax, neval,
batch_size=None, random_state=42, dtype=numpy.float32):
"""
Calculate the CDF for a set of kNNs of CSiBORG halo catalogues.
Parameters
----------
*knns : `sklearn.neighbors.NearestNeighbors` instances
kNNs of CSiBORG halo catalogues.
knn : `sklearn.neighbors.NearestNeighbors`
Catalogue NN object.
rvs_gen : :py:class:`csiborgtools.clustering.BaseRVS`
Uniform RVS generator matching `knn`.
nneighbours : int
Maximum number of neighbours to use for the kNN-CDF calculation.
Rmax : float
Maximum radius of the sphere in which to sample random points for
the knn-CDF calculation. This should match the CSiBORG catalogues.
nsamples : int
Number of random points to sample for the knn-CDF calculation.
rmin : float
@@ -296,8 +260,6 @@ class kNN_CDF:
Number of random points to sample in each batch. By default equal
to `nsamples`; however, it is recommended to be smaller to avoid
requesting too much memory.
verbose : bool, optional
Verbosity flag.
random_state : int, optional
Random state for the random number generator.
dtype : numpy dtype, optional
@@ -307,33 +269,30 @@ class kNN_CDF:
-------
rs : 1-dimensional array
Distances at which the CDF is evaluated.
cdfs : 2 or 3-dimensional array
CDFs evaluated at `rs`.
cdf : 2-dimensional array
CDF evaluated at `rs`.
"""
assert isinstance(rvs_gen, BaseRVS)
batch_size = nsamples if batch_size is None else batch_size
assert nsamples >= batch_size
nbatches = nsamples // batch_size
# Preallocate the bins and the CDF array
bins = numpy.logspace(numpy.log10(rmin), numpy.log10(rmax), neval)
cdfs = numpy.zeros((len(knns), nneighbours, neval - 1), dtype=dtype)
for i, knn in enumerate(tqdm(knns) if verbose else knns):
for j in range(nbatches):
rand = self.rvs_in_sphere(batch_size, Rmax,
random_state=random_state + j)
dist, __ = knn.kneighbors(rand, nneighbours)
cdf = numpy.zeros((nneighbours, neval - 1), dtype=dtype)
for i in range(nbatches):
rand = rvs_gen(batch_size, random_state=random_state + i)
dist, __ = knn.kneighbors(rand, nneighbours)
for k in range(nneighbours): # Count for each neighbour
_counts, __, __ = binned_statistic(
dist[:, k], dist[:, k], bins=bins, statistic="count",
range=(rmin, rmax))
cdfs[i, k, :] += _counts
for k in range(nneighbours): # Count for each neighbour
_counts, __, __ = binned_statistic(
dist[:, k], dist[:, k], bins=bins, statistic="count",
range=(rmin, rmax))
cdf[k, :] += _counts
cdfs = numpy.cumsum(cdfs, axis=-1) # Cumulative sum, i.e. the CDF
for i in range(len(knns)):
for k in range(nneighbours):
cdfs[i, k, :] /= cdfs[i, k, -1]
cdf = numpy.cumsum(cdf, axis=-1) # Cumulative sum, i.e. the CDF
for k in range(nneighbours):
cdf[k, :] /= cdf[k, -1]
rs = (bins[1:] + bins[:-1]) / 2 # Bin centers
cdfs = cdfs[0, ...] if len(knns) == 1 else cdfs
return rs, cdfs
return rs, cdf


@@ -0,0 +1,193 @@
# Copyright (C) 2022 Richard Stiskalek
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""Clustering support functions."""
from abc import (ABC, abstractmethod)
from warnings import warn
import numpy
###############################################################################
# Random points #
###############################################################################
class BaseRVS(ABC):
"""
Base RVS generator.
"""
@abstractmethod
def __call__(self, nsamples, random_state, dtype):
"""
Generate RVS.
Parameters
----------
nsamples : int
Number of samples to generate.
random_state : int, optional
Random state for the random number generator.
dtype : numpy dtype, optional
Data type, by default `numpy.float32`.
Returns
-------
samples : 2-dimensional array of shape `(nsamples, ndim)`
"""
pass
class RVSinsphere(BaseRVS):
"""
Generator of uniform RVS in a sphere of radius `R` in Cartesian
coordinates centered at the origin.
Parameters
----------
R : float
Radius of the sphere.
"""
def __init__(self, R):
assert R > 0, "Radius must be positive."
self.R = R
BaseRVS.__init__(self)
def __call__(self, nsamples, random_state=42, dtype=numpy.float32):
gen = numpy.random.default_rng(random_state)
# Spherical
r = gen.random(nsamples, dtype=dtype)**(1/3) * self.R
theta = numpy.arccos(1 - 2 * gen.random(nsamples, dtype=dtype))  # uniform in cos(theta)
phi = 2 * numpy.pi * gen.random(nsamples, dtype=dtype)
# Cartesian
x = r * numpy.sin(theta) * numpy.cos(phi)
y = r * numpy.sin(theta) * numpy.sin(phi)
z = r * numpy.cos(theta)
return numpy.vstack([x, y, z]).T
class RVSinbox(BaseRVS):
"""
Generator of uniform RVS in a box of width `L` in Cartesian coordinates in
:math:`[0, L]^3`.
Parameters
----------
width : float
Width of the box.
"""
def __init__(self, width):
assert width > 0, "Width must be positive."
self.width = width
BaseRVS.__init__(self)
def __call__(self, nsamples, random_state=42, dtype=numpy.float32):
gen = numpy.random.default_rng(random_state)
x = gen.random(nsamples, dtype=dtype)
y = gen.random(nsamples, dtype=dtype)
z = gen.random(nsamples, dtype=dtype)
return self.width * numpy.vstack([x, y, z]).T
class RVSonsphere(BaseRVS):
"""
Generator of uniform RVS on the surface of a unit sphere. RA is in
:math:`[0, 2\pi)` and dec in :math:`[-\pi / 2, \pi / 2]`, respectively.
If `indeg` is `True` then converted to degrees.
Parameters
----------
indeg : bool
Whether to generate the right ascension and declination in degrees.
"""
def __init__(self, indeg):
assert isinstance(indeg, bool), "`indeg` must be a boolean."
self.indeg = indeg
BaseRVS.__init__(self)
def __call__(self, nsamples, random_state=42, dtype=numpy.float32):
gen = numpy.random.default_rng(random_state)
ra = 2 * numpy.pi * gen.random(nsamples, dtype=dtype)
dec = numpy.arcsin(2 * (gen.random(nsamples, dtype=dtype) - 0.5))
if self.indeg:
ra = numpy.rad2deg(ra)
dec = numpy.rad2deg(dec)
return numpy.vstack([ra, dec]).T
###############################################################################
# RA wrapping #
###############################################################################
def wrapRA(ra, indeg):
"""
Wrap the right ascension from :math:`[-180, 180)` to :math:`[0, 360)`
degrees if `indeg`, or equivalently in radians otherwise.
Parameters
----------
ra : 1-dimensional array
Right ascension.
indeg : bool
Whether the right ascension is in degrees.
Returns
-------
wrapped_ra : 1-dimensional array
"""
mask = ra < 0
if numpy.sum(mask) == 0:
warn("No negative right ascension found.", UserWarning())
ra[mask] += 360 if indeg else 2 * numpy.pi
return ra
###############################################################################
# Secondary assembly bias normalised marks #
###############################################################################
def normalised_marks(x, y, nbins):
"""
Calculate the normalised marks of `y` binned by `x`.
Parameters
----------
x : 1-dimensional array
Binning variable.
y : 1-dimensional array
The variable to be marked.
nbins : int
Number of percentile bins.
Returns
-------
marks : 1-dimensional array
"""
assert x.ndim == y.ndim == 1
if y.dtype not in [numpy.float32, numpy.float64]:
raise NotImplemented("Marks from integers are not supported.")
bins = numpy.percentile(x, q=numpy.linspace(0, 100, nbins + 1))
marks = numpy.full_like(y, numpy.nan)
for i in range(nbins):
m = (x >= bins[i]) & (x < bins[i + 1])
# Calculate the normalised marks of this bin
_marks = numpy.full(numpy.sum(m), numpy.nan, dtype=marks.dtype)
for n, ind in enumerate(numpy.argsort(y[m])):
_marks[ind] = n
_marks /= numpy.nanmax(_marks)
marks[m] = _marks
return marks


@@ -18,5 +18,3 @@ from .match import (RealisationsMatcher, cosine_similarity, # noqa
calculate_overlap, calculate_overlap_indxs, # noqa
dist_centmass, dist_percentile) # noqa
from .num_density import (binned_counts, number_density) # noqa
from .knn import kNN_CDF
# from .correlation import (get_randoms_sphere, sphere_angular_tpcf) # noqa


@@ -18,6 +18,7 @@ Tools for summarising various results.
from os.path import (join, isfile)
from glob import glob
import numpy
from scipy.special import factorial
import joblib
from tqdm import tqdm
@@ -184,55 +185,53 @@ class kNNCDFReader:
"""
Shortcut object to read in the kNN CDF data.
"""
def read(self, files, ks, rmin=None, rmax=None, to_clip=True):
def read(self, run, folder, rmin=None, rmax=None, to_clip=True):
"""
Read the kNN CDF data can be either the auto- or cross-correlation.
Read the auto- or cross-correlation kNN-CDF data. Infers the type from
the data files.
Parameters
----------
files : list of str
List of file paths to read in.
ks : list of int
kNN values to read in.
run : str
Run ID to read in.
folder : str
Path to the folder where the kNN-CDF files are stored.
rmin : float, optional
Minimum separation. By default ignored.
rmax : float, optional
Maximum separation. By default ignored.
to_clip : bool, optional
Whether to clip the auto-correlation CDF. Ignored if reading in the
Whether to clip the auto-correlation CDF. Ignored for
cross-correlation.
Returns
-------
rs : 1-dimensional array
Array of separations.
out : 4-dimensional array
Auto-correlation or cross-correlation kNN CDFs. The shape is
`(len(files), len(mass_thresholds), len(ks), neval)`.
mass_thresholds : 1-dimensional array
Array of mass thresholds.
rs : 1-dimensional array of shape `(neval, )`
Separations where the CDF is evaluated.
out : 3-dimensional array of shape `(len(files), len(ks), neval)`
Array of CDFs or cross-correlations.
"""
data = joblib.load(files[0])
if "cdf_0" in data.keys():
isauto = True
kind = "cdf"
elif "corr_0" in data.keys():
isauto = False
kind = "corr"
else:
raise ValueError("Unknown data format.")
rs = data["rs"]
mass_thresholds = data["mass_threshold"]
neval = data["{}_0".format(kind)].shape[1]
out = numpy.full((len(files), len(mass_thresholds), len(ks), neval),
numpy.nan, dtype=numpy.float32)
run += ".p"
files = [f for f in glob(join(folder, "*")) if run in f]
if len(files) == 0:
raise RuntimeError("No files found for run `{}`.".format(run[:-2]))
for i, file in enumerate(tqdm(files)):
for i, file in enumerate(files):
data = joblib.load(file)
for j in range(len(mass_thresholds)):
out[i, j, ...] = data["{}_{}".format(kind, j)][ks, :]
if isauto and to_clip:
out[i, j, ...] = self.clipped_cdf(out[i, j, ...])
if i == 0: # Initialise the array
if "corr" in data.keys():
kind = "corr"
isauto = False
else:
kind = "cdf"
isauto = True
out = numpy.full((len(files), *data[kind].shape), numpy.nan,
dtype=numpy.float32)
rs = data["rs"]
out[i, ...] = data[kind]
if isauto and to_clip:
out[i, ...] = self.clipped_cdf(out[i, ...])
# Apply separation cuts
mask = (rs >= rmin if rmin is not None else rs > 0)
@@ -240,7 +239,7 @@ class kNNCDFReader:
rs = rs[mask]
out = out[..., mask]
return rs, out, mass_thresholds
return rs, out
@staticmethod
def peaked_cdf(cdf, make_copy=True):
@@ -295,37 +294,74 @@ class kNNCDFReader:
return cdf
@staticmethod
def prob_kvolume(cdfs, rs=None, normalise=False):
"""
Calculate the probability that a spherical volume contains :math:`k`=
objects from the kNN CDFs.
def prob_k(cdf):
r"""
Calculate the PDF that a spherical volume of radius :math:`r` contains
:math:`k` objects, i.e. :math:`P(k | V = 4 \pi r^3 / 3)`.
Parameters
----------
cdf : 4-dimensional array of shape `(nfiles, nmasses, nknn, nrs)`
cdf : 3-dimensional array of shape `(len(files), len(ks), len(rs))`
Array of CDFs
normalise : bool, optional
Whether to normalise the probability to 1.
Returns
-------
pk : 4-dimensional array of shape `(nfiles, nmasses, nknn - 1, nrs)`
pk : 3-dimensional array of shape `(len(files), len(ks)- 1, len(rs))`
"""
out = numpy.full_like(cdfs[..., 1:, :], numpy.nan, dtype=numpy.float32)
out = numpy.full_like(cdf[..., 1:, :], numpy.nan, dtype=numpy.float32)
nks = cdf.shape[-2]
out[..., 0, :] = 1 - cdf[..., 0, :]
for k in range(cdfs.shape[-2] - 1):
out[..., k, :] = cdfs[..., k, :] - cdfs[..., k + 1, :]
for k in range(1, nks - 1):
out[..., k, :] = cdf[..., k - 1, :] - cdf[..., k, :]
if normalise:
assert rs is not None, "rs must be provided to normalise."
assert rs.ndim == 1
norm = numpy.nansum(
0.5 * (out[..., 1:] + out[..., :-1]) * (rs[1:] - rs[:-1]),
axis=-1)
out /= norm.reshape(*norm.shape, 1)
return out
def mean_prob_k(self, cdf):
"""
Calculate the mean PDF that a spherical volume of radius :math:`r`
contains :math:`k` objects, i.e. :math:`P(k | V = 4 \pi r^3 / 3)`,
averaged over the IC realisations.
Parameters
----------
cdf : 3-dimensional array of shape `(len(files), len(ks), len(rs))`
Array of CDFs
Returns
-------
out : 3-dimensional array of shape `(len(ks) - 1, len(rs), 2)`
Mean :math:`P(k | V = 4 \pi r^3 / 3)` and its standard deviation,
stored along the last dimension, respectively.
"""
pk = self.prob_k(cdf)
return numpy.stack([numpy.mean(pk, axis=0), numpy.std(pk, axis=0)],
axis=-1)
def poisson_prob_k(self, rs, k, ndensity):
"""
Calculate the analytical PDF that a spherical volume of
radius :math:`r` contains :math:`k` objects, i.e.
:math:`P(k | V = 4 \pi r^3 / 3)`, assuming a Poisson field (uniform
distribution of points).
Parameters
----------
rs : 1-dimensional array
Array of separations.
k : int
Number of objects.
ndensity : float
Number density of objects.
Returns
-------
pk : 1-dimensional array
The PDF that a spherical volume of radius :math:`r` contains
:math:`k` objects.
"""
V = 4 * numpy.pi / 3 * rs**3
return (ndensity * V)**k / factorial(k) * numpy.exp(-ndensity * V)
@staticmethod
def cross_files(ic, folder):
"""