kNN-CDF secondary halo bias (#40)

* Add separate autoknn script & config file

* Edit ics

* Edit submission script

* Add threshold values

* Edit batch sizing

* Remove print

* edit

* Rename files

* Rename

* Update nb

* Edit runs

* Edit submit

* Add median threshold

* Add new auto reader

* Edit submit

* Edit submit

* Edit submit

* Add mean prob_k

* Edit runs

* Remove correlation file

* Move split to clustering

* Add init

* Remove import

* Add the file

* Add correlation reading

* Edit scripts

* Add below and above median permutation for cross

* Update imports

* Move rvs_in_sphere

* Create utils

* Split

* Add import

* Add normalised marks

* Add import

* Edit readme

* Clean up submission file

* Stop tracking submit files

* Update gitignore

* Add poisson field analytical expression

* Add abstract generators

* Add generators

* Pass in the generator

* Add a check for if there are any files

* Start saving average density

* Update nb

* Update readme

* Update units

* Edit jobs

* Update submits

* Update reader

* Add random crossing

* Update crossing script

* Add crossing with random

* Update readme

* Update notebook
Richard Stiskalek, 2023-04-09 20:57:05 +01:00 (committed by GitHub)
parent 826ab61d2d
commit 5784011de0
28 changed files with 2563 additions and 486 deletions

.gitignore (2 lines changed)

@@ -15,4 +15,4 @@ build/*
csiborgtools.egg-info/*
Pylians3/*
scripts/plot_correlation.ipynb
scripts/python.sh
scripts/*.sh


@@ -7,12 +7,20 @@
## Project Clustering
- [ ] Add uncertainty to the kNN-CDF autocorrelation?
- [ ] Add kNN-CDF differences.
- [ ] Add reading halo catalogues at higher redshifts.
- [x] Add the joint kNN-CDF calculation.
- [x] Make kNN-CDF more memory friendly if generating many randoms.
### Longterm
- [ ] Add uncertainty to the kNN-CDF autocorrelation?
- [ ] Add reading halo catalogues at higher redshifts.
### April 9 2023 Sunday
- [x] Add normalised marks calculation.
- [x] Add normalised marks to the submission scripts.
- [x] Verify analytical formula for the kNN of a uniform field.
- [x] For the cross-correlation try making the second field randoms.
- [ ] Clean up the reader code.
- [x] Correct the crossing script.
- [ ] Get started with the 2PCF calculation.
## Project Environmental Dependence
- [ ] Add gradient and Hessian of the overdensity field.


@@ -12,4 +12,4 @@
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
from csiborgtools import (read, match, utils, units, fits, field) # noqa
from csiborgtools import (read, match, utils, units, fits, field, clustering) # noqa


@@ -1,4 +1,4 @@
# Copyright (C) 2022 Richard Stiskalek
# Copyright (C) 2023 Richard Stiskalek
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 3 of the License, or (at your
@@ -12,58 +12,15 @@
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""
2PCF calculation.
NOTE: This is an old script that needs to be updated.
"""
import numpy
from Corrfunc.mocks import DDtheta_mocks
from Corrfunc.utils import convert_3d_counts_to_cf
from warnings import warn
def get_randoms_sphere(N, seed=42):
"""
Generate random points on a sphere.
Parameters
----------
N : int
Number of points.
seed : int
Random seed.
Returns
-------
ra : 1-dimensional array
Right ascension in :math:`[0, 360)` degrees.
dec : 1-dimensional array
Declination in :math:`[-90, 90]` degrees.
"""
gen = numpy.random.default_rng(seed)
ra = gen.random(N) * 360
dec = numpy.rad2deg(numpy.arcsin(2 * (gen.random(N) - 0.5)))
return ra, dec
def wrapRA(ra, degrees=True):
"""
Wrap the right ascension from :math:`[-180, 180)` to :math`[0, 360)`
degrees or equivalently if `degrees=False` in radians.
Paramaters
----------
ra : 1-dimensional array
Right ascension values.
degrees : float, optional
Whether the right ascension is in degrees.
Returns
-------
ra : 1-dimensional array
Wrapped around right ascension.
"""
mask = ra < 0
if numpy.sum(mask) == 0:
warn("No negative right ascension found.")
ra[mask] += 360 if degrees else 2 * numpy.pi
return ra
from .utils import (rvs_on_sphere, wrapRA)
def sphere_angular_tpcf(bins, RA1, DEC1, RA2=None, DEC2=None, nthreads=1,
@@ -113,11 +70,11 @@ def sphere_angular_tpcf(bins, RA1, DEC1, RA2=None, DEC2=None, nthreads=1,
NR1 = ND1 * Nmult
NR2 = ND2 * Nmult
# Generate randoms. Note that these are over the sphere!
randRA1, randDEC1 = get_randoms_sphere(NR1, seed1)
randRA2, randDEC2 = get_randoms_sphere(NR2, seed2)
randRA1, randDEC1 = rvs_on_sphere(NR1, indeg=True, random_state=seed1)
randRA2, randDEC2 = rvs_on_sphere(NR2, indeg=True, random_state=seed2)
# Wrap RA
RA1 = wrapRA(numpy.copy(RA1))
RA2 = wrapRA(numpy.copy(RA2))
RA1 = wrapRA(numpy.copy(RA1), indeg=True)
RA2 = wrapRA(numpy.copy(RA2), indeg=True)
# Calculate pairs
D1D2 = DDtheta_mocks(0, nthreads, bins, RA1, DEC1, RA2=RA2, DEC2=DEC2)
D1R2 = DDtheta_mocks(0, nthreads, bins, RA1, DEC1,
@@ -127,4 +84,4 @@ def sphere_angular_tpcf(bins, RA1, DEC1, RA2=None, DEC2=None, nthreads=1,
R1R2 = DDtheta_mocks(0, nthreads, bins, randRA1, randDEC1,
RA2=randRA2, DEC2=randDEC2)
# Convert to the CF
return convert_3d_counts_to_cf(ND1, ND2, NR1, NR2, D1D2, D1R2, D2R1, R1R2)
return convert_3d_counts_to_cf(ND1, ND2, NR1, NR2, D1D2, D1R2, D2R1, R1R2)
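A minimal usage sketch of the refactored `sphere_angular_tpcf` (hedged: the import path is an assumption since it is not shown, `bins` are taken to be angular separations in degrees as `DDtheta_mocks` expects, and the file itself is flagged above as still needing an update):

import numpy
# Assumed import path; the module housing sphere_angular_tpcf is not shown.
from csiborgtools.clustering.correlation import sphere_angular_tpcf

rng = numpy.random.default_rng(42)
ra = rng.uniform(0, 360, 5000)                               # RA in [0, 360) deg
dec = numpy.rad2deg(numpy.arcsin(rng.uniform(-1, 1, 5000)))  # uniform on sphere
bins = numpy.logspace(numpy.log10(0.1), numpy.log10(10), 21)  # bin edges [deg]
wtheta = sphere_angular_tpcf(bins, ra, dec, nthreads=4)       # auto-correlation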


@@ -0,0 +1,16 @@
# Copyright (C) 2022 Richard Stiskalek
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
from .knn import kNN_CDF # noqa
from .utils import (RVSinsphere, RVSinbox, RVSonsphere, BaseRVS, normalised_marks) # noqa


@@ -18,52 +18,16 @@ kNN-CDF calculation
import numpy
from scipy.interpolate import interp1d
from scipy.stats import binned_statistic
from tqdm import tqdm
from .utils import BaseRVS
class kNN_CDF:
"""
Object to calculate the kNN-CDF for a set of CSiBORG halo catalogues from
their kNN objects.
"""
@staticmethod
def rvs_in_sphere(nsamples, R, random_state=42, dtype=numpy.float32):
"""
Generate random samples in a sphere of radius `R` centered at the
origin.
Parameters
----------
nsamples : int
Number of samples to generate.
R : float
Radius of the sphere.
random_state : int, optional
Random state for the random number generator.
dtype : numpy dtype, optional
Data type, by default `numpy.float32`.
Returns
-------
samples : 2-dimensional array of shape `(nsamples, 3)`
"""
gen = numpy.random.default_rng(random_state)
# Sample spherical coordinates
r = gen.uniform(0, 1, nsamples).astype(dtype)**(1/3) * R
theta = 2 * numpy.arcsin(gen.uniform(0, 1, nsamples).astype(dtype))
phi = 2 * numpy.pi * gen.uniform(0, 1, nsamples).astype(dtype)
# Convert to cartesian coordinates
x = r * numpy.sin(theta) * numpy.cos(phi)
y = r * numpy.sin(theta) * numpy.sin(phi)
z = r * numpy.cos(theta)
return numpy.vstack([x, y, z]).T
"""Object to calculate the kNN-CDF statistic."""
@staticmethod
def cdf_from_samples(r, rmin=None, rmax=None, neval=None,
dtype=numpy.float32):
"""
Calculate the CDF from samples.
Calculate the kNN-CDF from a sampled PDF.
Parameters
----------
@@ -128,22 +92,21 @@ class kNN_CDF:
corr[k, :] = joint_cdf[k, :] - cdf0[k, :] * cdf1[k, :]
return corr
def brute_cdf(self, knn, nneighbours, Rmax, nsamples, rmin, rmax, neval,
def brute_cdf(self, knn, rvs_gen, nneighbours, nsamples, rmin, rmax, neval,
random_state=42, dtype=numpy.float32):
"""
Calculate the CDF for a kNN of CSiBORG halo catalogues without batch
sizing. This can become memory intense for large numbers of randoms
and, therefore, is only for testing purposes.
Calculate the kNN-CDF without batch sizing. This can become memory
intensive for large numbers of randoms and is therefore primarily for
testing purposes.
Parameters
----------
knns : `sklearn.neighbors.NearestNeighbors`
kNN of CSiBORG halo catalogues.
knn : `sklearn.neighbors.NearestNeighbors`
Catalogue NN object.
rvs_gen : :py:class:`csiborgtools.clustering.BaseRVS`
Uniform RVS generator matching `knn`.
nneighbours : int
Maximum number of neighbours to use for the kNN-CDF calculation.
Rmax : float
Maximum radius of the sphere in which to sample random points for
the knn-CDF calculation. This should match the CSiBORG catalogues.
nsamples : int
Number of random points to sample for the knn-CDF calculation.
rmin : float
@@ -164,7 +127,8 @@ class kNN_CDF:
cdfs : 2-dimensional array
CDFs evaluated at `rs`.
"""
rand = self.rvs_in_sphere(nsamples, Rmax, random_state=random_state)
assert isinstance(rvs_gen, BaseRVS)
rand = rvs_gen(nsamples, random_state=random_state)
dist, __ = knn.kneighbors(rand, nneighbours)
dist = dist.astype(dtype)
@@ -177,18 +141,20 @@ class kNN_CDF:
cdf = numpy.asanyarray(cdf)
return rs, cdf
def joint(self, knn0, knn1, nneighbours, Rmax, nsamples, rmin, rmax,
def joint(self, knn0, knn1, rvs_gen, nneighbours, nsamples, rmin, rmax,
neval, batch_size=None, random_state=42,
dtype=numpy.float32):
"""
Calculate the joint CDF for two kNNs of CSiBORG halo catalogues.
Calculate the joint kNN-CDF.
Parameters
----------
knn0 : `sklearn.neighbors.NearestNeighbors` instance
kNN of the first CSiBORG halo catalogue.
NN object of the first catalogue.
knn1 : `sklearn.neighbors.NearestNeighbors` instance
kNN of the second CSiBORG halo catalogue.
NN object of the second catalogue.
rvs_gen : :py:class:`csiborgtools.clustering.BaseRVS`
Uniform RVS generator matching `knn0` and `knn1`.
nneighbours : int
Maximum number of neighbours to use for the kNN-CDF calculation.
Rmax : float
@@ -222,6 +188,7 @@ class kNN_CDF:
joint_cdf : 2-dimensional array
Joint CDF evaluated at `rs`.
"""
assert isinstance(rvs_gen, BaseRVS)
batch_size = nsamples if batch_size is None else batch_size
assert nsamples >= batch_size
nbatches = nsamples // batch_size
@@ -233,8 +200,7 @@ class kNN_CDF:
jointdist = numpy.zeros((batch_size, 2), dtype=dtype)
for j in range(nbatches):
rand = self.rvs_in_sphere(batch_size, Rmax,
random_state=random_state + j)
rand = rvs_gen(batch_size, random_state=random_state + j)
dist0, __ = knn0.kneighbors(rand, nneighbours)
dist1, __ = knn1.kneighbors(rand, nneighbours)
@@ -269,21 +235,19 @@ class kNN_CDF:
rs = (bins[1:] + bins[:-1]) / 2 # Bin centers
return rs, cdf0, cdf1, joint_cdf
def __call__(self, *knns, nneighbours, Rmax, nsamples, rmin, rmax, neval,
batch_size=None, verbose=True, random_state=42,
dtype=numpy.float32):
def __call__(self, knn, rvs_gen, nneighbours, nsamples, rmin, rmax, neval,
batch_size=None, random_state=42, dtype=numpy.float32):
"""
Calculate the CDF for a set of kNNs of CSiBORG halo catalogues.
Parameters
----------
*knns : `sklearn.neighbors.NearestNeighbors` instances
kNNs of CSiBORG halo catalogues.
knn : `sklearn.neighbors.NearestNeighbors`
Catalogue NN object.
rvs_gen : :py:class:`csiborgtools.clustering.BaseRVS`
Uniform RVS generator matching `knn`.
nneighbours : int
Maximum number of neighbours to use for the kNN-CDF calculation.
Rmax : float
Maximum radius of the sphere in which to sample random points for
the knn-CDF calculation. This should match the CSiBORG catalogues.
nsamples : int
Number of random points to sample for the knn-CDF calculation.
rmin : float
@@ -296,8 +260,6 @@ class kNN_CDF:
Number of random points to sample in each batch. By default equal
to `nsamples`; however, it is recommended to be smaller to avoid
requesting too much memory.
verbose : bool, optional
Verbosity flag.
random_state : int, optional
Random state for the random number generator.
dtype : numpy dtype, optional
@@ -307,33 +269,30 @@ class kNN_CDF:
-------
rs : 1-dimensional array
Distances at which the CDF is evaluated.
cdfs : 2 or 3-dimensional array
CDFs evaluated at `rs`.
cdf : 2-dimensional array
CDF evaluated at `rs`.
"""
assert isinstance(rvs_gen, BaseRVS)
batch_size = nsamples if batch_size is None else batch_size
assert nsamples >= batch_size
nbatches = nsamples // batch_size
# Preallocate the bins and the CDF array
bins = numpy.logspace(numpy.log10(rmin), numpy.log10(rmax), neval)
cdfs = numpy.zeros((len(knns), nneighbours, neval - 1), dtype=dtype)
for i, knn in enumerate(tqdm(knns) if verbose else knns):
for j in range(nbatches):
rand = self.rvs_in_sphere(batch_size, Rmax,
random_state=random_state + j)
dist, __ = knn.kneighbors(rand, nneighbours)
cdf = numpy.zeros((nneighbours, neval - 1), dtype=dtype)
for i in range(nbatches):
rand = rvs_gen(batch_size, random_state=random_state + i)
dist, __ = knn.kneighbors(rand, nneighbours)
for k in range(nneighbours): # Count for each neighbour
_counts, __, __ = binned_statistic(
dist[:, k], dist[:, k], bins=bins, statistic="count",
range=(rmin, rmax))
cdfs[i, k, :] += _counts
for k in range(nneighbours): # Count for each neighbour
_counts, __, __ = binned_statistic(
dist[:, k], dist[:, k], bins=bins, statistic="count",
range=(rmin, rmax))
cdf[k, :] += _counts
cdfs = numpy.cumsum(cdfs, axis=-1) # Cumulative sum, i.e. the CDF
for i in range(len(knns)):
for k in range(nneighbours):
cdfs[i, k, :] /= cdfs[i, k, -1]
cdf = numpy.cumsum(cdf, axis=-1) # Cumulative sum, i.e. the CDF
for k in range(nneighbours):
cdf[k, :] /= cdf[k, -1]
rs = (bins[1:] + bins[:-1]) / 2 # Bin centers
cdfs = cdfs[0, ...] if len(knns) == 1 else cdfs
return rs, cdfs
return rs, cdf
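Putting the refactor together: the estimator now takes any `BaseRVS` generator instead of a hard-coded sphere. A minimal sketch of the auto-correlation call, mirroring `scripts/knn_auto.py` further below (synthetic positions stand in for a halo catalogue):

from sklearn.neighbors import NearestNeighbors
import csiborgtools

Rmax = 155 / 0.705                            # Mpc, high-resolution region
rvs_gen = csiborgtools.clustering.RVSinsphere(Rmax)
pos = rvs_gen(10000, random_state=0)          # stand-in (npoints, 3) positions

knn = NearestNeighbors()
knn.fit(pos)

knncdf = csiborgtools.clustering.kNN_CDF()
rs, cdf = knncdf(knn, rvs_gen, nneighbours=8, nsamples=int(1e6),
                 rmin=0.1, rmax=100, neval=10000, batch_size=int(1e5),
                 random_state=42)
# cdf[k] is the CDF of the distance to the (k + 1)-th nearest neighbour,
# evaluated at the bin centres rs; shape (nneighbours, neval - 1).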


@@ -0,0 +1,193 @@
# Copyright (C) 2022 Richard Stiskalek
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""Clustering support functions."""
from abc import (ABC, abstractmethod)
from warnings import warn
import numpy
###############################################################################
# Random points #
###############################################################################
class BaseRVS(ABC):
"""
Base RVS generator.
"""
@abstractmethod
def __call__(self, nsamples, random_state, dtype):
"""
Generate RVS.
Parameters
----------
nsamples : int
Number of samples to generate.
random_state : int, optional
Random state for the random number generator.
dtype : numpy dtype, optional
Data type, by default `numpy.float32`.
Returns
-------
samples : 2-dimensional array of shape `(nsamples, ndim)`
"""
pass
class RVSinsphere(BaseRVS):
"""
Generator of uniform RVS in a sphere of radius `R` in Cartesian
coordinates centered at the origin.
Parameters
----------
R : float
Radius of the sphere.
"""
def __init__(self, R):
assert R > 0, "Radius must be positive."
self.R = R
BaseRVS.__init__(self)
def __call__(self, nsamples, random_state=42, dtype=numpy.float32):
gen = numpy.random.default_rng(random_state)
# Spherical
r = gen.random(nsamples, dtype=dtype)**(1/3) * self.R
theta = numpy.arccos(1 - 2 * gen.random(nsamples, dtype=dtype))  # cos(theta) uniform in [-1, 1] for isotropy
phi = 2 * numpy.pi * gen.random(nsamples, dtype=dtype)
# Cartesian
x = r * numpy.sin(theta) * numpy.cos(phi)
y = r * numpy.sin(theta) * numpy.sin(phi)
z = r * numpy.cos(theta)
return numpy.vstack([x, y, z]).T
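# NOTE: both draws above are inverse-CDF transforms. A uniform density in
# a ball has P(<r) = (r / R)^3, hence r = R * U**(1/3); isotropy requires
# cos(theta) uniform on [-1, 1], i.e. theta = arccos(1 - 2 * U), with
# phi = 2 * pi * U and U ~ Uniform(0, 1) drawn afresh for each coordinate.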
class RVSinbox(BaseRVS):
"""
Generator of uniform RVS in a box of width `L` in Cartesian coordinates in
:math:`[0, L]^3`.
Parameters
----------
width : float
Width of the box.
"""
def __init__(self, width):
assert width > 0, "Width must be positive."
self.width = width
BaseRVS.__init__(self)
def __call__(self, nsamples, random_state=42, dtype=numpy.float32):
gen = numpy.random.default_rng(random_state)
x = gen.random(nsamples, dtype=dtype)
y = gen.random(nsamples, dtype=dtype)
z = gen.random(nsamples, dtype=dtype)
return self.width * numpy.vstack([x, y, z]).T
class RVSonsphere(BaseRVS):
"""
Generator of uniform RVS on the surface of a unit sphere, with RA in
:math:`[0, 2\pi)` and dec in :math:`[-\pi / 2, \pi / 2]`. If `indeg`
is `True`, both are converted to degrees.
Parameters
----------
indeg : bool
Whether to generate the right ascension and declination in degrees.
"""
def __init__(self, indeg):
assert isinstance(indeg, bool), "`indeg` must be a boolean."
self.indeg = indeg
BaseRVS.__init__(self)
def __call__(self, nsamples, random_state=42, dtype=numpy.float32):
gen = numpy.random.default_rng(random_state)
ra = 2 * numpy.pi * gen.random(nsamples, dtype=dtype)
dec = numpy.arcsin(2 * (gen.random(nsamples, dtype=dtype) - 0.5))
if self.indeg:
ra = numpy.rad2deg(ra)
dec = numpy.rad2deg(dec)
return numpy.vstack([ra, dec]).T
###############################################################################
# RA wrapping #
###############################################################################
def wrapRA(ra, indeg):
"""
Wrap RA from :math:`[-180, 180)` to :math:`[0, 360)` degrees, or the
equivalent range in radians if `indeg` is `False`.
Parameters
----------
ra : 1-dimensional array
Right ascension.
indeg : bool
Whether the right ascension is in degrees.
Returns
-------
wrapped_ra : 1-dimensional array
"""
mask = ra < 0
if numpy.sum(mask) == 0:
warn("No negative right ascension found.", UserWarning())
ra[mask] += 360 if indeg else 2 * numpy.pi
return ra
###############################################################################
# Secondary assembly bias normalised marks #
###############################################################################
def normalised_marks(x, y, nbins):
"""
Calculate the normalised marks of `y` binned by `x`.
Parameters
----------
x : 1-dimensional array
Binning variable.
y : 1-dimensional array
The variable to be marked.
nbins : int
Number of percentile bins.
Returns
-------
marks : 1-dimensional array
"""
assert x.ndim == y.ndim == 1
if y.dtype not in [numpy.float32, numpy.float64]:
raise NotImplementedError("Marks from integers are not supported.")
bins = numpy.percentile(x, q=numpy.linspace(0, 100, nbins + 1))
marks = numpy.full_like(y, numpy.nan)
for i in range(nbins):
m = (x >= bins[i]) & (x < bins[i + 1])
# Calculate the normalised marks of this bin
_marks = numpy.full(numpy.sum(m), numpy.nan, dtype=marks.dtype)
for n, ind in enumerate(numpy.argsort(y[m])):
_marks[ind] = n
_marks /= numpy.nanmax(_marks)
marks[m] = _marks
return marks
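To make the marks concrete: bin by the primary property, rank the secondary property within each bin, and normalise the ranks to [0, 1], so that `marks >= 0.5` selects objects above the median secondary property at (approximately) fixed primary property. A sketch with hypothetical mass and spin arrays:

import numpy
from csiborgtools.clustering import normalised_marks

rng = numpy.random.default_rng(42)
mass = 10**rng.uniform(12, 14, 1000)      # hypothetical primary property
spin = rng.lognormal(-3, 0.5, 1000)       # hypothetical secondary property

marks = normalised_marks(mass, spin, nbins=10)
high_spin = marks >= 0.5                  # above-median spin at fixed mass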


@@ -18,5 +18,3 @@ from .match import (RealisationsMatcher, cosine_similarity, # noqa
calculate_overlap, calculate_overlap_indxs, # noqa
dist_centmass, dist_percentile) # noqa
from .num_density import (binned_counts, number_density) # noqa
from .knn import kNN_CDF
# from .correlation import (get_randoms_sphere, sphere_angular_tpcf) # noqa


@@ -18,6 +18,7 @@ Tools for summarising various results.
from os.path import (join, isfile)
from glob import glob
import numpy
from scipy.special import factorial
import joblib
from tqdm import tqdm
@@ -184,55 +185,53 @@ class kNNCDFReader:
"""
Shortcut object to read in the kNN CDF data.
"""
def read(self, files, ks, rmin=None, rmax=None, to_clip=True):
def read(self, run, folder, rmin=None, rmax=None, to_clip=True):
"""
Read the kNN CDF data can be either the auto- or cross-correlation.
Read the auto- or cross-correlation kNN-CDF data. Infers the type from
the data files.
Parameters
----------
files : list of str
List of file paths to read in.
ks : list of int
kNN values to read in.
run : str
Run ID to read in.
folder : str
Path to the folder where the auto-correlation kNN-CDF is stored.
rmin : float, optional
Minimum separation. By default ignored.
rmax : float, optional
Maximum separation. By default ignored.
to_clip : bool, optional
Whether to clip the auto-correlation CDF. Ignored if reading in the
Whether to clip the auto-correlation CDF. Ignored for
cross-correlation.
Returns
-------
rs : 1-dimensional array
Array of separations.
out : 4-dimensional array
Auto-correlation or cross-correlation kNN CDFs. The shape is
`(len(files), len(mass_thresholds), len(ks), neval)`.
mass_thresholds : 1-dimensional array
Array of mass thresholds.
rs : 1-dimensional array of shape `(neval, )`
Separations where the CDF is evaluated.
out : 3-dimensional array of shape `(len(files), len(ks), neval)`
Array of CDFs or cross-correlations.
"""
data = joblib.load(files[0])
if "cdf_0" in data.keys():
isauto = True
kind = "cdf"
elif "corr_0" in data.keys():
isauto = False
kind = "corr"
else:
raise ValueError("Unknown data format.")
rs = data["rs"]
mass_thresholds = data["mass_threshold"]
neval = data["{}_0".format(kind)].shape[1]
out = numpy.full((len(files), len(mass_thresholds), len(ks), neval),
numpy.nan, dtype=numpy.float32)
run += ".p"
files = [f for f in glob(join(folder, "*")) if run in f]
if len(files) == 0:
raise RuntimeError("No files found for run `{}`.".format(run[:-2]))
for i, file in enumerate(tqdm(files)):
for i, file in enumerate(files):
data = joblib.load(file)
for j in range(len(mass_thresholds)):
out[i, j, ...] = data["{}_{}".format(kind, j)][ks, :]
if isauto and to_clip:
out[i, j, ...] = self.clipped_cdf(out[i, j, ...])
if i == 0: # Initialise the array
if "corr" in data.keys():
kind = "corr"
isauto = False
else:
kind = "cdf"
isauto = True
out = numpy.full((len(files), *data[kind].shape), numpy.nan,
dtype=numpy.float32)
rs = data["rs"]
out[i, ...] = data[kind]
if isauto and to_clip:
out[i, ...] = self.clipped_cdf(out[i, ...])
# Apply separation cuts
mask = (rs >= rmin if rmin is not None else rs > 0)
@@ -240,7 +239,7 @@ class kNNCDFReader:
rs = rs[mask]
out = out[..., mask]
return rs, out, mass_thresholds
return rs, out
@staticmethod
def peaked_cdf(cdf, make_copy=True):
@@ -295,37 +294,74 @@ class kNNCDFReader:
return cdf
@staticmethod
def prob_kvolume(cdfs, rs=None, normalise=False):
"""
Calculate the probability that a spherical volume contains :math:`k`
objects from the kNN CDFs.
def prob_k(cdf):
r"""
Calculate the PDF that a spherical volume of radius :math:`r` contains
:math:`k` objects, i.e. :math:`P(k | V = 4 \pi r^3 / 3)`.
Parameters
----------
cdf : 4-dimensional array of shape `(nfiles, nmasses, nknn, nrs)`
cdf : 3-dimensional array of shape `(len(files), len(ks), len(rs))`
Array of CDFs
normalise : bool, optional
Whether to normalise the probability to 1.
Returns
-------
pk : 4-dimensional array of shape `(nfiles, nmasses, nknn - 1, nrs)`
pk : 3-dimensional array of shape `(len(files), len(ks) - 1, len(rs))`
"""
out = numpy.full_like(cdfs[..., 1:, :], numpy.nan, dtype=numpy.float32)
out = numpy.full_like(cdf[..., 1:, :], numpy.nan, dtype=numpy.float32)
nks = cdf.shape[-2]
out[..., 0, :] = 1 - cdf[..., 0, :]
for k in range(cdfs.shape[-2] - 1):
out[..., k, :] = cdfs[..., k, :] - cdfs[..., k + 1, :]
for k in range(1, nks - 1):
out[..., k, :] = cdf[..., k - 1, :] - cdf[..., k, :]
if normalise:
assert rs is not None, "rs must be provided to normalise."
assert rs.ndim == 1
norm = numpy.nansum(
0.5 * (out[..., 1:] + out[..., :-1]) * (rs[1:] - rs[:-1]),
axis=-1)
out /= norm.reshape(*norm.shape, 1)
return out
def mean_prob_k(self, cdf):
"""
Calculate the mean PDF that a spherical volume of radius :math:`r`
contains :math:`k` objects, i.e. :math:`P(k | V = 4 \pi r^3 / 3)`,
averaged over the IC realisations.
Parameters
----------
cdf : 3-dimensional array of shape `(len(files), len(ks), len(rs))`
Array of CDFs
Returns
-------
out : 3-dimensional array of shape `(len(ks) - 1, len(rs), 2)`
Mean :math:`P(k | V = 4 \pi r^3 / 3)` and its standard deviation,
stored along the last dimension, respectively.
"""
pk = self.prob_k(cdf)
return numpy.stack([numpy.mean(pk, axis=0), numpy.std(pk, axis=0)],
axis=-1)
def poisson_prob_k(self, rs, k, ndensity):
"""
Calculate the analytical PDF that a spherical volume of
radius :math:`r` contains :math:`k` objects, i.e.
:math:`P(k | V = 4 \pi r^3 / 3)`, assuming a Poisson field (uniform
distribution of points).
Parameters
----------
rs : 1-dimensional array
Array of separations.
k : int
Number of objects.
ndensity : float
Number density of objects.
Returns
-------
pk : 1-dimensional array
The PDF that a spherical volume of radius :math:`r` contains
:math:`k` objects.
"""
V = 4 * numpy.pi / 3 * rs**3
return (ndensity * V)**k / factorial(k) * numpy.exp(-ndensity * V)
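# Sanity check (sketch): P(k | V) above is the Poisson PMF with mean
# n * V, so it sums to ~1 over k wherever n * V stays well below the
# truncation kmax, e.g.:
#
#     reader = kNNCDFReader()
#     rs = numpy.linspace(0.1, 10, 50)  # n * V <= ~0.4 for ndensity = 1e-4
#     pk = numpy.array([reader.poisson_prob_k(rs, k, ndensity=1e-4)
#                       for k in range(32)])
#     assert numpy.allclose(pk.sum(axis=0), 1.0)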
@staticmethod
def cross_files(ic, folder):
"""

File diff suppressed because one or more lines are too long

scripts/knn_auto.py (new file, 182 lines)

@@ -0,0 +1,182 @@
# Copyright (C) 2022 Richard Stiskalek
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""A script to calculate the KNN-CDF for a set of CSiBORG halo catalogues."""
from os.path import join
from warnings import warn
from argparse import ArgumentParser
from copy import deepcopy
from datetime import datetime
from mpi4py import MPI
from TaskmasterMPI import master_process, worker_process
import numpy
from sklearn.neighbors import NearestNeighbors
import joblib
import yaml
try:
import csiborgtools
except ModuleNotFoundError:
import sys
sys.path.append("../")
import csiborgtools
###############################################################################
# MPI and arguments #
###############################################################################
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
nproc = comm.Get_size()
parser = ArgumentParser()
parser.add_argument("--runs", type=str, nargs="+")
args = parser.parse_args()
with open('../scripts/knn_auto.yml', 'r') as file:
config = yaml.safe_load(file)
Rmax = 155 / 0.705 # Mpc (h = 0.705) high resolution region radius
totvol = 4 * numpy.pi * Rmax**3 / 3
minmass = 1e12
ics = [7444, 7468, 7492, 7516, 7540, 7564, 7588, 7612, 7636, 7660, 7684,
7708, 7732, 7756, 7780, 7804, 7828, 7852, 7876, 7900, 7924, 7948,
7972, 7996, 8020, 8044, 8068, 8092, 8116, 8140, 8164, 8188, 8212,
8236, 8260, 8284, 8308, 8332, 8356, 8380, 8404, 8428, 8452, 8476,
8500, 8524, 8548, 8572, 8596, 8620, 8644, 8668, 8692, 8716, 8740,
8764, 8788, 8812, 8836, 8860, 8884, 8908, 8932, 8956, 8980, 9004,
9028, 9052, 9076, 9100, 9124, 9148, 9172, 9196, 9220, 9244, 9268,
9292, 9316, 9340, 9364, 9388, 9412, 9436, 9460, 9484, 9508, 9532,
9556, 9580, 9604, 9628, 9652, 9676, 9700, 9724, 9748, 9772, 9796,
9820, 9844]
dumpdir = "/mnt/extraspace/rstiskalek/csiborg/knn"
fout = join(dumpdir, "auto", "knncdf_{}_{}.p")
paths = csiborgtools.read.CSiBORGPaths()
knncdf = csiborgtools.clustering.kNN_CDF()
###############################################################################
# Analysis #
###############################################################################
def read_single(selection, cat):
"""Positions for single catalogue auto-correlation."""
mmask = numpy.ones(len(cat), dtype=bool)
pos = cat.positions(False)
# Primary selection
psel = selection["primary"]
pmin, pmax = psel.get("min", None), psel.get("max", None)
if pmin is not None:
mmask &= (cat[psel["name"]] >= pmin)
if pmax is not None:
mmask &= (cat[psel["name"]] < pmax)
pos = pos[mmask, ...]
# Secondary selection
if "secondary" not in selection:
return pos
smask = numpy.ones(pos.shape[0], dtype=bool)
ssel = selection["secondary"]
smin, smax = ssel.get("min", None), ssel.get("max", None)
prop = cat[ssel["name"]][mmask]
if ssel.get("toperm", False):
prop = numpy.random.permutation(prop)
if ssel.get("marked", True):
x = cat[psel["name"]][mmask]
prop = csiborgtools.clustering.normalised_marks(
x, prop, nbins=config["nbins_marks"])
if smin is not None:
smask &= (prop >= smin)
if smax is not None:
smask &= (prop < smax)
return pos[smask, ...]
def do_auto(run, cat, ic):
"""Calculate the kNN-CDF single catalgoue autocorrelation."""
_config = config.get(run, None)
if _config is None:
warn("No configuration for run {}.".format(run))
return
rvs_gen = csiborgtools.clustering.RVSinsphere(Rmax)
pos = read_single(_config, cat)
knn = NearestNeighbors()
knn.fit(pos)
rs, cdf = knncdf(
knn, rvs_gen=rvs_gen, nneighbours=config["nneighbours"],
rmin=config["rmin"], rmax=config["rmax"],
nsamples=int(config["nsamples"]), neval=int(config["neval"]),
batch_size=int(config["batch_size"]), random_state=config["seed"])
joblib.dump({"rs": rs, "cdf": cdf, "ndensity": pos.shape[0] / totvol},
fout.format(str(ic).zfill(5), run))
def do_cross_rand(run, cat, ic):
"""Calculate the kNN-CDF cross catalogue random correlation."""
_config = config.get(run, None)
if _config is None:
warn("No configuration for run {}.".format(run))
return
rvs_gen = csiborgtools.clustering.RVSinsphere(Rmax)
knn1, knn2 = NearestNeighbors(), NearestNeighbors()
pos1 = read_single(_config, cat)
knn1.fit(pos1)
pos2 = rvs_gen(pos1.shape[0])
knn2.fit(pos2)
rs, cdf0, cdf1, joint_cdf = knncdf.joint(
knn1, knn2, rvs_gen=rvs_gen, nneighbours=int(config["nneighbours"]),
rmin=config["rmin"], rmax=config["rmax"],
nsamples=int(config["nsamples"]), neval=int(config["neval"]),
batch_size=int(config["batch_size"]), random_state=config["seed"])
corr = knncdf.joint_to_corr(cdf0, cdf1, joint_cdf)
joblib.dump({"rs": rs, "corr": corr}, fout.format(str(ic).zfill(5), run))
def do_runs(ic):
cat = csiborgtools.read.HaloCatalogue(ic, paths, max_dist=Rmax,
min_mass=minmass)
for run in args.runs:
if "random" in run:
do_cross_rand(run, cat, ic)
else:
do_auto(run, cat, ic)
###############################################################################
# MPI task delegation #
###############################################################################
if nproc > 1:
if rank == 0:
tasks = deepcopy(ics)
master_process(tasks, comm, verbose=True)
else:
worker_process(do_runs, comm, verbose=False)
else:
tasks = deepcopy(ics)
for task in tasks:
print("{}: completing task `{}`.".format(datetime.now(), task))
do_runs(task)
comm.Barrier()
if rank == 0:
print("{}: all finished.".format(datetime.now()))
quit() # Force quit the script
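The per-IC dumps are then collected with the updated reader; a sketch, assuming `kNNCDFReader` is exported from `csiborgtools.read` and using the output folder from the script above:

import csiborgtools

reader = csiborgtools.read.kNNCDFReader()
folder = "/mnt/extraspace/rstiskalek/csiborg/knn/auto"
rs, cdf = reader.read("mass001", folder, rmin=0.5, rmax=50)
# cdf has shape (len(files), nneighbours, len(rs)); for "*_random" runs the
# reader returns cross-correlations instead of CDFs.
pk = reader.prob_k(cdf)  # P(k | V) estimated from consecutive kNN CDFs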

scripts/knn_auto.yml (new file, 144 lines)

@@ -0,0 +1,144 @@
rmin: 0.1
rmax: 100
nneighbours: 64
nsamples: 1.e+7
batch_size: 1.e+6
neval: 10000
seed: 42
nbins_marks: 10
################################################################################
# totpartmass #
################################################################################
"mass001":
primary:
name: totpartmass
min: 1.e+12
max: 1.e+13
"mass002":
primary:
name: totpartmass
min: 1.e+13
max: 1.e+14
"mass003":
primary:
name: totpartmass
min: 1.e+14
################################################################################
# totpartmass + lambda200c #
################################################################################
"mass001_spinlow":
primary:
name: totpartmass
min: 1.e+12
max: 1.e+13
secondary:
name: lambda200c
toperm: false
marked: false
max: 0.5
"mass001_spinhigh":
primary:
name: totpartmass
min: 1.e+12
max: 1.e+13
secondary:
name: lambda200c
toperm: false
marked: true
min: 0.5
"mass001_spinmedian_perm":
primary:
name: totpartmass
min: 1.e+12
max: 1.e+13
secondary:
name: lambda200c
toperm: true
marked: true
min: 0.5
"mass002_spinlow":
primary:
name: totpartmass
min: 1.e+13
max: 1.e+14
secondary:
name: lambda200c
toperm: false
marked: false
max: 0.5
"mass002_spinhigh":
primary:
name: totpartmass
min: 1.e+13
max: 1.e+14
secondary:
name: lambda200c
toperm: false
marked: true
min: 0.5
"mass002_spinmedian_perm":
primary:
name: totpartmass
min: 1.e+13
max: 1.e+14
secondary:
name: lambda200c
toperm: true
marked: true
min: 0.5
"mass003_spinlow":
primary:
name: totpartmass
min: 1.e+14
secondary:
name: lambda200c
toperm: false
marked: false
max: 0.5
"mass003_spinhigh":
primary:
name: totpartmass
min: 1.e+14
secondary:
name: lambda200c
toperm: false
marked: true
min: 0.5
"mass003_spinmedian_perm":
primary:
name: totpartmass
min: 1.e+14
secondary:
name: lambda200c
toperm: true
marked: true
min: 0.5
################################################################################
# Cross with random #
################################################################################
"mass001_random":
primary:
name: totpartmass
min: 1.e+12
max: 1.e+13


@@ -13,6 +13,7 @@
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""A script to calculate the KNN-CDF for a set of CSiBORG halo catalogues."""
from warnings import warn
from os.path import join
from argparse import ArgumentParser
from copy import deepcopy
@@ -20,8 +21,10 @@ from datetime import datetime
from itertools import combinations
from mpi4py import MPI
from TaskmasterMPI import master_process, worker_process
import numpy
from sklearn.neighbors import NearestNeighbors
import joblib
import yaml
try:
import csiborgtools
except ModuleNotFoundError:
@@ -38,17 +41,13 @@ rank = comm.Get_rank()
nproc = comm.Get_size()
parser = ArgumentParser()
parser.add_argument("--rmin", type=float)
parser.add_argument("--rmax", type=float)
parser.add_argument("--nneighbours", type=int)
parser.add_argument("--nsamples", type=int)
parser.add_argument("--neval", type=int)
parser.add_argument("--batch_size", type=int)
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--runs", type=str, nargs="+")
args = parser.parse_args()
with open('../scripts/knn_cross.yml', 'r') as file:
config = yaml.safe_load(file)
Rmax = 155 / 0.705 # Mpc/h high resolution region radius
mass_threshold = [1e12, 1e13, 1e14] # Msun
Rmax = 155 / 0.705 # Mpc (h = 0.705) high resolution region radius
minmass = 1e12
ics = [7444, 7468, 7492, 7516, 7540, 7564, 7588, 7612, 7636, 7660, 7684,
7708, 7732, 7756, 7780, 7804, 7828, 7852, 7876, 7900, 7924, 7948,
7972, 7996, 8020, 8044, 8068, 8092, 8116, 8140, 8164, 8188, 8212,
@@ -59,80 +58,58 @@ ics = [7444, 7468, 7492, 7516, 7540, 7564, 7588, 7612, 7636, 7660, 7684,
9292, 9316, 9340, 9364, 9388, 9412, 9436, 9460, 9484, 9508, 9532,
9556, 9580, 9604, 9628, 9652, 9676, 9700, 9724, 9748, 9772, 9796,
9820, 9844]
dumpdir = "/mnt/extraspace/rstiskalek/csiborg/knn"
fout_auto = join(dumpdir, "auto", "knncdf_{}.p")
fout_cross = join(dumpdir, "cross", "knncdf_{}_{}.p")
paths = csiborgtools.read.CSiBORGPaths()
dumpdir = "/mnt/extraspace/rstiskalek/csiborg/knn"
fout = join(dumpdir, "cross", "knncdf_{}_{}_{}.p")
knncdf = csiborgtools.clustering.kNN_CDF()
###############################################################################
# Analysis #
###############################################################################
knncdf = csiborgtools.match.kNN_CDF()
def read_single(selection, cat):
mmask = numpy.ones(len(cat), dtype=bool)
pos = cat.positions(False)
# Primary selection
psel = selection["primary"]
pmin, pmax = psel.get("min", None), psel.get("max", None)
if pmin is not None:
mmask &= (cat[psel["name"]] >= pmin)
if pmax is not None:
mmask &= (cat[psel["name"]] < pmax)
return pos[mmask, ...]
def do_auto(ic):
out = {}
cat = csiborgtools.read.HaloCatalogue(ic, paths, max_dist=Rmax)
def do_cross(run, ics):
_config = config.get(run, None)
if _config is None:
warn("No configuration for run {}.".format(run))
return
rvs_gen = csiborgtools.clustering.RVSinsphere(Rmax)
knn1, knn2 = NearestNeighbors(), NearestNeighbors()
for i, mmin in enumerate(mass_threshold):
knn = NearestNeighbors()
knn.fit(cat.positions(False)[cat["totpartmass"] > mmin, ...])
rs, cdf = knncdf(knn, nneighbours=args.nneighbours, Rmax=Rmax,
rmin=args.rmin, rmax=args.rmax, nsamples=args.nsamples,
neval=args.neval, batch_size=args.batch_size,
random_state=args.seed, verbose=False)
out.update({"cdf_{}".format(i): cdf})
out.update({"rs": rs, "mass_threshold": mass_threshold})
joblib.dump(out, fout_auto.format(ic))
def do_cross(ics):
out = {}
cat1 = csiborgtools.read.HaloCatalogue(ics[0], paths, max_dist=Rmax)
pos1 = read_single(_config, cat1)
knn1.fit(pos1)
cat2 = csiborgtools.read.HaloCatalogue(ics[1], paths, max_dist=Rmax)
pos2 = read_single(_config, cat2)
knn2.fit(pos2)
for i, mmin in enumerate(mass_threshold):
knn1 = NearestNeighbors()
knn1.fit(cat1.positions()[cat1["totpartmass"] > mmin, ...])
rs, cdf0, cdf1, joint_cdf = knncdf.joint(
knn1, knn2, rvs_gen=rvs_gen, nneighbours=int(config["nneighbours"]),
rmin=config["rmin"], rmax=config["rmax"],
nsamples=int(config["nsamples"]), neval=int(config["neval"]),
batch_size=int(config["batch_size"]), random_state=config["seed"])
knn2 = NearestNeighbors()
knn2.fit(cat2.positions()[cat2["totpartmass"] > mmin, ...])
corr = knncdf.joint_to_corr(cdf0, cdf1, joint_cdf)
rs, cdf0, cdf1, joint_cdf = knncdf.joint(
knn1, knn2, nneighbours=args.nneighbours, Rmax=Rmax,
rmin=args.rmin, rmax=args.rmax, nsamples=args.nsamples,
neval=args.neval, batch_size=args.batch_size,
random_state=args.seed)
joblib.dump({"rs": rs, "corr": corr},
fout.format(str(ics[0]).zfill(5), str(ics[1]).zfill(5), run))
corr = knncdf.joint_to_corr(cdf0, cdf1, joint_cdf)
out.update({"corr_{}".format(i): corr})
out.update({"rs": rs, "mass_threshold": mass_threshold})
joblib.dump(out, fout_cross.format(*ics))
###############################################################################
# Autocorrelation calculation #
###############################################################################
if nproc > 1:
if rank == 0:
tasks = deepcopy(ics)
master_process(tasks, comm, verbose=True)
else:
worker_process(do_auto, comm, verbose=False)
else:
tasks = deepcopy(ics)
for task in tasks:
print("{}: completing task `{}`.".format(datetime.now(), task))
do_auto(task)
comm.Barrier()
def do_runs(ics):
print(ics)
for run in args.runs:
do_cross(run, ics)
###############################################################################
@@ -145,12 +122,12 @@ if nproc > 1:
tasks = list(combinations(ics, 2))
master_process(tasks, comm, verbose=True)
else:
worker_process(do_cross, comm, verbose=False)
worker_process(do_runs, comm, verbose=False)
else:
tasks = deepcopy(ics)
tasks = list(combinations(ics, 2))
for task in tasks:
print("{}: completing task `{}`.".format(datetime.now(), task))
do_cross(task)
do_runs(task)
comm.Barrier()
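For reference, the cross-correlation path now goes through the joint CDF; a sketch following `do_cross` above, with synthetic positions standing in for two halo catalogues:

from sklearn.neighbors import NearestNeighbors
import csiborgtools

Rmax = 155 / 0.705
rvs_gen = csiborgtools.clustering.RVSinsphere(Rmax)
knncdf = csiborgtools.clustering.kNN_CDF()

knn1, knn2 = NearestNeighbors(), NearestNeighbors()
knn1.fit(rvs_gen(5000, random_state=0))
knn2.fit(rvs_gen(5000, random_state=1))

rs, cdf0, cdf1, joint_cdf = knncdf.joint(
    knn1, knn2, rvs_gen=rvs_gen, nneighbours=8, rmin=0.1, rmax=100,
    nsamples=int(1e6), neval=10000, batch_size=int(1e5), random_state=42)
corr = knncdf.joint_to_corr(cdf0, cdf1, joint_cdf)  # joint - cdf0 * cdf1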

scripts/knn_cross.yml (new file, 29 lines)

@@ -0,0 +1,29 @@
rmin: 0.1
rmax: 100
nneighbours: 64
nsamples: 1.e+7
batch_size: 1.e+6
neval: 10000
seed: 42
################################################################################
# totpartmass #
################################################################################
"mass001":
primary:
name: totpartmass
min: 1.e+12
max: 1.e+13
"mass002":
primary:
name: totpartmass
min: 1.e+13
max: 1.e+14
"mass003":
primary:
name: totpartmass
min: 1.e+14


@@ -1,46 +0,0 @@
#!/bin/bash -l
echo =========================================================
echo Job submitted date = Fri Mar 31 16:17:57 BST 2023
date_start=`date +%s`
echo $SLURM_JOB_NUM_NODES nodes \( $SMP processes per node \)
echo $SLURM_JOB_NUM_NODES hosts used: $SLURM_JOB_NODELIST
echo Job output begins
echo -----------------
echo
#hostname
# Need to set the max locked memory very high otherwise IB can't allocate enough and fails with "UCX ERROR Failed to allocate memory pool chunk: Input/output error"
ulimit -l unlimited
# To allow mvapich to run ok
export MV2_SMP_USE_CMA=0
#which mpirun
export OMP_NUM_THEADS=1
/usr/local/shared/slurm/bin/srun -u -n 5 --mpi=pmi2 --mem-per-cpu=7168 nice -n 10 /mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python run_knn.py --rmin 0.05 --rmax 50 --nsamples 100000 --neval 10000
# If we've been checkpointed
#if [ -n "${DMTCP_CHECKPOINT_DIR}" ]; then
if [ -d "${DMTCP_CHECKPOINT_DIR}" ]; then
# echo -n "Job was checkpointed at "
# date
# echo
sleep 1
# fi
echo -n
else
echo ---------------
echo Job output ends
date_end=`date +%s`
seconds=$((date_end-date_start))
minutes=$((seconds/60))
seconds=$((seconds-60*minutes))
hours=$((minutes/60))
minutes=$((minutes-60*hours))
echo =========================================================
echo PBS job: finished date = `date`
echo Total run time : $hours Hours $minutes Minutes $seconds Seconds
echo =========================================================
fi
if [ ${SLURM_NTASKS} -eq 1 ]; then
rm -f $fname
fi


@@ -1,14 +0,0 @@
nthreads=20
memory=40
queue="berg"
env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python"
file="run_crosspk.py"
grid=1024
halfwidth=0.13
cm="addqueue -q $queue -n $nthreads -m $memory $env $file --grid $grid --halfwidth $halfwidth"
echo "Submitting:"
echo $cm
echo
$cm


@@ -1,14 +0,0 @@
nthreads=10
memory=32
queue="berg"
env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python"
file="run_fieldprop.py"
# grid=1024
# halfwidth=0.1
cm="addqueue -q $queue -n $nthreads -m $memory $env $file"
echo "Submitting:"
echo $cm
echo
$cm


@@ -1,12 +0,0 @@
nthreads=100
memory=3
queue="berg"
env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python"
file="run_fit_halos.py"
cm="addqueue -q $queue -n $nthreads -m $memory $env $file"
echo "Submitting:"
echo $cm
echo
$cm


@@ -1,14 +0,0 @@
nthreads=15 # There isn't too much benefit going to too many CPUs...
memory=32
queue="berg"
env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python"
file="run_initmatch.py"
dump_clumps="false"
cm="addqueue -q $queue -n $nthreads -m $memory $env $file --dump_clumps $dump_clumps"
echo "Submitting:"
echo $cm
echo
$cm


@@ -1,23 +0,0 @@
nthreads=151
memory=4
queue="cmb"
env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python"
file="run_knn.py"
rmin=0.01
rmax=100
nneighbours=8
nsamples=100000000
batch_size=1000000
neval=10000
pythoncm="$env $file --rmin $rmin --rmax $rmax --nneighbours $nneighbours --nsamples $nsamples --batch_size $batch_size --neval $neval"
# echo $pythoncm
# $pythoncm
cm="addqueue -q $queue -n $nthreads -m $memory $pythoncm"
echo "Submitting:"
echo $cm
echo
$cm


@@ -1,36 +0,0 @@
#!/bin/bash
# nthreads=1
memory=16
queue="berg"
env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python"
file="run_singlematch.py"
nmult=1.
sigma=1.
sims=(7468 7588 8020 8452 8836)
nsims=${#sims[@]}
for i in $(seq 0 $((nsims-1))); do
for j in $(seq 0 $((nsims-1))); do
if [ $i -eq $j ]; then
continue
elif [ $i -gt $j ]; then
continue
else
:
fi
nsim0=${sims[$i]}
nsimx=${sims[$j]}
pythoncm="$env $file --nsim0 $nsim0 --nsimx $nsimx --nmult $nmult --sigma $sigma"
cm="addqueue -q $queue -n 1x1 -m $memory $pythoncm"
echo "Submitting:"
echo $cm
echo
$cm
sleep 0.05
done; done


@@ -1,12 +0,0 @@
nthreads=1
memory=30
queue="cmb"
env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python"
file="run_split_halos.py"
cm="addqueue -q $queue -n $nthreads -m $memory $env $file"
echo "Submitting:"
echo $cm
echo
$cm