kNN-CDF implementation (#34)

* Rewrite doc * add kNN * edit loading of samples with no init * Add verbosity flag * add KNN submission script * do not make peaked cdf by default * Add submit script * stop ignore sh * Add mass thresholding * Edit gitignore * edits * Space points in logspace * Calculate for all ICs * Update TODO * Add dtype support * Update readme * Update nb
2025-06-08 18:01:11 +00:00 · 2023-03-31 18:13:41 +01:00 · 2023-03-31 18:13:41 +01:00 · 63ab3548b4
commit 63ab3548b4
parent 4d7827006a
17 changed files with 1248 additions and 29 deletions
--- a/.gitignore
+++ b/.gitignore
@ -9,10 +9,10 @@ plots/*
 csiborgtools/fits/_halo_profile.py
 csiborgtools/fits/_filenames.py
 csiborgtools/fits/analyse_voids_25.py
-scripts/*.sh
 scripts/*.out
 build/*
 .eggs/*
 csiborgtools.egg-info/*
 Pylians3/*
 scripts/plot_correlation.ipynb
+scripts/python.sh
--- a/README.md
+++ b/README.md
@ -1,24 +1,16 @@
-# CSiBORGTools
+# CSiBORG Analysis


-### Questions
- How well can observed clusters be matched to CSiBORG? Do their masses agree?
- Is the number of clusters in CSiBORG consistent?
+##  Project Overlap
+- [ ] Calculate the overlap between all 101 IC realisations on DiRAC.

-## CSiBORG Galaxy Environmental Dependence

-### TODO
+
+## Project Clustering
+- [ ] Add uncertainty to the kNN-CDF autocorrelation.
+- [ ] Add the joint kNN-CDF calculation.
+- [ ] Make kNN-CDF more memory friendly if generating many randoms.
+
+
+## Project Environmental Dependence
 - [ ] Add gradient and Hessian of the overdensity field.
-
-
-### Questions
- Environmental dependence of:
-  - $M_*$, colour and SFR.
-  - Galaxy alignment.
-  - HI content.
-
- Fields to calculate:
-    1. Overdensity field $\delta$
-    2. Gradient and Hessian of $\delta$
-    3. Gravitational field $\Phi$
-    4. Gradient and Hessian of $\Phi$
--- a/csiborgtools/match/__init__.py
+++ b/csiborgtools/match/__init__.py
@ -18,4 +18,5 @@ from .match import (brute_spatial_separation, RealisationsMatcher, cosine_simila
                    calculate_overlap, calculate_overlap_indxs,  # noqa
                    dist_centmass, dist_percentile)  # noqa
 from .num_density import (binned_counts, number_density)  # noqa
+from .knn import kNN_CDF
 # from .correlation import (get_randoms_sphere, sphere_angular_tpcf) # noqa
--- a/csiborgtools/match/knn.py
+++ b/csiborgtools/match/knn.py
@ -0,0 +1,181 @@
+# Copyright (C) 2022 Richard Stiskalek
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
+# Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+"""
+kNN-CDF calculation
+"""
+from gc import collect
+import numpy
+from scipy.interpolate import interp1d
+from tqdm import tqdm
+
+
+class kNN_CDF:
+    """
+    Object to calculate the kNN-CDF for a set of CSiBORG halo catalogues from
+    their kNN objects.
+
+    The instance holds no state: calling it draws random points in a sphere,
+    queries every kNN object for the distances from these points to their
+    nearest neighbours and turns the distances into empirical CDFs.
+    """
+    @staticmethod
+    def rvs_in_sphere(nsamples, R, random_state=42, dtype=numpy.float32):
+        """
+        Generate random samples distributed uniformly in a sphere of radius
+        `R` centered at the origin.
+
+        Parameters
+        ----------
+        nsamples : int
+            Number of samples to generate.
+        R : float
+            Radius of the sphere.
+        random_state : int, optional
+            Random state for the random number generator.
+        dtype : numpy dtype, optional
+            Data type, by default `numpy.float32`.
+
+        Returns
+        -------
+        samples : 2-dimensional array of shape `(nsamples, 3)`
+        """
+        gen = numpy.random.default_rng(random_state)
+        # Draw the uniform deviates in the order radius, polar angle, azimuth.
+        u_r = gen.uniform(0, 1, nsamples).astype(dtype)
+        u_theta = gen.uniform(0, 1, nsamples).astype(dtype)
+        u_phi = gen.uniform(0, 1, nsamples).astype(dtype)
+
+        # Uniform in volume: p(r) ~ r^2, hence r = R * u^(1/3).
+        r = u_r**(1 / 3) * R
+        # Isotropy requires cos(theta) to be uniform in [-1, 1], i.e.
+        # p(theta) ~ sin(theta). `2 * arcsin(u)` does not satisfy this and
+        # concentrates the points around the positive z-axis.
+        theta = numpy.arccos(1 - 2 * u_theta)
+        phi = 2 * numpy.pi * u_phi
+
+        # Convert to cartesian coordinates
+        sin_theta = numpy.sin(theta)
+        x = r * sin_theta * numpy.cos(phi)
+        y = r * sin_theta * numpy.sin(phi)
+        z = r * numpy.cos(theta)
+
+        return numpy.vstack([x, y, z]).T.astype(dtype, copy=False)
+
+    @staticmethod
+    def cdf_from_samples(r, rmin=None, rmax=None, neval=None,
+                         dtype=numpy.float32):
+        """
+        Calculate the empirical CDF from samples.
+
+        The CDF is normalised by the number of samples that survive the
+        `rmin` and `rmax` cuts, not by the size of the input.
+
+        Parameters
+        ----------
+        r : 1-dimensional array
+            Distance samples. The input is not modified.
+        rmin : float, optional
+            Minimum distance to evaluate the CDF. Smaller samples are
+            discarded. If `None` and `neval` is given, the smallest sample is
+            used as the lower edge of the evaluation grid.
+        rmax : float, optional
+            Maximum distance to evaluate the CDF. Larger samples are
+            discarded. If `None` and `neval` is given, the largest sample is
+            used as the upper edge of the evaluation grid.
+        neval : int, optional
+            Number of logarithmically spaced points at which to interpolate
+            the CDF. By default no interpolation is done and the CDF is
+            returned at the sorted samples. The grid edges must be positive.
+        dtype : numpy dtype, optional
+            Data type of the interpolated output. By default `numpy.float32`.
+
+        Returns
+        -------
+        r : 1-dimensional array
+            Distances at which the CDF is evaluated.
+        cdf : 1-dimensional array
+            CDF evaluated at `r`. Grid points outside of the range spanned by
+            the samples are set to NaN.
+
+        Raises
+        ------
+        ValueError
+            If `neval` is given, no sample survives the cuts and the grid
+            edges cannot be taken from `rmin` and `rmax`.
+        """
+        # The boolean cuts and the sort below all return new arrays, so the
+        # caller's array is never overwritten and no explicit copy is needed.
+        r = numpy.asarray(r)
+        # Make cuts on distance
+        if rmin is not None:
+            r = r[r >= rmin]
+        if rmax is not None:
+            r = r[r <= rmax]
+
+        # Calculate the CDF
+        r = numpy.sort(r)
+        cdf = numpy.arange(r.size) / r.size
+
+        if neval is None:
+            return r, cdf
+
+        # Optionally interpolate at logarithmically spaced points
+        if r.size == 0 and (rmin is None or rmax is None):
+            raise ValueError("No samples left to infer the evaluation range; "
+                             "specify both `rmin` and `rmax`.")
+        lower = r[0] if rmin is None else rmin
+        upper = r[-1] if rmax is None else rmax
+        grid = numpy.logspace(numpy.log10(lower), numpy.log10(upper), neval,
+                              dtype=dtype)
+
+        if r.size == 0:
+            # Nothing to interpolate; stay consistent with the NaN fill value
+            # used outside of the sampled range.
+            return grid, numpy.full(neval, numpy.nan, dtype=dtype)
+
+        interpolator = interp1d(r, cdf, kind="linear", fill_value=numpy.nan,
+                                bounds_error=False)
+        return grid, interpolator(grid).astype(dtype)
+
+    @staticmethod
+    def peaked_cdf(cdf, make_copy=True):
+        """
+        Transform the CDF to a peaked CDF, i.e. replace every value above 0.5
+        with one minus that value.
+
+        Parameters
+        ----------
+        cdf : 1- or 2- or 3-dimensional array
+            CDF to be transformed element-wise.
+        make_copy : bool, optional
+            Whether to make a copy of the CDF before transforming it to avoid
+            overwriting it.
+
+        Returns
+        -------
+        peaked_cdf : 1- or 2- or 3-dimensional array
+        """
+        cdf = numpy.copy(cdf) if make_copy else cdf
+        upper_half = cdf > 0.5  # NaNs compare as False and are left as is
+        cdf[upper_half] = 1 - cdf[upper_half]
+        return cdf
+
+    def __call__(self, *knns, nneighbours, Rmax, nsamples, rmin, rmax, neval,
+                 verbose=True, random_state=42, dtype=numpy.float32):
+        """
+        Calculate the CDF for a set of kNNs of CSiBORG halo catalogues.
+
+        Parameters
+        ----------
+        *knns : `sklearn.neighbors.NearestNeighbors` instances
+            kNNs of CSiBORG halo catalogues.
+        nneighbours : int
+            Maximum number of neighbours to use for the kNN-CDF calculation.
+        Rmax : float
+            Maximum radius of the sphere in which to sample random points for
+            the knn-CDF calculation. This should match the CSiBORG catalogues.
+        nsamples : int
+            Number of random points to sample for the knn-CDF calculation.
+        rmin : float
+            Minimum distance to evaluate the CDF.
+        rmax : float
+            Maximum distance to evaluate the CDF.
+        neval : int
+            Number of points to evaluate the CDF. All CDFs are interpolated
+            onto the same grid so that they can be stacked.
+        verbose : bool, optional
+            Verbosity flag.
+        random_state : int, optional
+            Random state for the random number generator.
+        dtype : numpy dtype, optional
+            Calculation data type. By default `numpy.float32`.
+
+        Returns
+        -------
+        rs : 1-dimensional array
+            Distances at which the CDF is evaluated.
+        cdfs : 2 or 3-dimensional array
+            CDFs evaluated at `rs`. The shape is `(nneighbours, neval)` if a
+            single kNN object is given and `(len(knns), nneighbours, neval)`
+            otherwise.
+
+        Raises
+        ------
+        ValueError
+            If no kNN object is given or if `nneighbours` is less than one.
+        """
+        if len(knns) == 0:
+            raise ValueError("At least one kNN object must be provided.")
+        if nneighbours < 1:
+            raise ValueError("`nneighbours` must be at least 1.")
+
+        # The same random points are used to probe every catalogue.
+        rand = self.rvs_in_sphere(nsamples, Rmax, random_state=random_state,
+                                  dtype=dtype)
+
+        cdfs = [None] * len(knns)
+        for i, knn in enumerate(tqdm(knns) if verbose else knns):
+            dist, _indxs = knn.kneighbors(rand, nneighbours)
+            dist = dist.astype(dtype)
+            # The neighbour indices are not needed, free them straight away.
+            del _indxs
+            collect()
+
+            cdf = [None] * nneighbours
+            for j in range(nneighbours):
+                rs, cdf[j] = self.cdf_from_samples(
+                    dist[:, j], rmin=rmin, rmax=rmax, neval=neval,
+                    dtype=dtype)
+            cdfs[i] = cdf
+
+        cdfs = numpy.asanyarray(cdfs)
+        cdfs = cdfs[0, ...] if len(knns) == 1 else cdfs
+        return rs, cdfs
--- a/csiborgtools/read/make_cat.py
+++ b/csiborgtools/read/make_cat.py
@ -35,20 +35,22 @@ class HaloCatalogue:
        The minimum :math:`M_{rm tot} / M_\odot` mass. By default no threshold.
    max_dist : float, optional
        The maximum comoving distance of a halo. By default no upper limit.
+    load_init : bool, optional
+        Whether to load the initial snapshot information. By default False.
    """
    _box = None
    _paths = None
    _data = None
    _selmask = None

-    def __init__(self, nsim, min_mass=None, max_dist=None):
+    def __init__(self, nsim, min_mass=None, max_dist=None, load_init=False):
        # Set up paths
        paths = CSiBORGPaths(n_sim=nsim)
        paths.n_snap = paths.get_maximum_snapshot()
        self._paths = paths
        self._box = BoxUnits(paths)
        self._paths = paths
-        self._set_data(min_mass, max_dist)
+        self._set_data(min_mass, max_dist, load_init)

    @property
    def data(self):
@ -109,7 +111,7 @@ class HaloCatalogue:

    def knn(self, select_initial):
        """
-        The final snapshot k-nearest neighbour object.
+        kNN object of all halo positions.

        Parameters
        ----------
@ -123,7 +125,7 @@ class HaloCatalogue:
        knn = NearestNeighbors()
        return knn.fit(self.positions0 if select_initial else self.positions)

-    def _set_data(self, min_mass, max_dist):
+    def _set_data(self, min_mass, max_dist, load_init):
        """
        Loads the data, merges with mmain, does various coordinate transforms.
        """
@ -141,10 +143,11 @@ class HaloCatalogue:
        data = data[(data["npart"] > 100) & numpy.isfinite(data["m200"])]

        # Now also load the initial positions
-        initcm = read_initcm(self.n_sim, self.paths.initmatch_path)
-        if initcm is not None:
-            data = self.merge_initmatch_to_clumps(data, initcm)
-            flip_cols(data, "x0", "z0")
+        if load_init:
+            initcm = read_initcm(self.n_sim, self.paths.initmatch_path)
+            if initcm is not None:
+                data = self.merge_initmatch_to_clumps(data, initcm)
+                flip_cols(data, "x0", "z0")

 #        # Calculate redshift
 #        pos = [data["peak_{}".format(p)] - 0.5 for p in ("x", "y", "z")]
@ -168,7 +171,7 @@ class HaloCatalogue:
        data = add_columns(data, [d, ra, dec], ["dist", "ra", "dec"])

        # And do the unit transform
-        if initcm is not None:
+        if load_init and initcm is not None:
            data = self.box.convert_from_boxunits(
                data, ["x0", "y0", "z0", "lagpatch"])

--- a/notebooks/knn.ipynb
+++ b/notebooks/knn.ipynb
@ -0,0 +1,738 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "5a38ed25",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-03-31T17:09:12.165480Z",
+     "start_time": "2023-03-31T17:09:12.116708Z"
+    },
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The autoreload extension is already loaded. To reload it, use:\n",
+      "  %reload_ext autoreload\n"
+     ]
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "import matplotlib\n",
+    "import matplotlib.pyplot as plt\n",
+    "from sklearn.neighbors import NearestNeighbors\n",
+    "import joblib\n",
+    "from tqdm import tqdm\n",
+    "try:\n",
+    "    import csiborgtools\n",
+    "except ModuleNotFoundError:\n",
+    "    print(\"not found\")\n",
+    "    import sys\n",
+    "    sys.path.append(\"../\")\n",
+    "    import csiborgtools\n",
+    "\n",
+    "\n",
+    "%matplotlib notebook\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "4218b673",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-03-31T17:09:13.943312Z",
+     "start_time": "2023-03-31T17:09:12.167027Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "cat = csiborgtools.read.HaloCatalogue(7444, min_mass=1e13, max_dist=155 / 0.705)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "5ff7a1b6",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-03-31T17:10:18.303240Z",
+     "start_time": "2023-03-31T17:10:14.674751Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\r",
+      "  0%|          | 0/1 [00:00<?, ?it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "float32\n",
+      "float32\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 1/1 [00:03<00:00,  3.37s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "float32\n",
+      "float32\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "knn = NearestNeighbors()\n",
+    "knn.fit(cat.positions)\n",
+    "\n",
+    "knncdf = csiborgtools.match.kNN_CDF()\n",
+    "\n",
+    "rs, cdfs_high = knncdf(knn, nneighbours=3, Rmax=155 / 0.705, rmin=0.05, rmax=40,\n",
+    "                  nsamples=int(1e6), neval=int(1e4), random_state=42)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "08321431",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "58806ab9",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c59b3a19",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e345945c",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-03-31T09:35:49.059172Z",
+     "start_time": "2023-03-31T09:35:42.817291Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "m1 = (rs > 1) & (rs < 35)\n",
+    "\n",
+    "fig, axs = plt.subplots(ncols=3, figsize=(6.4 * 1.5, 4.8), sharey=True)\n",
+    "fig.subplots_adjust(wspace=0)\n",
+    "for k in range(3):\n",
+    "    for n in range(len(ics)):\n",
+    "        m = m1 & (cdfs[n, k, :] > 1e-3)\n",
+    "        axs[k].plot(rs[m], cdfs[n, k, m], c=\"black\", lw=0.05)\n",
+    "\n",
+    "    axs[k].set_xscale(\"log\")\n",
+    "    axs[k].set_yscale(\"log\")\n",
+    "    axs[k].set_title(r\"$k = {}$\".format(k))\n",
+    "    axs[k].set_xlabel(r\"$r~\\left[\\mathrm{Mpc}\\right]$\")\n",
+    "\n",
+    "axs[0].set_ylabel(r\"Peaked CDF\")\n",
+    "\n",
+    "plt.tight_layout(w_pad=0)\n",
+    "fig.savefig(\"../plots/peaked_cdf.png\", dpi=450)\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9f8786c0",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-03-31T09:50:10.103650Z",
+     "start_time": "2023-03-31T09:50:02.221741Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "m = (rs > 0.5) & (rs < 35)\n",
+    "\n",
+    "fig, axs = plt.subplots(ncols=3, figsize=(6.4 * 1.5, 4.8), sharey=True)\n",
+    "fig.subplots_adjust(wspace=0)\n",
+    "for k in range(3):\n",
+    "    mu = np.nanmean(cdfs[:, k, :], axis=0)\n",
+    "\n",
+    "    for n in range(len(ics)):\n",
+    "        axs[k].plot(rs[m], (cdfs[n, k, :] / mu)[m], c=\"black\", lw=0.1)\n",
+    "\n",
+    "    axs[k].set_ylim(0.5, 1.5)\n",
+    "    axs[k].axhline(1, ls=\"--\", c=\"red\", zorder=0)\n",
+    "    axs[k].axvline(2.65 / 0.705, ls=\"--\", c=\"red\", zorder=0)\n",
+    "    axs[k].set_xscale(\"log\")\n",
+    "    axs[k].set_xlabel(r\"$r~\\left[\\mathrm{Mpc}\\right]$\")\n",
+    "    axs[k].set_title(r\"$k = {}$\".format(k))\n",
+    "    \n",
+    "axs[0].set_ylabel(r\"Relative peaked CDF\")\n",
+    "plt.tight_layout(w_pad=0)\n",
+    "fig.savefig(\"../plots/peaked_cdf_ratios.png\", dpi=450)\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2f64cec1",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-03-30T15:46:31.532259Z",
+     "start_time": "2023-03-30T15:46:30.977449Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "plt.figure()\n",
+    "k = 2\n",
+    "mu = np.nanmean(cdfs[:, k, :], axis=0)\n",
+    "# plt.plot(rs, mu, c=\"black\")\n",
+    "for i in range(len(ics)):\n",
+    "    plt.plot(rs, cdfs[i, k, :] / mu)\n",
+    "\n",
+    "\n",
+    "plt.ylim(0.75, 1.25)\n",
+    "plt.axhline(1, ls=\"--\", c=\"black\")\n",
+    "plt.xscale(\"log\")\n",
+    "# plt.yscale(\"log\")\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a6784766",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b416efb3",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e650fe2c",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1311187d",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "03e49a11",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-03-30T14:58:29.937514Z",
+     "start_time": "2023-03-30T14:58:29.530552Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "x.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "24578cba",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b0024bbf",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6dc55410",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-03-30T14:41:24.290602Z",
+     "start_time": "2023-03-30T14:41:16.204679Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "dist0, __ = knn0.kneighbors(X, 3)\n",
+    "distx, __ = knnx.kneighbors(X, 3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "11508c3c",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-03-30T14:41:24.560538Z",
+     "start_time": "2023-03-30T14:41:24.292674Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "x0, y0 = knncdf.peaked_cdf_from_samples(dist0[:, 0], 0.5, 20, neval=10000)\n",
+    "xx, yx = knncdf.peaked_cdf_from_samples(distx[:, 0], 0.5, 20, neval=10000)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "404501ad",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-03-30T14:41:24.598933Z",
+     "start_time": "2023-03-30T14:41:24.562062Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "distx[:, 0].min()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "43e08969",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-03-30T14:46:10.262865Z",
+     "start_time": "2023-03-30T14:46:09.486658Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "plt.figure()\n",
+    "plt.plot(x0, y0)\n",
+    "plt.plot(xx, yx)\n",
+    "\n",
+    "plt.yscale(\"log\")\n",
+    "plt.xscale(\"log\")\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "39547a75",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9e160b38",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-03-30T13:02:02.033125Z",
+     "start_time": "2023-03-30T13:02:00.674878Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "plt.figure()\n",
+    "\n",
+    "for i in range(3):\n",
+    "    plt.plot(*knncdf.cdf_from_samples(dist0[:, i], 1, 25))\n",
+    "    plt.plot(*knncdf.cdf_from_samples(distx[:, i], 1, 25))\n",
+    "\n",
+    "# plt.xlim(0.5, 25)\n",
+    "\n",
+    "plt.yscale(\"log\")\n",
+    "plt.xscale(\"log\")\n",
+    "plt.xlabel(r\"$r~\\left[\\mathrm{Mpc}\\right]$\")\n",
+    "\n",
+    "\n",
+    "\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4bfb65d8",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4703d81c",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-03-30T12:13:35.958444Z",
+     "start_time": "2023-03-30T12:13:35.924241Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "x = dist[:, 0]\n",
+    "q = np.linspace(0, 100, int(x.size / 5))\n",
+    "\n",
+    "p = np.percentile(x, q)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b054c6df",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-03-30T12:16:50.052225Z",
+     "start_time": "2023-03-30T12:16:50.020395Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "y = np.sort(x)\n",
+    "\n",
+    "yy = np.arange(y.size) / y.size"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5445c964",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-03-30T12:16:53.599925Z",
+     "start_time": "2023-03-30T12:16:53.521266Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "plt.figure()\n",
+    "plt.plot(p, q / 100)\n",
+    "\n",
+    "plt.plot(y, yy)\n",
+    "\n",
+    "# plt.yscale(\"log\")\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "87fe5874",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fb0ad6b9",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-03-30T12:03:34.387625Z",
+     "start_time": "2023-03-30T12:03:34.290961Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "plt.figure()\n",
+    "plt.hist(dist[:, 0], bins=\"auto\", histtype=\"step\")\n",
+    "plt.hist(dist[:, 1], bins=\"auto\", histtype=\"step\")\n",
+    "plt.hist(dist[:, 2], bins=\"auto\", histtype=\"step\")\n",
+    "\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c2aba833",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6f70f238",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "03bcb191",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-03-30T11:38:04.906150Z",
+     "start_time": "2023-03-30T11:38:04.758107Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "plt.figure()\n",
+    "plt.hist(cat0[\"dec\"], bins=\"auto\")\n",
+    "\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e5ad4722",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-03-30T11:53:23.004853Z",
+     "start_time": "2023-03-30T11:53:22.971967Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "gen = np.random.default_rng(22)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "785b530a",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-03-30T11:53:23.330397Z",
+     "start_time": "2023-03-30T11:53:23.296612Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "gen.normal()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b3d3b5e6",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "464b606d",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-03-30T11:36:13.649124Z",
+     "start_time": "2023-03-30T11:36:12.995693Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "theta = np.linspace( t, np.pi, 100)\n",
+    "\n",
+    "plt.figure()\n",
+    "plt.plot(theta, np.sin(theta))\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c29049f5",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cd2a3295",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "af9abf04",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-03-30T11:10:11.104389Z",
+     "start_time": "2023-03-30T11:10:11.070499Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "X = np.array([-3.9514747, -0.6966991,  2.97158]).reshape(1, -1)\n",
+    "\n",
+    "X"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e181b3c3",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-03-30T11:32:17.840355Z",
+     "start_time": "2023-03-30T11:32:17.351883Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "dist, indxs = knn0.kneighbors(X, n_neighbors=1)\n",
+    "\n",
+    "dist, indxs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d38fd960",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-03-30T11:10:18.182326Z",
+     "start_time": "2023-03-30T11:10:18.145629Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "cat0.positions[indxs]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a16ddc2f",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bbbe8fb6",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "759a0149",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "312c96c9",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b097637b",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2ced23cb",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "be26cbcc",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "venv_galomatch",
+   "language": "python",
+   "name": "venv_galomatch"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.0"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "f29d02a8350410abc2a9fb79641689d10bf7ab64afc03ec87ca3cf6ed2daa499"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/scripts/python.sh
+++ b/scripts/python.sh
@ -0,0 +1,46 @@
+#!/bin/bash -l
+# SLURM job script: launches `run_knn.py` on 5 MPI tasks through `srun` and
+# prints a banner with the total run time once the job has finished.
+echo =========================================================
+echo Job submitted  date = Fri Mar 31 16:17:57 BST 2023
+date_start=`date +%s`
+echo $SLURM_JOB_NUM_NODES nodes \( $SMP processes per node \)
+echo $SLURM_JOB_NUM_NODES hosts used: $SLURM_JOB_NODELIST
+echo Job output begins
+echo -----------------
+echo
+#hostname
+
+# Need to set the max locked memory very high otherwise IB can't allocate enough and fails with "UCX  ERROR Failed to allocate memory pool chunk: Input/output error"
+ulimit -l unlimited
+
+# To allow mvapich to run ok
+export MV2_SMP_USE_CMA=0
+
+#which mpirun
+# Restrict each MPI task to a single OpenMP thread. The variable name was
+# previously misspelled (`OMP_NUM_THEADS`) and therefore had no effect.
+export OMP_NUM_THREADS=1
+ /usr/local/shared/slurm/bin/srun -u -n 5 --mpi=pmi2 --mem-per-cpu=7168 nice -n 10 /mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python run_knn.py --rmin 0.05 --rmax 50 --nsamples 100000 --neval 10000
+# If we've been checkpointed
+#if [ -n "${DMTCP_CHECKPOINT_DIR}" ]; then
+  if [ -d "${DMTCP_CHECKPOINT_DIR}" ]; then
+#    echo -n "Job was checkpointed at "
+#    date
+#    echo 
+     sleep 1
+#  fi
+   echo -n
+else
+  # No checkpoint directory: report the elapsed wall-clock time.
+  echo ---------------
+  echo Job output ends
+  date_end=`date +%s`
+  seconds=$((date_end-date_start))
+  minutes=$((seconds/60))
+  seconds=$((seconds-60*minutes))
+  hours=$((minutes/60))
+  minutes=$((minutes-60*hours))
+  echo =========================================================
+  echo PBS job: finished   date = `date`
+  echo Total run time : $hours Hours $minutes Minutes $seconds Seconds
+  echo =========================================================
+fi
+# NOTE(review): `fname` is not set anywhere in this script; presumably it is
+# provided by the submitting wrapper - confirm.
+if [ ${SLURM_NTASKS} -eq 1 ]; then
+  rm -f $fname
+fi
--- a/scripts/run_asciipos.sh
+++ b/scripts/run_asciipos.sh
@ -0,0 +1,13 @@
+# Submit `run_asciipos.py` in mode "dump" to the "berg" queue via `addqueue`.
+# `nthreads` and `memory` are passed to `addqueue` as `-n` and `-m`.
+nthreads=1
+memory=75
+queue="berg"
+env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python"
+file="run_asciipos.py"
+mode="dump"
+
+cm="addqueue -q $queue -n $nthreads -m $memory $env $file --mode $mode"
+
+# Print the command and run it. `$cm` is unquoted, so the shell word-splits
+# it into the command and its arguments.
+echo "Submitting:"
+echo $cm
+echo
+$cm
--- a/scripts/run_crossmatch.sh
+++ b/scripts/run_crossmatch.sh
@ -0,0 +1,17 @@
+# Submit `run_crossmatch.py` to the "berg" queue via `addqueue`.
+# `nthreads` and `memory` are passed to `addqueue` as `-n` and `-m`.
+nthreads=1
+memory=32
+queue="berg"
+env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python"
+file="run_crossmatch.py"
+
+pythoncm="$env $file"
+# Disabled alternative: run the Python command directly, without the queue.
+# echo "Submitting:"
+# echo $pythoncm
+# echo
+# $pythoncm
+
+# Print the command and run it. `$cm` is unquoted, so the shell word-splits
+# it into the command and its arguments.
+cm="addqueue -q $queue -n $nthreads -m $memory $pythoncm"
+echo "Submitting:"
+echo $cm
+echo
+$cm
--- a/scripts/run_crosspk.sh
+++ b/scripts/run_crosspk.sh
@ -0,0 +1,14 @@
+# Submit `run_crosspk.py` to the "berg" queue via `addqueue`, forwarding the
+# `--grid` and `--halfwidth` options to the Python script.
+# `nthreads` and `memory` are passed to `addqueue` as `-n` and `-m`.
+nthreads=20
+memory=40
+queue="berg"
+env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python"
+file="run_crosspk.py"
+grid=1024
+halfwidth=0.13
+
+cm="addqueue -q $queue -n $nthreads -m $memory $env $file --grid $grid --halfwidth $halfwidth"
+
+# Print the command and run it. `$cm` is unquoted, so the shell word-splits
+# it into the command and its arguments.
+echo "Submitting:"
+echo $cm
+echo
+$cm
--- a/scripts/run_fieldprop.sh
+++ b/scripts/run_fieldprop.sh
@ -0,0 +1,14 @@
+# Submit `run_fieldprop.py` to the "berg" queue via `addqueue`.
+# `nthreads` and `memory` are passed to `addqueue` as `-n` and `-m`.
+nthreads=10
+memory=32
+queue="berg"
+env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python"
+file="run_fieldprop.py"
+# Currently unused options, they are not forwarded to the Python script.
+# grid=1024
+# halfwidth=0.1
+
+cm="addqueue -q $queue -n $nthreads -m $memory $env $file"
+
+# Print the command and run it. `$cm` is unquoted, so the shell word-splits
+# it into the command and its arguments.
+echo "Submitting:"
+echo $cm
+echo
+$cm
--- a/scripts/run_fit_halos.sh
+++ b/scripts/run_fit_halos.sh
@ -0,0 +1,12 @@
+# Submit `run_fit_halos.py` to the "berg" queue via `addqueue`.
+# `nthreads` and `memory` are passed to `addqueue` as `-n` and `-m`.
+nthreads=100
+memory=3
+queue="berg"
+env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python"
+file="run_fit_halos.py"
+
+cm="addqueue -q $queue -n $nthreads -m $memory $env $file"
+
+# Print the command and run it. `$cm` is unquoted, so the shell word-splits
+# it into the command and its arguments.
+echo "Submitting:"
+echo $cm
+echo
+$cm
--- a/scripts/run_initmatch.sh
+++ b/scripts/run_initmatch.sh
@ -0,0 +1,14 @@
+nthreads=15  # There isn't too much benefit going to too many CPUs...
+memory=32
+queue="berg"
+env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python"
+file="run_initmatch.py"
+
+dump_clumps="false"
+
+cm="addqueue -q $queue -n $nthreads -m $memory $env $file --dump_clumps $dump_clumps"
+
+echo "Submitting:"
+echo $cm
+echo
+$cm
--- a/scripts/run_knn.py
+++ b/scripts/run_knn.py
@ -0,0 +1,104 @@
+# Copyright (C) 2022 Richard Stiskalek
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
+# Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+"""A script to calculate the KNN-CDF for a set of CSiBORG halo catalogues."""
+from os.path import join
+from argparse import ArgumentParser
+from copy import deepcopy
+from datetime import datetime
+from mpi4py import MPI
+from TaskmasterMPI import master_process, worker_process
+from sklearn.neighbors import NearestNeighbors
+import joblib
+try:
+    import csiborgtools
+except ModuleNotFoundError:
+    import sys
+    sys.path.append("../")
+    import csiborgtools
+
+
+###############################################################################
+#                            MPI and arguments                                #
+###############################################################################
+comm = MPI.COMM_WORLD
+rank = comm.Get_rank()
+nproc = comm.Get_size()
+
+parser = ArgumentParser()
+parser.add_argument("--rmin", type=float)
+parser.add_argument("--rmax", type=float)
+parser.add_argument("--nneighbours", type=int)
+parser.add_argument("--nsamples", type=int)
+parser.add_argument("--neval", type=int)
+parser.add_argument("--seed", type=int, default=42)
+args = parser.parse_args()
+
+Rmax = 155 / 0.705  # Mpc/h high resolution region radius
+mass_threshold = [1e12, 1e13, 1e14]  # Msun
+ics = [7444, 7468, 7492, 7516, 7540, 7564, 7588, 7612, 7636, 7660, 7684,
+       7708, 7732, 7756, 7780, 7804, 7828, 7852, 7876, 7900, 7924, 7948,
+       7972, 7996, 8020, 8044, 8068, 8092, 8116, 8140, 8164, 8188, 8212,
+       8236, 8260, 8284, 8308, 8332, 8356, 8380, 8404, 8428, 8452, 8476,
+       8500, 8524, 8548, 8572, 8596, 8620, 8644, 8668, 8692, 8716, 8740,
+       8764, 8788, 8812, 8836, 8860, 8884, 8908, 8932, 8956, 8980, 9004,
+       9028, 9052, 9076, 9100, 9124, 9148, 9172, 9196, 9220, 9244, 9268,
+       9292, 9316, 9340, 9364, 9388, 9412, 9436, 9460, 9484, 9508, 9532,
+       9556, 9580, 9604, 9628, 9652, 9676, 9700, 9724, 9748, 9772, 9796,
+       9820, 9844]
+dumpdir = "/mnt/extraspace/rstiskalek/csiborg/knn"
+fout = join(dumpdir, "knncdf_{}.p")
+
+
+###############################################################################
+#                               Analysis                                      #
+###############################################################################
+knncdf = csiborgtools.match.kNN_CDF()
+
+
+def do_task(ic):
+    out = {}
+    cat = csiborgtools.read.HaloCatalogue(ic, max_dist=Rmax)
+
+    for i, mmin in enumerate(mass_threshold):
+        knn = NearestNeighbors()
+        knn.fit(cat.positions[cat["totpartmass"] > mmin, ...])
+
+        rs, cdf = knncdf(knn, nneighbours=args.nneighbours, Rmax=Rmax,
+                         rmin=args.rmin, rmax=args.rmax, nsamples=args.nsamples,
+                         neval=args.neval, random_state=args.seed,
+                         verbose=False)
+        out.update({"cdf_{}".format(i): cdf})
+
+    out.update({"rs": rs, "mass_threshold": mass_threshold})
+    joblib.dump(out, fout.format(ic))
+
+
+if nproc > 1:
+    if rank == 0:
+        tasks = deepcopy(ics)
+        master_process(tasks, comm, verbose=True)
+    else:
+        worker_process(do_task, comm, verbose=False)
+else:
+    tasks = deepcopy(ics)
+    for task in tasks:
+        print("{}: completing task `{}`.".format(datetime.now(), task))
+        do_task(task)
+
+
+comm.Barrier()
+if rank == 0:
+    print("{}: all finished.".format(datetime.now()))
+quit()  # Force quit the script
--- a/scripts/run_knn.sh
+++ b/scripts/run_knn.sh
@ -0,0 +1,22 @@
+nthreads=140
+memory=7
+queue="berg"
+env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python"
+file="run_knn.py"
+
+rmin=0.01
+rmax=100
+nneighbours=16
+nsamples=10000000
+neval=10000
+
+pythoncm="$env $file --rmin $rmin --rmax $rmax --nneighbours $nneighbours --nsamples $nsamples --neval $neval"
+
+# echo $pythoncm
+# $pythoncm
+
+cm="addqueue -q $queue -n $nthreads -m $memory $pythoncm"
+echo "Submitting:"
+echo $cm
+echo
+$cm
--- a/scripts/run_singlematch.sh
+++ b/scripts/run_singlematch.sh
@ -0,0 +1,36 @@
+#!/bin/bash
+# nthreads=1
+memory=16
+queue="berg"
+env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python"
+file="run_singlematch.py"
+
+nmult=1.
+sigma=1.
+
+sims=(7468 7588 8020 8452 8836)
+nsims=${#sims[@]}
+
+for i in $(seq 0 $((nsims-1))); do
+for j in $(seq 0 $((nsims-1))); do
+if [ $i -eq $j ]; then
+    continue
+elif [ $i -gt $j ]; then
+    continue
+else
+    :
+fi
+
+nsim0=${sims[$i]}
+nsimx=${sims[$j]}
+
+pythoncm="$env $file --nsim0 $nsim0 --nsimx $nsimx --nmult $nmult --sigma $sigma"
+
+cm="addqueue -q $queue -n 1x1 -m $memory $pythoncm"
+echo "Submitting:"
+echo $cm
+echo
+$cm
+sleep 0.05
+
+done; done
--- a/scripts/run_split_halos.sh
+++ b/scripts/run_split_halos.sh
@ -0,0 +1,12 @@
+#!/bin/bash
+# Submit `run_split_halos.py` through `addqueue`.
+
+# Values passed to `addqueue` as `-n`, `-m` and `-q`, respectively.
+nthreads=1
+memory=30
+queue="cmb"
+# Python interpreter and the script it runs.
+env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python"
+file="run_split_halos.py"
+
+cm="addqueue -q $queue -n $nthreads -m $memory $env $file"
+
+echo "Submitting:"
+# Quoted so that the command is printed verbatim, without glob expansion.
+echo "$cm"
+echo
+# Intentionally unquoted: the string has to be split into command words.
+$cm