kNN memory batching (#35)

* Add batch sizing to reduce memory usage

* Add batch size to submission

* Update nb

* Add brute KNN

* Remove unused variable

* Update nb
Richard Stiskalek 2023-04-01 07:57:21 +01:00 committed by GitHub
parent 63ab3548b4
commit 513872ceb6
4 changed files with 188 additions and 64 deletions


@@ -15,9 +15,9 @@
"""
kNN-CDF calculation
"""
from gc import collect
import numpy
from scipy.interpolate import interp1d
from scipy.stats import binned_statistic
from tqdm import tqdm
@@ -124,8 +124,58 @@ class kNN_CDF:
cdf[cdf > 0.5] = 1 - cdf[cdf > 0.5]
return cdf
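(For reference, the two context lines above fold the CDF about one half, returning min(P(r), 1 - P(r)); this appears to be the peaked-CDF helper that `knncdf.peaked_cdf` in the notebook below builds on.)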
def brute_cdf(self, knn, nneighbours, Rmax, nsamples, rmin, rmax, neval,
random_state=42, dtype=numpy.float32):
"""
Calculate the CDF for a kNN of CSiBORG halo catalogues without batch
sizing. This can become memory intensive for large numbers of randoms
and is therefore intended only for testing purposes.
Parameters
----------
knn : `sklearn.neighbors.NearestNeighbors`
kNN of CSiBORG halo catalogues.
nneighbours : int
Maximum number of neighbours to use for the kNN-CDF calculation.
Rmax : float
Maximum radius of the sphere in which to sample random points for
the kNN-CDF calculation. This should match the CSiBORG catalogues.
nsamples : int
Number of random points to sample for the kNN-CDF calculation.
rmin : float
Minimum distance to evaluate the CDF.
rmax : float
Maximum distance to evaluate the CDF.
neval : int
Number of points to evaluate the CDF.
random_state : int, optional
Random state for the random number generator.
dtype : numpy dtype, optional
Calculation data type. By default `numpy.float32`.
Returns
-------
rs : 1-dimensional array
Distances at which the CDF is evaluated.
cdfs : 2-dimensional array
CDFs evaluated at `rs`.
"""
rand = self.rvs_in_sphere(nsamples, Rmax, random_state=random_state)
dist, __ = knn.kneighbors(rand, nneighbours)
dist = dist.astype(dtype)
cdf = [None] * nneighbours
for j in range(nneighbours):
rs, cdf[j] = self.cdf_from_samples(dist[:, j], rmin=rmin,
rmax=rmax, neval=neval)
cdf = numpy.asanyarray(cdf)
return rs, cdf
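For reference, a minimal usage sketch of `brute_cdf` (not part of the commit; the positions, box radius and sample sizes below are illustrative placeholders):

import numpy
from sklearn.neighbors import NearestNeighbors
import csiborgtools

# Placeholder halo positions in box units; in practice these come from a
# CSiBORG halo catalogue.
pos = numpy.random.uniform(-155 / 0.705, 155 / 0.705, size=(10000, 3))
knn = NearestNeighbors().fit(pos)

knncdf = csiborgtools.match.kNN_CDF()
rs, cdf = knncdf.brute_cdf(knn, nneighbours=2, Rmax=155 / 0.705,
                           nsamples=int(1e5), rmin=0.01, rmax=100,
                           neval=int(1e4))
# `cdf` has one row per neighbour; keep `nsamples` modest here, since all
# random points and distances are held in memory at once.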
def __call__(self, *knns, nneighbours, Rmax, nsamples, rmin, rmax, neval,
verbose=True, random_state=42, dtype=numpy.float32):
batch_size=None, verbose=True, random_state=42,
left_nan=True, right_nan=True, dtype=numpy.float32):
"""
Calculate the CDF for a set of kNNs of CSiBORG halo catalogues.
@@ -146,10 +196,20 @@ class kNN_CDF:
Maximum distance to evaluate the CDF.
neval : int
Number of points to evaluate the CDF.
batch_size : int, optional
Number of random points to sample in each batch. By default equal
to `nsamples`, though a smaller value is recommended to avoid
requesting too much memory.
verbose : bool, optional
Verbosity flag.
random_state : int, optional
Random state for the random number generator.
left_nan : bool, optional
Whether to set values where the CDF is 0 to `numpy.nan`. By
default `True`.
right_nan : bool, optional
Whether to set values where the CDF is 1 to `numpy.nan` after its
first occurrence. By default `True`.
dtype : numpy dtype, optional
Calculation data type. By default `numpy.float32`.
@@ -160,22 +220,40 @@ class kNN_CDF:
cdfs : 2 or 3-dimensional array
CDFs evaluated at `rs`.
"""
rand = self.rvs_in_sphere(nsamples, Rmax, random_state=random_state)
batch_size = nsamples if batch_size is None else batch_size
assert nsamples >= batch_size
nbatches = nsamples // batch_size # Number of batches
cdfs = [None] * len(knns)
# Preallocate the bins and the CDF array
bins = numpy.logspace(numpy.log10(rmin), numpy.log10(rmax), neval)
cdfs = numpy.zeros((len(knns), nneighbours, neval - 1), dtype=dtype)
for i, knn in enumerate(tqdm(knns) if verbose else knns):
dist, _indxs = knn.kneighbors(rand, nneighbours)
dist = dist.astype(dtype)
del _indxs
collect()
# Loop over batches to avoid generating a large number of random
# points at once, which requires a lot of memory. Accumulate the
# counts into the CDF array.
for j in range(nbatches):
rand = self.rvs_in_sphere(batch_size, Rmax,
random_state=random_state + j)
dist, __ = knn.kneighbors(rand, nneighbours)
for k in range(nneighbours): # Count for each neighbour
_counts, __, __ = binned_statistic(
dist[:, k], dist[:, k], bins=bins, statistic="count",
range=(rmin, rmax))
cdfs[i, k, :] += _counts
rs = (bins[1:] + bins[:-1]) / 2 # Bin centers
cdfs = numpy.cumsum(cdfs, axis=-1) # Cumulative sum, i.e. the CDF
for i in range(len(knns)):
for k in range(nneighbours):
cdfs[i, k, :] /= cdfs[i, k, -1]
# Set to NaN values after the first point where the CDF is 1
if right_nan:
ns = numpy.where(cdfs[i, k, :] == 1.)[0]
if ns.size > 1:
cdfs[i, k, ns[1]:] = numpy.nan
cdf = [None] * nneighbours
for j in range(nneighbours):
rs, cdf[j] = self.cdf_from_samples(
dist[:, j], rmin=rmin, rmax=rmax, neval=neval)
cdfs[i] = cdf
# Set to NaN values where the CDF is 0
if left_nan:
cdfs[cdfs == 0] = numpy.nan
cdfs = numpy.asanyarray(cdfs)
cdfs = cdfs[0, ...] if len(knns) == 1 else cdfs
return rs, cdfs
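To make the memory saving from `batch_size` concrete, a rough back-of-the-envelope estimate (an illustrative sketch, not a measurement) using the values from the submission script below; the dominant allocation is the distance array returned by `knn.kneighbors`:

nsamples = int(1e9)      # as in the submission script
nneighbours = 16
batch_size = int(1e7)
bytes_per_value = 8      # assuming float64 distances from kneighbors

unbatched_gib = nsamples * nneighbours * bytes_per_value / 1024**3  # ~119 GiB at once
batched_gib = batch_size * nneighbours * bytes_per_value / 1024**3  # ~1.2 GiB per batch
print(f"unbatched: {unbatched_gib:.0f} GiB, per batch: {batched_gib:.1f} GiB")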


@@ -2,12 +2,12 @@
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 1,
"id": "5a38ed25",
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-31T17:09:12.165480Z",
"start_time": "2023-03-31T17:09:12.116708Z"
"end_time": "2023-04-01T06:20:33.195162Z",
"start_time": "2023-04-01T06:20:29.474122Z"
},
"scrolled": true
},
@@ -16,8 +16,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"The autoreload extension is already loaded. To reload it, use:\n",
" %reload_ext autoreload\n"
"not found\n"
]
}
],
@@ -44,12 +43,12 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 2,
"id": "4218b673",
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-31T17:09:13.943312Z",
"start_time": "2023-03-31T17:09:12.167027Z"
"end_time": "2023-04-01T06:20:35.273662Z",
"start_time": "2023-04-01T06:20:33.196875Z"
}
},
"outputs": [],
@@ -59,12 +58,12 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 24,
"id": "5ff7a1b6",
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-31T17:10:18.303240Z",
"start_time": "2023-03-31T17:10:14.674751Z"
"end_time": "2023-04-01T06:55:34.643955Z",
"start_time": "2023-04-01T06:55:28.334204Z"
}
},
"outputs": [
@@ -72,38 +71,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
" 0%| | 0/1 [00:00<?, ?it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"float32\n",
"float32\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1/1 [00:03<00:00, 3.37s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"float32\n",
"float32\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
"100%|██████████| 1/1 [00:02<00:00, 2.95s/it]\n"
]
}
],
@@ -113,18 +81,90 @@
"\n",
"knncdf = csiborgtools.match.kNN_CDF()\n",
"\n",
"rs, cdfs_high = knncdf(knn, nneighbours=3, Rmax=155 / 0.705, rmin=0.05, rmax=40,\n",
" nsamples=int(1e6), neval=int(1e4), random_state=42)"
"rs, cdf = knncdf(knn, nneighbours=2, Rmax=155 / 0.705, rmin=0.01, rmax=100,\n",
" nsamples=int(1e6), neval=int(1e4), random_state=42, batch_size=int(1e6))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "08321431",
"id": "0d5f3d02",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "8b9a8cf0",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1825f00",
"metadata": {
"ExecuteTime": {
"end_time": "2023-04-01T06:01:29.388586Z",
"start_time": "2023-04-01T06:01:29.321025Z"
},
"scrolled": false
},
"outputs": [],
"source": [
"plt.figure()\n",
"plt.plot(rs, knncdf.peaked_cdf(cdf[0, :]))\n",
"\n",
"plt.yscale(\"log\" )\n",
"plt.xscale(\"log\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "289549a0",
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-31T22:55:20.690887Z",
"start_time": "2023-03-31T22:55:20.656550Z"
}
},
"outputs": [],
"source": [
"mask"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7a8c5202",
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-31T22:54:52.330633Z",
"start_time": "2023-03-31T22:54:52.299548Z"
}
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "46f54897",
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-31T22:54:25.138813Z",
"start_time": "2023-03-31T22:54:25.105044Z"
}
},
"outputs": [],
"source": [
"dist"
]
},
{
"cell_type": "code",
"execution_count": null,


@@ -42,6 +42,7 @@ parser.add_argument("--rmax", type=float)
parser.add_argument("--nneighbours", type=int)
parser.add_argument("--nsamples", type=int)
parser.add_argument("--neval", type=int)
parser.add_argument("--batch_size", type=int)
parser.add_argument("--seed", type=int, default=42)
args = parser.parse_args()
@@ -77,8 +78,8 @@ def do_task(ic):
rs, cdf = knncdf(knn, nneighbours=args.nneighbours, Rmax=Rmax,
rmin=args.rmin, rmax=args.rmax, nsamples=args.nsamples,
neval=args.neval, random_state=args.seed,
verbose=False)
neval=args.neval, batch_size=args.batch_size,
random_state=args.seed, verbose=False)
out.update({"cdf_{}".format(i): cdf})
out.update({"rs": rs, "mass_threshold": mass_threshold})


@@ -1,4 +1,4 @@
nthreads=140
nthreads=30
memory=7
queue="berg"
env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python"
@@ -7,9 +7,14 @@ file="run_knn.py"
rmin=0.01
rmax=100
nneighbours=16
nsamples=10000000
nsamples=1000000000
batch_size=10000000
neval=10000
# Alternative nsamples values: 10000000 (1e7) or 1000000000 (1e9)
pythoncm="$env $file --rmin $rmin --rmax $rmax --nneighbours $nneighbours --nsamples $nsamples --neval $neval --batch_size $batch_size"
# echo $pythoncm
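A hypothetical helper (not in the repository) for picking `--batch_size` from the per-task memory limit, e.g. the `memory=7` setting above, presumably in GB; it assumes the float64 kneighbors distance array dominates the usage:

def suggest_batch_size(mem_budget_gb, nneighbours, bytes_per_value=8, safety=0.5):
    # Largest batch whose distance array fits within roughly half the budget,
    # leaving headroom for the random points and the CDF arrays.
    budget_bytes = mem_budget_gb * 1024**3 * safety
    return int(budget_bytes // (nneighbours * bytes_per_value))

print(suggest_batch_size(7, 16))  # ~2.9e7, the same order as the 1e7 batch size above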