From 513872ceb6c3f05da2d26fdc75552e2379730f4e Mon Sep 17 00:00:00 2001
From: Richard Stiskalek <richard.stiskalek@protonmail.com>
Date: Sat, 1 Apr 2023 07:57:21 +0100
Subject: [PATCH] kNN memory batching (#35)

* Add batch sizing for less memory

* Add batch size to submission

* Update nb

* Add brute KNN

* unused variable

* Update nb
---
 csiborgtools/match/knn.py | 106 ++++++++++++++++++++++++++----
 notebooks/knn.ipynb       | 132 +++++++++++++++++++++++++-------------
 scripts/run_knn.py        |   5 +-
 scripts/run_knn.sh        |   9 ++-
 4 files changed, 188 insertions(+), 64 deletions(-)

diff --git a/csiborgtools/match/knn.py b/csiborgtools/match/knn.py
index 99b81f1..aac55c7 100644
--- a/csiborgtools/match/knn.py
+++ b/csiborgtools/match/knn.py
@@ -15,9 +15,9 @@
 """
 kNN-CDF calculation
 """
-from gc import collect
 import numpy
 from scipy.interpolate import interp1d
+from scipy.stats import binned_statistic
 from tqdm import tqdm
 
 
@@ -124,8 +124,58 @@ class kNN_CDF:
         cdf[cdf > 0.5] = 1 - cdf[cdf > 0.5]
         return cdf
 
+    def brute_cdf(self, knn, nneighbours, Rmax, nsamples, rmin, rmax, neval,
+                 random_state=42, dtype=numpy.float32):
+        """
+        Calculate the CDF for a kNN of CSiBORG halo catalogues without batch
+        sizing. This can become memory intense for large numbers of randoms
+        and, therefore, is only for testing purposes.
+
+        Parameters
+        ----------
+        knns : `sklearn.neighbors.NearestNeighbors`
+            kNN of CSiBORG halo catalogues.
+        neighbours : int
+            Maximum number of neighbours to use for the kNN-CDF calculation.
+        Rmax : float
+            Maximum radius of the sphere in which to sample random points for
+            the knn-CDF calculation. This should match the CSiBORG catalogues.
+        nsamples : int
+            Number of random points to sample for the knn-CDF calculation.
+        rmin : float
+            Minimum distance to evaluate the CDF.
+        rmax : float
+            Maximum distance to evaluate the CDF.
+        neval : int
+            Number of points to evaluate the CDF.
+        random_state : int, optional
+            Random state for the random number generator.
+        dtype : numpy dtype, optional
+            Calculation data type. By default `numpy.float32`.
+
+        Returns
+        -------
+        rs : 1-dimensional array
+            Distances at which the CDF is evaluated.
+        cdfs : 2-dimensional array
+            CDFs evaluated at `rs`.
+        """
+        rand = self.rvs_in_sphere(nsamples, Rmax, random_state=random_state)
+
+        dist, __ = knn.kneighbors(rand, nneighbours)
+        dist = dist.astype(dtype)
+
+        cdf = [None] * nneighbours
+        for j in range(nneighbours):
+            rs, cdf[j] = self.cdf_from_samples(dist[:, j], rmin=rmin,
+                                               rmax=rmax, neval=neval)
+
+        cdf = numpy.asanyarray(cdf)
+        return rs, cdf
+
     def __call__(self, *knns, nneighbours, Rmax, nsamples, rmin, rmax, neval,
-                 verbose=True, random_state=42, dtype=numpy.float32):
+                batch_size=None, verbose=True, random_state=42,
+                left_nan=True, right_nan=True, dtype=numpy.float32):
         """
         Calculate the CDF for a set of kNNs of CSiBORG halo catalogues.
 
@@ -146,10 +196,20 @@ class kNN_CDF:
             Maximum distance to evaluate the CDF.
         neval : int
             Number of points to evaluate the CDF.
+        batch_size : int, optional
+            Number of random points to sample in each batch. By default equal
+            to `nsamples`, however recommeded to be smaller to avoid requesting
+            too much memory,
         verbose : bool, optional
             Verbosity flag.
         random_state : int, optional
             Random state for the random number generator.
+        left_nan : bool, optional
+            Whether to set values where the CDF is 0 to `numpy.nan`. By
+            default `True`.
+        right_nan : bool, optional
+            Whether to set values where the CDF is 1 to `numpy.nan` after its
+            first occurence to 1. By default `True`.
         dtype : numpy dtype, optional
             Calculation data type. By default `numpy.float32`.
 
@@ -160,22 +220,40 @@ class kNN_CDF:
         cdfs : 2 or 3-dimensional array
             CDFs evaluated at `rs`.
         """
-        rand = self.rvs_in_sphere(nsamples, Rmax, random_state=random_state)
+        batch_size = nsamples if batch_size is None else batch_size
+        assert nsamples >= batch_size
+        nbatches = nsamples // batch_size  # Number of batches
 
-        cdfs = [None] * len(knns)
+        # Preallocate the bins and the CDF array
+        bins = numpy.logspace(numpy.log10(rmin), numpy.log10(rmax), neval)
+        cdfs = numpy.zeros((len(knns), nneighbours, neval - 1), dtype=dtype)
         for i, knn in enumerate(tqdm(knns) if verbose else knns):
-            dist, _indxs = knn.kneighbors(rand, nneighbours)
-            dist = dist.astype(dtype)
-            del _indxs
-            collect()
+            # Loop over batches. This is to avoid generating large mocks
+            # requiring a lot of memory. Add counts to the CDF array
+            for j in range(nbatches):
+                rand = self.rvs_in_sphere(batch_size, Rmax,
+                                          random_state=random_state + j)
+                dist, __ = knn.kneighbors(rand, nneighbours)
+                for k in range(nneighbours):  # Count for each neighbour
+                    _counts, __, __ = binned_statistic(
+                        dist[:, k], dist[:, k], bins=bins, statistic="count",
+                        range=(rmin, rmax))
+                    cdfs[i, k, :] += _counts
 
+        rs = (bins[1:] + bins[:-1]) / 2     # Bin centers
+        cdfs = numpy.cumsum(cdfs, axis=-1)  # Cumulative sum, i.e. the CDF
+        for i in range(len(knns)):
+            for k in range(nneighbours):
+                cdfs[i, k, :] /= cdfs[i, k, -1]
+                # Set to NaN values after the first point where the CDF is 1
+                if right_nan:
+                    ns = numpy.where(cdfs[i, k, :] == 1.)[0]
+                    if ns.size > 1:
+                        cdfs[i, k, ns[1]:] = numpy.nan
 
-            cdf = [None] * nneighbours
-            for j in range(nneighbours):
-                rs, cdf[j] = self.cdf_from_samples(
-                    dist[:, j], rmin=rmin, rmax=rmax, neval=neval)
-            cdfs[i] = cdf
+        # Set to NaN values where the CDF is 0
+        if left_nan:
+            cdfs[cdfs == 0] = numpy.nan
 
-        cdfs = numpy.asanyarray(cdfs)
         cdfs = cdfs[0, ...] if len(knns) == 1 else cdfs
         return rs, cdfs
diff --git a/notebooks/knn.ipynb b/notebooks/knn.ipynb
index 8ebdb6d..59a005c 100644
--- a/notebooks/knn.ipynb
+++ b/notebooks/knn.ipynb
@@ -2,12 +2,12 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 1,
    "id": "5a38ed25",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2023-03-31T17:09:12.165480Z",
-     "start_time": "2023-03-31T17:09:12.116708Z"
+     "end_time": "2023-04-01T06:20:33.195162Z",
+     "start_time": "2023-04-01T06:20:29.474122Z"
     },
     "scrolled": true
    },
@@ -16,8 +16,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "The autoreload extension is already loaded. To reload it, use:\n",
-      "  %reload_ext autoreload\n"
+      "not found\n"
      ]
     }
    ],
@@ -44,12 +43,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 2,
    "id": "4218b673",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2023-03-31T17:09:13.943312Z",
-     "start_time": "2023-03-31T17:09:12.167027Z"
+     "end_time": "2023-04-01T06:20:35.273662Z",
+     "start_time": "2023-04-01T06:20:33.196875Z"
     }
    },
    "outputs": [],
@@ -59,12 +58,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 24,
    "id": "5ff7a1b6",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2023-03-31T17:10:18.303240Z",
-     "start_time": "2023-03-31T17:10:14.674751Z"
+     "end_time": "2023-04-01T06:55:34.643955Z",
+     "start_time": "2023-04-01T06:55:28.334204Z"
     }
    },
    "outputs": [
@@ -72,38 +71,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "\r",
-      "  0%|          | 0/1 [00:00<?, ?it/s]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "float32\n",
-      "float32\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "100%|██████████| 1/1 [00:03<00:00,  3.37s/it]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "float32\n",
-      "float32\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\n"
+      "100%|██████████| 1/1 [00:02<00:00,  2.95s/it]\n"
      ]
     }
    ],
@@ -113,18 +81,90 @@
     "\n",
     "knncdf = csiborgtools.match.kNN_CDF()\n",
     "\n",
-    "rs, cdfs_high = knncdf(knn, nneighbours=3, Rmax=155 / 0.705, rmin=0.05, rmax=40,\n",
-    "                  nsamples=int(1e6), neval=int(1e4), random_state=42)"
+    "rs, cdf = knncdf(knn, nneighbours=2, Rmax=155 / 0.705, rmin=0.01, rmax=100,\n",
+    "                  nsamples=int(1e6), neval=int(1e4), random_state=42, batch_size=int(1e6))"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "08321431",
+   "id": "0d5f3d02",
    "metadata": {},
    "outputs": [],
    "source": []
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8b9a8cf0",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a1825f00",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-04-01T06:01:29.388586Z",
+     "start_time": "2023-04-01T06:01:29.321025Z"
+    },
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "plt.figure()\n",
+    "plt.plot(rs, knncdf.peaked_cdf(cdf[0, :]))\n",
+    "\n",
+    "plt.yscale(\"log\" )\n",
+    "plt.xscale(\"log\")\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "289549a0",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-03-31T22:55:20.690887Z",
+     "start_time": "2023-03-31T22:55:20.656550Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "mask"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7a8c5202",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-03-31T22:54:52.330633Z",
+     "start_time": "2023-03-31T22:54:52.299548Z"
+    }
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "46f54897",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-03-31T22:54:25.138813Z",
+     "start_time": "2023-03-31T22:54:25.105044Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "dist"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
diff --git a/scripts/run_knn.py b/scripts/run_knn.py
index caac37e..1e61629 100644
--- a/scripts/run_knn.py
+++ b/scripts/run_knn.py
@@ -42,6 +42,7 @@ parser.add_argument("--rmax", type=float)
 parser.add_argument("--nneighbours", type=int)
 parser.add_argument("--nsamples", type=int)
 parser.add_argument("--neval", type=int)
+parser.add_argument("--batch_size", type=int)
 parser.add_argument("--seed", type=int, default=42)
 args = parser.parse_args()
 
@@ -77,8 +78,8 @@ def do_task(ic):
 
         rs, cdf = knncdf(knn, nneighbours=args.nneighbours, Rmax=Rmax,
                          rmin=args.rmin, rmax=args.rmax, nsamples=args.nsamples,
-                         neval=args.neval, random_state=args.seed,
-                         verbose=False)
+                         neval=args.neval, batch_size=args.batch_size,
+                         random_state=args.seed, verbose=False)
         out.update({"cdf_{}".format(i): cdf})
 
     out.update({"rs": rs, "mass_threshold": mass_threshold})
diff --git a/scripts/run_knn.sh b/scripts/run_knn.sh
index d6df448..a32625c 100644
--- a/scripts/run_knn.sh
+++ b/scripts/run_knn.sh
@@ -1,4 +1,4 @@
-nthreads=140
+nthreads=30
 memory=7
 queue="berg"
 env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python"
@@ -7,9 +7,14 @@ file="run_knn.py"
 rmin=0.01
 rmax=100
 nneighbours=16
-nsamples=10000000
+nsamples=1000000000
+batch_size=10000000
 neval=10000
 
+# 1000,000,0
+# 10000000  # 1e7
+# 1000000000
+
 pythoncm="$env $file --rmin $rmin --rmax $rmax --nneighbours $nneighbours --nsamples $nsamples --neval $neval"
 
 # echo $pythoncm