mirror of
https://github.com/Richard-Sti/csiborgtools.git
synced 2024-12-22 23:48:02 +00:00
Add batch sizing for less memory
This commit is contained in:
parent
63ab3548b4
commit
070b9b6c1b
1 changed files with 43 additions and 13 deletions
|
@ -18,6 +18,7 @@ kNN-CDF calculation
|
||||||
from gc import collect
|
from gc import collect
|
||||||
import numpy
|
import numpy
|
||||||
from scipy.interpolate import interp1d
|
from scipy.interpolate import interp1d
|
||||||
|
from scipy.stats import binned_statistic
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
|
@ -125,7 +126,8 @@ class kNN_CDF:
|
||||||
return cdf
|
return cdf
|
||||||
|
|
||||||
def __call__(self, *knns, nneighbours, Rmax, nsamples, rmin, rmax, neval,
|
def __call__(self, *knns, nneighbours, Rmax, nsamples, rmin, rmax, neval,
|
||||||
verbose=True, random_state=42, dtype=numpy.float32):
|
batch_size=None, verbose=True, random_state=42,
|
||||||
|
left_nan=True, right_nan=True, dtype=numpy.float32):
|
||||||
"""
|
"""
|
||||||
Calculate the CDF for a set of kNNs of CSiBORG halo catalogues.
|
Calculate the CDF for a set of kNNs of CSiBORG halo catalogues.
|
||||||
|
|
||||||
|
@ -146,10 +148,20 @@ class kNN_CDF:
|
||||||
Maximum distance to evaluate the CDF.
|
Maximum distance to evaluate the CDF.
|
||||||
neval : int
|
neval : int
|
||||||
Number of points to evaluate the CDF.
|
Number of points to evaluate the CDF.
|
||||||
|
batch_size : int, optional
|
||||||
|
Number of random points to sample in each batch. By default equal
|
||||||
|
to `nsamples`, however recommended to be smaller to avoid requesting
|
||||||
|
too much memory.
|
||||||
verbose : bool, optional
|
verbose : bool, optional
|
||||||
Verbosity flag.
|
Verbosity flag.
|
||||||
random_state : int, optional
|
random_state : int, optional
|
||||||
Random state for the random number generator.
|
Random state for the random number generator.
|
||||||
|
left_nan : bool, optional
|
||||||
|
Whether to set values where the CDF is 0 to `numpy.nan`. By
|
||||||
|
default `True`.
|
||||||
|
right_nan : bool, optional
|
||||||
|
Whether to set values where the CDF is 1 to `numpy.nan` after its
|
||||||
|
first occurrence of 1. By default `True`.
|
||||||
dtype : numpy dtype, optional
|
dtype : numpy dtype, optional
|
||||||
Calculation data type. By default `numpy.float32`.
|
Calculation data type. By default `numpy.float32`.
|
||||||
|
|
||||||
|
@ -160,22 +172,40 @@ class kNN_CDF:
|
||||||
cdfs : 2 or 3-dimensional array
|
cdfs : 2 or 3-dimensional array
|
||||||
CDFs evaluated at `rs`.
|
CDFs evaluated at `rs`.
|
||||||
"""
|
"""
|
||||||
rand = self.rvs_in_sphere(nsamples, Rmax, random_state=random_state)
|
batch_size = nsamples if batch_size is None else batch_size
|
||||||
|
assert nsamples >= batch_size
|
||||||
|
nbatches = nsamples // batch_size # Number of batches
|
||||||
|
|
||||||
cdfs = [None] * len(knns)
|
# Preallocate the bins and the CDF array
|
||||||
|
bins = numpy.logspace(numpy.log10(rmin), numpy.log10(rmax), neval)
|
||||||
|
cdfs = numpy.zeros((len(knns), nneighbours, neval - 1), dtype=dtype)
|
||||||
for i, knn in enumerate(tqdm(knns) if verbose else knns):
|
for i, knn in enumerate(tqdm(knns) if verbose else knns):
|
||||||
|
# Loop over batches. This is to avoid generating large mocks
|
||||||
|
# requiring a lot of memory. Add counts to the CDF array
|
||||||
|
for j in range(nbatches):
|
||||||
|
rand = self.rvs_in_sphere(batch_size, Rmax,
|
||||||
|
random_state=random_state + j)
|
||||||
dist, _indxs = knn.kneighbors(rand, nneighbours)
|
dist, _indxs = knn.kneighbors(rand, nneighbours)
|
||||||
dist = dist.astype(dtype)
|
for k in range(nneighbours): # Count for each neighbour
|
||||||
del _indxs
|
_counts, __, __ = binned_statistic(
|
||||||
collect()
|
dist[:, k], dist[:, k], bins=bins, statistic="count",
|
||||||
|
range=(rmin, rmax))
|
||||||
|
cdfs[i, k, :] += _counts
|
||||||
|
|
||||||
|
rs = (bins[1:] + bins[:-1]) / 2 # Bin centers
|
||||||
|
cdfs = numpy.cumsum(cdfs, axis=-1) # Cumulative sum, i.e. the CDF
|
||||||
|
for i in range(len(knns)):
|
||||||
|
for k in range(nneighbours):
|
||||||
|
cdfs[i, k, :] /= cdfs[i, k, -1]
|
||||||
|
# Set to NaN values after the first point where the CDF is 1
|
||||||
|
if right_nan:
|
||||||
|
ns = numpy.where(cdfs[i, k, :] == 1.)[0]
|
||||||
|
if ns.size > 1:
|
||||||
|
cdfs[i, k, ns[1]:] = numpy.nan
|
||||||
|
|
||||||
cdf = [None] * nneighbours
|
# Set to NaN values where the CDF is 0
|
||||||
for j in range(nneighbours):
|
if left_nan:
|
||||||
rs, cdf[j] = self.cdf_from_samples(
|
cdfs[cdfs == 0] = numpy.nan
|
||||||
dist[:, j], rmin=rmin, rmax=rmax, neval=neval)
|
|
||||||
cdfs[i] = cdf
|
|
||||||
|
|
||||||
cdfs = numpy.asanyarray(cdfs)
|
|
||||||
cdfs = cdfs[0, ...] if len(knns) == 1 else cdfs
|
cdfs = cdfs[0, ...] if len(knns) == 1 else cdfs
|
||||||
return rs, cdfs
|
return rs, cdfs
|
||||||
|
|
Loading…
Reference in a new issue