kNN-CDF implementation (#34)

* Rewrite doc

* add kNN

* edit loading of samples with no init

* Add verbosity flag

* add KNN submission script

* do not make peaked cdf by default

* Add submit script

* stop ignore sh

* Add mass thresholding

* Edit gitignore

* edits

* Space points in logspace

* Calculate for all ICs

* Update TODO

* Add dtype support

* Update readme

* Update nb
This commit is contained in:
Richard Stiskalek 2023-03-31 18:13:41 +01:00 committed by GitHub
parent 4d7827006a
commit 63ab3548b4
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
17 changed files with 1248 additions and 29 deletions

View file

@ -18,4 +18,5 @@ from .match import (brute_spatial_separation, RealisationsMatcher, cosine_simila
calculate_overlap, calculate_overlap_indxs, # noqa
dist_centmass, dist_percentile) # noqa
from .num_density import (binned_counts, number_density) # noqa
from .knn import kNN_CDF
# from .correlation import (get_randoms_sphere, sphere_angular_tpcf) # noqa

181
csiborgtools/match/knn.py Normal file
View file

@ -0,0 +1,181 @@
# Copyright (C) 2022 Richard Stiskalek
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""
kNN-CDF calculation
"""
from gc import collect
import numpy
from scipy.interpolate import interp1d
from tqdm import tqdm
class kNN_CDF:
"""
Object to calculate the kNN-CDF for a set of CSiBORG halo catalogues from
their kNN objects.
"""
@staticmethod
def rvs_in_sphere(nsamples, R, random_state=42, dtype=numpy.float32):
"""
Generate random samples in a sphere of radius `R` centered at the
origin.
Parameters
----------
nsamples : int
Number of samples to generate.
R : float
Radius of the sphere.
random_state : int, optional
Random state for the random number generator.
dtype : numpy dtype, optional
Data type, by default `numpy.float32`.
Returns
-------
samples : 2-dimensional array of shape `(nsamples, 3)`
"""
gen = numpy.random.default_rng(random_state)
# Sample spherical coordinates
r = gen.uniform(0, 1, nsamples).astype(dtype)**(1/3) * R
theta = 2 * numpy.arcsin(gen.uniform(0, 1, nsamples).astype(dtype))
phi = 2 * numpy.pi * gen.uniform(0, 1, nsamples).astype(dtype)
# Convert to cartesian coordinates
x = r * numpy.sin(theta) * numpy.cos(phi)
y = r * numpy.sin(theta) * numpy.sin(phi)
z = r * numpy.cos(theta)
return numpy.vstack([x, y, z]).T
@staticmethod
def cdf_from_samples(r, rmin=None, rmax=None, neval=None,
dtype=numpy.float32):
"""
Calculate the CDF from samples.
Parameters
----------
r : 1-dimensional array
Distance samples.
rmin : float, optional
Minimum distance to evaluate the CDF.
rmax : float, optional
Maximum distance to evaluate the CDF.
neval : int, optional
Number of points to evaluate the CDF. By default equal to `len(x)`.
dtype : numpy dtype, optional
Calculation data type. By default `numpy.float32`.
Returns
-------
r : 1-dimensional array
Distances at which the CDF is evaluated.
cdf : 1-dimensional array
CDF evaluated at `r`.
"""
r = numpy.copy(r) # Make a copy not to overwrite the original
# Make cuts on distance
r = r[r >= rmin] if rmin is not None else r
r = r[r <= rmax] if rmax is not None else r
# Calculate the CDF
r = numpy.sort(r)
cdf = numpy.arange(r.size) / r.size
if neval is not None: # Optinally interpolate at given points
_r = numpy.logspace(numpy.log10(rmin), numpy.log10(rmax), neval,
dtype=dtype)
cdf = interp1d(r, cdf, kind="linear", fill_value=numpy.nan,
bounds_error=False)(_r).astype(dtype)
r = _r
return r, cdf
@staticmethod
def peaked_cdf(cdf, make_copy=True):
"""
Transform the CDF to a peaked CDF.
Parameters
----------
cdf : 1- or 2- or 3-dimensional array
CDF to be transformed along the last axis.
make_copy : bool, optional
Whether to make a copy of the CDF before transforming it to avoid
overwriting it.
Returns
-------
peaked_cdf : 1- or 2- or 3-dimensional array
"""
cdf = numpy.copy(cdf) if make_copy else cdf
cdf[cdf > 0.5] = 1 - cdf[cdf > 0.5]
return cdf
def __call__(self, *knns, nneighbours, Rmax, nsamples, rmin, rmax, neval,
verbose=True, random_state=42, dtype=numpy.float32):
"""
Calculate the CDF for a set of kNNs of CSiBORG halo catalogues.
Parameters
----------
*knns : `sklearn.neighbors.NearestNeighbors` instances
kNNs of CSiBORG halo catalogues.
neighbours : int
Maximum number of neighbours to use for the kNN-CDF calculation.
Rmax : float
Maximum radius of the sphere in which to sample random points for
the knn-CDF calculation. This should match the CSiBORG catalogues.
nsamples : int
Number of random points to sample for the knn-CDF calculation.
rmin : float
Minimum distance to evaluate the CDF.
rmax : float
Maximum distance to evaluate the CDF.
neval : int
Number of points to evaluate the CDF.
verbose : bool, optional
Verbosity flag.
random_state : int, optional
Random state for the random number generator.
dtype : numpy dtype, optional
Calculation data type. By default `numpy.float32`.
Returns
-------
rs : 1-dimensional array
Distances at which the CDF is evaluated.
cdfs : 2 or 3-dimensional array
CDFs evaluated at `rs`.
"""
rand = self.rvs_in_sphere(nsamples, Rmax, random_state=random_state)
cdfs = [None] * len(knns)
for i, knn in enumerate(tqdm(knns) if verbose else knns):
dist, _indxs = knn.kneighbors(rand, nneighbours)
dist = dist.astype(dtype)
del _indxs
collect()
cdf = [None] * nneighbours
for j in range(nneighbours):
rs, cdf[j] = self.cdf_from_samples(
dist[:, j], rmin=rmin, rmax=rmax, neval=neval)
cdfs[i] = cdf
cdfs = numpy.asanyarray(cdfs)
cdfs = cdfs[0, ...] if len(knns) == 1 else cdfs
return rs, cdfs

View file

@ -35,20 +35,22 @@ class HaloCatalogue:
The minimum :math:`M_{rm tot} / M_\odot` mass. By default no threshold.
max_dist : float, optional
The maximum comoving distance of a halo. By default no upper limit.
load_init : bool, optional
Whether to load the initial snapshot information. By default False.
"""
_box = None
_paths = None
_data = None
_selmask = None
def __init__(self, nsim, min_mass=None, max_dist=None):
def __init__(self, nsim, min_mass=None, max_dist=None, load_init=False):
# Set up paths
paths = CSiBORGPaths(n_sim=nsim)
paths.n_snap = paths.get_maximum_snapshot()
self._paths = paths
self._box = BoxUnits(paths)
self._paths = paths
self._set_data(min_mass, max_dist)
self._set_data(min_mass, max_dist, load_init)
@property
def data(self):
@ -109,7 +111,7 @@ class HaloCatalogue:
def knn(self, select_initial):
"""
The final snapshot k-nearest neighbour object.
kNN object of all halo positions.
Parameters
----------
@ -123,7 +125,7 @@ class HaloCatalogue:
knn = NearestNeighbors()
return knn.fit(self.positions0 if select_initial else self.positions)
def _set_data(self, min_mass, max_dist):
def _set_data(self, min_mass, max_dist, load_init):
"""
Loads the data, merges with mmain, does various coordinate transforms.
"""
@ -141,10 +143,11 @@ class HaloCatalogue:
data = data[(data["npart"] > 100) & numpy.isfinite(data["m200"])]
# Now also load the initial positions
initcm = read_initcm(self.n_sim, self.paths.initmatch_path)
if initcm is not None:
data = self.merge_initmatch_to_clumps(data, initcm)
flip_cols(data, "x0", "z0")
if load_init:
initcm = read_initcm(self.n_sim, self.paths.initmatch_path)
if initcm is not None:
data = self.merge_initmatch_to_clumps(data, initcm)
flip_cols(data, "x0", "z0")
# # Calculate redshift
# pos = [data["peak_{}".format(p)] - 0.5 for p in ("x", "y", "z")]
@ -168,7 +171,7 @@ class HaloCatalogue:
data = add_columns(data, [d, ra, dec], ["dist", "ra", "dec"])
# And do the unit transform
if initcm is not None:
if load_init and initcm is not None:
data = self.box.convert_from_boxunits(
data, ["x0", "y0", "z0", "lagpatch"])