mirror of
https://github.com/Richard-Sti/csiborgtools.git
synced 2024-12-22 21:28:03 +00:00
Add KL divergence
This commit is contained in:
parent
14dc8f85af
commit
f82633f816
1 changed files with 104 additions and 22 deletions
|
@ -19,10 +19,10 @@ the final snapshot.
|
||||||
from math import floor
|
from math import floor
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
from scipy.interpolate import interp1d
|
|
||||||
from scipy.stats import kstest
|
|
||||||
from scipy.special import erfinv
|
|
||||||
from numba import jit
|
from numba import jit
|
||||||
|
from scipy.integrate import quad
|
||||||
|
from scipy.interpolate import interp1d
|
||||||
|
from scipy.stats import gaussian_kde, kstest
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
|
@ -35,10 +35,16 @@ class NearestNeighbourReader:
|
||||||
----------
|
----------
|
||||||
rmax_radial : float
|
rmax_radial : float
|
||||||
Radius of the high-resolution region.
|
Radius of the high-resolution region.
|
||||||
|
nbins_radial : int
|
||||||
|
Number of radial bins.
|
||||||
|
rmax_neighbour : float
|
||||||
|
Maximum distance to consider for the nearest neighbour.
|
||||||
|
nbins_neighbour : int
|
||||||
|
Number of bins for the nearest neighbour.
|
||||||
paths : py:class`csiborgtools.read.Paths`
|
paths : py:class`csiborgtools.read.Paths`
|
||||||
Paths object.
|
Paths object.
|
||||||
|
**kwargs : dict
|
||||||
TODO: docs
|
Other keyword arguments for backward compatibility. Not used.
|
||||||
"""
|
"""
|
||||||
_paths = None
|
_paths = None
|
||||||
_rmax_radial = None
|
_rmax_radial = None
|
||||||
|
@ -202,11 +208,11 @@ class NearestNeighbourReader:
|
||||||
fpath = self.paths.cross_nearest(simname, run, nsim, nobs)
|
fpath = self.paths.cross_nearest(simname, run, nsim, nobs)
|
||||||
return numpy.load(fpath)
|
return numpy.load(fpath)
|
||||||
|
|
||||||
def build_cdf(self, simname, run, verbose=True):
|
def build_dist(self, simname, run, kind, verbose=True):
|
||||||
"""
|
"""
|
||||||
Build the CDF for the nearest neighbour distribution. Counts the binned
|
Build the a PDF or a CDF for the nearest neighbour distribution.
|
||||||
number of neighbour for each halo as a funtion of its radial distance
|
Counts the binned number of neighbour for each halo as a funtion of its
|
||||||
from the centre of the high-resolution region and converts it to a CDF.
|
radial distance from the centre of the high-resolution region.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
|
@ -214,18 +220,23 @@ class NearestNeighbourReader:
|
||||||
Simulation name. Must be either `csiborg` or `quijote`.
|
Simulation name. Must be either `csiborg` or `quijote`.
|
||||||
run : str
|
run : str
|
||||||
Run name.
|
Run name.
|
||||||
|
kind : str
|
||||||
|
Distribution kind. Either `pdf` or `cdf`.
|
||||||
verbose : bool, optional
|
verbose : bool, optional
|
||||||
Verbosity flag.
|
Verbosity flag.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
cdf : 2-dimensional array of shape `(nbins_radial, nbins_neighbour)`
|
dist : 2-dimensional array of shape `(nbins_radial, nbins_neighbour)`
|
||||||
"""
|
"""
|
||||||
assert simname in ["csiborg", "quijote"]
|
assert simname in ["csiborg", "quijote"]
|
||||||
|
assert kind in ["pdf", "cdf"]
|
||||||
rbin_edges = self.radial_bin_edges
|
rbin_edges = self.radial_bin_edges
|
||||||
# We first bin the distances as a function of each reference halo
|
# We first bin the distances as a function of each reference halo
|
||||||
# radial distance and then its nearest neighbour distance.
|
# radial distance and then its nearest neighbour distance.
|
||||||
fpaths = self.paths.cross_nearest(simname, run)
|
fpaths = self.paths.cross_nearest(simname, run)
|
||||||
|
if simname == "quijote":
|
||||||
|
fpaths = fpaths[:200] # TODO remove later.
|
||||||
out = numpy.zeros((self.nbins_radial, self.nbins_neighbour),
|
out = numpy.zeros((self.nbins_radial, self.nbins_neighbour),
|
||||||
dtype=numpy.float32)
|
dtype=numpy.float32)
|
||||||
for fpath in tqdm(fpaths) if verbose else fpaths:
|
for fpath in tqdm(fpaths) if verbose else fpaths:
|
||||||
|
@ -234,15 +245,89 @@ class NearestNeighbourReader:
|
||||||
out, data["ndist"], data["rdist"], rbin_edges,
|
out, data["ndist"], data["rdist"], rbin_edges,
|
||||||
self.rmax_neighbour, self.nbins_neighbour)
|
self.rmax_neighbour, self.nbins_neighbour)
|
||||||
|
|
||||||
# We then build up a CDF for each radial bin.
|
if kind == "pdf":
|
||||||
out = numpy.cumsum(out, axis=1, out=out)
|
neighbour_bin_edges = self.neighbour_bin_edges
|
||||||
out /= out[:, -1].reshape(-1, 1)
|
dx = neighbour_bin_edges[1] - neighbour_bin_edges[0]
|
||||||
|
out /= numpy.sum(dx * out, axis=1).reshape(-1, 1)
|
||||||
|
else:
|
||||||
|
out = numpy.cumsum(out, axis=1, out=out)
|
||||||
|
out /= out[:, -1].reshape(-1, 1)
|
||||||
return out
|
return out
|
||||||
|
|
||||||
def calc_significance(self, simname, run, nsim, cdf, nobs=None):
|
def kl_divergence(self, simname, run, nsim, pdf, nobs=None, verbose=True):
|
||||||
|
r"""
|
||||||
|
Calculate the Kullback-Leibler divergence of the nearest neighbour
|
||||||
|
distribution of a reference halo relative to an expected distribution
|
||||||
|
from an unconstrained suite of simulations. Approximates reference halo
|
||||||
|
neighbour distribution with a Gaussian KDE.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
simname : str
|
||||||
|
Simulation name. Must be either `csiborg` or `quijote`.
|
||||||
|
run : str
|
||||||
|
Run name.
|
||||||
|
nsim : int
|
||||||
|
Simulation index.
|
||||||
|
cdf : 2-dimensional array of shape `(nbins_radial, nbins_neighbour)`
|
||||||
|
CDF of the nearest neighbour distribution in an unconstrained
|
||||||
|
suite of simiulations.
|
||||||
|
nobs : int, optional
|
||||||
|
Fiducial Quijote observer index.
|
||||||
|
verbose : bool, optional
|
||||||
|
Verbosity flag.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
kl_divergence: 1-dimensional array of shape `(nhalos,)`
|
||||||
|
Information gain from going from the expected distribution to the
|
||||||
|
observed distribution in bits.
|
||||||
"""
|
"""
|
||||||
Calculate the significance of the nearest neighbour distribution of a
|
assert simname in ["csiborg", "quijote"]
|
||||||
reference halo relative to an unconstrained simulation.
|
data = self.read_single(simname, run, nsim, nobs)
|
||||||
|
|
||||||
|
rdist = data["rdist"]
|
||||||
|
ndist = data["ndist"]
|
||||||
|
rbin_edges = self.radial_bin_edges
|
||||||
|
|
||||||
|
# Create an interpolation function for each radial bin.
|
||||||
|
xbin = self.bin_centres("neighbour")
|
||||||
|
interp_kwargs = {"kind": "cubic",
|
||||||
|
"bounds_error": False,
|
||||||
|
"fill_value": 1e-16,
|
||||||
|
"assume_sorted": True}
|
||||||
|
pdf_interp = [interp1d(xbin, pdf[i, :], **interp_kwargs)
|
||||||
|
for i in range(self.nbins_radial)]
|
||||||
|
|
||||||
|
def KL_density(x, p, q, p_norm, q_norm):
|
||||||
|
p = p(x) / p_norm
|
||||||
|
q = q(x) / q_norm
|
||||||
|
return p * numpy.log2(p / q)
|
||||||
|
|
||||||
|
# We loop over each halo and find its radial bin. Then calculate the
|
||||||
|
# KL divergence between the Quijote distribution and the sampled
|
||||||
|
# distribution in CSiBORG.
|
||||||
|
out = numpy.full(rdist.size, numpy.nan, dtype=numpy.float32)
|
||||||
|
cells = numpy.digitize(rdist, rbin_edges) - 1
|
||||||
|
for i, radial_cell in enumerate(tqdm(cells) if verbose else cells):
|
||||||
|
xmin, xmax = numpy.min(ndist[i, :]), numpy.max(ndist[i, :])
|
||||||
|
xrange = numpy.linspace(xmin, xmax, 1000)
|
||||||
|
|
||||||
|
ykde = gaussian_kde(ndist[i, :])(xrange)
|
||||||
|
kde = interp1d(xrange, ykde, **interp_kwargs)
|
||||||
|
kde_norm = quad(kde, xmin, xmax)[0]
|
||||||
|
|
||||||
|
out[i] = quad(
|
||||||
|
KL_density, xmin, xmax,
|
||||||
|
args=(kde, pdf_interp[radial_cell], kde_norm, 1.0),
|
||||||
|
limit=250, points=(xmin, xmax), epsabs=1e-4, epsrel=1e-4)[0]
|
||||||
|
return out
|
||||||
|
|
||||||
|
def ks_significance(self, simname, run, nsim, cdf, nobs=None):
|
||||||
|
r"""
|
||||||
|
Calculate the p-value significance of the nearest neighbour of a
|
||||||
|
reference halo relative to an unconstrained simulation using the
|
||||||
|
Kolmogorov–Smirnov test.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
|
@ -260,8 +345,7 @@ class NearestNeighbourReader:
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
sigma : 1-dimensional array of shape `(nhalos,)`
|
pval : 1-dimensional array of shape `(nhalos,)`
|
||||||
Significance of the nearest neighbour distribution of each halo.
|
|
||||||
"""
|
"""
|
||||||
assert simname in ["csiborg", "quijote"]
|
assert simname in ["csiborg", "quijote"]
|
||||||
data = self.read_single(simname, run, nsim, nobs)
|
data = self.read_single(simname, run, nsim, nobs)
|
||||||
|
@ -282,13 +366,11 @@ class NearestNeighbourReader:
|
||||||
# p-value under null hypothesis and convert it to a sigma value.
|
# p-value under null hypothesis and convert it to a sigma value.
|
||||||
out = numpy.full(rdist.size, numpy.nan, dtype=numpy.float64)
|
out = numpy.full(rdist.size, numpy.nan, dtype=numpy.float64)
|
||||||
for i, radial_cell in enumerate(numpy.digitize(rdist, rbin_edges) - 1):
|
for i, radial_cell in enumerate(numpy.digitize(rdist, rbin_edges) - 1):
|
||||||
|
|
||||||
# The null hypothesis is that the distances in Quijote are larger
|
# The null hypothesis is that the distances in Quijote are larger
|
||||||
# or equal to CSiBORG.
|
# or equal to CSiBORG.
|
||||||
ks = kstest(ndist[i, :], cdf_interp[radial_cell], N=10000,
|
ks = kstest(ndist[i, :], cdf_interp[radial_cell], N=10000,
|
||||||
alternative="greater")
|
alternative="greater", method="exact")
|
||||||
# We convert the p-value to a sigma value.
|
out[i] = numpy.log10(ks.pvalue)
|
||||||
out[i] = - numpy.sqrt(2) * erfinv(ks.pvalue - 1)
|
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue