Add KL divergence

This commit is contained in:
rstiskalek 2023-05-24 10:35:50 +01:00
parent 14dc8f85af
commit f82633f816

View file

@ -19,10 +19,10 @@ the final snapshot.
from math import floor from math import floor
import numpy import numpy
from scipy.interpolate import interp1d
from scipy.stats import kstest
from scipy.special import erfinv
from numba import jit from numba import jit
from scipy.integrate import quad
from scipy.interpolate import interp1d
from scipy.stats import gaussian_kde, kstest
from tqdm import tqdm from tqdm import tqdm
@ -35,10 +35,16 @@ class NearestNeighbourReader:
---------- ----------
rmax_radial : float rmax_radial : float
Radius of the high-resolution region. Radius of the high-resolution region.
nbins_radial : int
Number of radial bins.
rmax_neighbour : float
Maximum distance to consider for the nearest neighbour.
nbins_neighbour : int
Number of bins for the nearest neighbour.
paths : :py:class:`csiborgtools.read.Paths` paths : :py:class:`csiborgtools.read.Paths`
Paths object. Paths object.
**kwargs : dict
TODO: docs Other keyword arguments for backward compatibility. Not used.
""" """
_paths = None _paths = None
_rmax_radial = None _rmax_radial = None
@ -202,11 +208,11 @@ class NearestNeighbourReader:
fpath = self.paths.cross_nearest(simname, run, nsim, nobs) fpath = self.paths.cross_nearest(simname, run, nsim, nobs)
return numpy.load(fpath) return numpy.load(fpath)
def build_cdf(self, simname, run, verbose=True): def build_dist(self, simname, run, kind, verbose=True):
""" """
Build the CDF for the nearest neighbour distribution. Counts the binned Build a PDF or a CDF for the nearest neighbour distribution.
number of neighbours for each halo as a function of its radial distance Counts the binned number of neighbours for each halo as a function of its
from the centre of the high-resolution region and converts it to a CDF. radial distance from the centre of the high-resolution region.
Parameters Parameters
---------- ----------
@ -214,18 +220,23 @@ class NearestNeighbourReader:
Simulation name. Must be either `csiborg` or `quijote`. Simulation name. Must be either `csiborg` or `quijote`.
run : str run : str
Run name. Run name.
kind : str
Distribution kind. Either `pdf` or `cdf`.
verbose : bool, optional verbose : bool, optional
Verbosity flag. Verbosity flag.
Returns Returns
------- -------
cdf : 2-dimensional array of shape `(nbins_radial, nbins_neighbour)` dist : 2-dimensional array of shape `(nbins_radial, nbins_neighbour)`
""" """
assert simname in ["csiborg", "quijote"] assert simname in ["csiborg", "quijote"]
assert kind in ["pdf", "cdf"]
rbin_edges = self.radial_bin_edges rbin_edges = self.radial_bin_edges
# We first bin the distances as a function of each reference halo # We first bin the distances as a function of each reference halo
# radial distance and then its nearest neighbour distance. # radial distance and then its nearest neighbour distance.
fpaths = self.paths.cross_nearest(simname, run) fpaths = self.paths.cross_nearest(simname, run)
if simname == "quijote":
fpaths = fpaths[:200] # TODO remove later.
out = numpy.zeros((self.nbins_radial, self.nbins_neighbour), out = numpy.zeros((self.nbins_radial, self.nbins_neighbour),
dtype=numpy.float32) dtype=numpy.float32)
for fpath in tqdm(fpaths) if verbose else fpaths: for fpath in tqdm(fpaths) if verbose else fpaths:
@ -234,15 +245,89 @@ class NearestNeighbourReader:
out, data["ndist"], data["rdist"], rbin_edges, out, data["ndist"], data["rdist"], rbin_edges,
self.rmax_neighbour, self.nbins_neighbour) self.rmax_neighbour, self.nbins_neighbour)
# We then build up a CDF for each radial bin. if kind == "pdf":
neighbour_bin_edges = self.neighbour_bin_edges
dx = neighbour_bin_edges[1] - neighbour_bin_edges[0]
out /= numpy.sum(dx * out, axis=1).reshape(-1, 1)
else:
out = numpy.cumsum(out, axis=1, out=out) out = numpy.cumsum(out, axis=1, out=out)
out /= out[:, -1].reshape(-1, 1) out /= out[:, -1].reshape(-1, 1)
return out return out
def calc_significance(self, simname, run, nsim, cdf, nobs=None): def kl_divergence(self, simname, run, nsim, pdf, nobs=None, verbose=True):
r"""
Calculate the Kullback-Leibler divergence of the nearest neighbour
distribution of a reference halo relative to an expected distribution
from an unconstrained suite of simulations. Approximates reference halo
neighbour distribution with a Gaussian KDE.
Parameters
----------
simname : str
Simulation name. Must be either `csiborg` or `quijote`.
run : str
Run name.
nsim : int
Simulation index.
pdf : 2-dimensional array of shape `(nbins_radial, nbins_neighbour)`
PDF of the nearest neighbour distribution in an unconstrained
suite of simulations.
nobs : int, optional
Fiducial Quijote observer index.
verbose : bool, optional
Verbosity flag.
Returns
-------
kl_divergence: 1-dimensional array of shape `(nhalos,)`
Information gain from going from the expected distribution to the
observed distribution in bits.
""" """
Calculate the significance of the nearest neighbour distribution of a assert simname in ["csiborg", "quijote"]
reference halo relative to an unconstrained simulation. data = self.read_single(simname, run, nsim, nobs)
rdist = data["rdist"]
ndist = data["ndist"]
rbin_edges = self.radial_bin_edges
# Create an interpolation function for each radial bin.
xbin = self.bin_centres("neighbour")
interp_kwargs = {"kind": "cubic",
"bounds_error": False,
"fill_value": 1e-16,
"assume_sorted": True}
pdf_interp = [interp1d(xbin, pdf[i, :], **interp_kwargs)
for i in range(self.nbins_radial)]
def KL_density(x, p, q, p_norm, q_norm):
p = p(x) / p_norm
q = q(x) / q_norm
return p * numpy.log2(p / q)
# We loop over each halo and find its radial bin. Then calculate the
# KL divergence between the Quijote distribution and the sampled
# distribution in CSiBORG.
out = numpy.full(rdist.size, numpy.nan, dtype=numpy.float32)
cells = numpy.digitize(rdist, rbin_edges) - 1
for i, radial_cell in enumerate(tqdm(cells) if verbose else cells):
xmin, xmax = numpy.min(ndist[i, :]), numpy.max(ndist[i, :])
xrange = numpy.linspace(xmin, xmax, 1000)
ykde = gaussian_kde(ndist[i, :])(xrange)
kde = interp1d(xrange, ykde, **interp_kwargs)
kde_norm = quad(kde, xmin, xmax)[0]
out[i] = quad(
KL_density, xmin, xmax,
args=(kde, pdf_interp[radial_cell], kde_norm, 1.0),
limit=250, points=(xmin, xmax), epsabs=1e-4, epsrel=1e-4)[0]
return out
def ks_significance(self, simname, run, nsim, cdf, nobs=None):
r"""
Calculate the p-value significance of the nearest neighbour of a
reference halo relative to an unconstrained simulation using the
Kolmogorov-Smirnov test.
Parameters Parameters
---------- ----------
@ -260,8 +345,7 @@ class NearestNeighbourReader:
Returns Returns
------- -------
sigma : 1-dimensional array of shape `(nhalos,)` pval : 1-dimensional array of shape `(nhalos,)`
Significance of the nearest neighbour distribution of each halo.
""" """
assert simname in ["csiborg", "quijote"] assert simname in ["csiborg", "quijote"]
data = self.read_single(simname, run, nsim, nobs) data = self.read_single(simname, run, nsim, nobs)
@ -282,13 +366,11 @@ class NearestNeighbourReader:
# p-value under null hypothesis and convert it to a sigma value. # p-value under null hypothesis and convert it to a sigma value.
out = numpy.full(rdist.size, numpy.nan, dtype=numpy.float64) out = numpy.full(rdist.size, numpy.nan, dtype=numpy.float64)
for i, radial_cell in enumerate(numpy.digitize(rdist, rbin_edges) - 1): for i, radial_cell in enumerate(numpy.digitize(rdist, rbin_edges) - 1):
# The null hypothesis is that the distances in Quijote are larger # The null hypothesis is that the distances in Quijote are larger
# or equal to CSiBORG. # or equal to CSiBORG.
ks = kstest(ndist[i, :], cdf_interp[radial_cell], N=10000, ks = kstest(ndist[i, :], cdf_interp[radial_cell], N=10000,
alternative="greater") alternative="greater", method="exact")
# We convert the p-value to a sigma value. out[i] = numpy.log10(ks.pvalue)
out[i] = - numpy.sqrt(2) * erfinv(ks.pvalue - 1)
return out return out