Add KL divergence

2025-06-08 01:41:12 +00:00 · 2023-05-24 10:35:50 +01:00 · 2023-05-24 10:35:50 +01:00 · f82633f816
commit f82633f816
parent 14dc8f85af
1 changed files with 104 additions and 22 deletions
--- a/csiborgtools/read/nearest_neighbour_summary.py
+++ b/csiborgtools/read/nearest_neighbour_summary.py
@ -19,10 +19,10 @@ the final snapshot.
 from math import floor
 import numpy
 from scipy.interpolate import interp1d
 from scipy.stats import kstest
 from scipy.special import erfinv
 from numba import jit
 from scipy.integrate import quad
 from scipy.interpolate import interp1d
 from scipy.stats import gaussian_kde, kstest
 from tqdm import tqdm
@ -35,10 +35,16 @@ class NearestNeighbourReader:
    ----------
    rmax_radial : float
        Radius of the high-resolution region.
    nbins_radial : int
        Number of radial bins.
    rmax_neighbour : float
        Maximum distance to consider for the nearest neighbour.
    nbins_neighbour : int
        Number of bins for the nearest neighbour.
    paths : py:class`csiborgtools.read.Paths`
        Paths object.
-
+    **kwargs : dict
-    TODO: docs
+        Other keyword arguments for backward compatibility. Not used.
    """
    _paths = None
    _rmax_radial = None
@ -202,11 +208,11 @@ class NearestNeighbourReader:
        fpath = self.paths.cross_nearest(simname, run, nsim, nobs)
        return numpy.load(fpath)
-    def build_cdf(self, simname, run, verbose=True):
+    def build_dist(self, simname, run, kind, verbose=True):
        """
-        Build the CDF for the nearest neighbour distribution. Counts the binned
+        Build the a PDF or a CDF for the nearest neighbour distribution.
-        number of neighbour for each halo as a funtion of its radial distance
+        Counts the binned number of neighbour for each halo as a funtion of its
-        from the centre of the high-resolution region and converts it to a CDF.
+        radial distance from the centre of the high-resolution region.
        Parameters
        ----------
@ -214,18 +220,23 @@ class NearestNeighbourReader:
            Simulation name. Must be either `csiborg` or `quijote`.
        run : str
            Run name.
        kind : str
            Distribution kind. Either `pdf` or `cdf`.
        verbose : bool, optional
            Verbosity flag.
        Returns
        -------
-        cdf : 2-dimensional array of shape `(nbins_radial, nbins_neighbour)`
+        dist : 2-dimensional array of shape `(nbins_radial, nbins_neighbour)`
        """
        assert simname in ["csiborg", "quijote"]
        assert kind in ["pdf", "cdf"]
        rbin_edges = self.radial_bin_edges
        # We first bin the distances as a function of each reference halo
        # radial distance and then its nearest neighbour distance.
        fpaths = self.paths.cross_nearest(simname, run)
        if simname == "quijote":
            fpaths = fpaths[:200]  # TODO remove later.
        out = numpy.zeros((self.nbins_radial, self.nbins_neighbour),
                          dtype=numpy.float32)
        for fpath in tqdm(fpaths) if verbose else fpaths:
@ -234,15 +245,89 @@ class NearestNeighbourReader:
                out, data["ndist"], data["rdist"], rbin_edges,
                self.rmax_neighbour, self.nbins_neighbour)
-        # We then build up a CDF for each radial bin.
+        if kind == "pdf":
            neighbour_bin_edges = self.neighbour_bin_edges
            dx = neighbour_bin_edges[1] - neighbour_bin_edges[0]
            out /= numpy.sum(dx * out, axis=1).reshape(-1, 1)
        else:
            out = numpy.cumsum(out, axis=1, out=out)
            out /= out[:, -1].reshape(-1, 1)
        return out
-    def calc_significance(self, simname, run, nsim, cdf, nobs=None):
+    def kl_divergence(self, simname, run, nsim, pdf, nobs=None, verbose=True):
        r"""
        Calculate the Kullback-Leibler divergence of the nearest neighbour
        distribution of a reference halo relative to an expected distribution
        from an unconstrained suite of simulations. Approximates reference halo
        neighbour distribution with a Gaussian KDE.
        Parameters
        ----------
        simname : str
            Simulation name. Must be either `csiborg` or `quijote`.
        run : str
            Run name.
        nsim : int
            Simulation index.
        cdf : 2-dimensional array of shape `(nbins_radial, nbins_neighbour)`
            CDF of the nearest neighbour distribution in an unconstrained
            suite of simiulations.
        nobs : int, optional
            Fiducial Quijote observer index.
        verbose : bool, optional
            Verbosity flag.
        Returns
        -------
        kl_divergence: 1-dimensional array of shape `(nhalos,)`
            Information gain from going from the expected distribution to the
            observed distribution in bits.
        """
-        Calculate the significance of the nearest neighbour distribution of a
+        assert simname in ["csiborg", "quijote"]
-        reference halo relative to an unconstrained simulation.
+        data = self.read_single(simname, run, nsim, nobs)
        rdist = data["rdist"]
        ndist = data["ndist"]
        rbin_edges = self.radial_bin_edges
        # Create an interpolation function for each radial bin.
        xbin = self.bin_centres("neighbour")
        interp_kwargs = {"kind": "cubic",
                         "bounds_error": False,
                         "fill_value": 1e-16,
                         "assume_sorted": True}
        pdf_interp = [interp1d(xbin, pdf[i, :], **interp_kwargs)
                      for i in range(self.nbins_radial)]
        def KL_density(x, p, q, p_norm, q_norm):
            p = p(x) / p_norm
            q = q(x) / q_norm
            return p * numpy.log2(p / q)
        # We loop over each halo and find its radial bin. Then calculate the
        # KL divergence between the Quijote distribution and the sampled
        # distribution in CSiBORG.
        out = numpy.full(rdist.size, numpy.nan, dtype=numpy.float32)
        cells = numpy.digitize(rdist, rbin_edges) - 1
        for i, radial_cell in enumerate(tqdm(cells) if verbose else cells):
            xmin, xmax = numpy.min(ndist[i, :]), numpy.max(ndist[i, :])
            xrange = numpy.linspace(xmin, xmax, 1000)
            ykde = gaussian_kde(ndist[i, :])(xrange)
            kde = interp1d(xrange, ykde, **interp_kwargs)
            kde_norm = quad(kde, xmin, xmax)[0]
            out[i] = quad(
                KL_density, xmin, xmax,
                args=(kde, pdf_interp[radial_cell], kde_norm, 1.0),
                limit=250, points=(xmin, xmax), epsabs=1e-4, epsrel=1e-4)[0]
        return out
    def ks_significance(self, simname, run, nsim, cdf, nobs=None):
        r"""
        Calculate the p-value significance of the nearest neighbour of a
        reference halo relative to an unconstrained simulation using the
        Kolmogorov–Smirnov test.
        Parameters
        ----------
@ -260,8 +345,7 @@ class NearestNeighbourReader:
        Returns
        -------
-        sigma : 1-dimensional array of shape `(nhalos,)`
+        pval : 1-dimensional array of shape `(nhalos,)`
            Significance of the nearest neighbour distribution of each halo.
        """
        assert simname in ["csiborg", "quijote"]
        data = self.read_single(simname, run, nsim, nobs)
@ -282,13 +366,11 @@ class NearestNeighbourReader:
        # p-value under null hypothesis and convert it to a sigma value.
        out = numpy.full(rdist.size, numpy.nan, dtype=numpy.float64)
        for i, radial_cell in enumerate(numpy.digitize(rdist, rbin_edges) - 1):
            # The null hypothesis is that the distances in Quijote are larger
            # or equal to CSiBORG.
            ks = kstest(ndist[i, :], cdf_interp[radial_cell], N=10000,
-                        alternative="greater")
+                        alternative="greater", method="exact")
-            # We convert the p-value to a sigma value.
+            out[i] = numpy.log10(ks.pvalue)
            out[i] = - numpy.sqrt(2) * erfinv(ks.pvalue - 1)
        return out