More flow (#118)

* Add GoF calculation

* Add import

* Add base flow

* Add reading of ndata

* Update nb

* Update plotting

* Update script

* Update plots

* Update plot

* Add script

* Update nb

* Update nb

* Update script

* Update script

* Update nb

* Remove imports

* Improve labelling

* Improve flow calibration

* Add bulk flow plots

* Update flow

* Update script

* Calculate more radial steps

* Update bulk

* Update script

* Update nb
This commit is contained in:
Richard Stiskalek 2024-03-21 16:50:37 +01:00 committed by GitHub
parent a9cb8943d6
commit f7285b2600
12 changed files with 1144 additions and 1023 deletions

@@ -19,7 +19,7 @@ from .utils import (center_of_mass, delta2ncells, number_counts,
                     binned_statistic, cosine_similarity, fprint,  # noqa
                     hms_to_degrees, dms_to_degrees, great_circle_distance,  # noqa
                     radec_to_cartesian, cartesian_to_radec,  # noqa
-                    thin_samples_by_acl)  # noqa
+                    thin_samples_by_acl, numpyro_gof)  # noqa
 from .params import paths_glamdring, simname2boxsize, simname2Omega_m  # noqa

@@ -19,6 +19,7 @@ References
 ----------
 [1] https://arxiv.org/abs/1912.09383.
 """
+from abc import ABC
 from datetime import datetime
 from warnings import catch_warnings, simplefilter, warn
@@ -37,7 +38,7 @@ from jax.random import PRNGKey
 from numpyro.infer import Predictive, util
 from scipy.optimize import fmin_powell
 from sklearn.model_selection import KFold
-from tqdm import tqdm, trange
+from tqdm import trange
 from numdifftools import Hessian

 from ..params import simname2Omega_m
@@ -82,6 +83,8 @@ class DataLoader:
     ----------
     simname : str
         Simulation name.
+    ksim : int
+        Index of the simulation to read in (not the IC index).
     catalogue : str
         Name of the catalogue with LOS objects.
     catalogue_fpath : str
@@ -94,7 +97,7 @@
         Whether to store the full 3D velocity field. Otherwise stores only
         the radial velocity.
     """
-    def __init__(self, simname, catalogue, catalogue_fpath, paths,
+    def __init__(self, simname, ksim, catalogue, catalogue_fpath, paths,
                  ksmooth=None, store_full_velocity=False):
         print(f"{t()}: reading the catalogue.")
         self._cat = self._read_catalogue(catalogue, catalogue_fpath)
@@ -102,7 +105,7 @@
         print(f"{t()}: reading the interpolated field.")
         self._field_rdist, self._los_density, self._los_velocity = self._read_field(  # noqa
-            simname, catalogue, ksmooth, paths)
+            simname, ksim, catalogue, ksmooth, paths)

         if len(self._field_rdist) % 2 == 0:
             warn(f"The number of radial steps is even. Skipping the first "
@@ -117,7 +120,8 @@
                 "match the number of objects in the field.")

         print(f"{t()}: calculating the radial velocity.")
-        nobject, nsim = self._los_density.shape[:2]
+        nobject = len(self._los_density)
+        dtype = self._los_density.dtype

         # In case of Carrick 2015 the box is in galactic coordinates..
         if simname == "Carrick2015":
@@ -125,12 +129,10 @@
         else:
             d1, d2 = self._cat["RA"], self._cat["DEC"]

-        radvel = np.empty((nobject, nsim, len(self._field_rdist)),
-                          self._los_velocity.dtype)
-        for i in trange(nobject):
-            for j in range(nsim):
-                radvel[i, j, :] = radial_velocity_los(
-                    self._los_velocity[:, i, j, ...], d1[i], d2[i])
+        radvel = np.empty((nobject, len(self._field_rdist)), dtype)
+        for i in range(nobject):
+            radvel[i, :] = radial_velocity_los(self._los_velocity[:, i, ...],
+                                               d1[i], d2[i])
         self._los_radial_velocity = radvel

         if not store_full_velocity:
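For orientation, a minimal sketch of the projection that `radial_velocity_los` presumably performs on the new `(3, n_objects, n_steps)` velocity layout (an assumed illustration, not the package's actual implementation):

```python
import numpy as np


def radial_velocity_los_sketch(los_velocity, ra, dec):
    """Project a (3, n_steps) LOS velocity onto the unit vector towards
    the object at (RA, dec), both given in degrees."""
    ra, dec = np.deg2rad(ra), np.deg2rad(dec)
    # Cartesian unit vector pointing at the object.
    nhat = np.array([np.cos(dec) * np.cos(ra),
                     np.cos(dec) * np.sin(ra),
                     np.sin(dec)])
    # Dot product with the velocity at every radial step.
    return np.einsum("i,ij->j", nhat, los_velocity)
```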
@@ -192,7 +194,7 @@ class DataLoader:

         Returns
         ----------
-        3-dimensional array of shape (n_objects, n_simulations, n_steps)
+        2-dimensional array of shape (n_objects, n_steps)
         """
         return self._los_density[self._mask]
@@ -203,7 +205,7 @@

         Returns
         -------
-        4-dimensional array of shape (n_objects, n_simulations, 3, n_steps)
+        3-dimensional array of shape (3, n_objects, n_steps)
         """
         if self._los_velocity is None:
             raise ValueError("The 3D velocities were not stored.")
@@ -216,38 +218,29 @@

         Returns
         -------
-        3-dimensional array of shape (n_objects, n_simulations, n_steps)
+        2-dimensional array of shape (n_objects, n_steps)
         """
         return self._los_radial_velocity[self._mask]

-    def _read_field(self, simname, catalogue, k, paths):
+    def _read_field(self, simname, ksim, catalogue, ksmooth, paths):
         """Read in the interpolated field."""
-        out_density = None
-        out_velocity = None
-        has_smoothed = False
-
         nsims = paths.get_ics(simname)
+        if not (0 <= ksim < len(nsims)):
+            raise ValueError("Invalid simulation index.")
+        nsim = nsims[ksim]
+
         with File(paths.field_los(simname, catalogue), 'r') as f:
-            has_smoothed = True if f[f"density_{nsims[0]}"].ndim > 2 else False
-            if has_smoothed and (k is None or not isinstance(k, int)):
+            has_smoothed = True if f[f"density_{nsim}"].ndim > 2 else False
+            if has_smoothed and (ksmooth is None or not isinstance(ksmooth, int)):  # noqa
                 raise ValueError("The output contains smoothed field but "
                                  "`ksmooth` is None. It must be provided.")

-            for i, nsim in enumerate(tqdm(nsims)):
-                if out_density is None:
-                    nobject, nstep = f[f"density_{nsim}"].shape[:2]
-                    out_density = np.empty(
-                        (nobject, len(nsims), nstep), dtype=np.float32)
-                    out_velocity = np.empty(
-                        (3, nobject, len(nsims), nstep), dtype=np.float32)
-
-                indx = (..., k) if has_smoothed else (...)
-                out_density[:, i, :] = f[f"density_{nsim}"][indx]
-                out_velocity[:, :, i, :] = f[f"velocity_{nsim}"][indx]
+            indx = (..., ksmooth) if has_smoothed else (...)
+            los_density = f[f"density_{nsim}"][indx]
+            los_velocity = f[f"velocity_{nsim}"][indx]

             rdist = f[f"rdist_{nsims[0]}"][:]

-        return rdist, out_density, out_velocity
+        return rdist, los_density, los_velocity
     def _read_catalogue(self, catalogue, catalogue_fpath):
         """
@@ -556,7 +549,17 @@ def calculate_ll_zobs(zobs, zobs_pred, sigma_v):
     return jnp.exp(-0.5 * (dcz / sigma_v)**2) / jnp.sqrt(2 * np.pi) / sigma_v


-class SD_PV_validation_model:
+class BaseFlowValidationModel(ABC):
+    """
+    Base class for the flow validation models.
+    """
+
+    @property
+    def ndata(self):
+        return len(self._RA)
+
+
+class SD_PV_validation_model(BaseFlowValidationModel):
     """
     Simple distance peculiar velocity (PV) validation model, assuming that
     we already have a calibrated estimate of the comoving distance to the
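Because `ndata` lives on the base class and only reads `self._RA`, any subclass that stores its right ascensions there inherits it unchanged; a toy illustration (not part of the PR):

```python
import numpy as np


class ToyModel(BaseFlowValidationModel):
    """Toy subclass: setting `_RA` is all `ndata` needs."""

    def __init__(self, RA):
        self._RA = np.asarray(RA)


assert ToyModel(np.zeros(42)).ndata == 42
```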
@@ -657,7 +660,7 @@
         numpyro.factor("ll", ll)


-class SN_PV_validation_model:
+class SN_PV_validation_model(BaseFlowValidationModel):
     """
     Supernova peculiar velocity (PV) validation model that includes the
     calibration of the SALT2 light curve parameters.
@@ -793,11 +796,11 @@
             return ll + jnp.log(self._f_simps(ptilde) / pnorm), None

         ll = 0.
-        ll, __ = scan(scan_body, ll, jnp.arange(len(self._RA)))
+        ll, __ = scan(scan_body, ll, jnp.arange(self.ndata))

         numpyro.factor("ll", ll)


-class TF_PV_validation_model:
+class TF_PV_validation_model(BaseFlowValidationModel):
     """
     Tully-Fisher peculiar velocity (PV) validation model that includes the
     calibration of the Tully-Fisher distance `mu = m - (a + b * eta)`.
@@ -909,7 +912,7 @@
             return ll + jnp.log(self._f_simps(ptilde) / pnorm), None

         ll = 0.
-        ll, __ = scan(scan_body, ll, jnp.arange(len(self._RA)))
+        ll, __ = scan(scan_body, ll, jnp.arange(self.ndata))

         numpyro.factor("ll", ll)
@@ -919,7 +922,7 @@
 ###############################################################################


-def get_model(loader, k, zcmb_max=None, verbose=True):
+def get_model(loader, zcmb_max=None, verbose=True):
     """
     Get a model and extract the relevant data from the loader.
@@ -927,8 +930,6 @@ def get_model(loader, k, zcmb_max=None, verbose=True):
     ----------
     loader : DataLoader
         DataLoader instance.
-    k : int
-        Simulation index.
     zcmb_max : float, optional
         Maximum observed redshift in the CMB frame to include.
     verbose : bool, optional
@@ -940,11 +941,8 @@ def get_model(loader, k, zcmb_max=None, verbose=True):
     """
     zcmb_max = np.infty if zcmb_max is None else zcmb_max

-    if k > loader.los_density.shape[1]:
-        raise ValueError(f"Simulation index `{k}` out of range.")
-
-    los_overdensity = loader.los_density[:, k, :]
-    los_velocity = loader.los_radial_velocity[:, k, :]
+    los_overdensity = loader.los_density
+    los_velocity = loader.los_radial_velocity

     kind = loader._catname

     if kind in ["LOSS", "Foundation"]:
@@ -1160,4 +1158,5 @@ def optimize_model_with_jackknife(loader, k, n_splits=5, sample_alpha=True,
                 for key in keys]
     stats = {key: (mean[i], std[i]) for i, key in enumerate(keys)}

+    loader.reset_mask()
     return samples, stats, fmin, logz, bic

@@ -16,10 +16,12 @@
 Collection of stand-off utility functions used in the scripts.
 """
 from copy import deepcopy
+from datetime import datetime

 import numpy as np
 from numba import jit
-from datetime import datetime
+from numpyro.infer import util
+from scipy.stats import multivariate_normal

 ###############################################################################
 #                                 Positions                                   #
@@ -428,3 +430,57 @@ def thin_samples_by_acl(samples):
             thinned_samples[key] = np.hstack(key_samples)

     return thinned_samples
+
+
+def numpyro_gof(model, mcmc, model_kwargs={}):
+    """
+    Calculate goodness-of-fit statistics for a sampled Numpyro model: the BIC
+    and AIC at the maximum-likelihood sampled point, and the log-evidence
+    from the Laplace approximation.
+
+    Parameters
+    ----------
+    model : numpyro model
+        The model to evaluate.
+    mcmc : numpyro MCMC
+        The MCMC object containing the samples.
+    model_kwargs : dict, optional
+        Additional keyword arguments to pass to the model.
+
+    Returns
+    -------
+    gof : dict
+        Dictionary containing the BIC, AIC and logZ.
+    """
+    samples = mcmc.get_samples(group_by_chain=False)
+    log_likelihood = util.log_likelihood(model, samples, **model_kwargs)["ll"]
+
+    # Calculate the BIC using the maximum likelihood sampled point.
+    kmax = np.argmax(log_likelihood)
+    nparam = len(samples)
+    try:
+        ndata = model.ndata
+    except AttributeError as e:
+        raise AttributeError("The model must have an attribute `ndata` "
+                             "indicating the number of data points.") from e
+    BIC = -2 * log_likelihood[kmax] + nparam * np.log(ndata)
+
+    # Calculate the AIC at the same point.
+    AIC = 2 * nparam - 2 * log_likelihood[kmax]
+
+    # Calculate log(Z) using the Laplace approximation.
+    X = np.vstack([samples[key] for key in samples.keys()]).T
+    mu, cov = multivariate_normal.fit(X)
+    test_sample = {key: mu[i] for i, key in enumerate(samples.keys())}
+    ll_mu = util.log_likelihood(model, test_sample, **model_kwargs)["ll"]
+    cov_det = np.linalg.det(cov)
+    D = len(mu)
+    logZ = ll_mu + 0.5 * np.log(cov_det) + D / 2 * np.log(2 * np.pi)
+
+    # Convert to float.
+    out = {"BIC": BIC, "AIC": AIC, "logZ": logZ}
+    out = {key: float(val) for key, val in out.items()}
+    return out
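The log-evidence above is the Laplace approximation logZ = ln L(mu) + (D/2) ln(2 pi) + (1/2) ln|Sigma|, with (mu, Sigma) fitted to the posterior samples. A hypothetical end-to-end scoring run; the NUTS settings and the `sample_alpha` keyword are illustrative assumptions:

```python
from jax.random import PRNGKey
from numpyro.infer import MCMC, NUTS

# `model` is assumed to be one of the validation models above, which all
# expose `ndata` via BaseFlowValidationModel.
mcmc = MCMC(NUTS(model), num_warmup=500, num_samples=500)
mcmc.run(PRNGKey(42), sample_alpha=True)

gof = numpyro_gof(model, mcmc, model_kwargs={"sample_alpha": True})
print(gof)  # {"BIC": ..., "AIC": ..., "logZ": ...}
```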