Add marginalization over boxes (#131)

* Parallelize over simulations * Update docs * Update dependency * Update imports * Add additional dependencies * Update .gitignore * Update README * Simplify numpyro GOF * Speed up GOF * Deepcopy samples * Update scripts * Add GPU acceleration * Select boxes * Update script * Optionally sample beta * Fix old code * Simplify code * Start saving log posterior * Start popping log_likelihood * Add imports * Add converting samples * Fix script name * Add evidence with harmonic * Remove comment * Update imports * Update imports so that pylians not required * Stop requiring Pylians to be installed * Update submission scripts for loops * Update nb * Update nb * Add Manticore boxes * Add verbosity flag * Add bulk flow * Update script * Update nb * Update normalization * Update submit * Update nb
2025-06-08 09:51:12 +00:00 · 2024-06-26 10:43:26 +01:00 · 2024-06-26 10:43:26 +01:00 · ce55a2b47e
commit ce55a2b47e
parent ffaf92cd4b
16 changed files with 1436 additions and 1290 deletions
--- a/csiborgtools/__init__.py
+++ b/csiborgtools/__init__.py
@ -20,8 +20,9 @@ from .utils import (center_of_mass, delta2ncells, number_counts,
                    binned_statistic, cosine_similarity, fprint,                # noqa
                    hms_to_degrees, dms_to_degrees, great_circle_distance,      # noqa
                    radec_to_cartesian, cartesian_to_radec,                     # noqa
-                    thin_samples_by_acl, numpyro_gof, radec_to_galactic,        # noqa
-                    heliocentric_to_cmb, calculate_acl)                         # noqa
+                    thin_samples_by_acl, BIC_AIC, radec_to_galactic,            # noqa
+                    heliocentric_to_cmb, calculate_acl, harmonic_evidence,      # noqa
+                    dict_samples_to_array)                                      # noqa
 from .params import (paths_glamdring, simname2boxsize, simname2Omega_m,         # noqa
                     snap2redshift)                                             # noqa

--- a/csiborgtools/clustering/__init__.py
+++ b/csiborgtools/clustering/__init__.py
@ -12,16 +12,15 @@
 # You should have received a copy of the GNU General Public License along
 # with this program; if not, write to the Free Software Foundation, Inc.,
 # 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
-from warnings import warn
+from warnings import warn                                                       # noqa

-from csiborgtools.clustering.knn import kNN_1DCDF  # noqa
-from csiborgtools.clustering.utils import (BaseRVS, RVSinbox,  # noqa
-                                           RVSinsphere, RVSonsphere,
-                                           normalised_marks)
+from csiborgtools.clustering.knn import kNN_1DCDF                               # noqa
+from csiborgtools.clustering.utils import (                                     # noqa
+    BaseRVS, RVSinbox, RVSinsphere, RVSonsphere, normalised_marks)              # noqa

 try:
-    import Corrfunc  # noqa
-
-    from .tpcf import Mock2PCF  # noqa
+    import Corrfunc                                                             # noqa
+    from .tpcf import Mock2PCF                                                  # noqa
 except ImportError:
-    warn("`Corrfunc` not installed. 2PCF modules will not be available .")  # noqa
+    warn("`Corrfunc` not installed. 2PCF modules will not be available.",
+         UserWarning)  # noqa
--- a/csiborgtools/field/__init__.py
+++ b/csiborgtools/field/__init__.py
@ -12,15 +12,25 @@
 # You should have received a copy of the GNU General Public License along
 # with this program; if not, write to the Free Software Foundation, Inc.,
 # 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
-from .density import (DensityField, PotentialField, TidalTensorField,           # noqa
-                      VelocityField, radial_velocity, power_spectrum,           # noqa
-                      overdensity_field)                                        # noqa
+try:
+    import MAS_library as MASL                                                  # noqa
+    import Pk_library as PKL                                                    # noqa
+
+    from .density import (DensityField, PotentialField, TidalTensorField,       # noqa
+                          VelocityField, radial_velocity, power_spectrum,       # noqa
+                          overdensity_field)                                    # noqa
+    from .interp import (evaluate_cartesian_cic, evaluate_sky, evaluate_los,    # noqa
+                         field2rsp, fill_outside, make_sky,                     # noqa
+                         observer_peculiar_velocity, smoothen_field,            # noqa
+                         field_at_distance)                                     # noqa
+except ImportError:
+    from warnings import warn
+    warn("`MAS_library` and `Pk_library` not installed. `density` and "
+         "`interp` related modules are not available. "
+         "Please install `Pylians`.", UserWarning)
+
 from .enclosed_mass import (particles_enclosed_mass,                            # noqa
                            particles_enclosed_momentum, field_enclosed_mass,   # noqa
                            bulkflow_peery2018)                                 # noqa
-from .interp import (evaluate_cartesian_cic, evaluate_sky, evaluate_los,        # noqa
-                     field2rsp, fill_outside, make_sky,                         # noqa
-                     observer_peculiar_velocity, smoothen_field,                # noqa
-                     field_at_distance)                                         # noqa
 from .corr import bayesian_bootstrap_correlation                                # noqa
 from .utils import nside2radec                                                  # noqa
--- a/csiborgtools/field/enclosed_mass.py
+++ b/csiborgtools/field/enclosed_mass.py
@ -102,7 +102,7 @@ def _field_enclosed_mass(field, rmax, boxsize):
    return mass * cell_volume, volume * cell_volume


-def field_enclosed_mass(field, distances, boxsize):
+def field_enclosed_mass(field, distances, boxsize, verbose=True):
    """
    Calculate the approximate enclosed mass within a given radius from a
    density field, counts the mass in cells and volume of cells whose
@ -116,6 +116,8 @@ def field_enclosed_mass(field, distances, boxsize):
        Radii to calculate the enclosed mass at in `Mpc / h`.
    boxsize : float
        Box size in `Mpc / h`.
+    verbose : bool
+        Verbosity flag.

    Returns
    -------
@ -127,7 +129,7 @@ def field_enclosed_mass(field, distances, boxsize):
    enclosed_mass = np.zeros_like(distances)
    enclosed_volume = np.zeros_like(distances)

-    for i, dist in enumerate(tqdm(distances)):
+    for i, dist in enumerate(tqdm(distances, disable=not verbose)):
        enclosed_mass[i], enclosed_volume[i] = _field_enclosed_mass(
            field, dist, boxsize)

--- a/csiborgtools/flow/__init__.py
+++ b/csiborgtools/flow/__init__.py
@ -14,9 +14,6 @@
 # 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 from .flow_model import (DataLoader, radial_velocity_los, dist2redshift,        # noqa
                         dist2distmodulus, predict_zobs, project_Vext,          # noqa
-                         SD_PV_validation_model, SN_PV_validation_model,        # noqa
-                         TF_PV_validation_model, radec_to_galactic,             # noqa
-                         sample_prior, make_loss, get_model,                    # noqa
-                         optimize_model_with_jackknife, distmodulus2dist,       # noqa
+                         PV_validation_model, get_model, distmodulus2dist,      # noqa
                         Observed2CosmologicalRedshift,                         # noqa
                         stack_pzosmo_over_realizations)                        # noqa
--- a/csiborgtools/flow/flow_model.py
+++ b/csiborgtools/flow/flow_model.py
--- a/csiborgtools/utils.py
+++ b/csiborgtools/utils.py
@ -26,8 +26,6 @@ import numpy as np
 from astropy import units as u
 from astropy.coordinates import SkyCoord
 from numba import jit
-from numpyro.infer import util
-from scipy.stats import multivariate_normal

 ###############################################################################
 #                           Positions                                         #
@ -429,55 +427,127 @@ def thin_samples_by_acl(samples):
    return thinned_samples


-def numpyro_gof(model, mcmc, model_kwargs={}):
+###############################################################################
+#                            Model comparison                                 #
+###############################################################################
+
+
+def BIC_AIC(samples, log_likelihood, ndata):
    """
-    Get the goodness-of-fit statistics for a sampled Numpyro model. Calculates
-    the BIC and AIC using the maximum likelihood sampled point and the log
-    evidence using the Laplace approximation.
+    Get the BIC/AIC of HMC samples from a Numpyro model. Both are evaluated
+    at the sample with the highest log likelihood. The number of parameters
+    is counted from `samples`: each 1-dimensional entry contributes one
+    parameter and each 2-dimensional entry contributes the size of its last
+    axis.

    Parameters
    ----------
-    model : numpyro model
-        The model to evaluate.
-    mcmc : numpyro MCMC
-        The MCMC object containing the samples.
-    ndata : int
-        The number of data points.
-    model_kwargs : dict, optional
-        Additional keyword arguments to pass to the model.
+    samples: dict
+        Dictionary of samples from the Numpyro MCMC object.
+    log_likelihood: numpy array
+        Log likelihood values of the samples.
+    ndata: int
+        Number of data points.

    Returns
    -------
-    gof : dict
-        Dictionary containing the BIC, AIC and logZ.
+    BIC, AIC: floats
+        `BIC = nparam * log(ndata) - 2 * max(log_likelihood)` and
+        `AIC = 2 * nparam - 2 * max(log_likelihood)`.
+
+    Raises
+    ------
+    ValueError
+        If any entry of `samples` is neither 1- nor 2-dimensional.
    """
-    samples = mcmc.get_samples(group_by_chain=False)
-    log_likelihood = util.log_likelihood(model, samples, **model_kwargs)["ll"]
-
-    # Calculate the BIC using the maximum likelihood sampled point.
    kmax = np.argmax(log_likelihood)
-    nparam = len(samples)
-    try:
-        ndata = model.ndata
-    except AttributeError as e:
-        raise AttributeError("The model must have an attribute `ndata` "
-                             "indicating the number of data points.") from e
-    BIC = -2 * log_likelihood[kmax] + nparam * np.log(ndata)

-    # Calculate AIC
+    # Count the parameters: one per 1-dimensional entry, and one per element
+    # of the last axis for 2-dimensional (vector-valued) entries.
+    nparam = 0
+    for val in samples.values():
+        if val.ndim == 1:
+            nparam += 1
+        elif val.ndim == 2:
+            nparam += val.shape[-1]
+        else:
+            raise ValueError("Invalid dimensionality of samples to count the number of parameters.")  # noqa
+
+    BIC = nparam * np.log(ndata) - 2 * log_likelihood[kmax]
    AIC = 2 * nparam - 2 * log_likelihood[kmax]

-    # Calculate log(Z) using Laplace approximation.
-    X = np.vstack([samples[key] for key in samples.keys()]).T
-    mu, cov = multivariate_normal.fit(X)
-    test_sample = {key: mu[i] for i, key in enumerate(samples.keys())}
+    return float(BIC), float(AIC)

-    ll_mu = util.log_likelihood(model, test_sample, **model_kwargs)["ll"]
-    cov_det = np.linalg.det(cov)
-    D = len(mu)
-    logZ = ll_mu + 0.5 * np.log(cov_det) + D / 2 * np.log(2 * np.pi)

-    # Convert to float
-    out = {"BIC": BIC, "AIC": AIC, "logZ": logZ}
-    out = {key: float(val) for key, val in out.items()}
-    return out
+def dict_samples_to_array(samples):
+    """
+    Stack a dictionary of samples into a single 2-dimensional array.
+
+    Parameters
+    ----------
+    samples: dict
+        Dictionary of sample arrays. A 1-dimensional array becomes one
+        column named by its key, a 2-dimensional array becomes one column
+        per element of its last axis, named `{key}_{i}`.
+
+    Returns
+    -------
+    data, names: 2-dimensional array and list of str
+        The stacked samples with one column per parameter, and the matching
+        column names in the same order.
+
+    Raises
+    ------
+    ValueError
+        If any entry of `samples` is neither 1- nor 2-dimensional.
+    """
+    columns, labels = [], []
+
+    for name, draws in samples.items():
+        if draws.ndim not in (1, 2):
+            raise ValueError("Invalid dimensionality of samples to stack.")
+
+        if draws.ndim == 1:
+            columns.append(draws)
+            labels.append(name)
+            continue
+
+        # Vector-valued parameter: split it into its components.
+        ncomp = draws.shape[-1]
+        columns.extend(draws[:, k] for k in range(ncomp))
+        labels.extend(f"{name}_{k}" for k in range(ncomp))
+
+    return np.vstack(columns).T, labels
+
+
+def harmonic_evidence(samples, log_posterior, temperature=0.8, epochs_num=20,
+                      return_flow_samples=True, verbose=True):
+    """
+    Calculate the evidence using the `harmonic` package. The model has a few
+    more hyperparameters that are set to defaults now.
+
+    The chains are split in half: one half trains a rational-quadratic spline
+    flow and the other half is used to infer the evidence.
+
+    Parameters
+    ----------
+    samples: 3-dimensional array
+        MCMC samples of shape `(nchains, nsamples, ndim)`.
+    log_posterior: 2-dimensional array
+        Log posterior values of shape `(nchains, nsamples)`.
+    temperature: float, optional
+        Temperature of the `harmonic` model.
+    epochs_num: int, optional
+        Number of epochs for training the model.
+    return_flow_samples: bool, optional
+        Whether to return the flow samples.
+    verbose: bool, optional
+        Whether to print progress.
+
+    Returns
+    -------
+    ln_inv_evidence, err_ln_inv_evidence: float and tuple of floats
+        The log inverse evidence and its error.
+    flow_samples: 2-dimensional array, optional
+        Samples drawn from the trained flow, one per input sample (i.e.
+        `nchains * nsamples` in total). Returned only if
+        `return_flow_samples` is `True`. To check their agreement with the
+        input samples.
+
+    Raises
+    ------
+    ValueError
+        If `samples` is not 3-dimensional, or if `log_posterior` is not a
+        2-dimensional array matching the first two axes of `samples`.
+    ImportError
+        If the `harmonic` package is not installed.
+    """
+    # Validate the inputs first: the checks are cheap and do not need the
+    # optional `harmonic` dependency.
+    if samples.ndim != 3:
+        raise ValueError("The samples must be a 3-dimensional array of shape `(nchains, nsamples, ndim)`.")  # noqa
+
+    # Either a wrong rank or a mismatching shape invalidates the input, so
+    # the two conditions must be joined with `or`.
+    if log_posterior.ndim != 2 or log_posterior.shape != samples.shape[:2]:
+        raise ValueError("The log posterior must be a 2-dimensional array of shape `(nchains, nsamples)`.")  # noqa
+
+    try:
+        import harmonic as hm
+    except ImportError:
+        raise ImportError("The `harmonic` package is required to calculate the evidence.") from None  # noqa
+
+    ndim = samples.shape[-1]
+    chains = hm.Chains(ndim)
+    chains.add_chains_3d(samples, log_posterior)
+    chains_train, chains_infer = hm.utils.split_data(
+        chains, training_proportion=0.5)
+
+    # This has a few more hyperparameters that are set to defaults now.
+    model = hm.model.RQSplineModel(
+        ndim, standardize=True, temperature=temperature)
+    model.fit(chains_train.samples, epochs=epochs_num, verbose=verbose)
+
+    ev = hm.Evidence(chains_infer.nchains, model)
+    ev.add_chains(chains_infer)
+    ln_inv_evidence = ev.ln_evidence_inv
+    err_ln_inv_evidence = ev.compute_ln_inv_evidence_errors()
+
+    if not return_flow_samples:
+        return ln_inv_evidence, err_ln_inv_evidence
+
+    # Draw as many flow samples as there are input samples over all chains.
+    samp_num = samples.shape[0] * samples.shape[1]
+    flow_samples = model.sample(samp_num)
+
+    return ln_inv_evidence, err_ln_inv_evidence, flow_samples