More flow (#118)

* Add GoF calculation

* Add import

* Add base flow

* Add reading of ndata

* Update nb

* Update plotting

* Update script

* Update plots

* Update plot

* Add script

* Update nb

* Update nb

* Update script

* Update script

* Update nb

* Remove imports

* Improve labelling

* Improve flow calibration

* Add bulk flow plots

* Update flow

* Update script

* Calculate more radial steps

* Update bulk

* Update script

* Update nb
This commit is contained in:
Richard Stiskalek 2024-03-21 16:50:37 +01:00 committed by GitHub
parent a9cb8943d6
commit f7285b2600
12 changed files with 1144 additions and 1023 deletions

@@ -19,7 +19,7 @@ from .utils import (center_of_mass, delta2ncells, number_counts,
                     binned_statistic, cosine_similarity, fprint,  # noqa
                     hms_to_degrees, dms_to_degrees, great_circle_distance,  # noqa
                     radec_to_cartesian, cartesian_to_radec,  # noqa
-                    thin_samples_by_acl)  # noqa
+                    thin_samples_by_acl, numpyro_gof)  # noqa
 from .params import paths_glamdring, simname2boxsize, simname2Omega_m  # noqa

@@ -19,6 +19,7 @@ References
 ----------
 [1] https://arxiv.org/abs/1912.09383.
 """
+from abc import ABC
 from datetime import datetime
 from warnings import catch_warnings, simplefilter, warn
@@ -37,7 +38,7 @@ from jax.random import PRNGKey
 from numpyro.infer import Predictive, util
 from scipy.optimize import fmin_powell
 from sklearn.model_selection import KFold
-from tqdm import tqdm, trange
+from tqdm import trange
 from numdifftools import Hessian

 from ..params import simname2Omega_m
@@ -82,6 +83,8 @@ class DataLoader:
     ----------
     simname : str
         Simulation name.
+    ksim : int
+        Index of the simulation to read in (not the IC index).
     catalogue : str
         Name of the catalogue with LOS objects.
     catalogue_fpath : str
@@ -94,7 +97,7 @@
         Whether to store the full 3D velocity field. Otherwise stores only
         the radial velocity.
     """
-    def __init__(self, simname, catalogue, catalogue_fpath, paths,
+    def __init__(self, simname, ksim, catalogue, catalogue_fpath, paths,
                  ksmooth=None, store_full_velocity=False):
         print(f"{t()}: reading the catalogue.")
         self._cat = self._read_catalogue(catalogue, catalogue_fpath)
@@ -102,7 +105,7 @@
         print(f"{t()}: reading the interpolated field.")
         self._field_rdist, self._los_density, self._los_velocity = self._read_field(  # noqa
-            simname, catalogue, ksmooth, paths)
+            simname, ksim, catalogue, ksmooth, paths)

         if len(self._field_rdist) % 2 == 0:
             warn(f"The number of radial steps is even. Skipping the first "
@@ -117,7 +120,8 @@
                 "match the number of objects in the field.")

         print(f"{t()}: calculating the radial velocity.")
-        nobject, nsim = self._los_density.shape[:2]
+        nobject = len(self._los_density)
+        dtype = self._los_density.dtype

         # In case of Carrick 2015 the box is in galactic coordinates..
         if simname == "Carrick2015":
@@ -125,12 +129,10 @@
         else:
             d1, d2 = self._cat["RA"], self._cat["DEC"]

-        radvel = np.empty((nobject, nsim, len(self._field_rdist)),
-                          self._los_velocity.dtype)
-        for i in trange(nobject):
-            for j in range(nsim):
-                radvel[i, j, :] = radial_velocity_los(
-                    self._los_velocity[:, i, j, ...], d1[i], d2[i])
+        radvel = np.empty((nobject, len(self._field_rdist)), dtype)
+        for i in range(nobject):
+            radvel[i, :] = radial_velocity_los(self._los_velocity[:, i, ...],
+                                               d1[i], d2[i])
         self._los_radial_velocity = radvel

         if not store_full_velocity:
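For orientation, a minimal sketch of the projection that `radial_velocity_los` presumably performs on the new `(3, n_objects, n_steps)` velocity layout (an assumed illustration, not the package's actual implementation):

```python
import numpy as np


def radial_velocity_los_sketch(los_velocity, ra, dec):
    """Project a (3, n_steps) LOS velocity onto the unit vector towards
    the object at (RA, dec), both given in degrees."""
    ra, dec = np.deg2rad(ra), np.deg2rad(dec)
    # Cartesian unit vector pointing at the object.
    nhat = np.array([np.cos(dec) * np.cos(ra),
                     np.cos(dec) * np.sin(ra),
                     np.sin(dec)])
    # Dot product with the velocity at every radial step.
    return np.einsum("i,ij->j", nhat, los_velocity)
```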
@@ -192,7 +194,7 @@ class DataLoader:

         Returns
         ----------
-        3-dimensional array of shape (n_objects, n_simulations, n_steps)
+        2-dimensional array of shape (n_objects, n_steps)
         """
         return self._los_density[self._mask]
@@ -203,7 +205,7 @@

         Returns
         -------
-        4-dimensional array of shape (n_objects, n_simulations, 3, n_steps)
+        3-dimensional array of shape (3, n_objects, n_steps)
         """
         if self._los_velocity is None:
             raise ValueError("The 3D velocities were not stored.")
@@ -216,38 +218,29 @@

         Returns
         -------
-        3-dimensional array of shape (n_objects, n_simulations, n_steps)
+        2-dimensional array of shape (n_objects, n_steps)
         """
         return self._los_radial_velocity[self._mask]

-    def _read_field(self, simname, catalogue, k, paths):
+    def _read_field(self, simname, ksim, catalogue, ksmooth, paths):
         """Read in the interpolated field."""
-        out_density = None
-        out_velocity = None
-        has_smoothed = False
-
         nsims = paths.get_ics(simname)
+        if not (0 <= ksim < len(nsims)):
+            raise ValueError("Invalid simulation index.")
+        nsim = nsims[ksim]
+
         with File(paths.field_los(simname, catalogue), 'r') as f:
-            has_smoothed = True if f[f"density_{nsims[0]}"].ndim > 2 else False
-            if has_smoothed and (k is None or not isinstance(k, int)):
+            has_smoothed = True if f[f"density_{nsim}"].ndim > 2 else False
+            if has_smoothed and (ksmooth is None or not isinstance(ksmooth, int)):  # noqa
                 raise ValueError("The output contains smoothed field but "
                                  "`ksmooth` is None. It must be provided.")

-            for i, nsim in enumerate(tqdm(nsims)):
-                if out_density is None:
-                    nobject, nstep = f[f"density_{nsim}"].shape[:2]
-                    out_density = np.empty(
-                        (nobject, len(nsims), nstep), dtype=np.float32)
-                    out_velocity = np.empty(
-                        (3, nobject, len(nsims), nstep), dtype=np.float32)
-
-                indx = (..., k) if has_smoothed else (...)
-                out_density[:, i, :] = f[f"density_{nsim}"][indx]
-                out_velocity[:, :, i, :] = f[f"velocity_{nsim}"][indx]
+            indx = (..., ksmooth) if has_smoothed else (...)
+            los_density = f[f"density_{nsim}"][indx]
+            los_velocity = f[f"velocity_{nsim}"][indx]

             rdist = f[f"rdist_{nsims[0]}"][:]

-        return rdist, out_density, out_velocity
+        return rdist, los_density, los_velocity
     def _read_catalogue(self, catalogue, catalogue_fpath):
         """
@@ -556,7 +549,17 @@ def calculate_ll_zobs(zobs, zobs_pred, sigma_v):
     return jnp.exp(-0.5 * (dcz / sigma_v)**2) / jnp.sqrt(2 * np.pi) / sigma_v


-class SD_PV_validation_model:
+class BaseFlowValidationModel(ABC):
+    """
+    Base class for the flow validation models.
+    """
+
+    @property
+    def ndata(self):
+        return len(self._RA)
+
+
+class SD_PV_validation_model(BaseFlowValidationModel):
     """
     Simple distance peculiar velocity (PV) validation model, assuming that
     we already have a calibrated estimate of the comoving distance to the
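Because `ndata` lives on the base class and only reads `self._RA`, any subclass that stores its right ascensions there inherits it unchanged; a toy illustration (not part of the PR):

```python
import numpy as np


class ToyModel(BaseFlowValidationModel):
    """Toy subclass: setting `_RA` is all `ndata` needs."""

    def __init__(self, RA):
        self._RA = np.asarray(RA)


assert ToyModel(np.zeros(42)).ndata == 42
```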
@@ -657,7 +660,7 @@
         numpyro.factor("ll", ll)


-class SN_PV_validation_model:
+class SN_PV_validation_model(BaseFlowValidationModel):
     """
     Supernova peculiar velocity (PV) validation model that includes the
     calibration of the SALT2 light curve parameters.
@@ -793,11 +796,11 @@
             return ll + jnp.log(self._f_simps(ptilde) / pnorm), None

         ll = 0.
-        ll, __ = scan(scan_body, ll, jnp.arange(len(self._RA)))
+        ll, __ = scan(scan_body, ll, jnp.arange(self.ndata))

         numpyro.factor("ll", ll)


-class TF_PV_validation_model:
+class TF_PV_validation_model(BaseFlowValidationModel):
     """
     Tully-Fisher peculiar velocity (PV) validation model that includes the
     calibration of the Tully-Fisher distance `mu = m - (a + b * eta)`.
@@ -909,7 +912,7 @@
             return ll + jnp.log(self._f_simps(ptilde) / pnorm), None

         ll = 0.
-        ll, __ = scan(scan_body, ll, jnp.arange(len(self._RA)))
+        ll, __ = scan(scan_body, ll, jnp.arange(self.ndata))

         numpyro.factor("ll", ll)
@@ -919,7 +922,7 @@
 ###############################################################################


-def get_model(loader, k, zcmb_max=None, verbose=True):
+def get_model(loader, zcmb_max=None, verbose=True):
     """
     Get a model and extract the relevant data from the loader.
@@ -927,8 +930,6 @@ def get_model(loader, k, zcmb_max=None, verbose=True):
     ----------
     loader : DataLoader
         DataLoader instance.
-    k : int
-        Simulation index.
     zcmb_max : float, optional
         Maximum observed redshift in the CMB frame to include.
     verbose : bool, optional
@@ -940,11 +941,8 @@ def get_model(loader, k, zcmb_max=None, verbose=True):
     """
     zcmb_max = np.infty if zcmb_max is None else zcmb_max

-    if k > loader.los_density.shape[1]:
-        raise ValueError(f"Simulation index `{k}` out of range.")
-
-    los_overdensity = loader.los_density[:, k, :]
-    los_velocity = loader.los_radial_velocity[:, k, :]
+    los_overdensity = loader.los_density
+    los_velocity = loader.los_radial_velocity

     kind = loader._catname

     if kind in ["LOSS", "Foundation"]:
@@ -1160,4 +1158,5 @@ def optimize_model_with_jackknife(loader, k, n_splits=5, sample_alpha=True,
                 for key in keys]
     stats = {key: (mean[i], std[i]) for i, key in enumerate(keys)}

+    loader.reset_mask()
     return samples, stats, fmin, logz, bic

@@ -16,10 +16,12 @@
 Collection of stand-off utility functions used in the scripts.
 """
 from copy import deepcopy
+from datetime import datetime

 import numpy as np
 from numba import jit
-from datetime import datetime
+from numpyro.infer import util
+from scipy.stats import multivariate_normal

 ###############################################################################
 #                                 Positions                                   #
@@ -428,3 +430,57 @@ def thin_samples_by_acl(samples):
             thinned_samples[key] = np.hstack(key_samples)

     return thinned_samples
+
+
+def numpyro_gof(model, mcmc, model_kwargs={}):
+    """
+    Calculate goodness-of-fit statistics for a sampled Numpyro model: the BIC
+    and AIC at the maximum-likelihood sampled point, and the log-evidence
+    from the Laplace approximation.
+
+    Parameters
+    ----------
+    model : numpyro model
+        The model to evaluate.
+    mcmc : numpyro MCMC
+        The MCMC object containing the samples.
+    model_kwargs : dict, optional
+        Additional keyword arguments to pass to the model.
+
+    Returns
+    -------
+    gof : dict
+        Dictionary containing the BIC, AIC and logZ.
+    """
+    samples = mcmc.get_samples(group_by_chain=False)
+    log_likelihood = util.log_likelihood(model, samples, **model_kwargs)["ll"]
+
+    # Calculate the BIC using the maximum likelihood sampled point.
+    kmax = np.argmax(log_likelihood)
+    nparam = len(samples)
+    try:
+        ndata = model.ndata
+    except AttributeError as e:
+        raise AttributeError("The model must have an attribute `ndata` "
+                             "indicating the number of data points.") from e
+    BIC = -2 * log_likelihood[kmax] + nparam * np.log(ndata)
+
+    # Calculate the AIC at the same point.
+    AIC = 2 * nparam - 2 * log_likelihood[kmax]
+
+    # Calculate log(Z) using the Laplace approximation.
+    X = np.vstack([samples[key] for key in samples.keys()]).T
+    mu, cov = multivariate_normal.fit(X)
+    test_sample = {key: mu[i] for i, key in enumerate(samples.keys())}
+    ll_mu = util.log_likelihood(model, test_sample, **model_kwargs)["ll"]
+    cov_det = np.linalg.det(cov)
+    D = len(mu)
+    logZ = ll_mu + 0.5 * np.log(cov_det) + D / 2 * np.log(2 * np.pi)
+
+    # Convert to float.
+    out = {"BIC": BIC, "AIC": AIC, "logZ": logZ}
+    out = {key: float(val) for key, val in out.items()}
+    return out
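The log-evidence above is the Laplace approximation logZ = ln L(mu) + (D/2) ln(2 pi) + (1/2) ln|Sigma|, with (mu, Sigma) fitted to the posterior samples. A hypothetical end-to-end scoring run; the NUTS settings and the `sample_alpha` keyword are illustrative assumptions:

```python
from jax.random import PRNGKey
from numpyro.infer import MCMC, NUTS

# `model` is assumed to be one of the validation models above, which all
# expose `ndata` via BaseFlowValidationModel.
mcmc = MCMC(NUTS(model), num_warmup=500, num_samples=500)
mcmc.run(PRNGKey(42), sample_alpha=True)

gof = numpyro_gof(model, mcmc, model_kwargs={"sample_alpha": True})
print(gof)  # {"BIC": ..., "AIC": ..., "logZ": ...}
```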