Matching paper plots (#91)

* Fix calculations of expected mass

* Add paper plots

* Edits to plots

* Add overlap summary

* Add imports

* Add import

* Add binned stat

* Add fit

* Add more plots

* Add basic env

* Add histogram mode

* Edit expected mass

* Improve expected plots

* Clean up plot

* Improve separation plot

* Update plots

* Edit expected calculation

* Update plotting

* Update plots

* Update plots

* Update plots

* Add conc fraction

* Add halo maker sorting

* Renaming

* Add import

* Add NaN treatment

* Add import

* Move cosine similarity

* Update plots

* Move similarity

* Fix little bugs

* Shorten documentation

* Update plots
Richard Stiskalek 2023-10-17 12:11:15 +01:00 committed by GitHub
parent 136c552369
commit 5500fbd2b9
11 changed files with 2193 additions and 294 deletions


@@ -12,10 +12,12 @@
 # You should have received a copy of the GNU General Public License along
 # with this program; if not, write to the Free Software Foundation, Inc.,
 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 from csiborgtools import clustering, field, match, read, summary  # noqa
-from .utils import (center_of_mass, delta2ncells, number_counts,  # noqa
-                    periodic_distance, periodic_distance_two_points)  # noqa
+from .utils import (center_of_mass, delta2ncells, number_counts,  # noqa
+                    periodic_distance, periodic_distance_two_points,  # noqa
+                    binned_statistic, cosine_similarity)  # noqa

 # Arguments to csiborgtools.read.Paths.
 paths_glamdring = {"srcdir": "/mnt/extraspace/hdesmond/",


@@ -14,5 +14,5 @@
 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 from .match import (ParticleOverlap, RealisationsMatcher,  # noqa
                     calculate_overlap, calculate_overlap_indxs, pos2cell,  # noqa
-                    cosine_similarity, find_neighbour, get_halo_cell_limits,  # noqa
+                    find_neighbour, get_halo_cell_limits,  # noqa
                     matching_max)  # noqa


@@ -660,19 +660,12 @@ class ParticleOverlap(BaseMatcher):

 def pos2cell(pos, ncells):
-    """
-    Convert position to cell number if there are `ncells` cells along the
-    axis.
-    """
     if pos.dtype.char in numpy.typecodes["AllInteger"]:
         return pos
     return numpy.floor(pos * ncells).astype(numpy.int32)


 def read_nshift(smooth_kwargs):
-    """
-    Determine the number of cells to pad the density field if smoothing is
-    applied. Defaults to the ceiling of three times the smoothing scale.
-    """
     return 0 if smooth_kwargs is None else ceil(3 * smooth_kwargs["sigma"])
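For orientation, a minimal sketch of what the two helpers above do (toy values; `pos2cell` is exported from `csiborgtools.match`, `read_nshift` is internal):

import numpy
from math import ceil

# Fractional box positions in [0, 1); with ncells = 4 they land in cells 0..3.
pos = numpy.array([[0.10, 0.49, 0.99]], dtype=numpy.float32)
print(numpy.floor(pos * 4).astype(numpy.int32))  # [[0 1 3]]

# Padding chosen by read_nshift for Gaussian smoothing with sigma = 1.5 cells.
print(ceil(3 * 1.5))  # 5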
@@ -774,33 +767,26 @@ def get_halo_cell_limits(pos, ncells, nshift=0):
     return mins, maxs


-@jit(nopython=True)
+@jit(nopython=True, boundscheck=False)
 def calculate_overlap(delta1, delta2, cellmins, delta_bckg, box_size,
                       bckg_halfsize):
-    r"""
-    Overlap between two halos whose density fields are evaluated on the
-    same grid. This is a JIT implementation, hence it is outside of the main
-    class.
+    """
+    Calculate overlap between two halos' density fields on the same grid.

     Parameters
     ----------
-    delta1: 3-dimensional array
-        Density field of the first halo.
-    delta2 : 3-dimensional array
-        Density field of the second halo.
-    cellmins : len-3 tuple
-        Tuple of lower cell ID in the full box.
-    delta_bckg : 3-dimensional array
-        Summed background density field of the reference and cross simulations
-        calculated with particles assigned to halos at the final snapshot.
-        Calculated on a grid determined by `bckg_halfsize`.
+    delta1, delta2 : 3D array
+        Density fields of the first and second halos, respectively.
+    cellmins : tuple (len=3)
+        Lower cell ID in the full box.
+    delta_bckg : 3D array
+        Combined background density field of reference and cross simulations
+        on `bckg_halfsize` grid.
     box_size : int
-        Number of cells in the box.
+        Cell count in the box.
     bckg_halfsize : int
-        Background half-size for density field calculation. This is the
-        grid distance from the center of the box to each side over which to
-        evaluate the background density field. Must be less than or equal to
-        half the box size.
+        Grid distance from box center for background density;
+        0.5 * box_size.

     Returns
     -------
@@ -834,39 +820,29 @@ def calculate_overlap(delta1, delta2, cellmins, delta_bckg, box_size,
     return intersect / (totmass - intersect)


-@jit(nopython=True)
+@jit(nopython=True, boundscheck=False)
 def calculate_overlap_indxs(delta1, delta2, cellmins, delta_bckg, nonzero,
                             mass1, mass2, box_size, bckg_halfsize):
-    r"""
-    Overlap between two haloes whose density fields are evaluated on the
-    same grid and `nonzero1` enumerates the non-zero cells of `delta1. This is
-    a JIT implementation, hence it is outside of the main class.
+    """
+    Calculate overlap of two halos' density fields on the same grid.

     Parameters
     ----------
-    delta1: 3-dimensional array
-        Density field of the first halo.
-    delta2 : 3-dimensional array
-        Density field of the second halo.
-    cellmins : len-3 tuple
-        Tuple of lower cell ID in the full box.
-    delta_bckg : 3-dimensional array
-        Summed background density field of the reference and cross simulations
-        calculated with particles assigned to halos at the final snapshot.
-        Calculated on a grid determined by `bckg_halfsize`.
-    nonzero : 2-dimensional array of shape `(n_cells, 3)`
-        Indices of cells that are non-zero of the lower mass halo. Expected to
-        be precomputed from `fill_delta_indxs`.
-    mass1, mass2 : floats, optional
-        Total masses of the two haloes, respectively. Optional. If not provided
-        calculcated directly from the density field.
+    delta1, delta2 : 3D array
+        Density fields of the first and second halos, respectively.
+    cellmins : tuple (len=3)
+        Lower cell ID in the full box.
+    delta_bckg : 3D array
+        Combined background density from reference and cross simulations
+        on `bckg_halfsize` grid.
+    nonzero : 2D array (shape: (n_cells, 3))
+        Non-zero cells for the lower mass halo (from `fill_delta_indxs`).
+    mass1, mass2 : float, optional
+        Halos' total masses. Calculated from density if not provided.
     box_size : int
-        Number of cells in the box.
+        Cell count in the box.
     bckg_halfsize : int
-        Background half-size for density field calculation. This is the
-        grid distance from the center of the box to each side over which to
-        evaluate the background density field. Must be less than or equal to
-        half the box size.
+        Grid distance from box center for background density; 0.5 * box_size.

     Returns
     -------
@@ -1039,35 +1015,6 @@ def find_neighbour(nsim0, cats):
     return dists, cross_hindxs


-def cosine_similarity(x, y):
-    r"""
-    Calculate the cosine similarity between two Cartesian vectors. Defined
-    as :math:`\Sum_{i} x_i y_{i} / (|x| * |y|)`.
-
-    Parameters
-    ----------
-    x : 1-dimensional array
-        The first vector.
-    y : 1- or 2-dimensional array
-        The second vector. Can be 2-dimensional of shape `(n_samples, 3)`,
-        in which case the calculation is broadcasted.
-
-    Returns
-    -------
-    out : float or 1-dimensional array
-    """
-    if x.ndim != 1:
-        raise ValueError("`x` must be a 1-dimensional array.")
-
-    if y.ndim == 1:
-        y = y.reshape(1, -1)
-
-    out = numpy.sum(x * y, axis=1)
-    out /= numpy.linalg.norm(x) * numpy.linalg.norm(y, axis=1)
-
-    return out[0] if out.size == 1 else out
-
-
 def matching_max(cat0, catx, mass_kind, mult, periodic, overlap=None,
                  match_indxs=None, verbose=True):
     """


@@ -16,7 +16,11 @@
 from .knn_summary import kNNCDFReader  # noqa
 from .nearest_neighbour_summary import NearestNeighbourReader  # noqa
 from .overlap_summary import weighted_stats  # noqa
-from .overlap_summary import NPairsOverlap, PairOverlap, get_cross_sims  # noqa
+from .overlap_summary import (NPairsOverlap, PairOverlap, get_cross_sims,  # noqa
+                              max_overlap_agreement, max_overlap_agreements,  # noqa
+                              find_peak)  # noqa
 from .pk_summary import PKReader  # noqa
 from .tpcf_summary import TPCFReader  # noqa
-from .field_interp import read_interpolated_field  # noqa
+from .field_interp import (read_interpolated_field,  # noqa
+                           bayesian_bootstrap_correlation,  # noqa
+                           correlate_at_fixed_smoothing)  # noqa


@@ -15,6 +15,12 @@
 import numpy
 from tqdm import tqdm
+from numba import jit
+
+
+###############################################################################
+#            Read in the field values at the galaxy positions                 #
+###############################################################################


 def read_interpolated_field(survey_name, kind, galaxy_index, paths, MAS, grid,
@@ -75,3 +81,142 @@ def read_interpolated_field(survey_name, kind, galaxy_index, paths, MAS, grid,
             ks[i] = j

     return out[:, ks, :]
+
+
+###############################################################################
+#             Calculate the Bayesian bootstrapped correlation                 #
+###############################################################################
+
+
+@jit(nopython=True, fastmath=True, boundscheck=False)
+def dot_product(x, y):
+    tot = 0.0
+    for i in range(len(x)):
+        tot += x[i] * y[i]
+    return tot
+
+
+@jit(nopython=True, fastmath=True, boundscheck=False)
+def cov(x, y, mean_x, mean_y, weights):
+    tot = 0.0
+    for i in range(len(x)):
+        tot += (x[i] - mean_x) * (y[i] - mean_y) * weights[i]
+    return tot
+
+
+@jit(nopython=True, fastmath=True, boundscheck=False)
+def var(x, mean_x, weights):
+    tot = 0.0
+    for i in range(len(x)):
+        tot += (x[i] - mean_x)**2 * weights[i]
+    return tot
+
+
+@jit(nopython=True, fastmath=True, boundscheck=False)
+def weighted_correlation(x, y, weights):
+    mean_x = dot_product(x, weights)
+    mean_y = dot_product(y, weights)
+
+    cov_xy = cov(x, y, mean_x, mean_y, weights)
+    var_x = var(x, mean_x, weights)
+    var_y = var(y, mean_y, weights)
+
+    return cov_xy / numpy.sqrt(var_x * var_y)
+
+
+@jit(nopython=True, fastmath=True, boundscheck=False)
+def _bayesian_bootstrap_correlation(x, y, weights):
+    nweights = len(weights)
+    bootstrapped_correlations = numpy.full(nweights, numpy.nan, dtype=x.dtype)
+    for i in range(nweights):
+        bootstrapped_correlations[i] = weighted_correlation(x, y, weights[i])
+    return bootstrapped_correlations
+
+
+@jit(nopython=True, fastmath=True, boundscheck=False)
+def rank(x):
+    order = numpy.argsort(x)
+    ranks = order.argsort()
+    return ranks
+
+
+@jit(nopython=True, fastmath=True, boundscheck=False)
+def bayesian_bootstrap_correlation(x, y, kind="spearman", n_bootstrap=10000):
+    """
+    Calculate the Bayesian bootstrapped correlation between two arrays.
+
+    Parameters
+    ----------
+    x, y : 1-dimensional arrays
+        The two arrays to calculate the correlation between.
+    kind : str, optional
+        The type of correlation to calculate. Either `spearman` or `pearson`.
+    n_bootstrap : int, optional
+        The number of bootstrap samples to use.
+
+    Returns
+    -------
+    corr : 1-dimensional array of shape `(n_bootstrap,)`
+    """
+    if len(x) != len(y):
+        raise ValueError("Input arrays must have the same length")
+
+    if kind not in ["spearman", "pearson"]:
+        raise ValueError("kind must be either `spearman` or `pearson`")
+
+    if kind == "spearman":
+        dtype = x.dtype
+        x = rank(x).astype(dtype)
+        y = rank(y).astype(dtype)
+
+    alphas = numpy.ones(len(x), dtype=x.dtype)
+    weights = numpy.random.dirichlet(alphas, size=n_bootstrap)
+    return _bayesian_bootstrap_correlation(x, y, weights)
+
+
+###############################################################################
+#                       Distribution disagreement                             #
+###############################################################################
+
+
+def distribution_disagreement(x, y):
+    """
+    Think about this more when stacking non-Gaussian distributions.
+    """
+    delta = x - y
+    return numpy.abs(delta.mean()) / delta.std()
+
+
+"""
+field will be of value (nsims, ngal, nsmooth)
+
+Calculate the correlation for each sim and smoothing scale (nsims, nsmooth)
+
+For each of the above stack the distributions?
+"""
+
+
+def correlate_at_fixed_smoothing(field_values, galaxy_property,
+                                 kind="spearman", n_bootstrap=1000):
+    galaxy_property = galaxy_property.astype(field_values.dtype)
+    nsims = len(field_values)
+
+    distributions = numpy.empty((nsims, n_bootstrap),
+                                dtype=field_values.dtype)
+    from tqdm import trange
+    for i in trange(nsims):
+        distributions[i] = bayesian_bootstrap_correlation(
+            field_values[i], galaxy_property, kind=kind,
+            n_bootstrap=n_bootstrap)
+
+    return distributions
+
+
+def do_something(field_values, galaxy_property):
+    pass
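A usage sketch for the new bootstrap helpers (toy data; assumes the installed numba version supports `numpy.random.dirichlet` inside the jitted wrapper):

import numpy
from csiborgtools.summary import bayesian_bootstrap_correlation

x = numpy.random.normal(size=500)
y = x + numpy.random.normal(scale=0.5, size=500)  # toy correlated data

# Each Dirichlet draw re-weights every observation, so the output is a
# posterior-like distribution over the weighted Spearman coefficient.
corr = bayesian_bootstrap_correlation(x, y, kind="spearman", n_bootstrap=2000)
print(corr.mean(), corr.std())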


@@ -23,6 +23,30 @@ from tqdm import tqdm, trange

 from ..utils import periodic_distance

+###############################################################################
+#                           Utility functions                                 #
+###############################################################################
+
+
+def find_peak(x, weights, shrink=0.95, min_obs=5):
+    """
+    Find the peak of a 1D distribution using a shrinking window.
+    """
+    assert shrink <= 1.
+
+    xmin, xmax = numpy.min(x), numpy.max(x)
+    xpos = (xmax + xmin) / 2
+    rad = (xmax - xmin) / 2
+
+    while True:
+        mask = numpy.abs(x - xpos) < rad
+        if mask.sum() < min_obs:
+            return xpos
+
+        xpos = numpy.average(x[mask], weights=weights[mask])
+        rad *= shrink
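A small demonstration of `find_peak` on synthetic bimodal data: the weighted mean is recomputed inside a window whose radius shrinks by `shrink` each iteration, so the estimate walks onto the dominant mode rather than the global mean.

import numpy
from csiborgtools.summary import find_peak

numpy.random.seed(42)
# Heavy mode at 0, light mode at 5.
x = numpy.concatenate([numpy.random.normal(0, 0.5, 800),
                       numpy.random.normal(5, 0.5, 200)])
weights = numpy.ones_like(x)

print(numpy.mean(x))          # ~1.0, dragged toward the light mode
print(find_peak(x, weights))  # ~0.0, settles on the heavy mode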
###############################################################################
#                       Overlap of two simulations                            #
###############################################################################
@@ -251,42 +275,16 @@ class PairOverlap:
         ----------
         from_smoothed : bool
             Whether to use the smoothed overlap or not.

         Returns
         -------
         summed_overlap : 1-dimensional array of shape `(nhalos, )`
         """
         overlap = self.overlap(from_smoothed)
-        out = numpy.full(len(overlap), numpy.nan, dtype=numpy.float32)
+        out = numpy.zeros(len(overlap), dtype=numpy.float32)
         for i in range(len(overlap)):
             if len(overlap[i]) > 0:
                 out[i] = numpy.sum(overlap[i])
-            else:
-                out[i] = 0
-        return out
-
-    def prob_nomatch(self, from_smoothed):
-        """
-        Probability of no match for each halo in the reference simulation with
-        the cross simulation. Defined as a product of 1 - overlap with other
-        halos.
-
-        Parameters
-        ----------
-        from_smoothed : bool
-            Whether to use the smoothed overlap or not.
-
-        Returns
-        -------
-        prob_nomatch : 1-dimensional array of shape `(nhalos, )`
-        """
-        overlap = self.overlap(from_smoothed)
-        out = numpy.full(len(overlap), numpy.nan, dtype=numpy.float32)
-        for i in range(len(overlap)):
-            if len(overlap[i]) > 0:
-                out[i] = numpy.product(numpy.subtract(1, overlap[i]))
-            else:
-                out[i] = 1
         return out

     def dist(self, in_initial, boxsize, norm_kind=None):
@@ -308,8 +306,7 @@ class PairOverlap:
         -------
         dist : array of 1-dimensional arrays of shape `(nhalos, )`
         """
-        assert (norm_kind is None
-                or norm_kind in ("r200c", "ref_patch", "sum_patch"))
+        assert (norm_kind is None or norm_kind in ("r200c", "ref_patch", "sum_patch"))  # noqa
         # Get positions either in the initial or final snapshot
         pos0 = self.cat0().position(in_initial=in_initial)
         posx = self.catx().position(in_initial=in_initial)
@@ -400,60 +397,6 @@ class PairOverlap:

         return out

-    def counterpart_mass(self, from_smoothed, overlap_threshold=0.,
-                         mass_kind="totpartmass"):
-        """
-        Calculate the expected counterpart mass of each halo in the reference
-        simulation from the crossed simulation.
-
-        Parameters
-        ----------
-        from_smoothed : bool
-            Whether to use the smoothed overlap or not.
-        overlap_threshold : float, optional
-            Minimum overlap required for a halo to be considered a match. By
-            default 0.0, i.e. no threshold.
-        mass_kind : str, optional
-            The mass kind whose ratio is to be calculated. Must be a valid
-            catalogue key. By default `totpartmass`, i.e. the total particle
-            mass associated with a halo.
-
-        Returns
-        -------
-        mean, std : 1-dimensional arrays of shape `(nhalos, )`
-        """
-        mean = numpy.full(len(self), numpy.nan, dtype=numpy.float32)
-        std = numpy.full(len(self), numpy.nan, dtype=numpy.float32)
-
-        massx = self.catx(mass_kind)           # Create references to speed
-        overlap = self.overlap(from_smoothed)  # up the loop below
-
-        for i, match_ind in enumerate(self["match_indxs"]):
-            # Skip if no match
-            if len(match_ind) == 0:
-                continue
-
-            massx_ = massx[match_ind]  # Again just create references
-            overlap_ = overlap[i]      # to the appropriate elements
-
-            # Optionally apply overlap threshold
-            if overlap_threshold > 0.:
-                mask = overlap_ > overlap_threshold
-                if numpy.sum(mask) == 0:
-                    continue
-                massx_ = massx_[mask]
-                overlap_ = overlap_[mask]
-
-            massx_ = numpy.log10(massx_)
-            # Weighted average and *biased* standard deviation
-            mean_ = numpy.average(massx_, weights=overlap_)
-            std_ = numpy.average((massx_ - mean_)**2, weights=overlap_)**0.5
-
-            mean[i] = mean_
-            std[i] = std_
-
-        return mean, std
-
     def copy_per_match(self, par):
         """
         Make an array like `self.match_indxs` where each of its element is an
@@ -525,6 +468,82 @@ class PairOverlap:

         return self["match_indxs"].size
+###############################################################################
+#                 Support functions for pair overlaps                         #
+###############################################################################
+
+
+def max_overlap_agreement(cat0, catx, min_logmass, maxdist, paths):
+    r"""
+    Calculate whether for a halo `A` from catalogue `cat0` that has a maximum
+    overlap with halo `B` from catalogue `catx` it is also `B` that has a
+    maximum overlap with `A`.
+
+    Parameters
+    ----------
+    cat0 : instance of :py:class:`csiborgtools.read.BaseCatalogue`
+        Halo catalogue corresponding to the reference simulation.
+    catx : instance of :py:class:`csiborgtools.read.BaseCatalogue`
+        Halo catalogue corresponding to the cross simulation.
+    min_logmass : float
+        Minimum halo mass in :math:`\log_{10} M_\odot / h` to consider.
+    maxdist : float, optional
+        Maximum halo distance in :math:`\mathrm{Mpc} / h` from the centre
+        of the high-resolution region.
+    paths : py:class`csiborgtools.read.Paths`
+        CSiBORG paths object.
+
+    Returns
+    -------
+    agreement : 1-dimensional array of shape `(nhalos, )`
+    """
+    kwargs = {"paths": paths, "min_logmass": min_logmass, "maxdist": maxdist}
+    pair_forward = PairOverlap(cat0, catx, **kwargs)
+    pair_backward = PairOverlap(catx, cat0, **kwargs)
+
+    nhalos = len(pair_forward.cat0())
+    agreement = numpy.full(nhalos, numpy.nan, dtype=numpy.float32)
+
+    for i in range(nhalos):
+        match_indxs_forward = pair_forward["match_indxs"][i]
+
+        if len(match_indxs_forward) == 0:
+            continue
+
+        overlap_forward = pair_forward["smoothed_overlap"][i]
+
+        kmax = match_indxs_forward[numpy.argmax(overlap_forward)]
+        match_indxs_backward = pair_backward["match_indxs"][kmax]
+        overlap_backward = pair_backward["smoothed_overlap"][kmax]
+
+        imatch = match_indxs_backward[numpy.argmax(overlap_backward)]
+        agreement[i] = imatch == i
+
+    return agreement
+
+
+def max_overlap_agreements(cat0, catxs, min_logmass, maxdist, paths,
+                           verbose=True):
+    """
+    Repeat `max_overlap_agreement` for many cross simulations.
+
+    Parameters
+    ----------
+    ...
+
+    Returns
+    -------
+    agreements : 2-dimensional array of shape `(ncatxs, nhalos)`
+    """
+    agreements = [None] * len(catxs)
+    desc = "Calculating maximum overlap agreement"
+    for i, catx in enumerate(tqdm(catxs, desc=desc, disable=not verbose)):
+        agreements[i] = max_overlap_agreement(cat0, catx, min_logmass,
+                                              maxdist, paths)
+    return numpy.asanyarray(agreements)
 def weighted_stats(x, weights, min_weight=0, verbose=False):
     """
     Calculate the weighted mean and standard deviation of `x` using `weights`
@@ -544,11 +563,10 @@ def weighted_stats(x, weights, min_weight=0, verbose=False):

     Returns
     -------
-    stat : 2-dimensional array of shape `(len(x), 2)`
-        The first column is the weighted mean and the second column is the
-        weighted standard deviation.
+    mu, std : 1-dimensional arrays of shape `(len(x), )`
     """
-    out = numpy.full((x.size, 2), numpy.nan, dtype=numpy.float32)
+    mu = numpy.full(x.size, numpy.nan, dtype=numpy.float32)
+    std = numpy.full(x.size, numpy.nan, dtype=numpy.float32)

     for i in trange(len(x), disable=not verbose):
         x_, w_ = numpy.asarray(x[i]), numpy.asarray(weights[i])
@@ -557,9 +575,9 @@ def weighted_stats(x, weights, min_weight=0, verbose=False):
         w_ = w_[mask]
         if len(w_) == 0:
             continue
-        out[i, 0] = numpy.average(x_, weights=w_)
-        out[i, 1] = numpy.average((x_ - out[i, 0])**2, weights=w_)**0.5
-    return out
+        mu[i] = numpy.average(x_, weights=w_)
+        std[i] = numpy.average((x_ - mu[i])**2, weights=w_)**0.5
+    return mu, std
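Call sites must now unpack two arrays instead of slicing a stacked one; a minimal sketch with made-up ragged inputs (entries whose weights all fall at or below `min_weight` stay NaN, as in the loop above):

import numpy
from csiborgtools.summary import weighted_stats

x = [numpy.array([13.1, 13.3, 13.2]), numpy.array([14.0])]
weights = [numpy.array([0.9, 0.2, 0.5]), numpy.array([0.05])]

mu, std = weighted_stats(x, weights, min_weight=0.1)
print(mu, std)  # second entry stays NaN: its only weight is below 0.1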
###############################################################################
@@ -684,92 +702,87 @@ class NPairsOverlap:
             out[i] = pair.summed_overlap(from_smoothed)
         return numpy.vstack(out).T

-    def prob_nomatch(self, from_smoothed, verbose=True):
-        """
-        Probability of no match for each halo in the reference simulation with
-        the cross simulation.
-
-        Parameters
-        ----------
-        from_smoothed : bool
-            Whether to use the smoothed overlap or not.
-        verbose : bool, optional
-            Verbosity flag.
-
-        Returns
-        -------
-        prob_nomatch : 2-dimensional array of shape `(nhalos, ncatxs)`
-        """
-        iterator = tqdm(self.pairs,
-                        desc="Calculating probability of no match",
-                        disable=not verbose
-                        )
-        out = [None] * len(self)
-        for i, pair in enumerate(iterator):
-            out[i] = pair.prob_nomatch(from_smoothed)
-        return numpy.vstack(out).T
+    def expected_property_single(self, k, key, from_smoothed, in_log=True):
+        ys = [None] * len(self)
+        overlaps = [None] * len(self)
+        for i, pair in enumerate(self):
+            overlap = pair.overlap(from_smoothed)
+            if len(overlap[k]) == 0:
+                ys[i] = numpy.nan
+                overlaps[i] = numpy.nan
+                continue
+            match_indxs = pair["match_indxs"]
+            j = numpy.argmax(overlap[k])
+            ys[i] = pair.catx(key)[match_indxs[k][j]]
+            if in_log:
+                ys[i] = numpy.log10(ys[i])
+            overlaps[i] = overlap[k][j]
+
+        return ys, overlaps

-    def counterpart_mass(self, from_smoothed, overlap_threshold=0.,
-                         mass_kind="totpartmass", return_full=False,
-                         verbose=True):
+    def expected_property(self, key, from_smoothed, min_logmass,
+                          in_log=True, mass_kind="totpartmass", verbose=True):
         """
         Calculate the expected counterpart mass of each halo in the reference
         simulation from the crossed simulation.

         Parameters
         ----------
+        key : str
+            Property key.
         from_smoothed : bool
             Whether to use the smoothed overlap or not.
-        overlap_threshold : float, optional
-            Minimum overlap required for a halo to be considered a match. By
-            default 0.0, i.e. no threshold.
+        min_logmass : float
+            Minimum log mass of reference halos to consider.
+        in_log : bool, optional
+            Whether to calculate the expected property in log10.
         mass_kind : str, optional
             The mass kind whose ratio is to be calculated. Must be a valid
             catalogue key. By default `totpartmass`, i.e. the total particle
             mass associated with a halo.
-        return_full : bool, optional
-            Whether to return the full results of matching each pair or
-            calculate summary statistics by Gaussian averaging.
         verbose : bool, optional
             Verbosity flag.

         Returns
         -------
-        mu, std : 1-dimensional arrays of shape `(nhalos,)`
-            Summary expected mass and standard deviation from all cross
-            simulations.
-        mus, stds : 2-dimensional arrays of shape `(nhalos, ncatx)`, optional
-            Expected mass and standard deviation from each cross simulation.
-            Returned only if `return_full` is `True`.
+        mean_expected : 1-dimensional array of shape `(nhalos, )`
+            Expected property from all cross simulations.
+        std_expected : 1-dimensional array of shape `(nhalos, )`
+            Standard deviation of the expected property.
         """
-        iterator = tqdm(self.pairs,
-                        desc="Calculating counterpart masses",
-                        disable=not verbose)
-        mus, stds = [None] * len(self), [None] * len(self)
-        for i, pair in enumerate(iterator):
-            mus[i], stds[i] = pair.counterpart_mass(
-                from_smoothed=from_smoothed,
-                overlap_threshold=overlap_threshold, mass_kind=mass_kind)
-        mus, stds = numpy.vstack(mus).T, numpy.vstack(stds).T
+        log_mass0 = numpy.log10(self.cat0(mass_kind))
+        ntot = len(log_mass0)
+        mean_expected = numpy.full(ntot, numpy.nan, dtype=numpy.float32)
+        std_expected = numpy.full(ntot, numpy.nan, dtype=numpy.float32)

-        # Prob of > 0 matches
-        probmatch = 1 - self.prob_nomatch(from_smoothed)
-        # Normalise it for weighted sums etc.
-        norm_probmatch = numpy.apply_along_axis(
-            lambda x: x / numpy.sum(x), axis=1, arr=probmatch)
+        indxs = numpy.where(log_mass0 > min_logmass)[0]
+        for i in tqdm(indxs, disable=not verbose,
+                      desc="Calculating expectation"):
+            ys = numpy.full(len(self), numpy.nan, dtype=numpy.float32)
+            weights = numpy.full(len(self), numpy.nan, dtype=numpy.float32)
+            for j, pair in enumerate(self):
+                overlap = pair.overlap(from_smoothed)
+                if len(overlap[i]) == 0:
+                    continue

-        # Mean and standard deviation of weighted stacked Gaussians
-        mu = numpy.sum((norm_probmatch * mus), axis=1)
-        std = numpy.sum((norm_probmatch * (mus**2 + stds**2)), axis=1) - mu**2
-        std **= 0.5
+                k = numpy.argmax(overlap[i])
+                ys[j] = pair.catx(key)[pair["match_indxs"][i][k]]
+                weights[j] = overlap[i][k]

-        mask = mu <= 0
-        mu[mask] = numpy.nan
-        std[mask] = numpy.nan
+                if in_log:
+                    ys[j] = numpy.log10(ys[j])

-        if return_full:
-            return mu, std, mus, stds
-        return mu, std
+            mask = numpy.isfinite(ys) & numpy.isfinite(weights)
+            if numpy.sum(mask) <= 2:
+                continue
+
+            mean_expected[i] = find_peak(ys[mask], weights=weights[mask])
+            std_expected[i] = numpy.average((ys[mask] - mean_expected[i])**2,
+                                            weights=weights[mask])**0.5
+            print(log_mass0[i], mean_expected[i], std_expected[i])
+
+        return mean_expected, std_expected

     @property
     def pairs(self):
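The aggregation at the heart of the new `expected_property`: per reference halo, gather the max-overlap counterpart's property from each cross simulation, weight it by that overlap, then summarise with `find_peak` and a weighted scatter. A standalone sketch of the per-halo step (toy values):

import numpy
from csiborgtools.summary import find_peak

# Toy: counterpart log-masses from 5 cross simulations and their overlaps.
ys = numpy.array([13.9, 14.0, 14.1, numpy.nan, 13.8], dtype=numpy.float32)
weights = numpy.array([0.8, 0.9, 0.7, numpy.nan, 0.6], dtype=numpy.float32)

mask = numpy.isfinite(ys) & numpy.isfinite(weights)
if numpy.sum(mask) > 2:  # require more than two valid matches, as above
    mean_expected = find_peak(ys[mask], weights=weights[mask])
    std_expected = numpy.average((ys[mask] - mean_expected)**2,
                                 weights=weights[mask])**0.5
    print(mean_expected, std_expected)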


@@ -150,6 +150,35 @@ def radec_to_cartesian(X):
         ]).T


+def cosine_similarity(x, y):
+    r"""
+    Calculate the cosine similarity between two Cartesian vectors. Defined
+    as :math:`\Sum_{i} x_i y_{i} / (|x| * |y|)`.
+
+    Parameters
+    ----------
+    x : 1-dimensional array
+        The first vector.
+    y : 1- or 2-dimensional array
+        The second vector. Can be 2-dimensional of shape `(n_samples, 3)`,
+        in which case the calculation is broadcasted.
+
+    Returns
+    -------
+    out : float or 1-dimensional array
+    """
+    if x.ndim != 1:
+        raise ValueError("`x` must be a 1-dimensional array.")
+
+    if y.ndim == 1:
+        y = y.reshape(1, -1)
+
+    out = numpy.sum(x * y, axis=1)
+    out /= numpy.linalg.norm(x) * numpy.linalg.norm(y, axis=1)
+
+    return out[0] if out.size == 1 else out
+
+
 def real2redshift(pos, vel, observer_location, observer_velocity, box,
                   periodic_wrap=True, make_copy=True):
     r"""
@@ -219,3 +248,17 @@ def number_counts(x, bin_edges):
     for i in range(bin_edges.size - 1):
         out[i] = numpy.sum((x >= bin_edges[i]) & (x < bin_edges[i + 1]))
     return out
+
+
+def binned_statistic(x, y, left_edges, bin_width, statistic):
+    """
+    Calculate a binned statistic.
+    """
+    out = numpy.full(left_edges.size, numpy.nan, dtype=x.dtype)
+
+    for i in range(left_edges.size):
+        mask = (x >= left_edges[i]) & (x < left_edges[i] + bin_width)
+
+        if numpy.any(mask):
+            out[i] = statistic(y[mask])
+    return out
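Usage sketches for the two helpers now exposed via `csiborgtools` (toy inputs): `cosine_similarity` broadcasts one vector against rows of a second array, and `binned_statistic` applies an arbitrary reduction in fixed-width bins.

import numpy
from csiborgtools import binned_statistic, cosine_similarity

x = numpy.array([1.0, 0.0, 0.0])
y = numpy.array([[1.0, 0.0, 0.0],
                 [0.0, 1.0, 0.0]])
print(cosine_similarity(x, y))  # [1. 0.]

xs = numpy.random.uniform(0, 10, 1000)
left_edges = numpy.arange(0.0, 10.0, 1.0)
print(binned_statistic(xs, xs**2, left_edges, bin_width=1.0,
                       statistic=numpy.median))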

scripts/sort_halomaker.py (new file, 100 lines)

@@ -0,0 +1,100 @@
# Copyright (C) 2022 Richard Stiskalek
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""
Script to sort the HaloMaker's `particle_membership` file to match the
ordering of particles in the simulation snapshot.
"""
from argparse import ArgumentParser
from datetime import datetime
from glob import iglob

import h5py
import numpy
import pynbody
from mpi4py import MPI
from taskmaster import work_delegation
from tqdm import trange

import csiborgtools


def sort_particle_membership(nsim, nsnap, method):
    """
    Read the FoF particle halo membership and sort the halo IDs to the
    ordering of particles in the PHEW clump IDs.

    Parameters
    ----------
    nsim : int
        IC realisation index.
    nsnap : int
        Snapshot index.
    method : str
        HaloMaker method used to identify halos.
    """
    print(f"{datetime.now()}: starting simulation {nsim}, snapshot {nsnap} and method {method}.")  # noqa
    paths = csiborgtools.read.Paths(**csiborgtools.paths_glamdring)

    fpath = next(iglob(f"/mnt/extraspace/rstiskalek/CSiBORG/halo_maker/ramses_{nsim}/output_{str(nsnap).zfill(5)}/**/*particle_membership*", recursive=True), None)  # noqa
    print(f"{datetime.now()}: loading particle membership `{fpath}`.")
    # Columns are halo ID, particle ID.
    membership = numpy.genfromtxt(fpath, dtype=int)

    print(f"{datetime.now()}: loading particle IDs from the snapshot.")
    sim = pynbody.load(paths.snapshot(nsnap, nsim, "csiborg"))
    pids = numpy.asanyarray(sim["iord"])

    print(f"{datetime.now()}: mapping particle IDs to their indices.")
    pids_idx = {pid: i for i, pid in enumerate(pids)}

    print(f"{datetime.now()}: mapping HIDs to their array indices.")
    # Unassigned particle IDs are assigned a halo ID of 0.
    hids = numpy.zeros(pids.size, dtype=numpy.int32)
    for i in trange(membership.shape[0]):
        hid, pid = membership[i]
        hids[pids_idx[pid]] = hid

    fout = fpath + "_sorted.hdf5"
    print(f"{datetime.now()}: saving the sorted data to ... `{fout}`")

    header = """
    This dataset represents halo indices for each particle.
    - The particles are ordered as they appear in the simulation snapshot.
    - Unassigned particles are given a halo index of 0.
    """
    with h5py.File(fout, 'w') as hdf:
        dset = hdf.create_dataset('hids_dataset', data=hids)
        dset.attrs['header'] = header


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("--method", type=str, required=True,
                        help="HaloMaker method")
    parser.add_argument("--nsim", type=int, required=False, default=None,
                        help="IC index. If not set process all.")
    args = parser.parse_args()
    paths = csiborgtools.read.Paths(**csiborgtools.paths_glamdring)

    if args.nsim is None:
        ics = paths.get_ics("csiborg")
    else:
        ics = [args.nsim]

    snaps = numpy.array([max(paths.get_snapshots(nsim, "csiborg"))
                         for nsim in ics])

    def main(n):
        sort_particle_membership(ics[n], snaps[n], args.method)

    work_delegation(main, list(range(len(ics))), MPI.COMM_WORLD)
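The core of the script is the particle-ID inversion: build a dictionary from snapshot particle ID to array index, then scatter each membership row's halo ID into snapshot order. A self-contained sketch with toy IDs:

import numpy

pids = numpy.array([42, 7, 19, 3])            # snapshot ordering of particle IDs
membership = numpy.array([[1, 19], [2, 42]])  # rows of (halo ID, particle ID)

pids_idx = {pid: i for i, pid in enumerate(pids)}
hids = numpy.zeros(pids.size, dtype=numpy.int32)  # 0 marks unassigned
for hid, pid in membership:
    hids[pids_idx[pid]] = hid

print(hids)  # [2 0 1 0]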


@@ -29,16 +29,9 @@
import numpy
from mpi4py import MPI
from taskmaster import work_delegation
+import csiborgtools
from utils import get_nsims

-try:
-    import csiborgtools
-except ModuleNotFoundError:
-    import sys
-
-    sys.path.append("../")
-    import csiborgtools
-

def _main(nsim, simname, verbose):
    """
@@ -51,9 +44,7 @@ def _main(nsim, simname, verbose):
    else:
        partreader = csiborgtools.read.QuijoteReader(paths)

-    if verbose:
-        print(f"{datetime.now()}: reading and processing simulation `{nsim}`.",
-              flush=True)
+    print(f"{datetime.now()}: processing simulation `{nsim}`.", flush=True)

    # We first load the particle IDs in the final snapshot.
    pidf = csiborgtools.read.read_h5(paths.particles(nsim, simname))
    pidf = pidf["particle_ids"]

scripts_plots/paper_match.py (new file, 1632 lines). File diff suppressed because it is too large.


@@ -66,6 +66,12 @@ def plot_mass_vs_ncells(nsim, pdf=False):
    cat = open_csiborg(nsim)
    mpart = 4.38304044e+09

+    x = numpy.log10(cat["totpartmass"])
+    y = numpy.log10(cat["lagpatch_ncells"])
+
+    p = numpy.polyfit(x, y, 1)
+    print("Fitted parameters are: ", p)
+
    with plt.style.context(plt_utils.mplstyle):
        plt.figure()
        plt.scatter(cat["totpartmass"], cat["lagpatch_ncells"], s=0.25,
@@ -105,9 +111,9 @@ def plot_hmf(pdf=False):
            csiborg_counts = numpy.full((len(csiborg_nsims), len(bins) - 1),
                                        numpy.nan, dtype=numpy.float32)
        csiborg_counts[i, :] = data["counts"]
-    # csiborg_counts /= numpy.diff(bins).reshape(1, -1)
+    csiborg_counts /= numpy.diff(bins).reshape(1, -1)

-    csiborg5511 = numpy.load(paths.halo_counts("csiborg", 5511))["counts"]
+    # csiborg5511 = numpy.load(paths.halo_counts("csiborg", 5511))["counts"]
    # csiborg5511 /= numpy.diff(data["bins"])

    print("Loading Quijote halo counts.", flush=True)
@@ -121,73 +127,89 @@ def plot_hmf(pdf=False):
                (len(quijote_nsims) * nmax, len(bins) - 1), numpy.nan,
                dtype=numpy.float32)
        quijote_counts[i * nmax:(i + 1) * nmax, :] = data["counts"]
-    # quijote_counts /= numpy.diff(bins).reshape(1, -1)
+    quijote_counts /= numpy.diff(bins).reshape(1, -1)

-    # vol = 155.5**3
-    # csiborg_counts /= vol
-    # quijote_counts /= vol
+    vol = 4 * numpy.pi / 3 * 155.5**3
+    csiborg_counts /= vol
+    quijote_counts /= vol
    # csiborg5511 /= vol

    x = 10**(0.5 * (bins[1:] + bins[:-1]))

    # Edit lower limits
-    csiborg_counts[:, x < 1e12] = numpy.nan
+    csiborg_counts[:, x < 10**13.1] = numpy.nan
    quijote_counts[:, x < 10**(13.1)] = numpy.nan
    # Edit upper limits
    csiborg_counts[:, x > 3e15] = numpy.nan
    quijote_counts[:, x > 3e15] = numpy.nan
-    csiborg5511[x > 3e15] = numpy.nan
+    # csiborg5511[x > 3e15] = numpy.nan

    with plt.style.context(plt_utils.mplstyle):
        cols = plt.rcParams["axes.prop_cycle"].by_key()["color"]
-        fig, ax = plt.subplots(nrows=2, sharex=True,
-                               figsize=(3.5, 2.625 * 1.25),
-                               gridspec_kw={"height_ratios": [1, 0.45]})
-        fig.subplots_adjust(hspace=0, wspace=0)
+        fig, ax = plt.subplots(nrows=1, sharex=True,
+                               figsize=(3.5, 2.625))
+        ax = [ax]
+        # fig, ax = plt.subplots(nrows=2, sharex=True,
+        #                        figsize=(3.5, 2.625 * 1.25),
+        #                        gridspec_kw={"height_ratios": [1, 0.25]})
+        # fig.subplots_adjust(hspace=0, wspace=0)

        # Upper panel data
        mean_csiborg = numpy.mean(csiborg_counts, axis=0)
        std_csiborg = numpy.std(csiborg_counts, axis=0)
-        ax[0].plot(x, mean_csiborg, label="CSiBORG", c=cols[0])
-        ax[0].fill_between(x, mean_csiborg - std_csiborg,
-                           mean_csiborg + std_csiborg,
-                           alpha=0.5, color=cols[0])
+
+        for i in range(len(csiborg_counts)):
+            ax[0].plot(x, csiborg_counts[i, :], c="cornflowerblue", lw=0.5, zorder=0)
+
+        ax[0].plot(x, mean_csiborg, label="CSiBORG", c="mediumblue", zorder=1)
+        # ax[0].fill_between(x, mean_csiborg - std_csiborg,
+        #                    mean_csiborg + std_csiborg,
+        #                    alpha=0.5, color=cols[0])

        mean_quijote = numpy.mean(quijote_counts, axis=0)
        std_quijote = numpy.std(quijote_counts, axis=0)
-        ax[0].plot(x, mean_quijote, label="Quijote", c=cols[1])
-        ax[0].fill_between(x, mean_quijote - std_quijote,
-                           mean_quijote + std_quijote, alpha=0.5,
-                           color=cols[1])
-
-        ax[0].plot(x, csiborg5511, label="CSiBORG 5511", c="k", ls="--")
-        std5511 = numpy.sqrt(csiborg5511)
-        ax[0].fill_between(x, csiborg5511 - std_csiborg, csiborg5511 + std5511,
-                           alpha=0.2, color="k")
-
-        # Lower panel data
-        log_y = numpy.log10(mean_csiborg / mean_quijote)
-        err = numpy.sqrt((std_csiborg / mean_csiborg / numpy.log(10))**2
-                         + (std_quijote / mean_quijote / numpy.log(10))**2)
-        ax[1].plot(x, 10**log_y, c=cols[0])
-        ax[1].fill_between(x, 10**(log_y - err), 10**(log_y + err), alpha=0.5,
-                           color=cols[0])
-        ax[1].plot(x, csiborg5511 / mean_quijote, c="k", ls="--")
+
+        for i in range(len(quijote_counts)):
+            ax[0].plot(x, quijote_counts[i, :], c="palegreen", lw=0.5, zorder=-1)
+
+        ax[0].plot(x, mean_quijote, label="Quijote", c="darkgreen", zorder=1)
+        # ax[0].fill_between(x, mean_quijote - std_quijote,
+        #                    mean_quijote + std_quijote, alpha=0.5,
+        #                    color=cols[1])
+
+        # ax[0].plot(x, csiborg5511, label="CSiBORG 5511", c="k", ls="--")
+        # std5511 = numpy.sqrt(csiborg5511)
+        # ax[0].fill_between(x, csiborg5511 - std_csiborg, csiborg5511 + std5511,
+        #                    alpha=0.2, color="k")
+
+        # # Lower panel data
+        # log_y = numpy.log10(mean_csiborg / mean_quijote)
+        # err = numpy.sqrt((std_csiborg / mean_csiborg / numpy.log(10))**2
+        #                  + (std_quijote / mean_quijote / numpy.log(10))**2)
+        # ax[1].plot(x, 10**log_y, c=cols[0])
+        # ax[1].fill_between(x, 10**(log_y - err), 10**(log_y + err), alpha=0.5,
+        #                    color="k")
+        # ax[1].plot(x, csiborg5511 / mean_quijote, c="k", ls="--")

        # Labels and accesories
-        ax[1].axhline(1, color="k", ls="--",
-                      lw=0.5 * plt.rcParams["lines.linewidth"], zorder=0)
+        # ax[1].axhline(1, color="k", ls="--",
+        #               lw=0.5 * plt.rcParams["lines.linewidth"], zorder=0)
        # ax[0].set_ylabel(r"$\frac{\mathrm{d}^2 N}{\mathrm{d} V \mathrm{d}\log M_{\rm tot}}~[\mathrm{dex}^{-1} (\mathrm{Mpc} / h)^{-3}]$",  # noqa
        #                  fontsize="small")
-        ax[0].set_ylabel("Counts in bins")
-        ax[1].set_xlabel(r"$M_{\rm tot}~[M_\odot / h]$", fontsize="small")
-        ax[1].set_ylabel(r"$\mathrm{CSiBORG} / \mathrm{Quijote}$",
-                         fontsize="small")
+        m = numpy.isfinite(mean_quijote)
+        ax[0].set_xlim(x[m].min(), x[m].max())
+        ax[0].set_ylabel(r"$\mathrm{HMF}~[\mathrm{dex}^{-1} (\mathrm{Mpc} / h)^{-3}]$")
+        ax[0].set_xlabel(r"$M_{\rm tot}~[M_\odot / h]$", fontsize="small")
+        # ax[1].set_ylabel(r"$\mathrm{CSiBORG} / \mathrm{Quijote}$",
+        #                  fontsize="small")

        ax[0].set_xscale("log")
        ax[0].set_yscale("log")
-        ax[1].set_ylim(0.5, 2.0)
+        # ax[1].set_ylim(0.5, 1.5)
        # ax[1].set_yscale("log")
-        ax[0].legend(fontsize="small")
+        ax[0].legend()

        fig.tight_layout(h_pad=0, w_pad=0)
        for ext in ["png"] if pdf is False else ["png", "pdf"]:
@@ -556,8 +578,8 @@ if __name__ == "__main__":
    if False:
        plot_mass_vs_ncells(7444, pdf=False)

-    if False:
-        plot_hmf(pdf=False)
+    if True:
+        plot_hmf(pdf=True)

    if False:
        plot_hmf_quijote_full(pdf=False)
@@ -569,7 +591,7 @@ if __name__ == "__main__":
                   plot_groups=False, dmin=45, dmax=60,
                   plot_halos=5e13, volume_weight=True)

-    if True:
+    if False:
        kind = "environment"
        grid = 512
        smooth_scale = 8.0
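For reference, the normalisation enabled in `plot_hmf` above turns raw halo counts into the HMF of the new y-label: divide by the log-mass bin width (dex) and by the comoving volume of a sphere of radius 155.5 Mpc/h. A worked sketch with toy counts:

import numpy

bins = numpy.arange(13.0, 15.4, 0.2)   # log10(M) bin edges
counts = numpy.random.poisson(100, bins.size - 1).astype(numpy.float64)

counts /= numpy.diff(bins)             # counts per dex
vol = 4 * numpy.pi / 3 * 155.5**3      # sphere volume in (Mpc / h)^3
counts /= vol                          # counts per dex per (Mpc / h)^3
# `counts` now approximates dN / dlog10(M) / dV, matching the plot's y-axis.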