Overlap reader thresholds (#31)

* Improve data

* Add comment

* Update how KNN is called

* Bring back indices

* New function output

* return catx["index"]

* Remove unnecessary arguments

* Remove useless arguments

* Rename output

* thin up catalogues

* Add thresholding

* Update README
This commit is contained in:
Richard Stiskalek 2023-03-15 20:55:50 +00:00 committed by GitHub
parent 6e290ccfb4
commit 153f1c0002
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 183 additions and 202 deletions

View file

@ -1,21 +1,14 @@
# CSiBORG tools
# CSiBORGTools
## CSiBORG Matching
### TODO
- [ ] Modify the call to tN
### Questions
- What scaling of the search region? No reason for it to be a multiple of $R_{200c}$.
- How well can observed clusters be matched to CSiBORG? Do their masses agree?
- Is the number of clusters in CSiBORG consistent?
## CSiBORG Galaxy Environmental Dependence
### TODO
- [ ] Add gradient and Hessian of the overdensity field.
- [x] Write a script to smoothen an overdensity field, calculate the derived fields and evaluate them at the galaxy positions.
### Questions

View file

@ -215,7 +215,7 @@ class RealisationsMatcher:
mapping[ind2] = ind1
return mapping
def cross(self, nsim0, nsimx, cat0, catx, overlap=False, verbose=True):
def cross(self, cat0, catx, overlap=False, verbose=True):
r"""
Find all neighbours whose CM separation is less than `nmult` times the
sum of their initial Lagrangian patch sizes. Enforces that the
@ -223,10 +223,9 @@ class RealisationsMatcher:
Parameters
----------
nsim0, nsimx : int
The reference and cross simulation IDs.
cat0, catx: :py:class:`csiborgtools.read.HaloCatalogue`
Halo catalogues corresponding to `nsim0` and `nsimx`, respectively.
Halo catalogues corresponding to the reference and cross
simulations.
overlap : bool, optional
whether to calculate overlap between clumps in the initial
snapshot. by default `false`. this operation is slow.
@ -235,22 +234,22 @@ class RealisationsMatcher:
Returns
-------
indxs : 1-dimensional array of shape `(nhalos, )`
ref_indxs : 1-dimensional array
Indices of halos in the reference catalogue.
cross_indxs : 1-dimensional array
Indices of halos in the cross catalogue.
match_indxs : 1-dimensional array of arrays
Indices of halo counterparts in the cross catalogue.
overlaps : 1-dimensional array of arrays
Overlaps with the cross catalogue.
"""
assert (nsim0 == cat0.paths.n_sim) & (nsimx == catx.paths.n_sim)
# Query the KNN
if verbose:
print("{}: querying the KNN.".format(datetime.now()), flush=True)
match_indxs = radius_neighbours(
catx.knn0, cat0.positions0, radiusX=cat0["lagpatch"],
radiusKNN=catx["lagpatch"], nmult=self.nmult, enforce_in32=True,
verbose=verbose)
catx.knn(select_initial=True), cat0.positions0,
radiusX=cat0["lagpatch"], radiusKNN=catx["lagpatch"],
nmult=self.nmult, enforce_in32=True, verbose=verbose)
# Remove neighbours whose mass is too large/small
if self.dlogmass is not None:
@ -265,9 +264,9 @@ class RealisationsMatcher:
if overlap:
if verbose:
print("Loading the clump particles", flush=True)
with open(cat0.paths.clump0_path(nsim0), "rb") as f:
with open(cat0.paths.clump0_path(cat0.n_sim), "rb") as f:
clumps0 = numpy.load(f, allow_pickle=True)
with open(catx.paths.clump0_path(nsimx), 'rb') as f:
with open(catx.paths.clump0_path(catx.n_sim), 'rb') as f:
clumpsx = numpy.load(f, allow_pickle=True)
# Convert 3D positions to particle IDs
@ -320,7 +319,7 @@ class RealisationsMatcher:
match_indxs[k] = match_indxs[k][mask]
cross[k] = cross[k][mask]
return cat0["index"], match_indxs, cross
return cat0["index"], catx["index"], match_indxs, cross
###############################################################################

View file

@ -31,38 +31,24 @@ class HaloCatalogue:
----------
paths : py:class:`csiborgtools.read.CSiBORGPaths`
CSiBORG paths-handling object with set `n_sim` and `n_snap`.
min_m500 : float, optional
The minimum :math:`M_{rm 500c} / M_\odot` mass. By default no
threshold.
min_mass : float, optional
The minimum :math:`M_{rm tot} / M_\odot` mass. By default no threshold.
max_dist : float, optional
The maximum comoving distance of a halo. By default no upper limit.
"""
_box = None
_paths = None
_data = None
_knn = None
_knn0 = None
_positions = None
_positions0 = None
_selmask = None
def __init__(self, nsim, min_m500=None, max_dist=None):
def __init__(self, nsim, min_mass=None, max_dist=None):
# Set up paths
paths = CSiBORGPaths(n_sim=nsim)
paths.n_snap = paths.get_maximum_snapshot()
self._paths = paths
self._box = BoxUnits(paths)
min_m500 = 0 if min_m500 is None else min_m500
max_dist = numpy.infty if max_dist is None else max_dist
self._paths = paths
self._set_data(min_m500, max_dist)
# Initialise the KNN at z = 0 and at z = 70
knn = NearestNeighbors()
knn.fit(self.positions)
self._knn = knn
knn0 = NearestNeighbors()
knn0.fit(self.positions0)
self._knn0 = knn0
self._set_data(min_mass, max_dist)
@property
def data(self):
@ -88,17 +74,6 @@ class HaloCatalogue:
"""
return self._box
@property
def cosmo(self):
"""
The box cosmology.
Returns
-------
cosmo : `astropy` cosmology object
"""
return self.box.cosmo
@property
def paths(self):
"""
@ -132,29 +107,23 @@ class HaloCatalogue:
"""
return self.paths.n_sim
@property
def knn(self):
def knn(self, select_initial):
"""
The final snapshot k-nearest neighbour object.
Returns
-------
knn : :py:class:`sklearn.neighbors.NearestNeighbors`
"""
return self._knn
@property
def knn0(self):
"""
The initial snapshot k-nearest neighbour object.
Parameters
----------
select_initial : bool
Whether to define the KNN on the initial or final snapshot.
Returns
-------
knn : :py:class:`sklearn.neighbors.NearestNeighbors`
"""
return self._knn0
knn = NearestNeighbors()
return knn.fit(self.positions0 if select_initial else self.positions)
def _set_data(self, min_m500, max_dist):
def _set_data(self, min_mass, max_dist):
"""
Loads the data, merges with mmain, does various coordinate transforms.
"""
@ -168,7 +137,7 @@ class HaloCatalogue:
data = self.merge_mmain_to_clumps(data, mmain)
flip_cols(data, "peak_x", "peak_z")
# Cut on number of particles and finite m200
# Cut on number of particles and finite m200. Do not change! Hardcoded
data = data[(data["npart"] > 100) & numpy.isfinite(data["m200"])]
# Now also load the initial positions
@ -177,16 +146,15 @@ class HaloCatalogue:
data = self.merge_initmatch_to_clumps(data, initcm)
flip_cols(data, "x0", "z0")
# Calculate redshift
pos = [data["peak_{}".format(p)] - 0.5 for p in ("x", "y", "z")]
vel = [data["v{}".format(p)] for p in ("x", "y", "z")]
zpec = self.box.box2pecredshift(*vel, *pos)
zobs = self.box.box2obsredshift(*vel, *pos)
zcosmo = self.box.box2cosmoredshift(
sum(pos[i]**2 for i in range(3))**0.5)
data = add_columns(data, [zpec, zobs, zcosmo],
["zpec", "zobs", "zcosmo"])
# # Calculate redshift
# pos = [data["peak_{}".format(p)] - 0.5 for p in ("x", "y", "z")]
# vel = [data["v{}".format(p)] for p in ("x", "y", "z")]
# zpec = self.box.box2pecredshift(*vel, *pos)
# zobs = self.box.box2obsredshift(*vel, *pos)
# zcosmo = self.box.box2cosmoredshift(
# sum(pos[i]**2 for i in range(3))**0.5)
# data = add_columns(data, [zpec, zobs, zcosmo],
# ["zpec", "zobs", "zcosmo"])
# Unit conversion
convert_cols = ["m200", "m500", "totpartmass", "mass_mmain",
@ -194,29 +162,15 @@ class HaloCatalogue:
"peak_x", "peak_y", "peak_z"]
data = self.box.convert_from_boxunits(data, convert_cols)
# Cut on mass. Note that this is in Msun
data = data[data["m500"] > min_m500]
# Now calculate spherical coordinates
d, ra, dec = cartesian_to_radec(
data["peak_x"], data["peak_y"], data["peak_z"])
data = add_columns(data, [d, ra, dec], ["dist", "ra", "dec"])
# Cut on separation
data = data[data["dist"] < max_dist]
# Pre-allocate the positions arrays
self._positions = numpy.vstack(
[data["peak_{}".format(p)] for p in ("x", "y", "z")]).T
self._positions = self._positions.astype(numpy.float32)
# And do the unit transform
if initcm is not None:
data = self.box.convert_from_boxunits(
data, ["x0", "y0", "z0", "lagpatch"])
self._positions0 = numpy.vstack(
[data["{}0".format(p)] for p in ("x", "y", "z")]).T
self._positions0 = self._positions0.astype(numpy.float32)
# Convert all that is not an integer to float32
names = list(data.dtype.names)
@ -227,9 +181,13 @@ class HaloCatalogue:
else:
formats.append(numpy.float32)
dtype = numpy.dtype({"names": names, "formats": formats})
data = data.astype(dtype)
self._data = data
# Apply cuts on distance and min500 if any
data = data[data["dist"] < max_dist] if max_dist is not None else data
data = (data[data["totpartmass"] > min_mass]
if min_mass is not None else data)
self._data = data.astype(dtype)
def merge_mmain_to_clumps(self, clumps, mmain):
"""
@ -288,8 +246,8 @@ class HaloCatalogue:
@property
def positions(self):
"""
3D positions of halos in comoving units of Mpc.
r"""
3D positions of halos in :math:`\mathrm{cMpc}`.
Returns
-------
@ -297,12 +255,13 @@ class HaloCatalogue:
Array of shape `(n_halos, 3)`, where the latter axis represents
`x`, `y` and `z`.
"""
return self._positions
return numpy.vstack(
[self._data["peak_{}".format(p)] for p in ("x", "y", "z")]).T
@property
def positions0(self):
r"""
3D positions of halos in the initial snapshot in comoving units of Mpc.
3D positions of halos in the initial snapshot in :math:`\mathrm{cMpc}`.
Returns
-------
@ -310,9 +269,11 @@ class HaloCatalogue:
Array of shape `(n_halos, 3)`, where the latter axis represents
`x`, `y` and `z`.
"""
if self._positions0 is None:
try:
return numpy.vstack(
[self._data["{}".format(p)] for p in ("x0", "y0", "z0")]).T
except KeyError:
raise RuntimeError("Initial positions are not set!")
return self._positions0
@property
def velocities(self):
@ -365,7 +326,7 @@ class HaloCatalogue:
"""
if not (X.ndim == 2 and X.shape[1] == 3):
raise TypeError("`X` must be an array of shape `(n_samples, 3)`.")
knn = self.knn0 if select_initial else self.knn # Pick the right KNN
knn = self.knn(select_initial)
return knn.radius_neighbors(X, radius, sort_results=True)
@property

View file

@ -17,9 +17,8 @@ Tools for summarising various results.
"""
import numpy
import joblib
from os.path import isfile
from os.path import (join, isfile)
from tqdm import tqdm
from .make_cat import HaloCatalogue
class PKReader:
@ -171,73 +170,64 @@ class PKReader:
class OverlapReader:
"""
r"""
A shortcut object for reading in the results of matching two simulations.
Parameters
----------
nsim0 : int
The reference simulation ID.
nsimx : int
The cross simulation ID.
cat0, catx: :py:class:`csiborgtools.read.HaloCatalogue`
Halo catalogues corresponding to the reference and cross
simulations.
fskel : str, optional
Path to the overlap. By default `None`, i.e.
`/mnt/extraspace/rstiskalek/csiborg/overlap/cross_{}_{}.npz`.
min_mass : float, optional
The minimum :math:`M_{\rm tot} / M_\odot` mass. By default no
threshold.
max_dist : float, optional
The maximum comoving distance of a halo. By default no upper limit.
"""
def __init__(self, nsim0, nsimx, fskel=None):
if fskel is None:
fskel = "/mnt/extraspace/rstiskalek/csiborg/overlap/"
fskel += "cross_{}_{}.npz"
_cat0 = None
_catx = None
_refmask = None
fpath = fskel.format(nsim0, nsimx)
fpath_inv = fskel.format(nsimx, nsim0)
is_inverted = False
def __init__(self, cat0, catx, fskel=None, min_mass=None, max_dist=None):
self._cat0 = cat0
self._catx = catx
if fskel is None:
fskel = join("/mnt/extraspace/rstiskalek/csiborg/overlap",
"cross_{}_{}.npz")
fpath = fskel.format(cat0.n_sim, catx.n_sim)
fpath_inv = fskel.format(catx.n_sim, cat0.n_sim)
if isfile(fpath):
pass
is_inverted = False
elif isfile(fpath_inv):
fpath = fpath_inv
is_inverted = True
nsim0, nsimx = nsimx, nsim0
else:
raise FileNotFoundError(
"No overlap file found for combination `{}` and `{}`."
.format(nsim0, nsimx))
.format(cat0.n_sim, catx.n_sim))
# We can set catalogues already now even if inverted
self._set_cats(nsim0, nsimx)
print(is_inverted)
data = numpy.load(fpath, allow_pickle=True)
if is_inverted:
inv_match_indxs, inv_overlap = self._invert_match(
data["match_indxs"], data["cross"], self.cat0["index"].size,)
# Overwrite the original file and store as a dictionary
data = {"indxs": self.cat0["index"],
"match_indxs": inv_match_indxs,
"cross": inv_overlap,
}
self._data = data
data["match_indxs"], data["overlap"],
data["cross_indxs"].size,)
self._data = {
"index": data["cross_indxs"],
"match_indxs": inv_match_indxs,
"overlap": inv_overlap}
else:
self._data = {
"index": data["ref_indxs"],
"match_indxs": data["match_indxs"],
"overlap": data["overlap"]}
@property
def nsim0(self):
"""
The reference simulation ID.
Returns
-------
nsim0 : int
"""
return self._nsim0
@property
def nsimx(self):
"""
The cross simulation ID.
Returns
-------
nsimx : int
"""
return self._nsimx
self._make_refmask(min_mass, max_dist)
@property
def cat0(self):
@ -261,24 +251,6 @@ class OverlapReader:
"""
return self._catx
def _set_cats(self, nsim0, nsimx):
"""
Set the simulation IDs and catalogues.
Parameters
----------
nsim0, nsimx : int
The reference and cross simulation IDs.
Returns
-------
None
"""
self._nsim0 = nsim0
self._nsimx = nsimx
self._cat0 = HaloCatalogue(nsim0)
self._catx = HaloCatalogue(nsimx)
@staticmethod
def _invert_match(match_indxs, overlap, cross_size):
"""
@ -304,8 +276,8 @@ class OverlapReader:
The corresponding overlaps to `inv_match_indxs`.
"""
# 1. Invert the match. Each reference halo has a list of counterparts
# so loop over those to each counterpart assign a reference halo.
# Add the same time also add the overlaps
# so loop over those to each counterpart assign a reference halo
# and at the same time also add the overlaps
inv_match_indxs = [[] for __ in range(cross_size)]
inv_overlap = [[] for __ in range(cross_size)]
for ref_id in range(match_indxs.size):
@ -330,6 +302,35 @@ class OverlapReader:
return inv_match_indxs, inv_overlap
def _make_refmask(self, min_mass, max_dist):
r"""
Create a mask for the reference catalogue that accounts for the mass
and distance cuts. Note that *no* masking is applied to the cross
catalogue.
Parameters
----------
min_mass : float, optional
The minimum :math:`M_{rm tot} / M_\odot` mass.
max_dist : float, optional
The maximum comoving distance of a halo.
Returns
-------
None
"""
# Enforce a cut on the reference catalogue
min_mass = 0 if min_mass is None else min_mass
max_dist = numpy.infty if max_dist is None else max_dist
m = ((self.cat0["totpartmass"] > min_mass)
& (self.cat0["dist"] < max_dist))
# Now remove indices that are below this cut
self._data["index"] = self._data["index"][m]
self._data["match_indxs"] = self._data["match_indxs"][m]
self._data["overlap"] = self._data["overlap"][m]
self._refmask = m
@property
def indxs(self):
"""
@ -339,7 +340,7 @@ class OverlapReader:
-------
indxs : 1-dimensional array
"""
return self._data["indxs"]
return self._data["index"]
@property
def match_indxs(self):
@ -361,47 +362,73 @@ class OverlapReader:
-------
overlap : array of 1-dimensional arrays of shape `(nhalos, )`
"""
return self._data["cross"]
return self._data["overlap"]
def dist(self, in_initial, norm=None):
@property
def refmask(self):
"""
Final snapshot pair distances.
Mask of the reference catalogue to match the calculated overlaps.
Returns
-------
refmask : 1-dimensional boolean array
"""
return self._refmask
def dist(self, in_initial, norm_kind=None):
"""
Pair distances of matched halos between the reference and cross
simulations.
Parameters
----------
in_initial : bool
Whether to calculate separation in the initial or final snapshot.
norm_kind : str, optional
The kind of normalisation to apply to the distances. Can be `r200`,
`ref_patch` or `sum_patch`.
Returns
-------
dist : array of 1-dimensional arrays of shape `(nhalos, )`
"""
assert norm is None or norm in ("r200", "ref_patch", "sum_patch")
# Positions either in the initial or final snapshot
assert (norm_kind is None
or norm_kind in ("r200", "ref_patch", "sum_patch"))
# Get positions either in the initial or final snapshot
if in_initial:
pos0 = self.cat0.positions0
posx = self.catx.positions0
pos0, posx = self.cat0.positions0, self.catx.positions0
else:
pos0 = self.cat0.positions
posx = self.catx.positions
pos0, posx = self.cat0.positions, self.catx.positions
pos0 = pos0[self.refmask, :] # Apply the reference catalogue mask
# Get the normalisation array if applicable
if norm_kind == "r200":
norm = self.cat0["r200"][self.refmask]
if norm_kind == "ref_patch":
norm = self.cat0["lagpatch"][self.refmask]
if norm_kind == "sum_patch":
patch0 = self.cat0["lagpatch"][self.refmask]
patchx = self.catx["lagpatch"]
norm = [None] * self.indxs.size
for i, ind in enumerate(self.match_indxs):
norm[i] = patch0[i] + patchx[ind]
norm = numpy.array(norm, dtype=object)
# Now calculate distances
dist = [None] * self.indxs.size
for n, ind in enumerate(self.match_indxs):
dist[n] = numpy.linalg.norm(pos0[n, :] - posx[ind, :], axis=1)
for i, ind in enumerate(self.match_indxs):
# n refers to the reference halo catalogue position
dist[i] = numpy.linalg.norm(pos0[i, :] - posx[ind, :], axis=1)
if norm_kind is not None:
dist[i] /= norm[i]
# Normalisation
if norm == "r200":
dist[n] /= self.cat0["r200"][n]
if norm == "ref_patch":
dist[n] /= self.cat0["lagpatch"][n]
if norm == "sum_patch":
dist[n] /= (self.cat0["lagpatch"][n]
+ self.catx["lagpatch"][ind])
return numpy.array(dist, dtype=object)
def mass_ratio(self, mass_kind="totpartmass", in_log=True, in_abs=True):
"""
Pair mass ratio.
Pair mass ratio of matched halos between the reference and cross
simulations.
Parameters
----------
@ -418,16 +445,16 @@ class OverlapReader:
-------
ratio : array of 1-dimensional arrays of shape `(nhalos, )`
"""
mass0 = self.cat0[mass_kind]
mass0 = self.cat0[mass_kind][self.refmask]
massx = self.catx[mass_kind]
ratio = [None] * self.indxs.size
for n, ind in enumerate(self.match_indxs):
ratio[n] = mass0[n] / massx[ind]
for i, ind in enumerate(self.match_indxs):
ratio[i] = mass0[i] / massx[ind]
if in_log:
ratio[n] = numpy.log10(ratio[n])
ratio[i] = numpy.log10(ratio[i])
if in_abs:
ratio[n] = numpy.abs(ratio[n])
ratio[i] = numpy.abs(ratio[i])
return numpy.array(ratio, dtype=object)
def summed_overlap(self):
@ -456,9 +483,10 @@ class OverlapReader:
-------
out : 1-dimensional array of shape `(nhalos, )`
"""
vals = self.cat0[par][self.refmask]
out = [None] * self.indxs.size
for n, ind in enumerate(self.match_indxs):
out[n] = numpy.ones(ind.size) * self.cat0[par][n]
for i, ind in enumerate(self.match_indxs):
out[i] = numpy.ones(ind.size) * vals[i]
return numpy.array(out, dtype=object)
def prob_nomatch(self):
@ -504,14 +532,13 @@ class OverlapReader:
massx = self.catx[mass_kind] # Create references to the arrays here
overlap = self.overlap # to speed up the loop below.
# Is the iterator verbose?
for n, match_ind in enumerate((self.match_indxs)):
for i, match_ind in enumerate(self.match_indxs):
# Skip if no match
if match_ind.size == 0:
continue
massx_ = massx[match_ind] # Again just create references
overlap_ = overlap[n] # to the appropriate elements
overlap_ = overlap[i] # to the appropriate elements
# Optionally apply overlap threshold
if overlap_threshold > 0.:
@ -530,8 +557,8 @@ class OverlapReader:
mean_ = 10**mean_ if in_log else mean_
std_ = mean_ * std_ * numpy.log(10) if in_log else std_
mean[n] = mean_
std[n] = std_
mean[i] = mean_
std[i] = std_
return mean, std

View file

@ -46,12 +46,13 @@ catx = csiborgtools.read.HaloCatalogue(args.nsimx)
matcher = csiborgtools.match.RealisationsMatcher()
print("{}: crossing the simulations.".format(datetime.now()), flush=True)
indxs, match_indxs, cross = matcher.cross(
args.nsim0, args.nsimx, cat0, catx, overlap=args.overlap)
ref_indxs, cross_indxs, match_indxs, overlap = matcher.cross(
cat0, catx, overlap=args.overlap)
# Dump the result
print("Saving results to `{}`.".format(fout), flush=True)
with open(fout, "wb") as f:
numpy.savez(fout, indxs=indxs, match_indxs=match_indxs, cross=cross)
numpy.savez(fout, ref_indxs=ref_indxs, cross_indxs=cross_indxs,
match_indxs=match_indxs, overlap=overlap)
print("All finished.", flush=True)