Overlap calculation (#18)

* Move cosine similarity out

* Basic overlap calculation

* add overlap import

* Add clump0 dict path

* Update README

* fix path bug

* Save as dict instead

* Change format to array of arrays

* Update paths

* Take fewer ICs for now

* Change to structured array

* Add overlap calculation

* Start saving IDs

* Add a blank space

* Update TODO
This commit is contained in:
Richard Stiskalek 2022-12-19 11:58:22 +01:00 committed by GitHub
parent 13a9d11afe
commit 18f09767f4
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 297 additions and 65 deletions

View file

@ -7,14 +7,11 @@
## Short-term TODO ## Short-term TODO
- [x] Add code to calculate the cross-correlation for resolved region only. - [ ] Implement the CIC binning.
- [ ] Calculate the spectra for all 101 boxes and visualise them. - [ ] Write a script to perform the matching on a node.
- [ ] See about the $z=70$ particles.
## Long-term TODO ## Long-term TODO
- [ ] Calculate the cross-correlation in CSiBORG. Should see the scale of the constraints?
- [ ] Find the distribution of particles in the first snapshot
- [ ] Implement a custom model for matchin galaxies to halos. - [ ] Implement a custom model for matchin galaxies to halos.

View file

@ -13,6 +13,6 @@
# with this program; if not, write to the Free Software Foundation, Inc., # with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
from .match import (brute_spatial_separation, RealisationsMatcher) # noqa from .match import (brute_spatial_separation, RealisationsMatcher, cosine_similarity, ParticleOverlap) # noqa
from .num_density import (binned_counts, number_density) # noqa from .num_density import (binned_counts, number_density) # noqa
# from .correlation import (get_randoms_sphere, sphere_angular_tpcf) # noqa # from .correlation import (get_randoms_sphere, sphere_angular_tpcf) # noqa

View file

@ -79,6 +79,9 @@ class RealisationsMatcher:
---------- ----------
cats : :py:class`csiborgtools.read.CombinedHaloCatalogue` cats : :py:class`csiborgtools.read.CombinedHaloCatalogue`
Combined halo catalogue to search. Combined halo catalogue to search.
# NOTE add later
# dtype : dtype, optional
# Output precision. By default `numpy.float32`.
""" """
_cats = None _cats = None
@ -122,7 +125,216 @@ class RealisationsMatcher:
""" """
return [i for i in range(self.cats.N) if i != n_sim] return [i for i in range(self.cats.N) if i != n_sim]
def cosine_similarity(self, x, y): def _check_masskind(self, mass_kind):
"""Check that `mass_kind` is a valid key."""
if mass_kind not in self.cats[0].keys:
raise ValueError("Invalid mass kind `{}`.".format(mass_kind))
@staticmethod
def _cat2clump_mapping(cat_indxs, clump_indxs):
"""
Create a mapping from a catalogue array index to a clump array index.
Parameters
----------
cat_indxs : 1-dimensional array
Clump indices in the catalogue array.
clump_indxs : 1-dimensional array
Clump indices in the clump array.
Returns
-------
mapping : 1-dimensional array
Mapping. The array indices match catalogue array and values are
array positions in the clump array.
"""
mapping = numpy.full(cat_indxs.size, numpy.nan, dtype=int)
__, ind1, ind2 = numpy.intersect1d(clump_indxs, cat_indxs,
return_indices=True)
mapping[ind2] = ind1
return mapping
def cross_knn_position_single(self, n_sim, nmult=5, dlogmass=None,
mass_kind="totpartmass", init_dist=False,
overlap=False, verbose=True):
r"""
Find all neighbours within :math:`n_{\rm mult} R_{200c}` of halos in
the `nsim`th simulation. Also enforces that the neighbours'
:math:`\log M / M_\odot` be within `dlogmass` dex.
Parameters
----------
n_sim : int
Index of an IC realisation in `self.cats` whose halos' neighbours
in the remaining simulations to search for.
nmult : float or int, optional
Multiple of :math:`R_{200c}` within which to return neighbours. By
default 5.
dlogmass : float, optional
Tolerance on mass logarithmic mass difference. By default `None`.
mass_kind : str, optional
The mass kind whose similarity is to be checked. Must be a valid
catalogue key. By default `totpartmass`, i.e. the total particle
mass associated with a halo.
init_dist : bool, optional
Whether to calculate separation of the initial CMs. By default
`False`.
overlap : bool, optional
Whether to calculate overlap between clumps in the initial
snapshot. By default `False`. Note that this operation is
substantially slower.
verbose : bool, optional
Iterator verbosity flag. By default `True`.
Returns
-------
matches : composite array
Array, indices are `(n_sims - 1, 4, n_halos, n_matches)`. The
2nd axis is `index` of the neighbouring halo in its catalogue,
`dist` is the 3D distance to the halo whose neighbours are
searched, `dist0` is the separation of the initial CMs and
`overlap` is the overlap over the initial clumps, all respectively.
The latter two are calculated only if `init_dist` or `overlap` is
`True`.
TODO:
- [ ] Precalculate the mapping from halo index to clump array position
"""
self._check_masskind(mass_kind)
# Radius, mass and positions of halos in `n_sim` IC realisation
logmass = numpy.log10(self.cats[n_sim][mass_kind])
R = self.cats[n_sim]["r200"]
pos = self.cats[n_sim].positions
if init_dist:
pos0 = self.cats[n_sim].positions0 # These are CM positions
if overlap:
if verbose:
print("Loading initial clump particles for `n_sim = {}`."
.format(n_sim))
# Grab a paths object. What it is set to is unimportant
paths = self.cats[0].paths
with open(paths.clump0_path(self.cats.n_sims[n_sim]), "rb") as f:
clumps0 = numpy.load(f, allow_pickle=True)
overlapper = ParticleOverlap()
cat2clumps0 = self._cat2clump_mapping(self.cats[n_sim]["index"],
clumps0["ID"])
matches = [None] * (self.cats.N - 1)
# Verbose iterator
if verbose:
iters = enumerate(tqdm(self.search_sim_indices(n_sim)))
else:
iters = enumerate(self.search_sim_indices(n_sim))
iters = enumerate(self.search_sim_indices(n_sim))
# Search for neighbours in the other simulations
for count, i in iters:
dist, indxs = self.cats[i].radius_neigbours(pos, R * nmult)
# Get rid of neighbors whose mass is too off
if dlogmass is not None:
for j, indx in enumerate(indxs):
match_logmass = numpy.log10(self.cats[i][mass_kind][indx])
mask = numpy.abs(match_logmass - logmass[j]) < dlogmass
dist[j] = dist[j][mask]
indxs[j] = indx[mask]
# Find distance to the between the initial CM
dist0 = [numpy.asanyarray([], dtype=numpy.float64)] * dist.size
if init_dist:
with_neigbours = numpy.where([ii.size > 0 for ii in indxs])[0]
# Fill the pre-allocated array on positions with neighbours
for k in with_neigbours:
dist0[k] = numpy.linalg.norm(
pos0[k] - self.cats[i].positions0[indxs[k]], axis=1)
# Calculate the initial snapshot overlap
cross = [numpy.asanyarray([], dtype=numpy.float64)] * dist.size
if overlap:
if verbose:
print("Loading initial clump particles for `n_sim = {}` "
"to compare against `n_sim = {}`.".format(i, n_sim))
with open(paths.clump0_path(self.cats.n_sims[i]), 'rb') as f:
clumpsx = numpy.load(f, allow_pickle=True)
cat2clumpsx = self._cat2clump_mapping(self.cats[i]["index"],
clumpsx["ID"])
# Loop only over halos that have neighbours
with_neigbours = numpy.where([ii.size > 0 for ii in indxs])[0]
for k in tqdm(with_neigbours) if verbose else with_neigbours:
# Find which clump matches index of this halo from cat
match0 = cat2clumps0[k]
# Get the clump and pre-calculate its cell assignment
cl0 = clumps0["clump"][match0]
cl0_cells = overlapper.assign_to_cell(
*(cl0[p] for p in ('x', 'y', 'z')))
dint = numpy.full(indxs[k].size, numpy.nan, numpy.float64)
# Loop over the ones we cross-correlate with
for ii, ind in enumerate(indxs[k]):
# Again which cross clump to this index
matchx = cat2clumpsx[ind]
dint[ii] = overlapper.mass_overlap(
cl0, clumpsx["clump"][matchx], cl0_cells)
cross[k] = dint
# Append as a composite array
matches[count] = numpy.asarray(
[indxs, dist, dist0, cross], dtype=object)
return numpy.asarray(matches, dtype=object)
def cross_knn_position_all(self, nmult=5, dlogmass=None,
mass_kind="totpartmass", init_dist=False,
overlap=False, verbose=True):
r"""
Find all neighbours within :math:`n_{\rm mult} R_{200c}` of halos in
all simulations listed in `self.cats`. Also enforces that the
neighbours' :math:`\log M_{200c}` be within `dlogmass` dex.
Parameters
----------
nmult : float or int, optional
Multiple of :math:`R_{200c}` within which to return neighbours. By
default 5.
dlogmass : float, optional
Tolerance on mass logarithmic mass difference. By default `None`.
mass_kind : str, optional
The mass kind whose similarity is to be checked. Must be a valid
catalogue key. By default `totpartmass`, i.e. the total particle
mass associated with a halo.
init_dist : bool, optional
Whether to calculate separation of the initial CMs. By default
`False`.
overlap : bool, optional
Whether to calculate overlap between clumps in the initial
snapshot. By default `False`. Note that this operation is
substantially slower.
verbose : bool, optional
Iterator verbosity flag. By default `True`.
Returns
-------
matches : list of composite arrays
List whose length is `n_sims`. For description of its elements see
`self.cross_knn_position_single(...)`.
"""
N = self.cats.N # Number of catalogues
matches = [None] * N
# Loop over each catalogue
for i in trange(N) if verbose else range(N):
matches[i] = self.cross_knn_position_single(
i, nmult, dlogmass, mass_kind=mass_kind, init_dist=init_dist,
overlap=overlap, verbose=verbose)
return matches
###############################################################################
# Matching statistics #
###############################################################################
def cosine_similarity(x, y):
r""" r"""
Calculate the cosine similarity between two Cartesian vectors. Defined Calculate the cosine similarity between two Cartesian vectors. Defined
as :math:`\Sum_{i} x_i y_{i} / (|x| * |y|)`. as :math:`\Sum_{i} x_i y_{i} / (|x| * |y|)`.
@ -152,104 +364,96 @@ class RealisationsMatcher:
return out[0] return out[0]
return out return out
def cross_knn_position_single(self, n_sim, nmult=5, dlogmass=None,
init_dist=False, verbose=True):
r"""
Find all neighbours within :math:`n_{\rm mult} R_{200c}` of halos in
the `nsim`th simulation. Also enforces that the neighbours'
:math:`\log M_{200c}` be within `dlogmass` dex.
Parameters class ParticleOverlap:
---------- """
n_sim : int TODO:
Index of an IC realisation in `self.cats` whose halos' neighbours - [ ] Class documentation
in the remaining simulations to search for. """
nmult : float or int, optional _bins = None
Multiple of :math:`R_{200c}` within which to return neighbours. By
default 5. def __init__(self, bins=None):
dlogmass : float, optional if bins is None:
Tolerance on mass logarithmic mass difference. By default `None`. dx = 1 / 2**11
init_dist : bool, optional bins = numpy.arange(0, 1 + dx, dx)
Whether to calculate separation of the initial CMs. By default self.bins = bins
`False`.
verbose : bool, optional @property
Iterator verbosity flag. By default `True`. def bins(self):
"""
The grid spacing. Assumed to be equal for all three dimensions. Units
ought to match the requested coordinates.
Returns Returns
------- -------
matches : composite array bins : 1-dimensional array
Array, indices are `(n_sims - 1, 3, n_halos, n_matches)`. The
2nd axis is `index` of the neighbouring halo in its catalogue,
`dist`, which is the 3D distance to the halo whose neighbours are
searched, and `dist0` which is the separation of the initial CMs.
The latter is calculated only if `init_dist` is `True`.
""" """
# Radius, M200c and positions of halos in `n_sim` IC realisation return self._bins
logm200 = numpy.log10(self.cats[n_sim]["m200"])
R = self.cats[n_sim]["r200"]
pos = self.cats[n_sim].positions
if init_dist:
pos0 = self.cats[n_sim].positions0 # These are CM positions
matches = [None] * (self.cats.N - 1)
# Verbose iterator
if verbose:
iters = enumerate(tqdm(self.search_sim_indices(n_sim)))
else:
iters = enumerate(self.search_sim_indices(n_sim))
# Search for neighbours in the other simulations
for count, i in iters:
dist, indxs = self.cats[i].radius_neigbours(pos, R * nmult)
# Get rid of neighbors whose mass is too off
if dlogmass is not None:
for j, indx in enumerate(indxs):
match_logm200 = numpy.log10(self.cats[i]["m200"][indx])
mask = numpy.abs(match_logm200 - logm200[j]) < dlogmass
dist[j] = dist[j][mask]
indxs[j] = indx[mask]
# Find distance to the between the initial CM
dist0 = [numpy.asanyarray([], dtype=numpy.float64)] * dist.size
if init_dist:
with_neigbours = numpy.where([ii.size > 0 for ii in indxs])[0]
# Fill the pre-allocated array on positions with neighbours
for k in with_neigbours:
dist0[k] = numpy.linalg.norm(
pos0[k] - self.cats[i].positions0[indxs[k]], axis=1)
# Append as a composite array @bins.setter
matches[count] = numpy.asarray([indxs, dist, dist0], dtype=object) def bins(self, bins):
"""Sets `bins`."""
bins = numpy.asarray(bins) if isinstance(bins, list) else bins
assert bins.ndim == 1, "`bins` must be a 1-dimensional array."
self._bins = bins
return numpy.asarray(matches, dtype=object) def assign_to_cell(self, x, y, z):
"""
def cross_knn_position_all(self, nmult=5, dlogmass=None, Assign particles specified by coordinates `x`, `y`, and `z` to grid
init_dist=False, verbose=True): cells.
r"""
Find all neighbours within :math:`n_{\rm mult} R_{200c}` of halos in
all simulations listed in `self.cats`. Also enforces that the
neighbours' :math:`\log M_{200c}` be within `dlogmass` dex.
Parameters Parameters
---------- ----------
nmult : float or int, optional x, y, z : 1-dimensional arrays
Multiple of :math:`R_{200c}` within which to return neighbours. By Positions of particles in the box.
default 5.
dlogmass : float, optional
Tolerance on mass logarithmic mass difference. By default `None`.
init_dist : bool, optional
Whether to calculate separation of the initial CMs. By default
`False`.
verbose : bool, optional
Iterator verbosity flag. By default `True`.
Returns Returns
------- -------
matches : list of composite arrays cells : 1-dimensional array
List whose length is `n_sims`. For description of its elements see Cell ID of each particle.
`self.cross_knn_position_single(...)`.
""" """
N = self.cats.N # Number of catalogues assert x.ndim == 1 and x.size == y.size == z.size
matches = [None] * N xbin = numpy.digitize(x, self.bins)
# Loop over each catalogue ybin = numpy.digitize(y, self.bins)
for i in trange(N) if verbose else range(N): zbin = numpy.digitize(z, self.bins)
matches[i] = self.cross_knn_position_single( N = self.bins.size
i, nmult, dlogmass, init_dist)
return matches return xbin + ybin * N + zbin * N**2
def mass_overlap(self, clump1, clump2, cells1=None):
r"""
Calculate the particle, mass-weighted overlap between two halos.
Defined as
..math::
(M_{u,1} + M_{u,2}) / (M_1 + M_2),
where :math:`M_{u, 1}` is the mass of particles of the first halo in
cells that are also present in the second halo and :math:`M_1` is the
total particle mass of the first halo.
Parameters
----------
clump1, clump2 : structured arrays
Structured arrays corresponding to the two clumps. Should contain
keys `x`, `y`, `z` and `M`.
cells1 : 1-dimensional array, optional
Optionlaly precomputed cells of `clump1`. Be careful when using
this to ensure it matches `clump1`.
Returns
-------
overlap : float
"""
# 1-dimensional cell ID of each particle in clump1 and clump2
if cells1 is None:
cells1 = self.assign_to_cell(*[clump1[p] for p in ('x', 'y', 'z')])
cells2 = self.assign_to_cell(*[clump2[p] for p in ('x', 'y', 'z')])
# Elementwise cells1 in cells2 and vice versa
m1 = numpy.isin(cells1, cells2)
m2 = numpy.isin(cells2, cells1)
# Summed shared mass and the total
interp = numpy.sum(clump1["M"][m1]) + numpy.sum(clump2["M"][m2])
mtot = numpy.sum(clump1["M"]) + numpy.sum(clump2["M"])
return interp / mtot

View file

@ -378,7 +378,8 @@ class CombinedHaloCatalogue:
def __init__(self, paths, min_m500=None, max_dist=None, verbose=True): def __init__(self, paths, min_m500=None, max_dist=None, verbose=True):
# Read simulations and their maximum snapshots # Read simulations and their maximum snapshots
# NOTE later change this back to all simulations # NOTE later change this back to all simulations
self._n_sims = [7468, 7588, 8020, 8452, 8836] # self._n_sims = [7468, 7588, 8020, 8452, 8836]
self._n_sims = [7468, 7588]
# self._n_sims = paths.ic_ids # self._n_sims = paths.ic_ids
n_snaps = [paths.get_maximum_snapshot(i) for i in self._n_sims] n_snaps = [paths.get_maximum_snapshot(i) for i in self._n_sims]
self._n_snaps = numpy.asanyarray(n_snaps) self._n_snaps = numpy.asanyarray(n_snaps)

View file

@ -72,7 +72,6 @@ class CSiBORGPaths:
_initmatch_path = None _initmatch_path = None
_to_new = None _to_new = None
# NOTE deuglify this stuff
def __init__(self, n_sim=None, n_snap=None, srcdir=None, dumpdir=None, def __init__(self, n_sim=None, n_snap=None, srcdir=None, dumpdir=None,
mmain_path=None, initmatch_path=None, to_new=False): mmain_path=None, initmatch_path=None, to_new=False):
if srcdir is None: if srcdir is None:
@ -425,6 +424,24 @@ class CSiBORGPaths:
n_sim = self.get_n_sim(n_sim) n_sim = self.get_n_sim(n_sim)
return min(self.get_snapshots(n_sim)) return min(self.get_snapshots(n_sim))
def clump0_path(self, nsim):
"""
Path to a single dumped clump's particles. This is expected to point
to a dictonary whose keys are the clump indices and items structured
arrays with the clump's particles in the initial snapshot.
Parameters
----------
nsim : int
Index of the initial conditions (IC) realisation.
Returns
-------
path : str
"""
cdir = join(self.dumpdir, "initmatch")
return join(cdir, "clump_{}_{}.npy".format(nsim, "particles"))
def snapshot_path(self, n_snap=None, n_sim=None): def snapshot_path(self, n_snap=None, n_sim=None):
""" """
Path to a CSiBORG IC realisation snapshot. Path to a CSiBORG IC realisation snapshot.
@ -839,7 +856,7 @@ def read_mmain(n, srcdir, fname="Mmain_{}.npy"):
return out return out
def read_initcm(n, srcdir, fname="clump_cm_{}.npy"): def read_initcm(n, srcdir, fname="clump_{}_cm.npy"):
""" """
Read `clump_cm`, i.e. the center of mass of a clump at redshift z = 70. Read `clump_cm`, i.e. the center of mass of a clump at redshift z = 70.
If the file does not exist returns `None`. If the file does not exist returns `None`.

View file

@ -24,8 +24,7 @@ import numpy
from datetime import datetime from datetime import datetime
from mpi4py import MPI from mpi4py import MPI
from distutils.util import strtobool from distutils.util import strtobool
from os.path import join, isdir from os.path import join
from os import mkdir
from os import remove from os import remove
from sys import stdout from sys import stdout
from gc import collect from gc import collect
@ -51,21 +50,15 @@ fin_paths = csiborgtools.read.CSiBORGPaths(to_new=False)
nsims = init_paths.ic_ids nsims = init_paths.ic_ids
# Output files # Output files
dumpdir = "/mnt/extraspace/rstiskalek/csiborg/initmatch" dumpdir = "/mnt/extraspace/rstiskalek/csiborg/"
ftemp = join(dumpdir, "temp", "temp_{}_{}.npy") ftemp = join(dumpdir, "temp_initmatch", "temp_{}_{}_{}.npy")
fperm = join(dumpdir, "clump_cm_{}.npy") fpermcm = join(dumpdir, "initmatch", "clump_{}_cm.npy")
fpermpart = join(dumpdir, "initmatch", "clump_{}_particles.npy")
for nsim in nsims: for nsim in nsims:
if rank == 0: if rank == 0:
print("{}: reading simulation {}.".format(datetime.now(), nsim)) print("{}: reading simulation {}.".format(datetime.now(), nsim))
stdout.flush() stdout.flush()
# Check that the output folder for this sim exists
clumpdumpdir = join(dumpdir, "out_{}".format(nsim))
if args.dump_clumps and rank == 0 and not isdir(clumpdumpdir):
mkdir(clumpdumpdir)
# Barrier to make sure we created the directory with the rank 0
comm.Barrier()
# Set the snapshot numbers # Set the snapshot numbers
init_paths.set_info(nsim, init_paths.get_minimum_snapshot(nsim)) init_paths.set_info(nsim, init_paths.get_minimum_snapshot(nsim))
@ -111,15 +104,16 @@ for nsim in nsims:
cm = numpy.asanyarray( cm = numpy.asanyarray(
[numpy.average(x0[p], weights=x0["M"]) for p in ('x', 'y', 'z')]) [numpy.average(x0[p], weights=x0["M"]) for p in ('x', 'y', 'z')])
# Dump the center of mass # Dump the center of mass
with open(ftemp.format(nsim, n), 'wb') as f: with open(ftemp.format(nsim, n, "cm"), 'wb') as f:
numpy.save(f, cm) numpy.save(f, cm)
# Optionally dump the entire clump # Optionally dump the entire clump
if args.dump_clumps: if args.dump_clumps:
fout = join(clumpdumpdir, "clump_{}.npy".format(n)) with open(ftemp.format(nsim, n, "clump"), "wb") as f:
stdout.flush()
with open(fout, "wb") as f:
numpy.save(f, x0) numpy.save(f, x0)
del part0, clump_ids
collect()
comm.Barrier() comm.Barrier()
if rank == 0: if rank == 0:
print("Collecting CM files...") print("Collecting CM files...")
@ -130,14 +124,33 @@ for nsim in nsims:
out = numpy.full(njobs, numpy.nan, dtype=dtype) out = numpy.full(njobs, numpy.nan, dtype=dtype)
for i, n in enumerate(unique_clumpids): for i, n in enumerate(unique_clumpids):
with open(ftemp.format(nsim, n), 'rb') as f: fpath = ftemp.format(nsim, n, "cm")
with open(fpath, 'rb') as f:
fin = numpy.load(f) fin = numpy.load(f)
out['x'][i] = fin[0] out['x'][i] = fin[0]
out['y'][i] = fin[1] out['y'][i] = fin[1]
out['z'][i] = fin[2] out['z'][i] = fin[2]
out["ID"][i] = n out["ID"][i] = n
remove(ftemp.format(nsim, n)) remove(fpath)
print("Dumping CM files to .. `{}`.".format(fpermcm.format(nsim)))
print("Dumping CM files to .. `{}`.".format(fperm.format(nsim))) with open(fpermcm.format(nsim), 'wb') as f:
with open(fperm.format(nsim), 'wb') as f:
numpy.save(f, out) numpy.save(f, out)
print("Collecting clump files...")
stdout.flush()
out = [None] * unique_clumpids.size
dtype = {"names": ["clump", "ID"], "formats": [object, numpy.int32]}
out = numpy.full(unique_clumpids.size, numpy.nan, dtype=dtype)
for i, n in enumerate(unique_clumpids):
fpath = ftemp.format(nsim, n, "clump")
with open(fpath, 'rb') as f:
fin = numpy.load(f)
out["clump"][i] = fin
out["ID"][i] = n
remove(fpath)
print("Dumping clump files to .. `{}`.".format(fpermpart.format(nsim)))
with open(fpermpart.format(nsim), "wb") as f:
numpy.save(f, out)
del out
collect()