Overlap calculation (#18)

* Move cosine similarity out

* Basic overlap calculation

* add overlap import

* Add clump0 dict path

* Update README

* fix path bug

* Save as dict instead

* Change format to array of arrays

* Update paths

* Take fewer ICs for now

* Change to structured array

* Add overlap calculation

* Start saving IDs

* Add a blank space

* Update TODO
This commit is contained in:
Richard Stiskalek 2022-12-19 11:58:22 +01:00 committed by GitHub
parent 13a9d11afe
commit 18f09767f4
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 297 additions and 65 deletions

View file

@ -7,14 +7,11 @@
## Short-term TODO ## Short-term TODO
- [x] Add code to calculate the cross-correlation for resolved region only. - [ ] Implement the CIC binning.
- [ ] Calculate the spectra for all 101 boxes and visualise them. - [ ] Write a script to perform the matching on a node.
- [ ] See about the $z=70$ particles.
## Long-term TODO ## Long-term TODO
- [ ] Calculate the cross-correlation in CSiBORG. Should see the scale of the constraints?
- [ ] Find the distribution of particles in the first snapshot
- [ ] Implement a custom model for matchin galaxies to halos. - [ ] Implement a custom model for matchin galaxies to halos.

View file

@ -13,6 +13,6 @@
# with this program; if not, write to the Free Software Foundation, Inc., # with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
from .match import (brute_spatial_separation, RealisationsMatcher) # noqa from .match import (brute_spatial_separation, RealisationsMatcher, cosine_similarity, ParticleOverlap) # noqa
from .num_density import (binned_counts, number_density) # noqa from .num_density import (binned_counts, number_density) # noqa
# from .correlation import (get_randoms_sphere, sphere_angular_tpcf) # noqa # from .correlation import (get_randoms_sphere, sphere_angular_tpcf) # noqa

View file

@ -79,6 +79,9 @@ class RealisationsMatcher:
---------- ----------
cats : :py:class`csiborgtools.read.CombinedHaloCatalogue` cats : :py:class`csiborgtools.read.CombinedHaloCatalogue`
Combined halo catalogue to search. Combined halo catalogue to search.
# NOTE add later
# dtype : dtype, optional
# Output precision. By default `numpy.float32`.
""" """
_cats = None _cats = None
@ -122,42 +125,42 @@ class RealisationsMatcher:
""" """
return [i for i in range(self.cats.N) if i != n_sim] return [i for i in range(self.cats.N) if i != n_sim]
def cosine_similarity(self, x, y): def _check_masskind(self, mass_kind):
r""" """Check that `mass_kind` is a valid key."""
Calculate the cosine similarity between two Cartesian vectors. Defined if mass_kind not in self.cats[0].keys:
as :math:`\Sum_{i} x_i y_{i} / (|x| * |y|)`. raise ValueError("Invalid mass kind `{}`.".format(mass_kind))
@staticmethod
def _cat2clump_mapping(cat_indxs, clump_indxs):
"""
Create a mapping from a catalogue array index to a clump array index.
Parameters Parameters
---------- ----------
x : 1-dimensional array cat_indxs : 1-dimensional array
The first vector. Clump indices in the catalogue array.
y : 1- or 2-dimensional array clump_indxs : 1-dimensional array
The second vector. Can be 2-dimensional of shape `(n_samples, 3)`, Clump indices in the clump array.
in which case the calculation is broadcasted.
Returns Returns
------- -------
out : float or 1-dimensional array mapping : 1-dimensional array
The cosine similarity. If y is 1-dimensinal returns only a float. Mapping. The array indices match catalogue array and values are
array positions in the clump array.
""" """
# Quick check of dimensions mapping = numpy.full(cat_indxs.size, numpy.nan, dtype=int)
if x.ndim != 1: __, ind1, ind2 = numpy.intersect1d(clump_indxs, cat_indxs,
raise ValueError("`x` must be a 1-dimensional array.") return_indices=True)
y = y.reshape(-1, 3) if y.ndim == 1 else y mapping[ind2] = ind1
return mapping
out = numpy.sum(x * y, axis=1)
out /= numpy.linalg.norm(x) * numpy.linalg.norm(y, axis=1)
if out.size == 1:
return out[0]
return out
def cross_knn_position_single(self, n_sim, nmult=5, dlogmass=None, def cross_knn_position_single(self, n_sim, nmult=5, dlogmass=None,
init_dist=False, verbose=True): mass_kind="totpartmass", init_dist=False,
overlap=False, verbose=True):
r""" r"""
Find all neighbours within :math:`n_{\rm mult} R_{200c}` of halos in Find all neighbours within :math:`n_{\rm mult} R_{200c}` of halos in
the `nsim`th simulation. Also enforces that the neighbours' the `nsim`th simulation. Also enforces that the neighbours'
:math:`\log M_{200c}` be within `dlogmass` dex. :math:`\log M / M_\odot` be within `dlogmass` dex.
Parameters Parameters
---------- ----------
@ -169,43 +172,71 @@ class RealisationsMatcher:
default 5. default 5.
dlogmass : float, optional dlogmass : float, optional
Tolerance on mass logarithmic mass difference. By default `None`. Tolerance on mass logarithmic mass difference. By default `None`.
mass_kind : str, optional
The mass kind whose similarity is to be checked. Must be a valid
catalogue key. By default `totpartmass`, i.e. the total particle
mass associated with a halo.
init_dist : bool, optional init_dist : bool, optional
Whether to calculate separation of the initial CMs. By default Whether to calculate separation of the initial CMs. By default
`False`. `False`.
overlap : bool, optional
Whether to calculate overlap between clumps in the initial
snapshot. By default `False`. Note that this operation is
substantially slower.
verbose : bool, optional verbose : bool, optional
Iterator verbosity flag. By default `True`. Iterator verbosity flag. By default `True`.
Returns Returns
------- -------
matches : composite array matches : composite array
Array, indices are `(n_sims - 1, 3, n_halos, n_matches)`. The Array, indices are `(n_sims - 1, 4, n_halos, n_matches)`. The
2nd axis is `index` of the neighbouring halo in its catalogue, 2nd axis is `index` of the neighbouring halo in its catalogue,
`dist`, which is the 3D distance to the halo whose neighbours are `dist` is the 3D distance to the halo whose neighbours are
searched, and `dist0` which is the separation of the initial CMs. searched, `dist0` is the separation of the initial CMs and
The latter is calculated only if `init_dist` is `True`. `overlap` is the overlap over the initial clumps, all respectively.
The latter two are calculated only if `init_dist` or `overlap` is
`True`.
TODO:
- [ ] Precalculate the mapping from halo index to clump array position
""" """
# Radius, M200c and positions of halos in `n_sim` IC realisation self._check_masskind(mass_kind)
logm200 = numpy.log10(self.cats[n_sim]["m200"]) # Radius, mass and positions of halos in `n_sim` IC realisation
logmass = numpy.log10(self.cats[n_sim][mass_kind])
R = self.cats[n_sim]["r200"] R = self.cats[n_sim]["r200"]
pos = self.cats[n_sim].positions pos = self.cats[n_sim].positions
if init_dist: if init_dist:
pos0 = self.cats[n_sim].positions0 # These are CM positions pos0 = self.cats[n_sim].positions0 # These are CM positions
if overlap:
if verbose:
print("Loading initial clump particles for `n_sim = {}`."
.format(n_sim))
# Grab a paths object. What it is set to is unimportant
paths = self.cats[0].paths
with open(paths.clump0_path(self.cats.n_sims[n_sim]), "rb") as f:
clumps0 = numpy.load(f, allow_pickle=True)
overlapper = ParticleOverlap()
cat2clumps0 = self._cat2clump_mapping(self.cats[n_sim]["index"],
clumps0["ID"])
matches = [None] * (self.cats.N - 1) matches = [None] * (self.cats.N - 1)
# Verbose iterator # Verbose iterator
if verbose: if verbose:
iters = enumerate(tqdm(self.search_sim_indices(n_sim))) iters = enumerate(tqdm(self.search_sim_indices(n_sim)))
else: else:
iters = enumerate(self.search_sim_indices(n_sim)) iters = enumerate(self.search_sim_indices(n_sim))
iters = enumerate(self.search_sim_indices(n_sim))
# Search for neighbours in the other simulations # Search for neighbours in the other simulations
for count, i in iters: for count, i in iters:
dist, indxs = self.cats[i].radius_neigbours(pos, R * nmult) dist, indxs = self.cats[i].radius_neigbours(pos, R * nmult)
# Get rid of neighbors whose mass is too off # Get rid of neighbors whose mass is too off
if dlogmass is not None: if dlogmass is not None:
for j, indx in enumerate(indxs): for j, indx in enumerate(indxs):
match_logm200 = numpy.log10(self.cats[i]["m200"][indx]) match_logmass = numpy.log10(self.cats[i][mass_kind][indx])
mask = numpy.abs(match_logm200 - logm200[j]) < dlogmass mask = numpy.abs(match_logmass - logmass[j]) < dlogmass
dist[j] = dist[j][mask] dist[j] = dist[j][mask]
indxs[j] = indx[mask] indxs[j] = indx[mask]
# Find distance to the between the initial CM # Find distance to the between the initial CM
dist0 = [numpy.asanyarray([], dtype=numpy.float64)] * dist.size dist0 = [numpy.asanyarray([], dtype=numpy.float64)] * dist.size
if init_dist: if init_dist:
@ -215,13 +246,47 @@ class RealisationsMatcher:
dist0[k] = numpy.linalg.norm( dist0[k] = numpy.linalg.norm(
pos0[k] - self.cats[i].positions0[indxs[k]], axis=1) pos0[k] - self.cats[i].positions0[indxs[k]], axis=1)
# Calculate the initial snapshot overlap
cross = [numpy.asanyarray([], dtype=numpy.float64)] * dist.size
if overlap:
if verbose:
print("Loading initial clump particles for `n_sim = {}` "
"to compare against `n_sim = {}`.".format(i, n_sim))
with open(paths.clump0_path(self.cats.n_sims[i]), 'rb') as f:
clumpsx = numpy.load(f, allow_pickle=True)
cat2clumpsx = self._cat2clump_mapping(self.cats[i]["index"],
clumpsx["ID"])
# Loop only over halos that have neighbours
with_neigbours = numpy.where([ii.size > 0 for ii in indxs])[0]
for k in tqdm(with_neigbours) if verbose else with_neigbours:
# Find which clump matches index of this halo from cat
match0 = cat2clumps0[k]
# Get the clump and pre-calculate its cell assignment
cl0 = clumps0["clump"][match0]
cl0_cells = overlapper.assign_to_cell(
*(cl0[p] for p in ('x', 'y', 'z')))
dint = numpy.full(indxs[k].size, numpy.nan, numpy.float64)
# Loop over the ones we cross-correlate with
for ii, ind in enumerate(indxs[k]):
# Again which cross clump to this index
matchx = cat2clumpsx[ind]
dint[ii] = overlapper.mass_overlap(
cl0, clumpsx["clump"][matchx], cl0_cells)
cross[k] = dint
# Append as a composite array # Append as a composite array
matches[count] = numpy.asarray([indxs, dist, dist0], dtype=object) matches[count] = numpy.asarray(
[indxs, dist, dist0, cross], dtype=object)
return numpy.asarray(matches, dtype=object) return numpy.asarray(matches, dtype=object)
def cross_knn_position_all(self, nmult=5, dlogmass=None, def cross_knn_position_all(self, nmult=5, dlogmass=None,
init_dist=False, verbose=True): mass_kind="totpartmass", init_dist=False,
overlap=False, verbose=True):
r""" r"""
Find all neighbours within :math:`n_{\rm mult} R_{200c}` of halos in Find all neighbours within :math:`n_{\rm mult} R_{200c}` of halos in
all simulations listed in `self.cats`. Also enforces that the all simulations listed in `self.cats`. Also enforces that the
@ -234,9 +299,17 @@ class RealisationsMatcher:
default 5. default 5.
dlogmass : float, optional dlogmass : float, optional
Tolerance on mass logarithmic mass difference. By default `None`. Tolerance on mass logarithmic mass difference. By default `None`.
mass_kind : str, optional
The mass kind whose similarity is to be checked. Must be a valid
catalogue key. By default `totpartmass`, i.e. the total particle
mass associated with a halo.
init_dist : bool, optional init_dist : bool, optional
Whether to calculate separation of the initial CMs. By default Whether to calculate separation of the initial CMs. By default
`False`. `False`.
overlap : bool, optional
Whether to calculate overlap between clumps in the initial
snapshot. By default `False`. Note that this operation is
substantially slower.
verbose : bool, optional verbose : bool, optional
Iterator verbosity flag. By default `True`. Iterator verbosity flag. By default `True`.
@ -251,5 +324,136 @@ class RealisationsMatcher:
# Loop over each catalogue # Loop over each catalogue
for i in trange(N) if verbose else range(N): for i in trange(N) if verbose else range(N):
matches[i] = self.cross_knn_position_single( matches[i] = self.cross_knn_position_single(
i, nmult, dlogmass, init_dist) i, nmult, dlogmass, mass_kind=mass_kind, init_dist=init_dist,
overlap=overlap, verbose=verbose)
return matches return matches
###############################################################################
# Matching statistics #
###############################################################################
def cosine_similarity(x, y):
r"""
Calculate the cosine similarity between two Cartesian vectors. Defined
as :math:`\Sum_{i} x_i y_{i} / (|x| * |y|)`.
Parameters
----------
x : 1-dimensional array
The first vector.
y : 1- or 2-dimensional array
The second vector. Can be 2-dimensional of shape `(n_samples, 3)`,
in which case the calculation is broadcasted.
Returns
-------
out : float or 1-dimensional array
The cosine similarity. If y is 1-dimensinal returns only a float.
"""
# Quick check of dimensions
if x.ndim != 1:
raise ValueError("`x` must be a 1-dimensional array.")
y = y.reshape(-1, 3) if y.ndim == 1 else y
out = numpy.sum(x * y, axis=1)
out /= numpy.linalg.norm(x) * numpy.linalg.norm(y, axis=1)
if out.size == 1:
return out[0]
return out
class ParticleOverlap:
"""
TODO:
- [ ] Class documentation
"""
_bins = None
def __init__(self, bins=None):
if bins is None:
dx = 1 / 2**11
bins = numpy.arange(0, 1 + dx, dx)
self.bins = bins
@property
def bins(self):
"""
The grid spacing. Assumed to be equal for all three dimensions. Units
ought to match the requested coordinates.
Returns
-------
bins : 1-dimensional array
"""
return self._bins
@bins.setter
def bins(self, bins):
"""Sets `bins`."""
bins = numpy.asarray(bins) if isinstance(bins, list) else bins
assert bins.ndim == 1, "`bins` must be a 1-dimensional array."
self._bins = bins
def assign_to_cell(self, x, y, z):
"""
Assign particles specified by coordinates `x`, `y`, and `z` to grid
cells.
Parameters
----------
x, y, z : 1-dimensional arrays
Positions of particles in the box.
Returns
-------
cells : 1-dimensional array
Cell ID of each particle.
"""
assert x.ndim == 1 and x.size == y.size == z.size
xbin = numpy.digitize(x, self.bins)
ybin = numpy.digitize(y, self.bins)
zbin = numpy.digitize(z, self.bins)
N = self.bins.size
return xbin + ybin * N + zbin * N**2
def mass_overlap(self, clump1, clump2, cells1=None):
r"""
Calculate the particle, mass-weighted overlap between two halos.
Defined as
..math::
(M_{u,1} + M_{u,2}) / (M_1 + M_2),
where :math:`M_{u, 1}` is the mass of particles of the first halo in
cells that are also present in the second halo and :math:`M_1` is the
total particle mass of the first halo.
Parameters
----------
clump1, clump2 : structured arrays
Structured arrays corresponding to the two clumps. Should contain
keys `x`, `y`, `z` and `M`.
cells1 : 1-dimensional array, optional
Optionlaly precomputed cells of `clump1`. Be careful when using
this to ensure it matches `clump1`.
Returns
-------
overlap : float
"""
# 1-dimensional cell ID of each particle in clump1 and clump2
if cells1 is None:
cells1 = self.assign_to_cell(*[clump1[p] for p in ('x', 'y', 'z')])
cells2 = self.assign_to_cell(*[clump2[p] for p in ('x', 'y', 'z')])
# Elementwise cells1 in cells2 and vice versa
m1 = numpy.isin(cells1, cells2)
m2 = numpy.isin(cells2, cells1)
# Summed shared mass and the total
interp = numpy.sum(clump1["M"][m1]) + numpy.sum(clump2["M"][m2])
mtot = numpy.sum(clump1["M"]) + numpy.sum(clump2["M"])
return interp / mtot

View file

@ -378,7 +378,8 @@ class CombinedHaloCatalogue:
def __init__(self, paths, min_m500=None, max_dist=None, verbose=True): def __init__(self, paths, min_m500=None, max_dist=None, verbose=True):
# Read simulations and their maximum snapshots # Read simulations and their maximum snapshots
# NOTE later change this back to all simulations # NOTE later change this back to all simulations
self._n_sims = [7468, 7588, 8020, 8452, 8836] # self._n_sims = [7468, 7588, 8020, 8452, 8836]
self._n_sims = [7468, 7588]
# self._n_sims = paths.ic_ids # self._n_sims = paths.ic_ids
n_snaps = [paths.get_maximum_snapshot(i) for i in self._n_sims] n_snaps = [paths.get_maximum_snapshot(i) for i in self._n_sims]
self._n_snaps = numpy.asanyarray(n_snaps) self._n_snaps = numpy.asanyarray(n_snaps)

View file

@ -72,7 +72,6 @@ class CSiBORGPaths:
_initmatch_path = None _initmatch_path = None
_to_new = None _to_new = None
# NOTE deuglify this stuff
def __init__(self, n_sim=None, n_snap=None, srcdir=None, dumpdir=None, def __init__(self, n_sim=None, n_snap=None, srcdir=None, dumpdir=None,
mmain_path=None, initmatch_path=None, to_new=False): mmain_path=None, initmatch_path=None, to_new=False):
if srcdir is None: if srcdir is None:
@ -425,6 +424,24 @@ class CSiBORGPaths:
n_sim = self.get_n_sim(n_sim) n_sim = self.get_n_sim(n_sim)
return min(self.get_snapshots(n_sim)) return min(self.get_snapshots(n_sim))
def clump0_path(self, nsim):
"""
Path to a single dumped clump's particles. This is expected to point
to a dictonary whose keys are the clump indices and items structured
arrays with the clump's particles in the initial snapshot.
Parameters
----------
nsim : int
Index of the initial conditions (IC) realisation.
Returns
-------
path : str
"""
cdir = join(self.dumpdir, "initmatch")
return join(cdir, "clump_{}_{}.npy".format(nsim, "particles"))
def snapshot_path(self, n_snap=None, n_sim=None): def snapshot_path(self, n_snap=None, n_sim=None):
""" """
Path to a CSiBORG IC realisation snapshot. Path to a CSiBORG IC realisation snapshot.
@ -839,7 +856,7 @@ def read_mmain(n, srcdir, fname="Mmain_{}.npy"):
return out return out
def read_initcm(n, srcdir, fname="clump_cm_{}.npy"): def read_initcm(n, srcdir, fname="clump_{}_cm.npy"):
""" """
Read `clump_cm`, i.e. the center of mass of a clump at redshift z = 70. Read `clump_cm`, i.e. the center of mass of a clump at redshift z = 70.
If the file does not exist returns `None`. If the file does not exist returns `None`.

View file

@ -24,8 +24,7 @@ import numpy
from datetime import datetime from datetime import datetime
from mpi4py import MPI from mpi4py import MPI
from distutils.util import strtobool from distutils.util import strtobool
from os.path import join, isdir from os.path import join
from os import mkdir
from os import remove from os import remove
from sys import stdout from sys import stdout
from gc import collect from gc import collect
@ -51,21 +50,15 @@ fin_paths = csiborgtools.read.CSiBORGPaths(to_new=False)
nsims = init_paths.ic_ids nsims = init_paths.ic_ids
# Output files # Output files
dumpdir = "/mnt/extraspace/rstiskalek/csiborg/initmatch" dumpdir = "/mnt/extraspace/rstiskalek/csiborg/"
ftemp = join(dumpdir, "temp", "temp_{}_{}.npy") ftemp = join(dumpdir, "temp_initmatch", "temp_{}_{}_{}.npy")
fperm = join(dumpdir, "clump_cm_{}.npy") fpermcm = join(dumpdir, "initmatch", "clump_{}_cm.npy")
fpermpart = join(dumpdir, "initmatch", "clump_{}_particles.npy")
for nsim in nsims: for nsim in nsims:
if rank == 0: if rank == 0:
print("{}: reading simulation {}.".format(datetime.now(), nsim)) print("{}: reading simulation {}.".format(datetime.now(), nsim))
stdout.flush() stdout.flush()
# Check that the output folder for this sim exists
clumpdumpdir = join(dumpdir, "out_{}".format(nsim))
if args.dump_clumps and rank == 0 and not isdir(clumpdumpdir):
mkdir(clumpdumpdir)
# Barrier to make sure we created the directory with the rank 0
comm.Barrier()
# Set the snapshot numbers # Set the snapshot numbers
init_paths.set_info(nsim, init_paths.get_minimum_snapshot(nsim)) init_paths.set_info(nsim, init_paths.get_minimum_snapshot(nsim))
@ -111,15 +104,16 @@ for nsim in nsims:
cm = numpy.asanyarray( cm = numpy.asanyarray(
[numpy.average(x0[p], weights=x0["M"]) for p in ('x', 'y', 'z')]) [numpy.average(x0[p], weights=x0["M"]) for p in ('x', 'y', 'z')])
# Dump the center of mass # Dump the center of mass
with open(ftemp.format(nsim, n), 'wb') as f: with open(ftemp.format(nsim, n, "cm"), 'wb') as f:
numpy.save(f, cm) numpy.save(f, cm)
# Optionally dump the entire clump # Optionally dump the entire clump
if args.dump_clumps: if args.dump_clumps:
fout = join(clumpdumpdir, "clump_{}.npy".format(n)) with open(ftemp.format(nsim, n, "clump"), "wb") as f:
stdout.flush()
with open(fout, "wb") as f:
numpy.save(f, x0) numpy.save(f, x0)
del part0, clump_ids
collect()
comm.Barrier() comm.Barrier()
if rank == 0: if rank == 0:
print("Collecting CM files...") print("Collecting CM files...")
@ -130,14 +124,33 @@ for nsim in nsims:
out = numpy.full(njobs, numpy.nan, dtype=dtype) out = numpy.full(njobs, numpy.nan, dtype=dtype)
for i, n in enumerate(unique_clumpids): for i, n in enumerate(unique_clumpids):
with open(ftemp.format(nsim, n), 'rb') as f: fpath = ftemp.format(nsim, n, "cm")
with open(fpath, 'rb') as f:
fin = numpy.load(f) fin = numpy.load(f)
out['x'][i] = fin[0] out['x'][i] = fin[0]
out['y'][i] = fin[1] out['y'][i] = fin[1]
out['z'][i] = fin[2] out['z'][i] = fin[2]
out["ID"][i] = n out["ID"][i] = n
remove(ftemp.format(nsim, n)) remove(fpath)
print("Dumping CM files to .. `{}`.".format(fpermcm.format(nsim)))
print("Dumping CM files to .. `{}`.".format(fperm.format(nsim))) with open(fpermcm.format(nsim), 'wb') as f:
with open(fperm.format(nsim), 'wb') as f:
numpy.save(f, out) numpy.save(f, out)
print("Collecting clump files...")
stdout.flush()
out = [None] * unique_clumpids.size
dtype = {"names": ["clump", "ID"], "formats": [object, numpy.int32]}
out = numpy.full(unique_clumpids.size, numpy.nan, dtype=dtype)
for i, n in enumerate(unique_clumpids):
fpath = ftemp.format(nsim, n, "clump")
with open(fpath, 'rb') as f:
fin = numpy.load(f)
out["clump"][i] = fin
out["ID"][i] = n
remove(fpath)
print("Dumping clump files to .. `{}`.".format(fpermpart.format(nsim)))
with open(fpermpart.format(nsim), "wb") as f:
numpy.save(f, out)
del out
collect()