Overlap calculation (#18)

* Move cosine similarity out

* Basic overlap calculation

* add overlap import

* Add clump0 dict path

* Update README

* fix path bug

* Save as dict instead

* Change format to array of arrays

* Update paths

* Take fewer ICs for now

* Change to structured array

* Add overlap calculation

* Start saving IDs

* Add a blank space

* Update TODO
This commit is contained in:
Richard Stiskalek 2022-12-19 11:58:22 +01:00 committed by GitHub
parent 13a9d11afe
commit 18f09767f4
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 297 additions and 65 deletions

View file

@ -7,14 +7,11 @@
## Short-term TODO
- [x] Add code to calculate the cross-correlation for resolved region only.
- [ ] Calculate the spectra for all 101 boxes and visualise them.
- [ ] See about the $z=70$ particles.
- [ ] Implement the CIC binning.
- [ ] Write a script to perform the matching on a node.
## Long-term TODO
- [ ] Calculate the cross-correlation in CSiBORG. Should see the scale of the constraints?
- [ ] Find the distribution of particles in the first snapshot
- [ ] Implement a custom model for matchin galaxies to halos.

View file

@ -13,6 +13,6 @@
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
from .match import (brute_spatial_separation, RealisationsMatcher) # noqa
from .match import (brute_spatial_separation, RealisationsMatcher, cosine_similarity, ParticleOverlap) # noqa
from .num_density import (binned_counts, number_density) # noqa
# from .correlation import (get_randoms_sphere, sphere_angular_tpcf) # noqa

View file

@ -79,6 +79,9 @@ class RealisationsMatcher:
----------
cats : :py:class`csiborgtools.read.CombinedHaloCatalogue`
Combined halo catalogue to search.
# NOTE add later
# dtype : dtype, optional
# Output precision. By default `numpy.float32`.
"""
_cats = None
@ -122,42 +125,42 @@ class RealisationsMatcher:
"""
return [i for i in range(self.cats.N) if i != n_sim]
def cosine_similarity(self, x, y):
r"""
Calculate the cosine similarity between two Cartesian vectors. Defined
as :math:`\Sum_{i} x_i y_{i} / (|x| * |y|)`.
def _check_masskind(self, mass_kind):
"""Check that `mass_kind` is a valid key."""
if mass_kind not in self.cats[0].keys:
raise ValueError("Invalid mass kind `{}`.".format(mass_kind))
@staticmethod
def _cat2clump_mapping(cat_indxs, clump_indxs):
"""
Create a mapping from a catalogue array index to a clump array index.
Parameters
----------
x : 1-dimensional array
The first vector.
y : 1- or 2-dimensional array
The second vector. Can be 2-dimensional of shape `(n_samples, 3)`,
in which case the calculation is broadcasted.
cat_indxs : 1-dimensional array
Clump indices in the catalogue array.
clump_indxs : 1-dimensional array
Clump indices in the clump array.
Returns
-------
out : float or 1-dimensional array
The cosine similarity. If y is 1-dimensinal returns only a float.
mapping : 1-dimensional array
Mapping. The array indices match catalogue array and values are
array positions in the clump array.
"""
# Quick check of dimensions
if x.ndim != 1:
raise ValueError("`x` must be a 1-dimensional array.")
y = y.reshape(-1, 3) if y.ndim == 1 else y
out = numpy.sum(x * y, axis=1)
out /= numpy.linalg.norm(x) * numpy.linalg.norm(y, axis=1)
if out.size == 1:
return out[0]
return out
mapping = numpy.full(cat_indxs.size, numpy.nan, dtype=int)
__, ind1, ind2 = numpy.intersect1d(clump_indxs, cat_indxs,
return_indices=True)
mapping[ind2] = ind1
return mapping
def cross_knn_position_single(self, n_sim, nmult=5, dlogmass=None,
init_dist=False, verbose=True):
mass_kind="totpartmass", init_dist=False,
overlap=False, verbose=True):
r"""
Find all neighbours within :math:`n_{\rm mult} R_{200c}` of halos in
the `nsim`th simulation. Also enforces that the neighbours'
:math:`\log M_{200c}` be within `dlogmass` dex.
:math:`\log M / M_\odot` be within `dlogmass` dex.
Parameters
----------
@ -169,43 +172,71 @@ class RealisationsMatcher:
default 5.
dlogmass : float, optional
Tolerance on mass logarithmic mass difference. By default `None`.
mass_kind : str, optional
The mass kind whose similarity is to be checked. Must be a valid
catalogue key. By default `totpartmass`, i.e. the total particle
mass associated with a halo.
init_dist : bool, optional
Whether to calculate separation of the initial CMs. By default
`False`.
overlap : bool, optional
Whether to calculate overlap between clumps in the initial
snapshot. By default `False`. Note that this operation is
substantially slower.
verbose : bool, optional
Iterator verbosity flag. By default `True`.
Returns
-------
matches : composite array
Array, indices are `(n_sims - 1, 3, n_halos, n_matches)`. The
Array, indices are `(n_sims - 1, 4, n_halos, n_matches)`. The
2nd axis is `index` of the neighbouring halo in its catalogue,
`dist`, which is the 3D distance to the halo whose neighbours are
searched, and `dist0` which is the separation of the initial CMs.
The latter is calculated only if `init_dist` is `True`.
`dist` is the 3D distance to the halo whose neighbours are
searched, `dist0` is the separation of the initial CMs and
`overlap` is the overlap over the initial clumps, all respectively.
The latter two are calculated only if `init_dist` or `overlap` is
`True`.
TODO:
- [ ] Precalculate the mapping from halo index to clump array position
"""
# Radius, M200c and positions of halos in `n_sim` IC realisation
logm200 = numpy.log10(self.cats[n_sim]["m200"])
self._check_masskind(mass_kind)
# Radius, mass and positions of halos in `n_sim` IC realisation
logmass = numpy.log10(self.cats[n_sim][mass_kind])
R = self.cats[n_sim]["r200"]
pos = self.cats[n_sim].positions
if init_dist:
pos0 = self.cats[n_sim].positions0 # These are CM positions
if overlap:
if verbose:
print("Loading initial clump particles for `n_sim = {}`."
.format(n_sim))
# Grab a paths object. What it is set to is unimportant
paths = self.cats[0].paths
with open(paths.clump0_path(self.cats.n_sims[n_sim]), "rb") as f:
clumps0 = numpy.load(f, allow_pickle=True)
overlapper = ParticleOverlap()
cat2clumps0 = self._cat2clump_mapping(self.cats[n_sim]["index"],
clumps0["ID"])
matches = [None] * (self.cats.N - 1)
# Verbose iterator
if verbose:
iters = enumerate(tqdm(self.search_sim_indices(n_sim)))
else:
iters = enumerate(self.search_sim_indices(n_sim))
iters = enumerate(self.search_sim_indices(n_sim))
# Search for neighbours in the other simulations
for count, i in iters:
dist, indxs = self.cats[i].radius_neigbours(pos, R * nmult)
# Get rid of neighbors whose mass is too off
if dlogmass is not None:
for j, indx in enumerate(indxs):
match_logm200 = numpy.log10(self.cats[i]["m200"][indx])
mask = numpy.abs(match_logm200 - logm200[j]) < dlogmass
match_logmass = numpy.log10(self.cats[i][mass_kind][indx])
mask = numpy.abs(match_logmass - logmass[j]) < dlogmass
dist[j] = dist[j][mask]
indxs[j] = indx[mask]
# Find distance to the between the initial CM
dist0 = [numpy.asanyarray([], dtype=numpy.float64)] * dist.size
if init_dist:
@ -215,13 +246,47 @@ class RealisationsMatcher:
dist0[k] = numpy.linalg.norm(
pos0[k] - self.cats[i].positions0[indxs[k]], axis=1)
# Calculate the initial snapshot overlap
cross = [numpy.asanyarray([], dtype=numpy.float64)] * dist.size
if overlap:
if verbose:
print("Loading initial clump particles for `n_sim = {}` "
"to compare against `n_sim = {}`.".format(i, n_sim))
with open(paths.clump0_path(self.cats.n_sims[i]), 'rb') as f:
clumpsx = numpy.load(f, allow_pickle=True)
cat2clumpsx = self._cat2clump_mapping(self.cats[i]["index"],
clumpsx["ID"])
# Loop only over halos that have neighbours
with_neigbours = numpy.where([ii.size > 0 for ii in indxs])[0]
for k in tqdm(with_neigbours) if verbose else with_neigbours:
# Find which clump matches index of this halo from cat
match0 = cat2clumps0[k]
# Get the clump and pre-calculate its cell assignment
cl0 = clumps0["clump"][match0]
cl0_cells = overlapper.assign_to_cell(
*(cl0[p] for p in ('x', 'y', 'z')))
dint = numpy.full(indxs[k].size, numpy.nan, numpy.float64)
# Loop over the ones we cross-correlate with
for ii, ind in enumerate(indxs[k]):
# Again which cross clump to this index
matchx = cat2clumpsx[ind]
dint[ii] = overlapper.mass_overlap(
cl0, clumpsx["clump"][matchx], cl0_cells)
cross[k] = dint
# Append as a composite array
matches[count] = numpy.asarray([indxs, dist, dist0], dtype=object)
matches[count] = numpy.asarray(
[indxs, dist, dist0, cross], dtype=object)
return numpy.asarray(matches, dtype=object)
def cross_knn_position_all(self, nmult=5, dlogmass=None,
init_dist=False, verbose=True):
mass_kind="totpartmass", init_dist=False,
overlap=False, verbose=True):
r"""
Find all neighbours within :math:`n_{\rm mult} R_{200c}` of halos in
all simulations listed in `self.cats`. Also enforces that the
@ -234,9 +299,17 @@ class RealisationsMatcher:
default 5.
dlogmass : float, optional
Tolerance on mass logarithmic mass difference. By default `None`.
mass_kind : str, optional
The mass kind whose similarity is to be checked. Must be a valid
catalogue key. By default `totpartmass`, i.e. the total particle
mass associated with a halo.
init_dist : bool, optional
Whether to calculate separation of the initial CMs. By default
`False`.
overlap : bool, optional
Whether to calculate overlap between clumps in the initial
snapshot. By default `False`. Note that this operation is
substantially slower.
verbose : bool, optional
Iterator verbosity flag. By default `True`.
@ -251,5 +324,136 @@ class RealisationsMatcher:
# Loop over each catalogue
for i in trange(N) if verbose else range(N):
matches[i] = self.cross_knn_position_single(
i, nmult, dlogmass, init_dist)
i, nmult, dlogmass, mass_kind=mass_kind, init_dist=init_dist,
overlap=overlap, verbose=verbose)
return matches
###############################################################################
# Matching statistics #
###############################################################################
def cosine_similarity(x, y):
r"""
Calculate the cosine similarity between two Cartesian vectors. Defined
as :math:`\Sum_{i} x_i y_{i} / (|x| * |y|)`.
Parameters
----------
x : 1-dimensional array
The first vector.
y : 1- or 2-dimensional array
The second vector. Can be 2-dimensional of shape `(n_samples, 3)`,
in which case the calculation is broadcasted.
Returns
-------
out : float or 1-dimensional array
The cosine similarity. If y is 1-dimensinal returns only a float.
"""
# Quick check of dimensions
if x.ndim != 1:
raise ValueError("`x` must be a 1-dimensional array.")
y = y.reshape(-1, 3) if y.ndim == 1 else y
out = numpy.sum(x * y, axis=1)
out /= numpy.linalg.norm(x) * numpy.linalg.norm(y, axis=1)
if out.size == 1:
return out[0]
return out
class ParticleOverlap:
"""
TODO:
- [ ] Class documentation
"""
_bins = None
def __init__(self, bins=None):
if bins is None:
dx = 1 / 2**11
bins = numpy.arange(0, 1 + dx, dx)
self.bins = bins
@property
def bins(self):
"""
The grid spacing. Assumed to be equal for all three dimensions. Units
ought to match the requested coordinates.
Returns
-------
bins : 1-dimensional array
"""
return self._bins
@bins.setter
def bins(self, bins):
"""Sets `bins`."""
bins = numpy.asarray(bins) if isinstance(bins, list) else bins
assert bins.ndim == 1, "`bins` must be a 1-dimensional array."
self._bins = bins
def assign_to_cell(self, x, y, z):
"""
Assign particles specified by coordinates `x`, `y`, and `z` to grid
cells.
Parameters
----------
x, y, z : 1-dimensional arrays
Positions of particles in the box.
Returns
-------
cells : 1-dimensional array
Cell ID of each particle.
"""
assert x.ndim == 1 and x.size == y.size == z.size
xbin = numpy.digitize(x, self.bins)
ybin = numpy.digitize(y, self.bins)
zbin = numpy.digitize(z, self.bins)
N = self.bins.size
return xbin + ybin * N + zbin * N**2
def mass_overlap(self, clump1, clump2, cells1=None):
r"""
Calculate the particle, mass-weighted overlap between two halos.
Defined as
..math::
(M_{u,1} + M_{u,2}) / (M_1 + M_2),
where :math:`M_{u, 1}` is the mass of particles of the first halo in
cells that are also present in the second halo and :math:`M_1` is the
total particle mass of the first halo.
Parameters
----------
clump1, clump2 : structured arrays
Structured arrays corresponding to the two clumps. Should contain
keys `x`, `y`, `z` and `M`.
cells1 : 1-dimensional array, optional
Optionlaly precomputed cells of `clump1`. Be careful when using
this to ensure it matches `clump1`.
Returns
-------
overlap : float
"""
# 1-dimensional cell ID of each particle in clump1 and clump2
if cells1 is None:
cells1 = self.assign_to_cell(*[clump1[p] for p in ('x', 'y', 'z')])
cells2 = self.assign_to_cell(*[clump2[p] for p in ('x', 'y', 'z')])
# Elementwise cells1 in cells2 and vice versa
m1 = numpy.isin(cells1, cells2)
m2 = numpy.isin(cells2, cells1)
# Summed shared mass and the total
interp = numpy.sum(clump1["M"][m1]) + numpy.sum(clump2["M"][m2])
mtot = numpy.sum(clump1["M"]) + numpy.sum(clump2["M"])
return interp / mtot

View file

@ -378,7 +378,8 @@ class CombinedHaloCatalogue:
def __init__(self, paths, min_m500=None, max_dist=None, verbose=True):
# Read simulations and their maximum snapshots
# NOTE later change this back to all simulations
self._n_sims = [7468, 7588, 8020, 8452, 8836]
# self._n_sims = [7468, 7588, 8020, 8452, 8836]
self._n_sims = [7468, 7588]
# self._n_sims = paths.ic_ids
n_snaps = [paths.get_maximum_snapshot(i) for i in self._n_sims]
self._n_snaps = numpy.asanyarray(n_snaps)

View file

@ -72,7 +72,6 @@ class CSiBORGPaths:
_initmatch_path = None
_to_new = None
# NOTE deuglify this stuff
def __init__(self, n_sim=None, n_snap=None, srcdir=None, dumpdir=None,
mmain_path=None, initmatch_path=None, to_new=False):
if srcdir is None:
@ -425,6 +424,24 @@ class CSiBORGPaths:
n_sim = self.get_n_sim(n_sim)
return min(self.get_snapshots(n_sim))
def clump0_path(self, nsim):
"""
Path to a single dumped clump's particles. This is expected to point
to a dictonary whose keys are the clump indices and items structured
arrays with the clump's particles in the initial snapshot.
Parameters
----------
nsim : int
Index of the initial conditions (IC) realisation.
Returns
-------
path : str
"""
cdir = join(self.dumpdir, "initmatch")
return join(cdir, "clump_{}_{}.npy".format(nsim, "particles"))
def snapshot_path(self, n_snap=None, n_sim=None):
"""
Path to a CSiBORG IC realisation snapshot.
@ -839,7 +856,7 @@ def read_mmain(n, srcdir, fname="Mmain_{}.npy"):
return out
def read_initcm(n, srcdir, fname="clump_cm_{}.npy"):
def read_initcm(n, srcdir, fname="clump_{}_cm.npy"):
"""
Read `clump_cm`, i.e. the center of mass of a clump at redshift z = 70.
If the file does not exist returns `None`.

View file

@ -24,8 +24,7 @@ import numpy
from datetime import datetime
from mpi4py import MPI
from distutils.util import strtobool
from os.path import join, isdir
from os import mkdir
from os.path import join
from os import remove
from sys import stdout
from gc import collect
@ -51,21 +50,15 @@ fin_paths = csiborgtools.read.CSiBORGPaths(to_new=False)
nsims = init_paths.ic_ids
# Output files
dumpdir = "/mnt/extraspace/rstiskalek/csiborg/initmatch"
ftemp = join(dumpdir, "temp", "temp_{}_{}.npy")
fperm = join(dumpdir, "clump_cm_{}.npy")
dumpdir = "/mnt/extraspace/rstiskalek/csiborg/"
ftemp = join(dumpdir, "temp_initmatch", "temp_{}_{}_{}.npy")
fpermcm = join(dumpdir, "initmatch", "clump_{}_cm.npy")
fpermpart = join(dumpdir, "initmatch", "clump_{}_particles.npy")
for nsim in nsims:
if rank == 0:
print("{}: reading simulation {}.".format(datetime.now(), nsim))
stdout.flush()
# Check that the output folder for this sim exists
clumpdumpdir = join(dumpdir, "out_{}".format(nsim))
if args.dump_clumps and rank == 0 and not isdir(clumpdumpdir):
mkdir(clumpdumpdir)
# Barrier to make sure we created the directory with the rank 0
comm.Barrier()
# Set the snapshot numbers
init_paths.set_info(nsim, init_paths.get_minimum_snapshot(nsim))
@ -111,15 +104,16 @@ for nsim in nsims:
cm = numpy.asanyarray(
[numpy.average(x0[p], weights=x0["M"]) for p in ('x', 'y', 'z')])
# Dump the center of mass
with open(ftemp.format(nsim, n), 'wb') as f:
with open(ftemp.format(nsim, n, "cm"), 'wb') as f:
numpy.save(f, cm)
# Optionally dump the entire clump
if args.dump_clumps:
fout = join(clumpdumpdir, "clump_{}.npy".format(n))
stdout.flush()
with open(fout, "wb") as f:
with open(ftemp.format(nsim, n, "clump"), "wb") as f:
numpy.save(f, x0)
del part0, clump_ids
collect()
comm.Barrier()
if rank == 0:
print("Collecting CM files...")
@ -130,14 +124,33 @@ for nsim in nsims:
out = numpy.full(njobs, numpy.nan, dtype=dtype)
for i, n in enumerate(unique_clumpids):
with open(ftemp.format(nsim, n), 'rb') as f:
fpath = ftemp.format(nsim, n, "cm")
with open(fpath, 'rb') as f:
fin = numpy.load(f)
out['x'][i] = fin[0]
out['y'][i] = fin[1]
out['z'][i] = fin[2]
out["ID"][i] = n
remove(ftemp.format(nsim, n))
print("Dumping CM files to .. `{}`.".format(fperm.format(nsim)))
with open(fperm.format(nsim), 'wb') as f:
remove(fpath)
print("Dumping CM files to .. `{}`.".format(fpermcm.format(nsim)))
with open(fpermcm.format(nsim), 'wb') as f:
numpy.save(f, out)
print("Collecting clump files...")
stdout.flush()
out = [None] * unique_clumpids.size
dtype = {"names": ["clump", "ID"], "formats": [object, numpy.int32]}
out = numpy.full(unique_clumpids.size, numpy.nan, dtype=dtype)
for i, n in enumerate(unique_clumpids):
fpath = ftemp.format(nsim, n, "clump")
with open(fpath, 'rb') as f:
fin = numpy.load(f)
out["clump"][i] = fin
out["ID"][i] = n
remove(fpath)
print("Dumping clump files to .. `{}`.".format(fpermpart.format(nsim)))
with open(fpermpart.format(nsim), "wb") as f:
numpy.save(f, out)
del out
collect()