Overlapper improvements (#53)

* Store indices as f32

* Fix init sorting

* Organise imports

* Rename pathing

* Add particle loading

* Improve particle reading

* Add h5py reader

* Edit particle path

* Update particles loading

* Update particles loading

* Fix particle dumping

* Add init fitting

* Fix bug due to insufficient precision

* Add comment

* Add comment

* Add clumps catalogue to halo cat

* Add comment

* Make sure PIDs are never forced to float32

* Fix PID reading

* Fix PID reading

* Update matching to work with new arrays

* Stop using cubical sub-boxes, turn off nshift if no smoothing

* Improve caching

* Move function definitions

* Simplify calculation

* Add import

* Small updates to the halo

* Simplify calculation

* Simplify looping calculation

* Fix tonew

* Add initial data

* Add skip condition

* Add unit conversion

* Add loading background in batches

* Rename mmain index

* Switch overlaps to h5

* Add finite lagpatch check

* Fix column name

* Add verbosity flags

* Save halo IDs instead

* Switch back to npz

* Delete nbs

* Reduce size of the box

* Load correct background of halos being matched

* Remove verbosity

* Verbosity edits

* Change lower thresholds
Richard Stiskalek 2023-05-06 16:52:48 +01:00 committed by GitHub
parent 1c9dacfde5
commit 56e39a8b1d
20 changed files with 864 additions and 3816 deletions

@@ -18,9 +18,7 @@ realisation must have been split in advance by `runsplit_halos`.
"""
from argparse import ArgumentParser
from datetime import datetime
from os.path import join
import h5py
import numpy
from mpi4py import MPI
from tqdm import tqdm
@@ -33,20 +31,26 @@ except ModuleNotFoundError:
sys.path.append("../")
import csiborgtools
parser = ArgumentParser()
parser.add_argument("--kind", type=str, choices=["halos", "clumps"])
args = parser.parse_args()
# Get MPI things
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
nproc = comm.Get_size()
verbose = nproc == 1
parser = ArgumentParser()
parser.add_argument("--kind", type=str, choices=["halos", "clumps"])
parser.add_argument("--ics", type=int, nargs="+", default=None,
help="IC realisations. If `-1` processes all simulations.")
args = parser.parse_args()
paths = csiborgtools.read.CSiBORGPaths(**csiborgtools.paths_glamdring)
partreader = csiborgtools.read.ParticleReader(paths)
nfwpost = csiborgtools.fits.NFWPosterior()
ftemp = join(paths.temp_dumpdir, "fit_clump_{}_{}_{}.npy")
if args.ics is None or args.ics[0] == -1:
ics = paths.get_ics(tonew=False)
else:
ics = args.ics
cols_collect = [
("index", numpy.int32),
("npart", numpy.int32),
@@ -63,7 +67,7 @@ cols_collect = [
("lambda200c", numpy.float32),
("r200m", numpy.float32),
("m200m", numpy.float32),
]
]
def fit_clump(particles, clump_info, box):
@@ -95,46 +99,19 @@ def fit_clump(particles, clump_info, box):
return out
def load_clump_particles(clumpid, particles, clump_map):
"""
Load a clump's particles. If it is not there, i.e. the clump has no associated
particles, return `None`.
"""
try:
return particles[clump_map[clumpid], :]
except KeyError:
return None
def load_parent_particles(clumpid, particles, clump_map, clumps_cat):
"""
Load a parent halo's particles.
"""
indxs = clumps_cat["index"][clumps_cat["parent"] == clumpid]
# We first load the particles of each clump belonging to this parent
# and then concatenate them for further analysis.
clumps = []
for ind in indxs:
parts = load_clump_particles(ind, particles, clump_map)
if parts is not None:
clumps.append(parts)
if len(clumps) == 0:
return None
return numpy.concatenate(clumps)
# We now start looping over all simulations
for i, nsim in enumerate(paths.get_ics(tonew=False)):
if rank == 0:
print(f"{datetime.now()}: calculating {i}th simulation `{nsim}`.",
flush=True)
# We MPI loop over all simulations.
jobs = csiborgtools.fits.split_jobs(len(ics), nproc)[rank]
for nsim in [ics[i] for i in jobs]:
print(f"{datetime.now()}: rank {rank} calculating simulation `{nsim}`.",
flush=True)
nsnap = max(paths.get_snapshots(nsim))
box = csiborgtools.read.BoxUnits(nsnap, nsim, paths)
# Particle archive
particles = h5py.File(paths.particle_h5py_path(nsim), 'r')["particles"]
clump_map = h5py.File(paths.particle_h5py_path(nsim, "clumpmap"), 'r')
f = csiborgtools.read.read_h5(paths.particles_path(nsim))
particles = f["particles"]
clump_map = f["clumpmap"]
clid2map = {clid: i for i, clid in enumerate(clump_map[:, 0])}
clumps_cat = csiborgtools.read.ClumpsCatalogue(nsim, paths, rawdata=True,
load_fitted=False)
# We check whether we fit halos or clumps, will be indexing over different
@@ -143,66 +120,39 @@ for i, nsim in enumerate(paths.get_ics(tonew=False)):
ismain = clumps_cat.ismain
else:
ismain = numpy.ones(len(clumps_cat), dtype=bool)
ntasks = len(clumps_cat)
# We split the clumps among the processes. Each CPU calculates a fraction
# of them and dumps the results in a structured array. Even if we are
# calculating parent halo this index runs over all clumps.
jobs = csiborgtools.fits.split_jobs(ntasks, nproc)[rank]
out = csiborgtools.read.cols_to_structured(len(jobs), cols_collect)
for i, j in enumerate(tqdm(jobs)) if nproc == 1 else enumerate(jobs):
clumpid = clumps_cat["index"][j]
out["index"][i] = clumpid
# Even if we are calculating parent halo this index runs over all clumps.
out = csiborgtools.read.cols_to_structured(len(clumps_cat), cols_collect)
indxs = clumps_cat["index"]
for i, clid in enumerate(tqdm(indxs)) if verbose else enumerate(indxs):
clid = clumps_cat["index"][i]
out["index"][i] = clid
# If we are fitting halos and this clump is not a main, then continue.
if args.kind == "halos" and not ismain[j]:
if args.kind == "halos" and not ismain[i]:
continue
if args.kind == "halos":
part = load_parent_particles(clumpid, particles, clump_map,
clumps_cat)
part = csiborgtools.read.load_parent_particles(
clid, particles, clump_map, clid2map, clumps_cat)
else:
part = load_clump_particles(clumpid, particles, clump_map)
part = csiborgtools.read.load_clump_particles(clid, particles,
clump_map, clid2map)
# We fit the particles if there are any. If not we assign the index,
# otherwise it would be NaN converted to integers (-2147483648) and
# yield an error further down.
if part is not None:
_out = fit_clump(part, clumps_cat[j], box)
for key in _out.keys():
out[key][i] = _out[key]
if part is None:
continue
fout = ftemp.format(str(nsim).zfill(5), str(nsnap).zfill(5), rank)
if nproc == 0:
print(f"{datetime.now()}: rank {rank} saving to `{fout}`.", flush=True)
_out = fit_clump(part, clumps_cat[i], box)
for key in _out.keys():
out[key][i] = _out[key]
# Finally, we save the results. If we were analysing main halos, then
# remove array indices that do not correspond to parent halos.
if args.kind == "halos":
out = out[ismain]
fout = paths.structfit_path(nsnap, nsim, args.kind)
print(f"Saving to `{fout}`.", flush=True)
numpy.save(fout, out)
# We saved this CPU's results in a temporary file. Wait now for the other
# CPUs and then collect results from the 0th rank and save them.
comm.Barrier()
if rank == 0:
print(f"{datetime.now()}: collecting results for simulation `{nsim}`.",
flush=True)
# We write to the output array. Load data from each CPU and append to
# the output array.
out = csiborgtools.read.cols_to_structured(ntasks, cols_collect)
clumpid2outpos = {indx: i
for i, indx in enumerate(clumps_cat["index"])}
for i in range(nproc):
inp = numpy.load(ftemp.format(str(nsim).zfill(5),
str(nsnap).zfill(5), i))
for j, clumpid in enumerate(inp["index"]):
k = clumpid2outpos[clumpid]
for key in inp.dtype.names:
out[key][k] = inp[key][j]
# If we were analysing main halos, then remove array indices that do
# not correspond to parent halos.
if args.kind == "halos":
out = out[ismain]
fout = paths.structfit_path(nsnap, nsim, args.kind)
print(f"Saving to `{fout}`.", flush=True)
numpy.save(fout, out)
# We now wait before moving on to another simulation.
comm.Barrier()

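A central refactor in this PR replaces the per-script `load_clump_particles` / `load_parent_particles` helpers with shared versions in `csiborgtools.read` that take the new array-based clump map. A minimal sketch of how such helpers can work, assuming `clump_map` is the `(N, 3)` integer array of `(clump ID, first index, last index)` rows written by the dumping script and `clid2map` maps a clump ID to its row (a sketch, not the package's actual implementation):

import numpy

def load_clump_particles(clid, particles, clump_map, clid2map):
    """Slice out one clump's particles; `None` if the clump has none."""
    try:
        row = clid2map[clid]
    except KeyError:
        return None
    k0, kf = clump_map[row, 1], clump_map[row, 2]
    return particles[k0:kf + 1, :]

def load_parent_particles(clid, particles, clump_map, clid2map, clumps_cat):
    """Concatenate particles of all clumps whose ultimate parent is `clid`."""
    indxs = clumps_cat["index"][clumps_cat["parent"] == clid]
    clumps = [load_clump_particles(ind, particles, clump_map, clid2map)
              for ind in indxs]
    clumps = [c for c in clumps if c is not None]
    return numpy.concatenate(clumps) if clumps else None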
scripts/fit_init.py (new file, 104 lines)

@@ -0,0 +1,104 @@
# Copyright (C) 2022 Richard Stiskalek
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""
Script to calculate the particle centre of mass and Lagrangian patch size in
the initial snapshot. The initial snapshot particles are read from the sorted
files.
"""
from argparse import ArgumentParser
from datetime import datetime
import numpy
from mpi4py import MPI
from tqdm import tqdm
try:
import csiborgtools
except ModuleNotFoundError:
import sys
sys.path.append("../")
import csiborgtools
# Get MPI things
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
nproc = comm.Get_size()
verbose = nproc == 1
# Argument parser
parser = ArgumentParser()
parser.add_argument("--ics", type=int, nargs="+", default=None,
help="IC realisations. If `-1` processes all simulations.")
args = parser.parse_args()
paths = csiborgtools.read.CSiBORGPaths(**csiborgtools.paths_glamdring)
partreader = csiborgtools.read.ParticleReader(paths)
if args.ics is None or args.ics[0] == -1:
ics = paths.get_ics(tonew=True)
else:
ics = args.ics
cols_collect = [("index", numpy.int32),
("x", numpy.float32),
("y", numpy.float32),
("z", numpy.float32),
("lagpatch", numpy.float32),]
# MPI loop over simulations
jobs = csiborgtools.fits.split_jobs(len(ics), nproc)[rank]
for nsim in [ics[i] for i in jobs]:
nsnap = max(paths.get_snapshots(nsim))
print(f"{datetime.now()}: rank {rank} calculating simulation `{nsim}`.",
flush=True)
parts = csiborgtools.read.read_h5(paths.initmatch_path(nsim, "particles"))
parts = parts['particles']
clump_map = csiborgtools.read.read_h5(paths.particles_path(nsim))
clump_map = clump_map["clumpmap"]
clumps_cat = csiborgtools.read.ClumpsCatalogue(nsim, paths, rawdata=True,
load_fitted=False)
clid2map = {clid: i for i, clid in enumerate(clump_map[:, 0])}
ismain = clumps_cat.ismain
out = csiborgtools.read.cols_to_structured(len(clumps_cat), cols_collect)
indxs = clumps_cat["index"]
for i, hid in enumerate(tqdm(indxs) if verbose else indxs):
out["index"][i] = hid
if not ismain[i]:
continue
part = csiborgtools.read.load_parent_particles(hid, parts, clump_map,
clid2map, clumps_cat)
# Skip if the halo is too small.
if part is None or part.size < 100:
continue
dist, cm = csiborgtools.fits.dist_centmass(part)
# We enforce a maximum patchsize of 0.075 in box coordinates.
patchsize = min(numpy.percentile(dist, 99), 0.075)
out["x"][i], out["y"][i], out["z"][i] = cm
out["lagpatch"][i] = patchsize
out = out[ismain]
# Now save it
fout = paths.initmatch_path(nsim, "fit")
print(f"{datetime.now()}: dumping fits to .. `{fout}`.",
flush=True)
with open(fout, "wb") as f:
numpy.save(f, out)

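The new `scripts/fit_init.py` reduces each parent halo to a centre of mass and a Lagrangian patch size, taken as the 99th percentile of particle distances from the centre, capped at 0.075 box units. A hedged numpy sketch of what `csiborgtools.fits.dist_centmass` plausibly computes, under the assumption that the particle columns are (x, y, z, M) as extracted by the sorting script:

import numpy

def dist_centmass(part):
    """Mass-weighted centre of mass and each particle's distance from it.
    A sketch under the column-layout assumption above."""
    pos, mass = part[:, :3], part[:, 3]
    cm = numpy.average(pos, axis=0, weights=mass)
    dist = numpy.linalg.norm(pos - cm, axis=1)
    return dist, cm

# As in the script: patch size is the 99th-percentile distance, capped.
# dist, cm = dist_centmass(part)
# patchsize = min(numpy.percentile(dist, 99), 0.075)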
@@ -54,35 +54,6 @@ else:
nsims = args.ics
def load_clump_particles(clumpid, particles, clump_map):
"""
Load a clump's particles. If it is not there, i.e. the clump has no associated
particles, return `None`.
"""
try:
return particles[clump_map[clumpid], :]
except KeyError:
return None
def load_parent_particles(clumpid, particles, clump_map, clumps_cat):
"""
Load a parent halo's particles.
"""
indxs = clumps_cat["index"][clumps_cat["parent"] == clumpid]
# We first load the particles of each clump belonging to this parent
# and then concatenate them for further analysis.
clumps = []
for ind in indxs:
parts = load_clump_particles(ind, particles, clump_map)
if parts is not None:
clumps.append(parts)
if len(clumps) == 0:
return None
return numpy.concatenate(clumps)
# We loop over simulations. Here later optionally add MPI.
for i, nsim in enumerate(nsims):
if rank == 0:
@@ -91,10 +62,11 @@ for i, nsim in enumerate(nsims):
nsnap = max(paths.get_snapshots(nsim))
box = csiborgtools.read.BoxUnits(nsnap, nsim, paths)
particles = h5py.File(paths.particle_h5py_path(nsim), 'r')["particles"]
clump_map = h5py.File(paths.particle_h5py_path(nsim, "clumpmap"), 'r')
clumps_cat = csiborgtools.read.ClumpsCatalogue(nsim, paths, maxdist=None,
minmass=None, rawdata=True,
f = csiborgtools.read.read_h5(paths.particles_path(nsim))
particles = f["particles"]
clump_map = f["clumpmap"]
clid2map = {clid: i for i, clid in enumerate(clump_map[:, 0])}
clumps_cat = csiborgtools.read.ClumpsCatalogue(nsim, paths, rawdata=True,
load_fitted=False)
ismain = clumps_cat.ismain
ntasks = len(clumps_cat)
@@ -108,8 +80,8 @@ for i, nsim in enumerate(nsims):
continue
clumpid = clumps_cat["index"][j]
parts = load_parent_particles(clumpid, particles, clump_map,
clumps_cat)
parts = csiborgtools.read.load_parent_particles(
clumpid, particles, clump_map, clid2map, clumps_cat)
# If we have no particles, then do not save anything.
if parts is None:
continue
@@ -124,8 +96,7 @@ for i, nsim in enumerate(nsims):
_out["r"] = r[mask]
_out["M"] = obj["M"][mask]
out[str(clumps_cat["index"][j])] = _out
out[str(clumpid)] = _out
# Finished, so we save everything.
fout = paths.radpos_path(nsnap, nsim)

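Several scripts in this PR distribute work over MPI ranks via `csiborgtools.fits.split_jobs(ntasks, nproc)[rank]`. A minimal sketch of such a splitter, assuming it simply deals task indices out round-robin (the package's actual implementation may differ):

def split_jobs(njobs, ncpu):
    """Split `njobs` task indices among `ncpu` processes; returns one list
    of indices per process. A sketch, assuming round-robin assignment."""
    jobs = [[] for _ in range(ncpu)]
    for i in range(njobs):
        jobs[i % ncpu].append(i)
    return jobs

# Usage mirroring the scripts above:
# jobs = split_jobs(len(ics), nproc)[rank]
# for nsim in [ics[i] for i in jobs]:
#     ...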
@@ -13,6 +13,7 @@
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""A script to calculate overlap between two CSiBORG realisations."""
from argparse import ArgumentParser
from copy import deepcopy
from datetime import datetime
from distutils.util import strtobool
@@ -26,13 +27,16 @@ except ModuleNotFoundError:
sys.path.append("../")
import csiborgtools
from csiborgtools.read import HaloCatalogue, read_h5
# Argument parser
parser = ArgumentParser()
parser.add_argument("--nsim0", type=int)
parser.add_argument("--nsimx", type=int)
parser.add_argument("--nmult", type=float)
parser.add_argument("--sigma", type=float)
parser.add_argument("--sigma", type=float, default=None)
parser.add_argument("--smoothen", type=lambda x: bool(strtobool(x)),
default=None)
parser.add_argument("--verbose", type=lambda x: bool(strtobool(x)),
default=False)
args = parser.parse_args()
@@ -43,27 +47,52 @@ matcher = csiborgtools.match.RealisationsMatcher()
# Load the raw catalogues (i.e. no selection) including the initial CM
# positions and the particle archives.
cat0 = csiborgtools.read.HaloCatalogue(args.nsim0, paths, load_initial=True,
rawdata=True)
catx = csiborgtools.read.HaloCatalogue(args.nsimx, paths, load_initial=True,
rawdata=True)
halos0_archive = paths.initmatch_path(args.nsim0, "particles")
halosx_archive = paths.initmatch_path(args.nsimx, "particles")
cat0 = HaloCatalogue(args.nsim0, paths, load_initial=True,
minmass=("totpartmass", 1e12), with_lagpatch=True)
catx = HaloCatalogue(args.nsimx, paths, load_initial=True,
minmass=("totpartmass", 1e12), with_lagpatch=True)
clumpmap0 = read_h5(paths.particles_path(args.nsim0))["clumpmap"]
parts0 = read_h5(paths.initmatch_path(args.nsim0, "particles"))["particles"]
clid2map0 = {clid: i for i, clid in enumerate(clumpmap0[:, 0])}
clumpmapx = read_h5(paths.particles_path(args.nsimx))["clumpmap"]
partsx = read_h5(paths.initmatch_path(args.nsimx, "particles"))["particles"]
clid2mapx = {clid: i for i, clid in enumerate(clumpmapx[:, 0])}
# We generate the background density fields. Loads each halo's particles one
# by one from the archive, concatenates them and calculates the NGP density
# field.
if args.verbose:
print(f"{datetime.now()}: generating the background density fields.",
flush=True)
delta_bckg = overlapper.make_bckg_delta(halos0_archive, verbose=args.verbose)
delta_bckg = overlapper.make_bckg_delta(halosx_archive, delta=delta_bckg,
delta_bckg = overlapper.make_bckg_delta(parts0, clumpmap0, clid2map0, cat0,
verbose=args.verbose)
delta_bckg = overlapper.make_bckg_delta(partsx, clumpmapx, clid2mapx, catx,
delta=delta_bckg, verbose=args.verbose)
# We calculate the overlap between the NGP fields.
if args.verbose:
print(f"{datetime.now()}: crossing the simulations.", flush=True)
match_indxs, ngp_overlap = matcher.cross(cat0, catx, halos0_archive,
halosx_archive, delta_bckg)
match_indxs, ngp_overlap = matcher.cross(cat0, catx, parts0, partsx, clumpmap0,
clumpmapx, delta_bckg,
verbose=args.verbose)
# We wish to store the halo IDs of the matches, not their array positions in
# the catalogues.
match_hids = deepcopy(match_indxs)
for i, matches in enumerate(match_indxs):
for j, match in enumerate(matches):
match_hids[i][j] = catx["index"][match]
fout = paths.overlap_path(args.nsim0, args.nsimx, smoothed=False)
numpy.savez(fout, ref_hids=cat0["index"], match_hids=match_hids,
ngp_overlap=ngp_overlap)
if args.verbose:
print(f"{datetime.now()}: calculated NGP overlap, saved to {fout}.",
flush=True)
if not args.smoothen:
quit()
# We now smooth the background density field for the smoothed overlap
# calculation.
@@ -72,16 +101,12 @@ if args.verbose:
gaussian_filter(delta_bckg, output=delta_bckg, **smooth_kwargs)
# We calculate the smoothed overlap for the pairs whose NGP overlap is > 0.
if args.verbose:
print(f"{datetime.now()}: calculating smoothed overlaps.", flush=True)
smoothed_overlap = matcher.smoothed_cross(cat0, catx, halos0_archive,
halosx_archive, delta_bckg,
smoothed_overlap = matcher.smoothed_cross(cat0, catx, parts0, partsx,
clumpmap0, clumpmapx, delta_bckg,
match_indxs, smooth_kwargs)
# We save the results at long last.
fout = paths.overlap_path(args.nsim0, args.nsimx)
fout = paths.overlap_path(args.nsim0, args.nsimx, smoothed=True)
numpy.savez(fout, smoothed_overlap=smoothed_overlap, sigma=args.sigma)
if args.verbose:
print(f"{datetime.now()}: saving results to `{fout}`.", flush=True)
numpy.savez(fout, match_indxs=match_indxs, ngp_overlap=ngp_overlap,
smoothed_overlap=smoothed_overlap, sigma=args.sigma)
print(f"{datetime.now()}: all finished.", flush=True)
print(f"{datetime.now()}: calculated smoothed overlap, saved to {fout}.",
flush=True)

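The matching script builds `delta_bckg` by nearest-grid-point (NGP) assignment of the matched halos' particle masses, accumulating both catalogues into one field. A minimal sketch of NGP assignment onto a cubic grid, assuming positions in [0, 1) box units and a hypothetical `ncells`; this is not the actual signature of `overlapper.make_bckg_delta`:

import numpy

def ngp_delta(pos, mass, ncells=256, delta=None):
    """NGP mass assignment; pass an existing `delta` to accumulate a second
    catalogue into the same field, as the script above does."""
    if delta is None:
        delta = numpy.zeros((ncells,) * 3, dtype=numpy.float32)
    # Nearest grid point of each particle, clipped to the box.
    cells = numpy.clip((pos * ncells).astype(numpy.int64), 0, ncells - 1)
    numpy.add.at(delta, (cells[:, 0], cells[:, 1], cells[:, 2]), mass)
    return delta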
@@ -12,18 +12,20 @@
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""
Script to load in the simulation particles and dump them to an HDF5 file.
Creates a mapping to directly access the particles of a single clump.
Script to load in the simulation particles, sort them by their clump ID and
dump them into an HDF5 file. Stores the first and last index of each clump in
the particle array, which allows fast slicing of the array to access the
particles of a single clump.
"""
from datetime import datetime
from distutils.util import strtobool
from gc import collect
import h5py
import numba
import numpy
from mpi4py import MPI
from tqdm import tqdm
from tqdm import trange
try:
import csiborgtools
@@ -44,75 +46,109 @@ nproc = comm.Get_size()
parser = ArgumentParser()
parser.add_argument("--ics", type=int, nargs="+", default=None,
help="IC realisations. If `-1` processes all simulations.")
parser.add_argument("--pos_only", type=lambda x: bool(strtobool(x)),
help="Do we only dump positions?")
parser.add_argument("--dtype", type=str, choices=["float32", "float64"],
default="float32",)
args = parser.parse_args()
verbose = nproc == 1
paths = csiborgtools.read.CSiBORGPaths(**csiborgtools.paths_glamdring)
partreader = csiborgtools.read.ParticleReader(paths)
if args.pos_only:
pars_extract = ['x', 'y', 'z', 'M']
else:
pars_extract = ['x', 'y', 'z', 'vx', 'vy', 'vz', 'M']
# Keep "ID" as the last column!
pars_extract = ['x', 'y', 'z', 'vx', 'vy', 'vz', 'M', "ID"]
if args.ics is None or args.ics[0] == -1:
ics = paths.get_ics(tonew=False)
else:
ics = args.ics
@numba.jit(nopython=True)
def minmax_clump(clid, clump_ids, start_loop=0):
"""
Find the start and end index of a clump in a sorted array of clump IDs.
This is much faster than using `numpy.where` and then `numpy.min` and
`numpy.max`.
"""
start = None
end = None
for i in range(start_loop, clump_ids.size):
n = clump_ids[i]
if n == clid:
if start is None:
start = i
end = i
elif n > clid:
break
return start, end
# MPI loop over individual simulations. We read in the particles from RAMSES
# files and dump them to an HDF5 file.
jobs = csiborgtools.fits.split_jobs(len(ics), nproc)[rank]
for i in jobs:
nsim = ics[i]
nsnap = max(paths.get_snapshots(nsim))
print(f"{datetime.now()}: Rank {rank} loading particles {nsim}.",
fname = paths.particles_path(nsim)
# We first read in the clump IDs of the particles and infer the sorting.
# Right away we dump the clump IDs to an HDF5 file and clear up memory.
print(f"{datetime.now()}: rank {rank} loading particles {nsim}.",
flush=True)
part_cids = partreader.read_clumpid(nsnap, nsim, verbose=verbose)
sort_indxs = numpy.argsort(part_cids).astype(numpy.int32)
part_cids = part_cids[sort_indxs]
with h5py.File(fname, "w") as f:
f.create_dataset("clump_ids", data=part_cids)
f.close()
del part_cids
collect()
parts = partreader.read_particle(nsnap, nsim, pars_extract,
return_structured=False, verbose=verbose)
if args.dtype == "float64":
parts = parts.astype(numpy.float64)
kind = "pos" if args.pos_only else None
print(f"{datetime.now()}: Rank {rank} dumping particles from {nsim}.",
# Next we read in the particles and sort them by their clump ID.
# We cannot directly read this as an unstructured array because the float32
# precision is insufficient to capture the particle IDs.
parts, pids = partreader.read_particle(
nsnap, nsim, pars_extract, return_structured=False, verbose=verbose)
# We now save the particle IDs and the particles in two steps.
print(f"{datetime.now()}: rank {rank} dumping particles from {nsim}.",
flush=True)
parts = parts[sort_indxs]
pids = pids[sort_indxs]
del sort_indxs
collect()
with h5py.File(paths.particle_h5py_path(nsim, kind, args.dtype), "w") as f:
with h5py.File(fname, "r+") as f:
f.create_dataset("particle_ids", data=pids)
f.close()
del pids
collect()
with h5py.File(fname, "r+") as f:
f.create_dataset("particles", data=parts)
f.close()
del parts
collect()
print(f"{datetime.now()}: Rank {rank} finished dumping of {nsim}.",
flush=True)
# If we are dumping only particle positions, then we are done.
if args.pos_only:
continue
print(f"{datetime.now()}: Rank {rank} mapping particles from {nsim}.",
print(f"{datetime.now()}: rank {rank} creating clump mapping for {nsim}.",
flush=True)
# If not, then load the clump IDs and prepare the memory mapping. We find
# which array positions correspond to which clump IDs and save it. With
# this we can then lazily load into memory the particles for each clump.
part_cids = partreader.read_clumpid(nsnap, nsim, verbose=verbose)
cat = csiborgtools.read.ClumpsCatalogue(nsim, paths, load_fitted=False,
rawdata=True)
clumpinds = cat["index"]
# Some of the clumps have no particles, so we do not loop over them
clumpinds = clumpinds[numpy.isin(clumpinds, part_cids)]
out = {}
for i, cid in enumerate(tqdm(clumpinds) if verbose else clumpinds):
out.update({str(cid): numpy.where(part_cids == cid)[0]})
# Load clump IDs back to memory
with h5py.File(fname, "r") as f:
part_cids = f["clump_ids"][:]
# We loop over the unique clump IDs.
unique_clump_ids = numpy.unique(part_cids)
clump_map = numpy.full((unique_clump_ids.size, 3), numpy.nan,
dtype=numpy.int32)
start_loop = 0
niters = unique_clump_ids.size
for i in trange(niters) if verbose else range(niters):
clid = unique_clump_ids[i]
k0, kf = minmax_clump(clid, part_cids, start_loop=start_loop)
clump_map[i, 0] = clid
clump_map[i, 1] = k0
clump_map[i, 2] = kf
start_loop = kf
# We save the mapping to an HDF5 file.
with h5py.File(paths.particle_h5py_path(nsim, "clumpmap"), "w") as f:
for cid, indxs in out.items():
f.create_dataset(cid, data=indxs)
with h5py.File(paths.particles_path(nsim), "r+") as f:
f.create_dataset("clumpmap", data=clump_map)
f.close()
del part_cids, cat, clumpinds, out
del part_cids
collect()
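Because `part_cids` is sorted, `minmax_clump` scans forward once from `start_loop`, so building the whole map costs a single pass over the particle array. A short usage illustration with hypothetical values:

import numpy

clump_ids = numpy.array([2, 2, 2, 5, 5, 9])  # sorted clump IDs, as dumped
k0, kf = minmax_clump(5, clump_ids)          # -> (3, 4), inclusive bounds
# A clump's particles are then one contiguous slice of the particle array:
# particles[k0:kf + 1, :]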

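The reason particle IDs must never pass through float32 ("Fix bug due to insufficient precision", "Make sure PIDs are never forced to float32" above) is that float32 carries a 24-bit significand, so integers above 2**24 = 16,777,216 are no longer exactly representable. A small demonstration:

import numpy

pid = numpy.int64(16_777_217)       # 2**24 + 1, a plausible particle ID
print(numpy.float32(pid) == pid)    # False: rounds to 16777216
print(int(numpy.float32(pid)))      # 16777216, i.e. a wrong ID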
@@ -1,199 +0,0 @@
# Copyright (C) 2022 Richard Stiskalek
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""
Script to calculate the particle centre of mass, Lagrangian patch size in the
initial snapshot and the particle mapping.
"""
from argparse import ArgumentParser
from os.path import join
from datetime import datetime
from gc import collect
import joblib
from os import remove
import h5py
import numpy
from mpi4py import MPI
from tqdm import trange
try:
import csiborgtools
except ModuleNotFoundError:
import sys
sys.path.append("../")
import csiborgtools
# Get MPI things
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
nproc = comm.Get_size()
verbose = nproc == 1
# Argument parser
parser = ArgumentParser()
parser.add_argument("--ics", type=int, nargs="+", default=None,
help="IC realisations. If `-1` processes all simulations.")
args = parser.parse_args()
paths = csiborgtools.read.CSiBORGPaths(**csiborgtools.paths_glamdring)
partreader = csiborgtools.read.ParticleReader(paths)
ftemp = lambda kind, nsim, rank: join(paths.temp_dumpdir, f"{kind}_{nsim}_{rank}.p") # noqa
if args.ics is None or args.ics[0] == -1:
ics = paths.get_ics(tonew=True)
else:
ics = args.ics
# We loop over simulations. Each simulation is then processed with MPI, rank 0
# loads the data and broadcasts it to other ranks.
for nsim in ics:
nsnap = max(paths.get_snapshots(nsim))
if rank == 0:
print(f"{datetime.now()}: reading simulation {nsim}.", flush=True)
# We first load particles in the initial and final snapshots and sort
# them by their particle IDs so that we can match them by array
# position. `clump_ids` are the clump IDs of particles.
part0 = partreader.read_particle(1, nsim, ["x", "y", "z", "M", "ID"],
verbose=True,
return_structured=False)
part0 = part0[numpy.argsort(part0[:, -1])]
part0 = part0[:, :-1] # Now we no longer need the particle IDs
pid = partreader.read_particle(nsnap, nsim, ["ID"], verbose=True,
return_structured=False).reshape(-1, )
clump_ids = partreader.read_clumpid(nsnap, nsim, verbose=True)
clump_ids = clump_ids[numpy.argsort(pid)]
# Release the particle IDs, we will not need them anymore now that both
# particle arrays are matched in ordering.
del pid
collect()
# Particles whose clump ID is 0 are unassigned to a clump, so we can
# get rid of them to speed up subsequent operations. We will not need
# these. Again we release the mask.
mask = clump_ids > 0
clump_ids = clump_ids[mask]
part0 = part0[mask, :]
del mask
collect()
print(f"{datetime.now()}: dumping particles for {nsim}.", flush=True)
with h5py.File(paths.initmatch_path(nsim, "particles"), "w") as f:
f.create_dataset("particles", data=part0)
print(f"{datetime.now()}: broadcasting simulation {nsim}.", flush=True)
# Stop all ranks and figure out array shapes from the 0th rank
comm.Barrier()
if rank == 0:
shape = numpy.array([*part0.shape], dtype=numpy.int32)
else:
shape = numpy.empty(2, dtype=numpy.int32)
comm.Bcast(shape, root=0)
# Now broadcast the particle arrays to all ranks
if rank > 0:
part0 = numpy.empty(shape, dtype=numpy.float32)
clump_ids = numpy.empty(shape[0], dtype=numpy.int32)
comm.Bcast(part0, root=0)
comm.Bcast(clump_ids, root=0)
if rank == 0:
print(f"{datetime.now()}: simulation {nsim} broadcasted.", flush=True)
# Calculate the centre of mass of each parent halo, the Lagrangian patch
# size and optionally the initial snapshot particles belonging to this
# parent halo. Dumping the particles will take majority of time.
if rank == 0:
print(f"{datetime.now()}: calculating simulation {nsim}.", flush=True)
# We load up the clump catalogue which contains information about the
# ultimate parent halos of each clump. We will loop only over the clump
# IDs of ultimate parent halos and add their substructure particles and at
# the end save these.
cat = csiborgtools.read.ClumpsCatalogue(nsim, paths, load_fitted=False,
rawdata=True)
parent_ids = cat["index"][cat.ismain]
parent_ids = parent_ids
hid2arrpos = {indx: j for j, indx in enumerate(parent_ids)}
# And we pre-allocate the output array for this simulation.
dtype = {"names": ["index", "x", "y", "z", "lagpatch"],
"formats": [numpy.int32] + [numpy.float32] * 4}
# We MPI loop over the individual halos
jobs = csiborgtools.fits.split_jobs(parent_ids.size, nproc)[rank]
_out_fits = numpy.full(len(jobs), numpy.nan, dtype=dtype)
_out_map = {}
for i in trange(len(jobs)) if verbose else range(len(jobs)):
clid = parent_ids[jobs[i]]
_out_fits["index"][i] = clid
mmain_indxs = cat["index"][cat["parent"] == clid]
mmain_mask = numpy.isin(clump_ids, mmain_indxs, assume_unique=True)
mmain_particles = part0[mmain_mask, :]
# If the number of particles is too small, we skip this halo.
if mmain_particles.size < 100:
continue
raddist, cmpos = csiborgtools.match.dist_centmass(mmain_particles)
patchsize = csiborgtools.match.dist_percentile(raddist, [99],
distmax=0.075)
# Write the temporary results
_out_fits["x"][i], _out_fits["y"][i], _out_fits["z"][i] = cmpos
_out_fits["lagpatch"][i] = patchsize
_out_map.update({str(clid): numpy.where(mmain_mask)[0]})
# Dump the results of this rank to a temporary file.
joblib.dump(_out_fits, ftemp("fits", nsim, rank))
joblib.dump(_out_map, ftemp("map", nsim, rank))
del part0, clump_ids,
collect()
# Now we wait for all ranks, then collect the results and save it.
comm.Barrier()
if rank == 0:
print(f"{datetime.now()}: collecting results for {nsim}.", flush=True)
out_fits = numpy.full(parent_ids.size, numpy.nan, dtype=dtype)
out_map = {}
for i in range(nproc):
# Merge the map dictionaries
out_map = out_map | joblib.load(ftemp("map", nsim, i))
# Now merge the structured arrays
_out_fits = joblib.load(ftemp("fits", nsim, i))
for j in range(_out_fits.size):
k = hid2arrpos[_out_fits["index"][j]]
for par in dtype["names"]:
out_fits[par][k] = _out_fits[par][j]
remove(ftemp("fits", nsim, i))
remove(ftemp("map", nsim, i))
# Now save it
fout_fit = paths.initmatch_path(nsim, "fit")
print(f"{datetime.now()}: dumping fits to .. `{fout_fit}`.",
flush=True)
with open(fout_fit, "wb") as f:
numpy.save(f, out_fits)
fout_map = paths.initmatch_path(nsim, "halomap")
print(f"{datetime.now()}: dumping mapping to .. `{fout_map}`.",
flush=True)
with h5py.File(fout_map, "w") as f:
for hid, indxs in out_map.items():
f.create_dataset(hid, data=indxs)
# We force clean up the memory before continuing.
del out_map, out_fits
collect()

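The deleted script above matched the initial and final snapshots by sorting both particle arrays by particle ID, so that matched particles end up at the same array positions. A tiny numpy illustration of the idea, with hypothetical IDs:

import numpy

ids0 = numpy.array([11, 7, 42, 3])  # initial-snapshot particle IDs
idsf = numpy.array([42, 3, 11, 7])  # final-snapshot particle IDs
# After sorting both by ID, row i refers to the same particle in each array:
assert (ids0[numpy.argsort(ids0)] == idsf[numpy.argsort(idsf)]).all()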
scripts/pre_sortinit.py (new file, 82 lines)

@@ -0,0 +1,82 @@
# Copyright (C) 2022 Richard Stiskalek
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""
Script to sort the initial snapshot particles according to their final
snapshot ordering, which is sorted by the clump IDs.
"""
from argparse import ArgumentParser
from datetime import datetime
import h5py
from gc import collect
import numpy
from mpi4py import MPI
try:
import csiborgtools
except ModuleNotFoundError:
import sys
sys.path.append("../")
import csiborgtools
# Get MPI things
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
nproc = comm.Get_size()
verbose = nproc == 1
# Argument parser
parser = ArgumentParser()
parser.add_argument("--ics", type=int, nargs="+", default=None,
help="IC realisations. If `-1` processes all simulations.")
args = parser.parse_args()
paths = csiborgtools.read.CSiBORGPaths(**csiborgtools.paths_glamdring)
partreader = csiborgtools.read.ParticleReader(paths)
# NOTE: ID has to be the last column.
pars_extract = ["x", "y", "z", "M", "ID"]
if args.ics is None or args.ics[0] == -1:
ics = paths.get_ics(tonew=True)
else:
ics = args.ics
# MPI loop over the simulations: each rank reads and processes its own
# subset of the IC realisations.
jobs = csiborgtools.fits.split_jobs(len(ics), nproc)[rank]
for i in jobs:
nsim = ics[i]
nsnap = max(paths.get_snapshots(nsim))
print(f"{datetime.now()}: reading and processing simulation {nsim}.",
flush=True)
# We first load the particle IDs in the final snapshot.
pidf = csiborgtools.read.read_h5(paths.particles_path(nsim))
pidf = pidf["particle_ids"]
# Then we load the particles in the initial snapshot and sort them so that
# their particle IDs match the ordering of the final snapshot. Because of
# float32 precision, the particle IDs are again returned separately.
part0, pid0 = partreader.read_particle(
1, nsim, pars_extract, return_structured=False, verbose=verbose)
# First sort the initial particles by their own IDs, then apply the
# inverse of the final snapshot's ID ordering.
part0 = part0[numpy.argsort(pid0)]
del pid0
collect()
part0 = part0[numpy.argsort(numpy.argsort(pidf))]
print(f"{datetime.now()}: dumping particles for {nsim}.", flush=True)
with h5py.File(paths.initmatch_path(nsim, "particles"), "w") as f:
f.create_dataset("particles", data=part0)
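The final two lines rely on `numpy.argsort(numpy.argsort(pidf))` being the inverse permutation of `argsort(pidf)`: once `part0` is sorted by its own particle IDs, indexing with this inverse places each particle at the position its ID occupies in the final snapshot. A small check of the trick:

import numpy

pidf = numpy.array([42, 3, 11, 7])        # final-snapshot ID ordering
part0_ids = numpy.sort(pidf)              # initial array, sorted by ID
inv = numpy.argsort(numpy.argsort(pidf))  # inverse permutation
print(part0_ids[inv])                     # [42  3 11  7] -- matches pidf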