Fix overlap runs (#125)

* Update nb * Update script * Update script * Rename * Update script * Update script * Remove warning * Ignore minors when extracting MAH * Fix paths bug * Move notebooks * Move files * Rename and delete things * Rename file * Move file * Rename things * Remove old print statement * Add basic MAH plot * Add random MAH path * Output snapshot numbers * Add MAH random extraction * Fix redshift bug * Edit script * Add extracting random MAH * Little updates * Add CB2 redshift * Add some caching * Add diagnostic plots * Add caching * Minor updates * Update nb * Update notebook * Update script * Add Sorce randoms * Add CB2 varysmall * Update nb * Update nb * Update nb * Use catalogue HMF * Move definition of radec2galactic * Update nb * Update import * Update import * Add galatic coords to catalogues * Update nb
2025-07-12 00:53:02 +00:00 · 2024-04-08 11:23:21 +02:00 · 2024-04-08 11:23:21 +02:00 · ee222cd010
commit ee222cd010
parent c71f5a8513
31 changed files with 1813 additions and 798 deletions
--- a/notebooks/MAH/mah.ipynb
+++ b/notebooks/MAH/mah.ipynb
--- a/notebooks/MAH/mah.py
+++ b/notebooks/MAH/mah.py
@ -0,0 +1,221 @@
+# Copyright (C) 2024 Richard Stiskalek
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
+# Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+"""Script to help with `mah.py`."""
+from datetime import datetime
+
+import csiborgtools
+import numpy as np
+from astropy.cosmology import FlatLambdaCDM
+from h5py import File
+from tqdm import tqdm, trange
+from cache_to_disk import cache_to_disk
+from os.path import join
+
+
+RANDOM_MAH_Sorce_Virgo_UPPER = np.array(
+    [[2.18554217, 0.16246594],
+     [2.93253012, 0.17284951],
+     [3.2939759, 0.34169001],
+     [3.75180723, 0.42006683],
+     [4.28192771, 0.44691426],
+     [4.61927711, 0.53819753],
+     [5.34216867, 0.58454257],
+     [5.89638554, 0.68954882],
+     [6.23373494, 0.73361948],
+     [6.45060241, 0.81341823],
+     [7.05301205, 0.92071572],
+     [7.82409639, 0.92071572],
+     [8.28192771, 0.95953933],
+     [8.61927711, 0.97956078],
+     [9.70361446, 1.],
+     [11.17349398, 1.],
+     [13.07710843, 1.],
+     [13.82409639, 1.]]
+    )
+
+RANDOM_MAH_SORCE_Virgo_LOWER = np.array(
+    [[3.36626506e+00, 1.00000000e-02],
+     [3.75180723e+00, 1.10877404e-02],
+     [3.99277108e+00, 1.04216677e-02],
+     [4.30602410e+00, 1.15552746e-02],
+     [4.61927711e+00, 1.67577322e-02],
+     [4.98072289e+00, 2.14703224e-02],
+     [5.39036145e+00, 3.82789169e-02],
+     [5.89638554e+00, 5.00670000e-02],
+     [6.30602410e+00, 5.11116827e-02],
+     [7.29397590e+00, 5.32668971e-02],
+     [7.77590361e+00, 5.55129899e-02],
+     [8.11325301e+00, 6.68516464e-02],
+     [8.57108434e+00, 8.56515893e-02],
+     [9.60722892e+00, 1.32152759e-01],
+     [1.04265060e+01, 1.46527548e-01],
+     [1.07638554e+01, 1.49584947e-01],
+     [1.11493976e+01, 1.72849513e-01],
+     [1.18240964e+01, 2.16931625e-01],
+     [1.21855422e+01, 2.45546942e-01],
+     [1.25951807e+01, 3.48819614e-01],
+     [1.30771084e+01, 5.27197199e-01],
+     [1.36795181e+01, 8.83462949e-01],
+     [1.38000000e+01, 1.00000000e+00]]
+    )
+
+
+def t():
+    return datetime.now()
+
+
+@cache_to_disk(90)
+def load_data(nsim0, simname, min_logmass):
+    """
+    Load the reference catalogue, the cross catalogues, the merger trees and
+    the overlap reader (in this order).
+    """
+    paths = csiborgtools.read.Paths(**csiborgtools.paths_glamdring)
+    nsims = paths.get_ics(simname)
+    if "csiborg2_" in simname:
+        kind = simname.split("_")[-1]
+        print(f"{t()}: loading {len(nsims)} halo catalogues.")
+        cat0 = csiborgtools.read.CSiBORG2Catalogue(nsim0, 99, kind)
+        catxs = [csiborgtools.read.CSiBORG2Catalogue(n, 99, kind)
+                 for n in nsims if n != nsim0]
+
+        print(f"{t()}: loading {len(nsims)} merger trees.")
+        merger_trees = {}
+        for nsim in tqdm(nsims):
+            merger_trees[nsim] = csiborgtools.read.CSiBORG2MergerTreeReader(
+                nsim, kind)
+    else:
+        raise ValueError(f"Unknown simname: {simname}")
+
+    overlaps = csiborgtools.summary.NPairsOverlap(cat0, catxs, min_logmass)
+
+    return cat0, catxs, merger_trees, overlaps
+
+
+def extract_main_progenitor_maxoverlap(group_nr, overlaps, merger_trees):
+    """
+    Follow the main progenitor of a reference group and its maximum overlap
+    group in the cross catalogues.
+    """
+    min_overlap = 0
+
+    # NOTE these can be all cached in the overlap object.
+    max_overlaps = overlaps.max_overlap(0, True)[group_nr]
+    if np.sum(max_overlaps > 0) == 0:
+        raise ValueError(f"No overlaps for group {group_nr}.")
+
+    max_overlap_indxs = overlaps.max_overlap_key(
+        "index", min_overlap, True)[group_nr]
+
+    out = {}
+    for i in trange(len(overlaps), desc="Cross main progenitors"):
+        nsimx = overlaps[i].catx().nsim
+        group_nr_cross = max_overlap_indxs[i]
+
+        if np.isnan(group_nr_cross):
+            continue
+
+        x = merger_trees[nsimx].main_progenitor(int(group_nr_cross))
+        x["Overlap"] = max_overlaps[i]
+
+        out[nsimx] = x
+
+    nsim0 = overlaps.cat0().nsim
+    print(f"Appending main progenitor for {nsim0}.")
+    out[nsim0] = merger_trees[nsim0].main_progenitor(group_nr)
+
+    return out
+
+
+def summarize_extracted_mah(simname, data, nsim0, nsimxs, key,
+                            min_age=0, include_nsim0=True):
+    """
+    Turn the dictionaries of extracted MAHs into a single array.
+    """
+    if "csiborg2_" in simname:
+        nsnap = 100
+    else:
+        raise ValueError(f"Unknown simname: {simname}")
+
+    X = []
+    for nsimx in nsimxs + [nsim0] if include_nsim0 else nsimxs:
+        try:
+            d = data[nsimx]
+        except KeyError:
+            continue
+
+        x = np.full(nsnap, np.nan, dtype=np.float32)
+        x[d["SnapNum"]] = d[key]
+
+        X.append(x)
+
+    cosmo = FlatLambdaCDM(H0=67.76, Om0=csiborgtools.simname2Omega_m(simname))
+    zs = [csiborgtools.snap2redshift(i, simname) for i in range(nsnap)]
+    age = cosmo.age(zs).value
+
+    mask = age > min_age
+    return age[mask], np.vstack(X)[:, mask]
+
+
+def extract_mah(simname, logmass_bounds, key, min_age=0):
+    """
+    Extract the random MAHs for a given simulation and mass range and key.
+    Keys are for example: "MainProgenitorMass" or "GroupMass"
+    """
+    paths = csiborgtools.read.Paths(**csiborgtools.paths_glamdring)
+    nsims = paths.get_ics(simname)
+
+    X = []
+    for i, nsim in enumerate(nsims):
+        with File(paths.random_mah(simname, nsim), 'r') as f:
+            mah = f[key][:]
+            final_mass = mah[:, -1]
+
+            # Select the mass range
+            mask = final_mass >= 10**logmass_bounds[0]
+            mask &= final_mass < 10**logmass_bounds[1]
+
+            X.append(mah[mask])
+
+            if i == 0:
+                redshift = f["Redshift"][:]
+
+    X = np.vstack(X)
+
+    cosmo = FlatLambdaCDM(H0=67.76, Om0=csiborgtools.simname2Omega_m(simname))
+    age = cosmo.age(redshift).value
+
+    mask = age > min_age
+    return age[mask], X[:, mask]
+
+
+def extract_mah_mdpl2(logmass_bounds, min_age=1.5):
+    """
+    MAH extraction for the MDPL2 simulation. Data comes from
+    `https://arxiv.org/abs/2105.05859`
+    """
+    fdir = "/mnt/extraspace/rstiskalek/catalogs/"
+
+    age = np.genfromtxt(join(fdir, "mdpl2_cosmic_time.txt"))
+    with File(join(fdir, "diffmah_mdpl2.h5"), 'r') as f:
+        log_mp = f["logmp_sim"][:]
+        log_mah_sim = f["log_mah_sim"][...]
+
+    xmin, xmax = logmass_bounds
+    ks = np.where((log_mp > xmin) & (log_mp < xmax))[0]
+    X = 10**log_mah_sim[ks]
+
+    mask = age > min_age
+    return age[mask], X[:, mask]
--- a/notebooks/diagnostic/hmf.ipynb
+++ b/notebooks/diagnostic/hmf.ipynb
--- a/notebooks/diagnostic/hmf.py
+++ b/notebooks/diagnostic/hmf.py
@ -0,0 +1,48 @@
+# Copyright (C) 2024 Richard Stiskalek
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
+# Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+"""Script to help with `hmf.py`."""
+
+import csiborgtools
+import numpy as np
+from tqdm import tqdm
+
+
+def calculate_hmf(simname, bin_edges, halofinder="FOF", max_distance=135):
+    """
+    Calculate the halo mass function for a given simulation from catalogues.
+    """
+    paths = csiborgtools.read.Paths(**csiborgtools.paths_glamdring)
+    nsims = paths.get_ics(simname)
+    bounds = {"dist": (0, max_distance)}
+
+    hmf = np.full((len(nsims), len(bin_edges) - 1), np.nan)
+    volume = 4 / 3 * np.pi * max_distance**3
+    for i, nsim in enumerate(tqdm(nsims)):
+        if "csiborg2_" in simname:
+            kind = simname.split("_")[-1]
+            if halofinder == "FOF":
+                cat = csiborgtools.read.CSiBORG2Catalogue(
+                    nsim, 99, kind, bounds=bounds)
+            elif halofinder == "SUBFIND":
+                cat = csiborgtools.read.CSiBORG2SUBFINDCatalogue(
+                    nsim, 99, kind, kind, bounds=bounds)
+            else:
+                raise ValueError(f"Unknown halofinder: {halofinder}")
+        else:
+            raise ValueError(f"Unknown simname: {simname}")
+
+        hmf[i] = cat.halo_mass_function(bin_edges, volume, "totmass")[1]
+
+    return hmf
--- a/notebooks/diagnostic/plot_correlation.ipynb
+++ b/notebooks/diagnostic/plot_correlation.ipynb
--- a/notebooks/diagnostic/powerspectrum_H0.ipynb
+++ b/notebooks/diagnostic/powerspectrum_H0.ipynb
--- a/notebooks/field_sample.ipynb
+++ b/notebooks/field_sample.ipynb
@ -38,7 +38,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
@ -47,7 +47,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
@ -66,7 +66,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
@ -81,7 +81,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
--- a/notebooks/flow/PV_process.ipynb
+++ b/notebooks/flow/PV_process.ipynb
--- a/notebooks/flow/field_velocity_fof_sph.ipynb
+++ b/notebooks/flow/field_velocity_fof_sph.ipynb
--- a/notebooks/flow/flow_bulk.ipynb
+++ b/notebooks/flow/flow_bulk.ipynb
--- a/notebooks/flow/flow_bulk.py
+++ b/notebooks/flow/flow_bulk.py
@ -60,6 +60,6 @@ def read_enclosed_flow(simname):

    for n in range(nsim):
        V_n = csiborgtools.cartesian_to_radec(V[n])
-        l[n], b[n] = csiborgtools.flow.radec_to_galactic(V_n[:, 1], V_n[:, 2])
+        l[n], b[n] = csiborgtools.radec_to_galactic(V_n[:, 1], V_n[:, 2])

    return r, Vmag, l, b
--- a/notebooks/flow/flow_calibration.ipynb
+++ b/notebooks/flow/flow_calibration.ipynb
--- a/notebooks/flow/flow_calibration.py
+++ b/notebooks/flow/flow_calibration.py
@ -122,7 +122,7 @@ def read_samples(catalogue, simname, ksmooth, include_calibration=False,
    # Calculate direction in galactic coordinates of V_ext
    V = np.vstack([Vx, Vy, Vz]).T
    V = csiborgtools.cartesian_to_radec(V)
-    l, b = csiborgtools.flow.radec_to_galactic(V[:, 1], V[:, 2])
+    l, b = csiborgtools.radec_to_galactic(V[:, 1], V[:, 2])

    data = [alpha, beta, Vmag, l, b, sigma_v]
    names = ["alpha", "beta", "Vmag", "l", "b", "sigma_v"]
--- a/notebooks/flow/flow_mapping.ipynb
+++ b/notebooks/flow/flow_mapping.ipynb
--- a/notebooks/flow/flow_mock_pv.ipynb
+++ b/notebooks/flow/flow_mock_pv.ipynb
--- a/notebooks/overlap/plot_match.py
+++ b/notebooks/overlap/plot_match.py
--- a/notebooks/utils.py
+++ b/notebooks/utils.py
@ -0,0 +1,132 @@
+# Copyright (C) 2023 Richard Stiskalek
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
+# Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+"""
+Various utility functions.
+"""
+
+import numpy
+from scipy.special import erf
+
+dpi = 600
+fout = "../plots/"
+mplstyle = ["science"]
+
+
+def latex_float(*floats, n=2):
+    """
+    Convert a float or a list of floats to a LaTeX string(s). Taken from [1].
+
+    Parameters
+    ----------
+    floats : float or list of floats
+        The float(s) to be converted.
+    n : int, optional
+        The number of significant figures to be used in the LaTeX string.
+
+    Returns
+    -------
+    latex_floats : str or list of str
+        The LaTeX string(s) representing the float(s).
+
+    References
+    ----------
+    [1] https://stackoverflow.com/questions/13490292/format-number-using-latex-notation-in-python  # noqa
+    """
+    latex_floats = [None] * len(floats)
+    for i, f in enumerate(floats):
+        float_str = "{0:.{1}g}".format(f, n)
+        if "e" in float_str:
+            base, exponent = float_str.split("e")
+            latex_floats[i] = r"{0} \times 10^{{{1}}}".format(base,
+                                                              int(exponent))
+        else:
+            latex_floats[i] = float_str
+
+    if len(floats) == 1:
+        return latex_floats[0]
+    return latex_floats
+
+
+def nan_weighted_average(arr, weights=None, axis=None):
+    if weights is None:
+        weights = numpy.ones_like(arr)
+
+    valid_entries = ~numpy.isnan(arr)
+
+    # Set NaN entries in arr to 0 for computation
+    arr = numpy.where(valid_entries, arr, 0)
+
+    # Set weights of NaN entries to 0
+    weights = numpy.where(valid_entries, weights, 0)
+
+    # Compute the weighted sum and the sum of weights along the axis
+    weighted_sum = numpy.sum(arr * weights, axis=axis)
+    sum_weights = numpy.sum(weights, axis=axis)
+
+    return weighted_sum / sum_weights
+
+
+def nan_weighted_std(arr, weights=None, axis=None, ddof=0):
+    if weights is None:
+        weights = numpy.ones_like(arr)
+
+    valid_entries = ~numpy.isnan(arr)
+
+    # Set NaN entries in arr to 0 for computation
+    arr = numpy.where(valid_entries, arr, 0)
+
+    # Set weights of NaN entries to 0
+    weights = numpy.where(valid_entries, weights, 0)
+
+    # Calculate weighted mean
+    weighted_mean = numpy.sum(
+        arr * weights, axis=axis) / numpy.sum(weights, axis=axis)
+
+    # Calculate the weighted variance
+    variance = numpy.sum(
+        weights * (arr - numpy.expand_dims(weighted_mean, axis))**2, axis=axis)
+    variance /= numpy.sum(weights, axis=axis) - ddof
+
+    return numpy.sqrt(variance)
+
+
+def compute_error_bars(x, y, xbins, sigma):
+    bin_indices = numpy.digitize(x, xbins)
+    y_medians = numpy.array([numpy.median(y[bin_indices == i])
+                             for i in range(1, len(xbins))])
+
+    lower_pct = 100 * 0.5 * (1 - erf(sigma / numpy.sqrt(2)))
+    upper_pct = 100 - lower_pct
+
+    y_lower = numpy.full(len(y_medians), numpy.nan)
+    y_upper = numpy.full(len(y_medians), numpy.nan)
+
+    for i in range(len(y_medians)):
+        if numpy.sum(bin_indices == i + 1) == 0:
+            continue
+
+        y_lower[i] = numpy.percentile(y[bin_indices == i + 1], lower_pct)
+        y_upper[i] = numpy.percentile(y[bin_indices == i + 1], upper_pct)
+
+    yerr = (y_medians - numpy.array(y_lower), numpy.array(y_upper) - y_medians)
+
+    return y_medians, yerr
+
+
+def normalize_hexbin(hb):
+    hexagon_counts = hb.get_array()
+    normalized_counts = hexagon_counts / hexagon_counts.sum()
+    hb.set_array(normalized_counts)
+    hb.set_clim(normalized_counts.min(), normalized_counts.max())