Better plots (#73)

* Edits paths of saved files * Add upper threshold options * Add upper threshold options * add latex_float option * Add weighted stats * add new plot
2025-06-08 09:51:12 +00:00 · 2023-06-28 15:22:42 +01:00 · 2023-06-28 15:22:42 +01:00 · fbf9c2a4b7
commit fbf9c2a4b7
parent de7def61d5
5 changed files with 225 additions and 39 deletions
--- a/csiborgtools/init.py
+++ b/csiborgtools/init.py
@ -17,8 +17,7 @@ from csiborgtools import clustering, field, fits, match, read  # noqa
 # Arguments to csiborgtools.read.Paths.
 paths_glamdring = {"srcdir": "/mnt/extraspace/hdesmond/",
                   "postdir": "/mnt/extraspace/rstiskalek/CSiBORG/",
-                   "BORG_dir": "/mnt/extraspace/rstiskalek/BORG/",
+                   "borg_dir": "/users/hdesmond/BORG_final/",
                   "BORG_final_density": "/users/hdesmond/BORG_final/",
                   "quijote_dir": "/mnt/extraspace/rstiskalek/Quijote",
                   }
--- a/csiborgtools/read/init.py
+++ b/csiborgtools/read/init.py
@ -19,12 +19,14 @@ from .knn_summary import kNNCDFReader  # noqa
 from .nearest_neighbour_summary import NearestNeighbourReader  # noqa
 from .obs import (SDSS, MCXCClusters, PlanckClusters, TwoMPPGalaxies,  # noqa
                  TwoMPPGroups)
 from .overlap_summary import weighted_stats  # noqa
 from .overlap_summary import (NPairsOverlap, PairOverlap,  # noqa
-                              binned_resample_mean, get_cross_sims)
+                              binned_resample_mean, get_cross_sims)  # noqa
 from .paths import Paths  # noqa
 from .pk_summary import PKReader  # noqa
 from .readsim import (MmainReader, ParticleReader, halfwidth_mask,  # noqa
                      load_clump_particles, load_parent_particles, read_initcm)
 from .tpcf_summary import TPCFReader  # noqa
-from .utils import (cartesian_to_radec, cols_to_structured,  # noqa
+from .utils import (M200_to_R200, cartesian_to_radec,  # noqa
-                    radec_to_cartesian, read_h5, real2redshift, M200_to_R200)
+                    cols_to_structured, radec_to_cartesian, read_h5,
                    real2redshift)
--- a/csiborgtools/read/overlap_summary.py
+++ b/csiborgtools/read/overlap_summary.py
@ -19,7 +19,7 @@ from functools import lru_cache
 from os.path import isfile
 import numpy
-from tqdm import tqdm
+from tqdm import tqdm, trange
 ###############################################################################
 #                         Overlap of two simulations                          #
@ -293,7 +293,6 @@ class PairOverlap:
            if norm_kind is not None:
                dist[i] /= norm[i]
        return numpy.array(dist, dtype=object)
    def mass_ratio(self, mass_kind="totpartmass", in_log=True, in_abs=True):
@ -406,7 +405,7 @@ class PairOverlap:
        vals = self.cat0(par)
        out = [None] * len(self)
        for i, ind in enumerate(self["match_indxs"]):
-            out[i] = numpy.ones(ind.size) * vals[i]
+            out[i] = numpy.ones(len(ind)) * vals[i]
        return numpy.array(out, dtype=object)
    def cat0(self, key=None, index=None):
@ -459,6 +458,43 @@ class PairOverlap:
        return self["match_indxs"].size
 def weighted_stats(x, weights, min_weight=0, verbose=False):
    """
    Calculate the weighted mean and standard deviation of `x` using `weights`
    for each array of `x`.
    Parameters
    ----------
    x : array of arrays
        Array of arrays of values to calculate the weighted mean and standard
        deviation for.
    weights : array of arrays
        Array of arrays of weights to use for the calculation.
    min_weight : float, optional
        Minimum weight required for a value to be included in the calculation.
    verbose : bool, optional
        Verbosity flag.
    Returns
    -------
    stat : 2-dimensional array of shape `(len(x), 2)`
        The first column is the weighted mean and the second column is the
        weighted standard deviation.
    """
    out = numpy.full((x.size, 2), numpy.nan, dtype=numpy.float32)
    for i in trange(len(x)) if verbose else range(len(x)):
        x_, w_ = numpy.asarray(x[i]), numpy.asarray(weights[i])
        mask = w_ > min_weight
        x_ = x_[mask]
        w_ = w_[mask]
        if len(w_) == 0:
            continue
        out[i, 0] = numpy.average(x_, weights=w_)
        out[i, 1] = numpy.average((x_ - out[i, 0])**2, weights=w_)**0.5
    return out
 ###############################################################################
 #                  Overlap of many pairs of simulations.                      #
 ###############################################################################
--- a/scripts_plots/plot_match.py
+++ b/scripts_plots/plot_match.py
@ -14,6 +14,7 @@
 # 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 from argparse import ArgumentParser
 from gc import collect
 from os.path import join
 import matplotlib as mpl
@ -21,6 +22,7 @@ import matplotlib.pyplot as plt
 import numpy
 import scienceplots  # noqa
 from cache_to_disk import cache_to_disk, delete_disk_caches_for_function
 from scipy.stats import kendalltau
 from tqdm import tqdm
 import plt_utils
@ -88,6 +90,7 @@ def get_overlap(nsim0):
    reader = csiborgtools.read.NPairsOverlap(cat0, catxs, paths)
    mass = reader.cat0("totpartmass")
    hindxs = reader.cat0("index")
    summed_overlap = reader.summed_overlap(True)
    prob_nomatch = reader.prob_nomatch(True)
@ -96,7 +99,7 @@ def get_overlap(nsim0):
 def plot_summed_overlap_vs_mass(nsim0):
    """
-    Plot the summer overlap of probaiblity of no matching for a single
+    Plot the summed overlap of probability of no matching for a single
    reference simulations as a function of the reference halo mass, along with
    their comparison.
@ -110,6 +113,8 @@ def plot_summed_overlap_vs_mass(nsim0):
    None
    """
    x, __, summed_overlap, prob_nomatch = get_overlap(nsim0)
    del __
    collect()
    mean_overlap = numpy.mean(summed_overlap, axis=1)
    std_overlap = numpy.std(summed_overlap, axis=1)
@ -178,6 +183,63 @@ def plot_summed_overlap_vs_mass(nsim0):
        plt.close()
 def plot_mass_vs_separation(nsim0, nsimx, min_overlap=0.0):
    """
    Plot the mass of a reference halo against the weighted separation of
    its counterparts.
    Parameters
    ----------
    nsim0 : int
        Reference simulation index.
    nsimx : int
        Cross simulation index.
    min_overlap : float, optional
        Minimum overlap to consider.
    Returns
    -------
    None
    """
    paths = csiborgtools.read.Paths(**csiborgtools.paths_glamdring)
    cat0 = open_cat(nsim0)
    catx = open_cat(nsimx)
    reader = csiborgtools.read.PairOverlap(cat0, catx, paths)
    mass = numpy.log10(reader.cat0("totpartmass"))
    dist = reader.dist(in_initial=False, norm_kind="r200c")
    overlap = reader.overlap(True)
    dist = csiborgtools.read.weighted_stats(dist, overlap,
                                            min_weight=min_overlap)
    mask = numpy.isfinite(dist[:, 0])
    mass = mass[mask]
    dist = dist[mask, :]
    dist = numpy.log10(dist)
    p = numpy.polyfit(mass, dist[:, 0], 1)
    xrange = numpy.linspace(numpy.min(mass), numpy.max(mass), 1000)
    txt = r"$m = {}$, $c = {}$".format(*plt_utils.latex_float(*p, n=3))
    with plt.style.context(plt_utils.mplstyle):
        fig, ax = plt.subplots()
        ax.set_title(txt, fontsize="small")
        cx = ax.hexbin(mass, dist[:, 0], mincnt=1, bins="log", gridsize=50)
        ax.plot(xrange, numpy.polyval(p, xrange), color="red",
                linestyle="--")
        fig.colorbar(cx, label="Bin counts")
        ax.set_xlabel(r"$\log M_{\rm tot} / M_\odot$")
        ax.set_ylabel(r"$\log \langle \Delta R / R_{\rm 200c}\rangle$")
        fig.tight_layout()
        for ext in ["png"]:
            fout = join(plt_utils.fout, f"mass_vs_sep_{nsim0}_{nsimx}.{ext}")
            print(f"Saving to `{fout}`.")
            fig.savefig(fout, dpi=plt_utils.dpi, bbox_inches="tight")
        plt.close()
 ###############################################################################
 #                        Nearest neighbour plotting                           #
 ###############################################################################
@ -596,6 +658,7 @@ def plot_significance(simname, runs, nsim, nobs, kind, kwargs, runs_to_mass):
        Nearest neighbour reader keyword arguments.
    runs_to_mass : dict
        Dictionary mapping run names to total halo mass range.
    upper_threshold : bool, optional
    Returns
    -------
@ -642,7 +705,10 @@ def plot_significance(simname, runs, nsim, nobs, kind, kwargs, runs_to_mass):
            if simname == "quijote":
                paths = csiborgtools.read.Paths(**kwargs["paths_kind"])
                nsim = paths.quijote_fiducial_nsim(nsim, nobs)
-            fout = join(plt_utils.fout, f"significance_{kind}_{simname}_{str(nsim).zfill(5)}.{ext}")  # noqa
+            nsim = str(nsim).zfill(5)
            fout = join(
                plt_utils.fout,
                f"significance_{kind}_{simname}_{nsim}_{runs}.{ext}")
            print(f"Saving to `{fout}`.")
            fig.savefig(fout, dpi=plt_utils.dpi, bbox_inches="tight")
        plt.close()
@ -673,7 +739,7 @@ def make_binlims(run, runs_to_mass):
 def plot_significance_vs_mass(simname, runs, nsim, nobs, kind, kwargs,
-                              runs_to_mass):
+                              runs_to_mass, upper_threshold=False):
    """
    Plot significance of the 1NN distance as a function of the total mass.
@ -694,6 +760,8 @@ def plot_significance_vs_mass(simname, runs, nsim, nobs, kind, kwargs,
        Nearest neighbour reader keyword arguments.
    runs_to_mass : dict
        Dictionary mapping run names to total halo mass range.
    upper_threshold : bool, optional
        Whether to enforce an upper threshold on halo mass.
    Returns
    -------
@ -715,17 +783,20 @@ def plot_significance_vs_mass(simname, runs, nsim, nobs, kind, kwargs,
                y = numpy.log10(make_ks(simname, run, nsim, nobs, kwargs))
            xmin, xmax = make_binlims(run, runs_to_mass)
-            mask = (x >= xmin) & (x < xmax)
+
-            xs.append(x[mask])
+            mask = (x >= xmin) & (x < xmax if upper_threshold else True)
            xs.append(numpy.log10(x[mask]))
            ys.append(y[mask])
        xs = numpy.concatenate(xs)
        ys = numpy.concatenate(ys)
-        plt.hexbin(xs, ys, gridsize=75, mincnt=1, xscale="log", bins="log")
+        plt.hexbin(xs, ys, gridsize=75, mincnt=1, bins="log")
        mask = numpy.isfinite(xs) & numpy.isfinite(ys)
        corr = plt_utils.latex_float(*kendalltau(xs[mask], ys[mask]))
        plt.title(r"$\tau = {}, p = {}$".format(*corr), fontsize="small")
-        plt.xlabel(r"$M_{\rm tot} / M_\odot$")
+        plt.xlabel(r"$\log M_{\rm tot} / M_\odot$")
        plt.xlim(numpy.min(xs))
        if kind == "ks":
            plt.ylabel(r"$\log p$-value of $r_{1\mathrm{NN}}$ distribution")
            plt.ylim(top=0)
@ -738,15 +809,18 @@ def plot_significance_vs_mass(simname, runs, nsim, nobs, kind, kwargs,
        for ext in ["png"]:
            if simname == "quijote":
                nsim = paths.quijote_fiducial_nsim(nsim, nobs)
-            fout = (f"significance_vs_mass_{kind}_{simname}"
+            nsim = str(nsim).zfill(5)
-                    + f"_{str(nsim).zfill(5)}.{ext}")
+            fout = f"sgnf_vs_mass_{kind}_{simname}_{nsim}_{runs}.{ext}"
            if upper_threshold:
                fout = fout.replace(f".{ext}", f"_upper_threshold.{ext}")
            fout = join(plt_utils.fout, fout)
            print(f"Saving to `{fout}`.")
            plt.savefig(fout, dpi=plt_utils.dpi, bbox_inches="tight")
        plt.close()
-def plot_kl_vs_ks(simname, runs, nsim, nobs, kwargs, runs_to_mass):
+def plot_kl_vs_ks(simname, runs, nsim, nobs, kwargs, runs_to_mass,
                  upper_threshold=False):
    """
    Plot Kullback-Leibler divergence vs Kolmogorov-Smirnov statistic p-value.
@ -764,6 +838,8 @@ def plot_kl_vs_ks(simname, runs, nsim, nobs, kwargs, runs_to_mass):
        Nearest neighbour reader keyword arguments.
    runs_to_mass : dict
        Dictionary mapping run names to total halo mass range.
    upper_threshold : bool, optional
        Whether to enforce an upper threshold on halo mass.
    Returns
    -------
@ -779,7 +855,7 @@ def plot_kl_vs_ks(simname, runs, nsim, nobs, kwargs, runs_to_mass):
        y = make_ks(simname, run, nsim, nobs, kwargs)
        cmin, cmax = make_binlims(run, runs_to_mass)
-        mask = (c >= cmin) & (c < cmax)
+        mask = (c >= cmin) & (c < cmax if upper_threshold else True)
        xs.append(x[mask])
        ys.append(y[mask])
        cs.append(c[mask])
@ -792,6 +868,9 @@ def plot_kl_vs_ks(simname, runs, nsim, nobs, kwargs, runs_to_mass):
        plt.figure()
        plt.hexbin(xs, ys, C=cs, gridsize=50, mincnt=0,
                   reduce_C_function=numpy.median)
        mask = numpy.isfinite(xs) & numpy.isfinite(ys)
        corr = plt_utils.latex_float(*kendalltau(xs[mask], ys[mask]))
        plt.title(r"$\tau = {}, p = {}$".format(*corr), fontsize="small")
        plt.colorbar(label=r"$\log M_{\rm tot} / M_\odot$")
        plt.xlabel(r"$D_{\mathrm{KL}}$ of $r_{1\mathrm{NN}}$ distribution")
@ -801,14 +880,19 @@ def plot_kl_vs_ks(simname, runs, nsim, nobs, kwargs, runs_to_mass):
        for ext in ["png"]:
            if simname == "quijote":
                nsim = paths.quijote_fiducial_nsim(nsim, nobs)
-            fout = join(plt_utils.fout,
+            nsim = str(nsim).zfill(5)
-                        f"kl_vs_ks_{simname}_{run}_{str(nsim).zfill(5)}.{ext}")
+            fout = join(
                plt_utils.fout,
                f"kl_vs_ks_{simname}_{run}_{nsim}_{runs}.{ext}")
            if upper_threshold:
                fout = fout.replace(f".{ext}", f"_upper_threshold.{ext}")
            print(f"Saving to `{fout}`.")
            plt.savefig(fout, dpi=plt_utils.dpi, bbox_inches="tight")
        plt.close()
-def plot_kl_vs_overlap(runs, nsim, kwargs, runs_to_mass):
+def plot_kl_vs_overlap(runs, nsim, kwargs, runs_to_mass, plot_std=True,
                       upper_threshold=False):
    """
    Plot KL divergence vs overlap for CSiBORG.
@ -822,6 +906,10 @@ def plot_kl_vs_overlap(runs, nsim, kwargs, runs_to_mass):
        Nearest neighbour reader keyword arguments.
    runs_to_mass : dict
        Dictionary mapping run names to total halo mass range.
    plot_std : bool, optional
        Whether to plot the standard deviation of the overlap distribution.
    upper_threshold : bool, optional
        Whether to enforce an upper threshold on halo mass.
    Returns
    -------
@ -848,7 +936,7 @@ def plot_kl_vs_overlap(runs, nsim, kwargs, runs_to_mass):
        y1 = 1 - numpy.mean(prob_nomatch, axis=1)
        y2 = numpy.std(prob_nomatch, axis=1)
        cmin, cmax = make_binlims(run, runs_to_mass)
-        mask = (mass >= cmin) & (mass < cmax)
+        mask = (mass >= cmin) & (mass < cmax if upper_threshold else True)
        xs.append(x[mask])
        ys1.append(y1[mask])
        ys2.append(y2[mask])
@ -863,18 +951,28 @@ def plot_kl_vs_overlap(runs, nsim, kwargs, runs_to_mass):
        plt.figure()
        plt.hexbin(xs, ys1, C=cs, gridsize=50, mincnt=0,
                   reduce_C_function=numpy.median)
        mask = numpy.isfinite(xs) & numpy.isfinite(ys1)
        corr = plt_utils.latex_float(*kendalltau(xs[mask], ys1[mask]))
        plt.title(r"$\tau = {}, p = {}$".format(*corr), fontsize="small")
        plt.colorbar(label=r"$\log M_{\rm tot} / M_\odot$")
        plt.xlabel(r"$D_{\mathrm{KL}}$ of $r_{1\mathrm{NN}}$ distribution")
        plt.ylabel(r"$1 - \langle \eta^{\mathcal{B}}_a \rangle_{\mathcal{B}}$")
        plt.tight_layout()
        for ext in ["png"]:
            nsim = str(nsim).zfill(5)
            fout = join(plt_utils.fout,
-                        f"kl_vs_overlap_mean_{str(nsim).zfill(5)}.{ext}")
+                        f"kl_vs_overlap_mean_{nsim}_{runs}.{ext}")
            if upper_threshold:
                fout = fout.replace(f".{ext}", f"_upper_threshold.{ext}")
            print(f"Saving to `{fout}`.")
            plt.savefig(fout, dpi=plt_utils.dpi, bbox_inches="tight")
        plt.close()
    if not plot_std:
        return
    with plt.style.context(plt_utils.mplstyle):
        plt.figure()
        plt.hexbin(xs, ys2, C=cs, gridsize=50, mincnt=0,
@ -882,11 +980,17 @@ def plot_kl_vs_overlap(runs, nsim, kwargs, runs_to_mass):
        plt.colorbar(label=r"$\log M_{\rm tot} / M_\odot$")
        plt.xlabel(r"$D_{\mathrm{KL}}$ of $r_{1\mathrm{NN}}$ distribution")
        plt.ylabel(r"Ensemble std of summed overlap")
        mask = numpy.isfinite(xs) & numpy.isfinite(ys2)
        corr = plt_utils.latex_float(*kendalltau(xs[mask], ys2[mask]))
        plt.title(r"$\tau = {}, p = {}$".format(*corr), fontsize="small")
        plt.tight_layout()
        for ext in ["png"]:
            nsim = str(nsim).zfill(5)
            fout = join(plt_utils.fout,
-                        f"kl_vs_overlap_std_{str(nsim).zfill(5)}.{ext}")
+                        f"kl_vs_overlap_std_{nsim}_{runs}.{ext}")
            if upper_threshold:
                fout = fout.replace(f".{ext}", f"_upper_threshold.{ext}")
            print(f"Saving to `{fout}`.")
            plt.savefig(fout, dpi=plt_utils.dpi, bbox_inches="tight")
        plt.close()
@ -915,14 +1019,19 @@ if __name__ == "__main__":
        "mass009": (14.0, 14.4),  # There is no upper limit.
        }
-    cached_funcs = ["get_overlap", "read_dist", "make_kl", "make_ks"]
+    # cached_funcs = ["get_overlap", "read_dist", "make_kl", "make_ks"]
    cached_funcs = ["get_overlap"]
    if args.clean:
        for func in cached_funcs:
            print(f"Cleaning cache for function {func}.")
            delete_disk_caches_for_function(func)
    # Plot 1NN distance distributions.
    if True:
        plot_mass_vs_separation(7444 + 24, 8956 + 24 * 3)
    # Plot 1NN distance distributions.
    if False:
        for i in range(1, 10):
            run = f"mass00{i}"
            for pulled_cdf in [True, False]:
@ -931,30 +1040,35 @@ if __name__ == "__main__":
            plot_dist(run, "pdf", neighbour_kwargs, runs_to_mass)
    # Plot 1NN CDF differences.
-    if True:
+    if False:
        runs = [f"mass00{i}" for i in range(1, 10)]
        for pulled_cdf in [True, False]:
            plot_cdf_diff(runs, neighbour_kwargs, pulled_cdf=pulled_cdf,
                          runs_to_mass=runs_to_mass)
-    if True:
+    if False:
        runs = [f"mass00{i}" for i in range(1, 9)]
        for kind in ["kl", "ks"]:
            plot_significance("csiborg", runs, 7444, nobs=None, kind=kind,
                              kwargs=neighbour_kwargs,
                              runs_to_mass=runs_to_mass)
-    if True:
+    if False:
-        runs = [f"mass00{i}" for i in range(1, 10)]
+        # runs = [f"mass00{i}" for i in range(1, 10)]
        runs = ["mass007"]
        for kind in ["kl", "ks"]:
            plot_significance_vs_mass("csiborg", runs, 7444, nobs=None,
                                      kind=kind, kwargs=neighbour_kwargs,
-                                      runs_to_mass=runs_to_mass)
+                                      runs_to_mass=runs_to_mass,
                                      upper_threshold=True)
-    if True:
+    if False:
-        runs = [f"mass00{i}" for i in range(1, 10)]
+        # runs = [f"mass00{i}" for i in range(1, 10)]
        runs = ["mass006"]
        plot_kl_vs_ks("csiborg", runs, 7444, None, kwargs=neighbour_kwargs,
-                      runs_to_mass=runs_to_mass)
+                      runs_to_mass=runs_to_mass, upper_threshold=True)
-    if True:
+    if False:
-        runs = [f"mass00{i}" for i in range(1, 10)]
+        # runs = [f"mass00{i}" for i in range(1, 10)]
-        plot_kl_vs_overlap(runs, 7444, neighbour_kwargs, runs_to_mass)
+        runs = ["mass006"]
        plot_kl_vs_overlap(runs, 7444, neighbour_kwargs, runs_to_mass,
                           upper_threshold=True, plot_std=False)
--- a/scripts_plots/plt_utils.py
+++ b/scripts_plots/plt_utils.py
@ -16,3 +16,38 @@
 dpi = 600
 fout = "../plots/"
 mplstyle = ["science"]
 def latex_float(*floats, n=2):
    """
    Convert a float or a list of floats to a LaTeX string(s). Taken from [1].
    Parameters
    ----------
    floats : float or list of floats
        The float(s) to be converted.
    n : int, optional
        The number of significant figures to be used in the LaTeX string.
    Returns
    -------
    latex_floats : str or list of str
        The LaTeX string(s) representing the float(s).
    References
    ----------
    [1] https://stackoverflow.com/questions/13490292/format-number-using-latex-notation-in-python  # noqa
    """
    latex_floats = [None] * len(floats)
    for i, f in enumerate(floats):
        float_str = "{0:.{1}g}".format(f, n)
        if "e" in float_str:
            base, exponent = float_str.split("e")
            latex_floats[i] = r"{0} \times 10^{{{1}}}".format(base,
                                                              int(exponent))
        else:
            latex_floats[i] = float_str
    if len(floats) == 1:
        return latex_floats[0]
    return latex_floats