Better plots (#73)

* Edits paths of saved files * Add upper threshold options * Add upper threshold options * add latex_float option * Add weighted stats * add new plot
2025-07-12 00:53:02 +00:00 · 2023-06-28 15:22:42 +01:00 · 2023-06-28 15:22:42 +01:00 · fbf9c2a4b7
commit fbf9c2a4b7
parent de7def61d5
5 changed files with 225 additions and 39 deletions
--- a/csiborgtools/init.py
+++ b/csiborgtools/init.py
@ -17,8 +17,7 @@ from csiborgtools import clustering, field, fits, match, read  # noqa
 # Arguments to csiborgtools.read.Paths.
 paths_glamdring = {"srcdir": "/mnt/extraspace/hdesmond/",
                   "postdir": "/mnt/extraspace/rstiskalek/CSiBORG/",
-                   "BORG_dir": "/mnt/extraspace/rstiskalek/BORG/",
-                   "BORG_final_density": "/users/hdesmond/BORG_final/",
+                   "borg_dir": "/users/hdesmond/BORG_final/",
                   "quijote_dir": "/mnt/extraspace/rstiskalek/Quijote",
                   }

--- a/csiborgtools/read/init.py
+++ b/csiborgtools/read/init.py
@ -19,12 +19,14 @@ from .knn_summary import kNNCDFReader  # noqa
 from .nearest_neighbour_summary import NearestNeighbourReader  # noqa
 from .obs import (SDSS, MCXCClusters, PlanckClusters, TwoMPPGalaxies,  # noqa
                  TwoMPPGroups)
+from .overlap_summary import weighted_stats  # noqa
 from .overlap_summary import (NPairsOverlap, PairOverlap,  # noqa
-                              binned_resample_mean, get_cross_sims)
+                              binned_resample_mean, get_cross_sims)  # noqa
 from .paths import Paths  # noqa
 from .pk_summary import PKReader  # noqa
 from .readsim import (MmainReader, ParticleReader, halfwidth_mask,  # noqa
                      load_clump_particles, load_parent_particles, read_initcm)
 from .tpcf_summary import TPCFReader  # noqa
-from .utils import (cartesian_to_radec, cols_to_structured,  # noqa
-                    radec_to_cartesian, read_h5, real2redshift, M200_to_R200)
+from .utils import (M200_to_R200, cartesian_to_radec,  # noqa
+                    cols_to_structured, radec_to_cartesian, read_h5,
+                    real2redshift)
--- a/csiborgtools/read/overlap_summary.py
+++ b/csiborgtools/read/overlap_summary.py
@ -19,7 +19,7 @@ from functools import lru_cache
 from os.path import isfile

 import numpy
-from tqdm import tqdm
+from tqdm import tqdm, trange

 ###############################################################################
 #                         Overlap of two simulations                          #
@ -293,7 +293,6 @@ class PairOverlap:

            if norm_kind is not None:
                dist[i] /= norm[i]
-
        return numpy.array(dist, dtype=object)

    def mass_ratio(self, mass_kind="totpartmass", in_log=True, in_abs=True):
@ -406,7 +405,7 @@ class PairOverlap:
        vals = self.cat0(par)
        out = [None] * len(self)
        for i, ind in enumerate(self["match_indxs"]):
-            out[i] = numpy.ones(ind.size) * vals[i]
+            out[i] = numpy.ones(len(ind)) * vals[i]
        return numpy.array(out, dtype=object)

    def cat0(self, key=None, index=None):
@ -459,6 +458,43 @@ class PairOverlap:
        return self["match_indxs"].size


+def weighted_stats(x, weights, min_weight=0, verbose=False):
+    """
+    Calculate the weighted mean and standard deviation of `x` using `weights`
+    for each array of `x`.
+
+    Parameters
+    ----------
+    x : array of arrays
+        Array of arrays of values to calculate the weighted mean and standard
+        deviation for.
+    weights : array of arrays
+        Array of arrays of weights to use for the calculation.
+    min_weight : float, optional
+        Minimum weight required for a value to be included in the calculation.
+    verbose : bool, optional
+        Verbosity flag.
+
+    Returns
+    -------
+    stat : 2-dimensional array of shape `(len(x), 2)`
+        The first column is the weighted mean and the second column is the
+        weighted standard deviation.
+    """
+    out = numpy.full((x.size, 2), numpy.nan, dtype=numpy.float32)
+
+    for i in trange(len(x)) if verbose else range(len(x)):
+        x_, w_ = numpy.asarray(x[i]), numpy.asarray(weights[i])
+        mask = w_ > min_weight
+        x_ = x_[mask]
+        w_ = w_[mask]
+        if len(w_) == 0:
+            continue
+        out[i, 0] = numpy.average(x_, weights=w_)
+        out[i, 1] = numpy.average((x_ - out[i, 0])**2, weights=w_)**0.5
+    return out
+
+
 ###############################################################################
 #                  Overlap of many pairs of simulations.                      #
 ###############################################################################
--- a/scripts_plots/plot_match.py
+++ b/scripts_plots/plot_match.py
@ -14,6 +14,7 @@
 # 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

 from argparse import ArgumentParser
+from gc import collect
 from os.path import join

 import matplotlib as mpl
@ -21,6 +22,7 @@ import matplotlib.pyplot as plt
 import numpy
 import scienceplots  # noqa
 from cache_to_disk import cache_to_disk, delete_disk_caches_for_function
+from scipy.stats import kendalltau
 from tqdm import tqdm

 import plt_utils
@ -88,6 +90,7 @@ def get_overlap(nsim0):

    reader = csiborgtools.read.NPairsOverlap(cat0, catxs, paths)
    mass = reader.cat0("totpartmass")
+
    hindxs = reader.cat0("index")
    summed_overlap = reader.summed_overlap(True)
    prob_nomatch = reader.prob_nomatch(True)
@ -96,7 +99,7 @@ def get_overlap(nsim0):

 def plot_summed_overlap_vs_mass(nsim0):
    """
-    Plot the summer overlap of probaiblity of no matching for a single
+    Plot the summed overlap of probability of no matching for a single
    reference simulations as a function of the reference halo mass, along with
    their comparison.

@ -110,6 +113,8 @@ def plot_summed_overlap_vs_mass(nsim0):
    None
    """
    x, __, summed_overlap, prob_nomatch = get_overlap(nsim0)
+    del __
+    collect()

    mean_overlap = numpy.mean(summed_overlap, axis=1)
    std_overlap = numpy.std(summed_overlap, axis=1)
@ -178,6 +183,63 @@ def plot_summed_overlap_vs_mass(nsim0):
        plt.close()


+def plot_mass_vs_separation(nsim0, nsimx, min_overlap=0.0):
+    """
+    Plot the mass of a reference halo against the weighted separation of
+    its counterparts.
+
+    Parameters
+    ----------
+    nsim0 : int
+        Reference simulation index.
+    nsimx : int
+        Cross simulation index.
+    min_overlap : float, optional
+        Minimum overlap to consider.
+
+    Returns
+    -------
+    None
+    """
+    paths = csiborgtools.read.Paths(**csiborgtools.paths_glamdring)
+    cat0 = open_cat(nsim0)
+    catx = open_cat(nsimx)
+
+    reader = csiborgtools.read.PairOverlap(cat0, catx, paths)
+    mass = numpy.log10(reader.cat0("totpartmass"))
+    dist = reader.dist(in_initial=False, norm_kind="r200c")
+    overlap = reader.overlap(True)
+    dist = csiborgtools.read.weighted_stats(dist, overlap,
+                                            min_weight=min_overlap)
+
+    mask = numpy.isfinite(dist[:, 0])
+    mass = mass[mask]
+    dist = dist[mask, :]
+    dist = numpy.log10(dist)
+
+    p = numpy.polyfit(mass, dist[:, 0], 1)
+    xrange = numpy.linspace(numpy.min(mass), numpy.max(mass), 1000)
+    txt = r"$m = {}$, $c = {}$".format(*plt_utils.latex_float(*p, n=3))
+
+    with plt.style.context(plt_utils.mplstyle):
+        fig, ax = plt.subplots()
+        ax.set_title(txt, fontsize="small")
+
+        cx = ax.hexbin(mass, dist[:, 0], mincnt=1, bins="log", gridsize=50)
+        ax.plot(xrange, numpy.polyval(p, xrange), color="red",
+                linestyle="--")
+        fig.colorbar(cx, label="Bin counts")
+        ax.set_xlabel(r"$\log M_{\rm tot} / M_\odot$")
+        ax.set_ylabel(r"$\log \langle \Delta R / R_{\rm 200c}\rangle$")
+
+        fig.tight_layout()
+        for ext in ["png"]:
+            fout = join(plt_utils.fout, f"mass_vs_sep_{nsim0}_{nsimx}.{ext}")
+            print(f"Saving to `{fout}`.")
+            fig.savefig(fout, dpi=plt_utils.dpi, bbox_inches="tight")
+        plt.close()
+
+
 ###############################################################################
 #                        Nearest neighbour plotting                           #
 ###############################################################################
@ -596,6 +658,7 @@ def plot_significance(simname, runs, nsim, nobs, kind, kwargs, runs_to_mass):
        Nearest neighbour reader keyword arguments.
    runs_to_mass : dict
        Dictionary mapping run names to total halo mass range.
+    upper_threshold : bool, optional

    Returns
    -------
@ -642,7 +705,10 @@ def plot_significance(simname, runs, nsim, nobs, kind, kwargs, runs_to_mass):
            if simname == "quijote":
                paths = csiborgtools.read.Paths(**kwargs["paths_kind"])
                nsim = paths.quijote_fiducial_nsim(nsim, nobs)
-            fout = join(plt_utils.fout, f"significance_{kind}_{simname}_{str(nsim).zfill(5)}.{ext}")  # noqa
+            nsim = str(nsim).zfill(5)
+            fout = join(
+                plt_utils.fout,
+                f"significance_{kind}_{simname}_{nsim}_{runs}.{ext}")
            print(f"Saving to `{fout}`.")
            fig.savefig(fout, dpi=plt_utils.dpi, bbox_inches="tight")
        plt.close()
@ -673,7 +739,7 @@ def make_binlims(run, runs_to_mass):


 def plot_significance_vs_mass(simname, runs, nsim, nobs, kind, kwargs,
-                              runs_to_mass):
+                              runs_to_mass, upper_threshold=False):
    """
    Plot significance of the 1NN distance as a function of the total mass.

@ -694,6 +760,8 @@ def plot_significance_vs_mass(simname, runs, nsim, nobs, kind, kwargs,
        Nearest neighbour reader keyword arguments.
    runs_to_mass : dict
        Dictionary mapping run names to total halo mass range.
+    upper_threshold : bool, optional
+        Whether to enforce an upper threshold on halo mass.

    Returns
    -------
@ -715,17 +783,20 @@ def plot_significance_vs_mass(simname, runs, nsim, nobs, kind, kwargs,
                y = numpy.log10(make_ks(simname, run, nsim, nobs, kwargs))

            xmin, xmax = make_binlims(run, runs_to_mass)
-            mask = (x >= xmin) & (x < xmax)
-            xs.append(x[mask])
+
+            mask = (x >= xmin) & (x < xmax if upper_threshold else True)
+            xs.append(numpy.log10(x[mask]))
            ys.append(y[mask])

        xs = numpy.concatenate(xs)
        ys = numpy.concatenate(ys)

-        plt.hexbin(xs, ys, gridsize=75, mincnt=1, xscale="log", bins="log")
+        plt.hexbin(xs, ys, gridsize=75, mincnt=1, bins="log")
+        mask = numpy.isfinite(xs) & numpy.isfinite(ys)
+        corr = plt_utils.latex_float(*kendalltau(xs[mask], ys[mask]))
+        plt.title(r"$\tau = {}, p = {}$".format(*corr), fontsize="small")

-        plt.xlabel(r"$M_{\rm tot} / M_\odot$")
-        plt.xlim(numpy.min(xs))
+        plt.xlabel(r"$\log M_{\rm tot} / M_\odot$")
        if kind == "ks":
            plt.ylabel(r"$\log p$-value of $r_{1\mathrm{NN}}$ distribution")
            plt.ylim(top=0)
@ -738,15 +809,18 @@ def plot_significance_vs_mass(simname, runs, nsim, nobs, kind, kwargs,
        for ext in ["png"]:
            if simname == "quijote":
                nsim = paths.quijote_fiducial_nsim(nsim, nobs)
-            fout = (f"significance_vs_mass_{kind}_{simname}"
-                    + f"_{str(nsim).zfill(5)}.{ext}")
+            nsim = str(nsim).zfill(5)
+            fout = f"sgnf_vs_mass_{kind}_{simname}_{nsim}_{runs}.{ext}"
+            if upper_threshold:
+                fout = fout.replace(f".{ext}", f"_upper_threshold.{ext}")
            fout = join(plt_utils.fout, fout)
            print(f"Saving to `{fout}`.")
            plt.savefig(fout, dpi=plt_utils.dpi, bbox_inches="tight")
        plt.close()


-def plot_kl_vs_ks(simname, runs, nsim, nobs, kwargs, runs_to_mass):
+def plot_kl_vs_ks(simname, runs, nsim, nobs, kwargs, runs_to_mass,
+                  upper_threshold=False):
    """
    Plot Kullback-Leibler divergence vs Kolmogorov-Smirnov statistic p-value.

@ -764,6 +838,8 @@ def plot_kl_vs_ks(simname, runs, nsim, nobs, kwargs, runs_to_mass):
        Nearest neighbour reader keyword arguments.
    runs_to_mass : dict
        Dictionary mapping run names to total halo mass range.
+    upper_threshold : bool, optional
+        Whether to enforce an upper threshold on halo mass.

    Returns
    -------
@ -779,7 +855,7 @@ def plot_kl_vs_ks(simname, runs, nsim, nobs, kwargs, runs_to_mass):
        y = make_ks(simname, run, nsim, nobs, kwargs)

        cmin, cmax = make_binlims(run, runs_to_mass)
-        mask = (c >= cmin) & (c < cmax)
+        mask = (c >= cmin) & (c < cmax if upper_threshold else True)
        xs.append(x[mask])
        ys.append(y[mask])
        cs.append(c[mask])
@ -792,6 +868,9 @@ def plot_kl_vs_ks(simname, runs, nsim, nobs, kwargs, runs_to_mass):
        plt.figure()
        plt.hexbin(xs, ys, C=cs, gridsize=50, mincnt=0,
                   reduce_C_function=numpy.median)
+        mask = numpy.isfinite(xs) & numpy.isfinite(ys)
+        corr = plt_utils.latex_float(*kendalltau(xs[mask], ys[mask]))
+        plt.title(r"$\tau = {}, p = {}$".format(*corr), fontsize="small")
        plt.colorbar(label=r"$\log M_{\rm tot} / M_\odot$")

        plt.xlabel(r"$D_{\mathrm{KL}}$ of $r_{1\mathrm{NN}}$ distribution")
@ -801,14 +880,19 @@ def plot_kl_vs_ks(simname, runs, nsim, nobs, kwargs, runs_to_mass):
        for ext in ["png"]:
            if simname == "quijote":
                nsim = paths.quijote_fiducial_nsim(nsim, nobs)
-            fout = join(plt_utils.fout,
-                        f"kl_vs_ks_{simname}_{run}_{str(nsim).zfill(5)}.{ext}")
+            nsim = str(nsim).zfill(5)
+            fout = join(
+                plt_utils.fout,
+                f"kl_vs_ks_{simname}_{run}_{nsim}_{runs}.{ext}")
+            if upper_threshold:
+                fout = fout.replace(f".{ext}", f"_upper_threshold.{ext}")
            print(f"Saving to `{fout}`.")
            plt.savefig(fout, dpi=plt_utils.dpi, bbox_inches="tight")
        plt.close()


-def plot_kl_vs_overlap(runs, nsim, kwargs, runs_to_mass):
+def plot_kl_vs_overlap(runs, nsim, kwargs, runs_to_mass, plot_std=True,
+                       upper_threshold=False):
    """
    Plot KL divergence vs overlap for CSiBORG.

@ -822,6 +906,10 @@ def plot_kl_vs_overlap(runs, nsim, kwargs, runs_to_mass):
        Nearest neighbour reader keyword arguments.
    runs_to_mass : dict
        Dictionary mapping run names to total halo mass range.
+    plot_std : bool, optional
+        Whether to plot the standard deviation of the overlap distribution.
+    upper_threshold : bool, optional
+        Whether to enforce an upper threshold on halo mass.

    Returns
    -------
@ -848,7 +936,7 @@ def plot_kl_vs_overlap(runs, nsim, kwargs, runs_to_mass):
        y1 = 1 - numpy.mean(prob_nomatch, axis=1)
        y2 = numpy.std(prob_nomatch, axis=1)
        cmin, cmax = make_binlims(run, runs_to_mass)
-        mask = (mass >= cmin) & (mass < cmax)
+        mask = (mass >= cmin) & (mass < cmax if upper_threshold else True)
        xs.append(x[mask])
        ys1.append(y1[mask])
        ys2.append(y2[mask])
@ -863,18 +951,28 @@ def plot_kl_vs_overlap(runs, nsim, kwargs, runs_to_mass):
        plt.figure()
        plt.hexbin(xs, ys1, C=cs, gridsize=50, mincnt=0,
                   reduce_C_function=numpy.median)
+        mask = numpy.isfinite(xs) & numpy.isfinite(ys1)
+        corr = plt_utils.latex_float(*kendalltau(xs[mask], ys1[mask]))
+        plt.title(r"$\tau = {}, p = {}$".format(*corr), fontsize="small")
+
        plt.colorbar(label=r"$\log M_{\rm tot} / M_\odot$")
        plt.xlabel(r"$D_{\mathrm{KL}}$ of $r_{1\mathrm{NN}}$ distribution")
        plt.ylabel(r"$1 - \langle \eta^{\mathcal{B}}_a \rangle_{\mathcal{B}}$")

        plt.tight_layout()
        for ext in ["png"]:
+            nsim = str(nsim).zfill(5)
            fout = join(plt_utils.fout,
-                        f"kl_vs_overlap_mean_{str(nsim).zfill(5)}.{ext}")
+                        f"kl_vs_overlap_mean_{nsim}_{runs}.{ext}")
+            if upper_threshold:
+                fout = fout.replace(f".{ext}", f"_upper_threshold.{ext}")
            print(f"Saving to `{fout}`.")
            plt.savefig(fout, dpi=plt_utils.dpi, bbox_inches="tight")
        plt.close()

+    if not plot_std:
+        return
+
    with plt.style.context(plt_utils.mplstyle):
        plt.figure()
        plt.hexbin(xs, ys2, C=cs, gridsize=50, mincnt=0,
@ -882,11 +980,17 @@ def plot_kl_vs_overlap(runs, nsim, kwargs, runs_to_mass):
        plt.colorbar(label=r"$\log M_{\rm tot} / M_\odot$")
        plt.xlabel(r"$D_{\mathrm{KL}}$ of $r_{1\mathrm{NN}}$ distribution")
        plt.ylabel(r"Ensemble std of summed overlap")
+        mask = numpy.isfinite(xs) & numpy.isfinite(ys2)
+        corr = plt_utils.latex_float(*kendalltau(xs[mask], ys2[mask]))
+        plt.title(r"$\tau = {}, p = {}$".format(*corr), fontsize="small")

        plt.tight_layout()
        for ext in ["png"]:
+            nsim = str(nsim).zfill(5)
            fout = join(plt_utils.fout,
-                        f"kl_vs_overlap_std_{str(nsim).zfill(5)}.{ext}")
+                        f"kl_vs_overlap_std_{nsim}_{runs}.{ext}")
+            if upper_threshold:
+                fout = fout.replace(f".{ext}", f"_upper_threshold.{ext}")
            print(f"Saving to `{fout}`.")
            plt.savefig(fout, dpi=plt_utils.dpi, bbox_inches="tight")
        plt.close()
@ -915,14 +1019,19 @@ if __name__ == "__main__":
        "mass009": (14.0, 14.4),  # There is no upper limit.
        }

-    cached_funcs = ["get_overlap", "read_dist", "make_kl", "make_ks"]
+    # cached_funcs = ["get_overlap", "read_dist", "make_kl", "make_ks"]
+    cached_funcs = ["get_overlap"]
    if args.clean:
        for func in cached_funcs:
            print(f"Cleaning cache for function {func}.")
            delete_disk_caches_for_function(func)

-    # Plot 1NN distance distributions.
    if True:
+        plot_mass_vs_separation(7444 + 24, 8956 + 24 * 3)
+
+
+    # Plot 1NN distance distributions.
+    if False:
        for i in range(1, 10):
            run = f"mass00{i}"
            for pulled_cdf in [True, False]:
@ -931,30 +1040,35 @@ if __name__ == "__main__":
            plot_dist(run, "pdf", neighbour_kwargs, runs_to_mass)

    # Plot 1NN CDF differences.
-    if True:
+    if False:
        runs = [f"mass00{i}" for i in range(1, 10)]
        for pulled_cdf in [True, False]:
            plot_cdf_diff(runs, neighbour_kwargs, pulled_cdf=pulled_cdf,
                          runs_to_mass=runs_to_mass)
-    if True:
+    if False:
        runs = [f"mass00{i}" for i in range(1, 9)]
        for kind in ["kl", "ks"]:
            plot_significance("csiborg", runs, 7444, nobs=None, kind=kind,
                              kwargs=neighbour_kwargs,
                              runs_to_mass=runs_to_mass)

-    if True:
-        runs = [f"mass00{i}" for i in range(1, 10)]
+    if False:
+        # runs = [f"mass00{i}" for i in range(1, 10)]
+        runs = ["mass007"]
        for kind in ["kl", "ks"]:
            plot_significance_vs_mass("csiborg", runs, 7444, nobs=None,
                                      kind=kind, kwargs=neighbour_kwargs,
-                                      runs_to_mass=runs_to_mass)
+                                      runs_to_mass=runs_to_mass,
+                                      upper_threshold=True)

-    if True:
-        runs = [f"mass00{i}" for i in range(1, 10)]
+    if False:
+        # runs = [f"mass00{i}" for i in range(1, 10)]
+        runs = ["mass006"]
        plot_kl_vs_ks("csiborg", runs, 7444, None, kwargs=neighbour_kwargs,
-                      runs_to_mass=runs_to_mass)
+                      runs_to_mass=runs_to_mass, upper_threshold=True)

-    if True:
-        runs = [f"mass00{i}" for i in range(1, 10)]
-        plot_kl_vs_overlap(runs, 7444, neighbour_kwargs, runs_to_mass)
+    if False:
+        # runs = [f"mass00{i}" for i in range(1, 10)]
+        runs = ["mass006"]
+        plot_kl_vs_overlap(runs, 7444, neighbour_kwargs, runs_to_mass,
+                           upper_threshold=True, plot_std=False)
--- a/scripts_plots/plt_utils.py
+++ b/scripts_plots/plt_utils.py
@ -16,3 +16,38 @@
 dpi = 600
 fout = "../plots/"
 mplstyle = ["science"]
+
+
+def latex_float(*floats, n=2):
+    """
+    Convert a float or a list of floats to a LaTeX string(s). Taken from [1].
+
+    Parameters
+    ----------
+    floats : float or list of floats
+        The float(s) to be converted.
+    n : int, optional
+        The number of significant figures to be used in the LaTeX string.
+
+    Returns
+    -------
+    latex_floats : str or list of str
+        The LaTeX string(s) representing the float(s).
+
+    References
+    ----------
+    [1] https://stackoverflow.com/questions/13490292/format-number-using-latex-notation-in-python  # noqa
+    """
+    latex_floats = [None] * len(floats)
+    for i, f in enumerate(floats):
+        float_str = "{0:.{1}g}".format(f, n)
+        if "e" in float_str:
+            base, exponent = float_str.split("e")
+            latex_floats[i] = r"{0} \times 10^{{{1}}}".format(base,
+                                                              int(exponent))
+        else:
+            latex_floats[i] = float_str
+
+    if len(floats) == 1:
+        return latex_floats[0]
+    return latex_floats