From fbf9c2a4b705da74adc409c80470ef0c4e3f7fae Mon Sep 17 00:00:00 2001 From: Richard Stiskalek Date: Wed, 28 Jun 2023 15:22:42 +0100 Subject: [PATCH] Better plots (#73) * Edits paths of saved files * Add upper threshold options * Add upper threshold options * add latex_float option * Add weighted stats * add new plot --- csiborgtools/__init__.py | 3 +- csiborgtools/read/__init__.py | 8 +- csiborgtools/read/overlap_summary.py | 42 ++++++- scripts_plots/plot_match.py | 176 ++++++++++++++++++++++----- scripts_plots/plt_utils.py | 35 ++++++ 5 files changed, 225 insertions(+), 39 deletions(-) diff --git a/csiborgtools/__init__.py b/csiborgtools/__init__.py index 65a40f3..554a029 100644 --- a/csiborgtools/__init__.py +++ b/csiborgtools/__init__.py @@ -17,8 +17,7 @@ from csiborgtools import clustering, field, fits, match, read # noqa # Arguments to csiborgtools.read.Paths. paths_glamdring = {"srcdir": "/mnt/extraspace/hdesmond/", "postdir": "/mnt/extraspace/rstiskalek/CSiBORG/", - "BORG_dir": "/mnt/extraspace/rstiskalek/BORG/", - "BORG_final_density": "/users/hdesmond/BORG_final/", + "borg_dir": "/users/hdesmond/BORG_final/", "quijote_dir": "/mnt/extraspace/rstiskalek/Quijote", } diff --git a/csiborgtools/read/__init__.py b/csiborgtools/read/__init__.py index 91db25f..2723ae7 100644 --- a/csiborgtools/read/__init__.py +++ b/csiborgtools/read/__init__.py @@ -19,12 +19,14 @@ from .knn_summary import kNNCDFReader # noqa from .nearest_neighbour_summary import NearestNeighbourReader # noqa from .obs import (SDSS, MCXCClusters, PlanckClusters, TwoMPPGalaxies, # noqa TwoMPPGroups) +from .overlap_summary import weighted_stats # noqa from .overlap_summary import (NPairsOverlap, PairOverlap, # noqa - binned_resample_mean, get_cross_sims) + binned_resample_mean, get_cross_sims) # noqa from .paths import Paths # noqa from .pk_summary import PKReader # noqa from .readsim import (MmainReader, ParticleReader, halfwidth_mask, # noqa load_clump_particles, load_parent_particles, read_initcm) from .tpcf_summary import TPCFReader # noqa -from .utils import (cartesian_to_radec, cols_to_structured, # noqa - radec_to_cartesian, read_h5, real2redshift, M200_to_R200) +from .utils import (M200_to_R200, cartesian_to_radec, # noqa + cols_to_structured, radec_to_cartesian, read_h5, + real2redshift) diff --git a/csiborgtools/read/overlap_summary.py b/csiborgtools/read/overlap_summary.py index d3060a9..bf66efd 100644 --- a/csiborgtools/read/overlap_summary.py +++ b/csiborgtools/read/overlap_summary.py @@ -19,7 +19,7 @@ from functools import lru_cache from os.path import isfile import numpy -from tqdm import tqdm +from tqdm import tqdm, trange ############################################################################### # Overlap of two simulations # @@ -293,7 +293,6 @@ class PairOverlap: if norm_kind is not None: dist[i] /= norm[i] - return numpy.array(dist, dtype=object) def mass_ratio(self, mass_kind="totpartmass", in_log=True, in_abs=True): @@ -406,7 +405,7 @@ class PairOverlap: vals = self.cat0(par) out = [None] * len(self) for i, ind in enumerate(self["match_indxs"]): - out[i] = numpy.ones(ind.size) * vals[i] + out[i] = numpy.ones(len(ind)) * vals[i] return numpy.array(out, dtype=object) def cat0(self, key=None, index=None): @@ -459,6 +458,43 @@ class PairOverlap: return self["match_indxs"].size +def weighted_stats(x, weights, min_weight=0, verbose=False): + """ + Calculate the weighted mean and standard deviation of `x` using `weights` + for each array of `x`. + + Parameters + ---------- + x : array of arrays + Array of arrays of values to calculate the weighted mean and standard + deviation for. + weights : array of arrays + Array of arrays of weights to use for the calculation. + min_weight : float, optional + Minimum weight required for a value to be included in the calculation. + verbose : bool, optional + Verbosity flag. + + Returns + ------- + stat : 2-dimensional array of shape `(len(x), 2)` + The first column is the weighted mean and the second column is the + weighted standard deviation. + """ + out = numpy.full((x.size, 2), numpy.nan, dtype=numpy.float32) + + for i in trange(len(x)) if verbose else range(len(x)): + x_, w_ = numpy.asarray(x[i]), numpy.asarray(weights[i]) + mask = w_ > min_weight + x_ = x_[mask] + w_ = w_[mask] + if len(w_) == 0: + continue + out[i, 0] = numpy.average(x_, weights=w_) + out[i, 1] = numpy.average((x_ - out[i, 0])**2, weights=w_)**0.5 + return out + + ############################################################################### # Overlap of many pairs of simulations. # ############################################################################### diff --git a/scripts_plots/plot_match.py b/scripts_plots/plot_match.py index b2d85ef..05e94a7 100644 --- a/scripts_plots/plot_match.py +++ b/scripts_plots/plot_match.py @@ -14,6 +14,7 @@ # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. from argparse import ArgumentParser +from gc import collect from os.path import join import matplotlib as mpl @@ -21,6 +22,7 @@ import matplotlib.pyplot as plt import numpy import scienceplots # noqa from cache_to_disk import cache_to_disk, delete_disk_caches_for_function +from scipy.stats import kendalltau from tqdm import tqdm import plt_utils @@ -88,6 +90,7 @@ def get_overlap(nsim0): reader = csiborgtools.read.NPairsOverlap(cat0, catxs, paths) mass = reader.cat0("totpartmass") + hindxs = reader.cat0("index") summed_overlap = reader.summed_overlap(True) prob_nomatch = reader.prob_nomatch(True) @@ -96,7 +99,7 @@ def get_overlap(nsim0): def plot_summed_overlap_vs_mass(nsim0): """ - Plot the summer overlap of probaiblity of no matching for a single + Plot the summed overlap of probability of no matching for a single reference simulations as a function of the reference halo mass, along with their comparison. @@ -110,6 +113,8 @@ def plot_summed_overlap_vs_mass(nsim0): None """ x, __, summed_overlap, prob_nomatch = get_overlap(nsim0) + del __ + collect() mean_overlap = numpy.mean(summed_overlap, axis=1) std_overlap = numpy.std(summed_overlap, axis=1) @@ -178,6 +183,63 @@ def plot_summed_overlap_vs_mass(nsim0): plt.close() +def plot_mass_vs_separation(nsim0, nsimx, min_overlap=0.0): + """ + Plot the mass of a reference halo against the weighted separation of + its counterparts. + + Parameters + ---------- + nsim0 : int + Reference simulation index. + nsimx : int + Cross simulation index. + min_overlap : float, optional + Minimum overlap to consider. + + Returns + ------- + None + """ + paths = csiborgtools.read.Paths(**csiborgtools.paths_glamdring) + cat0 = open_cat(nsim0) + catx = open_cat(nsimx) + + reader = csiborgtools.read.PairOverlap(cat0, catx, paths) + mass = numpy.log10(reader.cat0("totpartmass")) + dist = reader.dist(in_initial=False, norm_kind="r200c") + overlap = reader.overlap(True) + dist = csiborgtools.read.weighted_stats(dist, overlap, + min_weight=min_overlap) + + mask = numpy.isfinite(dist[:, 0]) + mass = mass[mask] + dist = dist[mask, :] + dist = numpy.log10(dist) + + p = numpy.polyfit(mass, dist[:, 0], 1) + xrange = numpy.linspace(numpy.min(mass), numpy.max(mass), 1000) + txt = r"$m = {}$, $c = {}$".format(*plt_utils.latex_float(*p, n=3)) + + with plt.style.context(plt_utils.mplstyle): + fig, ax = plt.subplots() + ax.set_title(txt, fontsize="small") + + cx = ax.hexbin(mass, dist[:, 0], mincnt=1, bins="log", gridsize=50) + ax.plot(xrange, numpy.polyval(p, xrange), color="red", + linestyle="--") + fig.colorbar(cx, label="Bin counts") + ax.set_xlabel(r"$\log M_{\rm tot} / M_\odot$") + ax.set_ylabel(r"$\log \langle \Delta R / R_{\rm 200c}\rangle$") + + fig.tight_layout() + for ext in ["png"]: + fout = join(plt_utils.fout, f"mass_vs_sep_{nsim0}_{nsimx}.{ext}") + print(f"Saving to `{fout}`.") + fig.savefig(fout, dpi=plt_utils.dpi, bbox_inches="tight") + plt.close() + + ############################################################################### # Nearest neighbour plotting # ############################################################################### @@ -596,6 +658,7 @@ def plot_significance(simname, runs, nsim, nobs, kind, kwargs, runs_to_mass): Nearest neighbour reader keyword arguments. runs_to_mass : dict Dictionary mapping run names to total halo mass range. + upper_threshold : bool, optional Returns ------- @@ -642,7 +705,10 @@ def plot_significance(simname, runs, nsim, nobs, kind, kwargs, runs_to_mass): if simname == "quijote": paths = csiborgtools.read.Paths(**kwargs["paths_kind"]) nsim = paths.quijote_fiducial_nsim(nsim, nobs) - fout = join(plt_utils.fout, f"significance_{kind}_{simname}_{str(nsim).zfill(5)}.{ext}") # noqa + nsim = str(nsim).zfill(5) + fout = join( + plt_utils.fout, + f"significance_{kind}_{simname}_{nsim}_{runs}.{ext}") print(f"Saving to `{fout}`.") fig.savefig(fout, dpi=plt_utils.dpi, bbox_inches="tight") plt.close() @@ -673,7 +739,7 @@ def make_binlims(run, runs_to_mass): def plot_significance_vs_mass(simname, runs, nsim, nobs, kind, kwargs, - runs_to_mass): + runs_to_mass, upper_threshold=False): """ Plot significance of the 1NN distance as a function of the total mass. @@ -694,6 +760,8 @@ def plot_significance_vs_mass(simname, runs, nsim, nobs, kind, kwargs, Nearest neighbour reader keyword arguments. runs_to_mass : dict Dictionary mapping run names to total halo mass range. + upper_threshold : bool, optional + Whether to enforce an upper threshold on halo mass. Returns ------- @@ -715,17 +783,20 @@ def plot_significance_vs_mass(simname, runs, nsim, nobs, kind, kwargs, y = numpy.log10(make_ks(simname, run, nsim, nobs, kwargs)) xmin, xmax = make_binlims(run, runs_to_mass) - mask = (x >= xmin) & (x < xmax) - xs.append(x[mask]) + + mask = (x >= xmin) & (x < xmax if upper_threshold else True) + xs.append(numpy.log10(x[mask])) ys.append(y[mask]) xs = numpy.concatenate(xs) ys = numpy.concatenate(ys) - plt.hexbin(xs, ys, gridsize=75, mincnt=1, xscale="log", bins="log") + plt.hexbin(xs, ys, gridsize=75, mincnt=1, bins="log") + mask = numpy.isfinite(xs) & numpy.isfinite(ys) + corr = plt_utils.latex_float(*kendalltau(xs[mask], ys[mask])) + plt.title(r"$\tau = {}, p = {}$".format(*corr), fontsize="small") - plt.xlabel(r"$M_{\rm tot} / M_\odot$") - plt.xlim(numpy.min(xs)) + plt.xlabel(r"$\log M_{\rm tot} / M_\odot$") if kind == "ks": plt.ylabel(r"$\log p$-value of $r_{1\mathrm{NN}}$ distribution") plt.ylim(top=0) @@ -738,15 +809,18 @@ def plot_significance_vs_mass(simname, runs, nsim, nobs, kind, kwargs, for ext in ["png"]: if simname == "quijote": nsim = paths.quijote_fiducial_nsim(nsim, nobs) - fout = (f"significance_vs_mass_{kind}_{simname}" - + f"_{str(nsim).zfill(5)}.{ext}") + nsim = str(nsim).zfill(5) + fout = f"sgnf_vs_mass_{kind}_{simname}_{nsim}_{runs}.{ext}" + if upper_threshold: + fout = fout.replace(f".{ext}", f"_upper_threshold.{ext}") fout = join(plt_utils.fout, fout) print(f"Saving to `{fout}`.") plt.savefig(fout, dpi=plt_utils.dpi, bbox_inches="tight") plt.close() -def plot_kl_vs_ks(simname, runs, nsim, nobs, kwargs, runs_to_mass): +def plot_kl_vs_ks(simname, runs, nsim, nobs, kwargs, runs_to_mass, + upper_threshold=False): """ Plot Kullback-Leibler divergence vs Kolmogorov-Smirnov statistic p-value. @@ -764,6 +838,8 @@ def plot_kl_vs_ks(simname, runs, nsim, nobs, kwargs, runs_to_mass): Nearest neighbour reader keyword arguments. runs_to_mass : dict Dictionary mapping run names to total halo mass range. + upper_threshold : bool, optional + Whether to enforce an upper threshold on halo mass. Returns ------- @@ -779,7 +855,7 @@ def plot_kl_vs_ks(simname, runs, nsim, nobs, kwargs, runs_to_mass): y = make_ks(simname, run, nsim, nobs, kwargs) cmin, cmax = make_binlims(run, runs_to_mass) - mask = (c >= cmin) & (c < cmax) + mask = (c >= cmin) & (c < cmax if upper_threshold else True) xs.append(x[mask]) ys.append(y[mask]) cs.append(c[mask]) @@ -792,6 +868,9 @@ def plot_kl_vs_ks(simname, runs, nsim, nobs, kwargs, runs_to_mass): plt.figure() plt.hexbin(xs, ys, C=cs, gridsize=50, mincnt=0, reduce_C_function=numpy.median) + mask = numpy.isfinite(xs) & numpy.isfinite(ys) + corr = plt_utils.latex_float(*kendalltau(xs[mask], ys[mask])) + plt.title(r"$\tau = {}, p = {}$".format(*corr), fontsize="small") plt.colorbar(label=r"$\log M_{\rm tot} / M_\odot$") plt.xlabel(r"$D_{\mathrm{KL}}$ of $r_{1\mathrm{NN}}$ distribution") @@ -801,14 +880,19 @@ def plot_kl_vs_ks(simname, runs, nsim, nobs, kwargs, runs_to_mass): for ext in ["png"]: if simname == "quijote": nsim = paths.quijote_fiducial_nsim(nsim, nobs) - fout = join(plt_utils.fout, - f"kl_vs_ks_{simname}_{run}_{str(nsim).zfill(5)}.{ext}") + nsim = str(nsim).zfill(5) + fout = join( + plt_utils.fout, + f"kl_vs_ks_{simname}_{run}_{nsim}_{runs}.{ext}") + if upper_threshold: + fout = fout.replace(f".{ext}", f"_upper_threshold.{ext}") print(f"Saving to `{fout}`.") plt.savefig(fout, dpi=plt_utils.dpi, bbox_inches="tight") plt.close() -def plot_kl_vs_overlap(runs, nsim, kwargs, runs_to_mass): +def plot_kl_vs_overlap(runs, nsim, kwargs, runs_to_mass, plot_std=True, + upper_threshold=False): """ Plot KL divergence vs overlap for CSiBORG. @@ -822,6 +906,10 @@ def plot_kl_vs_overlap(runs, nsim, kwargs, runs_to_mass): Nearest neighbour reader keyword arguments. runs_to_mass : dict Dictionary mapping run names to total halo mass range. + plot_std : bool, optional + Whether to plot the standard deviation of the overlap distribution. + upper_threshold : bool, optional + Whether to enforce an upper threshold on halo mass. Returns ------- @@ -848,7 +936,7 @@ def plot_kl_vs_overlap(runs, nsim, kwargs, runs_to_mass): y1 = 1 - numpy.mean(prob_nomatch, axis=1) y2 = numpy.std(prob_nomatch, axis=1) cmin, cmax = make_binlims(run, runs_to_mass) - mask = (mass >= cmin) & (mass < cmax) + mask = (mass >= cmin) & (mass < cmax if upper_threshold else True) xs.append(x[mask]) ys1.append(y1[mask]) ys2.append(y2[mask]) @@ -863,18 +951,28 @@ def plot_kl_vs_overlap(runs, nsim, kwargs, runs_to_mass): plt.figure() plt.hexbin(xs, ys1, C=cs, gridsize=50, mincnt=0, reduce_C_function=numpy.median) + mask = numpy.isfinite(xs) & numpy.isfinite(ys1) + corr = plt_utils.latex_float(*kendalltau(xs[mask], ys1[mask])) + plt.title(r"$\tau = {}, p = {}$".format(*corr), fontsize="small") + plt.colorbar(label=r"$\log M_{\rm tot} / M_\odot$") plt.xlabel(r"$D_{\mathrm{KL}}$ of $r_{1\mathrm{NN}}$ distribution") plt.ylabel(r"$1 - \langle \eta^{\mathcal{B}}_a \rangle_{\mathcal{B}}$") plt.tight_layout() for ext in ["png"]: + nsim = str(nsim).zfill(5) fout = join(plt_utils.fout, - f"kl_vs_overlap_mean_{str(nsim).zfill(5)}.{ext}") + f"kl_vs_overlap_mean_{nsim}_{runs}.{ext}") + if upper_threshold: + fout = fout.replace(f".{ext}", f"_upper_threshold.{ext}") print(f"Saving to `{fout}`.") plt.savefig(fout, dpi=plt_utils.dpi, bbox_inches="tight") plt.close() + if not plot_std: + return + with plt.style.context(plt_utils.mplstyle): plt.figure() plt.hexbin(xs, ys2, C=cs, gridsize=50, mincnt=0, @@ -882,11 +980,17 @@ def plot_kl_vs_overlap(runs, nsim, kwargs, runs_to_mass): plt.colorbar(label=r"$\log M_{\rm tot} / M_\odot$") plt.xlabel(r"$D_{\mathrm{KL}}$ of $r_{1\mathrm{NN}}$ distribution") plt.ylabel(r"Ensemble std of summed overlap") + mask = numpy.isfinite(xs) & numpy.isfinite(ys2) + corr = plt_utils.latex_float(*kendalltau(xs[mask], ys2[mask])) + plt.title(r"$\tau = {}, p = {}$".format(*corr), fontsize="small") plt.tight_layout() for ext in ["png"]: + nsim = str(nsim).zfill(5) fout = join(plt_utils.fout, - f"kl_vs_overlap_std_{str(nsim).zfill(5)}.{ext}") + f"kl_vs_overlap_std_{nsim}_{runs}.{ext}") + if upper_threshold: + fout = fout.replace(f".{ext}", f"_upper_threshold.{ext}") print(f"Saving to `{fout}`.") plt.savefig(fout, dpi=plt_utils.dpi, bbox_inches="tight") plt.close() @@ -915,14 +1019,19 @@ if __name__ == "__main__": "mass009": (14.0, 14.4), # There is no upper limit. } - cached_funcs = ["get_overlap", "read_dist", "make_kl", "make_ks"] + # cached_funcs = ["get_overlap", "read_dist", "make_kl", "make_ks"] + cached_funcs = ["get_overlap"] if args.clean: for func in cached_funcs: print(f"Cleaning cache for function {func}.") delete_disk_caches_for_function(func) - # Plot 1NN distance distributions. if True: + plot_mass_vs_separation(7444 + 24, 8956 + 24 * 3) + + + # Plot 1NN distance distributions. + if False: for i in range(1, 10): run = f"mass00{i}" for pulled_cdf in [True, False]: @@ -931,30 +1040,35 @@ if __name__ == "__main__": plot_dist(run, "pdf", neighbour_kwargs, runs_to_mass) # Plot 1NN CDF differences. - if True: + if False: runs = [f"mass00{i}" for i in range(1, 10)] for pulled_cdf in [True, False]: plot_cdf_diff(runs, neighbour_kwargs, pulled_cdf=pulled_cdf, runs_to_mass=runs_to_mass) - if True: + if False: runs = [f"mass00{i}" for i in range(1, 9)] for kind in ["kl", "ks"]: plot_significance("csiborg", runs, 7444, nobs=None, kind=kind, kwargs=neighbour_kwargs, runs_to_mass=runs_to_mass) - if True: - runs = [f"mass00{i}" for i in range(1, 10)] + if False: + # runs = [f"mass00{i}" for i in range(1, 10)] + runs = ["mass007"] for kind in ["kl", "ks"]: plot_significance_vs_mass("csiborg", runs, 7444, nobs=None, kind=kind, kwargs=neighbour_kwargs, - runs_to_mass=runs_to_mass) + runs_to_mass=runs_to_mass, + upper_threshold=True) - if True: - runs = [f"mass00{i}" for i in range(1, 10)] + if False: + # runs = [f"mass00{i}" for i in range(1, 10)] + runs = ["mass006"] plot_kl_vs_ks("csiborg", runs, 7444, None, kwargs=neighbour_kwargs, - runs_to_mass=runs_to_mass) + runs_to_mass=runs_to_mass, upper_threshold=True) - if True: - runs = [f"mass00{i}" for i in range(1, 10)] - plot_kl_vs_overlap(runs, 7444, neighbour_kwargs, runs_to_mass) + if False: + # runs = [f"mass00{i}" for i in range(1, 10)] + runs = ["mass006"] + plot_kl_vs_overlap(runs, 7444, neighbour_kwargs, runs_to_mass, + upper_threshold=True, plot_std=False) diff --git a/scripts_plots/plt_utils.py b/scripts_plots/plt_utils.py index ba0e8dc..e86c053 100644 --- a/scripts_plots/plt_utils.py +++ b/scripts_plots/plt_utils.py @@ -16,3 +16,38 @@ dpi = 600 fout = "../plots/" mplstyle = ["science"] + + +def latex_float(*floats, n=2): + """ + Convert a float or a list of floats to a LaTeX string(s). Taken from [1]. + + Parameters + ---------- + floats : float or list of floats + The float(s) to be converted. + n : int, optional + The number of significant figures to be used in the LaTeX string. + + Returns + ------- + latex_floats : str or list of str + The LaTeX string(s) representing the float(s). + + References + ---------- + [1] https://stackoverflow.com/questions/13490292/format-number-using-latex-notation-in-python # noqa + """ + latex_floats = [None] * len(floats) + for i, f in enumerate(floats): + float_str = "{0:.{1}g}".format(f, n) + if "e" in float_str: + base, exponent = float_str.split("e") + latex_floats[i] = r"{0} \times 10^{{{1}}}".format(base, + int(exponent)) + else: + latex_floats[i] = float_str + + if len(floats) == 1: + return latex_floats[0] + return latex_floats