diff --git a/.gitignore b/.gitignore index 4427a94..1814f26 100644 --- a/.gitignore +++ b/.gitignore @@ -9,10 +9,10 @@ plots/* csiborgtools/fits/_halo_profile.py csiborgtools/fits/_filenames.py csiborgtools/fits/analyse_voids_25.py -scripts/*.sh scripts/*.out build/* .eggs/* csiborgtools.egg-info/* Pylians3/* scripts/plot_correlation.ipynb +scripts/python.sh diff --git a/README.md b/README.md index d1317c6..84aa30c 100644 --- a/README.md +++ b/README.md @@ -1,24 +1,16 @@ -# CSiBORGTools +# CSiBORG Analysis -### Questions -- How well can observed clusters be matched to CSiBORG? Do their masses agree? -- Is the number of clusters in CSiBORG consistent? +## Project Overlap +- [ ] Calculate the overlap between all 101 IC realisations on DiRAC. -## CSiBORG Galaxy Environmental Dependence -### TODO + +## Project Clustering +- [ ] Add uncertainty to the kNN-CDF autocorrelation. +- [ ] Add the joint kNN-CDF calculation. +- [ ] Make kNN-CDF more memory friendly if generating many randoms. + + +## Project Environmental Dependence - [ ] Add gradient and Hessian of the overdensity field. - - -### Questions -- Environmental dependence of: - - $M_*$, colour and SFR. - - Galaxy alignment. - - HI content. - -- Fields to calculate: - 1. Overdensity field $\delta$ - 2. Gradient and Hessian of $\delta$ - 3. Gravitational field $\Phi$ - 4. 
class kNN_CDF:
    """
    Object to calculate the kNN-CDF for a set of CSiBORG halo catalogues from
    their kNN objects.

    The calculation draws random points in a sphere, queries each kNN object
    for the distances from these points to their nearest neighbours and
    converts the distance samples of every neighbour order into an empirical
    CDF.
    """
    @staticmethod
    def rvs_in_sphere(nsamples, R, random_state=42, dtype=numpy.float32):
        """
        Generate random samples uniformly distributed within a sphere of
        radius `R` centered at the origin.

        Parameters
        ----------
        nsamples : int
            Number of samples to generate.
        R : float
            Radius of the sphere.
        random_state : int, optional
            Random state for the random number generator.
        dtype : numpy dtype, optional
            Data type, by default `numpy.float32`.

        Returns
        -------
        samples : 2-dimensional array of shape `(nsamples, 3)`
        """
        gen = numpy.random.default_rng(random_state)
        # Sample spherical coordinates. A uniform density per unit volume
        # requires `r^3` and `cos(theta)` to be uniformly distributed.
        r = gen.uniform(0, 1, nsamples).astype(dtype)**(1/3) * R
        # Drawing `theta = 2 * arcsin(u)` is *not* isotropic (it yields
        # <cos(theta)> = 1/3), hence `cos(theta)` is drawn from U(-1, 1).
        u = gen.uniform(0, 1, nsamples).astype(dtype)
        theta = numpy.arccos(1 - 2 * u)
        phi = 2 * numpy.pi * gen.uniform(0, 1, nsamples).astype(dtype)
        # Convert to cartesian coordinates
        x = r * numpy.sin(theta) * numpy.cos(phi)
        y = r * numpy.sin(theta) * numpy.sin(phi)
        z = r * numpy.cos(theta)

        return numpy.vstack([x, y, z]).T.astype(dtype, copy=False)

    @staticmethod
    def cdf_from_samples(r, rmin=None, rmax=None, neval=None,
                         dtype=numpy.float32):
        """
        Calculate the empirical CDF from distance samples.

        Parameters
        ----------
        r : 1-dimensional array
            Distance samples.
        rmin : float, optional
            Minimum distance to evaluate the CDF. Samples below it are
            discarded. If interpolating and not given, the smallest remaining
            sample is used.
        rmax : float, optional
            Maximum distance to evaluate the CDF. Samples above it are
            discarded. If interpolating and not given, the largest remaining
            sample is used.
        neval : int, optional
            Number of logarithmically spaced points between `rmin` and `rmax`
            at which to interpolate the CDF. By default no interpolation is
            done and the CDF is returned at the sorted samples.
        dtype : numpy dtype, optional
            Data type of the interpolated output. By default `numpy.float32`.

        Returns
        -------
        r : 1-dimensional array
            Distances at which the CDF is evaluated.
        cdf : 1-dimensional array
            CDF evaluated at `r`. Interpolation points outside of the range
            of the samples are set to NaN.

        Raises
        ------
        ValueError
            If interpolation is requested but the logarithmic grid cannot be
            built: either its lower edge is not positive, or no samples are
            left to infer a missing `rmin` or `rmax` from.
        """
        r = numpy.asarray(r)
        # Make cuts on distance. Boolean indexing and `numpy.sort` both
        # return new arrays so the input is never overwritten.
        if rmin is not None:
            r = r[r >= rmin]
        if rmax is not None:
            r = r[r <= rmax]

        # Calculate the CDF
        # NOTE(review): the normalisation is the number of samples that pass
        # the distance cuts, not the total number of samples -- confirm that
        # this is the intended definition.
        r = numpy.sort(r)
        cdf = numpy.arange(r.size) / r.size

        if neval is None:
            return r, cdf

        # Optionally interpolate at logarithmically spaced points
        if r.size == 0 and (rmin is None or rmax is None):
            raise ValueError("No samples left to infer the missing `rmin` or "
                             "`rmax` of the interpolation grid.")
        lo = r[0] if rmin is None else rmin
        hi = r[-1] if rmax is None else rmax
        if not lo > 0:
            raise ValueError("The lower edge of the logarithmic grid must be "
                             "positive, got `{}`.".format(lo))

        _r = numpy.logspace(numpy.log10(lo), numpy.log10(hi), neval,
                            dtype=dtype)
        if r.size == 0:
            # Nothing to interpolate, follow the out-of-range convention
            return _r, numpy.full(neval, numpy.nan, dtype=dtype)

        cdf = interp1d(r, cdf, kind="linear", fill_value=numpy.nan,
                       bounds_error=False)(_r).astype(dtype)
        return _r, cdf

    @staticmethod
    def peaked_cdf(cdf, make_copy=True):
        """
        Transform the CDF to a peaked CDF, i.e. replace every value above 0.5
        with one minus that value.

        Parameters
        ----------
        cdf : 1- or 2- or 3-dimensional array
            CDF to be transformed.
        make_copy : bool, optional
            Whether to make a copy of the CDF before transforming it to avoid
            overwriting it.

        Returns
        -------
        peaked_cdf : 1- or 2- or 3-dimensional array
        """
        cdf = numpy.copy(cdf) if make_copy else cdf
        mask = cdf > 0.5
        cdf[mask] = 1 - cdf[mask]
        return cdf

    def __call__(self, *knns, nneighbours, Rmax, nsamples, rmin, rmax, neval,
                 verbose=True, random_state=42, dtype=numpy.float32):
        """
        Calculate the CDF for a set of kNNs of CSiBORG halo catalogues.

        Parameters
        ----------
        *knns : `sklearn.neighbors.NearestNeighbors` instances
            kNNs of CSiBORG halo catalogues. Each must provide
            `kneighbors(X, n_neighbors)` returning distances and indices.
        nneighbours : int
            Maximum number of neighbours to use for the kNN-CDF calculation.
        Rmax : float
            Maximum radius of the sphere in which to sample random points for
            the knn-CDF calculation. This should match the CSiBORG catalogues.
        nsamples : int
            Number of random points to sample for the knn-CDF calculation.
        rmin : float
            Minimum distance to evaluate the CDF.
        rmax : float
            Maximum distance to evaluate the CDF.
        neval : int
            Number of points to evaluate the CDF.
        verbose : bool, optional
            Verbosity flag.
        random_state : int, optional
            Random state for the random number generator.
        dtype : numpy dtype, optional
            Calculation data type. By default `numpy.float32`.

        Returns
        -------
        rs : 1-dimensional array
            Distances at which the CDF is evaluated.
        cdfs : 2 or 3-dimensional array
            CDFs evaluated at `rs`. The shape is `(nneighbours, neval)` if a
            single kNN is given, `(len(knns), nneighbours, neval)` otherwise.

        Raises
        ------
        ValueError
            If no kNN object is given.
        """
        if len(knns) == 0:
            raise ValueError("At least one kNN object must be provided.")

        # The same random points are used for all catalogues
        rand = self.rvs_in_sphere(nsamples, Rmax, random_state=random_state,
                                  dtype=dtype)

        cdfs = [None] * len(knns)
        for i, knn in enumerate(tqdm(knns) if verbose else knns):
            dist, _indxs = knn.kneighbors(rand, nneighbours)
            dist = dist.astype(dtype)
            # The indices are not needed, free the memory straight away
            del _indxs
            collect()

            cdf = [None] * nneighbours
            for j in range(nneighbours):
                # With `rmin`, `rmax` and `neval` set every call returns the
                # same grid `rs`, so keeping the last one is sufficient.
                rs, cdf[j] = self.cdf_from_samples(
                    dist[:, j], rmin=rmin, rmax=rmax, neval=neval,
                    dtype=dtype)
            cdfs[i] = cdf

        cdfs = numpy.asanyarray(cdfs)
        cdfs = cdfs[0, ...] if len(knns) == 1 else cdfs
        return rs, cdfs
Parameters ---------- @@ -123,7 +125,7 @@ class HaloCatalogue: knn = NearestNeighbors() return knn.fit(self.positions0 if select_initial else self.positions) - def _set_data(self, min_mass, max_dist): + def _set_data(self, min_mass, max_dist, load_init): """ Loads the data, merges with mmain, does various coordinate transforms. """ @@ -141,10 +143,11 @@ class HaloCatalogue: data = data[(data["npart"] > 100) & numpy.isfinite(data["m200"])] # Now also load the initial positions - initcm = read_initcm(self.n_sim, self.paths.initmatch_path) - if initcm is not None: - data = self.merge_initmatch_to_clumps(data, initcm) - flip_cols(data, "x0", "z0") + if load_init: + initcm = read_initcm(self.n_sim, self.paths.initmatch_path) + if initcm is not None: + data = self.merge_initmatch_to_clumps(data, initcm) + flip_cols(data, "x0", "z0") # # Calculate redshift # pos = [data["peak_{}".format(p)] - 0.5 for p in ("x", "y", "z")] @@ -168,7 +171,7 @@ class HaloCatalogue: data = add_columns(data, [d, ra, dec], ["dist", "ra", "dec"]) # And do the unit transform - if initcm is not None: + if load_init and initcm is not None: data = self.box.convert_from_boxunits( data, ["x0", "y0", "z0", "lagpatch"]) diff --git a/notebooks/knn.ipynb b/notebooks/knn.ipynb new file mode 100644 index 0000000..8ebdb6d --- /dev/null +++ b/notebooks/knn.ipynb @@ -0,0 +1,738 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "id": "5a38ed25", + "metadata": { + "ExecuteTime": { + "end_time": "2023-03-31T17:09:12.165480Z", + "start_time": "2023-03-31T17:09:12.116708Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. 
To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "import matplotlib\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.neighbors import NearestNeighbors\n", + "import joblib\n", + "from tqdm import tqdm\n", + "try:\n", + " import csiborgtools\n", + "except ModuleNotFoundError:\n", + " print(\"not found\")\n", + " import sys\n", + " sys.path.append(\"../\")\n", + " import csiborgtools\n", + "\n", + "\n", + "%matplotlib notebook\n", + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "4218b673", + "metadata": { + "ExecuteTime": { + "end_time": "2023-03-31T17:09:13.943312Z", + "start_time": "2023-03-31T17:09:12.167027Z" + } + }, + "outputs": [], + "source": [ + "cat = csiborgtools.read.HaloCatalogue(7444, min_mass=1e13, max_dist=155 / 0.705)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "5ff7a1b6", + "metadata": { + "ExecuteTime": { + "end_time": "2023-03-31T17:10:18.303240Z", + "start_time": "2023-03-31T17:10:14.674751Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + " 0%| | 0/1 [00:00 1) & (rs < 35)\n", + "\n", + "fig, axs = plt.subplots(ncols=3, figsize=(6.4 * 1.5, 4.8), sharey=True)\n", + "fig.subplots_adjust(wspace=0)\n", + "for k in range(3):\n", + " for n in range(len(ics)):\n", + " m = m1 & (cdfs[n, k, :] > 1e-3)\n", + " axs[k].plot(rs[m], cdfs[n, k, m], c=\"black\", lw=0.05)\n", + "\n", + " axs[k].set_xscale(\"log\")\n", + " axs[k].set_yscale(\"log\")\n", + " axs[k].set_title(r\"$k = {}$\".format(k))\n", + " axs[k].set_xlabel(r\"$r~\\left[\\mathrm{Mpc}\\right]$\")\n", + "\n", + "axs[0].set_ylabel(r\"Peaked CDF\")\n", + "\n", + "plt.tight_layout(w_pad=0)\n", + "fig.savefig(\"../plots/peaked_cdf.png\", dpi=450)\n", + "fig.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f8786c0", + "metadata": { + "ExecuteTime": { + "end_time": 
"2023-03-31T09:50:10.103650Z", + "start_time": "2023-03-31T09:50:02.221741Z" + } + }, + "outputs": [], + "source": [ + "m = (rs > 0.5) & (rs < 35)\n", + "\n", + "fig, axs = plt.subplots(ncols=3, figsize=(6.4 * 1.5, 4.8), sharey=True)\n", + "fig.subplots_adjust(wspace=0)\n", + "for k in range(3):\n", + " mu = np.nanmean(cdfs[:, k, :], axis=0)\n", + "\n", + " for n in range(len(ics)):\n", + " axs[k].plot(rs[m], (cdfs[n, k, :] / mu)[m], c=\"black\", lw=0.1)\n", + "\n", + " axs[k].set_ylim(0.5, 1.5)\n", + " axs[k].axhline(1, ls=\"--\", c=\"red\", zorder=0)\n", + " axs[k].axvline(2.65 / 0.705, ls=\"--\", c=\"red\", zorder=0)\n", + " axs[k].set_xscale(\"log\")\n", + " axs[k].set_xlabel(r\"$r~\\left[\\mathrm{Mpc}\\right]$\")\n", + " axs[k].set_title(r\"$k = {}$\".format(k))\n", + " \n", + "axs[0].set_ylabel(r\"Relative peaked CDF\")\n", + "plt.tight_layout(w_pad=0)\n", + "fig.savefig(\"../plots/peaked_cdf_ratios.png\", dpi=450)\n", + "fig.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f64cec1", + "metadata": { + "ExecuteTime": { + "end_time": "2023-03-30T15:46:31.532259Z", + "start_time": "2023-03-30T15:46:30.977449Z" + } + }, + "outputs": [], + "source": [ + "plt.figure()\n", + "k = 2\n", + "mu = np.nanmean(cdfs[:, k, :], axis=0)\n", + "# plt.plot(rs, mu, c=\"black\")\n", + "for i in range(len(ics)):\n", + " plt.plot(rs, cdfs[i, k, :] / mu)\n", + "\n", + "\n", + "plt.ylim(0.75, 1.25)\n", + "plt.axhline(1, ls=\"--\", c=\"black\")\n", + "plt.xscale(\"log\")\n", + "# plt.yscale(\"log\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6784766", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b416efb3", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e650fe2c", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 
null, + "id": "1311187d", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03e49a11", + "metadata": { + "ExecuteTime": { + "end_time": "2023-03-30T14:58:29.937514Z", + "start_time": "2023-03-30T14:58:29.530552Z" + } + }, + "outputs": [], + "source": [ + "x.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24578cba", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0024bbf", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6dc55410", + "metadata": { + "ExecuteTime": { + "end_time": "2023-03-30T14:41:24.290602Z", + "start_time": "2023-03-30T14:41:16.204679Z" + } + }, + "outputs": [], + "source": [ + "dist0, __ = knn0.kneighbors(X, 3)\n", + "distx, __ = knnx.kneighbors(X, 3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11508c3c", + "metadata": { + "ExecuteTime": { + "end_time": "2023-03-30T14:41:24.560538Z", + "start_time": "2023-03-30T14:41:24.292674Z" + } + }, + "outputs": [], + "source": [ + "x0, y0 = knncdf.peaked_cdf_from_samples(dist0[:, 0], 0.5, 20, neval=10000)\n", + "xx, yx = knncdf.peaked_cdf_from_samples(distx[:, 0], 0.5, 20, neval=10000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "404501ad", + "metadata": { + "ExecuteTime": { + "end_time": "2023-03-30T14:41:24.598933Z", + "start_time": "2023-03-30T14:41:24.562062Z" + } + }, + "outputs": [], + "source": [ + "distx[:, 0].min()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43e08969", + "metadata": { + "ExecuteTime": { + "end_time": "2023-03-30T14:46:10.262865Z", + "start_time": "2023-03-30T14:46:09.486658Z" + } + }, + "outputs": [], + "source": [ + "plt.figure()\n", + "plt.plot(x0, y0)\n", + "plt.plot(xx, yx)\n", + "\n", + "plt.yscale(\"log\")\n", + "plt.xscale(\"log\")\n", + "plt.show()" + ] + 
}, + { + "cell_type": "code", + "execution_count": null, + "id": "39547a75", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e160b38", + "metadata": { + "ExecuteTime": { + "end_time": "2023-03-30T13:02:02.033125Z", + "start_time": "2023-03-30T13:02:00.674878Z" + } + }, + "outputs": [], + "source": [ + "plt.figure()\n", + "\n", + "for i in range(3):\n", + " plt.plot(*knncdf.cdf_from_samples(dist0[:, i], 1, 25))\n", + " plt.plot(*knncdf.cdf_from_samples(distx[:, i], 1, 25))\n", + "\n", + "# plt.xlim(0.5, 25)\n", + "\n", + "plt.yscale(\"log\")\n", + "plt.xscale(\"log\")\n", + "plt.xlabel(r\"$r~\\left[\\mathrm{Mpc}\\right]$\")\n", + "\n", + "\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4bfb65d8", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4703d81c", + "metadata": { + "ExecuteTime": { + "end_time": "2023-03-30T12:13:35.958444Z", + "start_time": "2023-03-30T12:13:35.924241Z" + } + }, + "outputs": [], + "source": [ + "x = dist[:, 0]\n", + "q = np.linspace(0, 100, int(x.size / 5))\n", + "\n", + "p = np.percentile(x, q)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b054c6df", + "metadata": { + "ExecuteTime": { + "end_time": "2023-03-30T12:16:50.052225Z", + "start_time": "2023-03-30T12:16:50.020395Z" + } + }, + "outputs": [], + "source": [ + "y = np.sort(x)\n", + "\n", + "yy = np.arange(y.size) / y.size" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5445c964", + "metadata": { + "ExecuteTime": { + "end_time": "2023-03-30T12:16:53.599925Z", + "start_time": "2023-03-30T12:16:53.521266Z" + } + }, + "outputs": [], + "source": [ + "plt.figure()\n", + "plt.plot(p, q / 100)\n", + "\n", + "plt.plot(y, yy)\n", + "\n", + "# plt.yscale(\"log\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"87fe5874", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb0ad6b9", + "metadata": { + "ExecuteTime": { + "end_time": "2023-03-30T12:03:34.387625Z", + "start_time": "2023-03-30T12:03:34.290961Z" + } + }, + "outputs": [], + "source": [ + "plt.figure()\n", + "plt.hist(dist[:, 0], bins=\"auto\", histtype=\"step\")\n", + "plt.hist(dist[:, 1], bins=\"auto\", histtype=\"step\")\n", + "plt.hist(dist[:, 2], bins=\"auto\", histtype=\"step\")\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2aba833", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f70f238", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03bcb191", + "metadata": { + "ExecuteTime": { + "end_time": "2023-03-30T11:38:04.906150Z", + "start_time": "2023-03-30T11:38:04.758107Z" + } + }, + "outputs": [], + "source": [ + "plt.figure()\n", + "plt.hist(cat0[\"dec\"], bins=\"auto\")\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5ad4722", + "metadata": { + "ExecuteTime": { + "end_time": "2023-03-30T11:53:23.004853Z", + "start_time": "2023-03-30T11:53:22.971967Z" + } + }, + "outputs": [], + "source": [ + "gen = np.random.default_rng(22)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "785b530a", + "metadata": { + "ExecuteTime": { + "end_time": "2023-03-30T11:53:23.330397Z", + "start_time": "2023-03-30T11:53:23.296612Z" + } + }, + "outputs": [], + "source": [ + "gen.normal()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b3d3b5e6", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "464b606d", + "metadata": { + "ExecuteTime": { + "end_time": "2023-03-30T11:36:13.649124Z", + "start_time": 
"2023-03-30T11:36:12.995693Z" + } + }, + "outputs": [], + "source": [ + "theta = np.linspace( t, np.pi, 100)\n", + "\n", + "plt.figure()\n", + "plt.plot(theta, np.sin(theta))\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c29049f5", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd2a3295", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "af9abf04", + "metadata": { + "ExecuteTime": { + "end_time": "2023-03-30T11:10:11.104389Z", + "start_time": "2023-03-30T11:10:11.070499Z" + } + }, + "outputs": [], + "source": [ + "X = np.array([-3.9514747, -0.6966991, 2.97158]).reshape(1, -1)\n", + "\n", + "X" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e181b3c3", + "metadata": { + "ExecuteTime": { + "end_time": "2023-03-30T11:32:17.840355Z", + "start_time": "2023-03-30T11:32:17.351883Z" + } + }, + "outputs": [], + "source": [ + "dist, indxs = knn0.kneighbors(X, n_neighbors=1)\n", + "\n", + "dist, indxs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d38fd960", + "metadata": { + "ExecuteTime": { + "end_time": "2023-03-30T11:10:18.182326Z", + "start_time": "2023-03-30T11:10:18.145629Z" + } + }, + "outputs": [], + "source": [ + "cat0.positions[indxs]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a16ddc2f", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bbbe8fb6", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "759a0149", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "312c96c9", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b097637b", + 
"metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ced23cb", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be26cbcc", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv_galomatch", + "language": "python", + "name": "venv_galomatch" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.0" + }, + "vscode": { + "interpreter": { + "hash": "f29d02a8350410abc2a9fb79641689d10bf7ab64afc03ec87ca3cf6ed2daa499" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/scripts/python.sh b/scripts/python.sh new file mode 100644 index 0000000..45328c4 --- /dev/null +++ b/scripts/python.sh @@ -0,0 +1,46 @@ +#!/bin/bash -l +echo ========================================================= +echo Job submitted date = Fri Mar 31 16:17:57 BST 2023 +date_start=`date +%s` +echo $SLURM_JOB_NUM_NODES nodes \( $SMP processes per node \) +echo $SLURM_JOB_NUM_NODES hosts used: $SLURM_JOB_NODELIST +echo Job output begins +echo ----------------- +echo +#hostname + +# Need to set the max locked memory very high otherwise IB can't allocate enough and fails with "UCX ERROR Failed to allocate memory pool chunk: Input/output error" +ulimit -l unlimited + +# To allow mvapich to run ok +export MV2_SMP_USE_CMA=0 + +#which mpirun +export OMP_NUM_THEADS=1 + /usr/local/shared/slurm/bin/srun -u -n 5 --mpi=pmi2 --mem-per-cpu=7168 nice -n 10 /mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python run_knn.py --rmin 0.05 --rmax 50 --nsamples 100000 --neval 10000 +# If we've been checkpointed +#if [ -n "${DMTCP_CHECKPOINT_DIR}" ]; then + if [ -d "${DMTCP_CHECKPOINT_DIR}" ]; then +# echo -n "Job was 
checkpointed at " +# date +# echo + sleep 1 +# fi + echo -n +else + echo --------------- + echo Job output ends + date_end=`date +%s` + seconds=$((date_end-date_start)) + minutes=$((seconds/60)) + seconds=$((seconds-60*minutes)) + hours=$((minutes/60)) + minutes=$((minutes-60*hours)) + echo ========================================================= + echo PBS job: finished date = `date` + echo Total run time : $hours Hours $minutes Minutes $seconds Seconds + echo ========================================================= +fi +if [ ${SLURM_NTASKS} -eq 1 ]; then + rm -f $fname +fi diff --git a/scripts/run_asciipos.sh b/scripts/run_asciipos.sh new file mode 100644 index 0000000..814ac12 --- /dev/null +++ b/scripts/run_asciipos.sh @@ -0,0 +1,13 @@ +nthreads=1 +memory=75 +queue="berg" +env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python" +file="run_asciipos.py" +mode="dump" + +cm="addqueue -q $queue -n $nthreads -m $memory $env $file --mode $mode" + +echo "Submitting:" +echo $cm +echo +$cm diff --git a/scripts/run_crossmatch.sh b/scripts/run_crossmatch.sh new file mode 100644 index 0000000..63aaeb5 --- /dev/null +++ b/scripts/run_crossmatch.sh @@ -0,0 +1,17 @@ +nthreads=1 +memory=32 +queue="berg" +env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python" +file="run_crossmatch.py" + +pythoncm="$env $file" +# echo "Submitting:" +# echo $pythoncm +# echo +# $pythoncm + +cm="addqueue -q $queue -n $nthreads -m $memory $pythoncm" +echo "Submitting:" +echo $cm +echo +$cm diff --git a/scripts/run_crosspk.sh b/scripts/run_crosspk.sh new file mode 100644 index 0000000..374cb88 --- /dev/null +++ b/scripts/run_crosspk.sh @@ -0,0 +1,14 @@ +nthreads=20 +memory=40 +queue="berg" +env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python" +file="run_crosspk.py" +grid=1024 +halfwidth=0.13 + +cm="addqueue -q $queue -n $nthreads -m $memory $env $file --grid $grid --halfwidth $halfwidth" + +echo "Submitting:" +echo $cm +echo +$cm diff --git 
"""A script to calculate the KNN-CDF for a set of CSiBORG halo catalogues."""
import sys
from os.path import join
from argparse import ArgumentParser
from copy import deepcopy
from datetime import datetime
from mpi4py import MPI
from TaskmasterMPI import master_process, worker_process
from sklearn.neighbors import NearestNeighbors
import joblib
try:
    import csiborgtools
except ModuleNotFoundError:
    # Fall back to the package in the parent directory of `scripts/`
    sys.path.append("../")
    import csiborgtools


###############################################################################
#                            MPI and arguments                                #
###############################################################################
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
nproc = comm.Get_size()

parser = ArgumentParser()
parser.add_argument("--rmin", type=float,
                    help="Minimum distance to evaluate the CDF.")
parser.add_argument("--rmax", type=float,
                    help="Maximum distance to evaluate the CDF.")
# Without these two the calculation fails deep inside the workers, so let
# argparse reject the call straight away with a readable message.
parser.add_argument("--nneighbours", type=int, required=True,
                    help="Maximum number of neighbours.")
parser.add_argument("--nsamples", type=int, required=True,
                    help="Number of random points to sample.")
parser.add_argument("--neval", type=int,
                    help="Number of points to evaluate the CDF.")
parser.add_argument("--seed", type=int, default=42,
                    help="Random seed for the random points.")
args = parser.parse_args()

# High resolution region radius.
# NOTE(review): the original comment said "Mpc/h", but 155 / 0.705 looks like
# 155 Mpc/h converted to Mpc with h = 0.705 -- confirm the units.
Rmax = 155 / 0.705
mass_threshold = [1e12, 1e13, 1e14]  # Msun
# The 101 IC realisations: 7444, 7468, ..., 9844
ics = list(range(7444, 9844 + 1, 24))
dumpdir = "/mnt/extraspace/rstiskalek/csiborg/knn"
fout = join(dumpdir, "knncdf_{}.p")


###############################################################################
#                                 Analysis                                    #
###############################################################################
knncdf = csiborgtools.match.kNN_CDF()


def do_task(ic):
    """
    Calculate the kNN-CDF of the IC realisation `ic` for each mass threshold
    in `mass_threshold` and dump the results with `joblib` to `fout`.

    Parameters
    ----------
    ic : int
        Index of the IC realisation.

    Returns
    -------
    None
    """
    out = {}
    cat = csiborgtools.read.HaloCatalogue(ic, max_dist=Rmax)

    for i, mmin in enumerate(mass_threshold):
        # Build the kNN only from haloes above the mass threshold
        knn = NearestNeighbors()
        knn.fit(cat.positions[cat["totpartmass"] > mmin, ...])

        rs, cdf = knncdf(knn, nneighbours=args.nneighbours, Rmax=Rmax,
                         rmin=args.rmin, rmax=args.rmax,
                         nsamples=args.nsamples, neval=args.neval,
                         random_state=args.seed, verbose=False)
        out["cdf_{}".format(i)] = cdf

    # `rs` of the last threshold is stored for all of them
    out.update({"rs": rs, "mass_threshold": mass_threshold})
    joblib.dump(out, fout.format(ic))


if nproc > 1:
    # Rank 0 distributes the IC indices, the other ranks process them
    if rank == 0:
        tasks = deepcopy(ics)
        master_process(tasks, comm, verbose=True)
    else:
        worker_process(do_task, comm, verbose=False)
else:
    tasks = deepcopy(ics)
    for task in tasks:
        print("{}: completing task `{}`.".format(datetime.now(), task))
        do_task(task)


comm.Barrier()
if rank == 0:
    print("{}: all finished.".format(datetime.now()))
# Force quit the script. `sys.exit` is used since `quit` is a helper of the
# interactive interpreter that is not guaranteed to exist in programs.
sys.exit(0)
a/scripts/run_singlematch.sh b/scripts/run_singlematch.sh new file mode 100755 index 0000000..58cc8d7 --- /dev/null +++ b/scripts/run_singlematch.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# nthreads=1 +memory=16 +queue="berg" +env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python" +file="run_singlematch.py" + +nmult=1. +sigma=1. + +sims=(7468 7588 8020 8452 8836) +nsims=${#sims[@]} + +for i in $(seq 0 $((nsims-1))); do +for j in $(seq 0 $((nsims-1))); do +if [ $i -eq $j ]; then + continue +elif [ $i -gt $j ]; then + continue +else + : +fi + +nsim0=${sims[$i]} +nsimx=${sims[$j]} + +pythoncm="$env $file --nsim0 $nsim0 --nsimx $nsimx --nmult $nmult --sigma $sigma" + +cm="addqueue -q $queue -n 1x1 -m $memory $pythoncm" +echo "Submitting:" +echo $cm +echo +$cm +sleep 0.05 + +done; done diff --git a/scripts/run_split_halos.sh b/scripts/run_split_halos.sh new file mode 100644 index 0000000..84e93af --- /dev/null +++ b/scripts/run_split_halos.sh @@ -0,0 +1,12 @@ +nthreads=1 +memory=30 +queue="cmb" +env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python" +file="run_split_halos.py" + +cm="addqueue -q $queue -n $nthreads -m $memory $env $file" + +echo "Submitting:" +echo $cm +echo +$cm