kNN-CDF implementation (#34)

* Rewrite doc

* add kNN

* edit loading of samples with no init

* Add verbosity flag

* add KNN submission script

* do not make peaked cdf by default

* Add submit script

* stop ignore sh

* Add mass thresholding

* Edit gitignore

* edits

* Space points in logspace

* Calculate for all ICs

* Update TODO

* Add dtype support

* Update readme

* Update nb
This commit is contained in:
Richard Stiskalek 2023-03-31 18:13:41 +01:00 committed by GitHub
parent 4d7827006a
commit 63ab3548b4
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
17 changed files with 1248 additions and 29 deletions

2
.gitignore vendored
View file

@ -9,10 +9,10 @@ plots/*
csiborgtools/fits/_halo_profile.py csiborgtools/fits/_halo_profile.py
csiborgtools/fits/_filenames.py csiborgtools/fits/_filenames.py
csiborgtools/fits/analyse_voids_25.py csiborgtools/fits/analyse_voids_25.py
scripts/*.sh
scripts/*.out scripts/*.out
build/* build/*
.eggs/* .eggs/*
csiborgtools.egg-info/* csiborgtools.egg-info/*
Pylians3/* Pylians3/*
scripts/plot_correlation.ipynb scripts/plot_correlation.ipynb
scripts/python.sh

View file

@ -1,24 +1,16 @@
# CSiBORGTools # CSiBORG Analysis
### Questions ## Project Overlap
- How well can observed clusters be matched to CSiBORG? Do their masses agree? - [ ] Calculate the overlap between all 101 IC realisations on DiRAC.
- Is the number of clusters in CSiBORG consistent?
## CSiBORG Galaxy Environmental Dependence
### TODO
## Project Clustering
- [ ] Add uncertainty to the kNN-CDF autocorrelation.
- [ ] Add the joint kNN-CDF calculation.
- [ ] Make kNN-CDF more memory friendly if generating many randoms.
## Project Environmental Dependence
- [ ] Add gradient and Hessian of the overdensity field. - [ ] Add gradient and Hessian of the overdensity field.
### Questions
- Environmental dependence of:
- $M_*$, colour and SFR.
- Galaxy alignment.
- HI content.
- Fields to calculate:
1. Overdensity field $\delta$
2. Gradient and Hessian of $\delta$
3. Gravitational field $\Phi$
4. Gradient and Hessian of $\Phi$

View file

@ -18,4 +18,5 @@ from .match import (brute_spatial_separation, RealisationsMatcher, cosine_simila
calculate_overlap, calculate_overlap_indxs, # noqa calculate_overlap, calculate_overlap_indxs, # noqa
dist_centmass, dist_percentile) # noqa dist_centmass, dist_percentile) # noqa
from .num_density import (binned_counts, number_density) # noqa from .num_density import (binned_counts, number_density) # noqa
from .knn import kNN_CDF
# from .correlation import (get_randoms_sphere, sphere_angular_tpcf) # noqa # from .correlation import (get_randoms_sphere, sphere_angular_tpcf) # noqa

181
csiborgtools/match/knn.py Normal file
View file

@ -0,0 +1,181 @@
# Copyright (C) 2022 Richard Stiskalek
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""
kNN-CDF calculation
"""
from gc import collect
import numpy
from scipy.interpolate import interp1d
from tqdm import tqdm
class kNN_CDF:
    """
    Object to calculate the kNN-CDF for a set of CSiBORG halo catalogues from
    their kNN objects.
    """
    @staticmethod
    def rvs_in_sphere(nsamples, R, random_state=42, dtype=numpy.float32):
        """
        Generate random samples uniformly distributed inside a sphere of
        radius `R` centered at the origin.

        Parameters
        ----------
        nsamples : int
            Number of samples to generate.
        R : float
            Radius of the sphere.
        random_state : int, optional
            Random state for the random number generator.
        dtype : numpy dtype, optional
            Data type, by default `numpy.float32`.

        Returns
        -------
        samples : 2-dimensional array of shape `(nsamples, 3)`
        """
        gen = numpy.random.default_rng(random_state)
        # Sample spherical coordinates. The radius scales as u^(1/3) so that
        # the points are uniform in volume.
        r = gen.uniform(0, 1, nsamples).astype(dtype)**(1/3) * R
        # An isotropic distribution requires cos(theta) = 1 - 2u to be
        # uniform, which is equivalent to theta = 2 * arcsin(sqrt(u)). Without
        # the square root the points cluster around the +z axis.
        u = gen.uniform(0, 1, nsamples).astype(dtype)
        theta = 2 * numpy.arcsin(numpy.sqrt(u))
        phi = 2 * numpy.pi * gen.uniform(0, 1, nsamples).astype(dtype)
        # Convert to cartesian coordinates
        x = r * numpy.sin(theta) * numpy.cos(phi)
        y = r * numpy.sin(theta) * numpy.sin(phi)
        z = r * numpy.cos(theta)

        return numpy.vstack([x, y, z]).T

    @staticmethod
    def cdf_from_samples(r, rmin=None, rmax=None, neval=None,
                         dtype=numpy.float32):
        """
        Calculate the empirical CDF from distance samples.

        Parameters
        ----------
        r : 1-dimensional array
            Distance samples.
        rmin : float, optional
            Minimum distance to evaluate the CDF. Samples below it are
            discarded before the CDF is calculated.
        rmax : float, optional
            Maximum distance to evaluate the CDF. Samples above it are
            discarded before the CDF is calculated.
        neval : int, optional
            Number of logarithmically spaced points between `rmin` and `rmax`
            at which to evaluate the CDF. By default the CDF is evaluated at
            the (sorted) samples themselves. Requires both `rmin` and `rmax`.
        dtype : numpy dtype, optional
            Data type of the interpolated output. By default `numpy.float32`.

        Returns
        -------
        r : 1-dimensional array
            Distances at which the CDF is evaluated.
        cdf : 1-dimensional array
            CDF evaluated at `r`. When interpolating, points outside of the
            range of the retained samples are set to NaN.

        Raises
        ------
        ValueError
            If `neval` is specified without both `rmin` and `rmax`.
        """
        r = numpy.copy(r)  # Make a copy not to overwrite the original
        # Make cuts on distance
        r = r[r >= rmin] if rmin is not None else r
        r = r[r <= rmax] if rmax is not None else r

        # Calculate the CDF
        r = numpy.sort(r)
        cdf = numpy.arange(r.size) / r.size

        if neval is not None:  # Optionally interpolate at given points
            if rmin is None or rmax is None:
                raise ValueError("`rmin` and `rmax` must be specified if "
                                 "`neval` is specified.")
            _r = numpy.logspace(numpy.log10(rmin), numpy.log10(rmax), neval,
                                dtype=dtype)
            if r.size < 2:
                # Not enough samples to interpolate between: every evaluation
                # point is out of range, consistent with the NaN fill below.
                cdf = numpy.full(_r.size, numpy.nan, dtype=dtype)
            else:
                cdf = interp1d(r, cdf, kind="linear", fill_value=numpy.nan,
                               bounds_error=False)(_r).astype(dtype)
            r = _r

        return r, cdf

    @staticmethod
    def peaked_cdf(cdf, make_copy=True):
        """
        Transform the CDF to a peaked CDF, i.e. replace every value above 0.5
        with one minus that value.

        Parameters
        ----------
        cdf : 1- or 2- or 3-dimensional array
            CDF to be transformed.
        make_copy : bool, optional
            Whether to make a copy of the CDF before transforming it to avoid
            overwriting it.

        Returns
        -------
        peaked_cdf : 1- or 2- or 3-dimensional array
        """
        cdf = numpy.copy(cdf) if make_copy else cdf
        mask = cdf > 0.5  # Evaluate the mask only once
        cdf[mask] = 1 - cdf[mask]
        return cdf

    def __call__(self, *knns, nneighbours, Rmax, nsamples, rmin, rmax, neval,
                 verbose=True, random_state=42, dtype=numpy.float32):
        """
        Calculate the CDF for a set of kNNs of CSiBORG halo catalogues. The
        same set of random points is used for every kNN.

        Parameters
        ----------
        *knns : `sklearn.neighbors.NearestNeighbors` instances
            kNNs of CSiBORG halo catalogues.
        nneighbours : int
            Maximum number of neighbours to use for the kNN-CDF calculation.
        Rmax : float
            Maximum radius of the sphere in which to sample random points for
            the kNN-CDF calculation. This should match the CSiBORG catalogues.
        nsamples : int
            Number of random points to sample for the kNN-CDF calculation.
        rmin : float
            Minimum distance to evaluate the CDF.
        rmax : float
            Maximum distance to evaluate the CDF.
        neval : int
            Number of points to evaluate the CDF.
        verbose : bool, optional
            Verbosity flag.
        random_state : int, optional
            Random state for the random number generator.
        dtype : numpy dtype, optional
            Calculation data type. By default `numpy.float32`.

        Returns
        -------
        rs : 1-dimensional array
            Distances at which the CDF is evaluated.
        cdfs : 2 or 3-dimensional array
            CDFs evaluated at `rs`. The leading axis over the kNNs is dropped
            if only a single kNN is given.

        Raises
        ------
        ValueError
            If no kNN object is given.
        """
        if len(knns) == 0:
            raise ValueError("At least one kNN object must be provided.")

        rand = self.rvs_in_sphere(nsamples, Rmax, random_state=random_state,
                                  dtype=dtype)

        cdfs = [None] * len(knns)
        for i, knn in enumerate(tqdm(knns) if verbose else knns):
            dist, _indxs = knn.kneighbors(rand, nneighbours)
            dist = dist.astype(dtype)
            # The neighbour indices are not needed; free them early
            del _indxs
            collect()

            cdf = [None] * nneighbours
            for j in range(nneighbours):
                rs, cdf[j] = self.cdf_from_samples(
                    dist[:, j], rmin=rmin, rmax=rmax, neval=neval,
                    dtype=dtype)

            cdfs[i] = cdf

        cdfs = numpy.asanyarray(cdfs)
        cdfs = cdfs[0, ...] if len(knns) == 1 else cdfs
        return rs, cdfs

View file

@ -35,20 +35,22 @@ class HaloCatalogue:
The minimum :math:`M_{rm tot} / M_\odot` mass. By default no threshold. The minimum :math:`M_{rm tot} / M_\odot` mass. By default no threshold.
max_dist : float, optional max_dist : float, optional
The maximum comoving distance of a halo. By default no upper limit. The maximum comoving distance of a halo. By default no upper limit.
load_init : bool, optional
Whether to load the initial snapshot information. By default False.
""" """
_box = None _box = None
_paths = None _paths = None
_data = None _data = None
_selmask = None _selmask = None
def __init__(self, nsim, min_mass=None, max_dist=None): def __init__(self, nsim, min_mass=None, max_dist=None, load_init=False):
# Set up paths # Set up paths
paths = CSiBORGPaths(n_sim=nsim) paths = CSiBORGPaths(n_sim=nsim)
paths.n_snap = paths.get_maximum_snapshot() paths.n_snap = paths.get_maximum_snapshot()
self._paths = paths self._paths = paths
self._box = BoxUnits(paths) self._box = BoxUnits(paths)
self._paths = paths self._paths = paths
self._set_data(min_mass, max_dist) self._set_data(min_mass, max_dist, load_init)
@property @property
def data(self): def data(self):
@ -109,7 +111,7 @@ class HaloCatalogue:
def knn(self, select_initial): def knn(self, select_initial):
""" """
The final snapshot k-nearest neighbour object. kNN object of all halo positions.
Parameters Parameters
---------- ----------
@ -123,7 +125,7 @@ class HaloCatalogue:
knn = NearestNeighbors() knn = NearestNeighbors()
return knn.fit(self.positions0 if select_initial else self.positions) return knn.fit(self.positions0 if select_initial else self.positions)
def _set_data(self, min_mass, max_dist): def _set_data(self, min_mass, max_dist, load_init):
""" """
Loads the data, merges with mmain, does various coordinate transforms. Loads the data, merges with mmain, does various coordinate transforms.
""" """
@ -141,6 +143,7 @@ class HaloCatalogue:
data = data[(data["npart"] > 100) & numpy.isfinite(data["m200"])] data = data[(data["npart"] > 100) & numpy.isfinite(data["m200"])]
# Now also load the initial positions # Now also load the initial positions
if load_init:
initcm = read_initcm(self.n_sim, self.paths.initmatch_path) initcm = read_initcm(self.n_sim, self.paths.initmatch_path)
if initcm is not None: if initcm is not None:
data = self.merge_initmatch_to_clumps(data, initcm) data = self.merge_initmatch_to_clumps(data, initcm)
@ -168,7 +171,7 @@ class HaloCatalogue:
data = add_columns(data, [d, ra, dec], ["dist", "ra", "dec"]) data = add_columns(data, [d, ra, dec], ["dist", "ra", "dec"])
# And do the unit transform # And do the unit transform
if initcm is not None: if load_init and initcm is not None:
data = self.box.convert_from_boxunits( data = self.box.convert_from_boxunits(
data, ["x0", "y0", "z0", "lagpatch"]) data, ["x0", "y0", "z0", "lagpatch"])

738
notebooks/knn.ipynb Normal file
View file

@ -0,0 +1,738 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"id": "5a38ed25",
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-31T17:09:12.165480Z",
"start_time": "2023-03-31T17:09:12.116708Z"
},
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The autoreload extension is already loaded. To reload it, use:\n",
" %reload_ext autoreload\n"
]
}
],
"source": [
"import numpy as np\n",
"import matplotlib\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.neighbors import NearestNeighbors\n",
"import joblib\n",
"from tqdm import tqdm\n",
"try:\n",
" import csiborgtools\n",
"except ModuleNotFoundError:\n",
" print(\"not found\")\n",
" import sys\n",
" sys.path.append(\"../\")\n",
" import csiborgtools\n",
"\n",
"\n",
"%matplotlib notebook\n",
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "4218b673",
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-31T17:09:13.943312Z",
"start_time": "2023-03-31T17:09:12.167027Z"
}
},
"outputs": [],
"source": [
"cat = csiborgtools.read.HaloCatalogue(7444, min_mass=1e13, max_dist=155 / 0.705)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "5ff7a1b6",
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-31T17:10:18.303240Z",
"start_time": "2023-03-31T17:10:14.674751Z"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
" 0%| | 0/1 [00:00<?, ?it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"float32\n",
"float32\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1/1 [00:03<00:00, 3.37s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"float32\n",
"float32\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"knn = NearestNeighbors()\n",
"knn.fit(cat.positions)\n",
"\n",
"knncdf = csiborgtools.match.kNN_CDF()\n",
"\n",
"rs, cdfs_high = knncdf(knn, nneighbours=3, Rmax=155 / 0.705, rmin=0.05, rmax=40,\n",
" nsamples=int(1e6), neval=int(1e4), random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "08321431",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "58806ab9",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "c59b3a19",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "e345945c",
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-31T09:35:49.059172Z",
"start_time": "2023-03-31T09:35:42.817291Z"
}
},
"outputs": [],
"source": [
"m1 = (rs > 1) & (rs < 35)\n",
"\n",
"fig, axs = plt.subplots(ncols=3, figsize=(6.4 * 1.5, 4.8), sharey=True)\n",
"fig.subplots_adjust(wspace=0)\n",
"for k in range(3):\n",
" for n in range(len(ics)):\n",
" m = m1 & (cdfs[n, k, :] > 1e-3)\n",
" axs[k].plot(rs[m], cdfs[n, k, m], c=\"black\", lw=0.05)\n",
"\n",
" axs[k].set_xscale(\"log\")\n",
" axs[k].set_yscale(\"log\")\n",
" axs[k].set_title(r\"$k = {}$\".format(k))\n",
" axs[k].set_xlabel(r\"$r~\\left[\\mathrm{Mpc}\\right]$\")\n",
"\n",
"axs[0].set_ylabel(r\"Peaked CDF\")\n",
"\n",
"plt.tight_layout(w_pad=0)\n",
"fig.savefig(\"../plots/peaked_cdf.png\", dpi=450)\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9f8786c0",
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-31T09:50:10.103650Z",
"start_time": "2023-03-31T09:50:02.221741Z"
}
},
"outputs": [],
"source": [
"m = (rs > 0.5) & (rs < 35)\n",
"\n",
"fig, axs = plt.subplots(ncols=3, figsize=(6.4 * 1.5, 4.8), sharey=True)\n",
"fig.subplots_adjust(wspace=0)\n",
"for k in range(3):\n",
" mu = np.nanmean(cdfs[:, k, :], axis=0)\n",
"\n",
" for n in range(len(ics)):\n",
" axs[k].plot(rs[m], (cdfs[n, k, :] / mu)[m], c=\"black\", lw=0.1)\n",
"\n",
" axs[k].set_ylim(0.5, 1.5)\n",
" axs[k].axhline(1, ls=\"--\", c=\"red\", zorder=0)\n",
" axs[k].axvline(2.65 / 0.705, ls=\"--\", c=\"red\", zorder=0)\n",
" axs[k].set_xscale(\"log\")\n",
" axs[k].set_xlabel(r\"$r~\\left[\\mathrm{Mpc}\\right]$\")\n",
" axs[k].set_title(r\"$k = {}$\".format(k))\n",
" \n",
"axs[0].set_ylabel(r\"Relative peaked CDF\")\n",
"plt.tight_layout(w_pad=0)\n",
"fig.savefig(\"../plots/peaked_cdf_ratios.png\", dpi=450)\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2f64cec1",
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-30T15:46:31.532259Z",
"start_time": "2023-03-30T15:46:30.977449Z"
}
},
"outputs": [],
"source": [
"plt.figure()\n",
"k = 2\n",
"mu = np.nanmean(cdfs[:, k, :], axis=0)\n",
"# plt.plot(rs, mu, c=\"black\")\n",
"for i in range(len(ics)):\n",
" plt.plot(rs, cdfs[i, k, :] / mu)\n",
"\n",
"\n",
"plt.ylim(0.75, 1.25)\n",
"plt.axhline(1, ls=\"--\", c=\"black\")\n",
"plt.xscale(\"log\")\n",
"# plt.yscale(\"log\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a6784766",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "b416efb3",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "e650fe2c",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "1311187d",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "03e49a11",
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-30T14:58:29.937514Z",
"start_time": "2023-03-30T14:58:29.530552Z"
}
},
"outputs": [],
"source": [
"x.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "24578cba",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "b0024bbf",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "6dc55410",
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-30T14:41:24.290602Z",
"start_time": "2023-03-30T14:41:16.204679Z"
}
},
"outputs": [],
"source": [
"dist0, __ = knn0.kneighbors(X, 3)\n",
"distx, __ = knnx.kneighbors(X, 3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "11508c3c",
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-30T14:41:24.560538Z",
"start_time": "2023-03-30T14:41:24.292674Z"
}
},
"outputs": [],
"source": [
"x0, y0 = knncdf.peaked_cdf_from_samples(dist0[:, 0], 0.5, 20, neval=10000)\n",
"xx, yx = knncdf.peaked_cdf_from_samples(distx[:, 0], 0.5, 20, neval=10000)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "404501ad",
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-30T14:41:24.598933Z",
"start_time": "2023-03-30T14:41:24.562062Z"
}
},
"outputs": [],
"source": [
"distx[:, 0].min()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "43e08969",
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-30T14:46:10.262865Z",
"start_time": "2023-03-30T14:46:09.486658Z"
}
},
"outputs": [],
"source": [
"plt.figure()\n",
"plt.plot(x0, y0)\n",
"plt.plot(xx, yx)\n",
"\n",
"plt.yscale(\"log\")\n",
"plt.xscale(\"log\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "39547a75",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "9e160b38",
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-30T13:02:02.033125Z",
"start_time": "2023-03-30T13:02:00.674878Z"
}
},
"outputs": [],
"source": [
"plt.figure()\n",
"\n",
"for i in range(3):\n",
" plt.plot(*knncdf.cdf_from_samples(dist0[:, i], 1, 25))\n",
" plt.plot(*knncdf.cdf_from_samples(distx[:, i], 1, 25))\n",
"\n",
"# plt.xlim(0.5, 25)\n",
"\n",
"plt.yscale(\"log\")\n",
"plt.xscale(\"log\")\n",
"plt.xlabel(r\"$r~\\left[\\mathrm{Mpc}\\right]$\")\n",
"\n",
"\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4bfb65d8",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "4703d81c",
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-30T12:13:35.958444Z",
"start_time": "2023-03-30T12:13:35.924241Z"
}
},
"outputs": [],
"source": [
"x = dist[:, 0]\n",
"q = np.linspace(0, 100, int(x.size / 5))\n",
"\n",
"p = np.percentile(x, q)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b054c6df",
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-30T12:16:50.052225Z",
"start_time": "2023-03-30T12:16:50.020395Z"
}
},
"outputs": [],
"source": [
"y = np.sort(x)\n",
"\n",
"yy = np.arange(y.size) / y.size"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5445c964",
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-30T12:16:53.599925Z",
"start_time": "2023-03-30T12:16:53.521266Z"
}
},
"outputs": [],
"source": [
"plt.figure()\n",
"plt.plot(p, q / 100)\n",
"\n",
"plt.plot(y, yy)\n",
"\n",
"# plt.yscale(\"log\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "87fe5874",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "fb0ad6b9",
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-30T12:03:34.387625Z",
"start_time": "2023-03-30T12:03:34.290961Z"
}
},
"outputs": [],
"source": [
"plt.figure()\n",
"plt.hist(dist[:, 0], bins=\"auto\", histtype=\"step\")\n",
"plt.hist(dist[:, 1], bins=\"auto\", histtype=\"step\")\n",
"plt.hist(dist[:, 2], bins=\"auto\", histtype=\"step\")\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c2aba833",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "6f70f238",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "03bcb191",
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-30T11:38:04.906150Z",
"start_time": "2023-03-30T11:38:04.758107Z"
}
},
"outputs": [],
"source": [
"plt.figure()\n",
"plt.hist(cat0[\"dec\"], bins=\"auto\")\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e5ad4722",
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-30T11:53:23.004853Z",
"start_time": "2023-03-30T11:53:22.971967Z"
}
},
"outputs": [],
"source": [
"gen = np.random.default_rng(22)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "785b530a",
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-30T11:53:23.330397Z",
"start_time": "2023-03-30T11:53:23.296612Z"
}
},
"outputs": [],
"source": [
"gen.normal()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b3d3b5e6",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "464b606d",
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-30T11:36:13.649124Z",
"start_time": "2023-03-30T11:36:12.995693Z"
}
},
"outputs": [],
"source": [
"theta = np.linspace( t, np.pi, 100)\n",
"\n",
"plt.figure()\n",
"plt.plot(theta, np.sin(theta))\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c29049f5",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "cd2a3295",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "af9abf04",
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-30T11:10:11.104389Z",
"start_time": "2023-03-30T11:10:11.070499Z"
}
},
"outputs": [],
"source": [
"X = np.array([-3.9514747, -0.6966991, 2.97158]).reshape(1, -1)\n",
"\n",
"X"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e181b3c3",
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-30T11:32:17.840355Z",
"start_time": "2023-03-30T11:32:17.351883Z"
}
},
"outputs": [],
"source": [
"dist, indxs = knn0.kneighbors(X, n_neighbors=1)\n",
"\n",
"dist, indxs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d38fd960",
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-30T11:10:18.182326Z",
"start_time": "2023-03-30T11:10:18.145629Z"
}
},
"outputs": [],
"source": [
"cat0.positions[indxs]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a16ddc2f",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "bbbe8fb6",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "759a0149",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "312c96c9",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "b097637b",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "2ced23cb",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "be26cbcc",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "venv_galomatch",
"language": "python",
"name": "venv_galomatch"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
},
"vscode": {
"interpreter": {
"hash": "f29d02a8350410abc2a9fb79641689d10bf7ab64afc03ec87ca3cf6ed2daa499"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}

46
scripts/python.sh Normal file
View file

@ -0,0 +1,46 @@
#!/bin/bash -l
# Batch job wrapper: prints a job header, runs `run_knn.py` under `srun` and
# then reports the elapsed wall-clock time.
echo =========================================================
echo Job submitted date = Fri Mar 31 16:17:57 BST 2023
date_start=`date +%s`
echo $SLURM_JOB_NUM_NODES nodes \( $SMP processes per node \)
echo $SLURM_JOB_NUM_NODES hosts used: $SLURM_JOB_NODELIST
echo Job output begins
echo -----------------
echo
#hostname
# Need to set the max locked memory very high otherwise IB can't allocate enough and fails with "UCX ERROR Failed to allocate memory pool chunk: Input/output error"
ulimit -l unlimited
# To allow mvapich to run ok
export MV2_SMP_USE_CMA=0
#which mpirun
# Limit OpenMP to a single thread per process. The variable was previously
# misspelled as `OMP_NUM_THEADS`, which OpenMP runtimes do not read.
export OMP_NUM_THREADS=1
/usr/local/shared/slurm/bin/srun -u -n 5 --mpi=pmi2 --mem-per-cpu=7168 nice -n 10 /mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python run_knn.py --rmin 0.05 --rmax 50 --nsamples 100000 --neval 10000
# If we've been checkpointed
#if [ -n "${DMTCP_CHECKPOINT_DIR}" ]; then
if [ -d "${DMTCP_CHECKPOINT_DIR}" ]; then
    # echo -n "Job was checkpointed at "
    # date
    # echo
    sleep 1
    # fi
    echo -n
else
    echo ---------------
    echo Job output ends
    # Split the elapsed time into hours, minutes and seconds
    date_end=`date +%s`
    seconds=$((date_end-date_start))
    minutes=$((seconds/60))
    seconds=$((seconds-60*minutes))
    hours=$((minutes/60))
    minutes=$((minutes-60*hours))
    echo =========================================================
    echo PBS job: finished date = `date`
    echo Total run time : $hours Hours $minutes Minutes $seconds Seconds
    echo =========================================================
fi
# Quote and default the task count so that the test does not error out when
# `SLURM_NTASKS` is unset (e.g. when the script is run outside of SLURM).
if [ "${SLURM_NTASKS:-0}" -eq 1 ]; then
    # NOTE(review): `fname` is not set anywhere in this script -- confirm
    # where it is expected to come from.
    rm -f $fname
fi

13
scripts/run_asciipos.sh Normal file
View file

@ -0,0 +1,13 @@
# Submit `run_asciipos.py` to the queue via `addqueue`.
# Resources requested from `addqueue` (-n, -m and -q options, respectively).
nthreads=1
memory=75
queue="berg"
# Python interpreter of the project's virtual environment and the script to run
env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python"
file="run_asciipos.py"
# Value forwarded to the script's `--mode` option
mode="dump"
cm="addqueue -q $queue -n $nthreads -m $memory $env $file --mode $mode"
# Print the command and then execute it (relies on word splitting of `$cm`)
echo "Submitting:"
echo $cm
echo
$cm

17
scripts/run_crossmatch.sh Normal file
View file

@ -0,0 +1,17 @@
# Submit `run_crossmatch.py` to the queue via `addqueue`.
# Resources requested from `addqueue` (-n, -m and -q options, respectively).
nthreads=1
memory=32
queue="berg"
# Python interpreter of the project's virtual environment and the script to run
env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python"
file="run_crossmatch.py"
pythoncm="$env $file"
# Uncomment the lines below to run the Python command directly instead of
# submitting it to the queue.
# echo "Submitting:"
# echo $pythoncm
# echo
# $pythoncm
cm="addqueue -q $queue -n $nthreads -m $memory $pythoncm"
# Print the command and then execute it (relies on word splitting of `$cm`)
echo "Submitting:"
echo $cm
echo
$cm

14
scripts/run_crosspk.sh Normal file
View file

@ -0,0 +1,14 @@
# Submit `run_crosspk.py` to the queue via `addqueue`.
# Resources requested from `addqueue` (-n, -m and -q options, respectively).
nthreads=20
memory=40
queue="berg"
# Python interpreter of the project's virtual environment and the script to run
env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python"
file="run_crosspk.py"
# Values forwarded to the script's `--grid` and `--halfwidth` options
grid=1024
halfwidth=0.13
cm="addqueue -q $queue -n $nthreads -m $memory $env $file --grid $grid --halfwidth $halfwidth"
# Print the command and then execute it (relies on word splitting of `$cm`)
echo "Submitting:"
echo $cm
echo
$cm

14
scripts/run_fieldprop.sh Normal file
View file

@ -0,0 +1,14 @@
# Submit `run_fieldprop.py` to the queue via `addqueue`.
# Resources requested from `addqueue` (-n, -m and -q options, respectively).
nthreads=10
memory=32
queue="berg"
# Python interpreter of the project's virtual environment and the script to run
env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python"
file="run_fieldprop.py"
# Currently unused settings; they are not passed to the script below.
# grid=1024
# halfwidth=0.1
cm="addqueue -q $queue -n $nthreads -m $memory $env $file"
# Print the command and then execute it (relies on word splitting of `$cm`)
echo "Submitting:"
echo $cm
echo
$cm

12
scripts/run_fit_halos.sh Normal file
View file

@ -0,0 +1,12 @@
# Submit `run_fit_halos.py` to the queue via `addqueue`.
# Resources requested from `addqueue` (-n, -m and -q options, respectively).
nthreads=100
memory=3
queue="berg"
# Python interpreter of the project's virtual environment and the script to run
env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python"
file="run_fit_halos.py"
cm="addqueue -q $queue -n $nthreads -m $memory $env $file"
# Print the command and then execute it (relies on word splitting of `$cm`)
echo "Submitting:"
echo $cm
echo
$cm

14
scripts/run_initmatch.sh Normal file
View file

@ -0,0 +1,14 @@
# Submit `run_initmatch.py` to the queue via `addqueue`.
# Resources requested from `addqueue` (-n, -m and -q options, respectively).
nthreads=15 # There isn't too much benefit going to too many CPUs...
memory=32
queue="berg"
# Python interpreter of the project's virtual environment and the script to run
env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python"
file="run_initmatch.py"
# Value forwarded to the script's `--dump_clumps` option
dump_clumps="false"
cm="addqueue -q $queue -n $nthreads -m $memory $env $file --dump_clumps $dump_clumps"
# Print the command and then execute it (relies on word splitting of `$cm`)
echo "Submitting:"
echo $cm
echo
$cm

104
scripts/run_knn.py Normal file
View file

@ -0,0 +1,104 @@
# Copyright (C) 2022 Richard Stiskalek
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""A script to calculate the KNN-CDF for a set of CSiBORG halo catalogues."""
from os.path import join
from argparse import ArgumentParser
from copy import deepcopy
from datetime import datetime
from mpi4py import MPI
from TaskmasterMPI import master_process, worker_process
from sklearn.neighbors import NearestNeighbors
import joblib
try:
import csiborgtools
except ModuleNotFoundError:
import sys
sys.path.append("../")
import csiborgtools
###############################################################################
# MPI and arguments #
###############################################################################
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
nproc = comm.Get_size()

# Options without a default are mandatory: if omitted they would be parsed as
# `None` and only fail later with an obscure error inside the tasks, so let
# argparse reject the invocation up front instead.
parser = ArgumentParser()
parser.add_argument("--rmin", type=float, required=True,
                    help="Minimum distance to evaluate the CDF.")
parser.add_argument("--rmax", type=float, required=True,
                    help="Maximum distance to evaluate the CDF.")
parser.add_argument("--nneighbours", type=int, required=True,
                    help="Maximum number of neighbours to use.")
parser.add_argument("--nsamples", type=int, required=True,
                    help="Number of random points to sample.")
parser.add_argument("--neval", type=int, required=True,
                    help="Number of points to evaluate the CDF.")
parser.add_argument("--seed", type=int, default=42,
                    help="Random state of the random points.")
args = parser.parse_args()

# NOTE(review): 155 / 0.705 looks like 155 Mpc/h converted with h = 0.705,
# i.e. a radius in Mpc rather than Mpc/h -- confirm the unit stated below.
Rmax = 155 / 0.705  # Mpc/h high resolution region radius
mass_threshold = [1e12, 1e13, 1e14]  # Msun
# Indices of the IC realisations to process, one task per realisation
ics = [7444, 7468, 7492, 7516, 7540, 7564, 7588, 7612, 7636, 7660, 7684,
       7708, 7732, 7756, 7780, 7804, 7828, 7852, 7876, 7900, 7924, 7948,
       7972, 7996, 8020, 8044, 8068, 8092, 8116, 8140, 8164, 8188, 8212,
       8236, 8260, 8284, 8308, 8332, 8356, 8380, 8404, 8428, 8452, 8476,
       8500, 8524, 8548, 8572, 8596, 8620, 8644, 8668, 8692, 8716, 8740,
       8764, 8788, 8812, 8836, 8860, 8884, 8908, 8932, 8956, 8980, 9004,
       9028, 9052, 9076, 9100, 9124, 9148, 9172, 9196, 9220, 9244, 9268,
       9292, 9316, 9340, 9364, 9388, 9412, 9436, 9460, 9484, 9508, 9532,
       9556, 9580, 9604, 9628, 9652, 9676, 9700, 9724, 9748, 9772, 9796,
       9820, 9844]
# Output file name template, formatted with the IC index
dumpdir = "/mnt/extraspace/rstiskalek/csiborg/knn"
fout = join(dumpdir, "knncdf_{}.p")
###############################################################################
# Analysis #
###############################################################################
knncdf = csiborgtools.match.kNN_CDF()


def do_task(ic):
    """
    Calculate the kNN-CDF of a single IC realisation for every mass threshold
    and dump the results to disk.

    Parameters
    ----------
    ic : int
        Index of the IC realisation.

    Returns
    -------
    None
    """
    cat = csiborgtools.read.HaloCatalogue(ic, max_dist=Rmax)

    results = {}
    for n, threshold in enumerate(mass_threshold):
        # Build the kNN only from the haloes above the mass threshold
        above_threshold = cat["totpartmass"] > threshold
        knn = NearestNeighbors().fit(cat.positions[above_threshold, ...])

        rs, cdf = knncdf(knn, nneighbours=args.nneighbours, Rmax=Rmax,
                         rmin=args.rmin, rmax=args.rmax,
                         nsamples=args.nsamples, neval=args.neval,
                         random_state=args.seed, verbose=False)
        results["cdf_{}".format(n)] = cdf

    # `rs` is the one returned for the last mass threshold
    results["rs"] = rs
    results["mass_threshold"] = mass_threshold
    joblib.dump(results, fout.format(ic))
# Distribute the IC realisations: run them one by one when there is a single
# process, otherwise rank 0 hands the tasks out to the remaining ranks.
if nproc <= 1:
    tasks = deepcopy(ics)
    for task in tasks:
        print("{}: completing task `{}`.".format(datetime.now(), task))
        do_task(task)
elif rank == 0:
    tasks = deepcopy(ics)
    master_process(tasks, comm, verbose=True)
else:
    worker_process(do_task, comm, verbose=False)

# Wait for every rank before reporting completion
comm.Barrier()
if rank == 0:
    print("{}: all finished.".format(datetime.now()))
quit()  # Force quit the script

22
scripts/run_knn.sh Normal file
View file

@ -0,0 +1,22 @@
# Submit `run_knn.py` (the kNN-CDF calculation) to the queue via `addqueue`.
# Resources requested from `addqueue` (-n, -m and -q options, respectively).
nthreads=140
memory=7
queue="berg"
# Python interpreter of the project's virtual environment and the script to run
env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python"
file="run_knn.py"
# Values forwarded to the script's options of the same name
rmin=0.01
rmax=100
nneighbours=16
nsamples=10000000
neval=10000
pythoncm="$env $file --rmin $rmin --rmax $rmax --nneighbours $nneighbours --nsamples $nsamples --neval $neval"
# Uncomment the lines below to run the Python command directly instead of
# submitting it to the queue.
# echo $pythoncm
# $pythoncm
cm="addqueue -q $queue -n $nthreads -m $memory $pythoncm"
# Print the command and then execute it (relies on word splitting of `$cm`)
echo "Submitting:"
echo $cm
echo
$cm

36
scripts/run_singlematch.sh Executable file
View file

@ -0,0 +1,36 @@
#!/bin/bash
# Submit one `run_singlematch.py` job for every pair of distinct simulations,
# each pair being submitted only once (index of the first < index of the
# second).
# nthreads=1
memory=16
queue="berg"
env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python"
file="run_singlematch.py"
nmult=1.
sigma=1.
sims=(7468 7588 8020 8452 8836)
nsims=${#sims[@]}

for ((i = 0; i < nsims; i++)); do
    # Start from i + 1 to skip the self-pairs and the mirrored pairs
    for ((j = i + 1; j < nsims; j++)); do
        nsim0=${sims[$i]}
        nsimx=${sims[$j]}
        pythoncm="$env $file --nsim0 $nsim0 --nsimx $nsimx --nmult $nmult --sigma $sigma"
        cm="addqueue -q $queue -n 1x1 -m $memory $pythoncm"
        echo "Submitting:"
        echo $cm
        echo
        $cm
        # Brief pause between the consecutive submissions
        sleep 0.05
    done
done

View file

@ -0,0 +1,12 @@
# Submit `run_split_halos.py` to the queue via `addqueue`.
# Resources requested from `addqueue` (-n, -m and -q options, respectively).
nthreads=1
memory=30
queue="cmb"
# Python interpreter of the project's virtual environment and the script to run
env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python"
file="run_split_halos.py"
cm="addqueue -q $queue -n $nthreads -m $memory $env $file"
# Print the command and then execute it (relies on word splitting of `$cm`)
echo "Submitting:"
echo $cm
echo
$cm