kNN-CDF secondary halo bias (#40)

* Add seperate autoknn script & config file

* edit ics

* Edit submission script

* Add threshold values

* Edit batch sizign

* Remove print

* edit

* Rename files

* Rename

* Update nb

* edit runs

* Edit submit

* Add median threshold

* add new auto reader

* editt submit

* edit submit

* Edit submit

* Add mean prk

* Edit runs

* Remove correlation file

* Move split to clutering

* Add init

* Remove import

* Add the file

* Add correlation reading

* Edit scripts

* Add below and above median permutation for cross

* Update imports

* Move rvs_in_sphere

* Create utils

* Split

* Add import

* Add normalised marks

* Add import

* Edit readme

* Clean up submission file

* Stop tracking submit files

* Update gitignore

* Add poisson field analytical expression

* Add abstract generators

* Add generators

* Pass in the generator

* Add a check for if there are any files

* Start saving average density

* Update nb

* Update readme

* Update units

* Edit jobs

* Update submits

* Update reader

* Add random crossing

* Update crossing script

* Add crossing with random

* Update readme

* Update notebook
This commit is contained in:
Richard Stiskalek 2023-04-09 20:57:05 +01:00 committed by GitHub
parent 826ab61d2d
commit 5784011de0
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
28 changed files with 2563 additions and 486 deletions

182
scripts/knn_auto.py Normal file
View file

@ -0,0 +1,182 @@
# Copyright (C) 2022 Richard Stiskalek
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""A script to calculate the KNN-CDF for a set of CSiBORG halo catalogues."""
from os.path import join
from warnings import warn
from argparse import ArgumentParser
from copy import deepcopy
from datetime import datetime
from mpi4py import MPI
from TaskmasterMPI import master_process, worker_process
import numpy
from sklearn.neighbors import NearestNeighbors
import joblib
import yaml
try:
import csiborgtools
except ModuleNotFoundError:
import sys
sys.path.append("../")
import csiborgtools
###############################################################################
# MPI and arguments #
###############################################################################
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
nproc = comm.Get_size()
parser = ArgumentParser()
parser.add_argument("--runs", type=str, nargs="+")
args = parser.parse_args()
with open('../scripts/knn_auto.yml', 'r') as file:
config = yaml.safe_load(file)
Rmax = 155 / 0.705 # Mpc (h = 0.705) high resolution region radius
totvol = 4 * numpy.pi * Rmax**3 / 3
minmass = 1e12
ics = [7444, 7468, 7492, 7516, 7540, 7564, 7588, 7612, 7636, 7660, 7684,
7708, 7732, 7756, 7780, 7804, 7828, 7852, 7876, 7900, 7924, 7948,
7972, 7996, 8020, 8044, 8068, 8092, 8116, 8140, 8164, 8188, 8212,
8236, 8260, 8284, 8308, 8332, 8356, 8380, 8404, 8428, 8452, 8476,
8500, 8524, 8548, 8572, 8596, 8620, 8644, 8668, 8692, 8716, 8740,
8764, 8788, 8812, 8836, 8860, 8884, 8908, 8932, 8956, 8980, 9004,
9028, 9052, 9076, 9100, 9124, 9148, 9172, 9196, 9220, 9244, 9268,
9292, 9316, 9340, 9364, 9388, 9412, 9436, 9460, 9484, 9508, 9532,
9556, 9580, 9604, 9628, 9652, 9676, 9700, 9724, 9748, 9772, 9796,
9820, 9844]
dumpdir = "/mnt/extraspace/rstiskalek/csiborg/knn"
fout = join(dumpdir, "auto", "knncdf_{}_{}.p")
paths = csiborgtools.read.CSiBORGPaths()
knncdf = csiborgtools.clustering.kNN_CDF()
###############################################################################
# Analysis #
###############################################################################
def read_single(selection, cat):
"""Positions for single catalogue auto-correlation."""
mmask = numpy.ones(len(cat), dtype=bool)
pos = cat.positions(False)
# Primary selection
psel = selection["primary"]
pmin, pmax = psel.get("min", None), psel.get("max", None)
if pmin is not None:
mmask &= (cat[psel["name"]] >= pmin)
if pmax is not None:
mmask &= (cat[psel["name"]] < pmax)
pos = pos[mmask, ...]
# Secondary selection
if "secondary" not in selection:
return pos
smask = numpy.ones(pos.shape[0], dtype=bool)
ssel = selection["secondary"]
smin, smax = ssel.get("min", None), ssel.get("max", None)
prop = cat[ssel["name"]][mmask]
if ssel.get("toperm", False):
prop = numpy.random.permutation(prop)
if ssel.get("marked", True):
x = cat[psel["name"]][mmask]
prop = csiborgtools.clustering.normalised_marks(
x, prop, nbins=config["nbins_marks"])
if smin is not None:
smask &= (prop >= smin)
if smax is not None:
smask &= (prop < smax)
return pos[smask, ...]
def do_auto(run, cat, ic):
"""Calculate the kNN-CDF single catalgoue autocorrelation."""
_config = config.get(run, None)
if _config is None:
warn("No configuration for run {}.".format(run))
return
rvs_gen = csiborgtools.clustering.RVSinsphere(Rmax)
pos = read_single(_config, cat)
knn = NearestNeighbors()
knn.fit(pos)
rs, cdf = knncdf(
knn, rvs_gen=rvs_gen, nneighbours=config["nneighbours"],
rmin=config["rmin"], rmax=config["rmax"],
nsamples=int(config["nsamples"]), neval=int(config["neval"]),
batch_size=int(config["batch_size"]), random_state=config["seed"])
joblib.dump({"rs": rs, "cdf": cdf, "ndensity": pos.shape[0] / totvol},
fout.format(str(ic).zfill(5), run))
def do_cross_rand(run, cat, ic):
"""Calculate the kNN-CDF cross catalogue random correlation."""
_config = config.get(run, None)
if _config is None:
warn("No configuration for run {}.".format(run))
return
rvs_gen = csiborgtools.clustering.RVSinsphere(Rmax)
knn1, knn2 = NearestNeighbors(), NearestNeighbors()
pos1 = read_single(_config, cat)
knn1.fit(pos1)
pos2 = rvs_gen(pos1.shape[0])
knn2.fit(pos2)
rs, cdf0, cdf1, joint_cdf = knncdf.joint(
knn1, knn2, rvs_gen=rvs_gen, nneighbours=int(config["nneighbours"]),
rmin=config["rmin"], rmax=config["rmax"],
nsamples=int(config["nsamples"]), neval=int(config["neval"]),
batch_size=int(config["batch_size"]), random_state=config["seed"])
corr = knncdf.joint_to_corr(cdf0, cdf1, joint_cdf)
joblib.dump({"rs": rs, "corr": corr}, fout.format(str(ic).zfill(5), run))
def do_runs(ic):
cat = csiborgtools.read.HaloCatalogue(ic, paths, max_dist=Rmax,
min_mass=minmass)
for run in args.runs:
if "random" in run:
do_cross_rand(run, cat, ic)
else:
do_auto(run, cat, ic)
###############################################################################
# MPI task delegation #
###############################################################################
if nproc > 1:
if rank == 0:
tasks = deepcopy(ics)
master_process(tasks, comm, verbose=True)
else:
worker_process(do_runs, comm, verbose=False)
else:
tasks = deepcopy(ics)
for task in tasks:
print("{}: completing task `{}`.".format(datetime.now(), task))
do_runs(task)
comm.Barrier()
if rank == 0:
print("{}: all finished.".format(datetime.now()))
quit() # Force quit the script

144
scripts/knn_auto.yml Normal file
View file

@ -0,0 +1,144 @@
rmin: 0.1
rmax: 100
nneighbours: 64
nsamples: 1.e+7
batch_size: 1.e+6
neval: 10000
seed: 42
nbins_marks: 10
################################################################################
# totpartmass #
################################################################################
"mass001":
primary:
name: totpartmass
min: 1.e+12
max: 1.e+13
"mass002":
primary:
name: totpartmass
min: 1.e+13
max: 1.e+14
"mass003":
primary:
name: totpartmass
min: 1.e+14
################################################################################
# totpartmass + lambda200c #
################################################################################
"mass001_spinlow":
primary:
name: totpartmass
min: 1.e+12
max: 1.e+13
secondary:
name: lambda200c
toperm: false
marked: false
max: 0.5
"mass001_spinhigh":
primary:
name: totpartmass
min: 1.e+12
max: 1.e+13
secondary:
name: lambda200c
toperm: false
marked: true
min: 0.5
"mass001_spinmedian_perm":
primary:
name: totpartmass
min: 1.e+12
max: 1.e+13
secondary:
name: lambda200c
toperm: true
marked : true
min: 0.5
"mass002_spinlow":
primary:
name: totpartmass
min: 1.e+13
max: 1.e+14
secondary:
name: lambda200c
toperm: false
marked: false
max: 0.5
"mass002_spinhigh":
primary:
name: totpartmass
min: 1.e+13
max: 1.e+14
secondary:
name: lambda200c
toperm: false
marked: true
min: 0.5
"mass002_spinmedian_perm":
primary:
name: totpartmass
min: 1.e+13
max: 1.e+14
secondary:
name: lambda200c
toperm: true
marked : true
min: 0.5
"mass003_spinlow":
primary:
name: totpartmass
min: 1.e+14
secondary:
name: lambda200c
toperm: false
marked: false
max: 0.5
"mass003_spinhigh":
primary:
name: totpartmass
min: 1.e+14
secondary:
name: lambda200c
toperm: false
marked: true
min: 0.5
"mass003_spinmedian_perm":
primary:
name: totpartmass
min: 1.e+14
secondary:
name: lambda200c
toperm: true
marked : true
min: 0.5
################################################################################
# Cross with random #
################################################################################
"mass001_random":
primary:
name: totpartmass
min: 1.e+12
max: 1.e+13

View file

@ -13,6 +13,7 @@
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""A script to calculate the KNN-CDF for a set of CSiBORG halo catalogues."""
from warnings import warn
from os.path import join
from argparse import ArgumentParser
from copy import deepcopy
@ -20,8 +21,10 @@ from datetime import datetime
from itertools import combinations
from mpi4py import MPI
from TaskmasterMPI import master_process, worker_process
import numpy
from sklearn.neighbors import NearestNeighbors
import joblib
import yaml
try:
import csiborgtools
except ModuleNotFoundError:
@ -38,17 +41,13 @@ rank = comm.Get_rank()
nproc = comm.Get_size()
parser = ArgumentParser()
parser.add_argument("--rmin", type=float)
parser.add_argument("--rmax", type=float)
parser.add_argument("--nneighbours", type=int)
parser.add_argument("--nsamples", type=int)
parser.add_argument("--neval", type=int)
parser.add_argument("--batch_size", type=int)
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--runs", type=str, nargs="+")
args = parser.parse_args()
with open('../scripts/knn_cross.yml', 'r') as file:
config = yaml.safe_load(file)
Rmax = 155 / 0.705 # Mpc/h high resolution region radius
mass_threshold = [1e12, 1e13, 1e14] # Msun
Rmax = 155 / 0.705 # Mpc (h = 0.705) high resolution region radius
minmass = 1e12
ics = [7444, 7468, 7492, 7516, 7540, 7564, 7588, 7612, 7636, 7660, 7684,
7708, 7732, 7756, 7780, 7804, 7828, 7852, 7876, 7900, 7924, 7948,
7972, 7996, 8020, 8044, 8068, 8092, 8116, 8140, 8164, 8188, 8212,
@ -59,80 +58,58 @@ ics = [7444, 7468, 7492, 7516, 7540, 7564, 7588, 7612, 7636, 7660, 7684,
9292, 9316, 9340, 9364, 9388, 9412, 9436, 9460, 9484, 9508, 9532,
9556, 9580, 9604, 9628, 9652, 9676, 9700, 9724, 9748, 9772, 9796,
9820, 9844]
dumpdir = "/mnt/extraspace/rstiskalek/csiborg/knn"
fout_auto = join(dumpdir, "auto", "knncdf_{}.p")
fout_cross = join(dumpdir, "cross", "knncdf_{}_{}.p")
paths = csiborgtools.read.CSiBORGPaths()
dumpdir = "/mnt/extraspace/rstiskalek/csiborg/knn"
fout = join(dumpdir, "cross", "knncdf_{}_{}_{}.p")
knncdf = csiborgtools.clustering.kNN_CDF()
###############################################################################
# Analysis #
###############################################################################
knncdf = csiborgtools.match.kNN_CDF()
def read_single(selection, cat):
mmask = numpy.ones(len(cat), dtype=bool)
pos = cat.positions(False)
# Primary selection
psel = selection["primary"]
pmin, pmax = psel.get("min", None), psel.get("max", None)
if pmin is not None:
mmask &= (cat[psel["name"]] >= pmin)
if pmax is not None:
mmask &= (cat[psel["name"]] < pmax)
return pos[mmask, ...]
def do_auto(ic):
out = {}
cat = csiborgtools.read.HaloCatalogue(ic, paths, max_dist=Rmax)
def do_cross(run, ics):
_config = config.get(run, None)
if _config is None:
warn("No configuration for run {}.".format(run))
return
rvs_gen = csiborgtools.clustering.RVSinsphere(Rmax)
knn1, knn2 = NearestNeighbors(), NearestNeighbors()
for i, mmin in enumerate(mass_threshold):
knn = NearestNeighbors()
knn.fit(cat.positions(False)[cat["totpartmass"] > mmin, ...])
rs, cdf = knncdf(knn, nneighbours=args.nneighbours, Rmax=Rmax,
rmin=args.rmin, rmax=args.rmax, nsamples=args.nsamples,
neval=args.neval, batch_size=args.batch_size,
random_state=args.seed, verbose=False)
out.update({"cdf_{}".format(i): cdf})
out.update({"rs": rs, "mass_threshold": mass_threshold})
joblib.dump(out, fout_auto.format(ic))
def do_cross(ics):
out = {}
cat1 = csiborgtools.read.HaloCatalogue(ics[0], paths, max_dist=Rmax)
pos1 = read_single(_config, cat1)
knn1.fit(pos1)
cat2 = csiborgtools.read.HaloCatalogue(ics[1], paths, max_dist=Rmax)
pos2 = read_single(_config, cat2)
knn2.fit(pos2)
for i, mmin in enumerate(mass_threshold):
knn1 = NearestNeighbors()
knn1.fit(cat1.positions()[cat1["totpartmass"] > mmin, ...])
rs, cdf0, cdf1, joint_cdf = knncdf.joint(
knn1, knn2, rvs_gen=rvs_gen, nneighbours=int(config["nneighbours"]),
rmin=config["rmin"], rmax=config["rmax"],
nsamples=int(config["nsamples"]), neval=int(config["neval"]),
batch_size=int(config["batch_size"]), random_state=config["seed"])
knn2 = NearestNeighbors()
knn2.fit(cat2.positions()[cat2["totpartmass"] > mmin, ...])
corr = knncdf.joint_to_corr(cdf0, cdf1, joint_cdf)
rs, cdf0, cdf1, joint_cdf = knncdf.joint(
knn1, knn2, nneighbours=args.nneighbours, Rmax=Rmax,
rmin=args.rmin, rmax=args.rmax, nsamples=args.nsamples,
neval=args.neval, batch_size=args.batch_size,
random_state=args.seed)
joblib.dump({"rs": rs, "corr": corr},
fout.format(str(ics[0]).zfill(5), str(ics[1]).zfill(5), run))
corr = knncdf.joint_to_corr(cdf0, cdf1, joint_cdf)
out.update({"corr_{}".format(i): corr})
out.update({"rs": rs, "mass_threshold": mass_threshold})
joblib.dump(out, fout_cross.format(*ics))
###############################################################################
# Autocorrelation calculation #
###############################################################################
if nproc > 1:
if rank == 0:
tasks = deepcopy(ics)
master_process(tasks, comm, verbose=True)
else:
worker_process(do_auto, comm, verbose=False)
else:
tasks = deepcopy(ics)
for task in tasks:
print("{}: completing task `{}`.".format(datetime.now(), task))
do_auto(task)
comm.Barrier()
def do_runs(ics):
print(ics)
for run in args.runs:
do_cross(run, ics)
###############################################################################
@ -145,12 +122,12 @@ if nproc > 1:
tasks = list(combinations(ics, 2))
master_process(tasks, comm, verbose=True)
else:
worker_process(do_cross, comm, verbose=False)
worker_process(do_runs, comm, verbose=False)
else:
tasks = deepcopy(ics)
tasks = list(combinations(ics, 2))
for task in tasks:
print("{}: completing task `{}`.".format(datetime.now(), task))
do_cross(task)
do_runs(task)
comm.Barrier()

29
scripts/knn_cross.yml Normal file
View file

@ -0,0 +1,29 @@
rmin: 0.1
rmax: 100
nneighbours: 64
nsamples: 1.e+7
batch_size: 1.e+6
neval: 10000
seed: 42
################################################################################
# totpartmass #
################################################################################
"mass001":
primary:
name: totpartmass
min: 1.e+12
max: 1.e+13
"mass002":
primary:
name: totpartmass
min: 1.e+13
max: 1.e+14
"mass003":
primary:
name: totpartmass
min: 1.e+14

View file

@ -1,46 +0,0 @@
#!/bin/bash -l
echo =========================================================
echo Job submitted date = Fri Mar 31 16:17:57 BST 2023
date_start=`date +%s`
echo $SLURM_JOB_NUM_NODES nodes \( $SMP processes per node \)
echo $SLURM_JOB_NUM_NODES hosts used: $SLURM_JOB_NODELIST
echo Job output begins
echo -----------------
echo
#hostname
# Need to set the max locked memory very high otherwise IB can't allocate enough and fails with "UCX ERROR Failed to allocate memory pool chunk: Input/output error"
ulimit -l unlimited
# To allow mvapich to run ok
export MV2_SMP_USE_CMA=0
#which mpirun
export OMP_NUM_THEADS=1
/usr/local/shared/slurm/bin/srun -u -n 5 --mpi=pmi2 --mem-per-cpu=7168 nice -n 10 /mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python run_knn.py --rmin 0.05 --rmax 50 --nsamples 100000 --neval 10000
# If we've been checkpointed
#if [ -n "${DMTCP_CHECKPOINT_DIR}" ]; then
if [ -d "${DMTCP_CHECKPOINT_DIR}" ]; then
# echo -n "Job was checkpointed at "
# date
# echo
sleep 1
# fi
echo -n
else
echo ---------------
echo Job output ends
date_end=`date +%s`
seconds=$((date_end-date_start))
minutes=$((seconds/60))
seconds=$((seconds-60*minutes))
hours=$((minutes/60))
minutes=$((minutes-60*hours))
echo =========================================================
echo PBS job: finished date = `date`
echo Total run time : $hours Hours $minutes Minutes $seconds Seconds
echo =========================================================
fi
if [ ${SLURM_NTASKS} -eq 1 ]; then
rm -f $fname
fi

View file

@ -1,14 +0,0 @@
nthreads=20
memory=40
queue="berg"
env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python"
file="run_crosspk.py"
grid=1024
halfwidth=0.13
cm="addqueue -q $queue -n $nthreads -m $memory $env $file --grid $grid --halfwidth $halfwidth"
echo "Submitting:"
echo $cm
echo
$cm

View file

@ -1,14 +0,0 @@
nthreads=10
memory=32
queue="berg"
env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python"
file="run_fieldprop.py"
# grid=1024
# halfwidth=0.1
cm="addqueue -q $queue -n $nthreads -m $memory $env $file"
echo "Submitting:"
echo $cm
echo
$cm

View file

@ -1,12 +0,0 @@
nthreads=100
memory=3
queue="berg"
env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python"
file="run_fit_halos.py"
cm="addqueue -q $queue -n $nthreads -m $memory $env $file"
echo "Submitting:"
echo $cm
echo
$cm

View file

@ -1,14 +0,0 @@
nthreads=15 # There isn't too much benefit going to too many CPUs...
memory=32
queue="berg"
env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python"
file="run_initmatch.py"
dump_clumps="false"
cm="addqueue -q $queue -n $nthreads -m $memory $env $file --dump_clumps $dump_clumps"
echo "Submitting:"
echo $cm
echo
$cm

View file

@ -1,23 +0,0 @@
nthreads=151
memory=4
queue="cmb"
env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python"
file="run_knn.py"
rmin=0.01
rmax=100
nneighbours=8
nsamples=100000000
batch_size=1000000
neval=10000
pythoncm="$env $file --rmin $rmin --rmax $rmax --nneighbours $nneighbours --nsamples $nsamples --batch_size $batch_size --neval $neval"
# echo $pythoncm
# $pythoncm
cm="addqueue -q $queue -n $nthreads -m $memory $pythoncm"
echo "Submitting:"
echo $cm
echo
$cm

View file

@ -1,36 +0,0 @@
#!/bin/bash
# nthreads=1
memory=16
queue="berg"
env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python"
file="run_singlematch.py"
nmult=1.
sigma=1.
sims=(7468 7588 8020 8452 8836)
nsims=${#sims[@]}
for i in $(seq 0 $((nsims-1))); do
for j in $(seq 0 $((nsims-1))); do
if [ $i -eq $j ]; then
continue
elif [ $i -gt $j ]; then
continue
else
:
fi
nsim0=${sims[$i]}
nsimx=${sims[$j]}
pythoncm="$env $file --nsim0 $nsim0 --nsimx $nsimx --nmult $nmult --sigma $sigma"
cm="addqueue -q $queue -n 1x1 -m $memory $pythoncm"
echo "Submitting:"
echo $cm
echo
$cm
sleep 0.05
done; done

View file

@ -1,12 +0,0 @@
nthreads=1
memory=30
queue="cmb"
env="/mnt/zfsusers/rstiskalek/csiborgtools/venv_galomatch/bin/python"
file="run_split_halos.py"
cm="addqueue -q $queue -n $nthreads -m $memory $env $file"
echo "Submitting:"
echo $cm
echo
$cm