csiborgtools/notebooks/knn.ipynb
Richard Stiskalek 5784011de0
kNN-CDF secondary halo bias (#40)
* Add seperate autoknn script & config file

* edit ics

* Edit submission script

* Add threshold values

* Edit batch sizign

* Remove print

* edit

* Rename files

* Rename

* Update nb

* edit runs

* Edit submit

* Add median threshold

* add new auto reader

* editt submit

* edit submit

* Edit submit

* Add mean prk

* Edit runs

* Remove correlation file

* Move split to clutering

* Add init

* Remove import

* Add the file

* Add correlation reading

* Edit scripts

* Add below and above median permutation for cross

* Update imports

* Move rvs_in_sphere

* Create utils

* Split

* Add import

* Add normalised marks

* Add import

* Edit readme

* Clean up submission file

* Stop tracking submit files

* Update gitignore

* Add poisson field analytical expression

* Add abstract generators

* Add generators

* Pass in the generator

* Add a check for if there are any files

* Start saving average density

* Update nb

* Update readme

* Update units

* Edit jobs

* Update submits

* Update reader

* Add random crossing

* Update crossing script

* Add crossing with random

* Update readme

* Update notebook
2023-04-09 20:57:05 +01:00

717 KiB

In [1]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
import joblib
from glob import glob
from tqdm import tqdm
try:
    import csiborgtools
except ModuleNotFoundError:
    print("not found")
    import sys
    sys.path.append("../")
    import csiborgtools
import yaml


%matplotlib notebook
%load_ext autoreload
%autoreload 2
not found
In [44]:
with open('../scripts/knn_auto.yml', 'r') as file:
    config = yaml.safe_load(file)

paths = csiborgtools.read.CSiBORGPaths()
knnreader = csiborgtools.read.kNNCDFReader()

auto_folder = "/mnt/extraspace/rstiskalek/csiborg/knn/auto/"
cross_folder = "/mnt/extraspace/rstiskalek/csiborg/knn/cross/"
In [52]:
rs, corr = knnreader.read("mass001", cross_folder, rmin=1, rmax=50)
rs, corr_rand = knnreader.read("mass001_random", auto_folder, rmin=1, rmax=50)
In [60]:
cols = plt.rcParams["axes.prop_cycle"].by_key()["color"]
plt.figure()
for k in range(64):
    plt.plot(rs, corr[0, k, :], c=cols[0], lw=0.5)
#     plt.plot(rs, np.abs(corr_rand[0, k, :]), c=cols[1], lw=0.5)

plt.xscale("log")
plt.axvline(2.65 / 0.705, c="red", ls="--")
plt.show()
No description has been provided for this image
In [ ]:

In [ ]:

In [ ]:

In [57]:
rs, cdf = knnreader.read_auto("mass001_spinloww", folder, rmin=1, rmax=50)
pk = knnreader.mean_prob_k(cdf)


# rs, cdf = knnreader.read_auto("mass001_spinhigh", folder, rmin=1, rmax=50)
# pk_perm = knnreader.mean_prob_k(cdf)
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In [57], line 1
----> 1 rs, cdf = knnreader.read_auto("mass001_spinloww", folder, rmin=1, rmax=50)
      2 pk = knnreader.mean_prob_k(cdf)

File ~/csiborgtools/csiborgtools/read/summaries.py:215, in kNNCDFReader.read_auto(self, run, folder, rmin, rmax, to_clip)
    213 files = [f for f in glob(join(folder, "*")) if run in f]
    214 if len(files) == 0:
--> 215     raise RuntimeError("No files found for run `{}`.".format(run[:-2]))
    216 kind = "corr" if "cross" in run else "cdf"
    218 for i, file in enumerate(files):

RuntimeError: No files found for run `mass001_spinloww`.
In [48]:
pk.shape
Out[48]:
(63, 5663, 2)
In [58]:
cols = plt.rcParams["axes.prop_cycle"].by_key()["color"]


plt.figure()
# plt.plot(rs, np.nansum(pk, axis=0)[:, 0], c=cols[k])
# plt.plot(rs, np.nansum(pk_perm, axis=0)[:, 0], c=cols[k], ls="--")
for k in range(1, 2):
    plt.plot(rs, pk[k, :, 0], c=cols[k])
#     plt.plot(rs, pk_perm[k, :, 0], c=cols[k], ls="--")
# plt.fill_between(rs, pk[k, :, 0] - pk[k, :, 1], pk[k, :, 0] + pk[k, :, 1])

# plt.plot(rs, np.sum(pk, axis=0)[:, 0])

# plt.xscale("log")
plt.show()
No description has been provided for this image
In [ ]:

In [ ]:

In [ ]:
plt.figure()


k = 2

rs, mu, std = mean_auto(k, "mass_003_spinhigh")
rs, mu_perm, std_perm = mean_auto(k, "mass003_spinlow")
z = mu / mu_perm
deltaz = z * np.sqrt((std / mu)**2 + (std_perm / mu_perm)**2)
plt.plot(rs, z, c=cols[0])
plt.fill_between(rs, z - deltaz, z + deltaz, color=cols[0], alpha=0.3)


# rs, mu, std = mean_auto(k, "mass001_spinhigh")
# rs, mu_perm, std_perm = mean_auto(k, "mass001_spinhigh_perm")
# z = mu / mu_perm
# deltaz = z * np.sqrt((std / mu)**2 + (std_perm / mu_perm)**2)
# plt.plot(rs, z, c=cols[1])
# plt.fill_between(rs, z - deltaz, z + deltaz, color=cols[1], alpha=0.3)


plt.axhline(1, c="black", ls="--")
# plt.fill_between(rs, mu - std, mu + std, color=cols[2], alpha=0.3)

plt.xscale("log")
# plt.yscale("log")
plt.show()
In [ ]:
# rs, corr = knnreader.read_auto("mass001_spinmedian_cross", folder, rmin=0.5)
# rs, corr_perm = knnreader.read_auto("mass001_spinmedian_cross_perm", folder, rmin=0.5)

# rs, cdf_low = knnreader.read_auto("mass001_spinmedian_cross_perm", folder, rmin=0.5)
rs, cdf_high = knnreader.read_auto("mass001_spinmedian_cross_perm", folder, rmin=0.5)
In [ ]:

In [ ]:
plt.figure()
rs, mu, std = mean_auto(0, "mass001_spinlow")
plt.plot(rs, mu, c=cols[0])
plt.fill_between(rs, mu - std, mu + std, color=cols[0], alpha=0.5)

rs, mu, std = mean_auto(0, "mass001_spinlow_perm")
plt.plot(rs, mu, c=cols[1])
plt.fill_between(rs, mu - std, mu + std, color=cols[1], alpha=0.5)
plt.show()
In [ ]:
prk_low = knnreader.prk(rs, cdf_low)
prk_high = knnreader.prk(rs, cdf_high)
In [ ]:
k = 0

plt.figure()

for i in range(101):
    plt.plot(rs, prk_low[i, k, :], lw=0.1, c=cols[0])
    plt.plot(rs, prk_high[i, k, :], lw=0.1, c=cols[1])

    
rs, mu, std = mean_auto(0, "mass001")
plt.plot(rs, mu, c=cols[2])
plt.fill_between(rs, mu - std, mu + std, color=cols[2], alpha=0.5)

plt.show()
In [ ]:
rs, corr = knnreader.read_auto("mass001_spinmedian_cross", folder, rmin=0.5)
rs, corrperm = knnreader.read_auto("mass001_spinmedian_cross_perm", folder, rmin=0.5)

# rs, corr_low = knnreader.read_auto("mass001_spinlow_cross_perm", folder, rmin=0.5)
# rs, corr_high = knnreader.read_auto("mass001_spinhigh_cross_perm", folder, rmin=0.5)
In [ ]:
k = 2
plt.figure()
for i in range(101):
    plt.plot(rs, corr[i, k, :], lw=0.1, c=cols[0])
    plt.plot(rs, corrperm[i, k, :], lw=0.1, c=cols[1])
#     plt.plot(rs, corr[i, k, :] - corrperm[i, k, :], lw=0.1, c=cols[0])
#     plt.plot(rs, , lw=0.1, c=cols[1])
    
plt.xscale("log")
plt.yscale("log")
plt.show()
In [ ]:
k = 0

plt.figure()
for i in range(101):
    plt.plot(rs, corr[i, k, :], c=cols[0], lw=0.1)
    
    
for i in range(101):
    plt.plot(rs, corr_perm[i, k, :], c=cols[1], lw=0.1)

plt.xscale("log")
plt.yscale("log")
plt.show()
In [ ]:
mean_prk.shape
In [ ]:

In [ ]:

In [ ]:

In [ ]:

In [ ]:

In [ ]:
cols = plt.rcParams["axes.prop_cycle"].by_key()["color"]


plt.figure()

rs, cdf = knnreader.read_auto("mass001_spinlow", folder, rmin=0.5)
pk = knnreader.prob_kvolume(cdf, rs, True)
for i in range(101):
    plt.plot(rs, pk[i, 5, :], lw=0.1, c=cols[0])
    
    
rs, cdf = knnreader.read_auto("mass001_spinhigh", folder, rmin=0.5)
pk = knnreader.prob_kvolume(cdf, rs, True)
for i in range(101):
    plt.plot(rs, pk[i, 5, :], lw=0.1, c=cols[1])    
    
    
rs, cdf = knnreader.read_auto("mass001_spinhigh_perm", folder, rmin=0.5)
pk = knnreader.prob_kvolume(cdf, rs, True)
for i in range(101):
    plt.plot(rs, pk[i, 5, :], lw=0.1, c=cols[2])

# plt.xscale("log")

plt.show()
In [ ]:

In [ ]:

In [ ]:

In [ ]:
cat = csiborgtools.read.HaloCatalogue(7444, paths, min_mass=1e12, max_dist=155/0.705)
In [ ]:

In [ ]:
x = ""
for key in auto_config.keys():
    x += key + " "
In [ ]:
x
In [ ]:
auto_config
In [ ]:
auto_folder = "/mnt/extraspace/rstiskalek/csiborg/knn/auto/"
In [ ]:
np.log10(cat["totpartmass"].min())
In [ ]:
cols = plt.rcParams["axes.prop_cycle"].by_key()["color"]


plt.figure()
rs, cdf = knnreader.read_auto("004", auto_folder)
pk = knnreader.prob_kvolume(cdf, rs, normalise=True)
for i in range(101):
    plt.plot(rs, pk[i, 0, :], c=cols[0], lw=0.1)


rs, cdf = knnreader.read_auto("005", auto_folder)
pk = knnreader.prob_kvolume(cdf, rs, normalise=True)
for i in range(101):
    plt.plot(rs, pk[i, 0, :], c=cols[1], lw=0.1)

    
rs, cdf = knnreader.read_auto("001", auto_folder)
pk = knnreader.prob_kvolume(cdf, rs, normalise=True)
for i in range(101):
    plt.plot(rs, pk[i, 0, :], c=cols[2], lw=0.1)

# plt.xscale("log")
# plt.yscale("log")


plt.show()
In [ ]:

In [ ]:

In [ ]:
cat = csiborgtools.read.HaloCatalogue(7444, paths)
In [ ]:
from tqdm import trange
x = np.full((len(ics), 3), np.nan)
for i in trange(len(ics)):
    cat = csiborgtools.read.HaloCatalogue(ics[i], paths, max_dist=155 / 0.705)
    for j, th in enumerate([1e12, 1e13, 1e14]):
        mask = cat["totpartmass"] > th
        x[i, j] = np.nanmedian(cat["lambda200c"][mask])
In [ ]:
np.mean(x[:, 2]), np.std(x[:, 2])
In [ ]:
cdf.shape
In [ ]:

In [ ]:
np.nanmedian()
In [ ]:
plt.figure()

plt.scatter(cat["m200"], cat["lambda200c"], s=1)

plt.xscale("log")
plt.yscale("log")


plt.show()
In [ ]:

In [ ]:
files = glob("/mnt/extraspace/rstiskalek/csiborg/knn/auto/*")

ks = [0, 1, 2, 3, 4, 5, 6, 7]
rs, cdf, thresholds = knnreader.read(files, ks, rmin=0.01, rmax=100)
In [ ]:
pk = knnreader.prob_kvolume(cdf, rs, True)
In [ ]:
cols = plt.rcParams["axes.prop_cycle"].by_key()["color"]

plt.figure()
n = 1
for k in range(7):
    plt.plot(rs, np.mean(pk[:, n, k, :], axis=0), c=cols[k], label=r"$k = {}$".format(k))
    for i in range(101):
        plt.plot(rs, pk[i, n, k, :], c=cols[k], lw=0.05)

plt.legend(frameon=False)
plt.xlabel(r"$r~\left[\mathrm{Mpc}\right]$")
plt.ylabel(r"$P\left(k | V = 4 \pi r^3 / 3\right)$")
# plt.savefig("../plots/test.png", dpi=450)
plt.show()
In [ ]:

In [ ]:
n = 2
k = 1

x = cdf[:, n, k - 1, :] - cdf[:, n, k, :]

plt.figure()
for i in range(101):
    plt.plot(rs, x[i, :])

plt.xscale("log")

plt.show()
In [ ]:
files = knnreader.cross_files(7444, "/mnt/extraspace/rstiskalek/csiborg/knn/cross/")
In [ ]:
ks = [0, 1, 2, 3, 4, 5, 6, 7]
rs, cross, threshold = knnreader.read(files, ks, rmin=0.5)
In [ ]:
n = 0
k = 1

plt.figure()
for i in range(100):
    plt.plot(rs, cross[i, n, k - 1, :] - cross[i, n, k, :])

plt.xscale("log")
plt.axvline(2.65 / 0.705)
plt.show()
In [ ]:
"/mnt/extraspace/hdesmond/ramses_out_7444/output_00950/clump_00950.dat"
In [ ]:

In [ ]:

In [ ]:

In [ ]:

In [ ]:

In [ ]:

In [ ]:

In [ ]:
cat1 = csiborgtools.read.HaloCatalogue(7444, min_mass=1e13, max_dist=155 / 0.705)
cat2 = csiborgtools.read.HaloCatalogue(7468, min_mass=1e13, max_dist=155 / 0.705)
In [ ]:
knncdf = csiborgtools.match.kNN_CDF()


knn1 = NearestNeighbors()
knn1.fit(cat1.positions)

knn2 = NearestNeighbors()
knn2.fit(cat2.positions)

# rs, cdf = knncdf(knn, nneighbours=2, Rmax=155 / 0.705, rmin=0.01, rmax=100,
#                   nsamples=int(1e6), neval=int(1e4), random_state=42, batch_size=int(1e6))
In [ ]:
!ls /mnt/extraspace/rstiskalek/csiborg/knn/cross/knncdf_7444_7468.p
In [ ]:
from glob import glob
In [ ]:
files = glob("/mnt/extraspace/rstiskalek/csiborg/knn/cross/*")
In [ ]:

In [ ]:
cols = plt.rcParams["axes.prop_cycle"].by_key()["color"]

plt.figure()
for file in files:
    d = joblib.load(file)
    mask = d["rs"] > 0.1
    plt.plot(d["rs"][mask], d["corr_0"][0, mask], c=cols[0], lw=0.4)

plt.xscale("log")
plt.axvline(2.65 / 0.705, lw=0.8, c="red", ls="--")
# plt.yscale("log")

plt.show()
In [ ]:

In [ ]:
5500 / comb(5, 3)
In [ ]:
plt.figure()
plt.plot(d["rs"], d["corr_0"][1, :])
plt.plot(d["rs"], d["corr_1"][1, :])
plt.plot(d["rs"], d["corr_2"][1, :])

# plt.yscale("log")
# plt.xscale("log")
plt.show()
In [ ]:

In [ ]:
# rs, cdf = knncdf(knn1, nneighbours=2, Rmax=155 / 0.705, rmin=0.01, rmax=100,
#                  nsamples=int(1e6), neval=int(1e4), random_state=42, batch_size=int(1e6))

rs, cdf0, cdf1, joint_cdf = knncdf.joint(knn1, knn2, nneighbours=8, Rmax=155 / 0.705,
                                         rmin=0.01, rmax=100, nsamples=int(1e6), neval=int(1e4),
                                         random_state=42, batch_size=int(1e6))
In [ ]:
cdf0 = knncdf.clipped_cdf(cdf0)
cdf1 = knncdf.clipped_cdf(cdf1)
joint_cdf = knncdf.clipped_cdf(joint_cdf)
In [ ]:
corr = knncdf.joint_to_corr(cdf0, cdf1, joint_cdf)
In [ ]:
ics = [7444, 7468, 7492, 7516, 7540, 7564, 7588, 7612, 7636, 7660, 7684,
       7708, 7732, 7756, 7780, 7804, 7828, 7852, 7876, 7900, 7924, 7948,
       7972, 7996, 8020, 8044, 8068, 8092, 8116, 8140, 8164, 8188, 8212,
       8236, 8260, 8284, 8308, 8332, 8356, 8380, 8404, 8428, 8452, 8476,
       8500, 8524, 8548, 8572, 8596, 8620, 8644, 8668, 8692, 8716, 8740,
       8764, 8788, 8812, 8836, 8860, 8884, 8908, 8932, 8956, 8980, 9004,
       9028, 9052, 9076, 9100, 9124, 9148, 9172, 9196, 9220, 9244, 9268,
       9292, 9316, 9340, 9364, 9388, 9412, 9436, 9460, 9484, 9508, 9532,
       9556, 9580, 9604, 9628, 9652, 9676, 9700, 9724, 9748, 9772, 9796,
       9820, 9844]
In [ ]:
from scipy.special import comb

from itertools import combinations
# for subset in itertools.combinations(stuff, L):
In [ ]:
list(combinations(ics, 2))
In [ ]:

In [ ]:
comb()
In [ ]:

In [ ]:
plt.figure()

# plt.plot(rs, knncdf.peaked_cdf(cdf0[0, :]))
# plt.plot(rs, knncdf.peaked_cdf(cdf1[0, :]))
# plt.plot(rs, knncdf.peaked_cdf(joint_cdf[0, :]))
for i in range(8):
    plt.plot(rs, corr[i, :])


# plt.yscale("log")
# plt.xscale("log")
plt.axvline(2.65 / 0.705, c="red", ls="--")

plt.show()
In [ ]:

In [ ]:
dist1, dist2 = knncdf.joint(knn1, knn2, nneighbours=2, Rmax=155 / 0.705, rmin=0.01, rmax=100,
             nsamples=int(1e6), neval=int(1e4), random_state=42, batch_size=int(1e6))
In [ ]:

In [ ]:

In [ ]:

In [ ]:
plt.figure()
plt.plot(rs, knncdf.peaked_cdf(cdf[0, :]))

plt.yscale("log" )
plt.xscale("log")
plt.show()
In [ ]:
mask
In [ ]:

In [ ]:
dist
In [ ]:

In [ ]:

In [ ]:
m1 = (rs > 1) & (rs < 35)

fig, axs = plt.subplots(ncols=3, figsize=(6.4 * 1.5, 4.8), sharey=True)
fig.subplots_adjust(wspace=0)
for k in range(3):
    for n in range(len(ics)):
        m = m1 & (cdfs[n, k, :] > 1e-3)
        axs[k].plot(rs[m], cdfs[n, k, m], c="black", lw=0.05)

    axs[k].set_xscale("log")
    axs[k].set_yscale("log")
    axs[k].set_title(r"$k = {}$".format(k))
    axs[k].set_xlabel(r"$r~\left[\mathrm{Mpc}\right]$")

axs[0].set_ylabel(r"Peaked CDF")

plt.tight_layout(w_pad=0)
fig.savefig("../plots/peaked_cdf.png", dpi=450)
fig.show()
In [ ]:
m = (rs > 0.5) & (rs < 35)

fig, axs = plt.subplots(ncols=3, figsize=(6.4 * 1.5, 4.8), sharey=True)
fig.subplots_adjust(wspace=0)
for k in range(3):
    mu = np.nanmean(cdfs[:, k, :], axis=0)

    for n in range(len(ics)):
        axs[k].plot(rs[m], (cdfs[n, k, :] / mu)[m], c="black", lw=0.1)

    axs[k].set_ylim(0.5, 1.5)
    axs[k].axhline(1, ls="--", c="red", zorder=0)
    axs[k].axvline(2.65 / 0.705, ls="--", c="red", zorder=0)
    axs[k].set_xscale("log")
    axs[k].set_xlabel(r"$r~\left[\mathrm{Mpc}\right]$")
    axs[k].set_title(r"$k = {}$".format(k))
    
axs[0].set_ylabel(r"Relative peaked CDF")
plt.tight_layout(w_pad=0)
fig.savefig("../plots/peaked_cdf_ratios.png", dpi=450)
fig.show()
In [ ]:
plt.figure()
k = 2
mu = np.nanmean(cdfs[:, k, :], axis=0)
# plt.plot(rs, mu, c="black")
for i in range(len(ics)):
    plt.plot(rs, cdfs[i, k, :] / mu)


plt.ylim(0.75, 1.25)
plt.axhline(1, ls="--", c="black")
plt.xscale("log")
# plt.yscale("log")
plt.show()
In [ ]:

In [ ]:

In [ ]:

In [ ]:

In [ ]:
x.shape
In [ ]:

In [ ]:

In [ ]:
dist0, __ = knn0.kneighbors(X, 3)
distx, __ = knnx.kneighbors(X, 3)
In [ ]:
x0, y0 = knncdf.peaked_cdf_from_samples(dist0[:, 0], 0.5, 20, neval=10000)
xx, yx = knncdf.peaked_cdf_from_samples(distx[:, 0], 0.5, 20, neval=10000)
In [ ]:
distx[:, 0].min()
In [ ]:
plt.figure()
plt.plot(x0, y0)
plt.plot(xx, yx)

plt.yscale("log")
plt.xscale("log")
plt.show()
In [ ]:

In [ ]:
plt.figure()

for i in range(3):
    plt.plot(*knncdf.cdf_from_samples(dist0[:, i], 1, 25))
    plt.plot(*knncdf.cdf_from_samples(distx[:, i], 1, 25))

# plt.xlim(0.5, 25)

plt.yscale("log")
plt.xscale("log")
plt.xlabel(r"$r~\left[\mathrm{Mpc}\right]$")



plt.show()
In [ ]:

In [ ]:
x = dist[:, 0]
q = np.linspace(0, 100, int(x.size / 5))

p = np.percentile(x, q)
In [ ]:
y = np.sort(x)

yy = np.arange(y.size) / y.size
In [ ]:
plt.figure()
plt.plot(p, q / 100)

plt.plot(y, yy)

# plt.yscale("log")
plt.show()
In [ ]:

In [ ]:
plt.figure()
plt.hist(dist[:, 0], bins="auto", histtype="step")
plt.hist(dist[:, 1], bins="auto", histtype="step")
plt.hist(dist[:, 2], bins="auto", histtype="step")

plt.show()
In [ ]:

In [ ]:

In [ ]:
plt.figure()
plt.hist(cat0["dec"], bins="auto")

plt.show()
In [ ]:
gen = np.random.default_rng(22)
In [ ]:
gen.normal()
In [ ]:

In [ ]:
theta = np.linspace( t, np.pi, 100)

plt.figure()
plt.plot(theta, np.sin(theta))
plt.show()
In [ ]:

In [ ]:

In [ ]:
X = np.array([-3.9514747, -0.6966991,  2.97158]).reshape(1, -1)

X
In [ ]:
dist, indxs = knn0.kneighbors(X, n_neighbors=1)

dist, indxs
In [ ]:
cat0.positions[indxs]
In [ ]:

In [ ]:

In [ ]:

In [ ]:

In [ ]:

In [ ]:

In [ ]: