Remove old merger tree (#93)

* Edit docs * Delete merger tree files * Edit README * Edit docs
2025-06-28 02:41:11 +00:00 · 2023-12-07 14:45:06 +00:00 · 2023-12-07 14:45:06 +00:00 · 944fea5510
commit 944fea5510
parent e972f8e3f2
6 changed files with 7 additions and 1134 deletions
--- a/README.md
+++ b/README.md
@ -8,6 +8,7 @@ however with little effort it can support other simulations as well.
 ## TODO
 - [x] Prune old CSiBORG1 merger tree things.
 - [ ] Add full support for CSiBORG2 suite of simulations.
 - [ ] Add SPH field calculation from cosmotools.
--- a/csiborgtools/read/paths.py
+++ b/csiborgtools/read/paths.py
@ -12,7 +12,9 @@
 # You should have received a copy of the GNU General Public License along
 # with this program; if not, write to the Free Software Foundation, Inc.,
 # 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
-"""CSiBORG paths manager."""
+"""
 CSiBORG paths manager.
 """
 from glob import glob, iglob
 from os import makedirs
 from os.path import isdir, join
@ -312,26 +314,6 @@ class Paths:
            nsnap = str(nsnap).zfill(3)
            return join(simpath, f"snapdir_{nsnap}", f"snap_{nsnap}")
    def merger_tree_file(self, nsnap, nsim):
        """
        Path to the CSiBORG on-the-fly generated merger tree file.
        Parameters
        ----------
        nsnap : int
            Snapshot index.
        nsim : int
            IC realisation index.
        Returns
        -------
        str
        """
        nsim = str(nsim)
        nsnap = str(nsnap).zfill(5)
        return join(self.srcdir, f"ramses_out_{nsim}",
                    f"output_{nsnap}", f"mergertree_{nsnap}.dat")
    def processed_output(self, nsim, simname, halo_finder):
        """
        Path to the files containing all particles of a CSiBORG realisation at
@ -378,23 +360,6 @@ class Paths:
        try_create_directory(fdir)
        return join(fdir, f"phew_{str(nsim).zfill(5)}.hdf5")
    def processed_merger_tree(self, nsim):
        """
        Path to the files containing the processed original merger tree files.
        Parameters
        ----------
        nsim : int
            IC realisation index.
        Returns
        -------
        str
        """
        fdir = join(self.postdir, "processed_output")
        try_create_directory(fdir)
        return join(fdir, f"merger_{str(nsim).zfill(5)}.hdf5")
    def halomaker_particle_membership(self, nsnap, nsim, halo_finder):
        """
        Path to the HaloMaker particle membership file (CSiBORG only).
--- a/csiborgtools/read/readsim.py
+++ b/csiborgtools/read/readsim.py
@ -145,8 +145,6 @@ class CSiBORGReader(BaseReader):
    ----------
    paths : py:class`csiborgtools.read.Paths`
    """
    # _snapshot_cache = {}
    def __init__(self, paths):
        self.paths = paths
@ -433,37 +431,6 @@ class CSiBORGReader(BaseReader):
        return parent_arr, parent_mass
    def read_merger_tree(self, nsnap, nsim):
        """
        Read in the raw merger tree file.
        Parameters
        ----------
        nsnap : int
            Snapshot index.
        nsim : int
            IC realisation index.
        Returns
        -------
        data : 2-dimensional array
        """
        fname = self.paths.merger_tree_file(nsnap, nsim)
        # Do some checks if the file exists or is empty
        if not isfile(fname) or getsize(fname) == 0:
            raise FileExistsError(f"Merger file `{fname}` does not exist.")
        data = numpy.genfromtxt(fname)
        if data.ndim == 1:
            raise FileExistsError(f"Invalid merger file `{fname}`.")
        # Convert to Msun / h and cMpc / h but keep velocity in box units.
        data[:, 3] *= 2.6543271649678946e+19
        data[:, 5:8] *= 677.7
        return data
 ###############################################################################
 #                         Quijote particle reader                             #
--- a/csiborgtools/utils.py
+++ b/csiborgtools/utils.py
@ -12,7 +12,9 @@
 # You should have received a copy of the GNU General Public License along
 # with this program; if not, write to the Free Software Foundation, Inc.,
 # 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
-"""Collection of stand-off utility functions used in the scripts."""
+"""
 Collection of stand-off utility functions used in the scripts.
 """
 import numpy
 from numba import jit
 from datetime import datetime
--- a/scripts/mergertree_extract.py
+++ b/scripts/mergertree_extract.py
@ -1,979 +0,0 @@
 # Copyright (C) 2023 Mladen Ivkovic, Richard Stiskalek
 # This program is free software; you can redistribute it and/or modify it
 # under the terms of the GNU General Public License as published by the
 # Free Software Foundation; either version 3 of the License, or (at your
 # option) any later version.
 #
 # This program is distributed in the hope that it will be useful, but
 # WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
 # Public License for more details.
 #
 # You should have received a copy of the GNU General Public License along
 # with this program; if not, write to the Free Software Foundation, Inc.,
 # 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 import copy
 import os
 from os.path import exists, join
 from os import makedirs
 from sys import argv
 from datetime import datetime
 import numpy as np
 from joblib import dump, load
 from tqdm import trange
 errmsg = """
 ------------------------------------
    mergertree-extract.py
 ------------------------------------
 ---------------
    Usage
 ---------------
 This script extracts the masses of clumps and haloes written by the mergertree
 patch.
 It needs output_XXXXX/mergertree_XXXXX.txtYYYYY and
 output_XXXXX/clump_XXXXX.txtYYYYY files to work.
 You need to run it from the directory where the output_XXXXX directories are
 in.
 There are three working modes defined:
 1) do for one clump only.
    You need to provide the clump ID you want it done for.
    You can provide a starting directory, but by default the script will
    search for the directory where z = 0.
    run with `python3 mergertree-extract.py <clumpid> [--options] `
    this creates the file mergertree_XXXXX_halo-<halo-ID>.txt. Its contents are
    discussed below.
 2) do for one halo.
    You need to provide the halo ID you want it done for, and the flag
    -c or --children.
    The script will by itself find all the child clumps and walk through
    their main branches as well, and write them down.
    run with `python3 mergertree-extract.py <haloid> -c [--options]`
          or `python3 mergertree-extract.py <haloid> --children [--options]`
    this creates the hollowing files:
        - halo_hierarchy_XXXXX-<halo-ID>.txt
            contains the halo ID, how many children it has, and the children
            IDs
        - mergertree_XXXXX_halo-<halo-ID>.txt
            mergertree data for halo that you chose.
        - mergertree_XXXXX_subhalo-<child-ID>.txt
            mergertree data for subhalos of the halo you chose.  One file will
            be created for each subhalo.
        The contents of the mergertree_XXXXX* files are discussed below.
 3) do for all haloes
    The script will just walk off all haloes in the z = 0 directory. Note:
    Haloes, not clumps!
    run with `python3 mergertree-extract.py -a [--options]`
          or `python3 mergertree-extract.py --all [--options]`
    This will create the same type of files as in mode (2), just for all
    haloes.
 If only an integer is given as cmdline arg, mode (1) [one clump only] will be
 run. If no cmd line argument is given, mode (3) [--all] will be run.
 ---------------
    Output
 ---------------
 the mergertree_XXXXX* files have 6 columns:
 snapshot            The snapshot from which this data is taken from
 redshift            The redshift of that snapshot
 clump_ID            The clump ID of the clump at that snapshot
 mass                The mass of the clump at that snapshot, based on what's in
                    the output_XXXXX/mergertree_XXXXX.txtYYYYY files, not the
                    output_XXXXX/clump_XXXXX.txtYYYYY files.
 mass_from_mergers   how much mass has been merged into this clump in this
                    snapshot, i.e. the sum of all the clump masses that have
                    been found to merge with this clump at this snapshot. This
                    does not include the mass of clumps which only seem to
                    merge with this clump, but re-emerge later.
 mass_from_jumpers   The mass of all clumps that seem to merge with this clump,
                    but re-emerge at a later time.
 ----------------
    Options
 ----------------
 List of all flags:
 Running modes
    -a, --all:      make trees for all clumps in output where z = 0
    -c --children:  make trees for a halo and all its subhaloes. You need to
                    specify which halo via its halo ID.
    -h, --help:     print this help and exit.
 Options:
    --start-at=INT      don't start at z = 0 snapshot, but with the specified
                        directory output_00INT.
    --prefix=some/path/ path where you want your output written to.
    -v, --verbose:      be more verbose about what you're doing
 -----------------
  Requirements
 -----------------
 It needs output_XXXXX/mergertree_XXXXX.txtYYYYY and
 output_XXXXX/clump_XXXXX.txtYYYYY files to work, which are created using the
 mergertree patch in ramses.
 Also needs numpy.
 """
 ###############################################################################
 #                             Clump data                                      #
 ###############################################################################
 class ClumpData:
    """
    Data from clump_XXXXX.txt
    Parameters
    ----------
    par : params object
    """
    def __init__(self, par):
        self.clumpids = np.zeros(1)     # clump ID
        self.parent = np.zeros(1)       # parent ID
        self.level = np.zeros(1)        # clump level
    def read_clumpdata(self, par):
        """Reads in the clump data for the z = 0 directory."""
        if par.verbose:
            print("Reading clump data.")
        out = p.z0
        raw_data = [None for i in range(par.ncpu)]
        dirnrstr = str(par.outputnrs[out]).zfill(5)
        dirname = 'output_' + dirnrstr
        i = 0
        for cpu in range(1):
            fname = join(par.workdir, dirname, 'clump_' + dirnrstr + '.dat')
            new_data = np.loadtxt(fname, dtype='int', skiprows=1,
                                  usecols=[0, 1, 2])
            if new_data.ndim == 2:
                raw_data[i] = new_data
                i += 1
            elif new_data.shape[0] == 3:  # if only 1 row is present in file
                raw_data[i] = np.atleast_2d(new_data)
                i += 1
        fulldata = np.concatenate(raw_data[:i], axis=0)
        self.clumpids = fulldata[:, 0]
        self.level = fulldata[:, 1]
        self.parent = fulldata[:, 2]
    def cleanup_clumpdata(self, par, mtd):
        """
        The particle unbinding can remove entire clumps from the catalogue.
        If the option isn't set in the namelist, the clumpfinder output will
        still be made not based on the clumpfinder. If that is the case, the
        clumpfinder catalogue will contain clumps which the mergertree data
        doesn't have, leading to problems. So remove those here.
        """
        for i, c in enumerate(self.clumpids):
            if c not in mtd.descendants[par.z0]:
                self.clumpids[i] = 0
                self.level[i] = 0
                self.parent[i] = -1  # don't make it the same as clumpid
    def find_children(self, clumpid):
        """Find the children for given clump ID."""
        children = []
        last_added = [clumpid]
        loopcounter = 0
        while True:
            loopcounter += 1
            this_level_parents = copy.copy(last_added)
            children += this_level_parents
            last_added = []
            for i, cid in enumerate(self.clumpids):
                if self.parent[i] in this_level_parents and cid != clumpid:
                    last_added.append(cid)
            if len(last_added) == 0:
                break
            if loopcounter == 100:
                print("Finished 100 iterations, we shouldn't be this deep")
                break
        return children[1:]  # don't return top level parent
    def write_children(self, par, clumpid, children):
        """Write the children to file."""
        hfile = join(par.outdir, f"{par.halofilename}-{str(clumpid)}.txt")
        with open(hfile, 'w') as f:
            f.write("# {0:>18} {1:>18} {2:>18}\n".format("halo", "nr_of_children", "children"))  # noqa
            nc = len(children)
            dumpstring = "  {0:18d} {1:18d}".format(clumpid, nc)
            dumpstring = "".join([dumpstring] + [" {0:18d}".format(c) for c in children] + ['\n'])  # noqa
            f.write(dumpstring)
 ###############################################################################
 #                            Constants object                                 #
 ###############################################################################
 class Constants:
    """
    Class holding constants.
    """
    def __init__(self):
        self.Mpc = 3.086e24                 # cm
        self.M_Sol = 1.98855e33             # g
        self.Gyr = (24 * 3600 * 365 * 1e9)  # s
        self.G = 4.492e-15                  # Mpc^3/(M_sol Gyr^2)
        self.H0 = 100                      # km/s/Mpc
        self.omega_m = 0.307000011205673
        self.omega_l = 0.693000018596649
        self.omega_k = 0.0
        self.omega_b = 0.0
 ###############################################################################
 #                             Params object                                   #
 ###############################################################################
 class Params:
    """
    Global parameters to be stored
    """
    def __init__(self):
        # self.workdir = f"/mnt/extraspace/hdesmond/ramses_out_{self.nsim}"
        # self.outdir = f"/mnt/extraspace/rstiskalek/CSiBORG/cleaned_mtree/ramses_out_{self.nsim}"  # noqa
        # if not exists(self.outdir):
        #     makedirs(self.outdir)
        self.lastdir = ""               # last output_XXXXX directory
        self.lastdirnr = -1             # XXXX from lastdir
        self.ncpu = 1                   # Number of CPUs used
        self.noutput = 1                # how many output_XXXXX dirs exist
        self.nout = 1                   # how many outputs we're gonna deal with. (Some might not have merger tree data)  # noqa
        self.outputnrs = None           # numpy array of output numbers
        self.output_lowest = 0          # lowest snapshot number that we're dealing with (>= 1)  # noqa
        self.z0 = 0                     # index of z=0 snapshot (or whichever you want to start with)  # noqa
        # NOTE: params.nout will be defined such that you can easily loop
        self.verbose = False            # verbosity
        self.start_at = 0               # output dir to start with, if given
        self.output_prefix = ""         # user given prefix for output files
        self.outputfilename = ""        # output filename. Stores prefix/mergertree_XXXXX part of name only  # noqa
        self.halofilename = ""          # output filename for halo hierarchy. Stores prefix/halo_hierarchy_XXXXX part of filename only  # noqa
        self.one_halo_only = False      # do the tree for one clump only
        self.halo_and_children = False  # do the tree for one halo, including subhaloes  # noqa
        self.do_all = False             # do for all clumps at z=0 output
        self.clumpid = 0                # which clump ID to work for.
        self.nsim = None
        # Dictionnary of accepted keyword command line arguments
        self.accepted_flags = {
            '-a': self.set_do_all,
            '--all': self.set_do_all,
            '-r': self.set_halo_and_children,
            '--recursive': self.set_halo_and_children,
            '-c': self.set_halo_and_children,
            '--children': self.set_halo_and_children,
            '-h': self.get_help,
            '--help': self.get_help,
            '-v': self.set_verbose,
            '--verbose': self.set_verbose,
            }
        self.accepted_flags_with_args = {
            "--nsim": self.set_nsim,
            '--start-at': self.set_startnr,
            '--prefix': self.set_prefix,
            }
    # -----------------------------
    # Setter methods
    # -----------------------------
    def set_do_all(self):
        self.do_all = True
        return
    def set_halo_and_children(self):
        self.halo_and_children = True
        return
    def get_help(self):
        print(errmsg)
        quit()
        return
    def set_verbose(self):
        self.verbose = True
        return
    def set_startnr(self, arg):
        flag, startnr = arg.split("=")
        try:
            self.start_at = int(startnr)
        except ValueError:
            print("given value for --start-at=INT isn't an integer?")
    def set_prefix(self, arg):
        flag, prefix = arg.split("=")
        #  try:
        self.output_prefix = prefix
        try:
            os.makedirs(self.output_prefix)
        except FileExistsError:
            pass
        return
    def set_nsim(self, arg):
        flag, nsim = arg.split("=")
        try:
            self.nsim = int(nsim)
        except ValueError:
            print("given value for --nsim=INT isn't an integer?")
    def read_cmdlineargs(self):
        """
        Reads in the command line arguments and store them in the
        global_params object.
        """
        nargs = len(argv)
        i = 1  # first cmdlinearg is filename of this file, so skip it
        while i < nargs:
            arg = argv[i]
            arg = arg.strip()
            if arg in self.accepted_flags.keys():
                self.accepted_flags[arg]()
            else:
                for key in self.accepted_flags_with_args.keys():
                    if arg.startswith(key):
                        self.accepted_flags_with_args[key](arg)
                        break
                else:
                    try:
                        self.clumpid = int(arg)
                    except ValueError:
                        print(f"I didn't recognize the argument '{arg}'. Use "
                              "mergertre-extract.py -h or --help to print "
                              "help message.")
                        quit()
            i += 1
        if self.nsim is None:
            raise ValueError("nsim not set. Use --nsim=INT to set it.")
    @property
    def workdir(self):
        return f"/mnt/extraspace/hdesmond/ramses_out_{self.nsim}"
    @property
    def outdir(self):
        fname = f"/mnt/extraspace/rstiskalek/CSiBORG/cleaned_mtree/ramses_out_{self.nsim}"  # noqa
        if not exists(fname):
            makedirs(fname)
        return fname
    def get_output_info(self):
        """
        Read in the output info based on the files in the current working
        directory. Reads in last directory, ncpu, noutputs. Doesn't read
        infofiles.
        """
        # self.workdir = os.getcwd()
        filelist = os.listdir(self.workdir)
        outputlist = []
        for filename in filelist:
            if filename.startswith('output_'):
                outputlist.append(filename)
        if len(outputlist) < 1:
            print("I didn't find any output_XXXXX directories in current "
                  "working directory. Are you in the correct workdir? "
                  "Use mergertree-extract.py -h or --help to print help "
                  "message.")
            quit()
        outputlist.sort()
        self.lastdir = outputlist[-1]
        self.lastdirnr = int(self.lastdir[-5:])
        self.noutput = len(outputlist)
        if (self.start_at > 0):
            # check that directory exists
            startnrstr = str(self.start_at).zfill(5)
            if 'output_' + startnrstr not in outputlist:
                print("Didn't find specified starting directory "
                      f"output_{startnrstr} use mergertree-extract.py -h or "
                      "--help to print help message.")
                quit()
        # read ncpu from infofile in last output directory
        infofile = join(self.workdir, self.lastdir,
                        f"info_{self.lastdir[-5:]}.txt")
        with open(infofile, 'r') as f:
            ncpuline = f.readline()
            line = ncpuline.split()
            self.ncpu = int(line[-1])
    def setup_and_checks(self, sd):
        """
        Do checks and additional setups once you have all the cmd line args and
        output infos
        Parameters
        ----------
        sd: snapshotdata object
        """
        # set running mode
        if not self.do_all:
            if self.clumpid <= 0:
                print("No or wrong clump id given. Setting the --all mode.")
                self.set_do_all()
            else:
                if not self.halo_and_children:
                    self.one_halo_only = True
        # generate list of outputdirnumbers
        startnr = self.lastdirnr
        self.outputnrs = np.array(range(startnr, startnr - self.noutput, -1))
        # find starting output directory
        self.z0 = np.argmin(np.absolute(sd.redshift))
        if self.start_at > 0:
            # replace z0 dir with starting dir
            self.z0 = self.lastdirnr - self.start_at
        # generate output filename
        dirnrstr = str(self.outputnrs[self.z0]).zfill(5)
        fname = "mergertree_" + dirnrstr
        self.outputfilename = join(self.output_prefix, fname)
        # generate halo output filename
        fname = "halo_hierarchy_" + dirnrstr
        self.halofilename = join(self.output_prefix, fname)
        # rename output_prefix to something if it wasn't set
        if self.output_prefix == "":
            self.output_prefix = os.path.relpath(self.workdir)
        # find self.nout; i.e. how many outputs we are actually going to have
        for out in range(self.noutput - 1, -1, -1):
            dirnrstr = str(self.outputnrs[out]).zfill(5)
            mtreefile = join(self.workdir,
                             f"output_{dirnrstr}",
                             f"mergertree_{dirnrstr}.dat")
            if os.path.exists(mtreefile):
                print("Loading mergertree data from ", mtreefile)
                # if there is a file, this is lowest snapshot number directory
                # that we'll be dealing with, and hence will have the highest
                # index number in the arrays I'm using
                # NOTE: params.nout will be defined such that you can easily
                # loop for out in range(p.z0, p.nout)
                self.nout = out + 1
                break
    def print_params(self):
        """Prints out the parameters that are set."""
        if self.do_all:
            print("Working mode:             all clumps")
        else:
            if self.halo_and_children:
                print("Working mode:             halo", self.clumpid, "and its children")  # noqa
            else:
                print("Working mode:             clump ", self.clumpid)
        print("workdir:                 ", self.workdir)
        print("snapshot of tree root:   ", self.outputnrs[self.z0])
        print("p.one_halo_only          ", p.one_halo_only)
        print("p.do_all                 ", p.do_all)
        print("p.halo_and_children      ", p.halo_and_children)
        print("p.one_halo_only          ", p.one_halo_only)
 ###############################################################################
 #                             Merger tree data                                #
 ###############################################################################
 class MTreeData:
    """
    Merger tree data lists
    Parameters
    ----------
    par : params object
    """
    def __init__(self, par):
        self.progenitors = [np.zeros(1) for i in range(par.noutput)]            # progenitor IDs  # noqa
        self.descendants = [np.zeros(1) for i in range(par.noutput)]            # descendant IDs  # noqa
        self.progenitor_outputnrs = [np.zeros(1) for i in range(par.noutput)]   # snapshot number of progenitor  # noqa
        self.mass = [np.zeros(1) for i in range(par.noutput)]                   # descendant mass  # noqa
        self.mass_to_remove = [np.zeros(1) for i in range(par.noutput)]         # descendant mass  # noqa
    def read_mergertree_data(self, par, sd):
        """Reads in mergertree data."""
        if par.verbose:
            print("Reading in mergertree data")
        # Preparation
        # define new datatype for mergertree output
        mtree = np.dtype([('clump', 'i4'),
                          ('prog', 'i4'),
                          ('prog_outnr', 'i4'),
                          ('mass', 'f8'),
                          ('npart', 'f8'),
                          ('x', 'f8'),
                          ('y', 'f8'),
                          ('z', 'f8'),
                          ('vx', 'f8'),
                          ('vy', 'f8'),
                          ('vz', 'f8')
                          ])
        # ---------------------------
        # Loop over directories
        # ---------------------------
        startnr = par.lastdirnr
        # READ THE ONES BEFORE z0 TOO!
        for output in trange(par.nout, desc="Reading merger"):
            dirnr = str(startnr - output).zfill(5)
            srcdir = 'output_' + dirnr
            fnames = [srcdir + '/' + "mergertree_" + dirnr + '.dat']
            fnames[0] = join(par.workdir, fnames[0])
            datalist = [np.zeros((1, 3)) for i in range(par.ncpu)]
            i = 0
            nofile = 0
            for f in fnames:
                if os.path.exists(f):
                    datalist[i] = np.atleast_1d(np.genfromtxt(f, dtype=mtree,
                                                              skip_header=1))
                    i += 1
                else:
                    nofile += 1
            if nofile == p.ncpu:
                print("Didn't find any mergertree data in", srcdir)
            # ---------------------------------
            # Sort out data
            # ---------------------------------
            if i > 0:
                fulldata = np.concatenate(datalist[:i], axis=0)
                self.descendants[output] = fulldata[:]['clump']
                self.progenitors[output] = fulldata[:]['prog']
                self.progenitor_outputnrs[output] = fulldata[:]['prog_outnr']
                self.mass[output] = fulldata[:]['mass']
                #  self.npart[output] = fulldata[:]['npart']
                #  self.x[output] = fulldata[:]['x']
                #  self.y[output] = fulldata[:]['y']
                #  self.z[output] = fulldata[:]['z']
                #  self.vx[output] = fulldata[:]['vx']
                #  self.vy[output] = fulldata[:]['vy']
                #  self.vz[output] = fulldata[:]['vz']
        # --------------------------------------
        # Transform units to physical units
        # --------------------------------------
        # transform units to physical units
        for i in range(len(self.descendants)):
            self.mass[i] *= sd.unit_m[i]
            #  self.x[i] *= sd.unit_l[i] # only transform later when needed; Need to check for periodicity first!  # noqa
            #  self.y[i] *= sd.unit_l[i]
            #  self.z[i] *= sd.unit_l[i]
            #  self.vx[i] *= sd.unit_l[i]/sd.unit_t[i]
            #  self.vy[i] *= sd.unit_l[i]/sd.unit_t[i]
            #  self.vz[i] *= sd.unit_l[i]/sd.unit_t[i]
    def clean_up_jumpers(self, par):
        """
        Remove jumpers from the merger list. Take note of how much mass should
        be removed from the descendant because the jumper is to be removed.
        """
        # First initialize mass_to_remove arrays
        self.mass_to_remove = [np.zeros(self.descendants[out].shape)
                               for out in range(par.noutput)]
        nreplaced = 0
        for out in trange(par.nout + par.z0 - 1, desc="Cleaning jumpers"):
            for i, pr in enumerate(self.progenitors[out]):
                if pr < 0:
                    # Subtract 1 here from snapind:
                    # progenitor_outputnrs gives the snapshot number where the
                    # jumper was a descendant for the last time
                    # so you need to overwrite the merging one snapshot later,
                    # where the clump is the progenitor
                    snapind = get_snap_ind(p, self.progenitor_outputnrs[out][i]) - 1  # noqa
                    # NOTE bottleneck
                    jumpind = self.progenitors[snapind] == -pr
                    # NOTE bottleneck
                    # find index of descendant into which this clump will
                    # appearingly merge into
                    mergerind = self.descendants[snapind] == - self.descendants[snapind][jumpind]  # noqa
                    # overwrite merging event so it won't count
                    self.descendants[snapind][jumpind] = 0
                    # find mass of jumper in previous snapshot
                    jumpmassind = self.descendants[snapind + 1] == -pr
                    # note how much mass might need to be removed for whatever
                    # you need it
                    self.mass_to_remove[snapind][mergerind] += self.mass[snapind + 1][jumpmassind]  # noqa
                    nreplaced += 1
        print("Cleaned out", nreplaced, "jumpers")
    def get_tree(self, par, tree, sd, clumpid):
        """Follow the main branch down."""
        if par.verbose:
            print("Computing tree for clump", clumpid)
        dind = self.descendants[par.z0] == clumpid
        desc_snap_ind = p.z0
        desc = self.descendants[p.z0][dind]
        prog = self.progenitors[p.z0][dind]
        def get_prog_indices(prog, desc_snap_ind):
            """
            Compute snapshot index at which given progenitor has been a
            descendant and its index in the array
            prog:           progenitor ID
            desc_snap_ind:  snapshot index of descendant of given prog
            returns:
            p_snap_ind:     snapshot index of the progenitor
            pind:           progenitor index (np.array mask) of progenitor in
                            array where it is descendant
            """
            if prog > 0:  # if progenitor isn't jumper
                # find progenitor's index in previous snapshot
                p_snap_ind = desc_snap_ind + 1
                pind = self.descendants[p_snap_ind] == prog
            elif prog < 0:
                p_snap_ind = get_snap_ind(
                    par, self.progenitor_outputnrs[desc_snap_ind][dind])
                pind = self.descendants[p_snap_ind] == -prog
            return p_snap_ind, pind
        while True:
            # first calculate merger mass
            mergers = self.descendants[desc_snap_ind] == -desc
            mergermass = 0.0
            if mergers.any():
                for m in self.progenitors[desc_snap_ind][mergers]:
                    # find mass of merger. That's been written down at the
                    # place where merger was descendant.
                    m_snap_ind, mergerind = get_prog_indices(m, desc_snap_ind)
                    mergermass += self.mass[m_snap_ind][mergerind]
            # add the descendant to the tree
            tree.add_snap(par.outputnrs[desc_snap_ind],
                          sd.redshift[desc_snap_ind], desc,
                          self.mass[desc_snap_ind][dind], mergermass,
                          self.mass_to_remove[desc_snap_ind][dind])
            # now descend down the main branch
            if prog != 0:
                p_snap_ind, pind = get_prog_indices(prog, desc_snap_ind)
            else:
                # stop at progenitor = 0
                break
            # prepare for next round
            desc_snap_ind = p_snap_ind
            dind = pind
            desc = abs(prog)
            prog = self.progenitors[p_snap_ind][pind]
 ###############################################################################
 #                             Snapshot data                                   #
 ###############################################################################
 class SnapshotData():
    """Snapshot specific data"""
    def __init__(self, par):
        # read in
        self.aexp = np.zeros(par.noutput)
        self.unit_l = np.zeros(par.noutput)
        self.unit_m = np.zeros(par.noutput)
        self.unit_t = np.zeros(par.noutput)
        self.unit_dens = np.zeros(par.noutput)
        # to be computed
        self.redshift = np.zeros(par.noutput)  # z
    def read_infofiles(self, par, const):
        """Read the info_XXXXX.txt files."""
        if par.verbose:
            print("Reading info files.")
        startnr = par.lastdirnr
        for output in range(p.noutput):
            # Start with last directory (e.g. output_00060),
            # work your way to first directory (e.g. output_00001)
            # p.z0 isn't decided yet, so just read in everything here.
            dirnr = str(startnr - output).zfill(5)
            srcdir = 'output_' + dirnr
            try:
                # ------------------------------------------------------
                # get time, redshift, and units even for output_00001
                # ------------------------------------------------------
                fileloc = srcdir + '/info_' + dirnr + '.txt'
                fileloc = join(par.workdir, fileloc)
                infofile = open(fileloc)
                for i in range(9):
                    infofile.readline()  # skip first 9 lines
                # get expansion factor
                aline = infofile.readline()
                astring, equal, aval = aline.partition("=")
                afloat = float(aval)
                sd.aexp[output] = afloat
                for i in range(5):
                    infofile.readline()  # skip 5 lines
                # get unit_l
                unitline = infofile.readline()
                unitstring, equal, unitval = unitline.partition("=")
                unitfloat = float(unitval)
                sd.unit_l[output] = unitfloat
                # get unit_dens
                unitline = infofile.readline()
                unitstring, equal, unitval = unitline.partition("=")
                unitfloat = float(unitval)
                sd.unit_dens[output] = unitfloat
                # get unit_t
                unitline = infofile.readline()
                unitstring, equal, unitval = unitline.partition("=")
                unitfloat = float(unitval)
                sd.unit_t[output] = unitfloat
                infofile.close()
            except IOError:  # If file doesn't exist
                print("Didn't find any info data in ", srcdir)
                break
        self.unit_m = self.unit_dens * self.unit_l ** 3 / const.M_Sol
        self.unit_l /= const.Mpc
        self.unit_t /= const.Gyr
        self.redshift = 1. / self.aexp - 1
 ###############################################################################
 #                             Tree object                                     #
 ###############################################################################
 class Tree:
    """
    Holds tree result data. It's not really a tree, it's just the values along
    the main branch, but let's call it a tree anyway. Sue me.
    Parameters
    ----------
    nelements : int
        Estimate for how many snapshots you need to allocate space for.
    """
    def __init__(self, nelements):
        self.n = 0                                              # number of elements in tree  # noqa
        self.snapshotnr = -np.ones(nelements, dtype=int)        # snapshot number of array values  # noqa
        self.redshift = -np.ones(nelements, dtype=float)        # redshift at that snapshot  # noqa
        self.clumpids = -np.ones(nelements, dtype=int)          # clump id of halo in that snapshot  # noqa
        self.mass = np.zeros(nelements, dtype=float)            # mass at that snapshot  # noqa
        self.mergermass = np.zeros(nelements, dtype=float)      # sum of mass of swallowed up clumps  # noqa
        self.mass_to_remove = np.zeros(nelements, dtype=float)  # sum of mass of swallowed up clumps  # noqa
    def add_snap(self, nr, z, ID, m, mm, mdel):
        """Add new result."""
        n = self.n
        self.snapshotnr[n] = nr
        self.redshift[n] = z
        self.clumpids[n] = ID
        self.mass[n] = m
        self.mergermass[n] = mm
        self.mass_to_remove[n] = mdel
        self.n += 1
    def write_tree(self, par, case='halo'):
        """Write the results to file."""
        resfile = join(
            par.outdir,
            f"{par.outputfilename}_{case}-{str(self.clumpids[0])}.txt")
        with open(resfile, 'w') as f:
            f.write('# {0:>12} {1:>12} {2:>16} {3:>18} {4:>18} {5:>18}\n'.format(  # noqa
                "snapshot", "redshift", "clump_ID", "mass[M_sol]",
                "mass_from_mergers", "mass_from_jumpers"))
            for i in range(self.n):
                f.write('  {0:12d} {1:12.4f} {2:16d} {3:18.6e} {4:18.6e} {5:18.6e}\n'.format(  # noqa
                    self.snapshotnr[i], self.redshift[i], self.clumpids[i],
                    self.mass[i], self.mergermass[i], self.mass_to_remove[i]))
        return
 def get_snap_ind(p, snap):
    """
    Computes the snapshot index in mtreedata/halodata/snapshotdata arrays for a
    given snapshot number snap
    """
    return (p.noutput - snap).item()
 if __name__ == '__main__':
    p = Params()
    c = Constants()
    # Read cmdlineargs, available output, get global parameters
    p.read_cmdlineargs()
    p.get_output_info()
    sd = SnapshotData(p)
    sd.read_infofiles(p, c)
    # finish setup
    p.setup_and_checks(sd)
    p.print_params()
    # now read in mergertree data
    fname = join(p.outdir, "mtreedata.p")
    if exists(fname):
        print(f"{datetime.now()}: loading mergertree data from `{fname}`.",
              flush=True)
        mtd = load(fname)
        print(f"{datetime.now()}: finished loading mergertree data from `{fname}`.",  # noqa
              flush=True)
    else:
        print("Generating mergertree data.", flush=True)
        mtd = MTreeData(p)
        mtd.read_mergertree_data(p, sd)
        # clean up jumpers
        mtd.clean_up_jumpers(p)
        print("Saving mergertree data.", flush=True)
        dump(mtd, fname)
    # read in clump data if required
    if p.do_all or p.halo_and_children:
        cd = ClumpData(p)
        cd.read_clumpdata(p)
        # clean up halo catalogue
        cd.cleanup_clumpdata(p, mtd)
        # find children, and write them down
        if p.verbose:
            print("Searching for child clumps.")
        if p.halo_and_children:
            children = cd.find_children(p.clumpid)
            cd.write_children(p, p.clumpid, children)
        if p.do_all:
            is_halo = cd.clumpids == cd.parent
            childlist = [None for c in cd.clumpids[is_halo]]
            for i, halo in enumerate(cd.clumpids[is_halo]):
                children = cd.find_children(halo)
                cd.write_children(p, halo, children)
                childlist[i] = children
    # finally, get the bloody tree
    if p.one_halo_only:
        newtree = Tree(p.nout)
        mtd.get_tree(p, newtree, sd, p.clumpid)
        newtree.write_tree(p, 'halo')
    if p.halo_and_children:
        newtree = Tree(p.nout)
        mtd.get_tree(p, newtree, sd, p.clumpid)
        newtree.write_tree(p, 'halo')
        for c in children:
            newtree = Tree(p.nout)
            mtd.get_tree(p, newtree, sd, c)
            newtree.write_tree(p, 'subhalo')
    if p.do_all:
        for i, halo in enumerate(cd.clumpids[is_halo]):
            newtree = Tree(p.nout)
            mtd.get_tree(p, newtree, sd, halo)
            newtree.write_tree(p, 'halo')
            for c in childlist[i]:
                newtree = Tree(p.nout)
                mtd.get_tree(p, newtree, sd, c)
                newtree.write_tree(p, 'subhalo')
    print('Finished.')
--- a/scripts/process_snapshot.py
+++ b/scripts/process_snapshot.py
@ -337,78 +337,6 @@ def make_phew_halo_catalogue(nsim, verbose):
        f.close()
 def make_merger_tree_file(nsim, verbose):
    """
    Process the `.dat` merger tree files and dump them into a HDF5 file.
    """
    paths = csiborgtools.read.Paths(**csiborgtools.paths_glamdring)
    reader = csiborgtools.read.CSiBORGReader(paths)
    snaps = paths.get_snapshots(nsim, "csiborg")
    fname = paths.processed_merger_tree(nsim)
    with h5py.File(fname, "w") as f:
        f.close()
    for nsnap in tqdm(snaps, desc="Loading merger files",
                      disable=not verbose):
        try:
            data = reader.read_merger_tree(nsnap, nsim)
        except FileExistsError:
            continue
        with h5py.File(fname, "r+") as f:
            grp = f.create_group(str(nsnap))
            grp.create_dataset("clump",
                               data=data[:, 0].astype(numpy.int32))
            grp.create_dataset("progenitor",
                               data=data[:, 1].astype(numpy.int32))
            grp.create_dataset("progenitor_outputnr",
                               data=data[:, 2].astype(numpy.int32))
            grp.create_dataset("desc_mass",
                               data=data[:, 3].astype(numpy.float32))
            grp.create_dataset("desc_npart",
                               data=data[:, 4].astype(numpy.int32))
            grp.create_dataset("desc_pos",
                               data=data[:, 5:8].astype(numpy.float32))
            grp.create_dataset("desc_vel",
                               data=data[:, 8:11].astype(numpy.float32))
            f.close()
 def append_merger_tree_mass_to_phew_catalogue(nsim, verbose):
    """
    Append mass of haloes from mergertree files to the PHEW catalogue. The
    difference between this and the PHEW value is that the latter is written
    before unbinding is performed.
    Note that currently only does this for the highest snapshot.
    """
    paths = csiborgtools.read.Paths(**csiborgtools.paths_glamdring)
    snapshots = paths.get_snapshots(nsim, "csiborg")
    merger_reader = csiborgtools.read.MergerReader(nsim, paths)
    for nsnap in tqdm(snapshots, disable=not verbose, desc="Snapshot"):
        # TODO do this for all later
        if nsnap < 930:
            continue
        try:
            phewcat = csiborgtools.read.CSiBORGPHEWCatalogue(nsnap, nsim,
                                                             paths)
        except ValueError:
            phewcat.close()
            continue
        mergertree_mass = merger_reader.match_mass_to_phewcat(phewcat)
        phewcat.close()
        fname = paths.processed_phew(nsim)
        with h5py.File(fname, "r+") as f:
            grp = f[str(nsnap)]
            grp.create_dataset("mergertree_mass_new", data=mergertree_mass)
            f.close()
 def main(nsim, args):
    if args.make_final:
        process_snapshot(nsim, args.simname, args.halofinder, True)
@ -420,12 +348,6 @@ def main(nsim, args):
    if args.make_phew:
        make_phew_halo_catalogue(nsim, True)
    if args.make_merger:
        make_merger_tree_file(nsim, True)
    if args.append_merger_mass:
        append_merger_tree_mass_to_phew_catalogue(nsim, True)
 if __name__ == "__main__":
    parser = ArgumentParser()
@ -441,11 +363,6 @@ if __name__ == "__main__":
                        help="Process the initial snapshot.")
    parser.add_argument("--make_phew", action="store_true", default=False,
                        help="Process the PHEW halo catalogue.")
    parser.add_argument("--make_merger", action="store_true", default=False,
                        help="Process the merger tree files.")
    parser.add_argument("--append_merger_mass", action="store_true",
                        default=False,
                        help="Append the merger tree mass to the PHEW cat.")
    args = parser.parse_args()
    paths = csiborgtools.read.Paths(**csiborgtools.paths_glamdring)