map2map/scripts/dis2den.slurm
Commit b67079bf72 (Yin Li, 2020-02-13): add runtime address and port determination and share them via a file. Together with SLURM step node counts, this makes it possible to launch multiple trainings in one job.
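The commit note above refers to picking a rendezvous address and port at run time and publishing them through a file, so that concurrent training steps do not collide. A minimal sketch of that idea, assuming bash and a shared filesystem (an illustration only, not the repo's actual implementation; the file name dist_addr.$SLURM_JOB_ID is hypothetical):

# Use the first allocated node as the master and ask the OS for a free port,
# then publish both through a file that the other ranks can read back.
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
MASTER_PORT=$(python3 -c 'import socket; s = socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')
echo "$MASTER_ADDR $MASTER_PORT" > "dist_addr.$SLURM_JOB_ID"

Binding to port 0 lets the OS choose a free port; note there is a small race window between the probe closing the socket and the training processes binding it.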


#!/bin/bash
#SBATCH --job-name=dis2den
#SBATCH --output=%x-%j.out
#SBATCH --partition=gpu
#SBATCH --gres=gpu:v100-32gb:4
#SBATCH --exclusive
#SBATCH --nodes=4
#SBATCH --time=7-00:00:00

hostname; pwd; date

module load gcc python3
#source $HOME/anaconda3/bin/activate torch

data_root_dir="/mnt/ceph/users/yinli/Quijote"

in_dir="linear"
tgt_dir="nonlin"

# split by realization number: names ending in 0-8 train, names ending in
# 09-89 validate, and names ending in 99 match neither pattern (held out)
train_dirs="*[0-8]"
val_dirs="*[0-8]9"

in_files="dis.npy"
tgt_files="den.npy"

srun m2m.py train \
    --train-in-patterns "$data_root_dir/$in_dir/$train_dirs/$in_files" \
    --train-tgt-patterns "$data_root_dir/$tgt_dir/$train_dirs/$tgt_files" \
    --val-in-patterns "$data_root_dir/$in_dir/$val_dirs/$in_files" \
    --val-tgt-patterns "$data_root_dir/$tgt_dir/$val_dirs/$tgt_files" \
    --in-norms cosmology.dis --tgt-norms torch.log1p --augment --crop 128 --pad 20 \
    --model UNet \
    --lr 0.0001 --batches 1 --loader-workers 0 \
    --epochs 1024 --seed $RANDOM \
    --cache --div-data
    # to resume from a checkpoint, splice this flag into the command above:
    # --load-state checkpoint.pth \

date
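
With the address and port shared per step, one allocation can host several trainings at once, as the commit note says. A hypothetical sketch (the "..." stands for the flag list shown above; none of this is taken from the repo):

# Split the 4-node allocation into two 2-node job steps running concurrently.
srun --nodes=2 m2m.py train ... &   # first experiment on two nodes
srun --nodes=2 m2m.py train ... &   # second experiment on the other two
wait                                # block until both steps finish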