Fix slurm script for training on 2 nodes on Frontera

This commit is contained in:
Yin Li 2020-01-27 11:15:46 -05:00
parent de71df51f5
commit ec074748d0

View File

@@ -4,18 +4,19 @@
#SBATCH --output=%x-%j.out
#SBATCH --partition=rtx
#SBATCH --gres=gpu:4
##SBATCH --gres=gpu:4
#SBATCH --exclusive
#SBATCH --nodes=1
#SBATCH --time=7-00:00:00
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --time=2-00:00:00
hostname; pwd; date
#module load gcc python3
source $HOME/anaconda/bin/activate torch
source $HOME/anaconda3/bin/activate
export MASTER_ADDR=$HOSTNAME
@@ -27,8 +28,8 @@ data_root_dir="/scratch1/06431/yueyingn/dmo-50MPC-train"
in_dir="low-resl"
tgt_dir="high-resl"
train_dirs="set[0-3]/output/PART_004"
val_dirs="set4/output/PART_004"
train_dirs="set[0-7]/output/PART_004"
#val_dirs="set4/output/PART_004"
in_files_1="disp.npy"
in_files_2="vel.npy"
@@ -39,13 +40,13 @@ tgt_files_2="vel.npy"
srun m2m.py train \
--train-in-patterns "$data_root_dir/$in_dir/$train_dirs/$in_files_1,$data_root_dir/$in_dir/$train_dirs/$in_files_2" \
--train-tgt-patterns "$data_root_dir/$tgt_dir/$train_dirs/$tgt_files_1,$data_root_dir/$tgt_dir/$train_dirs/$tgt_files_2" \
--val-in-patterns "$data_root_dir/$in_dir/$val_dirs/$in_files_1,$data_root_dir/$in_dir/$val_dirs/$in_files_2" \
--val-tgt-patterns "$data_root_dir/$tgt_dir/$val_dirs/$tgt_files_1,$data_root_dir/$tgt_dir/$val_dirs/$tgt_files_2" \
--in-norms cosmology.dis,cosmology.vel --tgt-norms cosmology.dis,cosmology.vel --augment --crop 88 --pad 20 --scale-factor 2 \
--model VNet --adv-model PatchGAN --cgan \
--lr 0.0001 --adv-lr 0.0004 --batches 1 --loader-workers 0 \
--epochs 128 --seed $RANDOM \
--cache --div-data
# --val-in-patterns "$data_root_dir/$in_dir/$val_dirs/$in_files_1,$data_root_dir/$in_dir/$val_dirs/$in_files_2" \
# --val-tgt-patterns "$data_root_dir/$tgt_dir/$val_dirs/$tgt_files_1,$data_root_dir/$tgt_dir/$val_dirs/$tgt_files_2" \
# --load-state checkpoint.pth \