Fix slurm script for training on 2 nodes on Frontera
This commit is contained in:
parent
de71df51f5
commit
ec074748d0
@ -4,18 +4,19 @@
|
||||
# Slurm batch directives for this job (output file, partition, GPUs, node
# exclusivity, node/task counts, wall-time limit).
# NOTE(review): this text is a rendered diff, so superseded and replacement
# values both appear below (--nodes=1 / --nodes=2, and two --time limits);
# presumably the later value of each pair is the one the commit keeps --
# confirm against the real script.
# The doubled '#' on the second --gres line means Slurm no longer parses it
# as a directive, i.e. the explicit GPU request is switched off there.
#SBATCH --output=%x-%j.out
|
||||
|
||||
#SBATCH --partition=rtx
|
||||
#SBATCH --gres=gpu:4
|
||||
##SBATCH --gres=gpu:4
|
||||
|
||||
#SBATCH --exclusive
|
||||
#SBATCH --nodes=1
|
||||
#SBATCH --time=7-00:00:00
|
||||
#SBATCH --nodes=2
|
||||
#SBATCH --ntasks-per-node=1
|
||||
#SBATCH --time=2-00:00:00
|
||||
|
||||
|
||||
# Print the node name, the current working directory and the current
# date/time to stdout, as a record of where and when the job started.
hostname; pwd; date
|
||||
|
||||
|
||||
#module load gcc python3
|
||||
# Activate the Anaconda Python environment by sourcing its `activate` script.
# NOTE(review): two variants are shown because this is a diff rendering --
# one under $HOME/anaconda selecting the "torch" env, and one under
# $HOME/anaconda3 with no env name (the default env). Presumably the
# anaconda3 line is the replacement -- confirm.
source $HOME/anaconda/bin/activate torch
|
||||
source $HOME/anaconda3/bin/activate
|
||||
|
||||
|
||||
# Export the batch shell's host name as MASTER_ADDR so child processes see it.
# NOTE(review): presumably the rendezvous address for the multi-node training
# launched through srun -- confirm that m2m.py reads it, and that the host
# running this batch script is the node the other tasks should connect to.
export MASTER_ADDR=$HOSTNAME
|
||||
@ -27,8 +28,8 @@ data_root_dir="/scratch1/06431/yueyingn/dmo-50MPC-train"
|
||||
# Path fragments that the srun command joins as
#   $data_root_dir/<in_dir|tgt_dir>/<train_dirs|val_dirs>/<file>
# to build the input and target file patterns.
# The [0-3] / [0-7] bracket ranges are used inside double quotes in that
# command, so the shell does not expand them; presumably m2m.py globs them
# itself -- confirm.
# NOTE(review): diff rendering -- train_dirs appears twice (set[0-3], then
# set[0-7]) and val_dirs appears once active and once commented out;
# presumably the later form of each is the one the commit keeps -- confirm.
in_dir="low-resl"
|
||||
tgt_dir="high-resl"
|
||||
|
||||
train_dirs="set[0-3]/output/PART_004"
|
||||
val_dirs="set4/output/PART_004"
|
||||
train_dirs="set[0-7]/output/PART_004"
|
||||
#val_dirs="set4/output/PART_004"
|
||||
|
||||
in_files_1="disp.npy"
|
||||
in_files_2="vel.npy"
|
||||
@ -39,13 +40,13 @@ tgt_files_2="vel.npy"
|
||||
# Launch m2m.py in "train" mode through srun.
# Each --*-patterns argument is one quoted string holding two comma-separated
# paths (first file kind, then second) built from the path variables.
# $RANDOM is expanded once by this shell before srun runs, so every task
# launched by srun receives the same --seed value.
# NOTE(review): diff rendering -- the active --val-in-patterns /
# --val-tgt-patterns lines here and their commented-out copies that follow
# the --cache --div-data line are presumably the removed and added sides of
# the same change (validation switched off) -- confirm.
srun m2m.py train \
|
||||
--train-in-patterns "$data_root_dir/$in_dir/$train_dirs/$in_files_1,$data_root_dir/$in_dir/$train_dirs/$in_files_2" \
|
||||
--train-tgt-patterns "$data_root_dir/$tgt_dir/$train_dirs/$tgt_files_1,$data_root_dir/$tgt_dir/$train_dirs/$tgt_files_2" \
|
||||
--val-in-patterns "$data_root_dir/$in_dir/$val_dirs/$in_files_1,$data_root_dir/$in_dir/$val_dirs/$in_files_2" \
|
||||
--val-tgt-patterns "$data_root_dir/$tgt_dir/$val_dirs/$tgt_files_1,$data_root_dir/$tgt_dir/$val_dirs/$tgt_files_2" \
|
||||
--in-norms cosmology.dis,cosmology.vel --tgt-norms cosmology.dis,cosmology.vel --augment --crop 88 --pad 20 --scale-factor 2 \
|
||||
--model VNet --adv-model PatchGAN --cgan \
|
||||
--lr 0.0001 --adv-lr 0.0004 --batches 1 --loader-workers 0 \
|
||||
--epochs 128 --seed $RANDOM \
|
||||
--cache --div-data
|
||||
# --val-in-patterns "$data_root_dir/$in_dir/$val_dirs/$in_files_1,$data_root_dir/$in_dir/$val_dirs/$in_files_2" \
|
||||
# --val-tgt-patterns "$data_root_dir/$tgt_dir/$val_dirs/$tgt_files_1,$data_root_dir/$tgt_dir/$val_dirs/$tgt_files_2" \
|
||||
# --load-state checkpoint.pth \
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user