Fix slurm script for training on 2 nodes on Frontera
This commit is contained in:
parent
de71df51f5
commit
ec074748d0
@@ -4,18 +4,19 @@
|
|||||||
#SBATCH --output=%x-%j.out
|
#SBATCH --output=%x-%j.out
|
||||||
|
|
||||||
#SBATCH --partition=rtx
|
#SBATCH --partition=rtx
|
||||||
#SBATCH --gres=gpu:4
|
##SBATCH --gres=gpu:4
|
||||||
|
|
||||||
#SBATCH --exclusive
|
#SBATCH --exclusive
|
||||||
#SBATCH --nodes=1
|
#SBATCH --nodes=2
|
||||||
#SBATCH --time=7-00:00:00
|
#SBATCH --ntasks-per-node=1
|
||||||
|
#SBATCH --time=2-00:00:00
|
||||||
|
|
||||||
|
|
||||||
hostname; pwd; date
|
hostname; pwd; date
|
||||||
|
|
||||||
|
|
||||||
#module load gcc python3
|
#module load gcc python3
|
||||||
source $HOME/anaconda/bin/activate torch
|
source $HOME/anaconda3/bin/activate
|
||||||
|
|
||||||
|
|
||||||
export MASTER_ADDR=$HOSTNAME
|
export MASTER_ADDR=$HOSTNAME
|
||||||
@@ -27,8 +28,8 @@ data_root_dir="/scratch1/06431/yueyingn/dmo-50MPC-train"
|
|||||||
in_dir="low-resl"
|
in_dir="low-resl"
|
||||||
tgt_dir="high-resl"
|
tgt_dir="high-resl"
|
||||||
|
|
||||||
train_dirs="set[0-3]/output/PART_004"
|
train_dirs="set[0-7]/output/PART_004"
|
||||||
val_dirs="set4/output/PART_004"
|
#val_dirs="set4/output/PART_004"
|
||||||
|
|
||||||
in_files_1="disp.npy"
|
in_files_1="disp.npy"
|
||||||
in_files_2="vel.npy"
|
in_files_2="vel.npy"
|
||||||
@@ -39,13 +40,13 @@ tgt_files_2="vel.npy"
|
|||||||
srun m2m.py train \
|
srun m2m.py train \
|
||||||
--train-in-patterns "$data_root_dir/$in_dir/$train_dirs/$in_files_1,$data_root_dir/$in_dir/$train_dirs/$in_files_2" \
|
--train-in-patterns "$data_root_dir/$in_dir/$train_dirs/$in_files_1,$data_root_dir/$in_dir/$train_dirs/$in_files_2" \
|
||||||
--train-tgt-patterns "$data_root_dir/$tgt_dir/$train_dirs/$tgt_files_1,$data_root_dir/$tgt_dir/$train_dirs/$tgt_files_2" \
|
--train-tgt-patterns "$data_root_dir/$tgt_dir/$train_dirs/$tgt_files_1,$data_root_dir/$tgt_dir/$train_dirs/$tgt_files_2" \
|
||||||
--val-in-patterns "$data_root_dir/$in_dir/$val_dirs/$in_files_1,$data_root_dir/$in_dir/$val_dirs/$in_files_2" \
|
|
||||||
--val-tgt-patterns "$data_root_dir/$tgt_dir/$val_dirs/$tgt_files_1,$data_root_dir/$tgt_dir/$val_dirs/$tgt_files_2" \
|
|
||||||
--in-norms cosmology.dis,cosmology.vel --tgt-norms cosmology.dis,cosmology.vel --augment --crop 88 --pad 20 --scale-factor 2 \
|
--in-norms cosmology.dis,cosmology.vel --tgt-norms cosmology.dis,cosmology.vel --augment --crop 88 --pad 20 --scale-factor 2 \
|
||||||
--model VNet --adv-model PatchGAN --cgan \
|
--model VNet --adv-model PatchGAN --cgan \
|
||||||
--lr 0.0001 --adv-lr 0.0004 --batches 1 --loader-workers 0 \
|
--lr 0.0001 --adv-lr 0.0004 --batches 1 --loader-workers 0 \
|
||||||
--epochs 128 --seed $RANDOM \
|
--epochs 128 --seed $RANDOM \
|
||||||
--cache --div-data
|
--cache --div-data
|
||||||
|
# --val-in-patterns "$data_root_dir/$in_dir/$val_dirs/$in_files_1,$data_root_dir/$in_dir/$val_dirs/$in_files_2" \
|
||||||
|
# --val-tgt-patterns "$data_root_dir/$tgt_dir/$val_dirs/$tgt_files_1,$data_root_dir/$tgt_dir/$val_dirs/$tgt_files_2" \
|
||||||
# --load-state checkpoint.pth \
|
# --load-state checkpoint.pth \
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user