Add runtime address and port determination and share them via file

Together with SLURM step node counts, this makes it possible to launch
multiple training runs in one job.
Author: Yin Li
Date:   2020-02-13 19:56:54 -06:00
parent 1818e11265
commit b67079bf72

5 changed files with 60 additions and 43 deletions
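Why these scripts changed: a hard-coded MASTER_ADDR/MASTER_PORT pair means two training runs sharing one SLURM job would collide on the same rendezvous endpoint, so both values are now determined at runtime and shared via a file. Below is a minimal sketch of that idea in Python; the function name init_dist, the file name dist_addr, and the polling loop are illustrative assumptions, not necessarily this commit's exact code.

    import os
    import socket
    import time

    import torch.distributed as dist

    def init_dist(rank, world_size, dist_file='dist_addr'):
        if rank == 0:
            addr = socket.gethostname()
            with socket.socket() as s:
                s.bind((addr, 0))       # port 0: let the OS pick a free port
                _, port = s.getsockname()
            dist_addr = 'tcp://{}:{}'.format(addr, port)
            with open(dist_file, 'w') as f:
                f.write(dist_addr)      # publish the endpoint for the others
        else:
            while not os.path.exists(dist_file):
                time.sleep(1)           # wait for rank 0 to publish
            with open(dist_file, 'r') as f:
                dist_addr = f.read()
        dist.init_process_group('nccl', init_method=dist_addr,
                                world_size=world_size, rank=rank)
        if rank == 0:
            os.remove(dist_file)        # all ranks have joined; clean up

Binding a socket to port 0 lets the OS pick any free port, so no port number needs to be reserved in the batch scripts, and the scripts below can simply drop their exports.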


@@ -18,10 +18,6 @@ module load gcc python3
 #source $HOME/anaconda3/bin/activate torch
-export MASTER_ADDR=$HOSTNAME
-export MASTER_PORT=60606
 data_root_dir="/mnt/ceph/users/yinli/Quijote"
 in_dir="linear"


@@ -18,10 +18,6 @@ module load gcc python3
 #source $HOME/anaconda3/bin/activate torch
-export MASTER_ADDR=$HOSTNAME
-export MASTER_PORT=60606
 data_root_dir="/mnt/ceph/users/yinli/Quijote"
 in_dir="linear"


@@ -19,10 +19,6 @@ hostname; pwd; date
 source $HOME/anaconda3/bin/activate
-export MASTER_ADDR=$HOSTNAME
-export MASTER_PORT=60606
 data_root_dir="/scratch1/06431/yueyingn/dmo-50MPC-train"
 in_dir="low-resl"


@@ -18,10 +18,6 @@ module load gcc python3
 #source $HOME/anaconda3/bin/activate torch
-export MASTER_ADDR=$HOSTNAME
-export MASTER_PORT=60606
 data_root_dir="/mnt/ceph/users/yinli/Quijote"
 in_dir="linear"
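The "SLURM step node counts" part of the commit message suggests sizing each process group from the current srun step rather than from the whole allocation, so that several steps in one job each form an independent group. A hypothetical sketch, assuming SLURM's per-step environment variable SLURM_STEP_NUM_NODES and one process per GPU:

    import os

    import torch

    def step_world_size():
        # SLURM_STEP_NUM_NODES counts only the nodes of the current srun
        # step; SLURM_JOB_NUM_NODES would count the whole allocation.
        nodes = int(os.environ['SLURM_STEP_NUM_NODES'])
        gpus_per_node = torch.cuda.device_count()
        return nodes * gpus_per_node

Two runs could then be launched from one batch script as background steps, e.g. srun -N 2 ... & srun -N 2 ... & wait, provided each step writes its rendezvous file under its own working directory so the files do not clash.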