Add runtime address and port determination and share them via a file
Together with Slurm step node counts, this makes it possible to launch multiple training runs in one job
This commit is contained in:
parent
1818e11265
commit
b67079bf72
5 changed files with 60 additions and 43 deletions
|
@ -18,10 +18,6 @@ module load gcc python3
|
|||
#source $HOME/anaconda3/bin/activate torch
|
||||
|
||||
|
||||
export MASTER_ADDR=$HOSTNAME
|
||||
export MASTER_PORT=60606
|
||||
|
||||
|
||||
data_root_dir="/mnt/ceph/users/yinli/Quijote"
|
||||
|
||||
in_dir="linear"
|
||||
|
|
|
@ -18,10 +18,6 @@ module load gcc python3
|
|||
#source $HOME/anaconda3/bin/activate torch
|
||||
|
||||
|
||||
export MASTER_ADDR=$HOSTNAME
|
||||
export MASTER_PORT=60606
|
||||
|
||||
|
||||
data_root_dir="/mnt/ceph/users/yinli/Quijote"
|
||||
|
||||
in_dir="linear"
|
||||
|
|
|
@ -19,10 +19,6 @@ hostname; pwd; date
|
|||
source $HOME/anaconda3/bin/activate
|
||||
|
||||
|
||||
export MASTER_ADDR=$HOSTNAME
|
||||
export MASTER_PORT=60606
|
||||
|
||||
|
||||
data_root_dir="/scratch1/06431/yueyingn/dmo-50MPC-train"
|
||||
|
||||
in_dir="low-resl"
|
||||
|
|
|
@ -18,10 +18,6 @@ module load gcc python3
|
|||
#source $HOME/anaconda3/bin/activate torch
|
||||
|
||||
|
||||
export MASTER_ADDR=$HOSTNAME
|
||||
export MASTER_PORT=60606
|
||||
|
||||
|
||||
data_root_dir="/mnt/ceph/users/yinli/Quijote"
|
||||
|
||||
in_dir="linear"
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue