Add synchronized random seed to training
parent 11c9caa1e2 · commit f64b1e42e9
@@ -64,7 +64,7 @@ def add_train_args(parser):
             # help='weight decay')
     parser.add_argument('--dist-backend', default='nccl', type=str,
             choices=['gloo', 'nccl'], help='distributed backend')
-    parser.add_argument('--seed', default=42, type=int,
+    parser.add_argument('--seed', type=int,
             help='seed for initializing training')
     parser.add_argument('--log-interval', default=20, type=int,
             help='interval between logging training loss')
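Dropping the default of 42 means argparse leaves args.seed as None whenever --seed is omitted, which is exactly the condition the worker code below keys on. A minimal sketch of that behavior (only the --seed argument is taken from the diff; the bare parser around it is illustrative):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--seed', type=int,
        help='seed for initializing training')

args = parser.parse_args([])                 # --seed not given
assert args.seed is None                     # triggers random seeding in node_worker

args = parser.parse_args(['--seed', '42'])   # user pins the seed explicitly
assert args.seed == 42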
@@ -1,5 +1,6 @@
 import os
 import shutil
+import random
 import torch
 from torch.multiprocessing import spawn
 from torch.distributed import init_process_group, destroy_process_group, all_reduce
@@ -13,6 +14,8 @@ from .models import UNet, narrow_like


 def node_worker(args):
+    if args.seed is None:
+        args.seed = random.randint(0, 65535)
     torch.manual_seed(args.seed) # NOTE: why here not in gpu_worker?
     #torch.backends.cudnn.deterministic = True # NOTE: test perf

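The seed is "synchronized" in the sense that it is drawn once in node_worker, before any per-GPU processes are spawned, so every process launched from that call sees the same args.seed. A minimal sketch of that flow under the spawn-based layout suggested by the imports above; gpu_worker, its re-seeding, and args.gpus_per_node are illustrative assumptions, not part of this diff:

import random
import torch
from torch.multiprocessing import spawn

def node_worker(args):
    if args.seed is None:
        args.seed = random.randint(0, 65535)   # drawn once, before spawning
    torch.manual_seed(args.seed)               # seeds the parent process RNG

    # every spawned child receives the same args, hence the same seed
    spawn(gpu_worker, args=(args,), nprocs=args.gpus_per_node)

def gpu_worker(local_rank, args):
    # hypothetical: re-seed in each child so all ranks start from the same RNG state
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

If exact run-to-run reproducibility matters more than throughput, the commented-out torch.backends.cudnn.deterministic = True line could be enabled as well, at some performance cost, which is presumably what the "test perf" note refers to.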