Add autograd --detect-anomaly flag, and test cudnn.deterministic

cudnn.deterministic is only ~10% slower than the cudnn.benchmark=True path
commit 85efb9e3a3
parent d8c6be797d
@@ -144,7 +144,9 @@ def add_train_args(parser):
     parser.add_argument('--dist-backend', default='nccl', type=str,
                         choices=['gloo', 'nccl'], help='distributed backend')
     parser.add_argument('--log-interval', default=100, type=int,
-                        help='interval between logging training loss')
+                        help='interval (batches) between logging training loss')
+    parser.add_argument('--detect-anomaly', action='store_true',
+                        help='enable anomaly detection for the autograd engine')


 def add_test_args(parser):
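As a sanity check on the flag wiring, a minimal standalone sketch (argparse only; the surrounding trainer code is omitted and the parse_args inputs are made up) showing that the new switch lands on args.detect_anomaly:

import argparse

# Hypothetical standalone reproduction of the argument added above.
parser = argparse.ArgumentParser()
parser.add_argument('--detect-anomaly', action='store_true',
                    help='enable anomaly detection for the autograd engine')

# argparse maps '--detect-anomaly' to the attribute 'detect_anomaly'
args = parser.parse_args(['--detect-anomaly'])
assert args.detect_anomaly is True

args = parser.parse_args([])  # flag omitted -> defaults to False
assert args.detect_anomaly is False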
@@ -53,7 +53,8 @@ def gpu_worker(local_rank, node, args):
     # Need randomness across processes, for sampler, augmentation, noise etc.
     # Note DDP broadcasts initial model states from rank 0
     torch.manual_seed(args.seed + rank)
-    #torch.backends.cudnn.deterministic = True  # NOTE: test perf
+    # good practice to disable cudnn.benchmark if enabling cudnn.deterministic
+    #torch.backends.cudnn.deterministic = True

     dist_init(rank, args)

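How one might reproduce the ~10% comparison from the commit message: a rough timing sketch, assuming a CUDA build of PyTorch; the Conv2d workload, tensor sizes, and iteration count are placeholders, not the project's actual model:

import time
import torch

def time_backward(deterministic, iters=100):
    # The diff's guidance: don't combine cudnn.benchmark (per-shape
    # algorithm auto-tuning) with cudnn.deterministic (fixed algorithms).
    torch.backends.cudnn.deterministic = deterministic
    torch.backends.cudnn.benchmark = not deterministic

    model = torch.nn.Conv2d(3, 64, kernel_size=3).cuda()  # placeholder workload
    x = torch.randn(32, 3, 224, 224, device='cuda')

    model(x).sum().backward()  # warm-up (lets benchmark mode pick algorithms)
    torch.cuda.synchronize()
    start = time.time()
    for _ in range(iters):
        model(x).sum().backward()
    torch.cuda.synchronize()
    return time.time() - start

print('benchmark:    ', time_backward(deterministic=False))
print('deterministic:', time_backward(deterministic=True))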
@@ -180,7 +181,10 @@ def gpu_worker(local_rank, node, args):

     del state

-    torch.backends.cudnn.benchmark = True  # NOTE: test perf
+    torch.backends.cudnn.benchmark = True
+
+    if args.detect_anomaly:
+        torch.autograd.set_detect_anomaly(True)

     logger = None
     if rank == 0:
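What the guarded set_detect_anomaly(True) buys at runtime: when a backward pass produces NaNs, anomaly mode raises immediately and its error includes the traceback of the forward op responsible, which is also why it stays behind a flag (it adds per-op overhead). A toy repro, unrelated to the training code:

import torch

torch.autograd.set_detect_anomaly(True)  # what --detect-anomaly turns on

x = torch.tensor([-1.0], requires_grad=True)
y = torch.sqrt(x)  # forward yields NaN; anomaly mode records this op's stack
try:
    y.backward()   # NaN in the gradient -> RuntimeError naming SqrtBackward
except RuntimeError as err:
    print('caught:', err)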