Add autograd detect anomaly flag, and test cudnn.deterministic

Enabling cudnn.deterministic is only about 10% slower.
This commit is contained in:
Yin Li 2020-09-12 15:57:47 -04:00
parent d8c6be797d
commit 85efb9e3a3
2 changed files with 9 additions and 3 deletions

View file

@@ -144,7 +144,9 @@ def add_train_args(parser):
parser.add_argument('--dist-backend', default='nccl', type=str,
choices=['gloo', 'nccl'], help='distributed backend')
parser.add_argument('--log-interval', default=100, type=int,
help='interval between logging training loss')
help='interval (batches) between logging training loss')
parser.add_argument('--detect-anomaly', action='store_true',
help='enable anomaly detection for the autograd engine')
def add_test_args(parser):

View file

@@ -53,7 +53,8 @@ def gpu_worker(local_rank, node, args):
# Need randomness across processes, for sampler, augmentation, noise etc.
# Note DDP broadcasts initial model states from rank 0
torch.manual_seed(args.seed + rank)
#torch.backends.cudnn.deterministic = True # NOTE: test perf
# good practice to disable cudnn.benchmark if enabling cudnn.deterministic
#torch.backends.cudnn.deterministic = True
dist_init(rank, args)
@@ -180,7 +181,10 @@ def gpu_worker(local_rank, node, args):
del state
torch.backends.cudnn.benchmark = True # NOTE: test perf
torch.backends.cudnn.benchmark = True
if args.detect_anomaly:
torch.autograd.set_detect_anomaly(True)
logger = None
if rank == 0: