Add autograd detect anomaly flag, and test cudnn.deterministic

cudnn.deterministic is only 10% slower
Yin Li 2020-09-12 15:57:47 -04:00
parent d8c6be797d
commit 85efb9e3a3
2 changed files with 9 additions and 3 deletions


@@ -144,7 +144,9 @@ def add_train_args(parser):
     parser.add_argument('--dist-backend', default='nccl', type=str,
                         choices=['gloo', 'nccl'], help='distributed backend')
     parser.add_argument('--log-interval', default=100, type=int,
-                        help='interval between logging training loss')
+                        help='interval (batches) between logging training loss')
+    parser.add_argument('--detect-anomaly', action='store_true',
+                        help='enable anomaly detection for the autograd engine')


 def add_test_args(parser):

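For reference, the new --detect-anomaly option is a plain store_true switch: it defaults to False and becomes True only when the flag is passed on the command line. A minimal standalone sketch of that behavior (only the argument name and help text come from the diff; the bare parser is assumed for illustration):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--detect-anomaly', action='store_true',
                    help='enable anomaly detection for the autograd engine')

args = parser.parse_args([])                    # flag omitted -> args.detect_anomaly is False
args = parser.parse_args(['--detect-anomaly'])  # flag given   -> args.detect_anomaly is True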

@@ -53,7 +53,8 @@ def gpu_worker(local_rank, node, args):
     # Need randomness across processes, for sampler, augmentation, noise etc.
     # Note DDP broadcasts initial model states from rank 0
     torch.manual_seed(args.seed + rank)
-    #torch.backends.cudnn.deterministic = True  # NOTE: test perf
+    # good practice to disable cudnn.benchmark if enabling cudnn.deterministic
+    #torch.backends.cudnn.deterministic = True

     dist_init(rank, args)
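The new comment records the interplay the commit message measures: cudnn.benchmark autotunes the convolution algorithm at runtime, so the selection (and thus the exact numerics) can vary between runs, which is why it is good practice to disable it when asking for deterministic kernels. A sketch of the fully reproducible setup the commit benchmarked at roughly 10% slower (the commit itself leaves the deterministic line commented out):

import torch

torch.manual_seed(42)                        # the real code seeds with args.seed + rank per process
torch.backends.cudnn.deterministic = True    # only use deterministic cuDNN kernels
torch.backends.cudnn.benchmark = False       # skip autotuning so the algorithm choice is repeatable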
@@ -180,7 +181,10 @@ def gpu_worker(local_rank, node, args):
     del state

-    torch.backends.cudnn.benchmark = True  # NOTE: test perf
+    torch.backends.cudnn.benchmark = True
+
+    if args.detect_anomaly:
+        torch.autograd.set_detect_anomaly(True)

     logger = None
     if rank == 0:
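For context, torch.autograd.set_detect_anomaly(True) makes the backward pass check gradients for NaNs and report the forward operation that created the failing node, at the cost of noticeable overhead, hence the opt-in flag. A minimal illustration, not taken from the repository:

import torch

torch.autograd.set_detect_anomaly(True)

x = torch.zeros(1, requires_grad=True)
loss = (x / x).sum()   # 0/0 produces NaN in the forward pass
loss.backward()        # anomaly mode raises a RuntimeError and prints the traceback
                       # of the forward op (the division) whose backward produced NaN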