Add a flag to enable autograd anomaly detection, and test cudnn.deterministic
Enabling cudnn.deterministic is only 10% slower
This commit is contained in:
parent
d8c6be797d
commit
85efb9e3a3
@ -144,7 +144,9 @@ def add_train_args(parser):
|
|||||||
parser.add_argument('--dist-backend', default='nccl', type=str,
|
parser.add_argument('--dist-backend', default='nccl', type=str,
|
||||||
choices=['gloo', 'nccl'], help='distributed backend')
|
choices=['gloo', 'nccl'], help='distributed backend')
|
||||||
parser.add_argument('--log-interval', default=100, type=int,
|
parser.add_argument('--log-interval', default=100, type=int,
|
||||||
help='interval between logging training loss')
|
help='interval (batches) between logging training loss')
|
||||||
|
parser.add_argument('--detect-anomaly', action='store_true',
|
||||||
|
help='enable anomaly detection for the autograd engine')
|
||||||
|
|
||||||
|
|
||||||
def add_test_args(parser):
|
def add_test_args(parser):
|
||||||
|
@ -53,7 +53,8 @@ def gpu_worker(local_rank, node, args):
|
|||||||
# Need randomness across processes, for sampler, augmentation, noise etc.
|
# Need randomness across processes, for sampler, augmentation, noise etc.
|
||||||
# Note DDP broadcasts initial model states from rank 0
|
# Note DDP broadcasts initial model states from rank 0
|
||||||
torch.manual_seed(args.seed + rank)
|
torch.manual_seed(args.seed + rank)
|
||||||
#torch.backends.cudnn.deterministic = True # NOTE: test perf
|
# good practice to disable cudnn.benchmark if enabling cudnn.deterministic
|
||||||
|
#torch.backends.cudnn.deterministic = True
|
||||||
|
|
||||||
dist_init(rank, args)
|
dist_init(rank, args)
|
||||||
|
|
||||||
@ -180,7 +181,10 @@ def gpu_worker(local_rank, node, args):
|
|||||||
|
|
||||||
del state
|
del state
|
||||||
|
|
||||||
torch.backends.cudnn.benchmark = True # NOTE: test perf
|
torch.backends.cudnn.benchmark = True
|
||||||
|
|
||||||
|
if args.detect_anomaly:
|
||||||
|
torch.autograd.set_detect_anomaly(True)
|
||||||
|
|
||||||
logger = None
|
logger = None
|
||||||
if rank == 0:
|
if rank == 0:
|
||||||
|
Loading…
Reference in New Issue
Block a user