Fix DistributedDataParallel model save and load during training, leave testing for later
parent f2e9af6d5f
commit 437126e296
@@ -94,7 +94,7 @@ def gpu_worker(local_rank, args):
     if args.load_state:
         state = torch.load(args.load_state, map_location=args.device)
         args.start_epoch = state['epoch']
-        model.load_state_dict(state['model'])
+        model.module.load_state_dict(state['model'])
         optimizer.load_state_dict(state['optimizer'])
         scheduler.load_state_dict(state['scheduler'])
         torch.set_rng_state(state['rng'].cpu())  # move rng state back
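When the model is wrapped in torch.nn.parallel.DistributedDataParallel, the underlying network is exposed as the wrapper's .module attribute, so a checkpoint whose keys were saved from the bare model (no 'module.' prefix) has to be restored through it. Below is a minimal sketch of the loading pattern the hunk above adopts; load_checkpoint and its parameter names are illustrative, not taken from this repository.

import torch

# Sketch only: restore a checkpoint saved from the bare (unwrapped) model
# into a DDP-wrapped one by going through ddp_model.module.
def load_checkpoint(path, ddp_model, optimizer, scheduler, device):
    state = torch.load(path, map_location=device)
    ddp_model.module.load_state_dict(state['model'])  # keys carry no 'module.' prefix
    optimizer.load_state_dict(state['optimizer'])
    scheduler.load_state_dict(state['scheduler'])
    torch.set_rng_state(state['rng'].cpu())  # RNG state must be a CPU ByteTensor
    return state['epoch']  # epoch to resume from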
@@ -129,7 +129,7 @@ def gpu_worker(local_rank, args):
 
         state = {
             'epoch': epoch + 1,
-            'model': model.state_dict(),
+            'model': model.module.state_dict(),
             'optimizer' : optimizer.state_dict(),
             'scheduler' : scheduler.state_dict(),
             'rng' : torch.get_rng_state(),
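On the save side, writing model.module.state_dict() keeps the 'module.' prefix out of the checkpoint keys, so the same file can later be loaded into a plain, unwrapped model for the testing path the commit title defers. A sketch of the corresponding save step follows, assuming a rank-0 guard via torch.distributed; the guard and the save_checkpoint name are illustrative, not part of this diff.

import torch
import torch.distributed as dist

# Sketch only: write the checkpoint from a single process so ranks do not
# overwrite each other; the saved keys match a bare model's state_dict.
def save_checkpoint(path, epoch, ddp_model, optimizer, scheduler):
    if dist.get_rank() == 0:
        state = {
            'epoch': epoch + 1,
            'model': ddp_model.module.state_dict(),
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict(),
            'rng': torch.get_rng_state(),
        }
        torch.save(state, path)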