From 11c9caa1e241b698654a06c60352669367ea9e27 Mon Sep 17 00:00:00 2001 From: Yin Li Date: Sun, 8 Dec 2019 21:02:08 -0500 Subject: [PATCH] Fix unstable training by limiting pytorch version to 1.1 --- map2map/test.py | 2 +- map2map/train.py | 15 +++++++++------ scripts/dis2dis-test.slurm | 7 ++----- scripts/dis2dis.slurm | 5 +---- scripts/vel2vel-test.slurm | 7 ++----- scripts/vel2vel.slurm | 5 +---- setup.py | 4 ++-- 7 files changed, 18 insertions(+), 27 deletions(-) diff --git a/map2map/test.py b/map2map/test.py index 4319014..54e4b3f 100644 --- a/map2map/test.py +++ b/map2map/test.py @@ -48,7 +48,7 @@ def test(args): loss = criterion(output, target) - print('sample {} loss: {}'.format(i, loss)) + print('sample {} loss: {}'.format(i, loss.item())) if args.norms is not None: norm = test_dataset.norms[0] # FIXME diff --git a/map2map/train.py b/map2map/train.py index a2f58a1..cf03d2e 100644 --- a/map2map/train.py +++ b/map2map/train.py @@ -48,7 +48,8 @@ def gpu_worker(local_rank, args): norms=args.norms, pad_or_crop=args.pad_or_crop, ) - train_sampler = DistributedSampler(train_dataset, shuffle=True) + #train_sampler = DistributedSampler(train_dataset, shuffle=True) + train_sampler = DistributedSampler(train_dataset) train_loader = DataLoader( train_dataset, batch_size=args.batches, @@ -65,7 +66,8 @@ def gpu_worker(local_rank, args): norms=args.norms, pad_or_crop=args.pad_or_crop, ) - val_sampler = DistributedSampler(val_dataset, shuffle=False) + #val_sampler = DistributedSampler(val_dataset, shuffle=False) + val_sampler = DistributedSampler(val_dataset) val_loader = DataLoader( val_dataset, batch_size=args.batches, @@ -112,9 +114,9 @@ def gpu_worker(local_rank, args): if args.rank == 0: args.logger = SummaryWriter() - hparam = {k: v if isinstance(v, (int, float, str, bool, torch.Tensor)) - else str(v) for k, v in vars(args).items()} - args.logger.add_hparams(hparam_dict=hparam, metric_dict={}) + #hparam = {k: v if isinstance(v, (int, float, str, bool, torch.Tensor)) + # else str(v) for k, v in vars(args).items()} + #args.logger.add_hparams(hparam_dict=hparam, metric_dict={}) for epoch in range(args.start_epoch, args.epochs): train_sampler.set_epoch(epoch) @@ -125,7 +127,8 @@ def gpu_worker(local_rank, args): scheduler.step(val_loss) if args.rank == 0: - args.logger.close() + print(end='', flush=True) + args.logger.flush() state = { 'epoch': epoch + 1, diff --git a/scripts/dis2dis-test.slurm b/scripts/dis2dis-test.slurm index 65a71fb..597e4ca 100644 --- a/scripts/dis2dis-test.slurm +++ b/scripts/dis2dis-test.slurm @@ -15,10 +15,7 @@ hostname; pwd; date -module load gcc openmpi2 -module load cuda/10.1.243_418.87.00 cudnn/v7.6.2-cuda-10.1 - -source $HOME/anaconda3/bin/activate torch +module load gcc python3 export OMP_NUM_THREADS=$SLURM_CPUS_ON_NODE @@ -37,7 +34,7 @@ in_files="$files" tgt_files="$files" -srun m2m.py test \ +m2m.py test \ --test-in-patterns "$data_root_dir/$in_dir/$test_dirs/$in_files" \ --test-tgt-patterns "$data_root_dir/$tgt_dir/$test_dirs/$tgt_files" \ --in-channels 3 --out-channels 3 --norms cosmology.dis \ diff --git a/scripts/dis2dis.slurm b/scripts/dis2dis.slurm index cc347af..4e98ad9 100644 --- a/scripts/dis2dis.slurm +++ b/scripts/dis2dis.slurm @@ -17,10 +17,7 @@ hostname; pwd; date -module load gcc openmpi2 -module load cuda/10.1.243_418.87.00 cudnn/v7.6.2-cuda-10.1 - -source $HOME/anaconda3/bin/activate torch +module load gcc python3 export MASTER_ADDR=$HOSTNAME diff --git a/scripts/vel2vel-test.slurm b/scripts/vel2vel-test.slurm index c6486fb..b3c6305 100644 --- a/scripts/vel2vel-test.slurm +++ b/scripts/vel2vel-test.slurm @@ -15,10 +15,7 @@ hostname; pwd; date -module load gcc openmpi2 -module load cuda/10.1.243_418.87.00 cudnn/v7.6.2-cuda-10.1 - -source $HOME/anaconda3/bin/activate torch +module load gcc python3 export OMP_NUM_THREADS=$SLURM_CPUS_ON_NODE @@ -37,7 +34,7 @@ in_files="$files" tgt_files="$files" -srun m2m.py test \ +m2m.py test \ --test-in-patterns "$data_root_dir/$in_dir/$test_dirs/$in_files" \ --test-tgt-patterns "$data_root_dir/$tgt_dir/$test_dirs/$tgt_files" \ --in-channels 3 --out-channels 3 --norms cosmology.vel \ diff --git a/scripts/vel2vel.slurm b/scripts/vel2vel.slurm index 30cfe0c..9849c5e 100644 --- a/scripts/vel2vel.slurm +++ b/scripts/vel2vel.slurm @@ -17,10 +17,7 @@ hostname; pwd; date -module load gcc openmpi2 -module load cuda/10.1.243_418.87.00 cudnn/v7.6.2-cuda-10.1 - -source $HOME/anaconda3/bin/activate torch +module load gcc python3 export MASTER_ADDR=$HOSTNAME diff --git a/setup.py b/setup.py index 7e3647f..d5e6a11 100644 --- a/setup.py +++ b/setup.py @@ -5,11 +5,11 @@ setup( name='map2map', version='0.0', description='Neural network emulators to transform field data', - author='Yin Li', + author='Yin Li et al.', author_email='eelregit@gmail.com', packages=find_packages(), install_requires=[ - 'torch', + 'torch==1.1', 'numpy', 'scipy', 'tensorboard',