# ML_GravPotBCs/sCOCA_ML/train/train_gravpot.py

import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import torch
import time
from ..prepare_data.prepare_gravpot_data import prepare_data


def train_model(model,
                dataloader,
                optimizer=None,
                num_epochs=10,
                device='cuda',
                print_timers=False,
                save_model_path=None,
                scheduler=None):
"""
Train a model with the given dataloader and optimizer.
Parameters:
- model: The model to train.
- dataloader: A dictionary with 'train' and 'val' DataLoader objects.
- optimizer: The optimizer to use for training (default is Adam with lr=1e-3).
- num_epochs: Number of epochs to train the model (default is 10).
- device: Device to run the model on (default is 'cuda').
- print_timers: If True, print timing information for each epoch (default is False).
- save_model_path: If provided, the model will be saved to this path after each epoch.
- scheduler: Learning rate scheduler (optional).
Returns:
- train_loss_log: List of training losses for each batch.
- val_loss_log: List of validation losses for each epoch."""
    if optimizer is None:
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    if scheduler is None:
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=num_epochs // 4)
    model.to(device)
    loss_fn = torch.nn.MSELoss()
    train_loss_log = []
    val_loss_log = []

    for epoch in range(num_epochs):
        model.train()
        progress_bar = tqdm(dataloader['train'], desc=f"Epoch {epoch+1}/{num_epochs}", unit='batch')
        io_time = 0.0
        forward_time = 0.0
        backward_time = 0.0
        validation_time = 0.0
        epoch_start_time = time.time()
        prev_time = epoch_start_time  # For I/O timing

        for batch in progress_bar:
            # I/O timer: time spent waiting for this batch since the last one was processed
            t0 = time.time()
            io_time += t0 - prev_time

            batch = prepare_data(batch)
            input = batch['input'].to(device)
            target = batch['target'].to(device)
            style = batch['style'].to(device)
            optimizer.zero_grad()

            # Forward pass
            t1 = time.time()
            output = model(input, style)
            loss = loss_fn(output, target)
            forward_time += time.time() - t1

            # Backward pass and optimization
            t2 = time.time()
            loss.backward()
            optimizer.step()
            backward_time += time.time() - t2

            train_loss_log.append(loss.item())
            progress_bar.set_postfix(loss=f"{loss.item():2.5f}")
            prev_time = time.time()  # End of loop body; reference point for the next batch's I/O timing

        # End of epoch: validate the model
        t3 = time.time()
        val_loss, style_bins_means, style_bins = validate(model, dataloader['val'], loss_fn, device)
        val_loss_log.append(val_loss)
        validation_time += time.time() - t3

        # Prepare new samples for the next epoch
        dataloader['train'].dataset.on_epoch_end()
        dataloader['val'].dataset.on_epoch_end()

        if save_model_path is not None:
            torch.save(model.state_dict(), save_model_path + f"_epoch_{epoch+1}.pth")
            torch.save(dict(train_loss_log=train_loss_log,
                            val_loss_log=val_loss_log,
                            style_bins_means=style_bins_means,
                            style_bins=style_bins),
                       save_model_path + f"_epoch_{epoch+1}_stats.pth")

        if scheduler is not None:
            scheduler.step(val_loss)

        print()
        print(f"================ Epoch {epoch+1} Summary ================")
        print(f"Validation Loss: {val_loss:2.6f}")
        bin_width = max([len(f"{m:.2f}") for m in style_bins_means[:-1] + [2]])  # + [2] guards against an empty sequence
        bins_str = "Style Bins: " + " | ".join([f" {b:>{bin_width}.2f} " for b in style_bins[:-1]])
        means_str = "Means:      " + " | ".join([f"{m:>{bin_width}.2e}" for m in style_bins_means])
        print(bins_str)
        print(means_str)
        print()

        if print_timers:
            total_time = time.time() - epoch_start_time
            print(f"Epoch {epoch+1} Timings: {total_time:9.0f} s")
            print(f" I/O time (train): {io_time:8.0f} s\t | {100 * io_time / total_time:2.2f}%")
            print(f" Forward time: {forward_time:8.0f} s\t | {100 * forward_time / total_time:2.2f}%")
            print(f" Backward time: {backward_time:8.0f} s\t | {100 * backward_time / total_time:2.2f}%")
            print(f" Validation time: {validation_time:8.0f} s\t | {100 * validation_time / total_time:2.2f}%")
            print()

    return train_loss_log, val_loss_log
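

# Example usage (a sketch, not this repository's verified entry point: the model
# class and checkpoint path below are illustrative placeholders; the real
# dataloaders must yield dict batches understood by `prepare_data`, and their
# datasets must implement `on_epoch_end()`):
#
#     from torch.utils.data import DataLoader
#
#     model = GravPotModel(...)  # hypothetical model taking (input, style)
#     loaders = {'train': DataLoader(train_ds, batch_size=1, shuffle=True),
#                'val':   DataLoader(val_ds, batch_size=1)}
#     train_log, val_log = train_model(model, loaders,
#                                      num_epochs=20,
#                                      device='cuda',
#                                      print_timers=True,
#                                      save_model_path='checkpoints/gravpot')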


def validate(model, val_loader, loss_fn, device='cuda'):
    model.eval()
    losses = []
    styles = []
    progress_bar = tqdm(val_loader, desc="Validation", unit='batch')
    with torch.no_grad():
        for batch in progress_bar:
            batch = prepare_data(batch)
            input = batch['input'].to(device)
            target = batch['target'].to(device)
            style = batch['style'].to(device)
            output = model(input, style)
            loss = loss_fn(output, target)
            losses.append(loss.item())
            styles.append(style[:, 0].cpu().numpy().mean())  # BEWARE: with batch size > 1 this averages the styles across the batch and makes no sense
            progress_bar.set_postfix(loss=f"{loss.item():2.5f}")

    # Bin the per-batch losses by the first style parameter (style[:, 0])
    styles = np.array(styles)
    losses = np.array(losses)
    bins = np.linspace(styles.min(), styles.max(), 10)
    digitized = np.digitize(styles, bins)
    bin_means = [losses[digitized == i].mean() if np.any(digitized == i) else 0
                 for i in range(1, len(bins))]
    return losses.mean(), bin_means, bins
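

# A minimal, self-contained sketch of the binning scheme used in `validate`,
# runnable without the project's data pipeline (run as a module, e.g. via
# `python -m ...train.train_gravpot`, because of the relative import above).
# The synthetic styles and losses are random stand-ins, not real results.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    styles = rng.uniform(0.1, 1.0, size=64)    # stand-in for the per-batch style[:, 0] values
    losses = rng.uniform(1e-4, 1e-2, size=64)  # stand-in for the per-batch MSE losses
    bins = np.linspace(styles.min(), styles.max(), 10)
    digitized = np.digitize(styles, bins)      # bin index of each batch's style value
    bin_means = [losses[digitized == i].mean() if np.any(digitized == i) else 0
                 for i in range(1, len(bins))]
    print("bin left edges:   ", np.round(bins[:-1], 2))
    print("mean loss per bin:", np.round(bin_means, 5))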