Training Pipeline

Training, evaluation, and model optimization workflows.

Training entrypoint

project_name.train

train

train(cfg: DictConfig) -> None

Train the GNN model on QM9 dataset.

Parameters:

Name	Type	Description	Default
`cfg`	`DictConfig`	Hydra configuration object containing all parameters.	required

Source code in src/project_name/train.py

@hydra.main(version_base=None, config_path=_CONFIG_PATH, config_name="config")
def train(cfg: DictConfig) -> None:
    """Train the GNN model on QM9 dataset.

    Loads the dataset, splits it with a generator seeded by ``cfg.seed``,
    trains with early stopping on validation MSE, evaluates on the test split
    and, when a wandb run is active, logs metrics and (optionally) a model
    artifact.

    The best checkpoint is only reloaded for the test evaluation if it was
    written during *this* run; otherwise the current in-memory weights are
    evaluated, so a stale ``best_model.pt`` from an earlier run is never
    picked up by accident.

    Args:
        cfg: Hydra configuration object containing all parameters.
    """
    device: torch.device = _get_device()
    logger.info("Using device: %s", device)

    # The optional bucket is looked up once and reused for model and data paths.
    gcs_bucket = OmegaConf.select(cfg, "training.gcs_bucket")
    model_dir: Path = get_data_path(cfg.training.model_dir, gcs_bucket=gcs_bucket)
    model_dir.mkdir(parents=True, exist_ok=True)
    logger.info("Configuration: %s", cfg)

    profile: bool = cfg.training.profile
    profiler_run_dir: str = cfg.training.profiler_run_dir
    run = _init_wandb(cfg)
    if run is not None:
        logger.info("wandb logging enabled (run: %s)", run.id)
    else:
        logger.info("wandb logging disabled")
    with timing_checkpoint("Load dataset", enabled=profile):
        logger.info("Loading QM9 dataset...")
        data_path = get_data_path(cfg.training.data_path, gcs_bucket=gcs_bucket)
        dataset: Dataset = QM9Dataset(data_path)

    # Apply normalization transform
    dataset.transform = NormalizeScale()

    # Split dataset; the test split takes whatever remains after rounding.
    n: int = len(dataset)
    train_size: int = int(cfg.training.train_ratio * n)
    val_size: int = int(cfg.training.val_ratio * n)
    test_size: int = n - train_size - val_size

    train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
        dataset,
        [train_size, val_size, test_size],
        generator=torch.Generator().manual_seed(cfg.seed),
    )

    # Create data loaders (parallel loading)
    workers = _num_workers(cfg)
    logger.info("DataLoader num_workers=%d", workers)

    # Settings shared by all three loaders; only ``shuffle`` differs.
    loader_kwargs = {
        "batch_size": cfg.training.batch_size,
        "num_workers": workers,
        "pin_memory": torch.cuda.is_available(),
        "persistent_workers": workers > 0,
    }
    train_loader: DataLoader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
    val_loader: DataLoader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)
    test_loader: DataLoader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

    logger.info("Dataset split - Train: %d, Val: %d, Test: %d", len(train_dataset), len(val_dataset), len(test_dataset))

    # Get target indices and infer output dimension
    target_indices: list[int] = list(cfg.training.target_indices)
    num_targets: int = len(target_indices)
    logger.info("Predicting %d target(s): %s", num_targets, target_indices)

    # Initialize model
    model: GraphNeuralNetwork | nn.DataParallel = GraphNeuralNetwork(
        num_node_features=cfg.model.num_node_features,
        hidden_dim=cfg.model.hidden_dim,
        num_layers=cfg.model.num_layers,
        output_dim=num_targets,
    ).to(device)

    if torch.cuda.is_available() and torch.cuda.device_count() > 1:
        model = nn.DataParallel(model, device_ids=list(range(torch.cuda.device_count())))

    # Checkpoints always hold the unwrapped module's weights so they load
    # regardless of whether DataParallel is in use.
    core_model = model.module if isinstance(model, nn.DataParallel) else model

    optimizer: Optimizer = torch.optim.Adam(model.parameters(), lr=cfg.training.learning_rate)

    # Early stopping variables
    best_val_loss: float = float("inf")
    patience: int = cfg.training.patience
    patience_counter: int = 0
    best_model_path: Path = model_dir / "best_model.pt"
    best_saved: bool = False  # True once this run has written best_model_path

    logger.info(
        "Starting training for %d epochs (batch_size=%d, lr=%g, patience=%d)",
        cfg.training.epochs,
        cfg.training.batch_size,
        cfg.training.learning_rate,
        patience,
    )
    profiler = TrainingProfiler(enabled=profile, output_dir=Path(f"profiling_results/{profiler_run_dir}"))

    for epoch in range(1, cfg.training.epochs + 1):
        train_loss: float = train_epoch(model, train_loader, optimizer, device, target_indices)

        # compute validation metrics (mse/rmse/mae/r2)
        val_metrics = evaluate_with_metrics(model, val_loader, device, target_indices)
        val_loss: float = float(val_metrics["mse"])  # keep early-stopping tied to MSE

        if epoch % LOG_INTERVAL == 0 or epoch == 1:
            logger.info(
                "Epoch %3d | Train Loss: %.6f | Val MSE: %.6f | Val RMSE: %.6f | Val MAE: %.6f | Val R2: %.6f",
                epoch,
                train_loss,
                val_metrics["mse"],
                val_metrics["rmse"],
                val_metrics["mae"],
                val_metrics["r2"],
            )

        # A NaN validation loss compares False here and therefore counts
        # against patience instead of overwriting the checkpoint.
        improved = val_loss < best_val_loss
        if improved:
            best_val_loss = val_loss
            patience_counter = 0
            torch.save(core_model.state_dict(), best_model_path)
            best_saved = True
            logger.debug("Saved best model to %s", best_model_path)
        else:
            patience_counter += 1

        if run is not None:
            wandb.log(
                {
                    "epoch": epoch,
                    "loss/train": train_loss,
                    # "loss/val" duplicates "val/mse"; kept for existing dashboards
                    "loss/val": val_loss,
                    "val/mse": val_metrics["mse"],
                    "val/rmse": val_metrics["rmse"],
                    "val/mae": val_metrics["mae"],
                    "val/r2": val_metrics["r2"],
                    "early_stopping/patience_counter": patience_counter,
                    "early_stopping/best_val_loss": best_val_loss,
                }
            )

        if patience_counter >= patience:
            logger.info("Early stopping triggered at epoch %d (best_val_loss=%.6f)", epoch, best_val_loss)
            break

        profiler.step()
    profiler.finalize()

    # Load best model and evaluate on test set
    if best_saved:
        try:
            state = torch.load(best_model_path, map_location=device, weights_only=True)
        except TypeError:
            # Presumably a torch version whose ``torch.load`` lacks ``weights_only``.
            state = torch.load(best_model_path, map_location=device)
        core_model.load_state_dict(state)
    else:
        logger.warning(
            "Validation loss never improved, so no checkpoint was written during this run; "
            "evaluating the current in-memory weights instead of %s",
            best_model_path,
        )

    test_metrics: dict[str, float] = evaluate_with_metrics(model, test_loader, device, target_indices)
    test_loss: float = float(test_metrics["mse"])
    logger.info("Final test loss: %.6f", test_loss)

    # Save final model (identical to the best checkpoint when one was reloaded)
    final_model_path: Path = model_dir / "final_model.pt"
    torch.save(core_model.state_dict(), final_model_path)
    logger.info("Training complete. Models saved to %s", model_dir)

    # wandb: final logs
    if run is not None:
        wandb.log(
            {
                "loss/test": test_loss,  # keep compatibility (MSE)
                "test/mse": test_metrics["mse"],
                "test/rmse": test_metrics["rmse"],
                "test/mae": test_metrics["mae"],
                "test/r2": test_metrics["r2"],
            }
        )

        # Optionally log model artifact
        if bool(OmegaConf.select(cfg, "wandb.log_artifacts", default=True)):
            artifact = wandb.Artifact(
                name="qm9-gnn",
                type="model",
                description="Trained model",
                metadata={
                    "target_indices": target_indices,
                    "best_val_loss": best_val_loss,
                    "test_mse": test_metrics["mse"],
                    "test_rmse": test_metrics["rmse"],
                    "test_mae": test_metrics["mae"],
                    "test_r2": test_metrics["r2"],
                },
            )
            # Upload a file this run actually produced.
            artifact.add_file(str(best_model_path if best_saved else final_model_path))
            run.log_artifact(artifact)

            run.link_artifact(
                artifact=artifact,
                target_path="model-registry/mlops-molecules",
                aliases=["latest"],
            )

        wandb.finish()

train_epoch

train_epoch(model: GraphNeuralNetwork, loader: DataLoader, optimizer: Optimizer, device: torch.device, target_indices: list[int]) -> float

Train for one epoch.

Source code in src/project_name/train.py

def train_epoch(
    model: GraphNeuralNetwork,
    loader: DataLoader,
    optimizer: Optimizer,
    device: torch.device,
    target_indices: list[int],
) -> float:
    """Train for one epoch.

    Args:
        model: Model to optimise; it is switched to training mode.
        loader: DataLoader yielding batches that expose ``to``, ``y`` and
            ``num_graphs``.
        optimizer: Optimizer stepped once per batch.
        device: Torch device each batch is moved to.
        target_indices: Column indices of ``batch.y`` used as regression targets.

    Returns:
        Mean MSE loss per graph (each batch loss weighted by
        ``batch.num_graphs``), or ``0.0`` when the loader yields no batches.
    """
    model.train()
    total_loss: float = 0.0
    num_samples: int = 0

    # Materialise once so any sequence type indexes columns the same way.
    target_idx = list(target_indices)

    for batch in loader:
        batch = batch.to(device)
        optimizer.zero_grad(set_to_none=True)

        pred: torch.Tensor = model(batch)
        target: torch.Tensor = batch.y[:, target_idx]

        loss: torch.Tensor = F.mse_loss(pred, target)
        loss.backward()
        optimizer.step()

        # Weight by graph count so a smaller last batch does not skew the mean.
        total_loss += loss.item() * batch.num_graphs
        num_samples += batch.num_graphs

    # Same convention as the evaluation helpers: an empty loader yields 0.0
    # instead of a ZeroDivisionError.
    if num_samples == 0:
        return 0.0

    return total_loss / num_samples

Evaluation

project_name.evaluate

evaluate

evaluate(model: GraphNeuralNetwork, loader: DataLoader, device: torch.device, target_indices: Sequence[int]) -> float

Evaluate model on a dataloader.

Computes mean MSE loss per graph over the entire loader, matching train_epoch.

Parameters:

Name	Type	Description	Default
`model`	`GraphNeuralNetwork`	Trained GNN model.	required
`loader`	`DataLoader`	DataLoader for validation/test set.	required
`device`	`device`	Torch device.	required
`target_indices`	`Sequence[int]`	Indices of target properties in batch.y.	required

Returns:

Type	Description
`float`	Mean MSE loss per graph.

Source code in src/project_name/evaluate.py

@torch.no_grad()
def evaluate(
    model: GraphNeuralNetwork,
    loader: DataLoader,
    device: torch.device,
    target_indices: Sequence[int],
) -> float:
    """Compute the graph-weighted mean MSE of a model over a dataloader.

    Each batch's MSE is weighted by ``batch.num_graphs`` before averaging,
    which is the same aggregation ``train_epoch`` uses. Gradient tracking is
    disabled by the decorator and the model is put in eval mode.

    Args:
        model: Trained GNN model.
        loader: DataLoader for validation/test set.
        device: Torch device.
        target_indices: Indices of target properties in batch.y.

    Returns:
        Mean MSE loss per graph; ``0.0`` if the loader produced no graphs.
    """
    model.eval()

    columns = list(target_indices)
    weighted_sum: float = 0.0
    graphs_seen: int = 0

    for batch in loader:
        batch = batch.to(device)

        batch_mse: torch.Tensor = F.mse_loss(model(batch), batch.y[:, columns])
        graphs_in_batch = batch.num_graphs

        weighted_sum += batch_mse.item() * graphs_in_batch
        graphs_seen += graphs_in_batch

    return weighted_sum / graphs_seen if graphs_seen != 0 else 0.0

evaluate_with_metrics

evaluate_with_metrics(model: GraphNeuralNetwork, loader: DataLoader, device: torch.device, target_indices: Sequence[int]) -> dict[str, float]

Evaluate model on a dataloader with multiple metrics.

Parameters:

Name	Type	Description	Default
`model`	`GraphNeuralNetwork`	Trained GNN model.	required
`loader`	`DataLoader`	DataLoader for validation/test set.	required
`device`	`device`	Torch device.	required
`target_indices`	`Sequence[int]`	Indices of target properties in batch.y.	required

Returns:

Type	Description
`dict[str, float]`	Dictionary with metrics: mse, rmse, mae, r2.

Source code in src/project_name/evaluate.py

@torch.no_grad()
def evaluate_with_metrics(
    model: GraphNeuralNetwork,
    loader: DataLoader,
    device: torch.device,
    target_indices: Sequence[int],
) -> dict[str, float]:
    """Evaluate model on a dataloader with multiple metrics.

    Predictions and targets from every batch are concatenated and the metrics
    are computed once over all elements.

    R² uses a per-target baseline: the total sum of squares is taken around
    each target column's own mean and summed over columns (a variance-weighted
    multi-output R²). With a single target this equals the usual R².

    Args:
        model: Trained GNN model.
        loader: DataLoader for validation/test set.
        device: Torch device.
        target_indices: Indices of target properties in batch.y.

    Returns:
        Dictionary with metrics: mse, rmse, mae, r2. All four are ``0.0`` when
        the loader yields no batches; ``r2`` is also ``0.0`` when the targets
        have zero variance.
    """
    model.eval()

    all_preds: list[torch.Tensor] = []
    all_targets: list[torch.Tensor] = []

    target_idx = list(target_indices)

    for batch in loader:
        batch = batch.to(device)
        all_preds.append(model(batch))
        all_targets.append(batch.y[:, target_idx])

    if not all_preds:
        return {"mse": 0.0, "rmse": 0.0, "mae": 0.0, "r2": 0.0}

    predictions = torch.cat(all_preds, dim=0)
    targets = torch.cat(all_targets, dim=0)

    # MSE is computed once and reused for RMSE.
    mse_tensor = F.mse_loss(predictions, targets)
    mse = mse_tensor.item()
    rmse = torch.sqrt(mse_tensor).item()

    # MAE
    mae = F.l1_loss(predictions, targets).item()

    # R² score. The baseline is "predict each target's own mean"; a single
    # global mean across targets of different scale would inflate ss_tot and
    # make R² look better than it is.
    ss_res = torch.sum((targets - predictions) ** 2).item()
    column_means = targets.mean(dim=0, keepdim=True)
    ss_tot = torch.sum((targets - column_means) ** 2).item()
    r2 = 1 - (ss_res / ss_tot) if ss_tot > 0 else 0.0

    return {
        "mse": mse,
        "rmse": rmse,
        "mae": mae,
        "r2": r2,
    }

get_device

get_device() -> torch.device

Get the best available device for computation.

Source code in src/project_name/evaluate.py

def get_device() -> torch.device:
    """Pick the compute device, preferring CUDA, then MPS, then CPU."""
    # The MPS check is only reached when CUDA is unavailable.
    if torch.cuda.is_available():
        backend = "cuda"
    elif torch.backends.mps.is_available():
        backend = "mps"
    else:
        backend = "cpu"
    return torch.device(backend)

main

main(cfg: DictConfig) -> None

Load best model and evaluate on test set with comprehensive metrics.

Source code in src/project_name/evaluate.py

@hydra.main(version_base=None, config_path="../../configs", config_name="config")
def main(cfg: DictConfig) -> None:
    """Load best model and evaluate on test set with comprehensive metrics.

    The test split is re-created with the same ratios and the same seeded
    generator (``cfg.seed``) as this function reads from the config, the model
    is rebuilt from ``cfg.model`` and ``best_model.pt`` is loaded from
    ``cfg.training.model_dir``. Results are printed to stdout.

    Args:
        cfg: Hydra configuration object.
    """
    device = get_device()

    # Load dataset
    dataset = QM9Dataset(cfg.training.data_path)
    dataset.transform = NormalizeScale()

    n = len(dataset)
    train_size = int(cfg.training.train_ratio * n)
    val_size = int(cfg.training.val_ratio * n)
    test_size = n - train_size - val_size

    # Only the test split is needed; the split itself must match training.
    _, _, test_dataset = torch.utils.data.random_split(
        dataset,
        [train_size, val_size, test_size],
        generator=torch.Generator().manual_seed(cfg.seed),
    )

    test_loader = DataLoader(
        test_dataset,
        batch_size=cfg.training.batch_size,
        shuffle=False,
    )

    target_indices = list(cfg.training.target_indices)
    num_targets = len(target_indices)

    # Build model
    model = GraphNeuralNetwork(
        num_node_features=cfg.model.num_node_features,
        hidden_dim=cfg.model.hidden_dim,
        num_layers=cfg.model.num_layers,
        output_dim=num_targets,
    ).to(device)

    # Load best model
    best_model_path = Path(cfg.training.model_dir) / "best_model.pt"
    print(f"Loading model from: {best_model_path}")

    # map_location lets a checkpoint saved on one device (e.g. CUDA) be
    # evaluated on another (e.g. a CPU-only machine).
    try:
        state = torch.load(best_model_path, map_location=device, weights_only=True)
    except TypeError:
        # Presumably a torch version whose ``torch.load`` lacks ``weights_only``.
        state = torch.load(best_model_path, map_location=device)

    model.load_state_dict(state)

    # Evaluate with multiple metrics
    metrics = evaluate_with_metrics(model, test_loader, device, target_indices)

    print("\n" + "=" * 50)
    print("Test Set Evaluation (Best Model)")
    print("=" * 50)
    print(f"MSE:  {metrics['mse']:.6f}")
    print(f"RMSE: {metrics['rmse']:.6f}")
    print(f"MAE:  {metrics['mae']:.6f}")
    print(f"R²:   {metrics['r2']:.6f}")
    print("=" * 50 + "\n")

Model pruning

project_name.prune

apply_unstructured_pruning

apply_unstructured_pruning(model: torch.nn.Module, amount: float) -> dict[str, Any]

Apply unstructured L1 pruning to FC layers only, then make it permanent.

Source code in src/project_name/prune.py

def apply_unstructured_pruning(
    model: torch.nn.Module,
    amount: float,
) -> dict[str, Any]:
    """Apply unstructured L1 pruning to FC layers only, then make it permanent.

    The (module, parameter-name) pairs to prune come from
    ``_iter_prunable_weight_params(model)``. Every pair is pruned with
    ``prune.l1_unstructured`` and the pruning re-parametrisation is then
    removed with ``prune.remove``, which is the step the summary line calls
    making the pruning permanent.

    Args:
        model: Model whose prunable parameters are modified in place.
        amount: Value forwarded to ``prune.l1_unstructured``; must satisfy
            ``0.0 <= amount < 1.0``.

    Returns:
        A dict with ``modules_pruned``, ``global_sparsity``, ``zero_elems`` and
        ``total_elems``. All four keys are present even when nothing was
        prunable, in which case they are zero.

    Raises:
        ValueError: If ``amount`` is outside ``[0, 1)``.
    """
    if not (0.0 <= amount < 1.0):
        raise ValueError(f"Prune amount must be in [0, 1). Got: {amount}")

    pruned_modules = list(_iter_prunable_weight_params(model))
    if not pruned_modules:
        logger.warning("No prunable fully-connected (nn.Linear) layers found.")
        # Same keys as the normal return so callers can read them unconditionally.
        return {
            "modules_pruned": 0,
            "global_sparsity": 0.0,
            "zero_elems": 0,
            "total_elems": 0,
        }

    for m, pname in pruned_modules:
        prune.l1_unstructured(m, name=pname, amount=amount)

    # Kept as a second pass so every mask is applied before any is removed.
    for m, pname in pruned_modules:
        prune.remove(m, pname)

    total_elems = 0
    zero_elems = 0
    for m, pname in pruned_modules:
        w = getattr(m, pname)
        total_elems += w.numel()
        zero_elems += int((w == 0).sum().item())

    global_sparsity = (zero_elems / total_elems) if total_elems > 0 else 0.0
    return {
        "modules_pruned": len(pruned_modules),
        "global_sparsity": global_sparsity,
        "zero_elems": zero_elems,
        "total_elems": total_elems,
    }

evaluate_mse

evaluate_mse(model: torch.nn.Module, loader: DataLoader, device: torch.device, target_indices: Sequence[int]) -> float

Mean MSE per graph (matches your train/eval convention).

Source code in src/project_name/prune.py

@torch.no_grad()
def evaluate_mse(
    model: torch.nn.Module,
    loader: DataLoader,
    device: torch.device,
    target_indices: Sequence[int],
) -> float:
    """Return the graph-weighted mean MSE of ``model`` over ``loader``.

    Each batch's MSE is weighted by ``batch.num_graphs``. An empty loader
    gives ``0.0`` because the divisor is clamped to at least 1.
    """
    model.eval()

    columns = list(target_indices)
    weighted_sum: float = 0.0
    graphs_seen: int = 0

    for batch in loader:
        batch = batch.to(device)
        batch_mse = torch.nn.functional.mse_loss(model(batch), batch.y[:, columns])
        graphs_in_batch = batch.num_graphs
        weighted_sum += float(batch_mse.item()) * graphs_in_batch
        graphs_seen += graphs_in_batch

    divisor = max(1, graphs_seen)
    return weighted_sum / divisor

measure_inference_latency

measure_inference_latency(model: torch.nn.Module, loader: DataLoader, device: torch.device, *, warmup_batches: int = 10, timed_batches: int = 50) -> dict[str, float]

Measures average latency per batch (ms) over a fixed number of batches.

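Notes: gradient tracking is disabled via `@torch.no_grad()` and the model is put in eval mode (`torch.inference_mode()` is not used). CUDA is synchronized before and after each timed forward pass for accurate timing.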

Source code in src/project_name/prune.py

@torch.no_grad()
def measure_inference_latency(
    model: torch.nn.Module,
    loader: DataLoader,
    device: torch.device,
    *,
    warmup_batches: int = 10,
    timed_batches: int = 50,
) -> dict[str, float]:
    """
    Measures average latency per batch (ms) over a fixed number of batches.

    Batches are drawn from ``loader``; when it is exhausted, iteration restarts
    from the beginning. Only the forward pass is timed; moving the batch to
    ``device`` happens before the clock starts.

    Notes:
    - Gradient tracking is disabled via @torch.no_grad() and the model is put
      in eval mode.
    - Syncs CUDA before and after each timed forward pass for accurate timing.

    Args:
        model: Model to time.
        loader: Source of batches; each batch must support ``.to(device)``.
        device: Device the batches are moved to.
        warmup_batches: Untimed forward passes run first.
        timed_batches: Forward passes included in the average.

    Returns:
        ``{"ms_per_batch": <mean latency in ms>, "batches": <count as float>}``.
        Both values are ``0.0`` if nothing was timed (``timed_batches`` is 0 or
        the loader yields no batches).
    """
    model.eval()

    def _sync() -> None:
        if device.type == "cuda":
            torch.cuda.synchronize()

    batch_iter = iter(loader)

    def _next_batch():
        """Return the next batch, restarting the loader once; None if it is empty."""
        nonlocal batch_iter
        try:
            return next(batch_iter)
        except StopIteration:
            batch_iter = iter(loader)
            return next(batch_iter, None)

    nothing_timed = {"ms_per_batch": 0.0, "batches": 0.0}

    # Warmup
    for _ in range(warmup_batches):
        batch = _next_batch()
        if batch is None:
            return nothing_timed
        _ = model(batch.to(device))
    _sync()

    # Timed
    times: list[float] = []
    for _ in range(timed_batches):
        batch = _next_batch()
        if batch is None:
            break
        batch = batch.to(device)

        _sync()
        t0 = time.perf_counter()
        _ = model(batch)
        _sync()
        times.append(time.perf_counter() - t0)

    if not times:
        return nothing_timed

    avg_s = sum(times) / len(times)
    return {"ms_per_batch": avg_s * 1000.0, "batches": float(len(times))}

Quantization

project_name.quantize

measure_inference_latency

measure_inference_latency(model: torch.nn.Module, loader: DataLoader, device: torch.device, *, warmup_batches: int = 10, timed_batches: int = 50) -> dict[str, float]

Average latency per batch in ms. Note: CPU inference is the typical use case for quantized models.

Source code in src/project_name/quantize.py

@torch.no_grad()
def measure_inference_latency(
    model: torch.nn.Module,
    loader: DataLoader,
    device: torch.device,
    *,
    warmup_batches: int = 10,
    timed_batches: int = 50,
) -> dict[str, float]:
    """
    Average latency per batch in ms.
    Note: CPU inference is the typical use case for quantized models.

    Batches are drawn from ``loader``; when it is exhausted, iteration restarts
    from the beginning. Only the forward pass is timed. When ``device`` is a
    CUDA device the stream is synchronized around the timed region, so the
    number stays meaningful if the function is called with a GPU model.

    Args:
        model: Model to time; it is put in eval mode.
        loader: Source of batches; each batch must support ``.to(device)``.
        device: Device the batches are moved to.
        warmup_batches: Untimed forward passes run first.
        timed_batches: Forward passes included in the average.

    Returns:
        ``{"ms_per_batch": <mean latency in ms>, "batches": <count as float>}``.
        Both values are ``0.0`` if nothing was timed (``timed_batches`` is 0 or
        the loader yields no batches).
    """
    model.eval()

    def _sync() -> None:
        # No-op on CPU.
        if device.type == "cuda":
            torch.cuda.synchronize()

    batch_iter = iter(loader)

    def _next_batch():
        """Return the next batch, restarting the loader once; None if it is empty."""
        nonlocal batch_iter
        try:
            return next(batch_iter)
        except StopIteration:
            batch_iter = iter(loader)
            return next(batch_iter, None)

    nothing_timed = {"ms_per_batch": 0.0, "batches": 0.0}

    # Warmup
    for _ in range(warmup_batches):
        batch = _next_batch()
        if batch is None:
            return nothing_timed
        _ = model(batch.to(device))
    _sync()

    times: list[float] = []
    for _ in range(timed_batches):
        batch = _next_batch()
        if batch is None:
            break

        batch = batch.to(device)
        _sync()
        t0 = time.perf_counter()
        _ = model(batch)
        _sync()
        times.append(time.perf_counter() - t0)

    if not times:
        return nothing_timed

    avg_s = sum(times) / len(times)
    return {"ms_per_batch": avg_s * 1000.0, "batches": float(len(times))}

quantize_full_model

quantize_full_model(model: torch.nn.Module, scheme: str) -> torch.nn.Module

Apply weight-only INT8 quantization to all linear layers in the model, including those nested inside GraphConv blocks. Uses torchao when available and falls back to the torch.ao dynamic quantization API otherwise.

scheme

"torchao_int8_weight_only" (the default path: any value other than "torch_ao_dynamic" selects it)
"torch_ao_dynamic" (fallback-style dynamic quantization)

Source code in src/project_name/quantize.py

def quantize_full_model(model: torch.nn.Module, scheme: str) -> torch.nn.Module:
    """
    Quantize the ``torch.nn.Linear`` layers of ``model`` to INT8.

    scheme:
      - "torchao_int8_weight_only": torchao ``quantize_`` with
        ``Int8WeightOnlyConfig``. If torchao cannot be imported, a warning is
        logged and the dynamic path below is used instead.
      - "torch_ao_dynamic": ``torch.ao.quantization.quantize_dynamic`` over
        ``torch.nn.Linear`` with ``dtype=torch.qint8``.

    Any other value is handled like "torchao_int8_weight_only" (as before),
    but a warning is now logged so a misspelled scheme does not go unnoticed.

    Args:
        model: Model to quantize.
        scheme: One of the scheme names above.

    Returns:
        The quantized model. On the torchao path this is ``model`` itself,
        modified in place; on the dynamic path it is whatever
        ``quantize_dynamic`` returns.
    """

    def _dynamic_int8() -> torch.nn.Module:
        # Imported lazily, like the original, so the module loads without it.
        from torch.ao.quantization import quantize_dynamic

        return quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)

    if scheme == "torch_ao_dynamic":
        return _dynamic_int8()

    if scheme != "torchao_int8_weight_only":
        logger.warning("Unknown quantization scheme %r; using 'torchao_int8_weight_only'.", scheme)

    # default: torchao
    try:
        from torchao.quantization import Int8WeightOnlyConfig, quantize_
    except Exception as e:
        # Deliberately broad: this is a best-effort import and any failure
        # should fall back rather than abort.
        logger.warning("torchao not available (%s). Falling back to torch.ao.quantization.quantize_dynamic.", e)
        return _dynamic_int8()

    # torchao quantize_ is inplace; returns None or model depending on version,
    # so its return value is ignored and ``model`` is returned explicitly.
    quantize_(model, Int8WeightOnlyConfig())
    return model

Promotion and comparison

project_name.compare_promote

Profiling

project_name.profiling

Profiling utilities for training and evaluation.

TrainingProfiler

Manages profiling across entire training session.

Source code in src/project_name/profiling.py

class TrainingProfiler:
    """Manages profiling across entire training session.

    When enabled, a ``torch.profiler.profile`` context is entered in the
    constructor and exited in :meth:`finalize`; :meth:`step` advances the
    profiler schedule. When disabled, every method is a no-op.
    """

    def __init__(
        self,
        enabled: bool = False,
        output_dir: Optional[Path] = None,
        warmup_steps: int = 1,
        active_steps: int = 10,
        repeat_steps: int = 1,
    ) -> None:
        """Initialize the training profiler.

        Args:
            enabled: Whether to enable profiling.
            output_dir: Directory to save profiling results. Defaults to
                ``profiling_results/run``; a plain string is accepted too.
            warmup_steps: ``warmup`` value of the profiler schedule.
            active_steps: ``active`` value of the profiler schedule.
            repeat_steps: ``repeat`` value of the profiler schedule.
        """
        self.enabled = enabled
        # Normalise to Path so the default and string inputs behave alike.
        self.output_dir = Path(output_dir) if output_dir else Path("profiling_results/run")
        self.prof: Optional[profile] = None

        if self.enabled:
            self.output_dir.mkdir(parents=True, exist_ok=True)
            activities = [ProfilerActivity.CPU]
            if torch.cuda.is_available():
                activities.append(ProfilerActivity.CUDA)
            self.prof = profile(
                activities=activities,
                record_shapes=True,
                profile_memory=True,
                schedule=torch.profiler.schedule(
                    wait=0,
                    warmup=warmup_steps,
                    active=active_steps,
                    repeat=repeat_steps,
                ),
                # Pass the resolved directory: the raw ``output_dir`` argument
                # is None whenever the default is in use.
                on_trace_ready=tensorboard_trace_handler(str(self.output_dir)),
            )
            self.prof.__enter__()

    def step(self) -> None:
        """Record a step (epoch) in the profiler."""
        if self.prof:
            self.prof.step()

    def finalize(self) -> None:
        """Finalize profiling and export trace.

        Safe to call more than once: the profiler is exited only the first time.
        """
        prof, self.prof = self.prof, None
        if prof:
            prof.__exit__(None, None, None)
            print(f"✅ Profiling trace saved to {self.output_dir}")

__init__

__init__(enabled: bool = False, output_dir: Optional[Path] = None, warmup_steps: int = 1, active_steps: int = 10, repeat_steps: int = 1) -> None

Initialize the training profiler.

Parameters:

Name	Type	Description	Default
`enabled`	`bool`	Whether to enable profiling.	`False`
`output_dir`	`Optional[Path]`	Directory to save profiling results.	`None`

Source code in src/project_name/profiling.py

def __init__(
    self,
    enabled: bool = False,
    output_dir: Optional[Path] = None,
    warmup_steps: int = 1,
    active_steps: int = 10,
    repeat_steps: int = 1,
) -> None:
    """Initialize the training profiler.

    Args:
        enabled: Whether to enable profiling.
        output_dir: Directory to save profiling results. Defaults to
            ``profiling_results/run``; a plain string is accepted too.
        warmup_steps: ``warmup`` value of the profiler schedule.
        active_steps: ``active`` value of the profiler schedule.
        repeat_steps: ``repeat`` value of the profiler schedule.
    """
    self.enabled = enabled
    # Normalise to Path so the default and string inputs behave alike.
    self.output_dir = Path(output_dir) if output_dir else Path("profiling_results/run")
    self.prof: Optional[profile] = None

    if self.enabled:
        self.output_dir.mkdir(parents=True, exist_ok=True)
        activities = [ProfilerActivity.CPU]
        if torch.cuda.is_available():
            activities.append(ProfilerActivity.CUDA)
        self.prof = profile(
            activities=activities,
            record_shapes=True,
            profile_memory=True,
            schedule=torch.profiler.schedule(
                wait=0,
                warmup=warmup_steps,
                active=active_steps,
                repeat=repeat_steps,
            ),
            # Pass the resolved directory: the raw ``output_dir`` argument is
            # None whenever the default is in use.
            on_trace_ready=tensorboard_trace_handler(str(self.output_dir)),
        )
        self.prof.__enter__()

finalize

finalize() -> None

Finalize profiling and export trace.

Source code in src/project_name/profiling.py

def finalize(self) -> None:
    """Finalize profiling and export trace.

    Safe to call more than once: the profiler is exited only the first time.
    """
    # Clear the reference first so a repeated call cannot exit the profiler twice.
    prof, self.prof = self.prof, None
    if prof:
        prof.__exit__(None, None, None)
        print(f"✅ Profiling trace saved to {self.output_dir}")

step

step() -> None

Record a step (epoch) in the profiler.

Source code in src/project_name/profiling.py

def step(self) -> None:
    """Advance the profiler by one step (one epoch); no-op without a profiler."""
    active = self.prof
    if not active:
        return
    active.step()

timing_checkpoint

timing_checkpoint(name: str, enabled: bool = True) -> Generator

Context manager for simple timing measurements.

Parameters:

Name	Type	Description	Default
`name`	`str`	Name for this checkpoint.	required
`enabled`	`bool`	Whether to enable timing.	`True`

Yields:

Type	Description
`Generator`	Dictionary with timing results.

Source code in src/project_name/profiling.py

@contextmanager
def timing_checkpoint(name: str, enabled: bool = True) -> Generator:
    """Time the body of a ``with`` block using ``time.perf_counter``.

    Args:
        name: Label stored in the result and printed with the duration.
        enabled: When False, nothing is measured or printed and the yielded
            duration stays ``0.0``.

    Yields:
        A dict with keys ``name`` and ``duration``; ``duration`` is filled in
        (in seconds) when the block exits, including when it raises.
    """
    record = {"name": name, "duration": 0.0}

    if enabled:
        began = time.perf_counter()
        try:
            yield record
        finally:
            # Runs on normal exit and on exceptions alike.
            record["duration"] = time.perf_counter() - began
            print(f"⏱️  {name}: {record['duration']:.4f}s")
    else:
        yield record