Vedant Jigarbhai Mehta committed
Commit · 0cbf4d6
Parent(s): b25c087
Implement full training loop and visualization utilities
train.py: AMP, gradient accumulation, gradient clipping, warmup +
cosine scheduler, MetricTracker integration, early stopping on val F1,
checkpoint resume (model + optimizer + scheduler + scaler state),
auto GPU batch-size detection, TensorBoard logging with prediction grids,
conditional Colab/local paths, training time summary.
utils/visualization.py: Agg backend for headless environments,
plot_prediction (1x4 grid), overlay_changes (uint8 output),
plot_metrics_history (per-metric subplots),
log_predictions_to_tensorboard (interleaved sample grid).
- train.py +474 -211
- utils/visualization.py +217 -61
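The training script pulls its per-model batch sizes, learning rates, epoch counts, Colab/local paths, and early-stopping settings from the YAML config. The following is a hypothetical minimal sketch of that structure, written as the Python dict that yaml.safe_load would return; the key names are inferred from the config.get(...) calls in train.py below, and every value shown is illustrative only, not the project's real configuration.

# Illustrative config structure only. Keys follow the lookups in train.py;
# values are placeholders, not the repository's actual configs/config.yaml.
config = {
    "project": {"seed": 42},
    "model": {"name": "unet_pp"},
    "training": {
        "learning_rate": 1e-4,
        "epochs": 100,
        "weight_decay": 1e-2,
        "gradient_accumulation_steps": 2,
        "grad_clip_max_norm": 1.0,
        "warmup_epochs": 5,
        "vis_interval": 5,
        "amp": True,
        "early_stopping": {"enabled": True, "patience": 15},
    },
    "learning_rates": {"changeformer": 6e-5},   # optional per-model overrides
    "epoch_counts": {"changeformer": 150},
    "batch_sizes": {"unet_pp": {"T4": 8, "V100": 16, "default": 4}},
    "evaluation": {"threshold": 0.5},
    "dataset": {"num_workers": 4, "pin_memory": True},
    "colab": {
        "enabled": False,
        "data_dir": "/content/data",
        "checkpoint_dir": "/content/drive/MyDrive/change-detection/checkpoints",
        "log_dir": "/content/drive/MyDrive/change-detection/logs",
        "output_dir": "/content/drive/MyDrive/change-detection/outputs",
    },
    "paths": {
        "processed_data": "./processed_data",
        "checkpoint_dir": "./checkpoints",
        "log_dir": "./logs",
        "output_dir": "./outputs",
    },
}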
train.py
CHANGED
@@ -1,25 +1,28 @@
"""Main training script for change detection models.

Supports mixed-precision training, gradient accumulation, gradient clipping,
early stopping on validation F1, checkpoint saving (best + last) to Google
Drive or local disk, and full resume from checkpoint after Colab disconnects.

Usage:
    python train.py --config configs/config.yaml --model unet_pp
    python train.py --config configs/config.yaml --model changeformer \
        --resume /content/drive/MyDrive/change-detection/checkpoints/changeformer_last.pth
"""

import argparse
import logging
import random
import time
from pathlib import Path
from typing import Any, Dict, Optional, Tuple

import numpy as np
import torch
import torch.nn as nn
from torch.cuda.amp import GradScaler, autocast
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
@@ -28,14 +31,21 @@ import yaml
from data.dataset import ChangeDetectionDataset
from models import get_model
from utils.losses import get_loss
from utils.metrics import MetricTracker
from utils.visualization import log_predictions_to_tensorboard

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Reproducibility
# ---------------------------------------------------------------------------

def set_seed(seed: int) -> None:
    """Set all random seeds for reproducibility.

    Configures Python, NumPy, PyTorch (CPU + CUDA), and cuDNN for
    deterministic behaviour.

    Args:
        seed: Random seed value.
@@ -48,11 +58,15 @@ def set_seed(seed: int) -> None:
    torch.backends.cudnn.benchmark = False


# ---------------------------------------------------------------------------
# GPU / config helpers
# ---------------------------------------------------------------------------

def detect_gpu_type() -> str:
    """Detect the current GPU type for automatic batch-size selection.

    Returns:
        One of ``'T4'``, ``'V100'``, or ``'default'``.
    """
    if not torch.cuda.is_available():
        return "default"
@@ -65,80 +79,252 @@


def get_batch_size(config: Dict[str, Any], model_name: str) -> int:
    """Look up the batch size for the current GPU + model combination.

    Args:
        config: Full project config dict.
        model_name: Model identifier string.

    Returns:
        Batch size as an integer.
    """
    gpu_type = detect_gpu_type()
    model_sizes = config.get("batch_sizes", {}).get(model_name, {})
    return model_sizes.get(gpu_type, model_sizes.get("default", 4))


def get_learning_rate(config: Dict[str, Any], model_name: str) -> float:
    """Look up the per-model learning rate, falling back to the global default.

    Args:
        config: Full project config dict.
        model_name: Model identifier string.

    Returns:
        Learning rate as a float.
    """
    return config.get("learning_rates", {}).get(
        model_name, config["training"]["learning_rate"]
    )


def get_num_epochs(config: Dict[str, Any], model_name: str) -> int:
    """Look up the per-model epoch count, falling back to the global default.

    Args:
        config: Full project config dict.
        model_name: Model identifier string.

    Returns:
        Number of epochs as an integer.
    """
    return config.get("epoch_counts", {}).get(
        model_name, config["training"]["epochs"]
    )


def resolve_paths(config: Dict[str, Any]) -> Dict[str, Path]:
    """Build a path dict based on whether Colab mode is enabled.

    When ``config["colab"]["enabled"]`` is ``True`` all persistent artefacts
    point to Google Drive; otherwise they use the local ``paths`` section.

    Args:
        config: Full project config dict.

    Returns:
        Dict with keys ``'data'``, ``'checkpoints'``, ``'logs'``, ``'outputs'``.
    """
    if config.get("colab", {}).get("enabled", False):
        c = config["colab"]
        return {
            "data": Path(c["data_dir"]),
            "checkpoints": Path(c["checkpoint_dir"]),
            "logs": Path(c["log_dir"]),
            "outputs": Path(c["output_dir"]),
        }

    p = config.get("paths", {})
    return {
        "data": Path(p.get("processed_data", "./processed_data")),
        "checkpoints": Path(p.get("checkpoint_dir", "./checkpoints")),
        "logs": Path(p.get("log_dir", "./logs")),
        "outputs": Path(p.get("output_dir", "./outputs")),
    }


# ---------------------------------------------------------------------------
# Data
# ---------------------------------------------------------------------------

def build_dataloaders(
    config: Dict[str, Any],
    data_dir: Path,
    batch_size: int,
) -> Tuple[DataLoader, DataLoader]:
    """Create training and validation ``DataLoader`` instances.

    Args:
        config: Full project config dict.
        data_dir: Root of the processed dataset (contains ``train/``, ``val/``).
        batch_size: Mini-batch size.

    Returns:
        Tuple of ``(train_loader, val_loader)``.
    """
    ds_cfg = config.get("dataset", {})
    num_workers = ds_cfg.get("num_workers", 4)
    pin_memory = ds_cfg.get("pin_memory", True)

    train_ds = ChangeDetectionDataset(
        root=data_dir / "train", split="train", config=config,
    )
    val_ds = ChangeDetectionDataset(
        root=data_dir / "val", split="val", config=config,
    )

    train_loader = DataLoader(
        train_ds,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        pin_memory=pin_memory,
        drop_last=True,
    )
    val_loader = DataLoader(
        val_ds,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=pin_memory,
    )
    return train_loader, val_loader


# ---------------------------------------------------------------------------
# Scheduler with linear warmup
# ---------------------------------------------------------------------------

def build_scheduler(
    optimizer: torch.optim.Optimizer,
    total_epochs: int,
    warmup_epochs: int,
) -> torch.optim.lr_scheduler._LRScheduler:
    """Create a CosineAnnealingLR scheduler preceded by linear warmup.

    During the first ``warmup_epochs`` the LR ramps linearly from
    ``start_factor`` to the base LR, then cosine-decays for the remainder.

    Args:
        optimizer: Optimizer whose LR groups will be scheduled.
        total_epochs: Total number of training epochs.
        warmup_epochs: Number of warmup epochs (0 to disable).

    Returns:
        A learning-rate scheduler instance.
    """
    if warmup_epochs > 0 and warmup_epochs < total_epochs:
        warmup = LinearLR(
            optimizer,
            start_factor=0.01,
            end_factor=1.0,
            total_iters=warmup_epochs,
        )
        cosine = CosineAnnealingLR(
            optimizer,
            T_max=total_epochs - warmup_epochs,
        )
        return SequentialLR(
            optimizer,
            schedulers=[warmup, cosine],
            milestones=[warmup_epochs],
        )

    return CosineAnnealingLR(optimizer, T_max=total_epochs)


# ---------------------------------------------------------------------------
# Checkpointing
# ---------------------------------------------------------------------------

def save_checkpoint(
    model: nn.Module,
    optimizer: torch.optim.Optimizer,
    scheduler: torch.optim.lr_scheduler._LRScheduler,
    scaler: GradScaler,
    epoch: int,
    best_f1: float,
    best_epoch: int,
    save_path: Path,
) -> None:
    """Persist a full training checkpoint to disk.

    Args:
        model: Model whose weights to save.
        optimizer: Optimizer state to save.
        scheduler: LR scheduler state to save.
        scaler: ``GradScaler`` state to save.
        epoch: Epoch number just completed (1-indexed).
        best_f1: Best validation F1 achieved so far.
        best_epoch: Epoch that achieved ``best_f1``.
        save_path: Destination file path.
    """
    save_path.parent.mkdir(parents=True, exist_ok=True)
    torch.save(
        {
            "epoch": epoch,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "scheduler_state_dict": scheduler.state_dict(),
            "scaler_state_dict": scaler.state_dict(),
            "best_f1": best_f1,
            "best_epoch": best_epoch,
        },
        save_path,
    )
    logger.info("Checkpoint saved → %s", save_path)


def load_checkpoint(
    path: Path,
    model: nn.Module,
    optimizer: torch.optim.Optimizer,
    scheduler: torch.optim.lr_scheduler._LRScheduler,
    scaler: GradScaler,
    device: torch.device,
) -> Tuple[int, float, int]:
    """Restore training state from a checkpoint.

    Args:
        path: Checkpoint file to load.
        model: Model to receive saved weights.
        optimizer: Optimizer to receive saved state.
        scheduler: Scheduler to receive saved state.
        scaler: ``GradScaler`` to receive saved state.
        device: Target device for ``map_location``.

    Returns:
        Tuple of ``(start_epoch, best_f1, best_epoch)``.
    """
    ckpt = torch.load(path, map_location=device)
    model.load_state_dict(ckpt["model_state_dict"])
    optimizer.load_state_dict(ckpt["optimizer_state_dict"])
    scheduler.load_state_dict(ckpt["scheduler_state_dict"])
    scaler.load_state_dict(ckpt["scaler_state_dict"])
    best_f1 = ckpt["best_f1"]
    best_epoch = ckpt.get("best_epoch", ckpt["epoch"])
    logger.info(
        "Resumed from epoch %d (best F1: %.4f @ epoch %d)",
        ckpt["epoch"], best_f1, best_epoch,
    )
    return ckpt["epoch"], best_f1, best_epoch


# ---------------------------------------------------------------------------
# Train / validate one epoch
# ---------------------------------------------------------------------------

def train_one_epoch(
    model: nn.Module,
    loader: DataLoader,
@@ -146,36 +332,37 @@ def train_one_epoch(
    optimizer: torch.optim.Optimizer,
    scaler: GradScaler,
    device: torch.device,
    tracker: MetricTracker,
    accum_steps: int,
    grad_clip: float,
) -> Tuple[float, Dict[str, float]]:
    """Execute one full training epoch.

    Args:
        model: Change-detection model.
        loader: Training ``DataLoader``.
        criterion: Loss module (operates on raw logits).
        optimizer: Optimiser instance.
        scaler: ``GradScaler`` for mixed-precision training.
        device: Target CUDA / CPU device.
        tracker: ``MetricTracker`` (reset externally before this call).
        accum_steps: Number of gradient-accumulation micro-steps.
        grad_clip: Maximum gradient norm for clipping.

    Returns:
        Tuple of ``(average_loss, metrics_dict)``.
    """
    model.train()
    running_loss = 0.0
    num_batches = 0

    optimizer.zero_grad(set_to_none=True)

    pbar = tqdm(loader, desc="  Train", leave=False, dynamic_ncols=True)
    for step, batch in enumerate(pbar):
        img_a = batch["A"].to(device, non_blocking=True)
        img_b = batch["B"].to(device, non_blocking=True)
        mask = batch["mask"].to(device, non_blocking=True)

        with autocast():
            logits = model(img_a, img_b)
@@ -188,17 +375,19 @@ def train_one_epoch(
            nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)

        # Track loss (undo the accumulation scaling for logging)
        running_loss += loss.item() * accum_steps
        num_batches += 1

        # Track metrics (MetricTracker handles sigmoid + threshold internally)
        tracker.update(logits.detach(), mask)

        pbar.set_postfix(loss=f"{running_loss / num_batches:.4f}")

    avg_loss = running_loss / max(num_batches, 1)
    metrics = tracker.compute()
    return avg_loss, metrics


@@ -208,210 +397,284 @@ def validate(
    loader: DataLoader,
    criterion: nn.Module,
    device: torch.device,
    tracker: MetricTracker,
) -> Tuple[float, Dict[str, float], Optional[Dict[str, torch.Tensor]]]:
    """Run one full validation pass.

    Args:
        model: Change-detection model (set to eval internally).
        loader: Validation ``DataLoader``.
        criterion: Loss module (operates on raw logits).
        device: Target device.
        tracker: ``MetricTracker`` (reset externally before this call).

    Returns:
        Tuple of ``(average_loss, metrics_dict, last_batch)`` where
        ``last_batch`` is the final mini-batch dict (for visualisation).
    """
    model.eval()
    running_loss = 0.0
    num_batches = 0
    last_batch: Optional[Dict[str, torch.Tensor]] = None

    pbar = tqdm(loader, desc="  Val  ", leave=False, dynamic_ncols=True)
    for batch in pbar:
        img_a = batch["A"].to(device, non_blocking=True)
        img_b = batch["B"].to(device, non_blocking=True)
        mask = batch["mask"].to(device, non_blocking=True)

        logits = model(img_a, img_b)
        loss = criterion(logits, mask)

        running_loss += loss.item()
        num_batches += 1
        tracker.update(logits, mask)

        # Keep the last batch for TensorBoard visualisation
        last_batch = {
            "A": img_a,
            "B": img_b,
            "mask": mask,
            "logits": logits,
        }

        pbar.set_postfix(loss=f"{running_loss / num_batches:.4f}")

    avg_loss = running_loss / max(num_batches, 1)
    metrics = tracker.compute()
    return avg_loss, metrics, last_batch


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main() -> None:
    """Entry point — parse CLI args, build components, run training loop."""
    # ---- CLI ----------------------------------------------------------
    parser = argparse.ArgumentParser(
        description="Train a change-detection model",
    )
    parser.add_argument(
        "--config", type=Path, default=Path("configs/config.yaml"),
        help="Path to the YAML configuration file.",
    )
    parser.add_argument(
        "--model", type=str, default=None,
        help="Override the model name from config (siamese_cnn | unet_pp | changeformer).",
    )
    parser.add_argument(
        "--resume", type=Path, default=None,
        help="Path to a checkpoint file to resume training from.",
    )
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    # ---- Config -------------------------------------------------------
    with open(args.config, "r") as fh:
        config: Dict[str, Any] = yaml.safe_load(fh)

    model_name: str = args.model or config["model"]["name"]
    train_cfg = config["training"]
    seed: int = config.get("project", {}).get("seed", 42)

    set_seed(seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    gpu_type = detect_gpu_type()
    logger.info("Device: %s | GPU type: %s", device, gpu_type)

    # ---- Paths --------------------------------------------------------
    paths = resolve_paths(config)
    for p in paths.values():
        p.mkdir(parents=True, exist_ok=True)

    # ---- Hyperparams (auto from per-model tables) ---------------------
    batch_size = get_batch_size(config, model_name)
    lr = get_learning_rate(config, model_name)
    num_epochs = get_num_epochs(config, model_name)
    accum_steps: int = train_cfg.get("gradient_accumulation_steps", 1)
    grad_clip: float = train_cfg.get("grad_clip_max_norm", 1.0)
    warmup_epochs: int = train_cfg.get("warmup_epochs", 5)
    vis_interval: int = train_cfg.get("vis_interval", 5)
    threshold: float = config.get("evaluation", {}).get("threshold", 0.5)

    logger.info(
        "Hyperparams → model=%s bs=%d lr=%.1e epochs=%d accum=%d warmup=%d",
        model_name, batch_size, lr, num_epochs, accum_steps, warmup_epochs,
    )

    # ---- Model --------------------------------------------------------
    model = get_model(model_name, config).to(device)
    param_count = sum(p.numel() for p in model.parameters()) / 1e6
    logger.info("Model: %s (%.2fM parameters)", model_name, param_count)

    # ---- Data ---------------------------------------------------------
    train_loader, val_loader = build_dataloaders(config, paths["data"], batch_size)
    logger.info(
        "Data: %d train batches, %d val batches (batch_size=%d)",
        len(train_loader), len(val_loader), batch_size,
    )

    # ---- Loss / optimiser / scheduler ---------------------------------
    criterion = get_loss(config).to(device)

    optimizer = AdamW(
        model.parameters(),
        lr=lr,
        weight_decay=train_cfg["weight_decay"],
    )

    scheduler = build_scheduler(optimizer, num_epochs, warmup_epochs)
    scaler = GradScaler(enabled=train_cfg.get("amp", True))

    # ---- TensorBoard --------------------------------------------------
    writer = SummaryWriter(log_dir=str(paths["logs"] / model_name))

    # ---- MetricTrackers -----------------------------------------------
    train_tracker = MetricTracker(threshold=threshold)
    val_tracker = MetricTracker(threshold=threshold)

    # ---- Resume -------------------------------------------------------
    start_epoch: int = 0
    best_f1: float = 0.0
    best_epoch: int = 0

    if args.resume is not None and args.resume.exists():
        start_epoch, best_f1, best_epoch = load_checkpoint(
            args.resume, model, optimizer, scheduler, scaler, device,
        )
    elif args.resume is not None:
        logger.warning("Resume path does not exist: %s — training from scratch", args.resume)

    # ---- Early stopping state -----------------------------------------
    es_cfg = train_cfg.get("early_stopping", {})
    es_enabled: bool = es_cfg.get("enabled", True)
    patience: int = es_cfg.get("patience", 15)
    patience_counter: int = 0

    # ---- Training loop ------------------------------------------------
    wall_start = time.monotonic()

    logger.info("=" * 60)
    logger.info("Starting training from epoch %d", start_epoch + 1)
    logger.info("=" * 60)

    for epoch in range(start_epoch, num_epochs):
        epoch_start = time.monotonic()
        epoch_num = epoch + 1  # 1-indexed for display / checkpoints

        current_lr = optimizer.param_groups[0]["lr"]
        logger.info("Epoch %d/%d (lr=%.2e)", epoch_num, num_epochs, current_lr)

        # -- Train ------------------------------------------------------
        train_tracker.reset()
        train_loss, train_metrics = train_one_epoch(
            model, train_loader, criterion, optimizer, scaler, device,
            train_tracker, accum_steps, grad_clip,
        )

        # -- Validate ---------------------------------------------------
        val_tracker.reset()
        val_loss, val_metrics, last_val_batch = validate(
            model, val_loader, criterion, device, val_tracker,
        )

        # -- Step scheduler (after both train + val) --------------------
        scheduler.step()

        # -- TensorBoard scalars ----------------------------------------
        writer.add_scalar("Loss/train", train_loss, epoch_num)
        writer.add_scalar("Loss/val", val_loss, epoch_num)
        writer.add_scalar("LR", current_lr, epoch_num)

        for key, value in train_metrics.items():
            writer.add_scalar(f"Train/{key}", value, epoch_num)
        for key, value in val_metrics.items():
            writer.add_scalar(f"Val/{key}", value, epoch_num)

        # -- TensorBoard prediction images ------------------------------
        if last_val_batch is not None and epoch_num % vis_interval == 0:
            log_predictions_to_tensorboard(
                writer,
                img_a=last_val_batch["A"],
                img_b=last_val_batch["B"],
                mask_true=last_val_batch["mask"],
                mask_pred=last_val_batch["logits"],
                step=epoch_num,
                num_samples=4,
            )

        # -- Console log ------------------------------------------------
        epoch_time = time.monotonic() - epoch_start
        logger.info(
            "  Train — loss: %.4f | F1: %.4f | IoU: %.4f",
            train_loss, train_metrics["f1"], train_metrics["iou"],
        )
        logger.info(
            "  Val   — loss: %.4f | F1: %.4f | IoU: %.4f | Prec: %.4f | Rec: %.4f | OA: %.4f",
            val_loss,
            val_metrics["f1"],
            val_metrics["iou"],
            val_metrics["precision"],
            val_metrics["recall"],
            val_metrics["oa"],
        )
        logger.info("  Epoch time: %.1fs", epoch_time)

        # -- Save last checkpoint (every epoch) -------------------------
        save_checkpoint(
            model, optimizer, scheduler, scaler,
            epoch=epoch_num,
            best_f1=best_f1,
            best_epoch=best_epoch,
            save_path=paths["checkpoints"] / f"{model_name}_last.pth",
        )

        # -- Save best checkpoint (if improved) -------------------------
        if val_metrics["f1"] > best_f1:
            best_f1 = val_metrics["f1"]
            best_epoch = epoch_num
            patience_counter = 0
            save_checkpoint(
                model, optimizer, scheduler, scaler,
                epoch=epoch_num,
                best_f1=best_f1,
                best_epoch=best_epoch,
                save_path=paths["checkpoints"] / f"{model_name}_best.pth",
            )
            logger.info("  ★ New best F1: %.4f (epoch %d)", best_f1, best_epoch)
        else:
            patience_counter += 1
            logger.info(
                "  No improvement (%d/%d patience)", patience_counter, patience,
            )

        # -- Early stopping ---------------------------------------------
        if es_enabled and patience_counter >= patience:
            logger.info(
                "Early stopping triggered after %d epochs without improvement.",
                patience,
            )
            break

    # ---- Summary ------------------------------------------------------
    writer.close()
    total_time = time.monotonic() - wall_start
    hours, remainder = divmod(total_time, 3600)
    minutes, seconds = divmod(remainder, 60)

    logger.info("=" * 60)
    logger.info("Training complete.")
    logger.info("  Best val F1 : %.4f (epoch %d)", best_f1, best_epoch)
    logger.info("  Total time  : %dh %dm %ds", int(hours), int(minutes), int(seconds))
    logger.info("  Checkpoints : %s", paths["checkpoints"])
    logger.info("=" * 60)


if __name__ == "__main__":
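train.py imports MetricTracker from utils/metrics.py, which is not part of this commit. The following is a minimal sketch of the interface the script appears to assume: a constructor taking threshold, reset(), update(logits, mask) that applies sigmoid and thresholding internally, and compute() returning the keys f1, iou, precision, recall, and oa. The actual class in the repository may be implemented differently; this is only an assumption-based illustration.

# Hypothetical sketch of the MetricTracker interface assumed by train.py.
# The real utils/metrics.py may differ; only the call pattern is inferred.
import torch


class MetricTracker:
    """Accumulate a binary confusion matrix over batches of logits and masks."""

    def __init__(self, threshold: float = 0.5) -> None:
        self.threshold = threshold
        self.reset()

    def reset(self) -> None:
        # Running confusion-matrix counts
        self.tp = self.fp = self.fn = self.tn = 0.0

    def update(self, logits: torch.Tensor, mask: torch.Tensor) -> None:
        # Sigmoid + threshold happen here, matching the comment in train.py
        preds = (torch.sigmoid(logits) > self.threshold).float()
        target = (mask > 0.5).float()
        self.tp += float((preds * target).sum())
        self.fp += float((preds * (1 - target)).sum())
        self.fn += float(((1 - preds) * target).sum())
        self.tn += float(((1 - preds) * (1 - target)).sum())

    def compute(self, eps: float = 1e-7) -> dict:
        precision = self.tp / (self.tp + self.fp + eps)
        recall = self.tp / (self.tp + self.fn + eps)
        f1 = 2 * precision * recall / (precision + recall + eps)
        iou = self.tp / (self.tp + self.fp + self.fn + eps)
        oa = (self.tp + self.tn) / (self.tp + self.fp + self.fn + self.tn + eps)
        return {"f1": f1, "iou": iou, "precision": precision, "recall": recall, "oa": oa}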
utils/visualization.py
CHANGED
@@ -1,141 +1,297 @@
"""Visualization utilities for change detection results.

Provides helpers for:
- Plotting side-by-side predictions (Before | After | GT | Pred)
- Overlaying predicted change masks on satellite images
- Plotting metric curves across epochs
- Logging sample prediction grids to TensorBoard

All public functions accept **ImageNet-normalised** ``torch.Tensor`` inputs
with shape ``[C, H, W]`` and handle denormalisation internally. The Agg
backend is set at import time so the module works in headless environments
(Google Colab, CI, remote servers).
"""

import matplotlib
matplotlib.use("Agg")  # headless backend — must be set before pyplot import

import logging
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

import matplotlib.pyplot as plt
import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter
import torchvision.utils as vutils

logger = logging.getLogger(__name__)

# ImageNet constants (duplicated here to avoid circular imports from data/)
_IMAGENET_MEAN = np.array([0.485, 0.456, 0.406], dtype=np.float32)
_IMAGENET_STD = np.array([0.229, 0.224, 0.225], dtype=np.float32)


# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------

def _to_numpy_hwc(tensor: torch.Tensor) -> np.ndarray:
    """Convert a ``[C, H, W]`` torch tensor to ``[H, W, C]`` numpy array.

    Args:
        tensor: Image tensor of shape ``[C, H, W]``.

    Returns:
        Numpy array of shape ``[H, W, C]`` (float32).
    """
    return tensor.detach().cpu().float().permute(1, 2, 0).numpy()


def _mask_to_numpy(tensor: torch.Tensor) -> np.ndarray:
    """Convert a ``[1, H, W]`` mask tensor to ``[H, W]`` numpy array.

    Args:
        tensor: Mask tensor of shape ``[1, H, W]``.

    Returns:
        Numpy array of shape ``[H, W]`` (float32).
    """
    return tensor.detach().cpu().float().squeeze(0).numpy()


def denormalize(
    img: np.ndarray,
    mean: np.ndarray = _IMAGENET_MEAN,
    std: np.ndarray = _IMAGENET_STD,
) -> np.ndarray:
    """Reverse ImageNet normalisation for display.

    Args:
        img: Normalised image of shape ``[H, W, 3]`` (float32).
        mean: Per-channel means used during normalisation.
        std: Per-channel standard deviations used during normalisation.

    Returns:
        Denormalised image clipped to ``[0, 1]``.
    """
    return np.clip(img * std + mean, 0.0, 1.0)


def _denorm_tensor(tensor: torch.Tensor) -> np.ndarray:
    """Shortcut: ``[C, H, W]`` tensor → denormalised ``[H, W, C]`` numpy.

    Args:
        tensor: ImageNet-normalised image ``[C, H, W]``.

    Returns:
        Denormalised numpy array ``[H, W, C]`` in ``[0, 1]``.
    """
    return denormalize(_to_numpy_hwc(tensor))


# ---------------------------------------------------------------------------
# 1. plot_prediction
# ---------------------------------------------------------------------------

def plot_prediction(
    img_a: torch.Tensor,
    img_b: torch.Tensor,
    mask_true: torch.Tensor,
    mask_pred: torch.Tensor,
    filename: Optional[Union[str, Path]] = None,
) -> plt.Figure:
    """Plot a single change-detection prediction as a 1×4 grid.

    Columns: **Before (A)** | **After (B)** | **Ground Truth** | **Prediction**.

    Images are denormalised from ImageNet stats before display. Masks are
    rendered in binary black / white.

    Args:
        img_a: Before image ``[3, H, W]`` (ImageNet-normalised).
        img_b: After image ``[3, H, W]`` (ImageNet-normalised).
        mask_true: Ground-truth binary mask ``[1, H, W]`` (0 or 1).
        mask_pred: Predicted mask ``[1, H, W]`` (binary or probability).
        filename: If provided, save the figure to this path and close it.
            Otherwise the caller is responsible for ``plt.close(fig)``.

    Returns:
        The ``matplotlib.figure.Figure`` object.
    """
    a_np = _denorm_tensor(img_a)
    b_np = _denorm_tensor(img_b)
    gt_np = _mask_to_numpy(mask_true)
    pred_np = _mask_to_numpy(mask_pred)

    # Binarise prediction for clean display (handles probability maps)
    pred_np = (pred_np > 0.5).astype(np.float32)

    fig, axes = plt.subplots(1, 4, figsize=(16, 4))
    titles = ["Before (A)", "After (B)", "Ground Truth", "Prediction"]
    images = [a_np, b_np, gt_np, pred_np]
    cmaps = [None, None, "gray", "gray"]

    for ax, img, title, cmap in zip(axes, images, titles, cmaps):
        ax.imshow(img, cmap=cmap, vmin=0, vmax=1)
        ax.set_title(title, fontsize=11)
        ax.axis("off")

    fig.tight_layout(pad=1.0)

    if filename is not None:
        path = Path(filename)
        path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(path, dpi=150, bbox_inches="tight")
        plt.close(fig)
        logger.debug("Saved prediction plot: %s", path)

    return fig


# ---------------------------------------------------------------------------
# 2. overlay_changes
# ---------------------------------------------------------------------------

def overlay_changes(
    img_after: torch.Tensor,
    mask_pred: torch.Tensor,
    alpha: float = 0.4,
    color: Tuple[int, int, int] = (255, 0, 0),
) -> np.ndarray:
    """Overlay predicted change pixels on the *after* image.

    Changed pixels are tinted with ``color`` at the given ``alpha``
    transparency; unchanged pixels are left as-is.

    Args:
        img_after: After image ``[3, H, W]`` (ImageNet-normalised).
        mask_pred: Predicted binary mask ``[1, H, W]`` (0 or 1).
        alpha: Blending factor for the overlay colour (0 = transparent,
            1 = fully opaque).
        color: RGB overlay colour as **uint8** values in ``[0, 255]``
            (default red).

    Returns:
        Composited RGB image as a **uint8** numpy array ``[H, W, 3]``
        with values in ``[0, 255]``, ready for ``cv2.imwrite`` or display.
    """
    base = _denorm_tensor(img_after)   # [H, W, 3], float32 in [0, 1]
    mask = _mask_to_numpy(mask_pred)   # [H, W], float32

    # Normalise colour to [0, 1]
    color_f = np.array(color, dtype=np.float32) / 255.0

    overlay = base.copy()
    change_mask = mask > 0.5
    for c in range(3):
        overlay[:, :, c] = np.where(
            change_mask,
            base[:, :, c] * (1.0 - alpha) + color_f[c] * alpha,
            base[:, :, c],
        )

    return (overlay * 255.0).astype(np.uint8)


# ---------------------------------------------------------------------------
# 3. plot_metrics_history
# ---------------------------------------------------------------------------

def plot_metrics_history(
    history_dict: Dict[str, List[float]],
    save_path: Optional[Union[str, Path]] = None,
) -> plt.Figure:
    """Plot training / validation metric curves across epochs.

    Creates one subplot per metric key. Suitable for inclusion in reports
    or as a TensorBoard-compatible image.

    Args:
        history_dict: Mapping from metric name to a list of per-epoch
            values, e.g. ``{"f1": [0.5, 0.6, ...], "loss": [0.8, ...]}``.
        save_path: If provided, save the figure and close it.

    Returns:
        The ``matplotlib.figure.Figure`` object.
    """
    n_metrics = len(history_dict)
    if n_metrics == 0:
        fig, _ = plt.subplots()
        return fig

    fig, axes = plt.subplots(1, n_metrics, figsize=(5 * n_metrics, 4))
    if n_metrics == 1:
        axes = [axes]

    for ax, (name, values) in zip(axes, history_dict.items()):
        epochs = list(range(1, len(values) + 1))
        ax.plot(epochs, values, marker="o", markersize=3, linewidth=1.5)
        ax.set_title(name.upper(), fontsize=11)
        ax.set_xlabel("Epoch")
        ax.set_ylabel(name)
        ax.grid(True, alpha=0.3)

    fig.tight_layout(pad=1.5)

    if save_path is not None:
        path = Path(save_path)
        path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(path, dpi=150, bbox_inches="tight")
        plt.close(fig)
        logger.debug("Saved metrics plot: %s", path)

    return fig


# ---------------------------------------------------------------------------
# 4. log_predictions_to_tensorboard
# ---------------------------------------------------------------------------

def log_predictions_to_tensorboard(
    writer: SummaryWriter,
    img_a: torch.Tensor,
    img_b: torch.Tensor,
    mask_true: torch.Tensor,
    mask_pred: torch.Tensor,
    step: int,
    num_samples: int = 4,
) -> None:
    """Log a grid of sample predictions to TensorBoard.

    For each sample the grid contains four rows:
    *Before*, *After*, *Ground Truth*, *Prediction*.

    Images are denormalised; masks are expanded to 3-channel for consistent
    grid rendering.

    Args:
        writer: Active ``SummaryWriter`` instance.
        img_a: Before images ``[B, 3, H, W]`` (ImageNet-normalised).
        img_b: After images ``[B, 3, H, W]`` (ImageNet-normalised).
        mask_true: Ground-truth masks ``[B, 1, H, W]`` (binary).
        mask_pred: Predicted masks ``[B, 1, H, W]`` (binary or probability).
        step: Global training step (used as the x-axis in TensorBoard).
        num_samples: How many samples from the batch to include (taken
            from the front of the batch dimension).
    """
    n = min(num_samples, img_a.size(0))

    # Denormalise images on CPU (keep as tensors for vutils.make_grid)
    mean = torch.tensor(_IMAGENET_MEAN).view(1, 3, 1, 1)
    std = torch.tensor(_IMAGENET_STD).view(1, 3, 1, 1)

    a = (img_a[:n].cpu().float() * std + mean).clamp(0.0, 1.0)
    b = (img_b[:n].cpu().float() * std + mean).clamp(0.0, 1.0)

    # Expand single-channel masks to 3-channel for the grid
    gt = mask_true[:n].cpu().float().expand(-1, 3, -1, -1)
    pred = (mask_pred[:n].cpu().float() > 0.5).float().expand(-1, 3, -1, -1)

    # Interleave: [a0, b0, gt0, pred0, a1, b1, gt1, pred1, ...]
    rows = []
    for i in range(n):
        rows.extend([a[i], b[i], gt[i], pred[i]])

    grid = vutils.make_grid(rows, nrow=4, padding=2, normalize=False)
    writer.add_image("Predictions/before_after_gt_pred", grid, global_step=step)
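A short usage sketch for the plotting helpers above, using random tensors in place of a real validation sample. The shapes follow the docstrings ([3, H, W] images, [1, H, W] masks); the output file name is illustrative.

# Illustrative usage of plot_prediction and overlay_changes with dummy data.
import torch
from utils.visualization import plot_prediction, overlay_changes

img_a = torch.randn(3, 256, 256)            # ImageNet-normalised "before" image
img_b = torch.randn(3, 256, 256)            # ImageNet-normalised "after" image
mask_true = (torch.rand(1, 256, 256) > 0.8).float()
mask_pred = torch.rand(1, 256, 256)         # probability map; binarised internally

# Saves a 1x4 grid (Before | After | GT | Pred) and closes the figure.
plot_prediction(img_a, img_b, mask_true, mask_pred, filename="outputs/sample_0.png")

# uint8 [H, W, 3] array with predicted change pixels tinted red.
overlay = overlay_changes(img_b, (mask_pred > 0.5).float(), alpha=0.4)
print(overlay.shape, overlay.dtype)         # (256, 256, 3) uint8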