# modelling/train.py · earthroverprogram/soilformer at main

# File size: 18,312 Bytes

# 6fb6c07

import argparse
import json
import os
import random
from pathlib import Path
from typing import Dict, Optional

import numpy as np
import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR, StepLR, LinearLR, SequentialLR
from tqdm import tqdm

from loader import SoilFormerDataset, build_train_eval_dataloaders
from soilformer import SoilFormer, loss_function
from utils import get_dtype, load_json, save_json

try:
    import wandb
except ImportError:  # pragma: no cover
    wandb = None


def set_seed(seed: int, deterministic: bool = True) -> None:
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value passed to every seeding call.
        deterministic: When true, also set ``cudnn.deterministic = True`` and
            ``cudnn.benchmark = False`` on ``torch.backends``.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # The explicit CUDA seeding calls are only issued when CUDA is usable.
    if torch.cuda.is_available():
        for cuda_seed_fn in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
            cuda_seed_fn(seed)

    if not deterministic:
        return

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


def resolve_device(device_str: str) -> torch.device:
    """Translate a config device name into a ``torch.device``.

    The name is matched case-insensitively against ``"cuda"``, ``"mps"`` and
    ``"cpu"``.

    Args:
        device_str: Device name taken from the configuration.

    Returns:
        The matching ``torch.device``.

    Raises:
        RuntimeError: If ``cuda`` or ``mps`` is requested but not available.
        ValueError: If the name is not one of the supported devices.
    """
    device_str = device_str.lower()

    # Availability probes are wrapped in lambdas so that a backend is only
    # touched when it is the one actually requested.
    accelerators = {
        "cuda": (
            lambda: torch.cuda.is_available(),
            "config requests cuda, but CUDA is not available",
        ),
        "mps": (
            lambda: torch.backends.mps.is_available(),
            "config requests mps, but MPS is not available",
        ),
    }

    if device_str in accelerators:
        is_available, unavailable_message = accelerators[device_str]
        if not is_available():
            raise RuntimeError(unavailable_message)
        return torch.device(device_str)

    if device_str != "cpu":
        raise ValueError(f"Unsupported device: {device_str}")

    return torch.device("cpu")


def _move_value_to_device(value, device: torch.device, float_dtype: torch.dtype):
    """Move one batch entry to ``device``, recursing into nested dicts."""
    if isinstance(value, torch.Tensor):
        if value.dtype.is_floating_point:
            # Only floating-point tensors are cast; other dtypes are kept.
            return value.to(device=device, dtype=float_dtype, non_blocking=True)
        return value.to(device=device, non_blocking=True)

    if isinstance(value, dict):
        return {
            sub_key: _move_value_to_device(sub_value, device, float_dtype)
            for sub_key, sub_value in value.items()
        }

    # Anything else (strings, lists, ints, ...) is passed through untouched.
    return value


def move_batch_to_device(batch: Dict, device: torch.device, float_dtype: torch.dtype) -> Dict:
    """Return a copy of ``batch`` with every tensor moved to ``device``.

    Floating-point tensors are additionally cast to ``float_dtype``; tensors of
    any other dtype keep their dtype. Dict values are processed recursively at
    any nesting depth (the previous implementation handled only one level and
    left deeper tensors where they were). Non-tensor, non-dict values are
    copied by reference.

    Args:
        batch: Mapping of names to tensors, nested dicts, or arbitrary values.
        device: Target device for all tensors.
        float_dtype: Target dtype for floating-point tensors.

    Returns:
        A new dict with the same keys; the input dicts are not mutated.
    """
    return {
        key: _move_value_to_device(value, device, float_dtype)
        for key, value in batch.items()
    }


def build_scheduler(
        optimizer: torch.optim.Optimizer,
        scheduler_cfg: Dict,
):
    """Create the learning-rate scheduler described by ``scheduler_cfg``.

    Recognised ``type`` values (case-insensitive) are ``"none"`` (the default),
    ``"cosine"`` and ``"step"``. When ``warmup_epochs`` is positive, the main
    scheduler is preceded by a ``LinearLR`` warmup and both are chained with
    ``SequentialLR``.

    Args:
        optimizer: Optimizer whose learning rate is scheduled.
        scheduler_cfg: Scheduler settings. ``cosine`` requires
            ``total_epochs``; ``step`` requires ``step_size``.

    Returns:
        ``None`` for type ``"none"``, otherwise the scheduler instance.

    Raises:
        ValueError: For an unknown type, or when warmup is enabled and the
            resulting cosine ``T_max`` is not positive.
    """
    scheduler_type = str(scheduler_cfg.get("type", "none")).lower()
    if scheduler_type == "none":
        return None

    warmup_epochs = int(scheduler_cfg.get("warmup_epochs", 0))
    start_factor = float(scheduler_cfg.get("warmup_start_factor", 0.1))
    uses_warmup = warmup_epochs > 0

    if scheduler_type == "cosine":
        total_epochs = int(scheduler_cfg["total_epochs"])
        min_lr = float(scheduler_cfg.get("eta_min", 1e-6))

        # Without an explicit t_max the cosine phase spans whatever is left
        # after the warmup epochs.
        fallback_t_max = total_epochs - warmup_epochs if uses_warmup else total_epochs
        t_max = int(scheduler_cfg.get("t_max", fallback_t_max))
        if uses_warmup and t_max <= 0:
            raise ValueError(
                f"Invalid cosine scheduler config: total_epochs={total_epochs}, "
                f"warmup_epochs={warmup_epochs}, resulting T_max={t_max}"
            )

        after_warmup = CosineAnnealingLR(optimizer, T_max=t_max, eta_min=min_lr)
    elif scheduler_type == "step":
        after_warmup = StepLR(
            optimizer,
            step_size=int(scheduler_cfg["step_size"]),
            gamma=float(scheduler_cfg.get("gamma", 0.1)),
        )
    else:
        raise ValueError(f"Unsupported scheduler type: {scheduler_type}")

    if not uses_warmup:
        return after_warmup

    # The warmup scheduler is created after the main one, as before.
    ramp = LinearLR(optimizer, start_factor=start_factor, total_iters=warmup_epochs)
    return SequentialLR(
        optimizer,
        schedulers=[ramp, after_warmup],
        milestones=[warmup_epochs],
    )


def get_checkpoint_model_state(model: SoilFormer) -> Dict[str, torch.Tensor]:
    """Return the model state that should be written into a checkpoint.

    If the model exposes a ``_checkpoint_state_dict`` attribute it is called
    and its result is used; otherwise the regular ``state_dict()`` is returned.
    """
    try:
        checkpoint_state_fn = model._checkpoint_state_dict  # noqa
    except AttributeError:
        return model.state_dict()
    return checkpoint_state_fn()


def load_checkpoint_model_state(model: SoilFormer, state_dict: Dict[str, torch.Tensor]) -> None:
    """Restore model weights from an in-memory state dict.

    Models without a ``load_weights`` method are restored directly through
    ``load_state_dict(strict=True)``. Models that do have ``load_weights`` are
    restored through it: that method is called with a file path, so the state
    is first written to a temporary ``.pt`` file (wrapped under the
    ``"model_state_dict"`` key) which is removed again afterwards.

    Args:
        model: Model to restore.
        state_dict: Weights as stored in a checkpoint.
    """
    if not hasattr(model, "load_weights"):
        model.load_state_dict(state_dict, strict=True)
        return

    import tempfile

    scratch_path = None
    try:
        # Only the file name is needed; the handle is closed before saving.
        with tempfile.NamedTemporaryFile(suffix=".pt", delete=False) as handle:
            scratch_path = handle.name
        torch.save({"model_state_dict": state_dict}, scratch_path)
        model.load_weights(scratch_path, map_location="cpu", strict=True)
    finally:
        # Clean up the scratch file whether or not loading succeeded.
        if scratch_path is not None and os.path.exists(scratch_path):
            os.remove(scratch_path)


def save_checkpoint(
        checkpoint_path: Path,
        model: SoilFormer,
        optimizer: torch.optim.Optimizer,
        scheduler,
        epoch: int,
        global_step: int,
        config_train: Dict,
        config_model: Dict,
        config_data: Dict,
) -> None:
    """Write a full training checkpoint to ``checkpoint_path``.

    The checkpoint holds the epoch, global step, model / optimizer / scheduler
    state and the three configuration dicts. The file is first written to a
    sibling ``<name>.tmp`` path and then moved into place with ``os.replace``,
    so an interrupted save never leaves a truncated file under the final name.

    Args:
        checkpoint_path: Destination file; parent directories are created.
        model: Model whose state is stored.
        optimizer: Optimizer whose state is stored.
        scheduler: LR scheduler, or ``None`` (stored as ``None``).
        epoch: Epoch number recorded in the checkpoint.
        global_step: Step counter recorded in the checkpoint.
        config_train: Training configuration stored alongside the weights.
        config_model: Model configuration stored alongside the weights.
        config_data: Data configuration stored alongside the weights.
    """
    checkpoint = {
        "epoch": epoch,
        "global_step": global_step,
        "model_state_dict": get_checkpoint_model_state(model),
        "optimizer_state_dict": optimizer.state_dict(),
        "scheduler_state_dict": None if scheduler is None else scheduler.state_dict(),
        "config_train": config_train,
        "config_model": config_model,
        "config_data": config_data,
    }
    checkpoint_path.parent.mkdir(parents=True, exist_ok=True)

    # The ".tmp" suffix keeps the partial file out of any "*.pt" glob.
    tmp_path = checkpoint_path.with_name(checkpoint_path.name + ".tmp")
    try:
        torch.save(checkpoint, tmp_path)
        os.replace(tmp_path, checkpoint_path)
    finally:
        # No-op after a successful replace; removes the partial file on error.
        tmp_path.unlink(missing_ok=True)


def rotate_checkpoints(checkpoint_dir: Path, max_saved_checkpoints: int) -> None:
    """Delete the oldest epoch checkpoints so at most N remain.

    Checkpoints are files named ``checkpoint_epoch_<number>.pt`` directly
    inside ``checkpoint_dir``. They are ordered by the numeric epoch in the
    file name, not lexicographically, so ``checkpoint_epoch_10.pt`` counts as
    newer than ``checkpoint_epoch_9.pt``. Files that match the glob but whose
    epoch part is not a plain decimal number are ignored and never deleted.

    Args:
        checkpoint_dir: Directory containing the checkpoint files.
        max_saved_checkpoints: Number of most recent checkpoints to keep.
            ``None`` or a non-positive value disables rotation.
    """
    if max_saved_checkpoints is None or max_saved_checkpoints <= 0:
        return

    prefix = "checkpoint_epoch_"
    numbered = []
    for path in checkpoint_dir.glob(f"{prefix}*.pt"):
        epoch_text = path.stem[len(prefix):]
        if epoch_text.isdecimal():
            numbered.append((int(epoch_text), path.name, path))

    # Oldest first; the file name breaks ties such as "05" versus "5".
    numbered.sort(key=lambda entry: entry[:2])

    surplus = len(numbered) - max_saved_checkpoints
    for _, _, stale_path in numbered[:max(surplus, 0)]:
        stale_path.unlink(missing_ok=True)

def compute_loss_from_batch(
        model: SoilFormer,
        batch: Dict,
        device: torch.device,
        dtype: torch.dtype,
        cat_s_bound: Optional[float] = None,
        num_s_bound: Optional[float] = None,
):
    """Run the model on a masked batch and compute the loss.

    The batch is moved to ``device`` (floating tensors cast to ``dtype``), the
    ``masked_*`` entries and image inputs are fed to the model, and the model
    outputs are scored against the ``original_*`` targets under the loss masks.

    Args:
        model: Model to evaluate; called with keyword arguments.
        batch: Masked batch holding the keys read below.
        device: Device the batch is moved to.
        dtype: Floating-point dtype the batch is cast to.
        cat_s_bound: Forwarded unchanged to ``loss_function``.
        num_s_bound: Forwarded unchanged to ``loss_function``.

    Returns:
        ``(total_loss, stats)`` as produced by ``loss_function``.
    """
    on_device = move_batch_to_device(batch, device=device, float_dtype=dtype)

    model_inputs = {
        "cat_local_ids": on_device["masked_cat_local_ids"],
        "numeric_values_by_nin": on_device["masked_numeric_values_by_nin"],
        "cat_valid_positions": on_device["masked_cat_valid_positions"],
        "numeric_valid_positions_by_nin": on_device["masked_numeric_valid_positions_by_nin"],
        "pixel_values": on_device["pixel_values"],
        "vision_valid_positions": on_device["vision_valid_positions"],
    }
    # The model returns six values; the last one is not needed for the loss.
    cat_logits, cat_s_out, class_mask, num_values, num_s_out, _ = model(**model_inputs)

    loss_inputs = {
        "x_cat": cat_logits,
        "s_cat": cat_s_out,
        "y_cat": on_device["original_cat_local_ids"],
        "loss_mask_cat": on_device["cat_loss_mask"],
        "valid_class_mask": class_mask,
        "x_num": num_values,
        "s_num": num_s_out,
        "y_num": on_device["original_numeric_values_by_nin"],
        "loss_mask_num": on_device["numeric_loss_mask_by_nin"],
        "reduction": "mean",
        "cat_s_bound": cat_s_bound,
        "num_s_bound": num_s_bound,
    }
    loss_value, loss_stats = loss_function(**loss_inputs)

    return loss_value, loss_stats


@torch.no_grad()
def evaluate(
        model: SoilFormer,
        dataset: SoilFormerDataset,
        eval_loader,
        device: torch.device,
        dtype: torch.dtype,
        cat_mask_ratio: float,
        num_mask_ratio: float,
        active_mask_seed: int,
        show_tqdm: bool,
        epoch: int,
        cat_s_bound: Optional[float] = None,
        num_s_bound: Optional[float] = None,
):
    """Average the loss statistics over the whole evaluation loader.

    The model is put into eval mode and left there. Each batch is masked with
    seed ``active_mask_seed + batch index``, which does not depend on
    ``epoch``; ``epoch`` is only used for the progress-bar label.

    Args:
        model: Model to evaluate.
        dataset: Dataset providing ``perform_active_mask``.
        eval_loader: Iterable of raw evaluation batches.
        device: Device used for the forward pass.
        dtype: Floating-point dtype used for the forward pass.
        cat_mask_ratio: Passed as ``cat_ratio`` to ``perform_active_mask``.
        num_mask_ratio: Passed as ``num_ratio`` to ``perform_active_mask``.
        active_mask_seed: Base seed for the per-batch mask seeds.
        show_tqdm: Whether to wrap the loader in a tqdm progress bar.
        epoch: Current epoch number (display only).
        cat_s_bound: Forwarded to the loss computation.
        num_s_bound: Forwarded to the loss computation.

    Returns:
        Dict mapping ``eval/<metric>`` to the per-batch mean of that metric.

    Raises:
        RuntimeError: If the loader yields no batches.
    """
    model.eval()

    metric_names = ("total", "cat_loss", "num_loss", "cat_base", "num_base", "cat_acc")
    running = dict.fromkeys(metric_names, 0.0)
    batches_seen = 0

    batches = tqdm(eval_loader, desc=f"Eval {epoch}", leave=False) if show_tqdm else eval_loader

    for offset, raw_batch in enumerate(batches):
        masked = dataset.perform_active_mask(
            raw_batch,
            cat_ratio=cat_mask_ratio,
            num_ratio=num_mask_ratio,
            seed=int(active_mask_seed + offset),
        )

        # Only the statistics are needed here; the loss tensor is discarded.
        _, batch_stats = compute_loss_from_batch(
            model=model,
            batch=masked,
            device=device,
            dtype=dtype,
            cat_s_bound=cat_s_bound,
            num_s_bound=num_s_bound,
        )

        batches_seen += 1
        for name in running:
            running[name] += float(batch_stats[name].item())

    if batches_seen == 0:
        raise RuntimeError("Eval dataloader is empty")

    return {f"eval/{k}": v / batches_seen for k, v in running.items()}


def maybe_init_wandb(config_train: Dict):
    """Start a Weights & Biases run when the training config enables it.

    Settings are read from ``config_train["logging"]["wandb"]``.

    Args:
        config_train: Training configuration; also passed to wandb as the run
            config.

    Returns:
        The object returned by ``wandb.init``, or ``None`` when wandb logging
        is disabled.

    Raises:
        ImportError: If wandb is enabled but the package could not be imported.
    """
    settings = config_train["logging"]["wandb"]

    enabled = bool(settings.get("enabled", False))
    if not enabled:
        return None

    if wandb is None:
        raise ImportError("wandb is enabled in config but package is not installed")

    init_kwargs = {
        "project": settings["project"],
        "entity": settings.get("entity"),
        "name": settings.get("run_name"),
        "dir": settings.get("dir"),
        "config": config_train,
        "mode": settings.get("mode", "online"),
    }
    return wandb.init(**init_kwargs)


def print_parameter_stats(model):
    """Print total, trainable and frozen parameter counts of ``model``.

    A parameter counts as trainable when its ``requires_grad`` is truthy;
    frozen is the remainder.
    """
    sizes = [(param.numel(), param.requires_grad) for param in model.parameters()]
    total = sum(count for count, _ in sizes)
    trainable = sum(count for count, needs_grad in sizes if needs_grad)
    frozen = total - trainable

    print("\nParameter statistics:")
    print(f"Total parameters: {total:,}")
    print(f"Trainable parameters: {trainable:,}")
    print(f"Frozen parameters: {frozen:,}\n")


def main():
    """Train SoilFormer end to end from a JSON training config.

    Steps, in order: parse ``--config``; load the training, data and model
    configs; seed the RNGs; build the dataset, dataloaders, model, optimizer
    and scheduler; optionally resume from a checkpoint; then run the
    train / eval / log / checkpoint loop for the configured number of epochs.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, default="config/config_train.json")
    args = parser.parse_args()

    # The training config points at the data and model configs.
    config_train = load_json(args.config)
    config_paths = config_train["paths"]
    config_data = load_json(config_paths["config_data_path"])
    config_model = load_json(config_paths["config_model_path"])

    seed_cfg = config_train["seed"]
    runtime_cfg = config_train["runtime"]
    optim_cfg = config_train["optimization"]
    checkpoint_cfg = config_train["checkpoint"]
    logging_cfg = config_train["logging"]
    loss_cfg = config_train["loss"]

    # Seed before any dataset / model construction so those steps see seeded RNGs.
    set_seed(int(seed_cfg["seed"]), deterministic=bool(seed_cfg.get("deterministic", True)))

    device = resolve_device(runtime_cfg["device"])
    dtype = get_dtype(config_model.get("dtype", "bfloat16"))

    output_dir = Path(config_paths["output_dir"])
    checkpoint_dir = output_dir / "checkpoints"
    output_dir.mkdir(parents=True, exist_ok=True)
    checkpoint_dir.mkdir(parents=True, exist_ok=True)

    # Keep a copy of the exact configs used for this run next to its outputs.
    save_json(config_train, str(output_dir / "config_train.snapshot.json"))
    save_json(config_data, str(output_dir / "config_data.snapshot.json"))
    save_json(config_model, str(output_dir / "config_model.snapshot.json"))

    dataset = SoilFormerDataset(
        csv_path=config_data["data_csv_path"],
        photo_map_path=config_data["photo_map_path"],
        cat_vocab_path=config_data["cat_vocab_path"],
        numeric_vocab_path=config_data["numeric_vocab_path"],
        numeric_stats_path=config_data["numeric_stats_path"],
        photo_root=config_data["photo_root"],
        image_size=int(config_data["image_size"]),
    )

    train_loader, eval_loader, train_generator = build_train_eval_dataloaders(
        dataset=dataset,
        train_ratio=float(config_data["train_ratio"]),
        seed=int(config_data["train_eval_split_seed"]),
        batch_size=int(config_data["batch_size"]),
    )
    print("\nSample statistics:")
    print("Train samples:", len(train_loader.dataset))
    print("Eval samples:", len(eval_loader.dataset))
    # Re-seed the returned generator with the training seed (the split itself
    # used train_eval_split_seed). NOTE(review): presumably this generator
    # drives train-loader shuffling - confirm in loader.py.
    train_generator.manual_seed(int(seed_cfg["seed"]))

    model = SoilFormer(config=config_model, device=str(device))

    # Either restore weights from a checkpoint or initialise them from scratch.
    resume_path = checkpoint_cfg.get("resume_checkpoint_path")
    if resume_path:
        checkpoint = torch.load(resume_path, map_location="cpu")
        load_checkpoint_model_state(model, checkpoint["model_state_dict"])
    else:
        model.init_weights(std=float(runtime_cfg.get("init_weight_std", 0.02)))
        checkpoint = None

    print_parameter_stats(model)

    # Only parameters with requires_grad=True are handed to the optimizer.
    optimizer = AdamW(
        [p for p in model.parameters() if p.requires_grad],
        lr=float(optim_cfg["lr"]),
        betas=(float(optim_cfg["beta1"]), float(optim_cfg["beta2"])),
        eps=float(optim_cfg["eps"]),
        weight_decay=float(optim_cfg["weight_decay"]),
    )

    scheduler = build_scheduler(
        optimizer=optimizer,
        scheduler_cfg=optim_cfg.get("scheduler", {"type": "none"})
    )

    start_epoch = 1
    global_step = 0

    # When resuming, restore optimizer / scheduler state and continue with the
    # epoch after the one stored in the checkpoint.
    if checkpoint is not None:
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        if scheduler is not None and checkpoint.get("scheduler_state_dict") is not None:
            scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
        start_epoch = int(checkpoint["epoch"]) + 1
        global_step = int(checkpoint.get("global_step", 0))

    wandb_run = maybe_init_wandb(config_train)

    num_epochs = int(runtime_cfg["num_epochs"])
    show_tqdm = bool(logging_cfg.get("tqdm", True))
    cat_mask_ratio = float(config_data["cat_mask_ratio"])
    num_mask_ratio = float(config_data["num_mask_ratio"])
    active_mask_seed = int(config_data["active_mask_seed"])
    max_grad_norm = optim_cfg.get("max_grad_norm")
    epochs_per_save = int(checkpoint_cfg["epochs_per_save"])
    max_saved_checkpoints = int(checkpoint_cfg["max_saved_checkpoints"])

    for epoch in range(start_epoch, num_epochs + 1):
        # evaluate() switches the model to eval mode, so train mode is
        # re-enabled at the start of every epoch.
        model.train()

        # Running sums of the per-batch statistics for this epoch.
        epoch_totals = {
            "total": 0.0,
            "cat_loss": 0.0,
            "num_loss": 0.0,
            "cat_base": 0.0,
            "num_base": 0.0,
            "cat_acc": 0.0,
        }
        num_batches = 0

        iterator = train_loader
        if show_tqdm:
            iterator = tqdm(train_loader, desc=f"Train {epoch}", leave=True)

        for batch_idx, raw_batch in enumerate(iterator):
            global_step += 1
            # Mask seed varies with both epoch and batch index; the 1_000_000
            # stride keeps epochs from overlapping as long as an epoch has
            # fewer than a million batches.
            mask_seed = int(active_mask_seed + epoch * 1_000_000 + batch_idx)
            masked_batch = dataset.perform_active_mask(
                raw_batch,
                cat_ratio=cat_mask_ratio,
                num_ratio=num_mask_ratio,
                seed=mask_seed,
            )

            optimizer.zero_grad(set_to_none=True)

            total_loss, stats = compute_loss_from_batch(
                model=model,
                batch=masked_batch,
                device=device,
                dtype=dtype,
                cat_s_bound=loss_cfg.get("cat_s_bound", None),
                num_s_bound=loss_cfg.get("num_s_bound", None),
            )

            total_loss.backward()
            # Gradient clipping is optional; skipped when max_grad_norm is unset.
            if max_grad_norm is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), float(max_grad_norm))
            optimizer.step()

            num_batches += 1
            for key in epoch_totals:
                epoch_totals[key] += float(stats[key].item())

            # Learning rate of the first parameter group is what gets logged.
            current_lr = float(optimizer.param_groups[0]["lr"])
            train_step_log = {
                "train/step_total": float(stats["total"].item()),
                "train/step_cat_loss": float(stats["cat_loss"].item()),
                "train/step_num_loss": float(stats["num_loss"].item()),
                "train/step_cat_acc": float(stats["cat_acc"].item()),
                "train/lr": current_lr,
                "epoch": epoch,
                "global_step": global_step,
            }

            if wandb_run is not None:
                wandb.log(train_step_log, step=global_step)

            if show_tqdm:
                iterator.set_postfix(
                    loss=f"{train_step_log['train/step_total']:.4f}",
                    lr=f"{current_lr:.3e}",
                )

        if num_batches == 0:
            raise RuntimeError("Train dataloader is empty")

        # Epoch-level training metrics are per-batch means.
        train_epoch_log = {f"train/{k}": v / num_batches for k, v in epoch_totals.items()}
        train_epoch_log["train/lr_epoch_end"] = float(optimizer.param_groups[0]["lr"])
        train_epoch_log["epoch"] = epoch
        train_epoch_log["global_step"] = global_step

        eval_log = evaluate(
            model=model,
            dataset=dataset,
            eval_loader=eval_loader,
            device=device,
            dtype=dtype,
            cat_mask_ratio=cat_mask_ratio,
            num_mask_ratio=num_mask_ratio,
            active_mask_seed=active_mask_seed,
            show_tqdm=show_tqdm,
            epoch=epoch,
            cat_s_bound=loss_cfg.get("cat_s_bound", None),
            num_s_bound=loss_cfg.get("num_s_bound", None),
        )
        eval_log["epoch"] = epoch
        eval_log["global_step"] = global_step

        merged_log = {}
        merged_log.update(train_epoch_log)
        merged_log.update(eval_log)

        # One JSON line per epoch on stdout.
        print(json.dumps(merged_log, ensure_ascii=False))

        if wandb_run is not None:
            wandb.log(merged_log, step=global_step)

        # The scheduler advances once per epoch, after logging, so the logged
        # end-of-epoch LR is the one that was used during this epoch.
        if scheduler is not None:
            scheduler.step()

        # Periodic checkpointing; epochs_per_save <= 0 disables it.
        if epochs_per_save > 0 and epoch % epochs_per_save == 0:
            checkpoint_path = checkpoint_dir / f"checkpoint_epoch_{epoch}.pt"
            save_checkpoint(
                checkpoint_path=checkpoint_path,
                model=model,
                optimizer=optimizer,
                scheduler=scheduler,
                epoch=epoch,
                global_step=global_step,
                config_train=config_train,
                config_model=config_model,
                config_data=config_data,
            )
            rotate_checkpoints(checkpoint_dir, max_saved_checkpoints)

    if wandb_run is not None:
        wandb.finish()


# Run training only when executed as a script, not when imported.
if __name__ == "__main__":
    main()