asdf98
/

LiquidGen

Model card Files Files and versions

xet

Community

asdf98 committed 8 days ago

Commit

57090a0

verified ·

1 Parent(s): 193fbf7

Add gradient checkpointing + auto batch size to prevent OOM on T4

Browse files

Files changed (1) hide show

train.py +106 -187

train.py CHANGED Viewed

@@ -4,13 +4,10 @@ LiquidGen Training Pipeline v2
 Optimized for Colab free tier:
 - Latent pre-caching: encode images with VAE once, save to disk, train on pure tensors
 - No VAE needed during training loop -> saves ~1GB VRAM + faster iterations
 - All datasets are pure parquet — no legacy loading scripts
 - Uses madebyollin/sdxl-vae-fp16-fix (fully open, no login, fp16 stable)
-Flow Matching training objective (velocity prediction):
-- Forward: x_t = (1 - t) * x_0 + t * eps
-- Target: v = eps - x_0
-- Loss: MSE(model(x_t, t), v)
 """
 import torch
@@ -26,10 +23,6 @@ from typing import Optional
 from dataclasses import dataclass, asdict
-# =============================================================================
-# Dataset Presets — ALL pure parquet, no loading scripts, no auth
-# =============================================================================
 DATASET_PRESETS = {
     "cartoon": {
         "name": "Norod78/cartoon-blip-captions",
@@ -52,7 +45,7 @@ DATASET_PRESETS = {
         "config": "",
         "image_column": "image",
         "label_column": "style",
-        "num_classes": 0,  # string labels, mapped to ints automatically
         "description": "~105K paintings with style labels, 1.6GB (use max_images to limit)",
     },
     "art_painting": {
@@ -66,26 +59,47 @@ DATASET_PRESETS = {
 }
 @dataclass
 class TrainConfig:
-    """Training configuration optimized for Colab free tier (T4 16GB)."""
-    # Model
-    model_size: str = "small"  # small (~55M), base (~140M), large (~280M)
-    num_classes: int = 0       # 0 = unconditional
     class_drop_prob: float = 0.1
-    # Data
-    dataset_preset: str = "cartoon"    # key from DATASET_PRESETS
-    image_size: int = 256              # 256 or 512
-    max_images: int = 0                # 0 = use all, >0 = limit
-    # VAE — fully open, no login needed
     vae_id: str = "madebyollin/sdxl-vae-fp16-fix"
     vae_scaling_factor: float = 0.13025
     latent_channels: int = 4
-    # Training
-    batch_size: int = 32
     gradient_accumulation_steps: int = 1
     learning_rate: float = 1e-4
     weight_decay: float = 0.01
@@ -94,28 +108,19 @@ class TrainConfig:
     warmup_steps: int = 500
     ema_decay: float = 0.9999
     mixed_precision: bool = True
-    # Flow matching
     min_timestep: float = 0.001
     max_timestep: float = 0.999
-    # Saving
     output_dir: str = "./outputs"
     save_every_n_steps: int = 2000
     sample_every_n_steps: int = 500
     log_every_n_steps: int = 25
-    # Sampling
     num_sample_steps: int = 50
     cfg_scale: float = 2.0
     num_samples: int = 4
-    # System
     seed: int = 42
     num_workers: int = 2
     compile_model: bool = False
-    # Hub
     push_to_hub: bool = False
     hub_model_id: str = ""
@@ -136,38 +141,25 @@ def get_model_config(size, num_classes=0, class_drop_prob=0.1):
     return cfg
-# =============================================================================
-# Latent Pre-Caching
-# =============================================================================
 class CachedLatentDataset(Dataset):
-    """Training dataset from pre-encoded VAE latents on disk."""
     def __init__(self, cache_path):
         data = torch.load(cache_path, map_location="cpu", weights_only=True)
         self.latents = data["latents"]
         self.labels = data.get("labels", None)
         print(f"Loaded {len(self.latents)} cached latents from {cache_path}")
-        print(f"  Shape: {self.latents.shape}, dtype: {self.latents.dtype}")
         if self.labels is not None and (self.labels >= 0).any():
-            print(f"  Labels: unique={self.labels[self.labels >= 0].unique().shape[0]}")
-    def __len__(self):
-        return len(self.latents)
     def __getitem__(self, idx):
-        lat = self.latents[idx]
-        label = self.labels[idx] if self.labels is not None else -1
-        return lat, label
 def precache_latents(config, cache_path=None):
-    """
-    Encode all images to VAE latents once, save to disk.
-    """
     if cache_path is None:
         cache_path = os.path.join(config.output_dir, "cached_latents.pt")
     if os.path.exists(cache_path):
         print(f"Cache exists: {cache_path}")
         data = torch.load(cache_path, map_location="cpu", weights_only=True)
@@ -177,167 +169,115 @@ def precache_latents(config, cache_path=None):
     os.makedirs(os.path.dirname(cache_path) if os.path.dirname(cache_path) else ".", exist_ok=True)
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    # Load VAE
     print(f"Loading VAE: {config.vae_id}...")
     from diffusers import AutoencoderKL
-    vae = AutoencoderKL.from_pretrained(
-        config.vae_id, torch_dtype=torch.float16
-    ).to(device).eval()
-    for p in vae.parameters():
-        p.requires_grad_(False)
     print(f"  VAE: {sum(p.numel() for p in vae.parameters())/1e6:.0f}M params")
-    # Load dataset
     preset = DATASET_PRESETS[config.dataset_preset]
     print(f"Loading: {preset['name']} ({preset['description']})")
     from datasets import load_dataset
     from torchvision import transforms
     ds_kwargs = {"split": "train"}
-    if preset["config"]:
-        ds_kwargs["name"] = preset["config"]
     dataset = load_dataset(preset["name"], **ds_kwargs)
     transform = transforms.Compose([
         transforms.Resize(config.image_size, interpolation=transforms.InterpolationMode.LANCZOS),
-        transforms.CenterCrop(config.image_size),
-        transforms.ToTensor(),
     ])
-    # For Artificio/WikiArt: style is a string, map to int
-    img_col = preset["image_column"]
-    lbl_col = preset["label_column"]
     style_to_id = {}
-    all_latents = []
-    all_labels = []
-    batch_pixels = []
-    batch_labels = []
-    encode_bs = 16
-    count = 0
-    max_imgs = config.max_images if config.max_images > 0 else float("inf")
-    print(f"Encoding to VAE latents...")
     t0 = time.time()
     for item in dataset:
-        if count >= max_imgs:
-            break
         img = item[img_col]
-        if img.mode != "RGB":
-            img = img.convert("RGB")
-        batch_pixels.append(transform(img))
-        # Handle labels: int or string
         if lbl_col and lbl_col in item:
-            raw_label = item[lbl_col]
-            if isinstance(raw_label, str):
-                if raw_label not in style_to_id:
-                    style_to_id[raw_label] = len(style_to_id)
-                batch_labels.append(style_to_id[raw_label])
-            elif isinstance(raw_label, int):
-                batch_labels.append(raw_label)
-            else:
-                batch_labels.append(-1)
-        else:
-            batch_labels.append(-1)
         count += 1
-        if len(batch_pixels) >= encode_bs:
             with torch.no_grad():
-                px = torch.stack(batch_pixels).to(device, dtype=torch.float16) * 2 - 1
-                lat = vae.encode(px).latent_dist.sample()
-                lat = lat * config.vae_scaling_factor
                 all_latents.append(lat.cpu().float())
-            all_labels.extend(batch_labels)
-            batch_pixels, batch_labels = [], []
-            if count % 500 == 0:
-                print(f"  {count} images ({time.time()-t0:.0f}s)")
-    if batch_pixels:
         with torch.no_grad():
-            px = torch.stack(batch_pixels).to(device, dtype=torch.float16) * 2 - 1
-            lat = vae.encode(px).latent_dist.sample()
-            lat = lat * config.vae_scaling_factor
             all_latents.append(lat.cpu().float())
-        all_labels.extend(batch_labels)
     all_latents = torch.cat(all_latents, dim=0)
     all_labels = torch.tensor(all_labels, dtype=torch.long)
     save_data = {"latents": all_latents, "labels": all_labels}
     if style_to_id:
         save_data["style_to_id"] = style_to_id
-        print(f"  Mapped {len(style_to_id)} style labels to class IDs")
     torch.save(save_data, cache_path)
-    elapsed = time.time() - t0
     mb = os.path.getsize(cache_path) / 1024**2
-    print(f"\nCached {count} latents -> {cache_path}")
-    print(f"  Shape: {all_latents.shape}, {mb:.1f}MB, {elapsed:.0f}s")
     del vae
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
     print("  VAE unloaded\n")
     return cache_path
-# =============================================================================
-# EMA, FlowMatching, Scheduler
-# =============================================================================
 class EMAModel:
     def __init__(self, model, decay=0.9999):
         self.decay = decay
         self.shadow = {n: p.clone().detach() for n, p in model.named_parameters() if p.requires_grad}
     @torch.no_grad()
     def update(self, model):
         for n, p in model.named_parameters():
             if p.requires_grad and n in self.shadow:
                 self.shadow[n].mul_(self.decay).add_(p.data, alpha=1 - self.decay)
     def apply(self, model):
         self.backup = {n: p.data.clone() for n, p in model.named_parameters() if p.requires_grad}
         for n, p in model.named_parameters():
-            if p.requires_grad and n in self.shadow:
-                p.data.copy_(self.shadow[n])
     def restore(self, model):
         for n, p in model.named_parameters():
-            if p.requires_grad and n in self.backup:
-                p.data.copy_(self.backup[n])
         self.backup = {}
 class FlowMatchingScheduler:
     def __init__(self, min_t=0.001, max_t=0.999):
         self.min_t, self.max_t = min_t, max_t
     def sample_timesteps(self, bs, dev):
         return torch.rand(bs, device=dev) * (self.max_t - self.min_t) + self.min_t
     def add_noise(self, x0, noise, t):
         t = t.view(-1, 1, 1, 1); return (1 - t) * x0 + t * noise
     def get_velocity_target(self, x0, noise):
         return noise - x0
     @torch.no_grad()
     def sample(self, model, shape, dev, num_steps=50, labels=None, cfg=1.0):
-        model.eval(); x = torch.randn(shape, device=dev)
-        dt = 1.0 / num_steps
         for tv in torch.linspace(1.0, dt, num_steps, device=dev):
             t = torch.full((shape[0],), tv.item(), device=dev)
             with torch.amp.autocast("cuda"):
                 if cfg > 1.0 and labels is not None:
                     vc = model(x, t, labels); vu = model(x, t, torch.zeros_like(labels))
                     v = vu + cfg * (vc - vu)
-                else:
-                    v = model(x, t, labels)
             x = x - dt * v.float()
         return x
@@ -349,29 +289,30 @@ def cosine_schedule(opt, warmup, total):
     return torch.optim.lr_scheduler.LambdaLR(opt, lr)
-# =============================================================================
-# Main Training Loop
-# =============================================================================
 def train(config):
     from model import LiquidGen
     torch.manual_seed(config.seed)
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    print(f"Device: {device}")
     if torch.cuda.is_available():
-        print(f"GPU: {torch.cuda.get_device_name(0)} "
-              f"({torch.cuda.get_device_properties(0).total_mem/1024**3:.1f} GB)")
     os.makedirs(config.output_dir, exist_ok=True)
     os.makedirs(f"{config.output_dir}/samples", exist_ok=True)
     os.makedirs(f"{config.output_dir}/checkpoints", exist_ok=True)
     with open(f"{config.output_dir}/config.json", "w") as f:
         json.dump(asdict(config), f, indent=2)
     cache_path = precache_latents(config)
     train_ds = CachedLatentDataset(cache_path)
     train_dl = DataLoader(train_ds, batch_size=config.batch_size, shuffle=True,
                           num_workers=config.num_workers, pin_memory=True, drop_last=True)
@@ -379,6 +320,12 @@ def train(config):
     mcfg = get_model_config(config.model_size, config.num_classes, config.class_drop_prob)
     mcfg["in_channels"] = config.latent_channels
     model = LiquidGen(**mcfg).to(device)
     print(f"LiquidGen-{config.model_size}: {model.count_params()/1e6:.1f}M params")
     if config.compile_model and hasattr(torch, "compile"):
@@ -393,11 +340,10 @@ def train(config):
     fm = FlowMatchingScheduler(config.min_timestep, config.max_timestep)
     lat_size = config.image_size // 8
-    print(f"\nSteps: {total_steps}, Batch: {config.batch_size}x{config.gradient_accumulation_steps}")
     print(f"Latent: [{config.batch_size}, {config.latent_channels}, {lat_size}, {lat_size}]")
     if torch.cuda.is_available():
-        print(f"VRAM: {torch.cuda.memory_allocated()/1024**3:.1f} / "
-              f"{torch.cuda.get_device_properties(0).total_mem/1024**3:.1f} GB")
     gs = 0; la = 0.0; vae = None; vae_loaded = False
     print(f"\n{'='*60}\nTraining!\n{'='*60}\n")
@@ -408,76 +354,49 @@ def train(config):
         for bi, (lats, lbls) in enumerate(train_dl):
             lats = lats.to(device)
             lbls = lbls.to(device) if config.num_classes > 0 else None
             t = fm.sample_timesteps(lats.shape[0], device)
             noise = torch.randn_like(lats)
             xt = fm.add_noise(lats, noise, t)
             vtgt = fm.get_velocity_target(lats, noise)
             with autocast("cuda", enabled=config.mixed_precision and torch.cuda.is_available()):
                 vp = model(xt, t, lbls)
                 loss = F.mse_loss(vp, vtgt) / config.gradient_accumulation_steps
             scaler.scale(loss).backward()
             la += loss.item()
             if (bi + 1) % config.gradient_accumulation_steps == 0:
                 scaler.unscale_(opt)
                 gn = torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
                 scaler.step(opt); scaler.update(); opt.zero_grad(); sched.step()
                 ema.update(model); gs += 1
                 if gs % config.log_every_n_steps == 0:
                     al = la / config.log_every_n_steps
-                    lr = opt.param_groups[0]["lr"]
                     vram = torch.cuda.memory_allocated()/1024**3 if torch.cuda.is_available() else 0
                     sps = gs / max(time.time() - t_start, 1)
                     print(f"step={gs:>6d} | ep={epoch} | loss={al:.4f} | gn={gn:.2f} | "
-                          f"lr={lr:.2e} | vram={vram:.1f}G | {sps:.1f} st/s")
                     la = 0.0
-                    if math.isnan(al) or al > 50:
-                        print("Diverged!"); return
                 if gs % config.sample_every_n_steps == 0:
                     if not vae_loaded:
                         from diffusers import AutoencoderKL
-                        vae = AutoencoderKL.from_pretrained(
-                            config.vae_id, torch_dtype=torch.float16
-                        ).to(device).eval()
                         for p in vae.parameters(): p.requires_grad_(False)
                         vae_loaded = True
                     ema.apply(model); model.eval()
-                    sl = torch.randint(0, max(1, config.num_classes), (config.num_samples,),
-                                       device=device) if config.num_classes > 0 else None
                     samp = fm.sample(model, (config.num_samples, config.latent_channels, lat_size, lat_size),
                                      device, config.num_sample_steps, sl, config.cfg_scale)
                     with torch.no_grad():
-                        dec = samp.half() / config.vae_scaling_factor
-                        imgs = ((vae.decode(dec).sample + 1) / 2).clamp(0, 1).float()
                     from torchvision.utils import save_image
-                    sp = f"{config.output_dir}/samples/step_{gs:07d}.png"
-                    save_image(imgs, sp, nrow=2); print(f"  Saved: {sp}")
-                    ema.restore(model); model.train()
                 if gs % config.save_every_n_steps == 0:
-                    cp = f"{config.output_dir}/checkpoints/step_{gs:07d}.pt"
                     torch.save({"model": model.state_dict(), "ema": ema.shadow,
-                                "optimizer": opt.state_dict(), "scheduler": sched.state_dict(),
-                                "step": gs, "epoch": epoch, "model_config": mcfg}, cp)
-                    print(f"  Saved: {cp}")
         print(f"Epoch {epoch} | {time.time()-et:.0f}s\n")
     final = f"{config.output_dir}/checkpoints/final.pt"
-    torch.save({"model": model.state_dict(), "ema": ema.shadow,
-                "model_config": mcfg, "step": gs}, final)
     print(f"\nDone! {gs} steps, {(time.time()-t_start)/60:.1f}min -> {final}")
-if __name__ == "__main__":
-    config = TrainConfig(
-        model_size="small", dataset_preset="cartoon",
-        image_size=256, batch_size=8, num_epochs=5,
-        log_every_n_steps=5, sample_every_n_steps=99999,
-    )
-    train(config)

 Optimized for Colab free tier:
 - Latent pre-caching: encode images with VAE once, save to disk, train on pure tensors
 - No VAE needed during training loop -> saves ~1GB VRAM + faster iterations
+- Gradient checkpointing enabled by default (saves ~50% activation VRAM)
+- Auto batch size selection based on model size + image size + GPU VRAM
 - All datasets are pure parquet — no legacy loading scripts
 - Uses madebyollin/sdxl-vae-fp16-fix (fully open, no login, fp16 stable)
 """
 import torch
 from dataclasses import dataclass, asdict
 DATASET_PRESETS = {
     "cartoon": {
         "name": "Norod78/cartoon-blip-captions",
         "config": "",
         "image_column": "image",
         "label_column": "style",
+        "num_classes": 0,
         "description": "~105K paintings with style labels, 1.6GB (use max_images to limit)",
     },
     "art_painting": {
 }
def auto_batch_size(model_size, image_size, gpu_mem_gb):
    """Pick a conservative power-of-two training batch size for a GPU.

    The estimate subtracts a fixed per-model cost (weights, gradients and
    optimizer state) plus 1.5 GB of headroom from ``gpu_mem_gb`` and divides
    the remainder by a per-sample activation cost. All costs are hard-coded
    estimates in GB; the per-sample figures assume gradient checkpointing.

    Args:
        model_size: Preset name ("small", "base" or "large"). Unknown names
            fall back to 1.0 GB fixed cost and 0.1 GB per sample.
        image_size: Training resolution in pixels. 256 and 512 are tabulated;
            any other value uses the 0.1 GB per-sample fallback.
        gpu_mem_gb: Total GPU memory in GB.

    Returns:
        A batch size from {1, 2, 4, 8, 16, 32}. 1 is returned when the
        estimate leaves no room for even a single sample.
    """
    # Fixed memory per model (weights + grads + optimizer) in GB
    fixed_gb = {"small": 0.66, "base": 1.68, "large": 3.35}
    # Activation memory per sample (GB, with gradient checkpointing)
    # 256px: lat=32x32, patch=16x16  |  512px: lat=64x64, patch=32x32
    activation_gb = {
        "small": {256: 0.02, 512: 0.07},
        "base": {256: 0.03, 512: 0.13},
        "large": {256: 0.05, 512: 0.21},
    }
    max_batch = 32

    base = fixed_gb.get(model_size, 1.0)
    per_sample = activation_gb.get(model_size, {}).get(image_size, 0.1)

    # Leave 1.5GB headroom for PyTorch overhead, CUDA kernels, VAE loading
    available = gpu_mem_gb - base - 1.5
    fit = int(available / per_sample)
    if fit < 1:
        return 1

    # Round down to the nearest power of two (3 -> 2, 7 -> 4, ...) and cap it.
    return min(max_batch, 1 << (fit.bit_length() - 1))
 @dataclass
 class TrainConfig:
+    model_size: str = "small"
+    num_classes: int = 0
     class_drop_prob: float = 0.1
+    dataset_preset: str = "cartoon"
+    image_size: int = 256
+    max_images: int = 0
     vae_id: str = "madebyollin/sdxl-vae-fp16-fix"
     vae_scaling_factor: float = 0.13025
     latent_channels: int = 4
+    batch_size: int = 0          # 0 = auto-detect based on GPU
     gradient_accumulation_steps: int = 1
     learning_rate: float = 1e-4
     weight_decay: float = 0.01
     warmup_steps: int = 500
     ema_decay: float = 0.9999
     mixed_precision: bool = True
+    gradient_checkpointing: bool = True  # Enabled by default!
     min_timestep: float = 0.001
     max_timestep: float = 0.999
     output_dir: str = "./outputs"
     save_every_n_steps: int = 2000
     sample_every_n_steps: int = 500
     log_every_n_steps: int = 25
     num_sample_steps: int = 50
     cfg_scale: float = 2.0
     num_samples: int = 4
     seed: int = 42
     num_workers: int = 2
     compile_model: bool = False
     push_to_hub: bool = False
     hub_model_id: str = ""
     return cfg
 class CachedLatentDataset(Dataset):
     """Map-style dataset over latents loaded from a cache file.

     The cache is a ``torch.save``-d dict with a ``"latents"`` entry and an
     optional ``"labels"`` entry. Everything is loaded onto the CPU.
     """

     def __init__(self, cache_path):
         """Load the cache at ``cache_path`` and print a short summary."""
         payload = torch.load(cache_path, map_location="cpu", weights_only=True)
         self.latents = payload["latents"]
         self.labels = payload.get("labels", None)
         print(f"Loaded {len(self.latents)} cached latents from {cache_path}")
         print(f"  Shape: {self.latents.shape}")
         # Negative labels are excluded from the class count below.
         has_labels = self.labels is not None and (self.labels >= 0).any()
         if has_labels:
             valid = self.labels[self.labels >= 0]
             print(f"  Labels: {valid.unique().shape[0]} classes")

     def __len__(self):
         """Return the number of cached latents."""
         return len(self.latents)

     def __getitem__(self, idx):
         """Return ``(latent, label)``; the label is -1 when none were cached."""
         if self.labels is None:
             return self.latents[idx], -1
         return self.latents[idx], self.labels[idx]
 def precache_latents(config, cache_path=None):
     if cache_path is None:
         cache_path = os.path.join(config.output_dir, "cached_latents.pt")
     if os.path.exists(cache_path):
         print(f"Cache exists: {cache_path}")
         data = torch.load(cache_path, map_location="cpu", weights_only=True)
     os.makedirs(os.path.dirname(cache_path) if os.path.dirname(cache_path) else ".", exist_ok=True)
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     print(f"Loading VAE: {config.vae_id}...")
     from diffusers import AutoencoderKL
+    vae = AutoencoderKL.from_pretrained(config.vae_id, torch_dtype=torch.float16).to(device).eval()
+    for p in vae.parameters(): p.requires_grad_(False)
     print(f"  VAE: {sum(p.numel() for p in vae.parameters())/1e6:.0f}M params")
     preset = DATASET_PRESETS[config.dataset_preset]
     print(f"Loading: {preset['name']} ({preset['description']})")
     from datasets import load_dataset
     from torchvision import transforms
     ds_kwargs = {"split": "train"}
+    if preset["config"]: ds_kwargs["name"] = preset["config"]
     dataset = load_dataset(preset["name"], **ds_kwargs)
     transform = transforms.Compose([
         transforms.Resize(config.image_size, interpolation=transforms.InterpolationMode.LANCZOS),
+        transforms.CenterCrop(config.image_size), transforms.ToTensor(),
     ])
+    img_col, lbl_col = preset["image_column"], preset["label_column"]
     style_to_id = {}
+    all_latents, all_labels = [], []
+    batch_px, batch_lb = [], []
+    count, max_imgs = 0, config.max_images if config.max_images > 0 else float("inf")
     t0 = time.time()
     for item in dataset:
+        if count >= max_imgs: break
         img = item[img_col]
+        if img.mode != "RGB": img = img.convert("RGB")
+        batch_px.append(transform(img))
         if lbl_col and lbl_col in item:
+            raw = item[lbl_col]
+            if isinstance(raw, str):
+                if raw not in style_to_id: style_to_id[raw] = len(style_to_id)
+                batch_lb.append(style_to_id[raw])
+            elif isinstance(raw, int): batch_lb.append(raw)
+            else: batch_lb.append(-1)
+        else: batch_lb.append(-1)
         count += 1
+        if len(batch_px) >= 16:
             with torch.no_grad():
+                px = torch.stack(batch_px).to(device, dtype=torch.float16) * 2 - 1
+                lat = vae.encode(px).latent_dist.sample() * config.vae_scaling_factor
                 all_latents.append(lat.cpu().float())
+            all_labels.extend(batch_lb); batch_px, batch_lb = [], []
+            if count % 500 == 0: print(f"  {count} images ({time.time()-t0:.0f}s)")
+    if batch_px:
         with torch.no_grad():
+            px = torch.stack(batch_px).to(device, dtype=torch.float16) * 2 - 1
+            lat = vae.encode(px).latent_dist.sample() * config.vae_scaling_factor
             all_latents.append(lat.cpu().float())
+        all_labels.extend(batch_lb)
     all_latents = torch.cat(all_latents, dim=0)
     all_labels = torch.tensor(all_labels, dtype=torch.long)
     save_data = {"latents": all_latents, "labels": all_labels}
     if style_to_id:
         save_data["style_to_id"] = style_to_id
+        print(f"  {len(style_to_id)} style classes mapped")
     torch.save(save_data, cache_path)
     mb = os.path.getsize(cache_path) / 1024**2
+    print(f"\nCached {count} latents -> {cache_path} ({all_latents.shape}, {mb:.0f}MB, {time.time()-t0:.0f}s)")
     del vae
+    if torch.cuda.is_available(): torch.cuda.empty_cache()
     print("  VAE unloaded\n")
     return cache_path
 class EMAModel:
     """Exponential moving average of a model's trainable parameters.

     ``shadow`` maps parameter names to detached copies of every parameter
     that had ``requires_grad`` set at construction time. ``apply`` swaps the
     averaged weights into a model and ``restore`` swaps the live ones back.
     """

     def __init__(self, model, decay=0.9999):
         """Snapshot ``model``'s trainable parameters as the initial average.

         Args:
             model: Module whose ``named_parameters()`` are tracked.
             decay: Weight kept from the previous average on each update.
         """
         self.decay = decay
         self.shadow = {n: p.clone().detach() for n, p in model.named_parameters() if p.requires_grad}
         # Created up front so restore() before any apply() is a harmless
         # no-op instead of an AttributeError.
         self.backup = {}

     @torch.no_grad()
     def update(self, model):
         """Blend current weights in: shadow = decay * shadow + (1 - decay) * param."""
         for n, p in model.named_parameters():
             if p.requires_grad and n in self.shadow:
                 self.shadow[n].mul_(self.decay).add_(p.data, alpha=1 - self.decay)

     def apply(self, model):
         """Back up ``model``'s trainable weights, then overwrite them with the average."""
         self.backup = {n: p.data.clone() for n, p in model.named_parameters() if p.requires_grad}
         for n, p in model.named_parameters():
             if p.requires_grad and n in self.shadow:
                 p.data.copy_(self.shadow[n])

     def restore(self, model):
         """Copy the weights saved by the last ``apply`` back and drop the backup."""
         for n, p in model.named_parameters():
             if p.requires_grad and n in self.backup:
                 p.data.copy_(self.backup[n])
         self.backup = {}
 class FlowMatchingScheduler:
     """Noise schedule and Euler sampler for velocity-prediction flow matching.

     Interpolation is ``x_t = (1 - t) * x0 + t * noise`` and the regression
     target is ``noise - x0``.
     """

     def __init__(self, min_t=0.001, max_t=0.999):
         """Store the range that training timesteps are drawn from."""
         self.min_t, self.max_t = min_t, max_t

     def sample_timesteps(self, bs, dev):
         """Draw ``bs`` timesteps uniformly from ``[min_t, max_t)`` on ``dev``."""
         return torch.rand(bs, device=dev) * (self.max_t - self.min_t) + self.min_t

     def add_noise(self, x0, noise, t):
         """Interpolate between ``x0`` (t=0) and ``noise`` (t=1).

         ``t`` is reshaped to ``(-1, 1, 1, 1)`` so one value per sample
         broadcasts over 4-D inputs.
         """
         t = t.view(-1, 1, 1, 1)
         return (1 - t) * x0 + t * noise

     def get_velocity_target(self, x0, noise):
         """Return the velocity the model is trained to predict."""
         return noise - x0

     @torch.no_grad()
     def sample(self, model, shape, dev, num_steps=50, labels=None, cfg=1.0):
         """Integrate from t=1 down to t=0 with fixed-size Euler steps.

         Args:
             model: Callable as ``model(x, t, labels)``; it is switched to eval
                 mode and left there.
             shape: Shape of the noise tensor to start from (batch first).
             dev: Device (string or ``torch.device``) to sample on.
             num_steps: Number of Euler steps; the step size is ``1 / num_steps``.
             labels: Optional class labels passed through to the model.
             cfg: Classifier-free guidance scale, used only when it is > 1.0
                 and ``labels`` is given.

         Returns:
             The final sample tensor of the requested ``shape``.
         """
         model.eval()
         x = torch.randn(shape, device=dev)
         dt = 1.0 / num_steps
         # Autocast targets CUDA only; requesting it on another device just
         # triggers a warning, so enable it only when sampling on CUDA.
         use_amp = torch.device(dev).type == "cuda"
         for tv in torch.linspace(1.0, dt, num_steps, device=dev):
             t = torch.full((shape[0],), tv.item(), device=dev)
             with torch.amp.autocast("cuda", enabled=use_amp):
                 if cfg > 1.0 and labels is not None:
                     vc = model(x, t, labels)
                     # NOTE(review): label 0 is used as the unconditional input;
                     # confirm this matches the model's null-class convention.
                     vu = model(x, t, torch.zeros_like(labels))
                     v = vu + cfg * (vc - vu)
                 else:
                     v = model(x, t, labels)
             x = x - dt * v.float()
         return x
     return torch.optim.lr_scheduler.LambdaLR(opt, lr)
 def train(config):
     from model import LiquidGen
     torch.manual_seed(config.seed)
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    gpu_mem = 0
     if torch.cuda.is_available():
+        gpu_mem = torch.cuda.get_device_properties(0).total_mem / 1024**3
+        print(f"GPU: {torch.cuda.get_device_name(0)} ({gpu_mem:.1f} GB)")
+    # Auto batch size if not set
+    if config.batch_size <= 0:
+        if gpu_mem > 0:
+            config.batch_size = auto_batch_size(config.model_size, config.image_size, gpu_mem)
+            print(f"Auto batch size: {config.batch_size} (for {config.model_size} at {config.image_size}px on {gpu_mem:.0f}GB)")
+        else:
+            config.batch_size = 4
     os.makedirs(config.output_dir, exist_ok=True)
     os.makedirs(f"{config.output_dir}/samples", exist_ok=True)
     os.makedirs(f"{config.output_dir}/checkpoints", exist_ok=True)
     with open(f"{config.output_dir}/config.json", "w") as f:
         json.dump(asdict(config), f, indent=2)
     cache_path = precache_latents(config)
     train_ds = CachedLatentDataset(cache_path)
     train_dl = DataLoader(train_ds, batch_size=config.batch_size, shuffle=True,
                           num_workers=config.num_workers, pin_memory=True, drop_last=True)
     mcfg = get_model_config(config.model_size, config.num_classes, config.class_drop_prob)
     mcfg["in_channels"] = config.latent_channels
     model = LiquidGen(**mcfg).to(device)
+    # Enable gradient checkpointing (saves ~50% activation VRAM)
+    if config.gradient_checkpointing:
+        model.enable_gradient_checkpointing()
+        print(f"Gradient checkpointing: ON")
     print(f"LiquidGen-{config.model_size}: {model.count_params()/1e6:.1f}M params")
     if config.compile_model and hasattr(torch, "compile"):
     fm = FlowMatchingScheduler(config.min_timestep, config.max_timestep)
     lat_size = config.image_size // 8
+    print(f"Steps: {total_steps}, Batch: {config.batch_size}x{config.gradient_accumulation_steps}")
     print(f"Latent: [{config.batch_size}, {config.latent_channels}, {lat_size}, {lat_size}]")
     if torch.cuda.is_available():
+        print(f"VRAM: {torch.cuda.memory_allocated()/1024**3:.1f} / {gpu_mem:.1f} GB")
     gs = 0; la = 0.0; vae = None; vae_loaded = False
     print(f"\n{'='*60}\nTraining!\n{'='*60}\n")
         for bi, (lats, lbls) in enumerate(train_dl):
             lats = lats.to(device)
             lbls = lbls.to(device) if config.num_classes > 0 else None
             t = fm.sample_timesteps(lats.shape[0], device)
             noise = torch.randn_like(lats)
             xt = fm.add_noise(lats, noise, t)
             vtgt = fm.get_velocity_target(lats, noise)
             with autocast("cuda", enabled=config.mixed_precision and torch.cuda.is_available()):
                 vp = model(xt, t, lbls)
                 loss = F.mse_loss(vp, vtgt) / config.gradient_accumulation_steps
             scaler.scale(loss).backward()
             la += loss.item()
             if (bi + 1) % config.gradient_accumulation_steps == 0:
                 scaler.unscale_(opt)
                 gn = torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
                 scaler.step(opt); scaler.update(); opt.zero_grad(); sched.step()
                 ema.update(model); gs += 1
                 if gs % config.log_every_n_steps == 0:
                     al = la / config.log_every_n_steps
                     vram = torch.cuda.memory_allocated()/1024**3 if torch.cuda.is_available() else 0
                     sps = gs / max(time.time() - t_start, 1)
                     print(f"step={gs:>6d} | ep={epoch} | loss={al:.4f} | gn={gn:.2f} | "
+                          f"lr={opt.param_groups[0]['lr']:.2e} | vram={vram:.1f}G | {sps:.1f} st/s")
                     la = 0.0
+                    if math.isnan(al) or al > 50: print("Diverged!"); return
                 if gs % config.sample_every_n_steps == 0:
                     if not vae_loaded:
                         from diffusers import AutoencoderKL
+                        vae = AutoencoderKL.from_pretrained(config.vae_id, torch_dtype=torch.float16).to(device).eval()
                         for p in vae.parameters(): p.requires_grad_(False)
                         vae_loaded = True
                     ema.apply(model); model.eval()
+                    sl = torch.randint(0, max(1, config.num_classes), (config.num_samples,), device=device) if config.num_classes > 0 else None
                     samp = fm.sample(model, (config.num_samples, config.latent_channels, lat_size, lat_size),
                                      device, config.num_sample_steps, sl, config.cfg_scale)
                     with torch.no_grad():
+                        imgs = ((vae.decode(samp.half() / config.vae_scaling_factor).sample + 1) / 2).clamp(0, 1).float()
                     from torchvision.utils import save_image
+                    save_image(imgs, f"{config.output_dir}/samples/step_{gs:07d}.png", nrow=2)
+                    print(f"  Saved samples"); ema.restore(model); model.train()
                 if gs % config.save_every_n_steps == 0:
                     torch.save({"model": model.state_dict(), "ema": ema.shadow,
+                                "optimizer": opt.state_dict(), "step": gs, "model_config": mcfg},
+                               f"{config.output_dir}/checkpoints/step_{gs:07d}.pt")
         print(f"Epoch {epoch} | {time.time()-et:.0f}s\n")
     final = f"{config.output_dir}/checkpoints/final.pt"
+    torch.save({"model": model.state_dict(), "ema": ema.shadow, "model_config": mcfg, "step": gs}, final)
     print(f"\nDone! {gs} steps, {(time.time()-t_start)/60:.1f}min -> {final}")