asdf98 committed
Commit ce2ad4d · verified · 1 Parent(s): 635ef78

Add optimized training v2 with latent pre-caching for Colab
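The key idea in this commit: run the VAE over the dataset once, cache the resulting latents to disk, and train purely on those tensors so no VAE occupies VRAM during the training loop. A minimal sketch of the pattern (illustrative only; the helper name cache_latents is hypothetical, and the commit's actual entry points are precache_latents() and CachedLatentDataset in the diff below):

import torch
from diffusers import AutoencoderKL

@torch.no_grad()
def cache_latents(images, vae, scaling, shift, path, batch_size=16):
    # images: float tensor [N, 3, H, W] scaled to [0, 1]
    chunks = []
    for i in range(0, len(images), batch_size):
        px = images[i:i + batch_size].to(vae.device, dtype=vae.dtype) * 2 - 1
        lat = vae.encode(px).latent_dist.sample()        # [B, C, H/8, W/8]
        chunks.append(((lat - shift) * scaling).cpu().float())
    torch.save({"latents": torch.cat(chunks)}, path)     # train from this file

# vae = AutoencoderKL.from_pretrained(...); after caching, `del vae` frees VRAM.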

Files changed (1):
1. train.py +379 -380
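The training objective stated in the module docstring below is standard velocity-prediction flow matching; a self-contained toy version of the loss plus the Euler update it implies (a sketch: model is any callable taking (x_t, t), not the repo's LiquidGen):

import torch
import torch.nn.functional as F

def fm_loss(model, x0, t):                  # x0: clean latents [B, C, H, W]
    noise = torch.randn_like(x0)
    t_ = t.view(-1, 1, 1, 1)
    x_t = (1 - t_) * x0 + t_ * noise        # linear interpolation
    v_target = noise - x0                   # velocity target
    return F.mse_loss(model(x_t, t), v_target)

# At inference, integrate from t=1 (noise) toward t=0 (data) with Euler steps:
#     x = x - dt * model(x, t)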
train.py CHANGED
@@ -1,95 +1,136 @@
  """
- LiquidGen Training Pipeline

  Flow Matching training objective (velocity prediction):
- - Forward: x_t = (1 - t) * x_0 + t * ε (linear interpolation)
- - Target: v = ε - x_0 (velocity)
  - Loss: MSE(model(x_t, t), v)
-
- At inference: solve ODE from t=1 (noise) to t=0 (clean) using Euler steps.
-
- Dataset loading: Uses STREAMING mode by default — no full download needed!
- For small datasets (<500MB), set use_streaming=False for faster epoch iteration.
  """

  import torch
  import torch.nn as nn
  import torch.nn.functional as F
- from torch.utils.data import DataLoader, Dataset, IterableDataset
  from torch.amp import autocast, GradScaler
  import math
  import os
  import json
  import time
- from pathlib import Path
- from typing import Optional, Dict, Any
- from dataclasses import dataclass, field, asdict


  @dataclass
  class TrainConfig:
-     """Training configuration with sensible defaults for Colab free tier."""
      # Model
-     model_size: str = "small"
-     num_classes: int = 0
      class_drop_prob: float = 0.1
-
      # Data
-     image_size: int = 256
-     dataset_name: str = "huggan/wikiart"
-     dataset_config: str = ""
-     image_column: str = "image"
-     label_column: str = ""
-     use_streaming: bool = True  # KEY: streaming mode, no full download
-     max_samples: int = 0  # 0 = use all (only for non-streaming)
-     streaming_buffer: int = 1000  # Shuffle buffer for streaming
-
-     # VAE (SDXL VAE - open access, no login needed, fp16-safe)
-     vae_id: str = "madebyollin/sdxl-vae-fp16-fix"
-     vae_subfolder: str = ""
-     vae_dtype: str = "float16"
-     vae_scaling_factor: float = 0.13025
-     vae_shift_factor: float = 0.0  # SDXL VAE has no shift
-
      # Training
-     batch_size: int = 8
-     gradient_accumulation_steps: int = 4
      learning_rate: float = 1e-4
      weight_decay: float = 0.01
-     max_grad_norm: float = 2.0
-     max_steps: int = 50000  # Train by steps, not epochs (better for streaming)
-     warmup_steps: int = 1000
      ema_decay: float = 0.9999
      mixed_precision: bool = True
-
      # Flow matching
      min_timestep: float = 0.001
      max_timestep: float = 0.999
-
      # Saving
      output_dir: str = "./outputs"
-     save_every_n_steps: int = 5000
-     sample_every_n_steps: int = 1000
-     log_every_n_steps: int = 50
-
      # Sampling
      num_sample_steps: int = 50
-     cfg_scale: float = 1.5
      num_samples: int = 4
-
      # System
      seed: int = 42
-     num_workers: int = 0  # 0 for streaming (required)
-     pin_memory: bool = True
      compile_model: bool = False
-
      # Hub
      push_to_hub: bool = False
      hub_model_id: str = ""


- def get_model_config(size: str, num_classes: int = 0, class_drop_prob: float = 0.1) -> dict:
-     """Get model kwargs for a given size preset."""
      configs = {
          "small": dict(embed_dim=512, depth=12, spatial_kernel=7, scan_kernel=31,
                        expand_ratio=2.0, mlp_ratio=3.0),
@@ -106,384 +147,342 @@ def get_model_config(size: str, num_classes: int = 0, class_drop_prob: float = 0


  # =============================================================================
- # Dataset Loaders
  # =============================================================================

- class StreamingImageDataset(IterableDataset):
-     """
-     Streaming dataset — loads images on-the-fly from HuggingFace Hub.
-     NO full download needed. Starts training immediately.
-
-     Perfect for large datasets (WikiArt, LAION, etc.) on Colab free tier.
-     """
-     def __init__(self, dataset_name, image_column="image", label_column="",
-                  image_size=256, split="train", dataset_config="",
-                  buffer_size=1000, seed=42):
-         super().__init__()
-         self.dataset_name = dataset_name
-         self.image_column = image_column
-         self.label_column = label_column
-         self.split = split
-         self.dataset_config = dataset_config
-         self.buffer_size = buffer_size
-         self.seed = seed
-
-         from torchvision import transforms
-         self.transform = transforms.Compose([
-             transforms.Resize(image_size, interpolation=transforms.InterpolationMode.LANCZOS),
-             transforms.CenterCrop(image_size),
-             transforms.RandomHorizontalFlip(),
-             transforms.ToTensor(),
-         ])
-
-     def _get_stream(self):
-         from datasets import load_dataset
-         kwargs = {}
-         if self.dataset_config:
-             kwargs["name"] = self.dataset_config
-         ds = load_dataset(self.dataset_name, split=self.split, streaming=True, **kwargs)
-         ds = ds.shuffle(seed=self.seed, buffer_size=self.buffer_size)
-         return iter(ds)
-
-     def __iter__(self):
-         stream = self._get_stream()
-         for item in stream:
-             try:
-                 img = item[self.image_column]
-                 if img.mode != "RGB":
-                     img = img.convert("RGB")
-                 img_tensor = self.transform(img)
-                 label = -1
-                 if self.label_column and self.label_column in item:
-                     label = item[self.label_column]
-                 yield img_tensor, label
-             except Exception:
-                 continue
-
-
- class MapImageDataset(Dataset):
-     """
-     Standard map-style dataset for small datasets that fit in memory.
-     Downloads once, then fast random access.
-
-     Good for: Pokemon (95MB), Flowers (330MB), few-shot-art (510MB)
-     """
-     def __init__(self, dataset_name, image_column="image", label_column="",
-                  image_size=256, split="train", dataset_config="", max_samples=0):
-         super().__init__()
-         self.image_column = image_column
-         self.label_column = label_column
-
-         from datasets import load_dataset
-         from torchvision import transforms
-
-         kwargs = {}
-         if dataset_config:
-             kwargs["name"] = dataset_config
-
-         print(f"Downloading {dataset_name}...")
-         self.dataset = load_dataset(dataset_name, split=split, **kwargs)
-         if max_samples > 0:
-             self.dataset = self.dataset.select(range(min(max_samples, len(self.dataset))))
-         print(f"  {len(self.dataset)} images loaded")
-
-         self.transform = transforms.Compose([
-             transforms.Resize(image_size, interpolation=transforms.InterpolationMode.LANCZOS),
-             transforms.CenterCrop(image_size),
-             transforms.RandomHorizontalFlip(),
-             transforms.ToTensor(),
-         ])
-
      def __len__(self):
-         return len(self.dataset)
-
      def __getitem__(self, idx):
-         item = self.dataset[idx]
-         img = item[self.image_column]
          if img.mode != "RGB":
              img = img.convert("RGB")
-         img = self.transform(img)
-         label = item[self.label_column] if self.label_column and self.label_column in item else -1
-         return img, label


  # =============================================================================
- # Training Utilities
  # =============================================================================

  class EMAModel:
-     """Exponential Moving Average of model parameters."""
-     def __init__(self, model: nn.Module, decay: float = 0.9999):
          self.decay = decay
-         self.shadow = {name: p.clone().detach() for name, p in model.named_parameters() if p.requires_grad}
-
      @torch.no_grad()
-     def update(self, model: nn.Module):
-         for name, p in model.named_parameters():
-             if p.requires_grad and name in self.shadow:
-                 self.shadow[name].mul_(self.decay).add_(p.data, alpha=1 - self.decay)
-
-     def apply(self, model: nn.Module):
-         self.backup = {name: p.data.clone() for name, p in model.named_parameters() if p.requires_grad}
-         for name, p in model.named_parameters():
-             if p.requires_grad and name in self.shadow:
-                 p.data.copy_(self.shadow[name])
-
-     def restore(self, model: nn.Module):
-         for name, p in model.named_parameters():
-             if p.requires_grad and name in self.backup:
-                 p.data.copy_(self.backup[name])
          self.backup = {}
-
-     def state_dict(self):
-         return self.shadow
-
-     def load_state_dict(self, state_dict):
-         self.shadow = state_dict


  class FlowMatchingScheduler:
-     """Flow Matching: x_t = (1-t)*x_0 + t*ε, v_target = ε - x_0"""
      def __init__(self, min_t=0.001, max_t=0.999):
          self.min_t, self.max_t = min_t, max_t
-
-     def sample_timesteps(self, batch_size, device):
-         return torch.rand(batch_size, device=device) * (self.max_t - self.min_t) + self.min_t
-
      def add_noise(self, x0, noise, t):
-         t = t.view(-1, 1, 1, 1)
-         return (1 - t) * x0 + t * noise
-
      def get_velocity_target(self, x0, noise):
          return noise - x0
-
      @torch.no_grad()
-     def sample(self, model, shape, device, num_steps=50, class_labels=None,
-                cfg_scale=1.0, dtype=torch.float32):
-         model.eval()
-         x = torch.randn(shape, device=device, dtype=dtype)
          dt = 1.0 / num_steps
-         for t_val in torch.linspace(1.0, dt, num_steps, device=device):
-             t = torch.full((shape[0],), t_val.item(), device=device, dtype=dtype)
-             if cfg_scale > 1.0 and class_labels is not None:
-                 with torch.amp.autocast('cuda', enabled=(dtype != torch.float32)):
-                     v_cond = model(x, t, class_labels)
-                     v_uncond = model(x, t, torch.zeros_like(class_labels))
-                 v = v_uncond + cfg_scale * (v_cond - v_uncond)
-             else:
-                 with torch.amp.autocast('cuda', enabled=(dtype != torch.float32)):
-                     v = model(x, t, class_labels)
-             x = x - dt * v
          return x


- def get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps):
-     def lr_lambda(step):
-         if step < warmup_steps:
-             return step / max(1, warmup_steps)
-         progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
-         return max(0.0, 0.5 * (1.0 + math.cos(math.pi * progress)))
-     return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
-
-
- @torch.no_grad()
- def encode_images_with_vae(images, vae, scaling_factor, shift_factor):
-     images = images * 2.0 - 1.0
-     latents = vae.encode(images).latent_dist.sample()
-     return (latents - shift_factor) * scaling_factor
-
-
- @torch.no_grad()
- def decode_latents_with_vae(latents, vae, scaling_factor, shift_factor):
-     latents = latents / scaling_factor + shift_factor
-     images = vae.decode(latents).sample
-     return ((images + 1.0) / 2.0).clamp(0, 1)


  # =============================================================================
  # Main Training Loop
  # =============================================================================

- def train(config: TrainConfig):
-     """Main training loop with streaming dataset support."""
      from model import LiquidGen
-
      torch.manual_seed(config.seed)
      device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
      print(f"Device: {device}")
-
      os.makedirs(config.output_dir, exist_ok=True)
-     os.makedirs(os.path.join(config.output_dir, "samples"), exist_ok=True)
-     os.makedirs(os.path.join(config.output_dir, "checkpoints"), exist_ok=True)
-
-     with open(os.path.join(config.output_dir, "config.json"), "w") as f:
          json.dump(asdict(config), f, indent=2)
-
-     # Load VAE (frozen)
-     print("Loading VAE...")
-     from diffusers import AutoencoderKL
-     vae_dtype = torch.float16 if config.vae_dtype == "float16" else torch.bfloat16
-     vae_kwargs = {"torch_dtype": vae_dtype}
-     if config.vae_subfolder:
-         vae_kwargs["subfolder"] = config.vae_subfolder
-     vae = AutoencoderKL.from_pretrained(
-         config.vae_id, **vae_kwargs
-     ).to(device).eval()
-     for p in vae.parameters():
-         p.requires_grad_(False)
-     print(f"VAE: {sum(p.numel() for p in vae.parameters())/1e6:.1f}M params (frozen)")
-
-     # Load Dataset
-     print(f"Loading dataset: {config.dataset_name} (streaming={config.use_streaming})")
-     if config.use_streaming:
-         train_dataset = StreamingImageDataset(
-             dataset_name=config.dataset_name,
-             image_column=config.image_column,
-             label_column=config.label_column,
-             image_size=config.image_size,
-             dataset_config=config.dataset_config,
-             buffer_size=config.streaming_buffer,
-             seed=config.seed,
-         )
-         train_loader = DataLoader(
-             train_dataset, batch_size=config.batch_size,
-             num_workers=0,  # Required for streaming
-             pin_memory=config.pin_memory,
-         )
-         print("  Streaming mode — no full download, starts immediately!")
-     else:
-         train_dataset = MapImageDataset(
-             dataset_name=config.dataset_name,
-             image_column=config.image_column,
-             label_column=config.label_column,
-             image_size=config.image_size,
-             dataset_config=config.dataset_config,
-             max_samples=config.max_samples,
-         )
-         train_loader = DataLoader(
-             train_dataset, batch_size=config.batch_size, shuffle=True,
-             num_workers=2, pin_memory=config.pin_memory, drop_last=True,
-         )
-
-     # Create Model
-     model_kwargs = get_model_config(config.model_size, config.num_classes, config.class_drop_prob)
-     model = LiquidGen(**model_kwargs).to(device)
-     print(f"LiquidGen-{config.model_size}: {model.count_params() / 1e6:.1f}M params")
-
      if config.compile_model and hasattr(torch, "compile"):
          model = torch.compile(model)
-
-     optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate,
-                                   weight_decay=config.weight_decay, betas=(0.9, 0.999))
-     scheduler = get_cosine_schedule_with_warmup(optimizer, config.warmup_steps, config.max_steps)
-     ema = EMAModel(model, decay=config.ema_decay)
-     scaler = GradScaler('cuda', enabled=config.mixed_precision)
-     fm = FlowMatchingScheduler(min_t=config.min_timestep, max_t=config.max_timestep)
-
-     print(f"\nTraining for {config.max_steps} steps")
-     print(f"Effective batch size: {config.batch_size * config.gradient_accumulation_steps}")
-
-     # Step-based training loop (works for both streaming and map datasets)
-     global_step = 0
-     loss_accum = 0.0
-     accum_count = 0
-     model.train()
      t_start = time.time()
-
-     while global_step < config.max_steps:
-         for images, labels in train_loader:
-             if global_step >= config.max_steps:
-                 break
-
-             images = images.to(device)
-             labels = labels.to(device) if config.num_classes > 0 else None
-
-             # Encode to latents
-             with torch.no_grad():
-                 latents = encode_images_with_vae(
-                     images.to(vae_dtype), vae, config.vae_scaling_factor, config.vae_shift_factor
-                 ).float()
-
-             # Flow matching
-             t = fm.sample_timesteps(latents.shape[0], device)
-             noise = torch.randn_like(latents)
-             x_t = fm.add_noise(latents, noise, t)
-             v_target = fm.get_velocity_target(latents, noise)
-
-             with autocast('cuda', enabled=config.mixed_precision):
-                 v_pred = model(x_t, t, labels)
-                 loss = F.mse_loss(v_pred, v_target) / config.gradient_accumulation_steps
-
              scaler.scale(loss).backward()
-             loss_accum += loss.item()
-             accum_count += 1
-
-             if accum_count % config.gradient_accumulation_steps == 0:
-                 scaler.unscale_(optimizer)
-                 grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
-                 scaler.step(optimizer)
-                 scaler.update()
-                 optimizer.zero_grad()
-                 scheduler.step()
-                 ema.update(model)
-                 global_step += 1
-
-                 # Logging
-                 if global_step % config.log_every_n_steps == 0:
-                     avg_loss = loss_accum / config.log_every_n_steps
-                     lr = optimizer.param_groups[0]["lr"]
-                     elapsed = time.time() - t_start
-                     steps_per_sec = global_step / max(elapsed, 1)
-                     print(f"step={global_step} | loss={avg_loss:.4f} | "
-                           f"grad_norm={grad_norm:.2f} | lr={lr:.2e} | "
-                           f"steps/s={steps_per_sec:.2f} | elapsed={elapsed:.0f}s")
-                     loss_accum = 0.0
-
-                     if math.isnan(avg_loss) or avg_loss > 100:
-                         print("⚠️ Training diverged!")
-                         return
-
-                 # Sample
-                 if global_step % config.sample_every_n_steps == 0:
-                     ema.apply(model)
-                     model.eval()
-                     latent_size = config.image_size // 8
-                     sample_labels = None
-                     if config.num_classes > 0:
-                         sample_labels = torch.randint(0, config.num_classes, (config.num_samples,), device=device)
-                     latent_ch = vae.config.latent_channels  # 4 for SDXL, 16 for Flux
-                     sampled = fm.sample(model, (config.num_samples, latent_ch, latent_size, latent_size),
-                                         device, config.num_sample_steps, sample_labels, config.cfg_scale)
-                     sample_imgs = decode_latents_with_vae(sampled.to(vae_dtype), vae,
-                                                           config.vae_scaling_factor, config.vae_shift_factor).float()
                      from torchvision.utils import save_image
-                     save_image(sample_imgs, os.path.join(config.output_dir, "samples", f"step_{global_step:07d}.png"), nrow=2)
-                     print(f"  📸 Saved samples: step_{global_step:07d}.png")
-                     ema.restore(model)
-                     model.train()
-
-                 # Checkpoint
-                 if global_step % config.save_every_n_steps == 0:
-                     ckpt_path = os.path.join(config.output_dir, "checkpoints", f"step_{global_step:07d}.pt")
-                     torch.save({
-                         "model": model.state_dict(), "ema": ema.state_dict(),
-                         "optimizer": optimizer.state_dict(), "scheduler": scheduler.state_dict(),
-                         "global_step": global_step, "config": asdict(config),
-                     }, ckpt_path)
-                     print(f"  💾 Checkpoint: {ckpt_path}")
-
-     # Final save
-     final_path = os.path.join(config.output_dir, "checkpoints", "final.pt")
-     torch.save({"model": model.state_dict(), "ema": ema.state_dict(),
-                 "config": asdict(config), "global_step": global_step}, final_path)
-     elapsed = time.time() - t_start
-     print(f"\n🎉 Training complete! {global_step} steps in {elapsed/60:.1f} min")
-     print(f"  Final model: {final_path}")


  if __name__ == "__main__":
      config = TrainConfig(
-         model_size="small", image_size=256, batch_size=4,
-         max_steps=100, use_streaming=True,
      )
      train(config)

  """
+ LiquidGen Training Pipeline v2
+
+ Optimized for Colab free tier:
+ - Latent pre-caching: encode images with VAE once, save to disk, train on pure tensors
+ - No VAE needed during training loop → saves ~1GB VRAM + faster iterations
+ - Streaming support for large datasets
+ - Multiple small dataset presets

  Flow Matching training objective (velocity prediction):
+ - Forward: x_t = (1 - t) * x_0 + t * ε
+ - Target: v = ε - x_0
  - Loss: MSE(model(x_t, t), v)
  """

  import torch
  import torch.nn as nn
  import torch.nn.functional as F
+ from torch.utils.data import DataLoader, Dataset
  from torch.amp import autocast, GradScaler
  import math
  import os
  import json
  import time
+ from typing import Optional
+ from dataclasses import dataclass, asdict
+
+
+ # =============================================================================
+ # Dataset Presets (all verified, fast to download)
+ # =============================================================================
+
+ DATASET_PRESETS = {
+     "paintings_mini": {
+         "name": "keremberke/painting-style-classification",
+         "config": "mini",
+         "image_column": "image",
+         "label_column": "labels",
+         "num_classes": 27,
+         "description": "~200 painting samples, 27 styles, 1.7MB — instant smoke test",
+     },
+     "paintings": {
+         "name": "keremberke/painting-style-classification",
+         "config": "full",
+         "image_column": "image",
+         "label_column": "labels",
+         "num_classes": 27,
+         "description": "~8K paintings, 27 styles, 204MB — best for style-conditional training",
+     },
+     "cartoon": {
+         "name": "Norod78/cartoon-blip-captions",
+         "config": "",
+         "image_column": "image",
+         "label_column": "",
+         "num_classes": 0,
+         "description": "~2.5K cartoon/anime, unconditional, 181MB",
+     },
+     "flowers": {
+         "name": "huggan/flowers-102-categories",
+         "config": "",
+         "image_column": "image",
+         "label_column": "",
+         "num_classes": 0,
+         "description": "~8K flower photos, unconditional, 331MB",
+     },
+     "wikiart_stream": {
+         "name": "huggan/wikiart",
+         "config": "",
+         "image_column": "image",
+         "label_column": "style",
+         "num_classes": 27,
+         "streaming": True,
+         "description": "~80K paintings, 27 styles, STREAMING (0 disk) — use max_images to limit",
+     },
+ }


  @dataclass
  class TrainConfig:
+     """Training configuration optimized for Colab free tier (T4 16GB)."""
      # Model
+     model_size: str = "small"  # small (~55M), base (~140M), large (~280M)
+     num_classes: int = 27
      class_drop_prob: float = 0.1
+
      # Data
+     dataset_preset: str = "paintings"  # key from DATASET_PRESETS
+     image_size: int = 256  # 256 or 512
+     max_images: int = 0  # 0 = use all, >0 = limit (for streaming/testing)
+
+     # VAE (for pre-caching only — NOT loaded during training)
+     vae_id: str = "black-forest-labs/FLUX.1-schnell"
+     vae_subfolder: str = "vae"
+     vae_scaling_factor: float = 0.3611
+     vae_shift_factor: float = 0.1159
+
      # Training
+     batch_size: int = 32  # Can be large since training on cached tensors!
+     gradient_accumulation_steps: int = 1
      learning_rate: float = 1e-4
      weight_decay: float = 0.01
+     max_grad_norm: float = 2.0  # Critical for stability (ZigMa paper)
+     num_epochs: int = 100
+     warmup_steps: int = 500
      ema_decay: float = 0.9999
      mixed_precision: bool = True
+
      # Flow matching
      min_timestep: float = 0.001
      max_timestep: float = 0.999
+
      # Saving
      output_dir: str = "./outputs"
+     save_every_n_steps: int = 2000
+     sample_every_n_steps: int = 500
+     log_every_n_steps: int = 25
+
      # Sampling
      num_sample_steps: int = 50
+     cfg_scale: float = 2.0
      num_samples: int = 4
+
      # System
      seed: int = 42
+     num_workers: int = 2
      compile_model: bool = False
+
      # Hub
      push_to_hub: bool = False
      hub_model_id: str = ""


+ def get_model_config(size, num_classes=0, class_drop_prob=0.1):
      configs = {
          "small": dict(embed_dim=512, depth=12, spatial_kernel=7, scan_kernel=31,
                        expand_ratio=2.0, mlp_ratio=3.0),


  # =============================================================================
+ # Latent Pre-Caching (the key optimization for Colab)
  # =============================================================================

+ class CachedLatentDataset(Dataset):
+     """Training dataset from pre-encoded VAE latents on disk."""
+
+     def __init__(self, cache_path):
+         data = torch.load(cache_path, map_location="cpu", weights_only=True)
+         self.latents = data["latents"]
+         self.labels = data.get("labels", None)
+         print(f"Loaded {len(self.latents)} cached latents from {cache_path}")
+         print(f"  Shape: {self.latents.shape}, dtype: {self.latents.dtype}")
+         if self.labels is not None:
+             print(f"  Labels: unique={self.labels.unique().shape[0]}")
+
      def __len__(self):
+         return len(self.latents)
+
      def __getitem__(self, idx):
+         lat = self.latents[idx]
+         label = self.labels[idx] if self.labels is not None else -1
+         return lat, label
+
+
+ def precache_latents(config, cache_path=None):
+     """
+     Encode all images to VAE latents once, save to disk.
+
+     After caching:
+     - VAE unloaded → frees ~1GB VRAM
+     - Training loads pure tensors → much faster iterations
+     - Larger batch sizes possible (no VAE memory overhead)
+
+     Returns path to cache file.
+     """
+     if cache_path is None:
+         cache_path = os.path.join(config.output_dir, "cached_latents.pt")
+
+     if os.path.exists(cache_path):
+         print(f"✅ Cache exists: {cache_path}")
+         data = torch.load(cache_path, map_location="cpu", weights_only=True)
+         print(f"  {data['latents'].shape[0]} latents, shape {data['latents'].shape[1:]}")
+         return cache_path
+
+     os.makedirs(os.path.dirname(cache_path) if os.path.dirname(cache_path) else ".", exist_ok=True)
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+     # Load VAE temporarily
+     print("Loading VAE for encoding...")
+     from diffusers import AutoencoderKL
+     vae = AutoencoderKL.from_pretrained(
+         config.vae_id, subfolder=config.vae_subfolder, torch_dtype=torch.float16
+     ).to(device).eval()
+     for p in vae.parameters():
+         p.requires_grad_(False)
+
+     # Load dataset
+     preset = DATASET_PRESETS[config.dataset_preset]
+     print(f"Loading dataset: {preset['name']} ({preset['description']})")
+
+     from datasets import load_dataset
+     from torchvision import transforms
+
+     is_streaming = preset.get("streaming", False)
+     ds_kwargs = {"split": "train"}
+     if preset["config"]:
+         ds_kwargs["name"] = preset["config"]
+     if is_streaming:
+         ds_kwargs["streaming"] = True
+
+     dataset = load_dataset(preset["name"], **ds_kwargs)
+
+     transform = transforms.Compose([
+         transforms.Resize(config.image_size, interpolation=transforms.InterpolationMode.LANCZOS),
+         transforms.CenterCrop(config.image_size),
+         transforms.ToTensor(),
+     ])
+
+     all_latents = []
+     all_labels = []
+     batch_pixels = []
+     batch_labels = []
+     encode_bs = 16
+     count = 0
+     max_imgs = config.max_images if config.max_images > 0 else float("inf")
+     img_col = preset["image_column"]
+     lbl_col = preset["label_column"]
+
+     print("Encoding images to latents...")
+     t0 = time.time()
+
+     for item in dataset:
+         if count >= max_imgs:
+             break
+         img = item[img_col]
          if img.mode != "RGB":
              img = img.convert("RGB")
+         batch_pixels.append(transform(img))
+         if lbl_col and lbl_col in item:
+             batch_labels.append(item[lbl_col])
+         else:
+             batch_labels.append(-1)
+         count += 1
+
+         if len(batch_pixels) >= encode_bs:
+             with torch.no_grad():
+                 px = torch.stack(batch_pixels).to(device, dtype=torch.float16) * 2 - 1
+                 lat = vae.encode(px).latent_dist.sample()
+                 lat = (lat - config.vae_shift_factor) * config.vae_scaling_factor
+                 all_latents.append(lat.cpu().float())
+             all_labels.extend(batch_labels)
+             batch_pixels, batch_labels = [], []
+             if count % 500 == 0:
+                 print(f"  {count} images encoded ({time.time()-t0:.0f}s)")
+
+     if batch_pixels:
+         with torch.no_grad():
+             px = torch.stack(batch_pixels).to(device, dtype=torch.float16) * 2 - 1
+             lat = vae.encode(px).latent_dist.sample()
+             lat = (lat - config.vae_shift_factor) * config.vae_scaling_factor
+             all_latents.append(lat.cpu().float())
+         all_labels.extend(batch_labels)
+
+     all_latents = torch.cat(all_latents, dim=0)
+     all_labels = torch.tensor(all_labels, dtype=torch.long)
+     torch.save({"latents": all_latents, "labels": all_labels}, cache_path)
+
+     elapsed = time.time() - t0
+     mb = os.path.getsize(cache_path) / 1024**2
+     print(f"\n✅ Cached {count} latents → {cache_path}")
+     print(f"  Shape: {all_latents.shape}, Size: {mb:.1f}MB, Time: {elapsed:.0f}s")
+
+     del vae
+     if torch.cuda.is_available():
+         torch.cuda.empty_cache()
+     print("  VAE unloaded, VRAM freed\n")
+     return cache_path


  # =============================================================================
+ # EMA, FlowMatching, Scheduler
  # =============================================================================

  class EMAModel:
+     def __init__(self, model, decay=0.9999):
          self.decay = decay
+         self.shadow = {n: p.clone().detach() for n, p in model.named_parameters() if p.requires_grad}
+
      @torch.no_grad()
+     def update(self, model):
+         for n, p in model.named_parameters():
+             if p.requires_grad and n in self.shadow:
+                 self.shadow[n].mul_(self.decay).add_(p.data, alpha=1 - self.decay)
+
+     def apply(self, model):
+         self.backup = {n: p.data.clone() for n, p in model.named_parameters() if p.requires_grad}
+         for n, p in model.named_parameters():
+             if p.requires_grad and n in self.shadow:
+                 p.data.copy_(self.shadow[n])
+
+     def restore(self, model):
+         for n, p in model.named_parameters():
+             if p.requires_grad and n in self.backup:
+                 p.data.copy_(self.backup[n])
          self.backup = {}


  class FlowMatchingScheduler:
      def __init__(self, min_t=0.001, max_t=0.999):
          self.min_t, self.max_t = min_t, max_t
+
+     def sample_timesteps(self, bs, dev):
+         return torch.rand(bs, device=dev) * (self.max_t - self.min_t) + self.min_t
+
      def add_noise(self, x0, noise, t):
+         t = t.view(-1, 1, 1, 1)
+         return (1 - t) * x0 + t * noise
+
      def get_velocity_target(self, x0, noise):
          return noise - x0
+
      @torch.no_grad()
+     def sample(self, model, shape, dev, num_steps=50, labels=None, cfg=1.0):
+         model.eval()
+         x = torch.randn(shape, device=dev)
          dt = 1.0 / num_steps
+         for tv in torch.linspace(1.0, dt, num_steps, device=dev):
+             t = torch.full((shape[0],), tv.item(), device=dev)
+             with torch.amp.autocast("cuda"):
+                 if cfg > 1.0 and labels is not None:
+                     vc = model(x, t, labels)
+                     vu = model(x, t, torch.zeros_like(labels))
+                     v = vu + cfg * (vc - vu)
+                 else:
+                     v = model(x, t, labels)
+             x = x - dt * v.float()
          return x


+ def cosine_schedule(opt, warmup, total):
+     def lr(s):
+         if s < warmup:
+             return s / max(1, warmup)
+         return max(0, 0.5 * (1 + math.cos(math.pi * (s - warmup) / max(1, total - warmup))))
+     return torch.optim.lr_scheduler.LambdaLR(opt, lr)


  # =============================================================================
  # Main Training Loop
  # =============================================================================

+ def train(config):
      from model import LiquidGen
+
      torch.manual_seed(config.seed)
      device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
      print(f"Device: {device}")
+     if torch.cuda.is_available():
+         print(f"GPU: {torch.cuda.get_device_name(0)} "
+               f"({torch.cuda.get_device_properties(0).total_memory/1024**3:.1f} GB)")
+
      os.makedirs(config.output_dir, exist_ok=True)
+     os.makedirs(f"{config.output_dir}/samples", exist_ok=True)
+     os.makedirs(f"{config.output_dir}/checkpoints", exist_ok=True)
+
+     with open(f"{config.output_dir}/config.json", "w") as f:
          json.dump(asdict(config), f, indent=2)
+
+     # Step 1: Pre-cache latents
+     cache_path = precache_latents(config)
+
+     # Step 2: Dataset from cache
+     train_ds = CachedLatentDataset(cache_path)
+     train_dl = DataLoader(train_ds, batch_size=config.batch_size, shuffle=True,
+                           num_workers=config.num_workers, pin_memory=True, drop_last=True)
+
+     # Step 3: Model
+     mcfg = get_model_config(config.model_size, config.num_classes, config.class_drop_prob)
+     model = LiquidGen(**mcfg).to(device)
+     print(f"LiquidGen-{config.model_size}: {model.count_params()/1e6:.1f}M params")
+
      if config.compile_model and hasattr(torch, "compile"):
          model = torch.compile(model)
+
+     # Step 4: Training setup
+     opt = torch.optim.AdamW(model.parameters(), lr=config.learning_rate,
+                             weight_decay=config.weight_decay, betas=(0.9, 0.999))
+     total_steps = len(train_dl) * config.num_epochs // config.gradient_accumulation_steps
+     sched = cosine_schedule(opt, config.warmup_steps, total_steps)
+     ema = EMAModel(model, config.ema_decay)
+     scaler = GradScaler("cuda", enabled=config.mixed_precision and torch.cuda.is_available())
+     fm = FlowMatchingScheduler(config.min_timestep, config.max_timestep)
+     lat_size = config.image_size // 8
+
+     print(f"\nTotal steps: {total_steps}, Batch: {config.batch_size}×{config.gradient_accumulation_steps}")
+     print("No VAE during training → max VRAM for model")
+     if torch.cuda.is_available():
+         print(f"VRAM: {torch.cuda.memory_allocated()/1024**3:.1f} / "
+               f"{torch.cuda.get_device_properties(0).total_memory/1024**3:.1f} GB")
+
+     # Step 5: Train!
+     gs = 0
+     la = 0.0
+     vae = None
+     vae_loaded = False
+     print(f"\n{'='*60}\n🚀 Training!\n{'='*60}\n")
      t_start = time.time()
+
+     for epoch in range(config.num_epochs):
+         model.train()
+         et = time.time()
+         for bi, (lats, lbls) in enumerate(train_dl):
+             lats = lats.to(device)
+             lbls = lbls.to(device) if config.num_classes > 0 else None
+
+             t = fm.sample_timesteps(lats.shape[0], device)
+             noise = torch.randn_like(lats)
+             xt = fm.add_noise(lats, noise, t)
+             vtgt = fm.get_velocity_target(lats, noise)
+
+             with autocast("cuda", enabled=config.mixed_precision and torch.cuda.is_available()):
+                 vp = model(xt, t, lbls)
+                 loss = F.mse_loss(vp, vtgt) / config.gradient_accumulation_steps
+
              scaler.scale(loss).backward()
+             la += loss.item()
+
+             if (bi + 1) % config.gradient_accumulation_steps == 0:
+                 scaler.unscale_(opt)
+                 gn = torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
+                 scaler.step(opt)
+                 scaler.update()
+                 opt.zero_grad()
+                 sched.step()
+                 ema.update(model)
+                 gs += 1
+
+                 if gs % config.log_every_n_steps == 0:
+                     al = la / config.log_every_n_steps
+                     lr = opt.param_groups[0]["lr"]
+                     vram = torch.cuda.memory_allocated()/1024**3 if torch.cuda.is_available() else 0
+                     sps = gs / max(time.time() - t_start, 1)
+                     print(f"step={gs:>6d} | ep={epoch} | loss={al:.4f} | gn={gn:.2f} | "
+                           f"lr={lr:.2e} | vram={vram:.1f}G | {sps:.1f} st/s")
+                     la = 0.0
+                     if math.isnan(al) or al > 50:
+                         print("💥 Diverged!")
+                         return
+
+                 if gs % config.sample_every_n_steps == 0:
+                     if not vae_loaded:
+                         from diffusers import AutoencoderKL
+                         vae = AutoencoderKL.from_pretrained(
+                             config.vae_id, subfolder=config.vae_subfolder,
+                             torch_dtype=torch.float16).to(device).eval()
+                         for p in vae.parameters():
+                             p.requires_grad_(False)
+                         vae_loaded = True
+                     ema.apply(model)
+                     model.eval()
+                     sl = torch.randint(0, max(1, config.num_classes), (config.num_samples,),
+                                        device=device) if config.num_classes > 0 else None
+                     samp = fm.sample(model, (config.num_samples, 16, lat_size, lat_size),
+                                      device, config.num_sample_steps, sl, config.cfg_scale)
+                     with torch.no_grad():
+                         dec = samp.half() / config.vae_scaling_factor + config.vae_shift_factor
+                         imgs = ((vae.decode(dec).sample + 1) / 2).clamp(0, 1).float()
                      from torchvision.utils import save_image
+                     sp = f"{config.output_dir}/samples/step_{gs:07d}.png"
+                     save_image(imgs, sp, nrow=2)
+                     print(f"  📸 {sp}")
+                     ema.restore(model)
+                     model.train()
+
+                 if gs % config.save_every_n_steps == 0:
+                     cp = f"{config.output_dir}/checkpoints/step_{gs:07d}.pt"
+                     torch.save({"model": model.state_dict(), "ema": ema.shadow,
+                                 "optimizer": opt.state_dict(), "scheduler": sched.state_dict(),
+                                 "step": gs, "epoch": epoch, "model_config": mcfg}, cp)
+                     print(f"  💾 {cp}")
+
+         print(f"Epoch {epoch} | {time.time()-et:.0f}s\n")
+
+     final = f"{config.output_dir}/checkpoints/final.pt"
+     torch.save({"model": model.state_dict(), "ema": ema.shadow,
+                 "model_config": mcfg, "step": gs}, final)
+     print(f"\n🎉 Done! {gs} steps, {(time.time()-t_start)/60:.1f}min → {final}")


  if __name__ == "__main__":
      config = TrainConfig(
+         model_size="small", dataset_preset="paintings_mini",
+         image_size=256, batch_size=8, num_epochs=5,
+         log_every_n_steps=5, sample_every_n_steps=99999,
      )
      train(config)
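
For a quick smoke test of the v2 pipeline after this commit, something like the following should work (a sketch, assuming the file is importable as the train module, model.py exposes LiquidGen, and the paintings_mini preset is still reachable on the Hub):

from train import TrainConfig, precache_latents, train

config = TrainConfig(
    model_size="small",
    dataset_preset="paintings_mini",  # ~200 images, so caching takes seconds
    image_size=256,
    batch_size=8,
    num_epochs=5,
)
# precache_latents(config) runs automatically inside train(); reruns reuse
# outputs/cached_latents.pt instead of re-encoding the dataset.
train(config)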