AbstractPhil
/

tiny-flux

@@ -1,9 +1,15 @@
 # ============================================================================
-# TinyFlux Training Cell - Full Featured
 # ============================================================================
-# Run the model cell before this one (defines TinyFlux, TinyFluxConfig)
-# Dataset: AbstractPhil/flux-schnell-teacher-latents
-# Uploads checkpoints to: AbstractPhil/tiny-flux
 # ============================================================================
 import torch
@@ -21,11 +27,24 @@ import os
 import json
 from datetime import datetime
 # ============================================================================
 # CONFIG
 # ============================================================================
-BATCH_SIZE = 4
-GRAD_ACCUM = 2
 LR = 1e-4
 EPOCHS = 10
 MAX_SEQ = 128
@@ -36,26 +55,16 @@ DTYPE = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
 # HuggingFace Hub
 HF_REPO = "AbstractPhil/tiny-flux"
-SAVE_EVERY = 1000      # steps - local save
-UPLOAD_EVERY = 1000    # steps - hub upload
-SAMPLE_EVERY = 500    # steps - generate samples
-LOG_EVERY = 10        # steps - tensorboard
-# Checkpoint loading target
-# Options:
-#   None or "latest" - load most recent checkpoint
-#   "best" - load best model
-#   int (e.g. 1500) - load specific step
-#   "hub:step_1000" - load specific checkpoint from hub
-#   "local:path/to/checkpoint.safetensors" or "local:path/to/checkpoint.pt"
-#   "none" - start fresh, ignore existing checkpoints
-LOAD_TARGET = "latest"
-# Manual resume step (set to override step from checkpoint, or None to use checkpoint's step)
-# Useful when checkpoint doesn't contain step info
-RESUME_STEP = None  # e.g., 5000 to resume from step 5000
-# Local paths
 CHECKPOINT_DIR = "./tiny_flux_checkpoints"
 LOG_DIR = "./tiny_flux_logs"
 SAMPLE_DIR = "./tiny_flux_samples"
@@ -69,7 +78,6 @@ os.makedirs(SAMPLE_DIR, exist_ok=True)
 # ============================================================================
 print("Setting up HuggingFace Hub...")
 api = HfApi()
 try:
     api.create_repo(repo_id=HF_REPO, exist_ok=True, repo_type="model")
     print(f"✓ Repo ready: {HF_REPO}")
@@ -87,7 +95,7 @@ print(f"✓ Tensorboard: {LOG_DIR}/{run_name}")
 # LOAD DATASET
 # ============================================================================
 print("\nLoading dataset...")
-ds = load_dataset("AbstractPhil/flux-schnell-teacher-latents", split="train")
 print(f"Samples: {len(ds)}")
 # ============================================================================
@@ -109,104 +117,151 @@ for p in clip_enc.parameters(): p.requires_grad = False
 # ============================================================================
 print("Loading Flux VAE for samples...")
 from diffusers import AutoencoderKL
 vae = AutoencoderKL.from_pretrained(
-    "black-forest-labs/FLUX.1-schnell",
     subfolder="vae",
     torch_dtype=DTYPE
 ).to(DEVICE).eval()
 for p in vae.parameters(): p.requires_grad = False
 # ============================================================================
-# ENCODING HELPERS
-# ============================================================================
-@torch.no_grad()
-def encode_prompt(prompt):
-    t5_in = t5_tok(prompt, max_length=MAX_SEQ, padding="max_length", truncation=True, return_tensors="pt").to(DEVICE)
-    t5_out = t5_enc(input_ids=t5_in.input_ids, attention_mask=t5_in.attention_mask).last_hidden_state
-    clip_in = clip_tok(prompt, max_length=77, padding="max_length", truncation=True, return_tensors="pt").to(DEVICE)
-    clip_out = clip_enc(input_ids=clip_in.input_ids, attention_mask=clip_in.attention_mask)
     return t5_out, clip_out.pooler_output
 # ============================================================================
-# FLOW MATCHING HELPERS
-# ============================================================================
-# Rectified Flow / Flow Matching formulation:
-#   x_t = (1-t) * x_0 + t * x_1
-#   where x_0 = noise, x_1 = data
-#   t=0: pure noise, t=1: pure data
-#   velocity v = x_1 - x_0 = data - noise
-#
-# Training: model learns to predict v given (x_t, t)
-# Inference: start from noise (t=0), integrate to data (t=1)
-#   x_{t+dt} = x_t + v_pred * dt
 # ============================================================================
-def flux_shift(t, s=SHIFT):
-    """Flux timestep shift for training distribution.
-    Shifts timesteps towards higher values (closer to data),
-    making training focus more on refining details.
-    s=3.0 (default): flux_shift(0.5) ≈ 0.75
-    """
     return s * t / (1 + (s - 1) * t)
-def flux_shift_inverse(t_shifted, s=SHIFT):
-    """Inverse of flux_shift."""
-    return t_shifted / (s - (s - 1) * t_shifted)
 def min_snr_weight(t, gamma=MIN_SNR):
-    """Min-SNR weighting to balance loss across timesteps.
-    Downweights very easy timesteps (near t=0 or t=1).
-    gamma=5.0 is typical.
-    """
     snr = (t / (1 - t).clamp(min=1e-5)).pow(2)
     return torch.clamp(snr, max=gamma) / snr.clamp(min=1e-5)
 # ============================================================================
-# SAMPLING FUNCTION
 # ============================================================================
-@torch.no_grad()
 def generate_samples(model, prompts, num_steps=20, guidance_scale=3.5, H=64, W=64):
-    """Generate sample images using Euler sampling.
-    Flow matching: x_t = (1-t)*noise + t*data, v = data - noise
-    At t=0: pure noise. At t=1: pure data.
-    We integrate from t=0 to t=1.
-    """
     model.eval()
     B = len(prompts)
-    C = 16  # VAE channels
-    # Encode prompts
-    t5_embeds, clip_pooleds = [], []
-    for p in prompts:
-        t5_out, clip_pooled = encode_prompt(p)
-        t5_embeds.append(t5_out.squeeze(0))
-        clip_pooleds.append(clip_pooled.squeeze(0))
-    t5_embeds = torch.stack(t5_embeds)
-    clip_pooleds = torch.stack(clip_pooleds)
-    # Start from pure noise (t=0)
     x = torch.randn(B, H * W, C, device=DEVICE, dtype=DTYPE)
-    # Create image IDs
     img_ids = TinyFlux.create_img_ids(B, H, W, DEVICE)
-    # Euler sampling: t goes from 0 (noise) to 1 (data)
-    timesteps = torch.linspace(0, 1, num_steps + 1, device=DEVICE, dtype=DTYPE)
     for i in range(num_steps):
         t_curr = timesteps[i]
         t_next = timesteps[i + 1]
-        dt = t_next - t_curr  # positive
-        t_batch = t_curr.expand(B)
-        # Conditional prediction
         guidance = torch.full((B,), guidance_scale, device=DEVICE, dtype=DTYPE)
         v_cond = model(
             hidden_states=x,
             encoder_hidden_states=t5_embeds,
@@ -216,13 +271,10 @@ def generate_samples(model, prompts, num_steps=20, guidance_scale=3.5, H=64, W=6
             guidance=guidance,
         )
-        # Euler step: x_{t+dt} = x_t + v * dt
         x = x + v_cond * dt
-    # Reshape to image format: (B, H*W, C) -> (B, C, H, W)
     latents = x.reshape(B, H, W, C).permute(0, 3, 1, 2)
-    # Decode with VAE (match VAE dtype)
     latents = latents / vae.config.scaling_factor
     images = vae.decode(latents.to(vae.dtype)).sample
     images = (images / 2 + 0.5).clamp(0, 1)
@@ -230,94 +282,133 @@ def generate_samples(model, prompts, num_steps=20, guidance_scale=3.5, H=64, W=6
     model.train()
     return images
 def save_samples(images, prompts, step, save_dir):
-    """Save sample images and log to tensorboard."""
     from torchvision.utils import make_grid, save_image
-    # Save individual images
     for i, (img, prompt) in enumerate(zip(images, prompts)):
         safe_prompt = prompt[:50].replace(" ", "_").replace("/", "-")
         path = os.path.join(save_dir, f"step{step}_{i}_{safe_prompt}.png")
         save_image(img, path)
-    # Log grid to tensorboard
     grid = make_grid(images, nrow=2, normalize=False)
     writer.add_image("samples", grid, step)
-    # Log prompts
     writer.add_text("sample_prompts", "\n".join(prompts), step)
     print(f"  ✓ Saved {len(images)} samples")
 # ============================================================================
-# COLLATE
 # ============================================================================
-def collate(batch):
-    latents, t5_embeds, clip_embeds, prompts = [], [], [], []
-    for b in batch:
-        latents.append(torch.tensor(np.array(b["latent"]), dtype=DTYPE))
-        t5_out, clip_pooled = encode_prompt(b["prompt"])
-        t5_embeds.append(t5_out.squeeze(0))
-        clip_embeds.append(clip_pooled.squeeze(0))
-        prompts.append(b["prompt"])
     return {
-        "latents": torch.stack(latents).to(DEVICE),
-        "t5_embeds": torch.stack(t5_embeds),
-        "clip_pooled": torch.stack(clip_embeds),
-        "prompts": prompts,
     }
 # ============================================================================
 # CHECKPOINT FUNCTIONS
 # ============================================================================
 def load_weights(path):
-    """Load weights from .safetensors or .pt file."""
     if path.endswith(".safetensors"):
-        return load_file(path)
     elif path.endswith(".pt"):
         ckpt = torch.load(path, map_location=DEVICE, weights_only=False)
         if isinstance(ckpt, dict):
-            if "model" in ckpt:
-                return ckpt["model"]
-            elif "state_dict" in ckpt:
-                return ckpt["state_dict"]
-            else:
-                # Check if it looks like a state dict (has tensor values)
-                first_val = next(iter(ckpt.values()), None)
-                if isinstance(first_val, torch.Tensor):
-                    return ckpt
-                # Otherwise might have optimizer etc, look for model keys
-                return ckpt
-        return ckpt
     else:
-        # Try safetensors first, then pt
         try:
-            return load_file(path)
         except:
-            return torch.load(path, map_location=DEVICE, weights_only=False)
 def save_checkpoint(model, optimizer, scheduler, step, epoch, loss, path):
-    """Save checkpoint locally."""
     os.makedirs(os.path.dirname(path) if os.path.dirname(path) else ".", exist_ok=True)
     weights_path = path.replace(".pt", ".safetensors")
-    save_file(model.state_dict(), weights_path)
-    state = {
         "step": step,
         "epoch": epoch,
         "loss": loss,
         "optimizer": optimizer.state_dict(),
         "scheduler": scheduler.state_dict(),
-    }
-    torch.save(state, path)
     print(f"  ✓ Saved checkpoint: step {step}")
     return weights_path
-def upload_checkpoint(weights_path, step, config, include_logs=True):
-    """Upload checkpoint to HuggingFace Hub."""
     try:
-        # Upload weights
         api.upload_file(
             path_or_fileobj=weights_path,
             path_in_repo=f"checkpoints/step_{step}.safetensors",
@@ -325,286 +416,86 @@ def upload_checkpoint(weights_path, step, config, include_logs=True):
             commit_message=f"Checkpoint step {step}",
         )
-        # Upload config
         config_path = os.path.join(CHECKPOINT_DIR, "config.json")
         with open(config_path, "w") as f:
             json.dump(config.__dict__, f, indent=2)
-        api.upload_file(
-            path_or_fileobj=config_path,
-            path_in_repo="config.json",
-            repo_id=HF_REPO,
-        )
-        # Upload tensorboard logs
-        if include_logs and os.path.exists(LOG_DIR):
-            api.upload_folder(
-                folder_path=LOG_DIR,
-                path_in_repo="logs",
-                repo_id=HF_REPO,
-                commit_message=f"Logs at step {step}",
-            )
-        # Upload samples
-        if os.path.exists(SAMPLE_DIR) and os.listdir(SAMPLE_DIR):
-            api.upload_folder(
-                folder_path=SAMPLE_DIR,
-                path_in_repo="samples",
-                repo_id=HF_REPO,
-                commit_message=f"Samples at step {step}",
-            )
-        print(f"  ✓ Uploaded to {HF_REPO}")
     except Exception as e:
         print(f"  ⚠ Upload failed: {e}")
 def load_checkpoint(model, optimizer, scheduler, target):
-    """
-    Load checkpoint based on target specification.
-    Args:
-        target:
-            None, "latest" - most recent checkpoint
-            "best" - best model
-            int (1500) - specific step
-            "hub:step_1000" - specific hub checkpoint
-            "local:/path/to/file.safetensors" or "local:/path/to/file.pt" - specific local file
-            "none" - skip loading, start fresh
-    """
-    if target == "none":
-        print("Starting fresh (no checkpoint loading)")
         return 0, 0
-    start_step, start_epoch = 0, 0
-    # Parse target
-    if target is None or target == "latest":
-        load_mode = "latest"
-        load_path = None
-    elif target == "best":
-        load_mode = "best"
-        load_path = None
-    elif isinstance(target, int):
-        load_mode = "step"
-        load_path = target
-    elif target.startswith("hub:"):
-        load_mode = "hub"
-        load_path = target[4:]  # Remove "hub:" prefix
-    elif target.startswith("local:"):
-        load_mode = "local"
-        load_path = target[6:]  # Remove "local:" prefix
-    else:
-        print(f"Unknown target format: {target}, trying as step number")
         try:
-            load_mode = "step"
-            load_path = int(target)
-        except:
-            load_mode = "latest"
-            load_path = None
-    # Load based on mode
-    if load_mode == "local":
-        # Direct local file (.pt or .safetensors)
-        if os.path.exists(load_path):
-            weights = load_weights(load_path)
-            model.load_state_dict(weights)
-            # Try to find associated state file for optimizer/scheduler
-            if load_path.endswith(".safetensors"):
-                state_path = load_path.replace(".safetensors", ".pt")
-            elif load_path.endswith(".pt"):
-                # The .pt file might contain everything
-                ckpt = torch.load(load_path, map_location=DEVICE, weights_only=False)
-                if isinstance(ckpt, dict):
-                    # Debug: show what keys are in the checkpoint
-                    non_tensor_keys = [k for k in ckpt.keys() if not isinstance(ckpt.get(k), torch.Tensor)]
-                    if non_tensor_keys:
-                        print(f"  Checkpoint keys: {non_tensor_keys}")
-                    # Extract step/epoch - try multiple common key names
-                    start_step = ckpt.get("step", ckpt.get("global_step", ckpt.get("iteration", 0)))
-                    start_epoch = ckpt.get("epoch", 0)
-                    # Also check for nested state dict
-                    if "state" in ckpt and isinstance(ckpt["state"], dict):
-                        start_step = ckpt["state"].get("step", start_step)
-                        start_epoch = ckpt["state"].get("epoch", start_epoch)
-                    # Try to load optimizer/scheduler if present
-                    if "optimizer" in ckpt:
-                        try:
-                            optimizer.load_state_dict(ckpt["optimizer"])
-                            if "scheduler" in ckpt:
-                                scheduler.load_state_dict(ckpt["scheduler"])
-                        except Exception as e:
-                            print(f"  Note: Could not load optimizer state: {e}")
-                state_path = None
             else:
-                state_path = load_path + ".pt"
-            if state_path and os.path.exists(state_path):
-                state = torch.load(state_path, map_location=DEVICE, weights_only=False)
                 try:
-                    start_step = state.get("step", start_step)
-                    start_epoch = state.get("epoch", start_epoch)
-                    if "optimizer" in state:
-                        optimizer.load_state_dict(state["optimizer"])
-                    if "scheduler" in state:
-                        scheduler.load_state_dict(state["scheduler"])
-                except Exception as e:
-                    print(f"  Note: Could not load optimizer state: {e}")
-            print(f"✓ Loaded local: {load_path} (step {start_step})")
-            return start_step, start_epoch
-        else:
-            print(f"⚠ Local file not found: {load_path}")
-    elif load_mode == "hub":
-        # Specific hub checkpoint - try both extensions
-        for ext in [".safetensors", ".pt", ""]:
-            try:
-                if load_path.endswith((".safetensors", ".pt")):
-                    filename = load_path if "/" in load_path else f"checkpoints/{load_path}"
-                else:
-                    filename = f"checkpoints/{load_path}{ext}"
-                local_path = hf_hub_download(repo_id=HF_REPO, filename=filename)
-                weights = load_weights(local_path)
-                model.load_state_dict(weights)
-                # Extract step from filename
-                if "step_" in load_path:
-                    start_step = int(load_path.split("step_")[-1].replace(".safetensors", "").replace(".pt", ""))
-                print(f"✓ Loaded from Hub: {filename} (step {start_step})")
-                return start_step, start_epoch
-            except Exception as e:
-                continue
-        print(f"⚠ Could not load from hub: {load_path}")
-    elif load_mode == "best":
-        # Try hub best first (try both extensions)
-        for ext in [".safetensors", ".pt"]:
-            try:
-                filename = f"model{ext}" if ext else "model.safetensors"
-                local_path = hf_hub_download(repo_id=HF_REPO, filename=filename)
-                weights = load_weights(local_path)
-                model.load_state_dict(weights)
-                print(f"✓ Loaded best model from Hub")
-                return start_step, start_epoch
-            except:
-                continue
-        # Try local best (both extensions)
-        for ext in [".safetensors", ".pt"]:
-            best_path = os.path.join(CHECKPOINT_DIR, f"best{ext}")
-            if os.path.exists(best_path):
-                weights = load_weights(best_path)
-                model.load_state_dict(weights)
-                # Try to load optimizer state
-                state_path = best_path.replace(ext, ".pt") if ext == ".safetensors" else best_path
-                if os.path.exists(state_path):
-                    state = torch.load(state_path, map_location=DEVICE, weights_only=False)
-                    if isinstance(state, dict) and "step" in state:
-                        start_step = state.get("step", 0)
-                        start_epoch = state.get("epoch", 0)
-                print(f"✓ Loaded local best (step {start_step})")
-                return start_step, start_epoch
-    elif load_mode == "step":
-        # Specific step number
-        step_num = load_path
-        # Try hub (both extensions)
-        for ext in [".safetensors", ".pt"]:
-            try:
-                filename = f"checkpoints/step_{step_num}{ext}"
-                local_path = hf_hub_download(repo_id=HF_REPO, filename=filename)
-                weights = load_weights(local_path)
-                model.load_state_dict(weights)
-                start_step = step_num
-                print(f"✓ Loaded step {step_num} from Hub")
-                return start_step, start_epoch
-            except:
-                continue
-        # Try local (both extensions)
-        for ext in [".safetensors", ".pt"]:
-            local_path = os.path.join(CHECKPOINT_DIR, f"step_{step_num}{ext}")
-            if os.path.exists(local_path):
-                weights = load_weights(local_path)
-                model.load_state_dict(weights)
-                state_path = local_path.replace(".safetensors", ".pt") if ext == ".safetensors" else local_path
-                if os.path.exists(state_path):
-                    state = torch.load(state_path, map_location=DEVICE, weights_only=False)
-                    if isinstance(state, dict):
-                        try:
-                            if "optimizer" in state:
-                                optimizer.load_state_dict(state["optimizer"])
-                            if "scheduler" in state:
-                                scheduler.load_state_dict(state["scheduler"])
-                            start_epoch = state.get("epoch", 0)
-                        except:
-                            pass
-                start_step = step_num
-                print(f"✓ Loaded local step {step_num}")
-                return start_step, start_epoch
-        print(f"⚠ Step {step_num} not found")
-    # Default: latest
-    # Try Hub first (both extensions)
-    try:
-        files = api.list_repo_files(repo_id=HF_REPO)
-        checkpoints = [f for f in files if f.startswith("checkpoints/step_") and (f.endswith(".safetensors") or f.endswith(".pt"))]
-        if checkpoints:
-            # Sort by step number
-            def get_step(f):
-                return int(f.split("step_")[-1].replace(".safetensors", "").replace(".pt", ""))
-            checkpoints.sort(key=get_step)
-            latest = checkpoints[-1]
-            step = get_step(latest)
-            local_path = hf_hub_download(repo_id=HF_REPO, filename=latest)
-            weights = load_weights(local_path)
-            model.load_state_dict(weights)
-            start_step = step
-            print(f"✓ Loaded latest from Hub: step {step}")
-            return start_step, start_epoch
-    except Exception as e:
-        print(f"Hub check: {e}")
-    # Try local (both extensions)
-    if os.path.exists(CHECKPOINT_DIR):
-        local_ckpts = [f for f in os.listdir(CHECKPOINT_DIR) if f.startswith("step_") and (f.endswith(".safetensors") or f.endswith(".pt"))]
-        # Filter to just weights files (not state .pt files that pair with .safetensors)
-        local_ckpts = [f for f in local_ckpts if not (f.endswith(".pt") and f.replace(".pt", ".safetensors") in local_ckpts)]
-        if local_ckpts:
-            def get_step(f):
-                return int(f.split("step_")[-1].replace(".safetensors", "").replace(".pt", ""))
-            local_ckpts.sort(key=get_step)
-            latest = local_ckpts[-1]
-            step = get_step(latest)
-            weights_path = os.path.join(CHECKPOINT_DIR, latest)
             weights = load_weights(weights_path)
-            model.load_state_dict(weights)
-            # Try to load optimizer state
-            state_path = weights_path.replace(".safetensors", ".pt") if weights_path.endswith(".safetensors") else weights_path
-            if os.path.exists(state_path):
-                state = torch.load(state_path, map_location=DEVICE, weights_only=False)
-                if isinstance(state, dict):
-                    try:
-                        if "optimizer" in state:
-                            optimizer.load_state_dict(state["optimizer"])
-                        if "scheduler" in state:
-                            scheduler.load_state_dict(state["scheduler"])
-                        start_epoch = state.get("epoch", 0)
-                    except:
-                        pass
-            start_step = step
-            print(f"✓ Loaded latest local: step {step}")
             return start_step, start_epoch
     print("No checkpoint found, starting fresh")
     return 0, 0
 # ============================================================================
-# DATALOADER
 # ============================================================================
-loader = DataLoader(ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate, num_workers=0)
 # ============================================================================
 # MODEL
@@ -612,33 +503,46 @@ loader = DataLoader(ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate,
 config = TinyFluxConfig()
 model = TinyFlux(config).to(DEVICE).to(DTYPE)
 print(f"\nParams: {sum(p.numel() for p in model.parameters()):,}")
-model = torch.compile(model, mode="default")
 # ============================================================================
-# OPTIMIZER & SCHEDULER
 # ============================================================================
-opt = torch.optim.AdamW(model.parameters(), lr=LR, betas=(0.9, 0.99), weight_decay=0.01)
 total_steps = len(loader) * EPOCHS // GRAD_ACCUM
 warmup = min(500, total_steps // 10)
 def lr_fn(step):
-    if step < warmup: return step / warmup
     return 0.5 * (1 + math.cos(math.pi * (step - warmup) / (total_steps - warmup)))
 sched = torch.optim.lr_scheduler.LambdaLR(opt, lr_fn)
 # ============================================================================
-# LOAD CHECKPOINT
 # ============================================================================
 print(f"\nLoad target: {LOAD_TARGET}")
 start_step, start_epoch = load_checkpoint(model, opt, sched, LOAD_TARGET)
-# Override start_step if RESUME_STEP is set
 if RESUME_STEP is not None:
     print(f"Overriding start_step: {start_step} -> {RESUME_STEP}")
     start_step = RESUME_STEP
-# Log config to tensorboard
 writer.add_text("config", json.dumps(config.__dict__, indent=2), 0)
 writer.add_text("training_config", json.dumps({
     "batch_size": BATCH_SIZE,
@@ -647,11 +551,10 @@ writer.add_text("training_config", json.dumps({
     "epochs": EPOCHS,
     "min_snr": MIN_SNR,
     "shift": SHIFT,
 }, indent=2), 0)
-# ============================================================================
-# SAMPLE PROMPTS FOR PERIODIC GENERATION
-# ============================================================================
 SAMPLE_PROMPTS = [
     "a photo of a cat sitting on a windowsill",
     "a beautiful sunset over mountains",
@@ -660,67 +563,55 @@ SAMPLE_PROMPTS = [
 ]
 # ============================================================================
-# TRAINING
 # ============================================================================
 print(f"\nTraining {EPOCHS} epochs, {total_steps} total steps")
 print(f"Resuming from step {start_step}, epoch {start_epoch}")
 print(f"Save: {SAVE_EVERY}, Upload: {UPLOAD_EVERY}, Sample: {SAMPLE_EVERY}, Log: {LOG_EVERY}")
 model.train()
 step = start_step
 best = float("inf")
 for ep in range(start_epoch, EPOCHS):
     ep_loss = 0
     ep_batches = 0
-    pbar = tqdm(loader, desc=f"E{ep+1}")
     for i, batch in enumerate(pbar):
-        latents = batch["latents"]      # Ground truth data (VAE encoded images)
-        t5 = batch["t5_embeds"]
-        clip = batch["clip_pooled"]
         B, C, H, W = latents.shape
-        # ================================================================
-        # FLOW MATCHING FORMULATION
-        # ================================================================
-        # x_1 = data (what we want to generate)
-        # x_0 = noise (where we start at inference)
-        # x_t = (1-t)*x_0 + t*x_1  (linear interpolation)
-        #
-        # At t=0: x_t = x_0 (pure noise)
-        # At t=1: x_t = x_1 (pure data)
-        #
-        # Velocity field: v = dx/dt = x_1 - x_0
-        # Model learns to predict v given (x_t, t)
-        #
-        # At inference: start from noise, integrate v from t=0 to t=1
-        # ================================================================
-        # Reshape data to sequence format: (B, C, H, W) -> (B, H*W, C)
-        data = latents.permute(0, 2, 3, 1).reshape(B, H*W, C)  # x_1
-        noise = torch.randn_like(data)                         # x_0
-        # Sample timesteps with logit-normal distribution + Flux shift
-        # This biases training towards higher t (closer to data)
         t = torch.sigmoid(torch.randn(B, device=DEVICE))
-        t = flux_shift(t, s=SHIFT).to(DTYPE).clamp(1e-4, 1-1e-4)
-        # Create noisy samples via linear interpolation
         t_expanded = t.view(B, 1, 1)
-        x_t = (1 - t_expanded) * noise + t_expanded * data  # Noisy sample at time t
-        # Target velocity: direction from noise to data
         v_target = data - noise
-        # Create position IDs for RoPE
         img_ids = TinyFlux.create_img_ids(B, H, W, DEVICE)
-        # Random guidance scale (for CFG training)
-        guidance = torch.rand(B, device=DEVICE, dtype=DTYPE) * 4 + 1  # [1, 5]
-        # Forward pass: predict velocity
         with torch.autocast("cuda", dtype=DTYPE):
             v_pred = model(
                 hidden_states=x_t,
@@ -731,10 +622,8 @@ for ep in range(start_epoch, EPOCHS):
                 guidance=guidance,
             )
-        # Loss: MSE between predicted and target velocity
         loss_raw = F.mse_loss(v_pred, v_target, reduction="none").mean(dim=[1, 2])
-        # Min-SNR weighting: downweight easy timesteps (near t=0 or t=1)
         snr_weights = min_snr_weight(t)
         loss = (loss_raw * snr_weights).mean() / GRAD_ACCUM
         loss.backward()
@@ -743,38 +632,33 @@ for ep in range(start_epoch, EPOCHS):
             grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
             opt.step()
             sched.step()
-            opt.zero_grad()
             step += 1
-            # Tensorboard logging
             if step % LOG_EVERY == 0:
                 writer.add_scalar("train/loss", loss.item() * GRAD_ACCUM, step)
                 writer.add_scalar("train/lr", sched.get_last_lr()[0], step)
                 writer.add_scalar("train/grad_norm", grad_norm.item(), step)
                 writer.add_scalar("train/t_mean", t.mean().item(), step)
-                writer.add_scalar("train/snr_weight_mean", snr_weights.mean().item(), step)
-            # Generate samples
             if step % SAMPLE_EVERY == 0:
                 print(f"\n  Generating samples at step {step}...")
                 images = generate_samples(model, SAMPLE_PROMPTS, num_steps=20)
                 save_samples(images, SAMPLE_PROMPTS, step, SAMPLE_DIR)
-            # Save checkpoint
             if step % SAVE_EVERY == 0:
                 ckpt_path = os.path.join(CHECKPOINT_DIR, f"step_{step}.pt")
                 weights_path = save_checkpoint(model, opt, sched, step, ep, loss.item(), ckpt_path)
-                # Upload
                 if step % UPLOAD_EVERY == 0:
-                    upload_checkpoint(weights_path, step, config, include_logs=True)
         ep_loss += loss.item() * GRAD_ACCUM
         ep_batches += 1
-        pbar.set_postfix(loss=f"{loss.item()*GRAD_ACCUM:.4f}", lr=f"{sched.get_last_lr()[0]:.1e}", step=step)
     avg = ep_loss / max(ep_batches, 1)
-    print(f"Epoch {ep+1} loss: {avg:.4f}")
     writer.add_scalar("train/epoch_loss", avg, ep + 1)
     if avg < best:
@@ -787,7 +671,7 @@ for ep in range(start_epoch, EPOCHS):
                 path_or_fileobj=weights_path,
                 path_in_repo="model.safetensors",
                 repo_id=HF_REPO,
-                commit_message=f"Best model (epoch {ep+1}, loss {avg:.4f})",
             )
             print(f"  ✓ Uploaded best to {HF_REPO}")
         except Exception as e:
@@ -800,20 +684,16 @@ print("\nSaving final model...")
 final_path = os.path.join(CHECKPOINT_DIR, "final.pt")
 weights_path = save_checkpoint(model, opt, sched, step, EPOCHS, best, final_path)
-# Final samples
 print("Generating final samples...")
 images = generate_samples(model, SAMPLE_PROMPTS, num_steps=20)
 save_samples(images, SAMPLE_PROMPTS, step, SAMPLE_DIR)
-# Final upload
 try:
     api.upload_file(path_or_fileobj=weights_path, path_in_repo="model.safetensors", repo_id=HF_REPO)
     config_path = os.path.join(CHECKPOINT_DIR, "config.json")
     with open(config_path, "w") as f:
         json.dump(config.__dict__, f, indent=2)
     api.upload_file(path_or_fileobj=config_path, path_in_repo="config.json", repo_id=HF_REPO)
-    api.upload_folder(folder_path=LOG_DIR, path_in_repo="logs", repo_id=HF_REPO)
-    api.upload_folder(folder_path=SAMPLE_DIR, path_in_repo="samples", repo_id=HF_REPO)
     print(f"\n✓ Training complete! https://huggingface.co/{HF_REPO}")
 except Exception as e:
     print(f"\n⚠ Final upload failed: {e}")

 # ============================================================================
+# TinyFlux Training Cell - OPTIMIZED
 # ============================================================================
+# Optimizations:
+#   - TF32 and cuDNN settings for faster matmuls
+#   - Fused AdamW optimizer
+#   - Pre-encoded prompts (encode once at startup, not per batch)
+#   - Batched prompt encoding
+#   - DataLoader with num_workers and pin_memory
+#   - torch.inference_mode() for sampling
+#   - Cached img_ids in model
+#   - torch.compile (mode="default"; see the COMPILE MODEL section)
 # ============================================================================
 import torch
 import json
 from datetime import datetime
+# ============================================================================
+# CUDA OPTIMIZATIONS - Set these BEFORE model creation
+# ============================================================================
+# Enable TF32 for matmuls and cuDNN (allow_tf32 backend flags plus set_float32_matmul_precision)
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32 = True
+torch.backends.cudnn.benchmark = True
+torch.set_float32_matmul_precision('high')
+# Suppress the deprecation warning (settings still work)
+import warnings
+warnings.filterwarnings('ignore', message='.*TF32.*')
 # ============================================================================
 # CONFIG
 # ============================================================================
+BATCH_SIZE = 128
+GRAD_ACCUM = 1
 LR = 1e-4
 EPOCHS = 10
 MAX_SEQ = 128
 # HuggingFace Hub
 HF_REPO = "AbstractPhil/tiny-flux"
+SAVE_EVERY = 1000
+UPLOAD_EVERY = 1000
+SAMPLE_EVERY = 500
+LOG_EVERY = 10
+# Checkpoint loading
+LOAD_TARGET = "hub:step_24000"  # "latest", "best", int, "hub:step_X", "local:path", "none"
+RESUME_STEP = None
+# Paths
 CHECKPOINT_DIR = "./tiny_flux_checkpoints"
 LOG_DIR = "./tiny_flux_logs"
 SAMPLE_DIR = "./tiny_flux_samples"
 # ============================================================================
 print("Setting up HuggingFace Hub...")
 api = HfApi()
 try:
     api.create_repo(repo_id=HF_REPO, exist_ok=True, repo_type="model")
     print(f"✓ Repo ready: {HF_REPO}")
 # LOAD DATASET
 # ============================================================================
 print("\nLoading dataset...")
+ds = load_dataset("AbstractPhil/flux-schnell-teacher-latents", "train_3_512", split="train")
 print(f"Samples: {len(ds)}")
 # ============================================================================
 # ============================================================================
 print("Loading Flux VAE for samples...")
 from diffusers import AutoencoderKL
 vae = AutoencoderKL.from_pretrained(
+    "black-forest-labs/FLUX.1-schnell",
     subfolder="vae",
     torch_dtype=DTYPE
 ).to(DEVICE).eval()
 for p in vae.parameters(): p.requires_grad = False
 # ============================================================================
+# BATCHED ENCODING - Much faster than one-by-one
+# ============================================================================
+@torch.inference_mode()
+def encode_prompts_batched(prompts: list) -> tuple:
+    """Run a whole list of prompts through the T5 and CLIP text encoders in one pass.
+    Returns ``(t5_hidden, clip_pooled)``: the T5 ``last_hidden_state`` and the
+    CLIP ``pooler_output`` for the batch."""
+    # Both tokenizers pad/truncate to a fixed length and return PyTorch tensors.
+    shared_tok_args = dict(padding="max_length", truncation=True, return_tensors="pt")
+    # T5 branch: sequences are fixed to MAX_SEQ tokens.
+    t5_tokens = t5_tok(prompts, max_length=MAX_SEQ, **shared_tok_args).to(DEVICE)
+    t5_hidden = t5_enc(
+        input_ids=t5_tokens.input_ids,
+        attention_mask=t5_tokens.attention_mask,
+    ).last_hidden_state
+    # CLIP branch: sequences are fixed to 77 tokens; only the pooled output is returned.
+    clip_tokens = clip_tok(prompts, max_length=77, **shared_tok_args).to(DEVICE)
+    clip_result = clip_enc(
+        input_ids=clip_tokens.input_ids,
+        attention_mask=clip_tokens.attention_mask,
+    )
+    return t5_hidden, clip_result.pooler_output
+@torch.inference_mode()
+def encode_prompt(prompt: str) -> tuple:
+    """Single-prompt convenience wrapper: encodes ``[prompt]`` via encode_prompts_batched."""
+    one_item_batch = [prompt]
+    return encode_prompts_batched(one_item_batch)
 # ============================================================================
+# PRE-ENCODE ALL PROMPTS (with disk caching)
 # ============================================================================
+print("\nPre-encoding prompts...")
+PRECOMPUTE_ENCODINGS = True
+ENCODING_CACHE_DIR = "./encoding_cache"
+os.makedirs(ENCODING_CACHE_DIR, exist_ok=True)
+# Cache filename based on dataset size and encoder
+cache_file = os.path.join(ENCODING_CACHE_DIR, f"encodings_{len(ds)}_t5base_clipL.pt")
+if PRECOMPUTE_ENCODINGS:
+    if os.path.exists(cache_file):
+        # Load from cache
+        print(f"Loading cached encodings from {cache_file}...")
+        cached = torch.load(cache_file, weights_only=True)
+        all_t5_embeds = cached["t5_embeds"]
+        all_clip_pooled = cached["clip_pooled"]
+        print(f"✓ Loaded cached encodings")
+    else:
+        # Get all prompts via columnar access (instant, no iteration)
+        print("Encoding prompts (will cache for future runs)...")
+        all_prompts = ds["prompt"]  # Columnar access - instant!
+        encode_batch_size = 64
+        all_t5_embeds = []
+        all_clip_pooled = []
+        for i in tqdm(range(0, len(all_prompts), encode_batch_size), desc="Encoding"):
+            batch_prompts = all_prompts[i:i+encode_batch_size]
+            t5_out, clip_out = encode_prompts_batched(batch_prompts)
+            all_t5_embeds.append(t5_out.cpu())
+            all_clip_pooled.append(clip_out.cpu())
+        all_t5_embeds = torch.cat(all_t5_embeds, dim=0)
+        all_clip_pooled = torch.cat(all_clip_pooled, dim=0)
+        # Save cache (~750MB for 10k samples)
+        torch.save({
+            "t5_embeds": all_t5_embeds,
+            "clip_pooled": all_clip_pooled,
+        }, cache_file)
+        print(f"✓ Saved encoding cache to {cache_file}")
+    print(f"  T5 embeds: {all_t5_embeds.shape}")
+    print(f"  CLIP pooled: {all_clip_pooled.shape}")
+# ============================================================================
+# FLOW MATCHING HELPERS
+# ============================================================================
+def flux_shift(t, s=SHIFT):
+    """Apply the Flux timestep shift, mapping t to s*t / (1 + (s - 1)*t)."""
+    scaled = s * t
+    warp = 1 + (s - 1) * t
+    return scaled / warp
 def min_snr_weight(t, gamma=MIN_SNR):
+    """Return the Min-SNR loss weight min(snr, gamma) / snr for timesteps t.
+    Here snr = (t / (1 - t))**2; both denominators are floored at 1e-5."""
+    one_minus_t = (1 - t).clamp(min=1e-5)
+    snr = (t / one_minus_t).pow(2)
+    capped = torch.clamp(snr, max=gamma)
+    floored = snr.clamp(min=1e-5)
+    return capped / floored
 # ============================================================================
+# SAMPLING FUNCTION - Optimized
 # ============================================================================
+@torch.inference_mode()
 def generate_samples(model, prompts, num_steps=20, guidance_scale=3.5, H=64, W=64):
     # Encodes `prompts`, starts from Gaussian noise of shape (B, H*W, C) and takes
     # `num_steps` Euler steps along the model's predicted velocity, then decodes the
     # result with the VAE and returns images clamped to [0, 1].
     # The model is put in eval() mode for sampling and back in train() mode before returning.
+    """Generate sample images using Euler sampling."""
     model.eval()
     B = len(prompts)
     # C is the channel count used for the noise tensor and the reshape before decoding.
+    C = 16
+    # Batch encode prompts
+    t5_embeds, clip_pooleds = encode_prompts_batched(prompts)
+    t5_embeds = t5_embeds.to(DTYPE)
+    clip_pooleds = clip_pooleds.to(DTYPE)
+    # Start from pure noise
     x = torch.randn(B, H * W, C, device=DEVICE, dtype=DTYPE)
+    # Create image IDs (cached in optimized model)
     img_ids = TinyFlux.create_img_ids(B, H, W, DEVICE)
+    # Timesteps with flux_shift
     # num_steps + 1 points from 0 to 1, warped by flux_shift; consecutive pairs give each step's dt.
+    t_linear = torch.linspace(0, 1, num_steps + 1, device=DEVICE, dtype=DTYPE)
+    timesteps = flux_shift(t_linear, s=SHIFT)
+    # Euler sampling
     for i in range(num_steps):
         t_curr = timesteps[i]
         t_next = timesteps[i + 1]
+        dt = t_next - t_curr
+        t_batch = t_curr.expand(B).to(DTYPE)
         # The same guidance_scale value is used for every sample in the batch.
         guidance = torch.full((B,), guidance_scale, device=DEVICE, dtype=DTYPE)
         # NOTE(review): t_batch, clip_pooleds and img_ids are computed above but are not
         # passed to the model in the lines visible here; this diff view may be eliding
         # unchanged keyword arguments - confirm against the full file.
         v_cond = model(
             hidden_states=x,
             encoder_hidden_states=t5_embeds,
             guidance=guidance,
         )
         # Euler update: move x along the predicted velocity by dt.
         x = x + v_cond * dt
+    # Decode
     # (B, H*W, C) -> (B, C, H, W), then divide by the VAE scaling factor before decoding.
     latents = x.reshape(B, H, W, C).permute(0, 3, 1, 2)
     latents = latents / vae.config.scaling_factor
     images = vae.decode(latents.to(vae.dtype)).sample
     # Rescale with x / 2 + 0.5 and clamp to [0, 1].
     images = (images / 2 + 0.5).clamp(0, 1)
     model.train()
     return images
 def save_samples(images, prompts, step, save_dir):
+    """Write each image to `save_dir` as a PNG, then log a grid and the prompts via the module-level `writer`."""
+    from torchvision.utils import make_grid, save_image
+    for i, (image, text) in enumerate(zip(images, prompts)):
+        # Filename slug: first 50 prompt characters, spaces -> "_", slashes -> "-".
+        safe_prompt = text[:50].replace(" ", "_").replace("/", "-")
+        file_name = f"step{step}_{i}_{safe_prompt}.png"
+        save_image(image, os.path.join(save_dir, file_name))
+    writer.add_image("samples", make_grid(images, nrow=2, normalize=False), step)
+    writer.add_text("sample_prompts", "\n".join(prompts), step)
+    print(f"  ✓ Saved {len(images)} samples")
 # ============================================================================
+# OPTIMIZED COLLATE - Returns CPU tensors (GPU transfer in training loop)
 # ============================================================================
+def collate_preencoded(batch):
+    """Collate a batch by looking up the pre-encoded prompt embeddings by sample index."""
+    sample_ids = []
+    latent_list = []
+    for sample in batch:
+        sample_ids.append(sample["__index__"])
+        latent_list.append(torch.tensor(np.array(sample["latent"]), dtype=DTYPE))
+    # No device transfer happens here; the training loop moves the batch to the GPU.
+    return {
+        "latents": torch.stack(latent_list),
+        "t5_embeds": all_t5_embeds[sample_ids].to(DTYPE),
+        "clip_pooled": all_clip_pooled[sample_ids].to(DTYPE),
+    }
+def collate_online(batch):
+    """Collate a batch, encoding its prompts on the fly; the embeddings are moved to CPU."""
+    prompts = []
+    latent_list = []
+    for sample in batch:
+        prompts.append(sample["prompt"])
+        latent_list.append(torch.tensor(np.array(sample["latent"]), dtype=DTYPE))
+    latents = torch.stack(latent_list)
+    # Encoding runs the text encoders on DEVICE, so this path is used with num_workers=0.
+    t5_embeds, clip_pooled = encode_prompts_batched(prompts)
+    return {
+        "latents": latents,
+        "t5_embeds": t5_embeds.cpu().to(DTYPE),
+        "clip_pooled": clip_pooled.cpu().to(DTYPE),
+    }
+# Simple wrapper to add index without touching the data
+class IndexedDataset:
+    """Thin dataset wrapper whose items are copies of the wrapped items plus an "__index__" key."""
+    def __init__(self, ds):
+        # Keep a reference only; the wrapped dataset is never copied or iterated here.
+        self.ds = ds
+    def __len__(self):
+        return len(self.ds)
+    def __getitem__(self, idx):
+        # dict(...) builds a fresh dict, so the wrapped dataset's item is not mutated.
+        return dict(self.ds[idx], __index__=idx)
+# Choose collate strategy
+if PRECOMPUTE_ENCODINGS:
+    ds = IndexedDataset(ds)  # Instant, no iteration
+    collate_fn = collate_preencoded
+    num_workers = 2
+else:
+    collate_fn = collate_online
+    num_workers = 0
 # ============================================================================
 # CHECKPOINT FUNCTIONS
 # ============================================================================
 def load_weights(path):
+    """Load a state dict from `path`, handling the torch.compile key prefix.
+    `.safetensors` files are read with load_file. `.pt` files are read with torch.load,
+    taking the "model" or "state_dict" entry when the checkpoint is a dict that has one.
+    Any other extension is tried as safetensors first and as a torch pickle second.
+    If any key starts with "_orig_mod.", that substring is removed from all keys."""
     if path.endswith(".safetensors"):
+        state_dict = load_file(path)
     elif path.endswith(".pt"):
+        # NOTE(review): weights_only=False unpickles arbitrary objects - only load trusted files.
         ckpt = torch.load(path, map_location=DEVICE, weights_only=False)
         if isinstance(ckpt, dict):
+            # A dict with neither "model" nor "state_dict" is returned unchanged.
+            state_dict = ckpt.get("model", ckpt.get("state_dict", ckpt))
+        else:
+            state_dict = ckpt
     else:
         try:
+            state_dict = load_file(path)
+        except Exception:
+            # Not readable as safetensors: fall back to torch.load. Catching Exception
+            # rather than using a bare except lets KeyboardInterrupt/SystemExit propagate.
+            state_dict = torch.load(path, map_location=DEVICE, weights_only=False)
+    # Strip torch.compile prefix
+    if isinstance(state_dict, dict) and any(k.startswith("_orig_mod.") for k in state_dict):
+        print("  Stripping torch.compile prefix...")
+        state_dict = {k.replace("_orig_mod.", ""): v for k, v in state_dict.items()}
+    return state_dict
 def save_checkpoint(model, optimizer, scheduler, step, epoch, loss, path):
+    """Save model weights as safetensors and the training state as a torch pickle.
+    The weights file is `path` with its final extension swapped for ".safetensors";
+    step, epoch, loss and the optimizer/scheduler state dicts are written to `path`.
+    "_orig_mod." is removed from the weight keys when any key starts with it.
+    Returns the path of the safetensors file."""
+    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
+    state_dict = model.state_dict()
+    if any(k.startswith("_orig_mod.") for k in state_dict):
+        state_dict = {k.replace("_orig_mod.", ""): v for k, v in state_dict.items()}
+    # Swap only the final extension. str.replace(".pt", ...) would rewrite every ".pt"
+    # substring (directory names, ".pth"), and would leave a path without ".pt" unchanged
+    # so that the training-state file overwrote the weights file.
+    stem, _ext = os.path.splitext(path)
+    weights_path = stem + ".safetensors"
+    # If the caller passed a ".safetensors" path, keep the two files distinct.
+    state_path = path if path != weights_path else stem + ".pt"
+    save_file(state_dict, weights_path)
+    torch.save({
         "step": step,
         "epoch": epoch,
         "loss": loss,
         "optimizer": optimizer.state_dict(),
         "scheduler": scheduler.state_dict(),
+    }, state_path)
     print(f"  ✓ Saved checkpoint: step {step}")
     return weights_path
+def upload_checkpoint(weights_path, step, config):
     # Uploads the weights file as checkpoints/step_<step>.safetensors, then writes
     # config.__dict__ to CHECKPOINT_DIR/config.json and uploads that as config.json.
     # Best effort: any exception is caught and printed, never raised to the caller.
+    """Upload to HuggingFace Hub."""
     try:
         # NOTE(review): no repo_id is visible on this call; the diff view may be eliding
         # an unchanged `repo_id=HF_REPO,` line - confirm against the full file.
         api.upload_file(
             path_or_fileobj=weights_path,
             path_in_repo=f"checkpoints/step_{step}.safetensors",
             commit_message=f"Checkpoint step {step}",
         )
         # The config is re-serialised on every upload, overwriting the local config.json.
         config_path = os.path.join(CHECKPOINT_DIR, "config.json")
         with open(config_path, "w") as f:
             json.dump(config.__dict__, f, indent=2)
+        api.upload_file(path_or_fileobj=config_path, path_in_repo="config.json", repo_id=HF_REPO)
+        print(f"  ✓ Uploaded step {step} to {HF_REPO}")
     except Exception as e:
         print(f"  ⚠ Upload failed: {e}")
 def load_checkpoint(model, optimizer, scheduler, target):
+    """Load model weights selected by `target` and return ``(start_step, start_epoch)``.
+    Handled targets: None or "none" (start fresh), "hub" (model.safetensors from HF_REPO),
+    "hub:<name>" (checkpoints/<name>.safetensors, falling back to .pt) and "local:<path>".
+    Any other target prints a notice and returns (0, 0). Hub failures are printed and
+    also return (0, 0); errors while loading a local path propagate to the caller.
+    NOTE(review): only model weights are restored; `optimizer` and `scheduler` are
+    accepted but unused and start_epoch is always 0 - confirm this is intended."""
+    start_step, start_epoch = 0, 0
+    # Keys that may be absent from a checkpoint without being reported as a problem.
+    expected_missing = {'time_in.sin_basis', 'guidance_in.sin_basis',
+                        'rope.freqs_0', 'rope.freqs_1', 'rope.freqs_2'}
+    if target == "none" or target is None:
+        print("Starting fresh (no checkpoint)")
         return 0, 0
+    # Hub loading
+    if target == "hub" or (isinstance(target, str) and target.startswith("hub:")):
         try:
+            if target == "hub":
+                weights_path = hf_hub_download(repo_id=HF_REPO, filename="model.safetensors")
             else:
+                step_name = target.split(":")[1]
                 try:
+                    weights_path = hf_hub_download(repo_id=HF_REPO, filename=f"checkpoints/{step_name}.safetensors")
+                except Exception:
+                    # Fall back to a .pt file; Exception (not a bare except) keeps
+                    # KeyboardInterrupt/SystemExit from being swallowed.
+                    weights_path = hf_hub_download(repo_id=HF_REPO, filename=f"checkpoints/{step_name}.pt")
+                # Take the step from a trailing "_<int>"; a non-numeric suffix now means
+                # step 0 instead of aborting the whole load with a ValueError.
+                try:
+                    start_step = int(step_name.split("_")[-1]) if "_" in step_name else 0
+                except ValueError:
+                    start_step = 0
             weights = load_weights(weights_path)
+            # strict=False: ignore missing buffers (sin_basis, freqs) - they're precomputed constants
+            missing, unexpected = model.load_state_dict(weights, strict=False)
+            if missing:
+                # Filter out expected missing buffers
+                actual_missing = set(missing) - expected_missing
+                if actual_missing:
+                    print(f"  ⚠ Unexpected missing keys: {actual_missing}")
+                else:
+                    print(f"  ✓ Missing only precomputed buffers (OK)")
+            print(f"✓ Loaded from hub: {target}")
             return start_step, start_epoch
+        except Exception as e:
+            print(f"Hub load failed: {e}")
+            return 0, 0
+    # Local loading
+    if isinstance(target, str) and target.startswith("local:"):
+        path = target.split(":", 1)[1]
+        weights = load_weights(path)
+        missing, unexpected = model.load_state_dict(weights, strict=False)
+        if missing:
+            actual_missing = set(missing) - expected_missing
+            if actual_missing:
+                print(f"  ⚠ Unexpected missing keys: {actual_missing}")
+        print(f"✓ Loaded from local: {path}")
+        return 0, 0
     print("No checkpoint found, starting fresh")
     return 0, 0
 # ============================================================================
+# DATALOADER - Optimized
 # ============================================================================
+loader = DataLoader(
+    ds,
+    batch_size=BATCH_SIZE,
+    shuffle=True,
+    collate_fn=collate_fn,
+    num_workers=num_workers,  # 2 for precomputed, 0 for online
+    pin_memory=True,
+    persistent_workers=(num_workers > 0),
+    prefetch_factor=2 if num_workers > 0 else None,
+)
 # ============================================================================
 # MODEL
 config = TinyFluxConfig()
 model = TinyFlux(config).to(DEVICE).to(DTYPE)
 print(f"\nParams: {sum(p.numel() for p in model.parameters()):,}")
 # ============================================================================
+# OPTIMIZER - Fused for speed
 # ============================================================================
+opt = torch.optim.AdamW(
+    model.parameters(),
+    lr=LR,
+    betas=(0.9, 0.99),
+    weight_decay=0.01,
+    fused=True,
+)
 total_steps = len(loader) * EPOCHS // GRAD_ACCUM
 warmup = min(500, total_steps // 10)
 def lr_fn(step):
+    if step < warmup:
+        return step / warmup
     return 0.5 * (1 + math.cos(math.pi * (step - warmup) / (total_steps - warmup)))
 sched = torch.optim.lr_scheduler.LambdaLR(opt, lr_fn)
 # ============================================================================
+# LOAD CHECKPOINT (before compile!)
 # ============================================================================
 print(f"\nLoad target: {LOAD_TARGET}")
 start_step, start_epoch = load_checkpoint(model, opt, sched, LOAD_TARGET)
 if RESUME_STEP is not None:
     print(f"Overriding start_step: {start_step} -> {RESUME_STEP}")
     start_step = RESUME_STEP
+# ============================================================================
+# COMPILE MODEL (after loading weights)
+# ============================================================================
+model = torch.compile(model, mode="default")
+# Log config
 writer.add_text("config", json.dumps(config.__dict__, indent=2), 0)
 writer.add_text("training_config", json.dumps({
     "batch_size": BATCH_SIZE,
     "epochs": EPOCHS,
     "min_snr": MIN_SNR,
     "shift": SHIFT,
+    "optimizations": ["TF32", "fused_adamw", "precomputed_encodings", "flash_attention", "torch.compile"]
 }, indent=2), 0)
+# Sample prompts
 SAMPLE_PROMPTS = [
     "a photo of a cat sitting on a windowsill",
     "a beautiful sunset over mountains",
 ]
 # ============================================================================
+# TRAINING LOOP
 # ============================================================================
 print(f"\nTraining {EPOCHS} epochs, {total_steps} total steps")
 print(f"Resuming from step {start_step}, epoch {start_epoch}")
 print(f"Save: {SAVE_EVERY}, Upload: {UPLOAD_EVERY}, Sample: {SAMPLE_EVERY}, Log: {LOG_EVERY}")
+print("Optimizations: TF32, fused AdamW, pre-encoded prompts, Flash Attention, torch.compile")
 model.train()
 step = start_step
 best = float("inf")
+# Pre-create img_ids for common resolution (will be cached)
+_cached_img_ids = None
 for ep in range(start_epoch, EPOCHS):
     ep_loss = 0
     ep_batches = 0
+    pbar = tqdm(loader, desc=f"E{ep + 1}")
     for i, batch in enumerate(pbar):
+        # Move to GPU here (not in collate, to support multiprocessing)
+        latents = batch["latents"].to(DEVICE, non_blocking=True)
+        t5 = batch["t5_embeds"].to(DEVICE, non_blocking=True)
+        clip = batch["clip_pooled"].to(DEVICE, non_blocking=True)
         B, C, H, W = latents.shape
+        # Reshape: (B, C, H, W) -> (B, H*W, C)
+        data = latents.permute(0, 2, 3, 1).reshape(B, H * W, C)
+        noise = torch.randn_like(data)
+        # Sample timesteps with logit-normal + flux shift
         t = torch.sigmoid(torch.randn(B, device=DEVICE))
+        t = flux_shift(t, s=SHIFT).to(DTYPE).clamp(1e-4, 1 - 1e-4)
+        # Linear interpolation
         t_expanded = t.view(B, 1, 1)
+        x_t = (1 - t_expanded) * noise + t_expanded * data
+        # Velocity target
         v_target = data - noise
+        # Get img_ids (cached in model)
         img_ids = TinyFlux.create_img_ids(B, H, W, DEVICE)
+        # Random guidance
+        guidance = torch.rand(B, device=DEVICE, dtype=DTYPE) * 4 + 1
+        # Forward
         with torch.autocast("cuda", dtype=DTYPE):
             v_pred = model(
                 hidden_states=x_t,
                 guidance=guidance,
             )
+        # Loss with Min-SNR weighting
         loss_raw = F.mse_loss(v_pred, v_target, reduction="none").mean(dim=[1, 2])
         snr_weights = min_snr_weight(t)
         loss = (loss_raw * snr_weights).mean() / GRAD_ACCUM
         loss.backward()
             grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
             opt.step()
             sched.step()
+            opt.zero_grad(set_to_none=True)  # Slightly faster than zero_grad()
             step += 1
             if step % LOG_EVERY == 0:
                 writer.add_scalar("train/loss", loss.item() * GRAD_ACCUM, step)
                 writer.add_scalar("train/lr", sched.get_last_lr()[0], step)
                 writer.add_scalar("train/grad_norm", grad_norm.item(), step)
                 writer.add_scalar("train/t_mean", t.mean().item(), step)
             if step % SAMPLE_EVERY == 0:
                 print(f"\n  Generating samples at step {step}...")
                 images = generate_samples(model, SAMPLE_PROMPTS, num_steps=20)
                 save_samples(images, SAMPLE_PROMPTS, step, SAMPLE_DIR)
             if step % SAVE_EVERY == 0:
                 ckpt_path = os.path.join(CHECKPOINT_DIR, f"step_{step}.pt")
                 weights_path = save_checkpoint(model, opt, sched, step, ep, loss.item(), ckpt_path)
                 if step % UPLOAD_EVERY == 0:
+                    upload_checkpoint(weights_path, step, config)
         ep_loss += loss.item() * GRAD_ACCUM
         ep_batches += 1
+        pbar.set_postfix(loss=f"{loss.item() * GRAD_ACCUM:.4f}", lr=f"{sched.get_last_lr()[0]:.1e}", step=step)
     avg = ep_loss / max(ep_batches, 1)
+    print(f"Epoch {ep + 1} loss: {avg:.4f}")
     writer.add_scalar("train/epoch_loss", avg, ep + 1)
     if avg < best:
                 path_or_fileobj=weights_path,
                 path_in_repo="model.safetensors",
                 repo_id=HF_REPO,
+                commit_message=f"Best model (epoch {ep + 1}, loss {avg:.4f})",
             )
             print(f"  ✓ Uploaded best to {HF_REPO}")
         except Exception as e:
 final_path = os.path.join(CHECKPOINT_DIR, "final.pt")
 weights_path = save_checkpoint(model, opt, sched, step, EPOCHS, best, final_path)
 print("Generating final samples...")
 images = generate_samples(model, SAMPLE_PROMPTS, num_steps=20)
 save_samples(images, SAMPLE_PROMPTS, step, SAMPLE_DIR)
 try:
     api.upload_file(path_or_fileobj=weights_path, path_in_repo="model.safetensors", repo_id=HF_REPO)
     config_path = os.path.join(CHECKPOINT_DIR, "config.json")
     with open(config_path, "w") as f:
         json.dump(config.__dict__, f, indent=2)
     api.upload_file(path_or_fileobj=config_path, path_in_repo="config.json", repo_id=HF_REPO)
     print(f"\n✓ Training complete! https://huggingface.co/{HF_REPO}")
 except Exception as e:
     print(f"\n⚠ Final upload failed: {e}")