krystv
/

liquid-diffusion

Model card Files Files and versions

xet

Community

krystv committed 7 days ago

Commit

8589a61

verified ·

1 Parent(s): fafdff9

Upload validate.py

Browse files

Files changed (1) hide show

validate.py +336 -0

validate.py ADDED Viewed

	@@ -0,0 +1,336 @@

+#!/usr/bin/env python3
+"""
+LiquidDiffusion — Self-Contained Validation Script
+Run this to verify everything works before training:
+    python validate.py
+Tests:
+1. Model construction and parameter count at all scales
+2. Forward pass at multiple resolutions
+3. Backward pass and gradient flow
+4. 20-step training stability with random data
+5. Sampling with Euler ODE
+6. Timestep sensitivity
+7. Full trainer pipeline (train steps, sampling, checkpoint save/load)
+8. Architecture properties
+9. VRAM estimation
+"""
+import sys
+import math
+import time
+import copy
+print("=" * 70)
+print("LiquidDiffusion Validation Suite")
+print("=" * 70)
+# Check imports
+try:
+    import torch
+    import torch.nn as nn
+    import torch.nn.functional as F
+    print(f"✓ PyTorch {torch.__version__}")
+except ImportError:
+    print("✗ PyTorch not installed. Run: pip install torch torchvision")
+    sys.exit(1)
+try:
+    from torchvision.utils import save_image
+    print("✓ torchvision")
+except ImportError:
+    print("✗ torchvision not installed. Run: pip install torchvision")
+    sys.exit(1)
+# Import our modules
+try:
+    from liquid_diffusion.model import (
+        LiquidDiffusionUNet, liquid_diffusion_tiny,
+        liquid_diffusion_small, liquid_diffusion_base,
+        SinusoidalTimeEmbedding, ParallelCfCBlock, AdaLN,
+    )
+    print("✓ liquid_diffusion.model")
+except ImportError as e:
+    print(f"✗ Failed to import model: {e}")
+    print("  Make sure you're in the liquid-diffusion directory")
+    sys.exit(1)
+try:
+    from liquid_diffusion.trainer import RectifiedFlowTrainer, get_cosine_schedule_with_warmup
+    print("✓ liquid_diffusion.trainer")
+except ImportError as e:
+    print(f"✗ Failed to import trainer: {e}")
+    sys.exit(1)
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+print(f"\nDevice: {device}")
+if device == 'cuda':
+    print(f"GPU: {torch.cuda.get_device_name(0)}")
+    print(f"VRAM: {torch.cuda.get_device_properties(0).total_mem / 1e9:.1f} GB")
+# Suite-wide state: `all_passed` is cleared by fail(); `test_num` is bumped
+# by test() so every section header gets a running number.
+all_passed = True
+test_num = 0
+def test(name):
+    """Start a new numbered test section and print its header line."""
+    global test_num
+    test_num += 1
+    print(f"\n--- Test {test_num}: {name} ---")
+def fail(msg):
+    """Print a failure line and mark the whole suite as failed."""
+    global all_passed
+    all_passed = False
+    print(f"  ✗ FAIL: {msg}")
+def ok(msg):
+    """Print a success line; leaves the suite state untouched."""
+    print(f"  ✓ {msg}")
+# =========================================================================
+test("Model Construction & Parameter Count")
+# =========================================================================
+for name, factory in [("tiny", liquid_diffusion_tiny), ("small", liquid_diffusion_small), ("base", liquid_diffusion_base)]:
+    try:
+        m = factory()
+        total, trainable = m.count_params()
+        ok(f"{name:8s}: {total:>12,} params ({total/1e6:.1f}M)")
+        del m
+    except Exception as e:
+        fail(f"{name}: {e}")
+# =========================================================================
+test("Forward Pass (multiple resolutions)")
+# =========================================================================
+# A single tiny model is reused for every resolution in the list.
+model = liquid_diffusion_tiny()
+for res in [32, 64, 128]:
+    try:
+        # Batch of two random 3-channel square inputs, one random t per sample.
+        x = torch.randn(2, 3, res, res)
+        t = torch.rand(2)
+        out = model(x, t)
+        # A failed assert is caught by the except below and reported via
+        # fail(), so one bad resolution does not stop the others.
+        assert out.shape == x.shape, f"Shape mismatch: {out.shape} vs {x.shape}"
+        assert not torch.isnan(out).any(), "NaN in output"
+        assert not torch.isinf(out).any(), "Inf in output"
+        ok(f"{res}x{res}: output shape {out.shape}, range [{out.min():.4f}, {out.max():.4f}]")
+    except Exception as e:
+        fail(f"{res}x{res}: {e}")
+# =========================================================================
+test("Backward Pass (gradient flow)")
+# =========================================================================
+model = liquid_diffusion_tiny()
+x = torch.randn(2, 3, 64, 64, requires_grad=False)
+t = torch.rand(2)
+out = model(x, t)
+loss = out.mean()
+loss.backward()
+total_params = 0
+params_with_grad = 0
+nan_grads = 0
+zero_grads = 0
+for name_p, p in model.named_parameters():
+    total_params += 1
+    if p.grad is not None:
+        params_with_grad += 1
+        if torch.isnan(p.grad).any():
+            nan_grads += 1
+        if p.grad.abs().max() == 0:
+            zero_grads += 1
+if nan_grads > 0:
+    fail(f"NaN gradients in {nan_grads}/{total_params} parameters")
+elif params_with_grad == 0:
+    fail("No parameters received gradients")
+else:
+    ok(f"{params_with_grad}/{total_params} params have gradients, {nan_grads} NaN, {zero_grads} zero-only")
+# Check gradient magnitude distribution
+grad_maxes = [p.grad.abs().max().item() for p in model.parameters() if p.grad is not None]
+ok(f"Gradient |max| range: [{min(grad_maxes):.2e}, {max(grad_maxes):.2e}]")
+# =========================================================================
+test("Training Stability (20 steps, random data)")
+# =========================================================================
+# Hand-rolled training loop on a fresh tiny model. Both endpoints are
+# Gaussian noise, so this checks numerical stability only, not learning.
+model = liquid_diffusion_tiny()
+optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)
+losses = []
+for step in range(20):
+    model.train()
+    x0 = torch.randn(4, 3, 64, 64)
+    x1 = torch.randn_like(x0)
+    t_val = torch.rand(4)
+    # Add three trailing axes so the per-sample t broadcasts over the image.
+    t_expand = t_val[:, None, None, None]
+    # Linear interpolation between x0 and x1; the regression target is the
+    # constant difference x1 - x0.
+    x_t = (1 - t_expand) * x0 + t_expand * x1
+    v_target = x1 - x0
+    v_pred = model(x_t, t_val)
+    loss = F.mse_loss(v_pred, v_target)
+    optimizer.zero_grad()
+    loss.backward()
+    # Clip to a max norm of 1.0; gn is the norm value returned by the call.
+    gn = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+    optimizer.step()
+    losses.append(loss.item())
+    if step % 5 == 0:
+        print(f"    Step {step:3d}: loss={loss.item():.4f}, grad_norm={gn.item():.4f}")
+# Two independent verdicts: every loss is finite, and no loss reached 100.
+stable = all(not math.isnan(l) and not math.isinf(l) for l in losses)
+not_exploding = max(losses) < 100
+if stable:
+    ok(f"No NaN/Inf in any of {len(losses)} steps")
+else:
+    fail("NaN or Inf detected in loss")
+if not_exploding:
+    ok(f"Loss range: [{min(losses):.4f}, {max(losses):.4f}]")
+else:
+    fail(f"Loss exploded: max={max(losses):.4f}")
+# =========================================================================
+test("Sampling (Euler ODE, 10 steps)")
+# =========================================================================
+# Uses the module-level `model` left behind by the preceding section.
+model.eval()
+with torch.no_grad():
+    z = torch.randn(2, 3, 64, 64)
+    num_steps = 10
+    dt = 1.0 / num_steps
+    # Fixed-step Euler integration: t takes the values 1.0, 0.9, ..., 0.1 and
+    # each step moves z against the predicted velocity by dt.
+    for i in range(num_steps, 0, -1):
+        t_s = torch.full((2,), i / num_steps)
+        v = model(z, t_s)
+        z = z - v * dt
+    # Clamp to [-1, 1] before the checks and the reported statistics.
+    z = z.clamp(-1, 1)
+    if torch.isnan(z).any():
+        fail("NaN in generated samples")
+    elif torch.isinf(z).any():
+        fail("Inf in generated samples")
+    else:
+        ok(f"Shape: {z.shape}, range: [{z.min():.3f}, {z.max():.3f}], "
+           f"mean: {z.mean():.4f}, std: {z.std():.4f}")
+# =========================================================================
+test("Timestep Sensitivity")
+# =========================================================================
+# Feed the same input at several timesteps to the module-level `model` and
+# confirm that the timestep actually changes the prediction.
+model.eval()
+x = torch.randn(1, 3, 64, 64)
+outputs = {}  # timestep value -> model output for the fixed input x
+for t_val in [0.01, 0.25, 0.5, 0.75, 0.99]:
+    with torch.no_grad():
+        out = model(x, torch.tensor([t_val]))
+    outputs[t_val] = out
+    print(f"    t={t_val:.2f}: mean={out.mean():.6f}, std={out.std():.6f}, |max|={out.abs().max():.6f}")
+# Check that different timesteps give different outputs: compare the two
+# extreme timesteps by mean absolute difference against a 1e-6 threshold.
+diff_01_099 = (outputs[0.01] - outputs[0.99]).abs().mean().item()
+if diff_01_099 > 1e-6:
+    ok(f"Timestep affects output (mean diff t=0.01 vs t=0.99: {diff_01_099:.6f})")
+else:
+    fail(f"Timestep has no effect on output (diff={diff_01_099:.10f})")
+# =========================================================================
+test("Full Trainer Pipeline (CPU, 5 steps)")
+# =========================================================================
+# Drive the packaged trainer on CPU for five steps with random batches.
+model = liquid_diffusion_tiny()
+trainer = RectifiedFlowTrainer(
+    model=model,
+    lr=1e-4,
+    device='cpu',
+    use_amp=False,  # mixed precision is switched off for this CPU-only run
+    time_sampling='logit_normal',
+)
+for step in range(5):
+    x0 = torch.randn(2, 3, 64, 64)
+    metrics = trainer.train_step(x0)
+    # Only the first step's metrics are printed.
+    if step == 0:
+        print(f"    Step {step}: loss={metrics['loss']:.4f}, grad_norm={metrics['grad_norm']:.4f}")
+# Only the metrics of the final step are inspected for NaN.
+if math.isnan(metrics['loss']):
+    fail("Trainer produced NaN loss")
+else:
+    ok(f"Trainer works: final loss={metrics['loss']:.4f}, step={trainer.step}")
+# Test sampling: one 64px image in 5 steps with use_ema=True. Any exception
+# raised by the trainer is reported through fail() instead of aborting.
+try:
+    samples = trainer.sample(batch_size=1, image_size=64, num_steps=5, use_ema=True)
+    if torch.isnan(samples).any():
+        fail("Trainer sampling produced NaN")
+    else:
+        ok(f"Sampling works: shape={samples.shape}, range=[{samples.min():.3f}, {samples.max():.3f}]")
+except Exception as e:
+    fail(f"Sampling failed: {e}")
+# Test checkpoint save/load: write a checkpoint into a temporary directory,
+# load it into a second trainer, and compare the step counters.
+try:
+    import tempfile, os
+    # The directory and the checkpoint file are removed when the with exits.
+    with tempfile.TemporaryDirectory() as tmpdir:
+        ckpt_path = os.path.join(tmpdir, 'test_ckpt.pt')
+        trainer.save_checkpoint(ckpt_path)
+        # Create new trainer and load
+        model2 = liquid_diffusion_tiny()
+        trainer2 = RectifiedFlowTrainer(model2, lr=1e-4, device='cpu', use_amp=False)
+        trainer2.load_checkpoint(ckpt_path)
+        # Only the step counter is compared; weights are not checked here.
+        assert trainer2.step == trainer.step, f"Step mismatch: {trainer2.step} vs {trainer.step}"
+        ok(f"Checkpoint save/load works (step={trainer2.step})")
+except Exception as e:
+    fail(f"Checkpoint save/load failed: {e}")
+# =========================================================================
+test("Architecture Properties")
+# =========================================================================
+m = liquid_diffusion_tiny()
+total_blocks = (sum(len(s) for s in m.encoder_blocks) +
+                len(m.bottleneck) +
+                sum(len(s) for s in m.decoder_blocks))
+# Count attention layers (should be 0)
+attention_count = 0
+for name_m, module in m.named_modules():
+    if 'attention' in name_m.lower() or 'attn' in name_m.lower():
+        attention_count += 1
+ok(f"Attention layers: {attention_count} (should be 0)")
+ok(f"LiquidCfC blocks: {total_blocks}")
+ok(f"Training: Rectified Flow (MSE velocity)")
+ok(f"Sampling: Euler ODE (configurable steps)")
+# =========================================================================
+test("VRAM Estimation for Colab T4 (16GB)")
+# =========================================================================
+for name, factory, res, bs in [
+    ("tiny @256px bs=8", liquid_diffusion_tiny, 256, 8),
+    ("tiny @256px bs=4", liquid_diffusion_tiny, 256, 4),
+    ("small @256px bs=4", liquid_diffusion_small, 256, 4),
+    ("base @512px bs=2", liquid_diffusion_base, 512, 2),
+    ("tiny @512px bs=4", liquid_diffusion_tiny, 512, 4),
+]:
+    m = factory()
+    tp = sum(p.numel() for p in m.parameters())
+    # Conservative VRAM estimate:
+    # params (fp16) + gradients (fp32) + Adam states (2×fp32) + activations
+    param_gb = tp * 2 / 1e9   # fp16
+    grad_gb = tp * 4 / 1e9    # fp32
+    optim_gb = tp * 8 / 1e9   # Adam: 2× fp32
+    # Activation estimate: ~4 bytes per element, scale with resolution and batch
+    act_gb = bs * res * res * max(m.channels) * 4 * len(m.channels) * 2 / 1e9
+    total_gb = param_gb + grad_gb + optim_gb + act_gb
+    fits = "✓ fits T4" if total_gb < 15 else "✗ too large"
+    print(f"    {name:25s}: {tp/1e6:5.1f}M params, ~{total_gb:5.1f}GB  {fits}")
+    del m
+# =========================================================================
+# FINAL SUMMARY
+# =========================================================================
+print("\n" + "=" * 70)
+if all_passed:
+    print("✅ ALL TESTS PASSED")
+    print("\nReady for training! Open the Colab notebook:")
+    print("  LiquidDiffusion_Training.ipynb")
+else:
+    print("❌ SOME TESTS FAILED — check output above")
+print("=" * 70)