krystv committed on
Commit
8589a61
·
verified ·
1 Parent(s): fafdff9

Upload validate.py

Browse files
Files changed (1) hide show
  1. validate.py +336 -0
validate.py ADDED
@@ -0,0 +1,336 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ LiquidDiffusion — Self-Contained Validation Script
4
+
5
+ Run this to verify everything works before training:
6
+ python validate.py
7
+
8
+ Tests:
9
+ 1. Model construction at all scales
10
+ 2. Forward pass at multiple resolutions
11
+ 3. Backward pass and gradient flow
12
+ 4. 20-step training stability with random data
13
+ 5. Sampling with Euler ODE
14
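+ 6. Timestep sensitivity
14
+ 7. Full trainer pipeline (training steps, sampling, checkpoint save/load)
+ 8. Architecture properties
+ 9. VRAM estimation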
16
+ """
17
+
18
+ import sys
19
+ import math
20
+ import time
21
+ import copy
22
+
23
# Suite banner: rule, title, rule.
print("=" * 70, "LiquidDiffusion Validation Suite", "=" * 70, sep="\n")

# --- Third-party dependencies ---------------------------------------------
# Each group prints a check mark on success; on ImportError it prints a hint
# and terminates the script with exit status 1.
try:
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
except ImportError:
    print("✗ PyTorch not installed. Run: pip install torch torchvision")
    sys.exit(1)
else:
    print(f"✓ PyTorch {torch.__version__}")

try:
    from torchvision.utils import save_image
except ImportError:
    print("✗ torchvision not installed. Run: pip install torchvision")
    sys.exit(1)
else:
    print("✓ torchvision")

# --- Project modules --------------------------------------------------------
try:
    from liquid_diffusion.model import (
        LiquidDiffusionUNet,
        liquid_diffusion_tiny,
        liquid_diffusion_small,
        liquid_diffusion_base,
        SinusoidalTimeEmbedding,
        ParallelCfCBlock,
        AdaLN,
    )
except ImportError as err:
    print(f"✗ Failed to import model: {err}")
    print(" Make sure you're in the liquid-diffusion directory")
    sys.exit(1)
else:
    print("✓ liquid_diffusion.model")

try:
    from liquid_diffusion.trainer import (
        RectifiedFlowTrainer,
        get_cosine_schedule_with_warmup,
    )
except ImportError as err:
    print(f"✗ Failed to import trainer: {err}")
    sys.exit(1)
else:
    print("✓ liquid_diffusion.trainer")
63
+
64
# Report the compute device PyTorch can see.
# NOTE(review): in the test sections visible in this file, models and tensors
# are created without a device argument and the trainer is built with
# device='cpu', so `device` appears to be informational only — confirm.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"\nDevice: {device}")
if device == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # The device-properties attribute is `total_memory` (in bytes). The
    # previous `total_mem` does not exist and raised AttributeError on every
    # CUDA machine, aborting the script before any test ran.
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

# Shared bookkeeping for the run:
#   all_passed - cleared by fail() the first time any check fails
#   test_num   - incremented by test() to number the section headers
all_passed = True
test_num = 0
72
+
73
def test(name):
    """Open a new numbered test section.

    Bumps the module-level ``test_num`` counter by one and prints a blank
    line followed by the header ``--- Test <n>: <name> ---``.
    """
    global test_num
    test_num = test_num + 1
    # print() joins its arguments with single spaces, which reproduces the
    # header layout exactly.
    print("\n--- Test", f"{test_num}:", name, "---")
77
+
78
def fail(msg):
    """Record a failed check.

    Clears the module-level ``all_passed`` flag and prints ``msg`` behind a
    ``✗ FAIL:`` marker.
    """
    global all_passed
    all_passed = False
    print(" ✗ FAIL:", msg)
82
+
83
def ok(msg):
    """Print ``msg`` behind a ``✓`` marker; no state is changed."""
    print(" ✓", msg)
85
+
86
+
87
# =========================================================================
test("Model Construction & Parameter Count")
# =========================================================================
# Build each preset once and print its parameter count. Any exception raised
# while building, counting or formatting is reported through fail() for that
# preset, and the loop continues with the next one.
for name, factory in (
    ("tiny", liquid_diffusion_tiny),
    ("small", liquid_diffusion_small),
    ("base", liquid_diffusion_base),
):
    try:
        m = factory()
        # count_params() is unpacked as a (total, trainable) pair; only the
        # total is reported.
        total, trainable = m.count_params()
        millions = total / 1e6
        ok(f"{name:8s}: {total:>12,} params ({millions:.1f}M)")
        del m  # release the model before the next preset is built
    except Exception as e:
        fail(f"{name}: {e}")
98
+
99
# =========================================================================
test("Forward Pass (multiple resolutions)")
# =========================================================================
# Run a batch of two random 3-channel images through the tiny model at
# several square resolutions and check that the output keeps the input shape
# and contains no NaN/Inf values.
model = liquid_diffusion_tiny()
for res in [32, 64, 128]:
    try:
        x = torch.randn(2, 3, res, res)
        t = torch.rand(2)
        out = model(x, t)
        # Explicit raises instead of `assert`: assert statements are removed
        # when Python runs with -O, which would make these checks pass
        # vacuously. AssertionError and the messages are kept so the reported
        # failure text is unchanged.
        if out.shape != x.shape:
            raise AssertionError(f"Shape mismatch: {out.shape} vs {x.shape}")
        if torch.isnan(out).any():
            raise AssertionError("NaN in output")
        if torch.isinf(out).any():
            raise AssertionError("Inf in output")
        ok(f"{res}x{res}: output shape {out.shape}, range [{out.min():.4f}, {out.max():.4f}]")
    except Exception as e:
        fail(f"{res}x{res}: {e}")
114
+
115
# =========================================================================
test("Backward Pass (gradient flow)")
# =========================================================================
# Backpropagate the mean of one forward pass and inspect every parameter's
# gradient: how many received one, how many contain NaN, how many are all
# zero, and the spread of per-parameter max |grad|.
model = liquid_diffusion_tiny()
x = torch.randn(2, 3, 64, 64, requires_grad=False)
t = torch.rand(2)
out = model(x, t)
loss = out.mean()
loss.backward()

total_params = 0
params_with_grad = 0
nan_grads = 0
zero_grads = 0
grad_maxes = []  # max |grad| of every parameter that received a gradient
for name_p, p in model.named_parameters():
    total_params += 1
    if p.grad is None:
        continue
    params_with_grad += 1
    if torch.isnan(p.grad).any():
        nan_grads += 1
    grad_peak = p.grad.abs().max().item()
    if grad_peak == 0:
        zero_grads += 1
    grad_maxes.append(grad_peak)

if nan_grads > 0:
    fail(f"NaN gradients in {nan_grads}/{total_params} parameters")
elif params_with_grad == 0:
    fail("No parameters received gradients")
else:
    ok(f"{params_with_grad}/{total_params} params have gradients, {nan_grads} NaN, {zero_grads} zero-only")

# Check gradient magnitude distribution.
# Guarded: with no gradients at all the list is empty and min()/max() would
# raise ValueError right after the failure above was reported, aborting the
# remaining tests.
if grad_maxes:
    ok(f"Gradient |max| range: [{min(grad_maxes):.2e}, {max(grad_maxes):.2e}]")
148
+
149
# =========================================================================
test("Training Stability (20 steps, random data)")
# =========================================================================
# Twenty AdamW steps on purely random tensors. Each step mixes two Gaussian
# batches linearly at a random per-sample t and regresses the model output
# onto their difference with an MSE loss; the recorded losses must stay
# finite and below 100.
model = liquid_diffusion_tiny()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)

losses = []
for step in range(20):
    model.train()

    x0 = torch.randn(4, 3, 64, 64)
    x1 = torch.randn_like(x0)
    t_val = torch.rand(4)
    # Give t three trailing singleton axes so it broadcasts over (C, H, W).
    t_expand = t_val[:, None, None, None]
    x_t = x0 * (1 - t_expand) + x1 * t_expand
    v_target = x1 - x0

    v_pred = model(x_t, t_val)
    loss = F.mse_loss(v_pred, v_target)

    optimizer.zero_grad()
    loss.backward()
    # clip_grad_norm_ returns the gradient norm it computed; it is printed
    # below as grad_norm.
    gn = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()

    loss_value = loss.item()
    losses.append(loss_value)
    if step % 5 == 0:
        print(f" Step {step:3d}: loss={loss_value:.4f}, grad_norm={gn.item():.4f}")

# isfinite() is False for NaN as well as for +/-inf.
stable = all(math.isfinite(value) for value in losses)
not_exploding = max(losses) < 100

if not stable:
    fail("NaN or Inf detected in loss")
else:
    ok(f"No NaN/Inf in any of {len(losses)} steps")

if not not_exploding:
    fail(f"Loss exploded: max={max(losses):.4f}")
else:
    ok(f"Loss range: [{min(losses):.4f}, {max(losses):.4f}]")
189
+
190
# =========================================================================
test("Sampling (Euler ODE, 10 steps)")
# =========================================================================
# Integrate from t=1 down to t=1/num_steps with fixed-size Euler steps,
# starting from Gaussian noise. `model` is whatever the preceding section
# left bound to that name.
model.eval()
with torch.no_grad():
    z = torch.randn(2, 3, 64, 64)
    num_steps = 10
    dt = 1.0 / num_steps
    for i in range(num_steps, 0, -1):
        t_s = torch.full((2,), i / num_steps)
        v = model(z, t_s)
        z = z - v * dt

    # Inspect the raw result BEFORE clamping: clamp(-1, 1) maps +/-inf to
    # +/-1, so an Inf check performed afterwards can never trigger.
    has_nan = bool(torch.isnan(z).any())
    has_inf = bool(torch.isinf(z).any())
    z = z.clamp(-1, 1)

if has_nan:
    fail("NaN in generated samples")
elif has_inf:
    fail("Inf in generated samples")
else:
    ok(f"Shape: {z.shape}, range: [{z.min():.3f}, {z.max():.3f}], "
       f"mean: {z.mean():.4f}, std: {z.std():.4f}")
211
+
212
# =========================================================================
test("Timestep Sensitivity")
# =========================================================================
# Feed one fixed input at five different timesteps and print output
# statistics for each; the outputs at the two extreme timesteps must differ.
model.eval()
x = torch.randn(1, 3, 64, 64)
outputs = {}
with torch.no_grad():
    for t_val in (0.01, 0.25, 0.5, 0.75, 0.99):
        out = model(x, torch.tensor([t_val]))
        outputs[t_val] = out
        print(f" t={t_val:.2f}: mean={out.mean():.6f}, std={out.std():.6f}, |max|={out.abs().max():.6f}")

# Mean absolute difference between the outputs at t=0.01 and t=0.99.
extreme_gap = outputs[0.01] - outputs[0.99]
diff_01_099 = extreme_gap.abs().mean().item()
timestep_matters = diff_01_099 > 1e-6
if timestep_matters:
    ok(f"Timestep affects output (mean diff t=0.01 vs t=0.99: {diff_01_099:.6f})")
else:
    fail(f"Timestep has no effect on output (diff={diff_01_099:.10f})")
230
+
231
# =========================================================================
test("Full Trainer Pipeline (CPU, 5 steps)")
# =========================================================================
# Drive the project's trainer for five steps on random batches and check the
# loss reported by the last step.
model = liquid_diffusion_tiny()

trainer = RectifiedFlowTrainer(
    model=model,
    lr=1e-4,
    device='cpu',
    use_amp=False,  # mixed precision is switched off for this CPU run
    time_sampling='logit_normal',
)

for step in range(5):
    x0 = torch.randn(2, 3, 64, 64)
    metrics = trainer.train_step(x0)
    if step == 0:
        print(f" Step {step}: loss={metrics['loss']:.4f}, grad_norm={metrics['grad_norm']:.4f}")

# Reject +/-inf as well as NaN, matching the stability test earlier in the
# file; the previous NaN-only check reported an infinite loss as success.
if not math.isfinite(metrics['loss']):
    fail(f"Trainer produced non-finite loss: {metrics['loss']}")
else:
    ok(f"Trainer works: final loss={metrics['loss']:.4f}, step={trainer.step}")
254
+
255
# Trainer-side sampling: draw one 64px sample in five steps with use_ema=True
# and make sure it contains no NaN. Any exception raised by the trainer is
# reported as a failure instead of aborting the script.
try:
    samples = trainer.sample(
        batch_size=1,
        image_size=64,
        num_steps=5,
        use_ema=True,
    )
    contains_nan = torch.isnan(samples).any()
    if contains_nan:
        fail("Trainer sampling produced NaN")
    else:
        lo, hi = samples.min(), samples.max()
        ok(f"Sampling works: shape={samples.shape}, range=[{lo:.3f}, {hi:.3f}]")
except Exception as e:
    fail(f"Sampling failed: {e}")
264
+
265
# Checkpoint round trip: save the trainer into a temporary directory, load
# the file into a freshly built trainer, and compare the step counters.
try:
    import os
    import tempfile

    with tempfile.TemporaryDirectory() as tmpdir:
        ckpt_path = os.path.join(tmpdir, 'test_ckpt.pt')
        trainer.save_checkpoint(ckpt_path)

        # Create new trainer and load (must happen while tmpdir still exists)
        model2 = liquid_diffusion_tiny()
        trainer2 = RectifiedFlowTrainer(model2, lr=1e-4, device='cpu', use_amp=False)
        trainer2.load_checkpoint(ckpt_path)

    # Explicit raise instead of `assert`, which is stripped under `python -O`
    # and would let a mismatch pass silently. The exception type and message
    # are unchanged, so the reported failure text is the same.
    if trainer2.step != trainer.step:
        raise AssertionError(f"Step mismatch: {trainer2.step} vs {trainer.step}")
    ok(f"Checkpoint save/load works (step={trainer2.step})")
except Exception as e:
    fail(f"Checkpoint save/load failed: {e}")
281
+
282
# =========================================================================
test("Architecture Properties")
# =========================================================================
# Count the blocks in the tiny model's encoder, bottleneck and decoder, and
# check that no submodule name mentions attention.
m = liquid_diffusion_tiny()
total_blocks = (sum(len(s) for s in m.encoder_blocks) +
                len(m.bottleneck) +
                sum(len(s) for s in m.decoder_blocks))

# Count attention layers (should be 0). This is a name-based heuristic: a
# module counts when its qualified name contains "attention" or "attn".
attention_count = sum(
    1
    for name_m, _ in m.named_modules()
    if 'attention' in name_m.lower() or 'attn' in name_m.lower()
)

# Previously the count was reported with ok() whatever its value, so the
# "should be 0" expectation could never fail the run.
if attention_count == 0:
    ok(f"Attention layers: {attention_count} (should be 0)")
else:
    fail(f"Attention layers: {attention_count} (should be 0)")
ok(f"LiquidCfC blocks: {total_blocks}")
# Plain strings: these messages have no placeholders, so the f-prefix was
# redundant.
ok("Training: Rectified Flow (MSE velocity)")
ok("Sampling: Euler ODE (configurable steps)")
300
+
301
# =========================================================================
test("VRAM Estimation for Colab T4 (16GB)")
# =========================================================================
# Back-of-the-envelope memory estimate per (preset, resolution, batch size).
# Nothing is allocated on a GPU; the figures come from the parameter count
# and the model's `channels` attribute only. Each line is purely
# informational and never marks the run as failed.
for name, factory, res, bs in (
    ("tiny @256px bs=8", liquid_diffusion_tiny, 256, 8),
    ("tiny @256px bs=4", liquid_diffusion_tiny, 256, 4),
    ("small @256px bs=4", liquid_diffusion_small, 256, 4),
    ("base @512px bs=2", liquid_diffusion_base, 512, 2),
    ("tiny @512px bs=4", liquid_diffusion_tiny, 512, 4),
):
    m = factory()
    tp = sum(p.numel() for p in m.parameters())

    # Conservative estimate, in GB (1e9 bytes):
    param_gb = tp * 2 / 1e9  # weights at 2 bytes each (fp16)
    grad_gb = tp * 4 / 1e9   # gradients at 4 bytes each (fp32)
    optim_gb = tp * 8 / 1e9  # Adam: two fp32 states per weight
    # Activations: 4 bytes per element, scaled by batch, resolution, the
    # widest channel count, the number of channel stages, and a factor of 2.
    act_gb = bs * res * res * max(m.channels) * 4 * len(m.channels) * 2 / 1e9
    total_gb = param_gb + grad_gb + optim_gb + act_gb

    # 15 GB threshold leaves headroom below the T4's nominal 16 GB.
    if total_gb < 15:
        fits = "✓ fits T4"
    else:
        fits = "✗ too large"
    print(f" {name:25s}: {tp/1e6:5.1f}M params, ~{total_gb:5.1f}GB {fits}")
    del m
324
+
325
+
326
# =========================================================================
# FINAL SUMMARY
# =========================================================================
# Print the overall verdict based on the all_passed flag that fail() clears.
print("\n" + "=" * 70)
if all_passed:
    print("✅ ALL TESTS PASSED")
    print("\nReady for training! Open the Colab notebook:")
    print(" LiquidDiffusion_Training.ipynb")
else:
    print("❌ SOME TESTS FAILED — check output above")
print("=" * 70)

# Propagate the verdict through the process exit status so shells and CI can
# detect a failed validation, matching the sys.exit(1) used when the imports
# at the top of the script fail. A fully passing run still exits with 0.
if not all_passed:
    sys.exit(1)