"""
IRIS Architecture Validation Tests
===================================
Tests forward pass, training step, generation, and memory profile.
"""

import torch
import time
import sys
from iris_model import (
    IRIS, IRISConfig, create_iris_small, create_iris_tiny, create_iris_base,
    count_parameters, estimate_memory_mb,
    HaarDWT2D, HaarIDWT2D, WaveletVAE, IRISGenerator, GRFM
)


def test_wavelet_transform():
    """Verify that HaarIDWT2D inverts HaarDWT2D on a random batch.

    Returns:
        True when the maximum absolute reconstruction error is below 1e-5.

    Raises:
        AssertionError: if the roundtrip error is 1e-5 or larger.
    """
    banner = "=" * 60
    print(banner)
    print("Test 1: Wavelet Transform Roundtrip")
    print(banner)

    forward, inverse = HaarDWT2D(), HaarIDWT2D()

    image = torch.randn(2, 3, 64, 64)
    coeffs = forward(image)
    restored = inverse(coeffs)

    # Worst-case elementwise deviation between the input and its reconstruction.
    error = torch.max(torch.abs(image - restored)).item()

    shape_report = (
        ("  Input shape:  ", image),
        ("  DWT shape:    ", coeffs),
        ("  Recon shape:  ", restored),
    )
    for label, tensor in shape_report:
        print(f"{label}{list(tensor.shape)}")
    print(f"  Max error:    {error:.2e}")

    assert error < 1e-5, f"DWT roundtrip error too high: {error}"
    print("  ✅ PASSED (lossless roundtrip)")
    return True


def test_vae():
    """Encode a random image batch with WaveletVAE and decode it back.

    Builds a WaveletVAE from a small IRISConfig, encodes a random
    2x3x256x256 batch, decodes the returned latent, and prints the shapes,
    the element-count compression ratio and the parameter footprint.

    Returns:
        True when every check holds.

    Raises:
        AssertionError: if the reconstruction does not have the input's
            shape, or if the latent or reconstruction contains NaN/Inf.
    """
    print("\n" + "=" * 60)
    print("Test 2: Wavelet VAE")
    print("=" * 60)
    config = IRISConfig(
        latent_channels=16,
        latent_spatial=32,
        vae_channels=[32, 64, 128, 256],
    )
    vae = WaveletVAE(config)

    # Expected spatial path for a 256×256 input, per the original author's note:
    #   DWT: 3×256×256 → 12×128×128, conv_in → 32×128×128,
    #   Down1: 64×64, Down2: 32×32, Down3: 16×16.
    # NOTE(review): that yields a 16×16 latent, while latent_spatial=32 above —
    # confirm whether the VAE actually reads latent_spatial.
    x = torch.randn(2, 3, 256, 256)

    # mean/logvar are returned by encode() but not inspected by this test.
    z, _mean, _logvar = vae.encode(x)
    x_recon = vae.decode(z)

    print(f"  Input shape:   {list(x.shape)}")
    print(f"  Latent shape:  {list(z.shape)}")
    print(f"  Recon shape:   {list(x_recon.shape)}")
    print(f"  Compression:   {x.numel() / z.numel():.1f}×")

    # Without these checks the test would report PASSED for any output.
    assert x_recon.shape == x.shape, (
        f"Reconstruction shape {list(x_recon.shape)} != input shape {list(x.shape)}"
    )
    assert torch.isfinite(z).all(), "Latent contains NaN/Inf"
    assert torch.isfinite(x_recon).all(), "Reconstruction contains NaN/Inf"

    vae_params = sum(p.numel() for p in vae.parameters())
    print(f"  VAE params:    {vae_params:,}")
    # fp16 footprint: two bytes per parameter.
    print(f"  VAE memory:    {vae_params * 2 / 1024 / 1024:.1f} MB (fp16)")
    print("  ✅ PASSED")
    return True


def test_grfm():
    """Run a standalone GRFM forward/backward pass and check gradient flow.

    Feeds a random [B, H*W, D] tensor through GRFM, times the forward pass,
    back-propagates the sum of the output and verifies that every trainable
    parameter received a gradient.

    Returns:
        True when every check holds.

    Raises:
        AssertionError: if any trainable parameter has no gradient after
            the backward pass.
    """
    print("\n" + "=" * 60)
    print("Test 3: GRFM (Gated Recurrent Fourier Mixer)")
    print("=" * 60)
    config = IRISConfig(
        hidden_dim=256,
        num_heads=4,
        fourier_num_blocks=4,
        recurrence_dim=128,
        manhattan_window=8,
    )
    grfm = GRFM(config)

    B, H, W, D = 2, 8, 8, 256
    x = torch.randn(B, H * W, D)

    # perf_counter is monotonic and high-resolution, unlike wall-clock time.time.
    t0 = time.perf_counter()
    out = grfm(x, H, W)
    t1 = time.perf_counter()

    print(f"  Input:  [B={B}, N={H*W}, D={D}]")
    print(f"  Output: {list(out.shape)}")
    print(f"  Time:   {(t1-t0)*1000:.1f} ms")

    grfm_params = sum(p.numel() for p in grfm.parameters())
    print(f"  Params: {grfm_params:,}")

    # Test gradient flow
    loss = out.sum()
    loss.backward()
    missing = [
        name
        for name, p in grfm.named_parameters()
        if p.requires_grad and p.grad is None
    ]
    grad_ok = not missing
    print(f"  Gradients: {'✅ All flowing' if grad_ok else '❌ Some missing'}")
    # Previously a missing gradient was only printed and the test still passed.
    assert grad_ok, f"Parameters without gradients: {missing}"
    print("  ✅ PASSED")
    return True


def test_generator_forward():
    """Run IRISGenerator forward at several iteration counts.

    Builds a small IRISGenerator, feeds it a random latent, random timesteps
    and random text tokens, and checks for every tested iteration count that
    the prediction has the same shape as the input latent.

    Returns:
        True when every check holds.

    Raises:
        AssertionError: if the output shape differs from the latent shape
            for any tested iteration count.
    """
    print("\n" + "=" * 60)
    print("Test 4: Generator Forward Pass")
    print("=" * 60)
    config = IRISConfig(
        latent_channels=8,
        latent_spatial=8,
        hidden_dim=256,
        num_heads=4,
        head_dim=64,
        num_prelude_blocks=1,
        num_core_layers=2,
        num_coda_blocks=1,
        default_iterations=4,
        fourier_num_blocks=4,
        recurrence_dim=128,
        manhattan_window=8,
        text_dim=768,
        patch_size=2,
    )
    gen = IRISGenerator(config)

    B = 2
    z_t = torch.randn(B, config.latent_channels, config.latent_spatial, config.latent_spatial)
    t = torch.rand(B)
    text_tokens = torch.randn(B, 77, config.text_dim)

    # Test different iteration counts
    for r in (2, 4, 8):
        t0 = time.perf_counter()
        v_pred = gen(z_t, t, text_tokens, num_iterations=r)
        elapsed_ms = 1000 * (time.perf_counter() - t0)
        print(f"  r={r:2d}: output={list(v_pred.shape)}, time={elapsed_ms:.0f}ms")
        # Check inside the loop so every iteration count is validated,
        # not just the last one.
        assert v_pred.shape == z_t.shape, (
            f"Output shape mismatch at r={r}: "
            f"{list(v_pred.shape)} != {list(z_t.shape)}"
        )

    gen_params = sum(p.numel() for p in gen.parameters())
    print(f"  Generator params: {gen_params:,}")
    print("  Note: Core block shared across all iterations!")
    print("  ✅ PASSED")
    return True


def test_training_step():
    """Run one IRIS training step (forward + backward) on random data.

    Calls ``model.train_step`` on a random image batch and random text
    tokens, prints the reported losses and timings, back-propagates the
    total loss and reports how many parameters received gradients.

    Returns:
        True when every check holds.

    Raises:
        AssertionError: if the loss is NaN/Inf, or if no parameter received
            a gradient from the backward pass.
    """
    print("\n" + "=" * 60)
    print("Test 5: Training Step")
    print("=" * 60)
    config = IRISConfig(
        latent_channels=8,
        latent_spatial=8,  # VAE with DWT + 3 down blocks: 128->DWT->64->32->16->8
        hidden_dim=256,
        num_heads=4,
        head_dim=64,
        num_prelude_blocks=1,
        num_core_layers=2,
        num_coda_blocks=1,
        default_iterations=4,
        fourier_num_blocks=4,
        recurrence_dim=128,
        manhattan_window=8,
        text_dim=768,
        patch_size=2,
        vae_channels=[16, 32, 64, 128],
    )
    model = IRIS(config)

    # Simulate training
    B = 2
    # Input image size: 128×128
    # DWT: 128→64 (×12 channels), Down×3: 64→32→16→8
    # So latent is 8×8 with latent_channels
    images = torch.randn(B, 3, 128, 128)
    text_tokens = torch.randn(B, 77, config.text_dim)

    # Forward
    t0 = time.perf_counter()
    result = model.train_step(images, text_tokens, num_iterations=4)
    t1 = time.perf_counter()

    loss = result['loss']
    print(f"  Loss:           {loss.item():.4f}")
    print(f"  Velocity loss:  {result['velocity_loss']:.4f}")
    print(f"  KL loss:        {result['kl_loss']:.4f}")
    print(f"  Mean t:         {result['mean_t']:.3f}")
    print(f"  Time:           {(t1-t0)*1000:.0f} ms")

    # A NaN/Inf loss would otherwise be printed and reported as a pass.
    assert torch.isfinite(loss).all(), f"Non-finite training loss: {loss.item()}"

    # Backward
    t0 = time.perf_counter()
    loss.backward()
    t1 = time.perf_counter()
    print(f"  Backward time:  {(t1-t0)*1000:.0f} ms")

    # Check gradients
    n_grads = sum(1 for p in model.parameters() if p.grad is not None)
    n_params = sum(1 for p in model.parameters())
    print(f"  Gradients: {n_grads}/{n_params} params have gradients")
    # Full coverage is deliberately not required: some parameters may not
    # take part in the training graph. Zero coverage, however, means the
    # loss is disconnected from the model.
    assert n_grads > 0, "Backward pass produced no gradients"
    print("  ✅ PASSED")
    return True


def test_generation():
    """Run the full IRIS generation pipeline under several settings.

    Calls ``model.generate`` in eval mode with random text tokens for a few
    (num_steps, num_iterations) combinations and checks that each call
    returns a batch of 3x128x128 images.

    Returns:
        True when every check holds.

    Raises:
        AssertionError: if any setting produces an output whose shape is
            not (B, 3, 128, 128).
    """
    print("\n" + "=" * 60)
    print("Test 6: Image Generation Pipeline")
    print("=" * 60)
    config = IRISConfig(
        latent_channels=8,
        latent_spatial=8,
        hidden_dim=256,
        num_heads=4,
        head_dim=64,
        num_prelude_blocks=1,
        num_core_layers=2,
        num_coda_blocks=1,
        default_iterations=4,
        fourier_num_blocks=4,
        recurrence_dim=128,
        manhattan_window=8,
        text_dim=768,
        patch_size=2,
        vae_channels=[16, 32, 64, 128],
    )
    model = IRIS(config)
    model.eval()

    B = 2
    text_tokens = torch.randn(B, 77, config.text_dim)
    expected_shape = (B, 3, 128, 128)

    # Generate with different settings
    for steps, iters in [(1, 4), (4, 4), (4, 8)]:
        t0 = time.perf_counter()
        with torch.no_grad():
            images = model.generate(
                text_tokens,
                num_steps=steps,
                num_iterations=iters,
                cfg_scale=1.0,  # No CFG for speed test
                seed=42,
            )
        t1 = time.perf_counter()
        print(f"  steps={steps}, iters={iters}: shape={list(images.shape)}, "
              f"range=[{images.min():.2f}, {images.max():.2f}], time={1000*(t1-t0):.0f}ms")
        # Validate every setting, not only the last one run.
        assert images.shape == expected_shape, (
            f"Unexpected output shape: {images.shape} "
            f"(steps={steps}, iters={iters})"
        )

    print("  ✅ PASSED")
    return True


def test_adaptive_compute():
    """Check that the iteration budget changes the generated image.

    Generates images from the same text tokens and seed at several
    ``num_iterations`` values and asserts that the outputs differ.

    Returns:
        True when every check holds.

    Raises:
        AssertionError: if r=4 and r=8, or r=8 and r=12, give identical
            outputs.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("Test 7: Adaptive Compute Budget")
    print(rule)

    settings = dict(
        latent_channels=8,
        latent_spatial=8,
        hidden_dim=256,
        num_heads=4,
        head_dim=64,
        num_prelude_blocks=1,
        num_core_layers=2,
        num_coda_blocks=1,
        default_iterations=4,
        fourier_num_blocks=4,
        recurrence_dim=128,
        manhattan_window=8,
        text_dim=768,
        patch_size=2,
        vae_channels=[16, 32, 64, 128],
    )
    config = IRISConfig(**settings)
    model = IRIS(config)
    model.eval()

    text_tokens = torch.randn(1, 77, config.text_dim)

    # Per the original author's note: an untrained model has zero-initialised
    # adaLN gates, so the core barely affects the output. Randomise the output
    # projection and every core parameter whose name contains 'adaln' to mimic
    # a partially trained model.
    with torch.no_grad():
        model.generator.output_proj.weight.normal_(0, 0.02)
        core_named = model.generator.core.named_parameters()
        for _, gate in ((n, p) for n, p in core_named if 'adaln' in n):
            gate.normal_(0, 0.1)

    # Same tokens and seed for every budget; only num_iterations varies.
    with torch.no_grad():
        results = {
            budget: model.generate(text_tokens, num_steps=2, num_iterations=budget,
                                   cfg_scale=1.0, seed=42)
            for budget in (2, 4, 8, 12)
        }

    def mean_abs_gap(a, b):
        # Mean absolute per-element difference between two budgets' outputs.
        return (results[a] - results[b]).abs().mean().item()

    diff_4_8 = mean_abs_gap(4, 8)
    diff_8_12 = mean_abs_gap(8, 12)
    diff_2_12 = mean_abs_gap(2, 12)

    print(f"  Diff(r=4, r=8):   {diff_4_8:.4f}")
    print(f"  Diff(r=8, r=12):  {diff_8_12:.4f}")
    print(f"  Diff(r=2, r=12):  {diff_2_12:.4f}")
    # Informational only: a warning marker here does not fail the test.
    refinement_mark = '✅' if diff_2_12 > diff_8_12 else '⚠️'
    print(f"  More iterations → more refinement: {refinement_mark}")

    # Outputs at different budgets must not be identical.
    assert diff_4_8 > 0, "r=4 and r=8 should differ"
    assert diff_8_12 > 0, "r=8 and r=12 should differ"
    print("  ✅ PASSED")
    return True


def test_memory_profile():
    """Print a parameter and fp16 memory breakdown for the small presets.

    For IRIS-Tiny and IRIS-Small, counts parameters of the VAE, the
    generator and the generator's prelude/core/coda, converts counts to
    fp16 megabytes, adds fixed estimates for the text encoder, activations
    and system overhead, and reports whether the total is under 3000 and
    4000 MB. The function makes no assertions; it is a report.

    Returns:
        True once both presets have been reported.
    """
    print("\n" + "=" * 60)
    print("Test 8: Memory Profile for Mobile Deployment")
    print("=" * 60)

    def num_params(module):
        # Total number of scalar parameters held by a module.
        return sum(p.numel() for p in module.parameters())

    def fp16_mb(count):
        # Two bytes per parameter in fp16, expressed in units of 1024*1024 bytes.
        return count * 2 / 1024 / 1024

    # Hard-coded estimates (not measured here), identical for every preset.
    text_enc_mb = 156  # CLIP-L/14 text encoder
    activation_mb = 50  # Single iteration buffer
    overhead_mb = 300  # OS + framework

    for name, create_fn in [("IRIS-Tiny", create_iris_tiny),
                            ("IRIS-Small", create_iris_small)]:
        model = create_fn()

        # Component-wise analysis
        vae_params = num_params(model.vae)
        gen_params = num_params(model.generator)

        # Core block (shared) — this is the key
        core_params = num_params(model.generator.core)
        prelude_params = num_params(model.generator.prelude)
        coda_params = num_params(model.generator.coda)

        vae_mb = fp16_mb(vae_params)
        gen_mb = fp16_mb(gen_params)

        # Estimate total inference memory (fp16)
        model_mb = fp16_mb(vae_params + gen_params)
        total_mb = model_mb + text_enc_mb + activation_mb + overhead_mb

        print(f"\n  {name}:")
        print(f"    VAE:       {vae_params:>10,} params = {vae_mb:>6.1f} MB")
        print(f"    Generator: {gen_params:>10,} params = {gen_mb:>6.1f} MB")
        print(f"      Prelude: {prelude_params:>10,}")
        print(f"      Core:    {core_params:>10,} (shared, iterated r times)")
        print(f"      Coda:    {coda_params:>10,}")
        print("    ────────────────────────────────")
        print(f"    Model total:     {model_mb:>6.1f} MB (fp16)")
        print(f"    + CLIP-L/14:     {text_enc_mb:>6.1f} MB")
        print(f"    + Activations:   {activation_mb:>6.1f} MB")
        print(f"    + OS overhead:   {overhead_mb:>6.1f} MB")
        print("    ═══════════════════════════════")
        print(f"    TOTAL INFERENCE: {total_mb:>6.1f} MB")
        print(f"    Fits in 3GB:     {'✅ YES' if total_mb < 3000 else '❌ NO'}")
        print(f"    Fits in 4GB:     {'✅ YES' if total_mb < 4000 else '❌ NO'}")

    print("\n  ✅ PASSED")
    return True


def test_effective_depth():
    """Report effective depth and parameter count versus iteration count.

    Uses the IRIS-Small preset. For several iteration counts r it prints the
    number of layers traversed (prelude + r x core layers + coda) and a
    conceptual "effective" parameter count in which the shared core is
    counted once per iteration. The function makes no assertions.

    Returns:
        True once the report has been printed.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("Test 9: Effective Depth Analysis")
    print(rule)

    model = create_iris_small()
    config = model.config

    # Unique parameters
    core_params = sum(p.numel() for p in model.generator.core.parameters())
    total_unique = sum(p.numel() for p in model.parameters())

    n_prelude = config.num_prelude_blocks
    n_core = config.num_core_layers
    n_coda = config.num_coda_blocks

    print(f"  Architecture: Prelude({n_prelude}) → "
          f"Core({n_core} layers × r iters) → "
          f"Coda({n_coda})")
    print(f"  Unique params: {total_unique:,}")
    print(f"  Core params:   {core_params:,} (shared)")
    print()

    unique_millions = total_unique / 1e6
    for r in (4, 8, 12, 16):
        depth = n_prelude + r * n_core + n_coda
        # Conceptual equivalent: every iteration beyond the first counts the
        # shared core's parameters again.
        unrolled_params = total_unique + (r - 1) * core_params

        print(f"  r={r:2d}: {depth} effective layers, "
              f"~{unrolled_params/1e6:.0f}M effective params, "
              f"from {unique_millions:.0f}M unique")

    # Headline ratio for r=16, i.e. 15 extra passes through the core.
    capacity_ratio = (total_unique + 15 * core_params) / total_unique
    print(f"\n  → 16× iteration gives {capacity_ratio:.1f}× "
          f"effective capacity from same model!")
    print("  ✅ PASSED")
    return True


# Script entry point: run every validation test in order, report a summary,
# and exit with status 1 if any test failed.
if __name__ == "__main__":
    import traceback

    print("🔬 IRIS Architecture Validation Suite")
    print("=" * 60)

    tests = [
        test_wavelet_transform,
        test_vae,
        test_grfm,
        test_generator_forward,
        test_training_step,
        test_generation,
        test_adaptive_compute,
        test_memory_profile,
        test_effective_depth,
    ]

    passed = 0
    failed = 0
    for test in tests:
        try:
            ok = test()
        except Exception as e:
            # Top-level boundary: report the failure and keep running the
            # remaining tests.
            print(f"  ❌ FAILED: {e}")
            traceback.print_exc()
            failed += 1
        else:
            if ok:
                passed += 1
            else:
                # A falsy return used to be counted as neither passed nor
                # failed, letting the suite exit 0 with a test unaccounted for.
                print(f"  ❌ FAILED: {test.__name__} returned a falsy result")
                failed += 1

    print(f"\n{'=' * 60}")
    print(f"Results: {passed} passed, {failed} failed out of {len(tests)} tests")
    print(f"{'=' * 60}")

    if failed > 0:
        sys.exit(1)