"""
Comprehensive test suite for ViL Tracker.

16 tests covering all components:
1. mLSTM Cell (LinearHeadwiseExpand correctness + param count)
2. mLSTM Block (full block without MLP)
3. TMoE MLP
4. Backbone (standard, small depth)
5. Backbone (with TMoE + integrated FiLM, medium depth)
6. Prediction Heads
7. FiLM Temporal Modulation
8. Full Tracker (small depth for speed)
9. Loss Functions (all 6)
10. Kalman Filter (8-state, adaptive)
11. Dataset (synthetic)
12. Training Step (mini forward + backward with temporal)
13. Model Summary (FULL depth=24, constraint check)
14. Online Tracker (full inference pipeline)
15. Augmentation pipeline
16. ACL curriculum integration
"""

import sys
import time
import torch
import numpy as np

# Seed both random number generators up front so the randomly generated
# inputs used by the tests below are reproducible from run to run.
np.random.seed(42)
torch.manual_seed(42)

# Running tallies of passed / failed tests, updated by test().
PASS = FAIL = 0

def test(name, fn):
    """Run one test callable, report the outcome and update the global tallies.

    Args:
        name: Human-readable title printed in the progress output.
        fn: Zero-argument callable. The test passes if it returns without
            raising an ``Exception``.

    Any ``Exception`` raised by ``fn`` is caught, counted in ``FAIL`` and its
    traceback is printed, so the remaining tests still run. A successful run
    increments ``PASS``.
    """
    global PASS, FAIL
    print(f"\nTest {PASS + FAIL + 1}: {name}...", flush=True)
    try:
        # Only the test body lives inside the try. Previously the PASS
        # bookkeeping and the success print were here too, so an error raised
        # while *reporting* success was caught below and the same test was
        # counted as both passed and failed.
        fn()
    except Exception as e:
        FAIL += 1
        print(f"  ❌ FAILED: {e}")
        import traceback
        traceback.print_exc()
    else:
        PASS += 1
        print("  ✅ PASSED")


def count_params(model):
    """Return the total number of elements across ``model.parameters()``."""
    total = 0
    for tensor in model.parameters():
        total += tensor.numel()
    return total


# ============================================================
# Test 1: mLSTM Cell
# ============================================================
def test_mlstm_cell():
    """Check LinearHeadwiseExpand and mLSTMCell: parameter counts and shapes.

    Covers the head-wise projection's parameter count and output shape, the
    cell's total parameter range and its GroupNorm group count, and that a
    ``reverse=True`` pass produces a different output than the forward pass.
    """
    from vil_tracker.models.mlstm import LinearHeadwiseExpand, mLSTMCell

    # --- head-wise projection -------------------------------------------
    lhe = LinearHeadwiseExpand(768, num_heads=192, bias=False)
    lhe_params = count_params(lhe)
    # Expected count is 192 * 4 * 4 (768 features / 192 heads = 4 per head).
    assert 192 * 4 * 4 == lhe_params, f"LHE params: {lhe_params} != {192*4*4}"

    y = lhe(torch.randn(2, 10, 768))
    assert y.shape == (2, 10, 768), f"LHE output shape: {y.shape}"

    # --- full cell ------------------------------------------------------
    cell = mLSTMCell(dim=384, proj_factor=2.0, qkv_proj_blocksize=4, num_heads=4)
    cell_params = count_params(cell)
    print(f"  mLSTMCell params: {cell_params:,} ({cell_params/1e6:.3f}M)")

    # Budget window: the cell is expected to land around 920K parameters.
    assert 1_000_000 > cell_params, f"Cell has {cell_params:,} params (should be <1M)"
    assert 800_000 < cell_params, f"Cell has {cell_params:,} params (should be >800K)"

    # The output norm must group per projection head (192), not per head (4).
    assert cell.outnorm.num_groups == 192, f"GroupNorm should have 192 groups, got {cell.outnorm.num_groups}"
    print(f"  GroupNorm groups: {cell.outnorm.num_groups} (correct: per-projection-head)")

    x = torch.randn(2, 20, 384)
    y = cell(x)
    assert y.shape == (2, 20, 384), f"Cell output shape: {y.shape}"

    # --- reverse direction ----------------------------------------------
    y_rev = cell(x, reverse=True)
    assert y_rev.shape == (2, 20, 384), f"Reverse output shape: {y_rev.shape}"
    # The two scan directions must not collapse to the same result.
    directions_match = torch.allclose(y, y_rev, atol=1e-3)
    assert not directions_match, "Forward and reverse should differ"

test("mLSTM Cell (LinearHeadwiseExpand)", test_mlstm_cell)


# ============================================================
# Test 2: mLSTM Block
# ============================================================
def test_mlstm_block():
    """Check mLSTMBlock's parameter budget and that it preserves input shape.

    Also prints the mean absolute difference between output and input as an
    informational figure; that value is not asserted.
    """
    from vil_tracker.models.mlstm import mLSTMBlock

    block_kwargs = dict(
        dim=384,
        proj_factor=2.0,
        qkv_proj_blocksize=4,
        num_heads=4,
        mlp_ratio=4.0,
    )
    block = mLSTMBlock(**block_kwargs)
    params = count_params(block)
    print(f"  mLSTMBlock params: {params:,} ({params/1e6:.3f}M)")

    # The block has no separate MLP, so it should stay close to the cell's size.
    assert 1_050_000 > params, f"Block has {params:,} params (should be <1.05M without MLP)"

    x = torch.randn(2, 20, 384)
    y = block(x)
    assert y.shape == (2, 20, 384), f"Block output shape: {y.shape}"

    # Informational only: how far the output drifts from the input at init.
    residual = y - x
    diff = residual.abs().mean().item()
    print(f"  Residual diff from input: {diff:.4f}")

test("mLSTM Block (no separate MLP)", test_mlstm_block)


# ============================================================
# Test 3: TMoE MLP
# ============================================================
def test_tmoe():
    """Check TMoEMLP output shape and that its shared expert can be frozen."""
    from vil_tracker.models.backbone import TMoEMLP

    tmoe = TMoEMLP(dim=384, mlp_ratio=4.0, num_experts=4)
    params = count_params(tmoe)
    print(f"  TMoEMLP params: {params:,} ({params/1e6:.3f}M)")

    y = tmoe(torch.randn(2, 20, 384))
    assert y.shape == (2, 20, 384), f"TMoE output shape: {y.shape}"

    # After freezing, no parameter of the shared expert may require gradients.
    tmoe.freeze_shared_expert()
    fully_frozen = all(not p.requires_grad for p in tmoe.shared_expert.parameters())
    assert fully_frozen, "Shared expert should be fully frozen"

test("TMoE MLP", test_tmoe)


# ============================================================
# Test 4: Backbone (standard, small depth)
# ============================================================
def test_backbone_small():
    """Run a depth-4 ViLBackbone without TMoE and check both feature shapes."""
    from vil_tracker.models.backbone import ViLBackbone

    backbone_kwargs = {"dim": 384, "depth": 4, "patch_size": 16, "tmoe_blocks": 0}
    backbone = ViLBackbone(**backbone_kwargs)
    params = count_params(backbone)
    print(f"  Backbone (depth=4, no TMoE) params: {params:,} ({params/1e6:.3f}M)")

    # 128px template and 256px search image, batch of two.
    template = torch.randn(2, 3, 128, 128)
    search = torch.randn(2, 3, 256, 256)
    features = backbone(template, search)
    t_feat, s_feat = features

    # With 16px patches: (128/16)^2 = 64 and (256/16)^2 = 256 tokens.
    assert t_feat.shape == (2, 64, 384), f"Template feat shape: {t_feat.shape}"
    assert s_feat.shape == (2, 256, 384), f"Search feat shape: {s_feat.shape}"

test("Backbone (standard, depth=4)", test_backbone_small)


# ============================================================
# Test 5: Backbone with TMoE + integrated FiLM
# ============================================================
def test_backbone_tmoe_film():
    """Run a depth-6 backbone with TMoE blocks and a temporal modulation manager.

    The backbone is called twice with the same inputs and the same manager.
    Feature shapes are asserted; whether the second pass differs from the
    first is only printed, not asserted.
    """
    from vil_tracker.models.backbone import ViLBackbone
    from vil_tracker.models.film_temporal import TemporalModulationManager

    backbone = ViLBackbone(
        dim=384,
        depth=6,
        patch_size=16,
        tmoe_blocks=2,
        num_experts=4,
        film_interval=3,
    )
    params = count_params(backbone)
    print(f"  Backbone (depth=6, TMoE=2) params: {params:,} ({params/1e6:.3f}M)")

    # Manager that supplies the temporal (FiLM) modulation to the backbone.
    temporal_mod = TemporalModulationManager(dim=384, num_blocks=6, modulation_interval=3)

    template = torch.randn(1, 3, 128, 128)
    search = torch.randn(1, 3, 256, 256)

    def run_backbone():
        # Same inputs and manager on every call; only the manager's state can change.
        return backbone(template, search, temporal_mod_manager=temporal_mod)

    # Pass 1: the manager has no temporal context yet.
    t_feat, s_feat = run_backbone()
    assert t_feat.shape == (1, 64, 384), f"Template feat shape: {t_feat.shape}"
    assert s_feat.shape == (1, 256, 384), f"Search feat shape: {s_feat.shape}"

    # Pass 2: temporal context from pass 1 is expected to be available now.
    t_feat2, s_feat2 = run_backbone()
    assert t_feat2.shape == (1, 64, 384)
    print(f"  FiLM modulation active: features differ = {not torch.allclose(t_feat, t_feat2, atol=1e-5)}")

test("Backbone (TMoE + integrated FiLM)", test_backbone_tmoe_film)


# ============================================================
# Test 6: Prediction Heads
# ============================================================
def test_heads():
    """Exercise the center head, the uncertainty head and box decoding.

    Verifies the shapes of the heatmap / size / offset maps, decodes boxes
    with and without a Hanning window, sanity-checks the window's centre and
    corner values, and checks the uncertainty head's output shape.
    """
    from vil_tracker.models.heads import (
        CenterHead,
        UncertaintyHead,
        create_hanning_window,
        decode_predictions,
    )

    center_head = CenterHead(dim=384, feat_size=16)
    unc_head = UncertaintyHead(dim=384, feat_size=16)

    print(f"  CenterHead params: {count_params(center_head):,}")
    print(f"  UncertaintyHead params: {count_params(unc_head):,}")

    # 256 search tokens correspond to the 16x16 feature grid.
    search_feat = torch.randn(2, 256, 384)
    preds = center_head(search_feat)

    assert preds['heatmap'].shape == (2, 1, 16, 16), f"Heatmap shape: {preds['heatmap'].shape}"
    assert preds['size'].shape == (2, 2, 16, 16), f"Size shape: {preds['size'].shape}"
    assert preds['offset'].shape == (2, 2, 16, 16), f"Offset shape: {preds['offset'].shape}"

    # The same three maps feed both decoding variants below.
    maps = (preds['heatmap'], preds['size'], preds['offset'])

    # Plain decoding, no spatial prior.
    boxes, scores = decode_predictions(*maps)
    assert boxes.shape == (2, 4), f"Boxes shape: {boxes.shape}"
    assert scores.shape == (2,), f"Scores shape: {scores.shape}"

    # Hanning window: close to 1 in the middle, close to 0 in the corner.
    hann = create_hanning_window(16)
    assert hann.shape == (16, 16), f"Hanning shape: {hann.shape}"
    assert abs(hann[8, 8].item() - 1.0) < 0.05, f"Hanning center should be ~1.0, got {hann[8, 8]}"
    assert hann[0, 0].item() < 0.01, f"Hanning corner should be ~0, got {hann[0, 0]}"

    # Decoding again, this time with the window applied.
    boxes_h, scores_h = decode_predictions(*maps, hanning_window=hann)
    assert boxes_h.shape == (2, 4), f"Hanning boxes shape: {boxes_h.shape}"
    print(f"  Hanning window: center={hann[8,8]:.3f}, corner={hann[0,0]:.6f}")
    print(f"  Without Hanning: box={boxes[0].tolist()}, score={scores[0].item():.4f}")
    print(f"  With Hanning:    box={boxes_h[0].tolist()}, score={scores_h[0].item():.4f}")

    # Uncertainty head: one log-variance value per grid cell.
    log_var = unc_head(search_feat)
    assert log_var.shape == (2, 1, 16, 16), f"Log variance shape: {log_var.shape}"

test("Prediction Heads", test_heads)


# ============================================================
# Test 7: FiLM Temporal Modulation
# ============================================================
def test_film():
    """Check the FiLM temporal modulation components and their manager.

    Covers the reliability calibrator's output shape and [0, 1] range, the
    FiLM module's output shape, and the manager's pass-through behaviour
    before any temporal context exists and again after ``reset()``.
    """
    from vil_tracker.models.film_temporal import (
        FiLMTemporalModulation,
        TemporalModulationManager,
        TemporalReliabilityCalibrator,
    )

    # --- standalone modules ---------------------------------------------
    calib = TemporalReliabilityCalibrator(384)
    film = FiLMTemporalModulation(384)

    x = torch.randn(2, 20, 384)
    tc = torch.randn(2, 20, 384)

    rel = calib(tc)
    assert rel.shape == (2, 20, 1), f"Reliability shape: {rel.shape}"
    above_zero = (rel >= 0).all()
    below_one = (rel <= 1).all()
    assert above_zero and below_one, "Reliability not in [0,1]"

    modulated = film(x, tc, rel)
    assert modulated.shape == (2, 20, 384), f"Modulated shape: {modulated.shape}"

    # --- manager ----------------------------------------------------------
    manager = TemporalModulationManager(dim=384, num_blocks=24, modulation_interval=6)
    print(f"  TemporalModulationManager params: {count_params(manager):,}")

    # No context stored yet: the input must come back unchanged.
    y = manager.modulate(x, block_idx=5)
    assert torch.allclose(y, x), "Should return unchanged without temporal context"

    # With context stored, block 5 is a modulation point ((5 + 1) % 6 == 0).
    manager.update_temporal_context(x)
    y = manager.modulate(x, block_idx=5)
    assert y.shape == (2, 20, 384)

    # reset() must drop the context, restoring the pass-through behaviour.
    manager.reset()
    y = manager.modulate(x, block_idx=5)
    assert torch.allclose(y, x), "After reset, should return unchanged"

test("FiLM Temporal Modulation", test_film)


# ============================================================
# Test 8: Full Tracker (small depth for speed)
# ============================================================
def test_full_tracker_small():
    """Run a depth-4 ViLTracker on a single frame and on a K-frame clip.

    The single-frame call uses ``use_temporal=False`` and the clip call uses
    ``use_temporal=True``; output shapes are asserted for both, then the
    tracker's temporal state is reset.
    """
    from vil_tracker.models.tracker import ViLTracker, get_default_config

    # Shrink the default configuration so the test stays fast.
    config = get_default_config()
    for key, value in (('depth', 4), ('tmoe_blocks', 1), ('film_interval', 2)):
        config[key] = value

    tracker = ViLTracker(config)
    params = count_params(tracker)
    print(f"  Tracker (depth=4) params: {params:,} ({params/1e6:.3f}M)")

    B, K = 2, 3
    template = torch.randn(B, 3, 128, 128)

    # --- single search frame (backward-compatible call) -------------------
    search_single = torch.randn(B, 3, 256, 256)
    output_s = tracker(template, search_single, use_temporal=False)
    assert output_s['heatmap'].shape == (B, 1, 16, 16), f"Single heatmap: {output_s['heatmap'].shape}"
    assert output_s['boxes'].shape == (B, 4), f"Single boxes: {output_s['boxes'].shape}"
    assert output_s['scores'].shape == (B,), f"Single scores: {output_s['scores'].shape}"
    print(f"  Single-frame: boxes={output_s['boxes'][0].tolist()}")

    # --- clip of K search frames ----------------------------------------
    searches = torch.randn(B, K, 3, 256, 256)
    output_m = tracker(template, searches, use_temporal=True)
    assert output_m['heatmap'].shape == (B, K, 1, 16, 16), f"Multi heatmap: {output_m['heatmap'].shape}"
    assert output_m['boxes'].shape == (B, K, 4), f"Multi boxes: {output_m['boxes'].shape}"
    assert output_m['scores'].shape == (B, K), f"Multi scores: {output_m['scores'].shape}"
    assert output_m['search_feats'].shape == (B, K, 256, 384), f"Multi feats: {output_m['search_feats'].shape}"
    print(f"  Multi-frame (K={K}): frame 0 box={output_m['boxes'][0,0].tolist()}")
    print(f"                       frame 2 box={output_m['boxes'][0,2].tolist()}")

    # Clear the temporal state accumulated by the clip pass.
    tracker.reset_temporal()

test("Full Tracker (single + multi-frame)", test_full_tracker_small)


# ============================================================
# Test 9: Loss Functions (all 6)
# ============================================================
def test_losses():
    """Smoke-test every loss module on small synthetic inputs.

    Each individual loss (focal, GIoU, uncertainty NLL, contrastive, AFKD
    distillation, ADW) is evaluated once and printed; most are checked for
    positivity or range. Finally the combined tracking loss is evaluated and
    its ``'total'`` entry must be positive.
    """
    from vil_tracker.training.losses import (
        ADWLoss,
        AFKDDistillationLoss,
        CombinedTrackingLoss,
        FocalLoss,
        GIoULoss,
        MemoryContrastiveLoss,
        UncertaintyNLLLoss,
    )

    B = 4

    # Focal: random predictions against a heatmap with a single peak at (8, 8).
    focal = FocalLoss()
    pred_hm = torch.randn(B, 1, 16, 16)
    gt_hm = torch.zeros(B, 1, 16, 16)
    gt_hm[..., 8, 8] = 1.0
    fl = focal(pred_hm, gt_hm)
    print(f"  Focal loss: {fl.item():.4f}")
    assert fl.item() > 0, "Focal loss should be positive"

    # GIoU: two nearly overlapping boxes, repeated across the batch.
    giou = GIoULoss()
    pred_box = torch.tensor([[128.0, 128.0, 50.0, 50.0] for _ in range(B)])
    gt_box = torch.tensor([[130.0, 130.0, 48.0, 48.0] for _ in range(B)])
    gl = giou(pred_box, gt_box)
    print(f"  GIoU loss: {gl.item():.4f}")
    assert 0 <= gl.item() <= 2, f"GIoU loss out of range: {gl.item()}"

    # Uncertainty NLL: zero log-variance, i.e. unit variance.
    unc = UncertaintyNLLLoss()
    pred_v = torch.randn(B, 4)
    target_v = torch.randn(B, 4)
    log_var = torch.zeros(B, 4)
    ul = unc(pred_v, target_v, log_var)
    print(f"  Uncertainty NLL loss: {ul.item():.4f}")
    assert ul.item() > 0

    # Contrastive: the second view is a lightly perturbed copy of the first.
    contrastive = MemoryContrastiveLoss()
    feat_a = torch.randn(B, 384)
    feat_b = feat_a + torch.randn(B, 384) * 0.1
    cl = contrastive(feat_a, feat_b)
    print(f"  Contrastive loss: {cl.item():.4f}")

    # AFKD distillation: student and teacher features of different widths.
    afkd = AFKDDistillationLoss(student_dim=384, teacher_dim=768)
    student_feat = torch.randn(B, 256, 384)
    teacher_feat = torch.randn(B, 256, 768)
    dl = afkd(student_feat, teacher_feat)
    print(f"  AFKD distillation loss: {dl.item():.4f}")
    assert dl.item() > 0

    # ADW: combines three scalar task losses.
    adw = ADWLoss(num_tasks=3)
    losses = [torch.tensor(value) for value in (1.0, 0.5, 2.0)]
    al = adw(losses)
    print(f"  ADW loss: {al.item():.4f}")

    # Combined: prediction dict plus heatmap / size / box ground truth.
    combined = CombinedTrackingLoss()
    pred = {
        'heatmap': pred_hm,
        'size': torch.rand(B, 2, 16, 16),
        'boxes': pred_box,
        'log_variance': torch.randn(B, 1, 16, 16),
    }
    gt_size = torch.tensor([[0.2, 0.2] for _ in range(B)])
    loss_dict = combined(pred, gt_hm, gt_size, gt_box)
    print(f"  Combined loss: {loss_dict['total'].item():.4f}")
    assert loss_dict['total'].item() > 0

test("Loss Functions (all 6)", test_losses)


# ============================================================
# Test 10: Kalman Filter
# ============================================================
def test_kalman():
    """Drive the KalmanFilter through init, ten predict/update cycles and an outlier.

    Asserts that the filter reports itself initialised only after
    ``initialize``, that predictions have four entries, that the final
    width/height stay positive, and that one far-away measurement does not
    drag the centre x beyond 200.
    """
    from vil_tracker.inference.kalman import KalmanFilter

    kf = KalmanFilter()
    assert not kf.initialized

    # Seed the filter with the first box.
    init_box = np.array([100.0, 100.0, 50.0, 50.0])
    kf.initialize(init_box)
    assert kf.initialized

    # Target drifts by (2, 1) px per step; measurements carry Gaussian noise.
    for step in range(10):
        pred = kf.predict()
        assert len(pred) == 4, f"Prediction length: {len(pred)}"

        noise = np.random.randn(4) * 2
        drift = np.array([step * 2, step * 1, 0, 0])
        meas = init_box + drift + noise
        kf.update(meas, uncertainty=1.0)

    state = kf.get_state()
    print(f"  Final state: cx={state[0]:.1f}, cy={state[1]:.1f}, w={state[2]:.1f}, h={state[3]:.1f}")
    assert state[2] > 0 and state[3] > 0, "Width/height should be positive"

    # Feed one measurement far from the track; it should be gated out.
    outlier = np.array([500.0, 500.0, 50.0, 50.0])
    kf.update(outlier, uncertainty=1.0)
    state_after = kf.get_state()
    # The estimate must not have jumped towards (500, 500).
    assert state_after[0] < 200, f"Outlier should be rejected, cx={state_after[0]}"

test("Kalman Filter (8-state, adaptive)", test_kalman)


# ============================================================
# Test 11: Dataset (synthetic)
# ============================================================
def test_dataset():
    """Check the synthetic dataset's sample layout and the TrackingDataset alias.

    Asserts the dataset length and the shape of every tensor in a sample.
    Target motion and the effect of the ACL difficulty setting are printed
    for inspection only. Finally the ``TrackingDataset(synthetic=True, ...)``
    entry point is checked for length and clip length.
    """
    from vil_tracker.data.dataset import SyntheticTrackingDataset, TrackingDataset

    ds = SyntheticTrackingDataset(length=100, clip_length=3)
    assert 100 == len(ds)

    # One sample is a template plus a 3-frame clip with per-frame targets.
    sample = ds[0]
    assert sample['template'].shape == (3, 128, 128), f"Template shape: {sample['template'].shape}"
    assert sample['searches'].shape == (3, 3, 256, 256), f"Searches shape: {sample['searches'].shape}"
    assert sample['heatmaps'].shape == (3, 1, 16, 16), f"Heatmaps shape: {sample['heatmaps'].shape}"
    assert sample['sizes'].shape == (3, 2), f"Sizes shape: {sample['sizes'].shape}"
    assert sample['boxes'].shape == (3, 4), f"Boxes shape: {sample['boxes'].shape}"

    # Informational: centre x of the first and last frame of the clip.
    first_box, last_box = sample['boxes'][0], sample['boxes'][2]
    cx_f0 = first_box[0].item()
    cx_f2 = last_box[0].item()
    print(f"  Frame 0 cx: {cx_f0:.1f}, Frame 2 cx: {cx_f2:.1f} (moving target)")

    # Informational: same index drawn at the lowest and highest difficulty.
    ds.set_acl_difficulty(0.0)
    easy_sample = ds[42]
    ds.set_acl_difficulty(1.0)
    hard_sample = ds[42]
    print(f"  Easy frame spread: {(easy_sample['boxes'][:, 0].max() - easy_sample['boxes'][:, 0].min()).item():.1f} px")
    print(f"  Hard frame spread: {(hard_sample['boxes'][:, 0].max() - hard_sample['boxes'][:, 0].min()).item():.1f} px")

    # Backward-compatible entry point.
    alias_ds = TrackingDataset(synthetic=True, synthetic_length=50, clip_length=3)
    assert 50 == len(alias_ds)
    alias_sample = alias_ds[0]
    assert alias_sample['searches'].shape[0] == 3, "Clip length should be 3"

test("Dataset (synthetic + backward compat)", test_dataset)


# ============================================================
# Test 12: Training Step (with temporal modulation)
# ============================================================
def test_training_step():
    """Run one optimisation step of a depth-2 tracker on a K-frame clip.

    The forward pass uses ``use_temporal=True``. The tracking loss is averaged
    over the K frames and a contrastive term (weight 0.1) between pooled
    template features and pooled last-frame search features is added before
    backward and an AdamW step. The test asserts the output shapes, a positive
    total loss, and that at least one parameter received a gradient.
    """
    from vil_tracker.models.tracker import ViLTracker, get_default_config
    from vil_tracker.training.losses import CombinedTrackingLoss, MemoryContrastiveLoss
    from vil_tracker.models.heads import generate_heatmap

    # Tiny configuration to keep the step cheap.
    config = get_default_config()
    for key, value in (('depth', 2), ('tmoe_blocks', 0), ('film_interval', 2)):
        config[key] = value

    model = ViLTracker(config)
    model.train()
    loss_fn = CombinedTrackingLoss()
    contrastive_loss = MemoryContrastiveLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

    B, K = 2, 3
    template = torch.randn(B, 3, 128, 128)
    searches = torch.randn(B, K, 3, 256, 256)

    # Ground truth for each of the K frames: peak at the grid centre.
    gt_heatmaps = torch.zeros(B, K, 1, 16, 16)
    gt_heatmaps[..., 8, 8] = 1.0
    gt_sizes = torch.tensor([[[0.2, 0.3] for _ in range(K)] for _ in range(B)])
    gt_boxes = torch.tensor([[[128.0, 128.0, 51.2, 76.8] for _ in range(K)] for _ in range(B)])

    # Multi-frame forward pass with temporal modulation enabled.
    pred = model(template, searches, use_temporal=True)

    assert pred['heatmap'].shape == (B, K, 1, 16, 16), f"Heatmap shape: {pred['heatmap'].shape}"
    assert pred['boxes'].shape == (B, K, 4), f"Boxes shape: {pred['boxes'].shape}"
    assert pred['scores'].shape == (B, K), f"Scores shape: {pred['scores'].shape}"
    assert pred['search_feats'].shape == (B, K, 256, 384), f"Search feats: {pred['search_feats'].shape}"

    # Sum the per-frame tracking loss, then average over the clip.
    total_loss = torch.tensor(0.0)
    for k in range(K):
        pred_k = {key: pred[key][:, k] for key in ('heatmap', 'size', 'boxes')}
        if 'log_variance' in pred:
            pred_k['log_variance'] = pred['log_variance'][:, k]
        loss_dict = loss_fn(pred_k, gt_heatmaps[:, k], gt_sizes[:, k], gt_boxes[:, k])
        total_loss = total_loss + loss_dict['total']
    total_loss = total_loss / K

    # Contrastive term: mean-pooled template vs. mean-pooled last search frame.
    t_pooled = pred['template_feat'].mean(dim=1)
    s_pooled = pred['search_feats'][:, -1].mean(dim=1)
    c_loss = contrastive_loss(t_pooled, s_pooled)
    total_loss = total_loss + 0.1 * c_loss

    total_loss.backward()

    # Count gradients before zero_grad() below can clear them.
    has_grads = len([p for p in model.parameters() if p.grad is not None])
    total_params_count = len(list(model.parameters()))
    print(f"  Total loss: {total_loss.item():.4f} (K={K} frames, contr={c_loss.item():.4f})")
    print(f"  Params with gradients: {has_grads}/{total_params_count}")

    optimizer.step()
    optimizer.zero_grad()

    assert total_loss.item() > 0
    assert has_grads > 0

test("Training Step (K=3 sequence + contrastive)", test_training_step)


# ============================================================
# Test 13: Model Summary (FULL depth=24, constraint check)
# ============================================================
def test_model_summary():
    """Build the default-config tracker and enforce its size constraints.

    The parameter-count and FP16-size flags reported by ``print_model_summary``
    are hard requirements. The GFLOPs flag only produces a warning because the
    estimate is approximate.
    """
    from vil_tracker.models.tracker import ViLTracker, get_default_config
    from vil_tracker.utils.helpers import print_model_summary

    config = get_default_config()
    summary = print_model_summary(ViLTracker(config), config)

    total_m = summary['total_params'] / 1e6

    # Hard constraints: these fail the test.
    assert summary['param_ok'], f"FAIL: {total_m:.2f}M params exceeds 50M limit"
    assert summary['size_ok'], f"FAIL: {summary['size_fp16_mb']:.1f}MB exceeds 500MB limit"

    # Soft constraint: warn but do not fail.
    if summary['flop_ok']:
        return
    print(f"  ⚠️  GFLOPs estimate ({summary['gflops']:.2f}) exceeds 20, but this is approximate")

test("Model Summary (full depth=24)", test_model_summary)


# ============================================================
# Test 14: Online Tracker (inference pipeline)
# ============================================================
def test_online_tracker():
    """Run the OnlineTracker end to end on a short synthetic sequence.

    A depth-2 ViLTracker in eval mode is wrapped in an OnlineTracker,
    initialised on a 480x640 noise frame containing a solid rectangle, then
    asked to track five further frames in which the rectangle shifts by
    (5, 3) pixels per frame. Only the format of each returned box is checked
    (four numeric values); tracking accuracy is not asserted.
    """
    from vil_tracker.models.tracker import ViLTracker, get_default_config
    from vil_tracker.inference.online_tracker import OnlineTracker

    config = get_default_config()
    config['depth'] = 2
    config['tmoe_blocks'] = 0
    config['film_interval'] = 2

    model = ViLTracker(config)
    model.eval()

    tracker = OnlineTracker(model, device='cpu', template_size=128, search_size=256)

    # Simulate a sequence: 480x640 frames with a moving rectangle
    H, W = 480, 640
    init_bbox = [200, 200, 60, 80]  # [x, y, w, h]

    # First frame. randint's upper bound is exclusive, so 256 (not 255) is
    # needed to cover the full uint8 range 0..255.
    frame0 = np.random.randint(0, 256, (H, W, 3), dtype=np.uint8)
    # Draw target
    x, y, w, h = init_bbox
    frame0[y:y+h, x:x+w] = [255, 0, 0]  # Red rectangle

    tracker.initialize(frame0, init_bbox)

    # Track for 5 frames
    for i in range(1, 6):
        frame = np.random.randint(0, 256, (H, W, 3), dtype=np.uint8)
        # Move target
        nx = x + i * 5
        ny = y + i * 3
        frame[ny:ny+h, nx:nx+w] = [255, 0, 0]

        bbox = tracker.track(frame)
        assert len(bbox) == 4, f"Bbox should have 4 elements, got {len(bbox)}"
        # Accept Python numbers and NumPy scalars alike; NumPy integer scalars
        # are not instances of int, so they need their own entry.
        assert all(
            isinstance(v, (int, float, np.integer, np.floating)) for v in bbox
        ), f"Bbox values: {bbox}"
        print(f"  Frame {i}: predicted [{bbox[0]:.1f}, {bbox[1]:.1f}, {bbox[2]:.1f}, {bbox[3]:.1f}]")

    print(f"  Online tracker completed 5-frame sequence")

test("Online Tracker (inference pipeline)", test_online_tracker)


# ============================================================
# Test 15: Augmentation pipeline
# ============================================================
def test_augmentation():
    """Check that TrackingAugmentation keeps shapes and mirrors the bbox on flip.

    The horizontal flip is forced (probability 1.0) while grayscale and blur
    are disabled. The bounding box is deliberately placed off-centre: a box
    centred at x = W / 2 maps onto itself under a horizontal flip, so it
    could not reveal a missing bbox update.
    """
    from vil_tracker.data.dataset import TrackingAugmentation

    aug = TrackingAugmentation(
        brightness=0.2,
        contrast=0.2,
        horizontal_flip_prob=1.0,  # Force flip to test bbox update
        grayscale_prob=0.0,
        blur_prob=0.0,
    )

    template = torch.rand(3, 128, 128)
    search = torch.rand(3, 256, 256)
    bbox = torch.tensor([100.0, 128.0, 50.0, 50.0])  # [cx, cy, w, h], off-centre in x

    # Expected mirrored centre: new_cx = W - old_cx = 256 - 100 = 156.
    # Computed before the call in case the augmentation modifies bbox in place.
    original_cx = bbox[0].item()
    expected_cx = search.shape[-1] - original_cx

    t_aug, s_aug, b_aug = aug(template, search, bbox)

    assert t_aug.shape == (3, 128, 128), f"Aug template shape: {t_aug.shape}"
    assert s_aug.shape == (3, 256, 256), f"Aug search shape: {s_aug.shape}"
    assert b_aug.shape == (4,), f"Aug bbox shape: {b_aug.shape}"

    print(f"  Original bbox: {bbox.tolist()}")
    print(f"  Augmented bbox: {b_aug.tolist()}")
    assert abs(b_aug[0].item() - expected_cx) < 1.0, (
        f"Flipped cx should be ~{expected_cx:.0f} (from {original_cx:.0f}), got {b_aug[0]}"
    )

test("Augmentation pipeline", test_augmentation)


# ============================================================
# Test 16: ACL curriculum integration
# ============================================================
def test_acl_curriculum():
    """Check that raising the ACL difficulty increases target motion.

    For the first 20 samples the horizontal spread of the box centre (max cx
    minus min cx over the clip) is averaged, once at difficulty 0.0 and once
    at difficulty 1.0. The hard average must exceed the easy one; previously
    the comparison was only printed, so the test could never fail on it.
    """
    from vil_tracker.data.dataset import SyntheticTrackingDataset

    num_samples = 20

    def mean_cx_spread(dataset):
        # Average, over the first num_samples clips, of the cx range in a clip.
        spreads = []
        for i in range(num_samples):
            cx = dataset[i]['boxes'][:, 0]
            spreads.append((cx.max() - cx.min()).item())
        return np.mean(spreads)

    ds = SyntheticTrackingDataset(length=100, acl_difficulty=0.0, clip_length=3)

    # Easy: targets barely move
    avg_easy = mean_cx_spread(ds)

    # Hard: same indices, maximum difficulty
    ds.set_acl_difficulty(1.0)
    avg_hard = mean_cx_spread(ds)

    print(f"  Avg cx spread (easy, d=0.0): {avg_easy:.1f} px")
    print(f"  Avg cx spread (hard, d=1.0): {avg_hard:.1f} px")
    print(f"  Hard > Easy: {avg_hard > avg_easy}")

    assert avg_hard > avg_easy, (
        f"Hard difficulty should move targets more than easy: "
        f"hard={avg_hard:.2f} px, easy={avg_easy:.2f} px"
    )

test("ACL curriculum integration", test_acl_curriculum)


# ============================================================
# Summary
# ============================================================
# Final report: print the tally and exit with 0 only if every test passed.
print("\n" + "=" * 60)
print(f"Results: {PASS}/{PASS + FAIL} tests passed")
if FAIL <= 0:
    print("  ✅ All tests passed!")
    exit_code = 0
else:
    print(f"  ❌ {FAIL} test(s) FAILED")
    exit_code = 1
sys.exit(exit_code)