omar-ah
/

vil-tracker

Model card Files and versions

xet

Community

omar-ah committed 8 days ago

Commit

4ba026e

verified ·

1 Parent(s): 1bf192e

Sequence training: pairs→K-frame clips, mLSTM memory carries across frames

Browse files

Files changed (1) hide show

test_all.py +84 -68

test_all.py CHANGED Viewed

@@ -294,31 +294,30 @@ def test_full_tracker_small():
     params = count_params(tracker)
     print(f"  Tracker (depth=4) params: {params:,} ({params/1e6:.3f}M)")
-    template = torch.randn(2, 3, 128, 128)
-    search = torch.randn(2, 3, 256, 256)
-    # Test without temporal
-    output = tracker(template, search, use_temporal=False)
-    assert output['heatmap'].shape == (2, 1, 16, 16)
-    assert output['boxes'].shape == (2, 4)
-    assert output['scores'].shape == (2,)
-    assert 'log_variance' in output
-    # Test with temporal (first frame: no context)
-    output_t1 = tracker(template, search, use_temporal=True)
-    assert output_t1['boxes'].shape == (2, 4)
-    # Second frame: temporal context available
-    output_t2 = tracker(template, search, use_temporal=True)
-    assert output_t2['boxes'].shape == (2, 4)
-    # Reset temporal
     tracker.reset_temporal()
-    print(f"  Predicted boxes: {output['boxes'][0].tolist()}")
-    print(f"  Scores: {output['scores'].tolist()}")
-test("Full Tracker (depth=4, with temporal)", test_full_tracker_small)
 # ============================================================
@@ -438,29 +437,34 @@ test("Kalman Filter (8-state, adaptive)", test_kalman)
 def test_dataset():
     from vil_tracker.data.dataset import SyntheticTrackingDataset, TrackingDataset
-    ds = SyntheticTrackingDataset(length=100)
     assert len(ds) == 100
     sample = ds[0]
     assert sample['template'].shape == (3, 128, 128), f"Template shape: {sample['template'].shape}"
-    assert sample['search'].shape == (3, 256, 256), f"Search shape: {sample['search'].shape}"
-    assert sample['heatmap'].shape == (1, 16, 16), f"Heatmap shape: {sample['heatmap'].shape}"
-    assert sample['size'].shape == (2,), f"Size shape: {sample['size'].shape}"
-    assert sample['boxes'].shape == (4,), f"Boxes shape: {sample['boxes'].shape}"
-    # Check ACL difficulty changes output
     ds.set_acl_difficulty(0.0)
     easy_sample = ds[42]
     ds.set_acl_difficulty(1.0)
     hard_sample = ds[42]
-    print(f"  Easy center: {easy_sample['boxes'][:2].tolist()}")
-    print(f"  Hard center: {hard_sample['boxes'][:2].tolist()}")
     # Test backward-compatible alias
-    ds2 = TrackingDataset(synthetic=True, synthetic_length=50)
     assert len(ds2) == 50
     sample2 = ds2[0]
-    assert sample2['template'].shape == (3, 128, 128)
 test("Dataset (synthetic + backward compat)", test_dataset)
@@ -484,43 +488,59 @@ def test_training_step():
     contrastive_loss = MemoryContrastiveLoss()
     optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
-    B = 2
     template = torch.randn(B, 3, 128, 128)
-    search = torch.randn(B, 3, 256, 256)
-    # GT targets
-    gt_center = torch.tensor([[128.0, 128.0], [100.0, 150.0]])
-    gt_heatmap = generate_heatmap(gt_center, feat_size=16, search_size=256)
-    gt_size = torch.tensor([[0.2, 0.3], [0.15, 0.25]])
-    gt_boxes = torch.tensor([[128.0, 128.0, 51.2, 76.8], [100.0, 150.0, 38.4, 64.0]])
-    # Forward WITH temporal modulation
-    pred = model(template, search, use_temporal=True)
-    loss_dict = loss_fn(pred, gt_heatmap, gt_size, gt_boxes)
     # Add contrastive loss
     t_pooled = pred['template_feat'].mean(dim=1)
-    s_pooled = pred['search_feat'].mean(dim=1)
     c_loss = contrastive_loss(t_pooled, s_pooled)
-    total_loss = loss_dict['total'] + 0.1 * c_loss
     # Backward
     total_loss.backward()
-    # Check gradients exist
     has_grads = sum(1 for p in model.parameters() if p.grad is not None)
     total_params_count = sum(1 for p in model.parameters())
-    print(f"  Total loss: {total_loss.item():.4f} (tracking={loss_dict['total'].item():.4f}, contr={c_loss.item():.4f})")
     print(f"  Params with gradients: {has_grads}/{total_params_count}")
-    # Optimizer step
     optimizer.step()
     optimizer.zero_grad()
     assert total_loss.item() > 0
     assert has_grads > 0
-test("Training Step (with temporal + contrastive)", test_training_step)
 # ============================================================
@@ -632,33 +652,29 @@ test("Augmentation pipeline", test_augmentation)
 def test_acl_curriculum():
     from vil_tracker.data.dataset import SyntheticTrackingDataset
-    ds = SyntheticTrackingDataset(length=100, acl_difficulty=0.0)
-    # Easy: targets near center
-    easy_offsets = []
     for i in range(20):
         sample = ds[i]
-        cx, cy = sample['boxes'][:2].tolist()
-        offset = ((cx - 128) ** 2 + (cy - 128) ** 2) ** 0.5
-        easy_offsets.append(offset)
     ds.set_acl_difficulty(1.0)
-    hard_offsets = []
     for i in range(20):
         sample = ds[i]
-        cx, cy = sample['boxes'][:2].tolist()
-        offset = ((cx - 128) ** 2 + (cy - 128) ** 2) ** 0.5
-        hard_offsets.append(offset)
-    avg_easy = np.mean(easy_offsets)
-    avg_hard = np.mean(hard_offsets)
-    print(f"  Avg offset (easy, d=0.0): {avg_easy:.1f} px")
-    print(f"  Avg offset (hard, d=1.0): {avg_hard:.1f} px")
-    # Hard samples should have larger offsets from center on average
-    # (this is stochastic, so we allow some tolerance)
-    print(f"  Hard > Easy: {avg_hard > avg_easy * 0.5}")
 test("ACL curriculum integration", test_acl_curriculum)

     params = count_params(tracker)
     print(f"  Tracker (depth=4) params: {params:,} ({params/1e6:.3f}M)")
+    B, K = 2, 3
+    template = torch.randn(B, 3, 128, 128)
+    # Test single-frame (backward compat)
+    search_single = torch.randn(B, 3, 256, 256)
+    output_s = tracker(template, search_single, use_temporal=False)
+    assert output_s['heatmap'].shape == (B, 1, 16, 16), f"Single heatmap: {output_s['heatmap'].shape}"
+    assert output_s['boxes'].shape == (B, 4), f"Single boxes: {output_s['boxes'].shape}"
+    assert output_s['scores'].shape == (B,), f"Single scores: {output_s['scores'].shape}"
+    print(f"  Single-frame: boxes={output_s['boxes'][0].tolist()}")
+    # Test multi-frame sequence
+    searches = torch.randn(B, K, 3, 256, 256)
+    output_m = tracker(template, searches, use_temporal=True)
+    assert output_m['heatmap'].shape == (B, K, 1, 16, 16), f"Multi heatmap: {output_m['heatmap'].shape}"
+    assert output_m['boxes'].shape == (B, K, 4), f"Multi boxes: {output_m['boxes'].shape}"
+    assert output_m['scores'].shape == (B, K), f"Multi scores: {output_m['scores'].shape}"
+    assert output_m['search_feats'].shape == (B, K, 256, 384), f"Multi feats: {output_m['search_feats'].shape}"
+    print(f"  Multi-frame (K={K}): frame 0 box={output_m['boxes'][0,0].tolist()}")
+    print(f"                       frame 2 box={output_m['boxes'][0,2].tolist()}")
     tracker.reset_temporal()
+test("Full Tracker (single + multi-frame)", test_full_tracker_small)
 # ============================================================
 def test_dataset():
     from vil_tracker.data.dataset import SyntheticTrackingDataset, TrackingDataset
+    ds = SyntheticTrackingDataset(length=100, clip_length=3)
     assert len(ds) == 100
     sample = ds[0]
     assert sample['template'].shape == (3, 128, 128), f"Template shape: {sample['template'].shape}"
+    assert sample['searches'].shape == (3, 3, 256, 256), f"Searches shape: {sample['searches'].shape}"
+    assert sample['heatmaps'].shape == (3, 1, 16, 16), f"Heatmaps shape: {sample['heatmaps'].shape}"
+    assert sample['sizes'].shape == (3, 2), f"Sizes shape: {sample['sizes'].shape}"
+    assert sample['boxes'].shape == (3, 4), f"Boxes shape: {sample['boxes'].shape}"
+    # Verify target moves across frames (not static)
+    cx_f0 = sample['boxes'][0, 0].item()
+    cx_f2 = sample['boxes'][2, 0].item()
+    print(f"  Frame 0 cx: {cx_f0:.1f}, Frame 2 cx: {cx_f2:.1f} (moving target)")
+    # Check ACL difficulty changes motion magnitude
     ds.set_acl_difficulty(0.0)
     easy_sample = ds[42]
     ds.set_acl_difficulty(1.0)
     hard_sample = ds[42]
+    print(f"  Easy frame spread: {(easy_sample['boxes'][:, 0].max() - easy_sample['boxes'][:, 0].min()).item():.1f} px")
+    print(f"  Hard frame spread: {(hard_sample['boxes'][:, 0].max() - hard_sample['boxes'][:, 0].min()).item():.1f} px")
     # Test backward-compatible alias
+    ds2 = TrackingDataset(synthetic=True, synthetic_length=50, clip_length=3)
     assert len(ds2) == 50
     sample2 = ds2[0]
+    assert sample2['searches'].shape[0] == 3, "Clip length should be 3"
 test("Dataset (synthetic + backward compat)", test_dataset)
     contrastive_loss = MemoryContrastiveLoss()
     optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
+    B, K = 2, 3
     template = torch.randn(B, 3, 128, 128)
+    searches = torch.randn(B, K, 3, 256, 256)
+    # GT targets for K frames
+    gt_heatmaps = torch.zeros(B, K, 1, 16, 16)
+    gt_heatmaps[:, :, :, 8, 8] = 1.0  # center
+    gt_sizes = torch.tensor([[[0.2, 0.3]] * K] * B)
+    gt_boxes = torch.tensor([[[128.0, 128.0, 51.2, 76.8]] * K] * B)
+    # Forward WITH temporal modulation, multi-frame
+    pred = model(template, searches, use_temporal=True)
+    assert pred['heatmap'].shape == (B, K, 1, 16, 16), f"Heatmap shape: {pred['heatmap'].shape}"
+    assert pred['boxes'].shape == (B, K, 4), f"Boxes shape: {pred['boxes'].shape}"
+    assert pred['scores'].shape == (B, K), f"Scores shape: {pred['scores'].shape}"
+    assert pred['search_feats'].shape == (B, K, 256, 384), f"Search feats: {pred['search_feats'].shape}"
+    # Accumulate loss over K frames
+    total_loss = torch.tensor(0.0)
+    for k in range(K):
+        pred_k = {
+            'heatmap': pred['heatmap'][:, k],
+            'size': pred['size'][:, k],
+            'boxes': pred['boxes'][:, k],
+        }
+        if 'log_variance' in pred:
+            pred_k['log_variance'] = pred['log_variance'][:, k]
+        loss_dict = loss_fn(pred_k, gt_heatmaps[:, k], gt_sizes[:, k], gt_boxes[:, k])
+        total_loss = total_loss + loss_dict['total']
+    total_loss = total_loss / K
     # Add contrastive loss
     t_pooled = pred['template_feat'].mean(dim=1)
+    s_pooled = pred['search_feats'][:, -1].mean(dim=1)
     c_loss = contrastive_loss(t_pooled, s_pooled)
+    total_loss = total_loss + 0.1 * c_loss
     # Backward
     total_loss.backward()
     has_grads = sum(1 for p in model.parameters() if p.grad is not None)
     total_params_count = sum(1 for p in model.parameters())
+    print(f"  Total loss: {total_loss.item():.4f} (K={K} frames, contr={c_loss.item():.4f})")
     print(f"  Params with gradients: {has_grads}/{total_params_count}")
     optimizer.step()
     optimizer.zero_grad()
     assert total_loss.item() > 0
     assert has_grads > 0
+test("Training Step (K=3 sequence + contrastive)", test_training_step)
 # ============================================================
 def test_acl_curriculum():
     from vil_tracker.data.dataset import SyntheticTrackingDataset
+    ds = SyntheticTrackingDataset(length=100, acl_difficulty=0.0, clip_length=3)
+    # Easy: targets barely move
+    easy_spreads = []
     for i in range(20):
         sample = ds[i]
+        spread = (sample['boxes'][:, 0].max() - sample['boxes'][:, 0].min()).item()
+        easy_spreads.append(spread)
     ds.set_acl_difficulty(1.0)
+    hard_spreads = []
     for i in range(20):
         sample = ds[i]
+        spread = (sample['boxes'][:, 0].max() - sample['boxes'][:, 0].min()).item()
+        hard_spreads.append(spread)
+    avg_easy = np.mean(easy_spreads)
+    avg_hard = np.mean(hard_spreads)
+    print(f"  Avg cx spread (easy, d=0.0): {avg_easy:.1f} px")
+    print(f"  Avg cx spread (hard, d=1.0): {avg_hard:.1f} px")
+    print(f"  Hard > Easy: {avg_hard > avg_easy}")
 test("ACL curriculum integration", test_acl_curriculum)