omar-ah
/

vil-tracker

Model card Files Files and versions

xet

Community

omar-ah committed 11 days ago

Commit

b3b0529

verified ·

1 Parent(s): ccfb718

Upload vil_tracker/models/tracker.py with huggingface_hub

Browse files

Files changed (1) hide show

vil_tracker/models/tracker.py +166 -0

vil_tracker/models/tracker.py ADDED Viewed

	@@ -0,0 +1,166 @@

+"""
+ViL Tracker: Full model combining backbone, FiLM modulation, and prediction heads.
+Pipeline:
+1. Template (128x128) + Search (256x256) → PatchEmbed → tokens
+2. Concatenated tokens → ViL backbone (24 mLSTM blocks, bidirectional)
+3. FiLM temporal modulation at intervals (conditioned on prev frame)
+4. Search features → CenterHead → heatmap + size + offset
+5. Optional: UncertaintyHead → log variance for adaptive weighting
+"""
+import torch
+import torch.nn as nn
+from .backbone import ViLBackbone
+from .film_temporal import TemporalModulationManager
+from .heads import CenterHead, UncertaintyHead, decode_predictions
+def get_default_config() -> dict:
+    """Default ViL-S tracker configuration meeting all constraints.
+    Constraints: ≤50M params, ≤30ms latency, ≤20 GFLOPs, ≤500MB
+    """
+    return {
+        # Backbone
+        'dim': 384,
+        'depth': 24,
+        'patch_size': 16,
+        'proj_factor': 2.0,
+        'qkv_proj_blocksize': 4,
+        'num_heads': 4,
+        'conv_kernel': 4,
+        'mlp_ratio': 4.0,
+        'drop_path_rate': 0.1,
+        'tmoe_blocks': 2,
+        'num_experts': 4,
+        # FiLM temporal modulation
+        'film_interval': 6,
+        # Heads
+        'feat_size': 16,
+        # Inputs
+        'template_size': 128,
+        'search_size': 256,
+        # Uncertainty
+        'use_uncertainty': True,
+    }
+class ViLTracker(nn.Module):
+    """Complete ViL-based single object tracker.
+    Target specs (ViL-S):
+    - Parameters: ~35-40M (well under 50M limit)
+    - GFLOPs: ~15-18 (under 20 GFLOPs)
+    - Model size: ~140-160MB fp32, ~70-80MB fp16 (under 500MB)
+    - Latency: ~20-25ms on GPU (under 30ms)
+    """
+    def __init__(self, config: dict = None):
+        super().__init__()
+        config = config or get_default_config()
+        self.config = config
+        dim = config['dim']
+        depth = config['depth']
+        # Backbone
+        self.backbone = ViLBackbone(
+            dim=dim,
+            depth=depth,
+            patch_size=config['patch_size'],
+            proj_factor=config['proj_factor'],
+            qkv_proj_blocksize=config['qkv_proj_blocksize'],
+            num_heads=config['num_heads'],
+            conv_kernel=config['conv_kernel'],
+            mlp_ratio=config['mlp_ratio'],
+            drop_path_rate=config['drop_path_rate'],
+            tmoe_blocks=config['tmoe_blocks'],
+            num_experts=config['num_experts'],
+        )
+        # FiLM temporal modulation
+        self.temporal_mod = TemporalModulationManager(
+            dim=dim,
+            num_blocks=depth,
+            modulation_interval=config['film_interval'],
+        )
+        # Prediction heads
+        self.center_head = CenterHead(dim=dim, feat_size=config['feat_size'])
+        if config.get('use_uncertainty', True):
+            self.uncertainty_head = UncertaintyHead(dim=dim, feat_size=config['feat_size'])
+        else:
+            self.uncertainty_head = None
+    def forward(
+        self,
+        template: torch.Tensor,
+        search: torch.Tensor,
+        use_temporal: bool = False,
+    ) -> dict:
+        """
+        Args:
+            template: (B, 3, 128, 128) template image
+            search: (B, 3, 256, 256) search region
+            use_temporal: whether to apply FiLM temporal modulation
+        Returns:
+            dict with predictions: heatmap, size, offset, boxes, scores,
+                                   and optionally uncertainty
+        """
+        # Backbone forward
+        template_feat, search_feat = self.backbone(template, search)
+        # Optional FiLM temporal modulation on search features
+        if use_temporal:
+            for i in range(self.backbone.depth):
+                if self.temporal_mod.should_modulate(i):
+                    search_feat = self.temporal_mod.modulate(search_feat, i)
+            # Update temporal context for next frame
+            self.temporal_mod.update_temporal_context(search_feat)
+        # Prediction heads
+        preds = self.center_head(search_feat)
+        # Decode to boxes
+        boxes, scores = decode_predictions(
+            preds['heatmap'],
+            preds['size'],
+            preds['offset'],
+            search_size=self.config['search_size'],
+            feat_size=self.config['feat_size'],
+        )
+        output = {
+            'heatmap': preds['heatmap'],
+            'size': preds['size'],
+            'offset': preds['offset'],
+            'boxes': boxes,
+            'scores': scores,
+            'template_feat': template_feat,
+            'search_feat': search_feat,
+        }
+        # Uncertainty prediction
+        if self.uncertainty_head is not None:
+            output['log_variance'] = self.uncertainty_head(search_feat)
+        return output
+    def reset_temporal(self):
+        """Reset temporal modulation state (for new tracking sequence)."""
+        self.temporal_mod.reset()
+    def freeze_backbone_shared_experts(self):
+        """Freeze shared experts in TMoE blocks for Phase 2."""
+        self.backbone.freeze_shared_experts()
+def build_tracker(config: dict = None) -> ViLTracker:
+    """Build a ViL tracker with given or default config."""
+    return ViLTracker(config or get_default_config())