Fix vil_tracker/models/backbone.py: audit corrections
vil_tracker/models/backbone.py
```diff
@@ -4,6 +4,7 @@ ViL (Vision-LSTM) Backbone for single object tracking.
 Architecture:
 - Patch embedding (Conv2d) for template + search region
 - Stack of mLSTM blocks with bidirectional scanning (even=L→R, odd=R→L)
+- FiLM temporal modulation integrated BETWEEN blocks (at interval=6)
 - Optional TMoE-MLP in last N blocks (dense routing, frozen shared expert)
 - Outputs concatenated template+search features for head processing
 
```
```diff
@@ -133,10 +134,11 @@ class mLSTMBlockWithTMoE(nn.Module):
 
 
 class ViLBackbone(nn.Module):
-    """Vision-LSTM backbone for tracking.
+    """Vision-LSTM backbone for tracking with integrated FiLM temporal modulation.
 
     Concatenates template + search patches into a single sequence,
-    processes through bidirectional mLSTM blocks
+    processes through bidirectional mLSTM blocks with FiLM modulation
+    injected between blocks at regular intervals, then separates outputs.
 
     Template: 128x128 → 8x8 = 64 tokens
     Search: 256x256 → 16x16 = 256 tokens
```
```diff
@@ -144,6 +146,7 @@ class ViLBackbone(nn.Module):
 
     Bidirectional scanning: even blocks L→R, odd blocks R→L.
     Last `tmoe_blocks` blocks use TMoE MLP for temporal specialization.
+    FiLM modulation: applied after every `film_interval`-th block.
     """
     def __init__(
         self,
```
```diff
@@ -160,11 +163,13 @@ class ViLBackbone(nn.Module):
         tmoe_blocks: int = 2,
         num_experts: int = 4,
         bias: bool = False,
+        film_interval: int = 6,
     ):
         super().__init__()
         self.dim = dim
         self.depth = depth
         self.patch_size = patch_size
+        self.film_interval = film_interval
 
         # Patch embedding
         self.patch_embed = PatchEmbed(patch_size=patch_size, in_channels=in_channels, dim=dim)
```
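For reference, constructing the backbone with the new argument could look like the sketch below. Only `film_interval=6` is taken from this diff; the `dim`/`depth` values, and the assumption that the constructor arguments not shown in this hunk have usable defaults, are illustrative.

```python
import torch
from vil_tracker.models.backbone import ViLBackbone

# dim/depth are assumed values for illustration; film_interval=6 matches the new default.
backbone = ViLBackbone(
    dim=384,
    depth=12,
    tmoe_blocks=2,
    num_experts=4,
    bias=False,
    film_interval=6,  # FiLM applied after every 6th block
)
```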
```diff
@@ -209,11 +214,13 @@ class ViLBackbone(nn.Module):
         self,
         template: torch.Tensor,
         search: torch.Tensor,
+        temporal_mod_manager=None,
     ) -> tuple:
         """
         Args:
             template: (B, 3, 128, 128) template image
             search: (B, 3, 256, 256) search region image
+            temporal_mod_manager: optional TemporalModulationManager for FiLM
         Returns:
             template_feat: (B, 64, D) template features
             search_feat: (B, 256, D) search features
```
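Continuing that sketch, here is a forward call matching the updated signature and the docstring shapes. `temporal_mod_manager` defaults to `None`, in which case the FiLM branches in the next hunk are skipped and the output is unchanged from the pre-FiLM code path.

```python
template = torch.randn(2, 3, 128, 128)  # (B, 3, 128, 128) template crop
search = torch.randn(2, 3, 256, 256)    # (B, 3, 256, 256) search region

# No manager passed: FiLM is bypassed entirely.
template_feat, search_feat = backbone(template, search)
assert template_feat.shape == (2, 64, backbone.dim)   # 64 template tokens
assert search_feat.shape == (2, 256, backbone.dim)    # 256 search tokens
```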
```diff
@@ -230,16 +237,24 @@
 
         # Concatenate: [template | search]
         tokens = torch.cat([t_tokens, s_tokens], dim=1)  # (B, 320, D)
+        n_template = t_tokens.shape[1]
 
-        # Process through bidirectional mLSTM blocks
+        # Process through bidirectional mLSTM blocks with optional FiLM
         for i, block in enumerate(self.blocks):
             reverse = (i % 2 == 1)  # odd blocks: R→L
             tokens = block(tokens, reverse=reverse)
+
+            # Apply FiLM temporal modulation between blocks
+            if temporal_mod_manager is not None:
+                tokens = temporal_mod_manager.modulate(tokens, i)
 
         tokens = self.norm(tokens)
 
+        # Update temporal context after full forward pass
+        if temporal_mod_manager is not None:
+            temporal_mod_manager.update_temporal_context(tokens)
+
         # Split back
-        n_template = t_tokens.shape[1]
         template_feat = tokens[:, :n_template]
         search_feat = tokens[:, n_template:]
 
```
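The commit shows only the manager's call sites, `modulate(tokens, i)` and `update_temporal_context(tokens)`, not its implementation. The sketch below is a hypothetical minimal TemporalModulationManager consistent with those call sites and the `film_interval` docstring; the EMA context, the linear gamma/beta head, and placing the interval check inside `modulate` are all assumptions, not the project's actual code.

```python
import torch
import torch.nn as nn

class TemporalModulationManager(nn.Module):
    """Hypothetical minimal FiLM manager; only the interface matches the diff above."""

    def __init__(self, dim: int, film_interval: int = 6, momentum: float = 0.9):
        super().__init__()
        self.film_interval = film_interval
        self.momentum = momentum
        self.to_film = nn.Linear(dim, 2 * dim)  # predicts gamma and beta (assumed head)
        self.register_buffer("context", torch.zeros(dim))  # running temporal context

    def modulate(self, tokens: torch.Tensor, block_idx: int) -> torch.Tensor:
        # Fire only after every `film_interval`-th block: i = 5, 11, ... for interval=6.
        if (block_idx + 1) % self.film_interval != 0:
            return tokens
        gamma, beta = self.to_film(self.context).chunk(2)  # each (D,)
        return tokens * (1 + gamma) + beta  # FiLM, broadcast over (B, N, D)

    @torch.no_grad()
    def update_temporal_context(self, tokens: torch.Tensor) -> None:
        # EMA over the mean-pooled features of the current frame (assumed policy).
        frame_feat = tokens.mean(dim=(0, 1))  # (D,)
        self.context.mul_(self.momentum).add_(frame_feat, alpha=1 - self.momentum)

# Usage with the forward pass sketched earlier:
manager = TemporalModulationManager(dim=backbone.dim, film_interval=backbone.film_interval)
template_feat, search_feat = backbone(template, search, temporal_mod_manager=manager)
```

Note that the diff calls `update_temporal_context` on the normalized output after the block loop, so whatever the real manager stores, the next frame's modulation is conditioned on the previous frame's final features rather than on intermediate activations.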