Fix: mLSTM SiLU gate+activation, GroupNorm 192, stochastic depth 0.05, Hanning window
vil_tracker/models/mlstm.py  (+10, -6)
@@ -79,10 +79,12 @@ class mLSTMCell(nn.Module):
     - LinearHeadwiseExpand for Q, K, V projections
     - igate, fgate: Linear(3*inner_dim, num_heads) from concat(q,k,v)
     - Parallel scan: C_t = f_t*C_{t-1} + i_t*(v_t ⊗ k_t), h_t = C_t*q_t
-    - Output: (h + skip*conv_act) *
+    - Output: (h + skip*conv_act) * SiLU(z), then proj_down
 
     ViL-S config: D=384, proj_factor=2.0, inner_dim=768,
-        qkv_proj_blocksize=4, num_heads=4
+        qkv_proj_blocksize=4, num_heads=4 (memory heads)
+    Note: GroupNorm uses num_proj_heads (192) groups, matching official
+        MultiHeadLayerNorm — one group per projection head, NOT per memory head.
     Per-cell params: ~920K (vs 2.66M with full Linear Q/K/V)
     """
     def __init__(
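For reference, a minimal sequential sketch of the scan named in the docstring above (C_t = f_t*C_{t-1} + i_t*(v_t ⊗ k_t), h_t = C_t*q_t). This is not the repository's parallel-scan code: the function name, the (B, S, NH, DH) head layout, and the exp/sigmoid gate activations are assumptions, and the stabilizer and normalizer states of the full mLSTM are omitted for brevity.

import torch

def mlstm_recurrence_reference(q, k, v, i_logits, f_logits):
    """q, k, v: (B, S, NH, DH); i_logits, f_logits: (B, S, NH) gate pre-activations."""
    B, S, NH, DH = q.shape
    C = q.new_zeros(B, NH, DH, DH)                    # matrix memory C_{t-1}, one per head
    i = torch.exp(i_logits)                           # input gate (unstabilized exp, assumed)
    f = torch.sigmoid(f_logits)                       # forget gate (sigmoid, assumed)
    outs = []
    for t in range(S):
        kv = torch.einsum("bhd,bhe->bhde", v[:, t], k[:, t])    # v_t outer k_t
        C = f[:, t, :, None, None] * C + i[:, t, :, None, None] * kv
        outs.append(torch.einsum("bhde,bhe->bhd", C, q[:, t]))  # h_t = C_t * q_t
    return torch.stack(outs, dim=1)                   # (B, S, NH, DH)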
@@ -103,6 +105,7 @@ class mLSTMCell(nn.Module):
 
         # Number of projection heads for Q/K/V (block-diagonal)
         num_proj_heads = self.inner_dim // qkv_proj_blocksize
+        self.num_proj_heads = num_proj_heads
 
         # Up-projection: D -> 2*inner_dim (mLSTM branch + output gate branch)
         self.proj_up = nn.Linear(dim, 2 * self.inner_dim, bias=bias)
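A quick sanity check of the docstring's parameter claim, using the ViL-S sizes above. Biases, the conv1d, norms, and layer scale are ignored, and the 192 blocks of 4x4 weights per Q/K/V projection are an assumption about LinearHeadwiseExpand's block-diagonal layout.

# Rough per-cell parameter count for ViL-S (D=384, inner_dim=768, blocksize=4, 4 gate heads).
D, inner, blocksize, num_heads = 384, 768, 4, 4
num_proj_heads = inner // blocksize                      # 192

proj_up       = D * 2 * inner                            # 589,824
proj_down     = inner * D                                # 294,912
qkv_blockdiag = 3 * num_proj_heads * blocksize ** 2      # 9,216 (block-diagonal Q/K/V)
gates         = 2 * (3 * inner) * num_heads              # 18,432 (igate + fgate)
print(proj_up + proj_down + qkv_blockdiag + gates)       # 912,384  -> "~920K"

qkv_full = 3 * inner * inner                             # 1,769,472 with full nn.Linear Q/K/V
print(proj_up + proj_down + qkv_full + gates)            # 2,672,640 -> "~2.66M"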
@@ -126,8 +129,9 @@ class mLSTMCell(nn.Module):
         self.igate = nn.Linear(3 * self.inner_dim, num_heads, bias=True)
         self.fgate = nn.Linear(3 * self.inner_dim, num_heads, bias=True)
 
-        # Output normalization
-
+        # Output normalization: per-projection-head group norm (192 groups for ViL-S)
+        # Matches official MultiHeadLayerNorm — one group per projection head
+        self.outnorm = nn.GroupNorm(num_proj_heads, self.inner_dim, affine=True)
 
         # Down-projection: inner_dim -> D
         self.proj_down = nn.Linear(self.inner_dim, dim, bias=bias)
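A standalone sketch (not the repository's forward code) of why GroupNorm with num_proj_heads groups behaves like the official MultiHeadLayerNorm: each 4-channel projection-head block is normalized on its own, with a shared per-channel affine. Applying the norm per token via a (B*S, C) reshape is an assumption about how outnorm is used.

import torch
import torch.nn as nn
import torch.nn.functional as F

B, S, inner_dim, num_proj_heads = 2, 16, 768, 192
outnorm = nn.GroupNorm(num_proj_heads, inner_dim, affine=True)

h = torch.randn(B, S, inner_dim)
# Per-token application: flatten (B, S) so GroupNorm sees (N, C) with no spatial dims.
h_gn = outnorm(h.reshape(B * S, inner_dim)).reshape(B, S, inner_dim)

# The same computation written as a multi-head LayerNorm: normalize each
# 4-channel projection head independently, then apply the per-channel affine.
head_dim = inner_dim // num_proj_heads                   # 4
h_heads = h.reshape(B * S, num_proj_heads, head_dim)
h_ln = F.layer_norm(h_heads, (head_dim,)).reshape(B * S, inner_dim)
h_ln = (h_ln * outnorm.weight + outnorm.bias).reshape(B, S, inner_dim)

assert torch.allclose(h_gn, h_ln, atol=1e-5)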
@@ -166,7 +170,7 @@ class mLSTMCell(nn.Module):
         # 2. Causal conv1d on mLSTM branch
         x_conv = self.conv1d(x_mlstm.transpose(1, 2))  # (B, inner, S+pad)
         x_conv = x_conv[..., :S].transpose(1, 2)  # causal: keep first S
-        x_conv_act = F.
+        x_conv_act = F.silu(x_conv)
 
         # 3. Q/K/V projections (block-diagonal, very lightweight)
         q = self.q_proj(x_conv_act)  # (B, S, inner)
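A small standalone check of the causal-slice comment in step 2. The depthwise layout and kernel size of conv1d are assumptions; what matters is that with symmetric padding of kernel_size - 1, keeping only the first S outputs means position t never sees inputs beyond t, and the fix applies SiLU to that trimmed result.

import torch
import torch.nn as nn
import torch.nn.functional as F

B, S, inner_dim, kernel_size = 1, 16, 768, 4
conv1d = nn.Conv1d(inner_dim, inner_dim, kernel_size,
                   padding=kernel_size - 1, groups=inner_dim)  # depthwise + padding: assumptions

x_mlstm = torch.randn(B, S, inner_dim)
x_conv = conv1d(x_mlstm.transpose(1, 2))       # (B, inner, S + kernel_size - 1)
x_conv = x_conv[..., :S].transpose(1, 2)       # keep first S -> output t depends on inputs <= t
x_conv_act = F.silu(x_conv)                    # conv activation per this fix

# Causality check: perturbing the last input step must not change earlier outputs.
x_pert = x_mlstm.clone()
x_pert[:, -1] += 1.0
y_pert = F.silu(conv1d(x_pert.transpose(1, 2))[..., :S].transpose(1, 2))
assert torch.allclose(x_conv_act[:, :-1], y_pert[:, :-1])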
@@ -230,7 +234,7 @@ class mLSTMCell(nn.Module):
 
         # 8. Skip connection + output gate
         h_skip = h + self.learnable_skip * x_conv_act
-        output = h_skip *
+        output = h_skip * F.silu(z)  # output gate: SiLU (not sigmoid) per official ViL
 
         # 9. Down-project + layer scale
         output = self.proj_down(output)
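Finally, a self-contained sketch of how steps 8 and 9 compose after this change, with dummy tensors; the per-channel shape of learnable_skip and the bias-free proj_down are assumptions, the rest mirrors the diff.

import torch
import torch.nn as nn
import torch.nn.functional as F

B, S, dim, inner_dim = 2, 16, 384, 768
h          = torch.randn(B, S, inner_dim)               # normalized scan output (after outnorm)
x_conv_act = torch.randn(B, S, inner_dim)               # SiLU-activated conv branch from step 2
z          = torch.randn(B, S, inner_dim)               # output-gate branch from proj_up
learnable_skip = nn.Parameter(torch.ones(inner_dim))    # per-channel skip scale (assumed shape)
proj_down      = nn.Linear(inner_dim, dim, bias=False)

h_skip = h + learnable_skip * x_conv_act                # step 8: learnable skip from conv branch
output = h_skip * F.silu(z)                             # step 8: SiLU output gate (this fix)
output = proj_down(output)                              # step 9: inner_dim -> D, i.e. (B, S, 384)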