omar-ah committed
Commit 519f856 · verified · Parent: 61d4766

Upload vision_xlstm.py

Files changed (1)
  1. code/vision_xlstm.py +348 -0
code/vision_xlstm.py ADDED
@@ -0,0 +1,348 @@
"""
Vision xLSTM (ViL) encoder implementation.
Based on: "Vision-LSTM: xLSTM as Generic Vision Backbone" (arXiv:2406.04303)

Key design:
- Patch embedding (ViT-style, 16x16 patches)
- Alternating bidirectional mLSTM blocks (top-left→bottom-right, bottom-right→top-left)
- Conv2D for QK local context
- Linear complexity O(N) vs ViT's O(N²)
"""

import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange


class PatchEmbedding(nn.Module):
    """Convert an image into patch tokens (identical to ViT)."""

    def __init__(self, img_size=224, patch_size=16, in_channels=3, embed_dim=384):
        super().__init__()
        self.img_size = img_size
        self.patch_size = patch_size
        self.num_patches = (img_size // patch_size) ** 2
        self.proj = nn.Conv2d(in_channels, embed_dim, kernel_size=patch_size, stride=patch_size)
        self.pos_embed = nn.Parameter(torch.zeros(1, self.num_patches, embed_dim))
        nn.init.trunc_normal_(self.pos_embed, std=0.02)

    def forward(self, x):
        # x: [B, C, H, W]
        x = self.proj(x)                  # [B, D, H/P, W/P]
        x = x.flatten(2).transpose(1, 2)  # [B, N, D]
        x = x + self.pos_embed
        return x
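

# Illustrative sketch (not part of the original upload): a quick shape check for
# PatchEmbedding under the defaults above (224x224 input, 16x16 patches, dim 384),
# which yields (224 / 16)^2 = 196 tokens per image.
def _demo_patch_embedding():
    embed = PatchEmbedding(img_size=224, patch_size=16, in_channels=3, embed_dim=384)
    images = torch.randn(2, 3, 224, 224)  # [B, C, H, W]
    tokens = embed(images)                # [B, N, D]
    assert tokens.shape == (2, 196, 384)
    return tokens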


class MLSTMCell(nn.Module):
    """
    Matrix-LSTM (mLSTM) cell with exponential gating.

    Core equations (per head):
        q = W_q @ x,  k = (1/√d) * W_k @ x,  v = W_v @ x
        f = sigmoid(w_f @ x),  i = exp(w_i @ x),  o = sigmoid(w_o @ x)
        C_t = f * C_{t-1} + i * (v ⊗ k)     [outer-product memory update]
        n_t = f * n_{t-1} + i * k           [normalizer]
        h_t = o ⊙ (C_t @ q / max(|n_t^T @ q|, 1))
    """

    def __init__(self, input_dim, head_dim, num_heads=1):
        super().__init__()
        self.head_dim = head_dim
        self.num_heads = num_heads
        self.total_dim = head_dim * num_heads

        # QKV projections
        self.W_q = nn.Linear(input_dim, self.total_dim, bias=True)
        self.W_k = nn.Linear(input_dim, self.total_dim, bias=True)
        self.W_v = nn.Linear(input_dim, self.total_dim, bias=True)

        # Gate pre-activations (scalar per head for f and i)
        self.w_f = nn.Linear(input_dim, num_heads, bias=True)       # forget gate
        self.w_i = nn.Linear(input_dim, num_heads, bias=True)       # input gate
        self.w_o = nn.Linear(input_dim, self.total_dim, bias=True)  # output gate

        # Key scaling
        self.scale = 1.0 / math.sqrt(head_dim)

    def forward(self, x):
        """
        x: [B, T, D]
        Returns: [B, T, total_dim]
        """
        B, T, D = x.shape

        q = self.W_q(x)               # [B, T, total_dim]
        k = self.W_k(x) * self.scale  # [B, T, total_dim]
        v = self.W_v(x)               # [B, T, total_dim]

        # Gate pre-activations
        log_f = self.w_f(x)             # [B, T, num_heads] - forget gate pre-activation
        log_i = self.w_i(x)             # [B, T, num_heads] - log input gate (i = exp(log_i))
        o = torch.sigmoid(self.w_o(x))  # [B, T, total_dim]

        # Stabilize the forget gate in log space: logsigmoid bounds it to (-inf, 0)
        log_f = F.logsigmoid(log_f)

        # Reshape for multi-head
        q = rearrange(q, 'b t (h d) -> b h t d', h=self.num_heads)
        k = rearrange(k, 'b t (h d) -> b h t d', h=self.num_heads)
        v = rearrange(v, 'b t (h d) -> b h t d', h=self.num_heads)
        log_f = rearrange(log_f, 'b t h -> b h t')
        log_i = rearrange(log_i, 'b t h -> b h t')

        # NOTE: the recurrence is evaluated with an explicit sequential scan below.
        # A parallel/chunked "linear attention with decay" form,
        #   h_t = Σ_{s≤t} (Π_{j=s+1}^{t} f_j) * i_s * (v_s k_s^T) q_t,
        # is mathematically equivalent and faster, but is not implemented here.
        # Gates are per-head scalars: [B, H, T]
        decay = torch.exp(log_f)  # forget gate f_t = sigmoid(pre-activation)
        gate = torch.exp(log_i)   # exponential input gate i_t

        # Sequential scan (a parallel scan would replace this in production)
        h_state = torch.zeros(B, self.num_heads, self.head_dim, self.head_dim,
                              device=x.device, dtype=x.dtype)
        n_state = torch.zeros(B, self.num_heads, self.head_dim,
                              device=x.device, dtype=x.dtype)

        outputs = []
        for t in range(T):
            f_t = decay[:, :, t]  # [B, H] - per-head scalar
            i_t = gate[:, :, t]   # [B, H] - per-head scalar
            k_t = k[:, :, t, :]   # [B, H, D]
            v_t = v[:, :, t, :]   # [B, H, D]
            q_t = q[:, :, t, :]   # [B, H, D]

            # Expand gates for broadcasting: [B, H] -> [B, H, 1] and [B, H, 1, 1]
            f_t_d = f_t.unsqueeze(-1)                 # [B, H, 1] for D dim
            i_t_d = i_t.unsqueeze(-1)                 # [B, H, 1] for D dim
            f_t_dd = f_t.unsqueeze(-1).unsqueeze(-1)  # [B, H, 1, 1] for DxD
            i_t_dd = i_t.unsqueeze(-1).unsqueeze(-1)  # [B, H, 1, 1] for DxD

            # Update cell state: C = f*C + i*(v outer k)
            h_state = f_t_dd * h_state + i_t_dd * torch.einsum('bhd,bhe->bhde', v_t, k_t)
            # Update normalizer: n = f*n + i*k
            n_state = f_t_d * n_state + i_t_d * k_t

            # Read-out: C @ q / max(|n^T @ q|, 1); output gate o is applied after the scan
            Cq = torch.einsum('bhde,bhe->bhd', h_state, q_t)
            nq = torch.einsum('bhd,bhd->bh', n_state, q_t).unsqueeze(-1)
            nq = torch.clamp(nq.abs(), min=1.0)
            h_t = Cq / nq
            outputs.append(h_t)

        out = torch.stack(outputs, dim=2)             # [B, H, T, D]
        out = rearrange(out, 'b h t d -> b t (h d)')
        out = out * o                                 # output gate

        return out
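

# Illustrative sketch (not part of the original upload): exercising MLSTMCell on a
# short random sequence. With head_dim=16 and num_heads=4 the output width is
# head_dim * num_heads = 64, independent of the input width.
def _demo_mlstm_cell():
    cell = MLSTMCell(input_dim=32, head_dim=16, num_heads=4)
    x = torch.randn(2, 10, 32)  # [B, T, D_in]
    out = cell(x)               # [B, T, head_dim * num_heads]
    assert out.shape == (2, 10, 64)
    return out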


class MLSTMBlock(nn.Module):
    """
    ViL mLSTM block with Conv2D for QK spatial context.
    Wraps mLSTM in a gated MLP structure.
    """

    def __init__(self, dim, conv_kernel=3, dropout=0.0):
        super().__init__()
        self.norm = nn.LayerNorm(dim)

        # Pre-projection: expand to 3x for gate structure
        self.pre_proj = nn.Linear(dim, dim * 3)

        # Conv2D for spatial QK context (key ViL innovation)
        self.conv = nn.Conv2d(dim, dim, kernel_size=conv_kernel,
                              padding=conv_kernel // 2, groups=dim)  # depthwise

        # mLSTM cell
        self.mlstm = MLSTMCell(
            input_dim=dim,
            head_dim=dim // 4,  # 4 heads
            num_heads=4
        )

        # Output projection
        self.out_proj = nn.Linear(dim, dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, h=None, w=None):
        """
        x: [B, T, D] patch tokens
        h, w: spatial dimensions for the conv (sqrt(T) each for square images)
        """
        B, T, D = x.shape
        residual = x
        x = self.norm(x)

        # Gate structure: split into B (gate), C (gate), h_tilde (input)
        projected = self.pre_proj(x)  # [B, T, 3D]
        gate_b, gate_c, h_tilde = projected.chunk(3, dim=-1)

        # Apply spatial conv to h_tilde for local context
        if h is not None and w is not None:
            h_2d = rearrange(h_tilde, 'b (h w) d -> b d h w', h=h, w=w)
            h_2d = self.conv(h_2d)
            h_tilde = rearrange(h_2d, 'b d h w -> b (h w) d')

        # Input gating
        y = torch.sigmoid(gate_b) * h_tilde

        # mLSTM
        y = self.mlstm(y)

        # Output gating
        y = torch.sigmoid(gate_c) * y
        y = self.out_proj(y)
        y = self.dropout(y)

        return residual + y
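

# Illustrative sketch (not part of the original upload): running one MLSTMBlock over
# a 14x14 token grid (196 tokens). Note that dim must be divisible by 4, since the
# block hard-codes 4 mLSTM heads of size dim // 4, and the output keeps the input shape
# because of the residual connection.
def _demo_mlstm_block():
    block = MLSTMBlock(dim=384, conv_kernel=3, dropout=0.0)
    tokens = torch.randn(2, 14 * 14, 384)  # [B, T, D]
    out = block(tokens, h=14, w=14)
    assert out.shape == tokens.shape
    return out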


class FFNBlock(nn.Module):
    """SwiGLU feed-forward block."""

    def __init__(self, dim, mult=4, dropout=0.0):
        super().__init__()
        hidden = int(dim * mult * 2 / 3)  # SwiGLU uses a 2/3 factor
        self.norm = nn.LayerNorm(dim)
        self.w1 = nn.Linear(dim, hidden)
        self.w2 = nn.Linear(dim, hidden)
        self.w3 = nn.Linear(hidden, dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        residual = x
        x = self.norm(x)
        return residual + self.dropout(self.w3(F.silu(self.w1(x)) * self.w2(x)))
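

# Illustrative sketch (not part of the original upload): with dim=384 and mult=4 the
# SwiGLU hidden width is int(384 * 4 * 2 / 3) = 1024, so the parameter count stays
# close to a standard 4x MLP despite the extra gating projection.
def _demo_ffn_block():
    ffn = FFNBlock(dim=384, mult=4, dropout=0.0)
    x = torch.randn(2, 196, 384)
    out = ffn(x)
    assert ffn.w1.out_features == 1024
    assert out.shape == x.shape
    return out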


class VisionXLSTM(nn.Module):
    """
    Vision xLSTM (ViL) encoder.

    Architecture:
    1. Patch embedding (Conv2D, 16x16)
    2. Alternating bidirectional mLSTM blocks
    3. SwiGLU FFN after each mLSTM

    Output: all patch tokens [B, num_patches, dim] for VLM projection
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        # Patch embedding
        self.patch_embed = PatchEmbedding(
            img_size=config.img_size,
            patch_size=config.patch_size,
            in_channels=config.in_channels,
            embed_dim=config.dim
        )

        self.h = config.img_size // config.patch_size
        self.w = config.img_size // config.patch_size

        # Alternating mLSTM blocks + FFN
        self.blocks = nn.ModuleList()
        self.ffns = nn.ModuleList()
        for i in range(config.depth):
            self.blocks.append(MLSTMBlock(
                dim=config.dim,
                conv_kernel=config.conv_kernel_size,
                dropout=config.dropout
            ))
            self.ffns.append(FFNBlock(dim=config.dim, dropout=config.dropout))

        self.final_norm = nn.LayerNorm(config.dim)

    def forward_features(self, pixel_values):
        """
        Extract patch features for VLM projection.

        Args:
            pixel_values: [B, C, H, W] images
        Returns:
            [B, num_patches, dim] patch token features
        """
        x = self.patch_embed(pixel_values)  # [B, N, D]

        for i, (block, ffn) in enumerate(zip(self.blocks, self.ffns)):
            if self.config.bidirectional and i % 2 == 1:
                # Odd-indexed blocks: reverse scan direction (bottom-right → top-left)
                x = x.flip(1)
                x = block(x, h=self.h, w=self.w)
                x = ffn(x)
                x = x.flip(1)
            else:
                # Even-indexed blocks: forward scan (top-left → bottom-right)
                x = block(x, h=self.h, w=self.w)
                x = ffn(x)

        x = self.final_norm(x)
        return x

    def forward(self, pixel_values):
        """Classification forward (bilateral concat pooling)."""
        features = self.forward_features(pixel_values)
        # Bilateral concat: first + last patch token
        pooled = torch.cat([features[:, 0], features[:, -1]], dim=-1)
        return pooled
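

# Illustrative sketch (not part of the original upload): end-to-end shape check for the
# encoder. The actual config class is defined elsewhere, so a stand-in namespace with
# only the fields this module reads (img_size, patch_size, in_channels, dim, depth,
# conv_kernel_size, dropout, bidirectional) is assumed here purely for demonstration.
def _demo_vision_xlstm():
    from types import SimpleNamespace
    cfg = SimpleNamespace(img_size=224, patch_size=16, in_channels=3, dim=384,
                          depth=4, conv_kernel_size=3, dropout=0.0, bidirectional=True)
    encoder = VisionXLSTM(cfg)
    images = torch.randn(2, 3, 224, 224)
    patch_features = encoder.forward_features(images)  # [B, 196, 384] for VLM projection
    pooled = encoder(images)                            # [B, 768] bilateral concat (2 * dim)
    assert patch_features.shape == (2, 196, 384)
    assert pooled.shape == (2, 768)
    return patch_features, pooled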


class VisionProjector(nn.Module):
    """
    MLP projector: maps ViL features → LM embedding space.
    Following LLaDA-V / LaViDa: 2-layer MLP with GELU.
    """

    def __init__(self, config):
        super().__init__()
        hidden_dim = config.lm_dim * config.hidden_mult

        layers = []
        layers.append(nn.Linear(config.vil_dim, hidden_dim))
        layers.append(nn.GELU())
        if config.dropout > 0:
            layers.append(nn.Dropout(config.dropout))

        for _ in range(config.num_layers - 1):
            layers.append(nn.Linear(hidden_dim, hidden_dim))
            layers.append(nn.GELU())
            if config.dropout > 0:
                layers.append(nn.Dropout(config.dropout))

        layers.append(nn.Linear(hidden_dim, config.lm_dim))
        self.mlp = nn.Sequential(*layers)

    def forward(self, vision_features):
        """
        Args:
            vision_features: [B, num_patches, vil_dim]
        Returns:
            [B, num_patches, lm_dim]
        """
        return self.mlp(vision_features)
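

# Illustrative sketch (not part of the original upload): projecting ViL patch features
# into a language model's embedding space. As above, the projector config class lives
# elsewhere; a stand-in namespace with the fields read here (vil_dim, lm_dim,
# hidden_mult, num_layers, dropout) is assumed, and lm_dim=2048 is an arbitrary example.
def _demo_vision_projector():
    from types import SimpleNamespace
    cfg = SimpleNamespace(vil_dim=384, lm_dim=2048, hidden_mult=2, num_layers=2, dropout=0.0)
    projector = VisionProjector(cfg)
    vision_features = torch.randn(2, 196, 384)  # output of VisionXLSTM.forward_features
    lm_tokens = projector(vision_features)      # [B, num_patches, lm_dim]
    assert lm_tokens.shape == (2, 196, 2048)
    return lm_tokens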