Update model configuration and training scripts with new vision backbone support and dependencies

- code/model_config.py +4 -2
- code/train_production.py +6 -3
- code/vil_dlm_model.py +3 -4
- code/vision_xlstm.py +159 -316
- pyproject.toml +24 -0
- train_production.py +3 -1
code/model_config.py
CHANGED
@@ -10,11 +10,13 @@ from typing import Optional, List
 @dataclass
 class ViLEncoderConfig:
     """Vision xLSTM (ViL) encoder configuration"""
+    vision_backbone: str = "vil2-small"
+    pretrained: bool = True
     img_size: int = 224
     patch_size: int = 16
     in_channels: int = 3
-    dim: int = 384  #
-    depth: int =
+    dim: int = 384  # patch feature dim for vil-small / vil2-small
+    depth: int = 12  # VisionLSTM2 block-pairs; v1 vil-small internally uses 24
     mlstm_dim_mult: int = 2  # mLSTM internal dim = 2 * dim
     conv_kernel_size: int = 3  # QK Conv2D kernel
     bidirectional: bool = True  # alternating scan directions
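
As a quick usage sketch (assuming the repo's `code/` directory is importable and the upstream checkout sits at `external/vision-lstm`, as the new `vision_xlstm.py` below expects), the two new fields pick the backbone and control checkpoint loading:

    # Hedged sketch; pretrained=False skips the checkpoint download.
    from model_config import ViLEncoderConfig
    from vision_xlstm import VisionXLSTM

    cfg = ViLEncoderConfig(vision_backbone="vil2-small", pretrained=False)
    encoder = VisionXLSTM(cfg)
    # 224x224 images with 16x16 patches should give 196 patch tokens of width 384
    print(encoder.dim, encoder.num_patches)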
code/train_production.py
CHANGED
@@ -28,6 +28,7 @@ from io import BytesIO
 from datasets import load_dataset
 from transformers import AutoTokenizer, AutoModelForMaskedLM
 from huggingface_hub import HfApi, snapshot_download
+from vision_xlstm import VisionProjector as UpstreamVisionProjector, VisionXLSTM as UpstreamVisionXLSTM

 import trackio

@@ -39,6 +40,8 @@ from dataclasses import dataclass, field

 @dataclass
 class ViLConfig:
+    vision_backbone: str = "vil2-small"
+    pretrained: bool = True
     img_size: int = 224
     patch_size: int = 16
     in_channels: int = 3
@@ -243,15 +246,15 @@ class ViLDLM(nn.Module):
     def __init__(self, vil_config, proj_config, lm_path):
         super().__init__()
         self.vil_config = vil_config
-        self.vision_encoder =
-        self.projector =
+        self.vision_encoder = UpstreamVisionXLSTM(vil_config)
+        self.projector = UpstreamVisionProjector(proj_config)
         self.scheduler = MDLMScheduler()
         self.num_patches = vil_config.num_patches

         # Load diffusion LM
         print(f"Loading diffusion LM from {lm_path}...")
         self.lm = AutoModelForMaskedLM.from_pretrained(
-            lm_path, trust_remote_code=True,
+            lm_path, trust_remote_code=True, torch_dtype=torch.bfloat16
        )
         self.tokenizer = AutoTokenizer.from_pretrained(lm_path, trust_remote_code=True)
         lm_params = sum(p.numel() for p in self.lm.parameters())
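
The diffusion LM is now loaded directly in bfloat16. A minimal stand-alone sketch of that pattern (the model id here is only a placeholder for the diffusion-LM path the script actually passes):

    import torch
    from transformers import AutoModelForMaskedLM

    lm = AutoModelForMaskedLM.from_pretrained("bert-base-uncased", torch_dtype=torch.bfloat16)
    print(next(lm.parameters()).dtype)  # torch.bfloat16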
code/vil_dlm_model.py
CHANGED
@@ -26,7 +26,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from typing import Optional, Dict, Any, Tuple
-from transformers import AutoModelForMaskedLM, AutoTokenizer
+from transformers import AutoModelForImageTextToText, AutoModelForMaskedLM, AutoTokenizer

 from model_config import ViLEncoderConfig, ProjectorConfig, TrainingConfig
 from vision_xlstm import VisionXLSTM, VisionProjector
@@ -119,7 +119,7 @@ class ViLDLM(nn.Module):
         self.lm = AutoModelForMaskedLM.from_pretrained(
             model_path,
             trust_remote_code=True,
-
+            torch_dtype=torch.bfloat16 if self.config.bf16 else torch.float32,
         )
         self.tokenizer = AutoTokenizer.from_pretrained(
             model_path,
@@ -419,13 +419,12 @@ class ViLDLMWithDistillation(ViLDLM):
                 bnb_4bit_compute_dtype=torch.bfloat16,
                 bnb_4bit_quant_type="nf4",
             )
-            self.teacher =
+            self.teacher = AutoModelForImageTextToText.from_pretrained(
                 self.kd_config.teacher_model_id,
                 quantization_config=bnb_config,
                 device_map="auto",
             )
         else:
-            from transformers import AutoModelForImageTextToText
             self.teacher = AutoModelForImageTextToText.from_pretrained(
                 self.kd_config.teacher_model_id,
                 torch_dtype=torch.bfloat16,
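
With the teacher class imported once at module level, both distillation branches use the same `AutoModelForImageTextToText` entry point. A hedged sketch of the quantized branch (the teacher id below is a placeholder; 4-bit loading additionally needs the bitsandbytes package and a CUDA device):

    import torch
    from transformers import AutoModelForImageTextToText, BitsAndBytesConfig

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
    )
    teacher = AutoModelForImageTextToText.from_pretrained(
        "llava-hf/llava-1.5-7b-hf",  # placeholder for kd_config.teacher_model_id
        quantization_config=bnb_config,
        device_map="auto",
    )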
code/vision_xlstm.py
CHANGED
@@ -1,348 +1,191 @@
-"""
-Vision xLSTM
-...
-- Alternating bidirectional mLSTM blocks (top-left→bottom-right, bottom-right→top-left)
-- Conv2D for QK local context
-- Linear complexity O(N) vs ViT's O(N²)
-"""
-
-import
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from einops import rearrange
-
-...
-        kv = torch.einsum('bhtd,bhte->bhde', k * w, v * w)  # [B, H, D, D] approx
-
-        # Actually, let's use the simpler chunkwise form for correctness:
-        # Direct sequential would be too slow, so use causal linear attention
-        # qk = q @ k^T with causal mask approximated by decay
-
-        # Efficient approximation: use causal dot product with decay
-        # Gates are per-head scalars: [B, H, T]
-        decay = torch.exp(log_f)  # [B, H, T]
-        gate = torch.exp(log_i)   # [B, H, T]
-
-        # Sequential scan (will be replaced by parallel scan in production)
-        h_state = torch.zeros(B, self.num_heads, self.head_dim, self.head_dim,
-                              device=x.device, dtype=x.dtype)
-        n_state = torch.zeros(B, self.num_heads, self.head_dim,
-                              device=x.device, dtype=x.dtype)
-
-        outputs = []
-        for t in range(T):
-            f_t = decay[:, :, t]  # [B, H] - per-head scalar
-            i_t = gate[:, :, t]   # [B, H] - per-head scalar
-            k_t = k[:, :, t, :]   # [B, H, D]
-            v_t = v[:, :, t, :]   # [B, H, D]
-            q_t = q[:, :, t, :]   # [B, H, D]
-
-            # Expand gates for broadcasting: [B, H] -> [B, H, 1] and [B, H, 1, 1]
-            f_t_d = f_t.unsqueeze(-1)   # [B, H, 1] for D dim
-            i_t_d = i_t.unsqueeze(-1)   # [B, H, 1] for D dim
-            f_t_dd = f_t.unsqueeze(-1).unsqueeze(-1)  # [B, H, 1, 1] for DxD
-            i_t_dd = i_t.unsqueeze(-1).unsqueeze(-1)  # [B, H, 1, 1] for DxD
-
-            # Update cell state: C = f*C + i*(v outer k)
-            h_state = f_t_dd * h_state + i_t_dd * torch.einsum('bhd,bhe->bhde', v_t, k_t)
-            # Update normalizer: n = f*n + i*k
-            n_state = f_t_d * n_state + i_t_d * k_t
-
-            # Output: o * (C @ q / max(|n^T @ q|, 1))
-            Cq = torch.einsum('bhde,bhe->bhd', h_state, q_t)
-            nq = torch.einsum('bhd,bhd->bh', n_state, q_t).unsqueeze(-1)
-            nq = torch.clamp(nq.abs(), min=1.0)
-            h_t = Cq / nq
-            outputs.append(h_t)
-
-        out = torch.stack(outputs, dim=2)  # [B, H, T, D]
-        out = rearrange(out, 'b h t d -> b t (h d)')
-        out = out * o
-
-        return out
-
-
-class MLSTMBlock(nn.Module):
-    """
-    ViL mLSTM block with Conv2D for QK spatial context.
-    Wraps mLSTM in a gated MLP structure.
-    """
-    def __init__(self, dim, conv_kernel=3, dropout=0.0):
-        super().__init__()
-        self.norm = nn.LayerNorm(dim)
-
-        # Pre-projection: expand to 3x for gate structure
-        self.pre_proj = nn.Linear(dim, dim * 3)
-
-        # Conv2D for spatial QK context (key ViL innovation)
-        self.conv = nn.Conv2d(dim, dim, kernel_size=conv_kernel,
-                              padding=conv_kernel // 2, groups=dim)  # depthwise
-
-        # mLSTM cell
-        self.mlstm = MLSTMCell(
-            input_dim=dim,
-            head_dim=dim // 4,  # 4 heads
-            num_heads=4
-        )
-
-    ...
-        """
-        x: [B, T, D] patch tokens
-        h, w: spatial dimensions for conv (sqrt(T) each for square images)
-        """
-        B, T, D = x.shape
-        residual = x
-        x = self.norm(x)
-
-        # Gate structure: split into B (gate), C (gate), h_tilde (input)
-        projected = self.pre_proj(x)  # [B, T, 3D]
-        gate_b, gate_c, h_tilde = projected.chunk(3, dim=-1)
-
-        # Apply spatial conv to h_tilde for local context
-        if h is not None and w is not None:
-            h_2d = rearrange(h_tilde, 'b (h w) d -> b d h w', h=h, w=w)
-            h_2d = self.conv(h_2d)
-            h_tilde = rearrange(h_2d, 'b d h w -> b (h w) d')
-
-        # Input gating
-        y = torch.sigmoid(gate_b) * h_tilde
-
-        # mLSTM
-        y = self.mlstm(y)
-
-        # Output gating
-        y = torch.sigmoid(gate_c) * y
-        y = self.out_proj(y)
-        y = self.dropout(y)
-
-        return residual + y
-
-
-class FFNBlock(nn.Module):
-    """SwiGLU feed-forward block"""
-    def __init__(self, dim, mult=4, dropout=0.0):
-        super().__init__()
-        hidden = int(dim * mult * 2 / 3)  # SwiGLU uses 2/3 factor
-        self.norm = nn.LayerNorm(dim)
-        self.w1 = nn.Linear(dim, hidden)
-        self.w2 = nn.Linear(dim, hidden)
-        self.w3 = nn.Linear(hidden, dim)
-        self.dropout = nn.Dropout(dropout)
-
-    def forward(self, x):
-        residual = x
-        x = self.norm(x)
-        return residual + self.dropout(self.w3(F.silu(self.w1(x)) * self.w2(x)))
-
-
-class VisionXLSTM(nn.Module):
-    """
-    ...
-    2. Alternating bidirectional mLSTM blocks
-    3. SwiGLU FFN after each mLSTM
-
-    Output: all patch tokens [B, num_patches, dim] for VLM projection
-    """
-    def __init__(self, config):
-        super().__init__()
-        self.config = config
-
-        self.
-        ...
-        self.
-        ...
-        self.blocks = nn.ModuleList()
-        self.ffns = nn.ModuleList()
-        for i in range(config.depth):
-            self.blocks.append(MLSTMBlock(
-                dim=config.dim,
-                conv_kernel=config.conv_kernel_size,
-                dropout=config.dropout
-            ))
-            self.ffns.append(FFNBlock(dim=config.dim, dropout=config.dropout))
-
-        self.final_norm = nn.LayerNorm(config.dim)
-
-    def forward_features(self, pixel_values):
-        """
-        Extract patch features for VLM projection.
-
-        Args:
-            pixel_values: [B, C, H, W] images
-        Returns:
-            [B, num_patches, dim] patch token features
-        """
-        x = self.patch_embed(pixel_values)  # [B, N, D]
-
-        for i, (block, ffn) in enumerate(zip(self.blocks, self.ffns)):
-            if self.config.bidirectional and i % 2 == 1:
-                # Even blocks (0-indexed odd): reverse scan direction
-                x = x.flip(1)
-                x = block(x, h=self.h, w=self.w)
-                x = ffn(x)
-                x = x.flip(1)
-            else:
-                # Odd blocks: forward scan
-                x = block(x, h=self.h, w=self.w)
-                x = ffn(x)
-
-        x = self.final_norm(x)
-        return x
-
-    def forward(self, pixel_values):
-        """Classification forward (bilateral concat pooling)"""
-        features = self.forward_features(pixel_values)
-        # Bilateral concat: first + last patch
-        pooled = torch.cat([features[:, 0], features[:, -1]], dim=-1)
-        return pooled
-
-
-class VisionProjector(nn.Module):
-    """
-    MLP projector: maps ViL features
-    Following LLaDA-V / LaViDa: 2-layer MLP with GELU.
-    """
-    def __init__(self, config):
-        super().__init__()
-        hidden_dim = config.lm_dim * config.hidden_mult
-
-        layers = []
-        layers.append(nn.Linear(config.vil_dim, hidden_dim))
-        layers.append(nn.GELU())
-        if config.dropout > 0:
-            layers.append(nn.Dropout(config.dropout))
-
-        for _ in range(config.num_layers - 1):
-            layers.
-            layers.append(nn.GELU())
-            if config.dropout > 0:
-                layers.append(nn.Dropout(config.dropout))
-
-        layers.append(nn.Linear(hidden_dim, config.lm_dim))
-        self.mlp = nn.Sequential(*layers)
-
-    def forward(self, vision_features):
-        """
-        Args:
-            vision_features: [B, num_patches, vil_dim]
-        Returns:
-            [B, num_patches, lm_dim]
-        """
-        return self.mlp(vision_features)
+"""
+Vision xLSTM adapter built on the upstream NX-AI vision-lstm repository.
+
+This module keeps the existing ViL-DLM contract:
+- `VisionXLSTM.forward_features(pixel_values)` returns patch tokens `[B, N, D]`
+- `VisionProjector` maps those visual tokens into the LM embedding space
+"""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+import os
+import ssl
+
+import certifi
+import torch
+import torch.nn as nn
+
+
+REPO_ROOT = Path(__file__).resolve().parents[1]
+VISION_LSTM_ROOT = REPO_ROOT / "external" / "vision-lstm"
+
+if str(VISION_LSTM_ROOT) not in sys.path:
+    sys.path.insert(0, str(VISION_LSTM_ROOT))
+
+from vision_lstm import VisionLSTM, VisionLSTM2  # noqa: E402
+
+
+VISION_BACKBONES = {
+    "vil-small": {
+        "ctor": VisionLSTM,
+        "preprocess": "v1",
+        "url": "https://ml.jku.at/research/vision_lstm/download/vil_small16_e400_in1k.th",
+        "kwargs": {
+            "dim": 384,
+            "depth": 24,
+            "legacy_norm": True,
+            "mode": None,
+            "pooling": None,
+            "output_shape": None,
+        },
+    },
+    "vil2-small": {
+        "ctor": VisionLSTM2,
+        "preprocess": "v2",
+        "url": "https://ml.jku.at/research/vision_lstm/download/vil2_small16_e400_in1k.th",
+        "kwargs": {
+            "dim": 384,
+            "depth": 12,
+            "legacy_norm": True,
+            "mode": "features",
+            "pooling": None,
+            "output_shape": None,
+            "conv_kind": "2d",
+            "conv_kernel_size": 3,
+            "norm_bias": True,
+            "proj_bias": True,
+        },
+    },
+}
+
+
+def _preprocess_v1_state_dict(state_dict: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
+    state_dict = {key.replace(".xlstm.", ".layer."): value for key, value in state_dict.items()}
+    state_dict = {key.replace("xlstm.", ""): value for key, value in state_dict.items()}
+    state_dict = {key.replace(".xlstm_norm.", ".norm."): value for key, value in state_dict.items()}
+    state_dict["legacy_norm.weight"] = state_dict.pop("post_blocks_norm.weight")
+    state_dict["norm.weight"] = state_dict.pop("head.0.weight")
+    state_dict["norm.bias"] = state_dict.pop("head.0.bias")
+    state_dict["head.weight"] = state_dict.pop("head.1.weight")
+    state_dict["head.bias"] = state_dict.pop("head.1.bias")
+    return state_dict
+
+
+def _preprocess_v2_state_dict(
+    state_dict: dict[str, torch.Tensor],
+    *,
+    depth: int,
+    legacy_norm: bool,
+) -> dict[str, torch.Tensor]:
+    state_dict = {key.replace(".xlstm.", ".layer."): value for key, value in state_dict.items()}
+    state_dict = {key.replace("xlstm.", ""): value for key, value in state_dict.items()}
+    state_dict = {key.replace(".xlstm_norm.", ".norm."): value for key, value in state_dict.items()}
+    state_dict = {key.replace(".conv1d.", ".conv."): value for key, value in state_dict.items()}
+    for index in range(depth * 2):
+        if index % 2 == 0:
+            state_dict = {
+                key.replace(f"blocks.{index}.", f"blocks.{index // 2}.rowwise_from_top_left."): value
+                for key, value in state_dict.items()
+            }
+        else:
+            state_dict = {
+                key.replace(f"blocks.{index}.", f"blocks.{index // 2}.rowwise_from_bot_right."): value
+                for key, value in state_dict.items()
+            }
+    state_dict["norm.weight"] = state_dict.pop("post_blocks_norm.weight")
+    state_dict["norm.bias"] = state_dict.pop("post_blocks_norm.bias")
+    if legacy_norm:
+        state_dict["legacy_norm.weight"] = state_dict.pop("head.0.weight")
+        state_dict["legacy_norm.bias"] = state_dict.pop("head.0.bias")
+    state_dict["head.weight"] = state_dict.pop("head.1.weight")
+    state_dict["head.bias"] = state_dict.pop("head.1.bias")
+    return state_dict
+
+
+def _load_pretrained_backbone(model: nn.Module, name: str, spec: dict) -> None:
+    os.environ.setdefault("SSL_CERT_FILE", certifi.where())
+    ssl._create_default_https_context = lambda: ssl.create_default_context(cafile=certifi.where())
+    payload = torch.hub.load_state_dict_from_url(spec["url"], map_location="cpu")
+    state_dict = payload["state_dict"]
+    if spec["preprocess"] == "v1":
+        state_dict = _preprocess_v1_state_dict(state_dict)
+    elif spec["preprocess"] == "v2":
+        state_dict = _preprocess_v2_state_dict(
+            state_dict,
+            depth=spec["kwargs"]["depth"],
+            legacy_norm=spec["kwargs"]["legacy_norm"],
+        )
+    else:
+        raise ValueError(f"Unsupported checkpoint preprocessing mode: {spec['preprocess']}")
+    if getattr(model, "head", None) is None:
+        state_dict.pop("head.weight", None)
+        state_dict.pop("head.bias", None)
+    model.load_state_dict(state_dict)
+
+
+class VisionXLSTM(nn.Module):
+    """
+    Thin adapter over upstream VisionLSTM / VisionLSTM2 models.
+
+    The default backbone is `vil2-small`, which matches the requested 384-dim
+    patch features while using the newer ViL v2 implementation.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        backbone_name = getattr(config, "vision_backbone", "vil2-small")
+        pretrained = getattr(config, "pretrained", True)
+        img_size = getattr(config, "img_size", 224)
+        patch_size = getattr(config, "patch_size", 16)
+        in_channels = getattr(config, "in_channels", 3)
+
+        if backbone_name not in VISION_BACKBONES:
+            supported = ", ".join(sorted(VISION_BACKBONES))
+            raise ValueError(f"Unsupported vision backbone '{backbone_name}'. Supported backbones: {supported}")
+
+        spec = VISION_BACKBONES[backbone_name]
+        ctor_kwargs = dict(spec["kwargs"])
+        ctor_kwargs["input_shape"] = (in_channels, img_size, img_size)
+        ctor_kwargs["patch_size"] = patch_size
+
+        self.config = config
+        self.backbone_name = backbone_name
+        self.model = spec["ctor"](**ctor_kwargs)
+        self.dim = ctor_kwargs["dim"]
+        self.num_patches = self.model.patch_embed.num_patches
+
+        if pretrained:
+            _load_pretrained_backbone(self.model, backbone_name, spec)
+
+    def forward_features(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        return self.model(pixel_values)
+
+    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        return self.forward_features(pixel_values)
+
+
+class VisionProjector(nn.Module):
+    """
+    MLP projector: maps ViL features -> LM embedding space.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        hidden_dim = config.lm_dim * config.hidden_mult
+
+        layers = [nn.Linear(config.vil_dim, hidden_dim), nn.GELU()]
+        if config.dropout > 0:
+            layers.append(nn.Dropout(config.dropout))
+
+        for _ in range(config.num_layers - 1):
+            layers.extend([nn.Linear(hidden_dim, hidden_dim), nn.GELU()])
+            if config.dropout > 0:
+                layers.append(nn.Dropout(config.dropout))
+
+        layers.append(nn.Linear(hidden_dim, config.lm_dim))
+        self.mlp = nn.Sequential(*layers)
+
+    def forward(self, vision_features: torch.Tensor) -> torch.Tensor:
+        return self.mlp(vision_features)
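
A short usage sketch of the adapter contract documented above (assumes the `external/vision-lstm` checkout is present; the projector dimensions are illustrative values, and `pretrained=False` avoids the checkpoint download):

    import torch
    from types import SimpleNamespace
    from vision_xlstm import VisionXLSTM, VisionProjector

    vil_cfg = SimpleNamespace(vision_backbone="vil2-small", pretrained=False,
                              img_size=224, patch_size=16, in_channels=3)
    # lm_dim / hidden_mult / num_layers below are made-up example values
    proj_cfg = SimpleNamespace(vil_dim=384, lm_dim=2048, hidden_mult=2,
                               num_layers=2, dropout=0.0)

    encoder = VisionXLSTM(vil_cfg)
    projector = VisionProjector(proj_cfg)

    pixels = torch.randn(1, 3, 224, 224)
    tokens = encoder.forward_features(pixels)  # expected [1, 196, 384] patch tokens
    embeds = projector(tokens)                 # expected [1, 196, 2048] LM-space tokens
    print(tokens.shape, embeds.shape)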
pyproject.toml
ADDED
@@ -0,0 +1,24 @@
+[project]
+name = "vil-dlm"
+version = "0.1.0"
+description = "Local smoke-test environment for ViL-DLM"
+requires-python = ">=3.11,<3.12"
+dependencies = [
+    "torch",
+    "torchvision",
+    "transformers",
+    "huggingface_hub",
+    "einops",
+    "numpy",
+    "pillow",
+]
+
+[dependency-groups]
+dev = [
+    "datasets",
+    "accelerate",
+    "trackio",
+]
+
+[tool.uv]
+package = false
train_production.py
CHANGED
@@ -1 +1,3 @@
-
+"""Compatibility stub for the real training entrypoint in `code/train_production.py`."""
+
+raise SystemExit("Use `python code/train_production.py ...` from the repo root.")