omar-ah committed on
Commit 0d77b0a · 1 Parent(s): 60684e7

Implement stage-aware real-run training pipeline

Files changed (5)
  1. README.md +18 -10
  2. code/model_config.py +21 -9
  3. code/train_production.py +1075 -549
  4. code/vil_dlm_model.py +46 -72
  5. pyproject.toml +1 -0
README.md CHANGED
@@ -48,9 +48,9 @@ pipeline_tag: image-text-to-text
48
  - **Key change from AR**: replaces causal attention mask with bidirectional padding-only mask
49
  - Weighted cross-entropy loss on masked positions only (MDLM objective)
50
 
51
- ### Knowledge Distillation (Planned Stage 3)
52
  - Teacher: [Gemma 4 E2B](https://huggingface.co/google/gemma-4-E2B-it) (5.1B params, ~2B effective)
53
- - **Decoupled Top-K Distillation** (from [LFM2](https://arxiv.org/abs/2511.23404)): only align top-32 teacher logits
54
  - Temperature τ=2.0, α_KD=0.5 (50% diffusion loss + 50% KD loss)
55
 
56
  ## Training Recipe
@@ -61,7 +61,7 @@ Multi-stage training inspired by LLaDA-V, LaViDa, LFM2, and Mistral/Pixtral:
61
  |-------|-------------------|---------|---------------|--------|
62
  | 1 | Projector only (ViL & LM frozen) | [LLaVA-Pretrain](https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain) (558K) | 1e-3 | 1-2 |
63
  | 2 | Full model (all components) | [The Cauldron](https://huggingface.co/datasets/HuggingFaceM4/the_cauldron) | ViL:2e-6, Proj:1e-5, LM:1e-5 | 3 |
64
- | 3 | + KD from Gemma 4 E2B | Mixed instruction data | + Top-K KD (α=0.5) | 2 |
65
 
66
  ### Efficiency Tricks Applied
67
  - **Per-component learning rates** (LLaDA-V recipe): vision encoder gets 5× lower LR
@@ -82,20 +82,28 @@ This is a genuinely **unexplored frontier** in the literature:
82
  ## Running Training
83
 
84
  ```bash
85
- # Stage 1: Projector alignment (2-4 hours on A10G)
86
- python train_production.py --stage 1 --epochs 2 --batch_size 4 --grad_accum 8
87
 
88
- # Stage 2: Full finetune (8-12 hours on A10G)
89
- python train_production.py --stage 2 --epochs 3 --batch_size 2 --grad_accum 16
90
 
91
- # Quick test (10 min, small subset)
92
- python train_production.py --stage 1 --epochs 1 --batch_size 2 --grad_accum 1 --max_samples 100
93
  ```
94
 
95
  ### Hardware Requirements
96
  - **Stage 1**: A10G (24GB) or T4 (16GB) — only projector gradients (~7M params)
97
  - **Stage 2**: A10G (24GB) recommended — full model gradients (~660M params)
98
- - **Stage 3**: A100 (80GB) recommended — teacher model (Gemma 4 E2B) + student
99
 
100
  ### Dependencies
101
  ```
 
48
  - **Key change from AR**: replaces causal attention mask with bidirectional padding-only mask
49
  - Weighted cross-entropy loss on masked positions only (MDLM objective; see the sketch below)
50
 
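To make the MDLM objective concrete, here is a minimal, self-contained sketch of the cosine masking step and the masked-positions-only cross-entropy, mirroring `MDLMScheduler.add_noise` and the loss in `ViLDLM.forward`; the shapes and the `mask_token_id` value are illustrative placeholders:

```python
import math
import torch
import torch.nn.functional as F

B, T, V = 2, 16, 1000      # illustrative batch, sequence, and vocab sizes
mask_token_id = 0          # placeholder; the model uses the tokenizer's pad token

input_ids = torch.randint(1, V, (B, T))
t = torch.rand(B)                                  # timestep ~ U(0, 1)
mask_ratio = 1.0 - torch.cos(t * math.pi / 2)      # cosine schedule, shape [B]
noise_mask = torch.rand(B, T) < mask_ratio.unsqueeze(1)
noisy_ids = input_ids.clone()
noisy_ids[noise_mask] = mask_token_id              # corrupt the sampled positions

logits = torch.randn(B, T, V)                      # stand-in for the model's output
per_token = F.cross_entropy(
    logits.reshape(-1, V), input_ids.reshape(-1), reduction="none"
).reshape(B, T)
# Weighted cross-entropy on masked positions only
loss = (per_token * noise_mask.float()).sum() / noise_mask.float().sum().clamp(min=1.0)
```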
51
+ ### Knowledge Distillation (Stage 3)
52
  - Teacher: [Gemma 4 E2B](https://huggingface.co/google/gemma-4-E2B-it) (5.1B params, ~2B effective)
53
+ - **Sparse cross-tokenizer distillation**: prepare a teacher-scored candidate bank in the student token space, then blend a sparse KL term with the diffusion loss (see the sketch below)
54
  - Temperature τ=2.0, α_KD=0.5 (50% diffusion loss + 50% KD loss)
55
 
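Below is a minimal sketch of how the blended Stage 3 loss could be assembled from a cached candidate bank of K teacher-scored token ids per supervised position. The function name and the gather-based sparse KL are illustrative assumptions, not the exact implementation:

```python
import torch
import torch.nn.functional as F

def sparse_kd_loss(
    student_logits: torch.Tensor,       # [B, T, V] student logits
    cand_ids: torch.Tensor,             # [B, T, K] candidate ids in the student vocab
    cand_teacher_logits: torch.Tensor,  # [B, T, K] cached teacher scores for those ids
    diffusion_loss: torch.Tensor,
    tau: float = 2.0,
    alpha_kd: float = 0.5,
) -> torch.Tensor:
    # Student log-probs, restricted to the teacher's candidate set
    student_lp = F.log_softmax(student_logits / tau, dim=-1).gather(-1, cand_ids)
    # Teacher distribution renormalized over the K candidates
    teacher_lp = F.log_softmax(cand_teacher_logits / tau, dim=-1)
    kd = F.kl_div(student_lp, teacher_lp, log_target=True, reduction="batchmean") * tau**2
    # alpha_kd = 0.5 gives the 50% diffusion / 50% KD blend
    return (1.0 - alpha_kd) * diffusion_loss + alpha_kd * kd
```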
56
  ## Training Recipe
 
61
  |-------|-------------------|---------|---------------|--------|
62
  | 1 | Projector only (ViL & LM frozen) | [LLaVA-Pretrain](https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain) (558K) | 1e-3 | 1-2 |
63
  | 2 | Full model (all components) | [The Cauldron](https://huggingface.co/datasets/HuggingFaceM4/the_cauldron) | ViL:2e-6, Proj:1e-5, LM:1e-5 | 3 |
64
+ | 3 | + KD from Gemma 4 E2B | Stage 2 data mix + cached teacher bank | Sparse cross-tokenizer KD (α=0.5) | 2 |
65
 
66
  ### Efficiency Tricks Applied
67
  - **Per-component learning rates** (LLaDA-V recipe): vision encoder gets a 5× lower LR (see the sketch below)
 
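A minimal sketch of the Stage 2 optimizer setup, mirroring `get_optimizer` in `code/train_production.py` (assumes a constructed `ViLDLM` instance named `model`):

```python
from torch.optim import AdamW

param_groups = [
    # Vision encoder: 5x lower LR than the projector / LM (LLaDA-V recipe)
    {"params": model.vision_encoder.parameters(), "lr": 2e-6},
    {"params": model.projector.parameters(), "lr": 1e-5},
    {"params": model.lm.parameters(), "lr": 1e-5},
]
optimizer = AdamW(param_groups, weight_decay=0.05, betas=(0.9, 0.999))
```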
82
  ## Running Training
83
 
84
  ```bash
85
+ # Stage 1: projector-only alignment
86
+ python code/train_production.py --stage 1 --require_cuda --epochs 1 --batch_size 8 --grad_accum 4
87
 
88
+ # Stage 2: full-model finetune on the balanced Cauldron mix
89
+ python code/train_production.py --stage 2 --require_cuda --epochs 3 --batch_size 2 --grad_accum 16
90
 
91
+ # Stage 3a: build the Gemma teacher candidate bank from a Stage 2 checkpoint
92
+ python code/train_production.py --stage 3a --require_cuda --resume_from ./vil-dlm-output/stage2_best --teacher_batch_size 2
93
+
94
+ # Stage 3b: sparse KD training from the cached teacher bank
95
+ python code/train_production.py --stage 3b --require_cuda --resume_from ./vil-dlm-output/stage2_best --epochs 2 --batch_size 2 --grad_accum 16
96
+
97
+ # Cheap validation gate for any stage
98
+ python code/train_production.py --stage 1 --require_cuda --dry_run_batches 1 --max_samples 8
99
  ```
100
 
101
+ Training now saves checkpoints locally by default. Add `--push_to_hub` only when you want to publish artifacts.
102
+
103
  ### Hardware Requirements
104
  - **Stage 1**: A10G (24GB) or T4 (16GB) — only projector gradients (~7M params)
105
  - **Stage 2**: A10G (24GB) recommended — full model gradients (~660M params)
106
+ - **Stage 3**: H100 / A100 (80GB) recommended — Gemma 4 teacher bank prep + student distillation
107
 
108
  ### Dependencies
109
  ```
code/model_config.py CHANGED
@@ -62,7 +62,9 @@ class DistillationConfig:
62
  temperature: float = 2.0 # KD temperature
63
  alpha_kd: float = 0.5 # weight for KD loss vs diffusion loss
64
  alpha_vision_kd: float = 0.3 # weight for vision feature distillation
65
- top_k_logits: int = 32 # LFM2-style top-K distillation
66
 
67
 
68
  @dataclass
@@ -96,34 +98,44 @@ class TrainingConfig:
96
  # Data
97
  pretrain_dataset: str = "liuhaotian/LLaVA-Pretrain" # Stage 1: 558K
98
  finetune_dataset: str = "HuggingFaceM4/the_cauldron" # Stage 2: rich multimodal
99
-
100
  # Output
101
  output_dir: str = "./vil-dlm-output"
102
  hub_model_id: str = "omar-ah/ViL-DLM-0.6B"
103
- push_to_hub: bool = True
104
 
105
  # Stages
106
- stage: int = 1 # 1 = projector only, 2 = full finetune, 3 = + distillation
107
 
108
 
109
- def get_config(stage: int = 1) -> TrainingConfig:
110
  config = TrainingConfig()
111
  config.stage = stage
112
 
113
- if stage == 1:
114
  # Stage 1: Train projector only (ViL frozen, LM frozen)
115
  config.learning_rate = 1e-3
116
  config.num_epochs = 1
117
  config.per_device_train_batch_size = 8
118
  config.gradient_accumulation_steps = 4
119
- elif stage == 2:
120
  # Stage 2: Full model finetune (ViL + projector + LM)
121
  config.learning_rate = 1e-5
122
  config.vil_learning_rate = 2e-6
123
  config.projector_learning_rate = 1e-5
124
  config.num_epochs = 3
125
- elif stage == 3:
126
- # Stage 3: + Distillation from Gemma 4
127
  config.learning_rate = 1e-5
128
  config.num_epochs = 2
129
  config.distillation.alpha_kd = 0.5
 
62
  temperature: float = 2.0 # KD temperature
63
  alpha_kd: float = 0.5 # weight for KD loss vs diffusion loss
64
  alpha_vision_kd: float = 0.3 # weight for vision feature distillation
65
+ kd_top_k: int = 8 # sparse cross-tokenizer candidate set size
66
+ kd_positions_per_sample: int = 16
67
+ teacher_cache_dir: str = "./vil-dlm-output/teacher-cache"
68
 
69
 
70
  @dataclass
 
98
  # Data
99
  pretrain_dataset: str = "liuhaotian/LLaVA-Pretrain" # Stage 1: 558K
100
  finetune_dataset: str = "HuggingFaceM4/the_cauldron" # Stage 2: rich multimodal
101
+ finetune_dataset_configs: List[str] = field(default_factory=lambda: [
102
+ "ai2d",
103
+ "vqav2",
104
+ "a_okvqa",
105
+ "textvqa",
106
+ "docvqa",
107
+ "chartqa",
108
+ "textcaps",
109
+ "screen2words",
110
+ ])
111
+
112
  # Output
113
  output_dir: str = "./vil-dlm-output"
114
  hub_model_id: str = "omar-ah/ViL-DLM-0.6B"
115
+ push_to_hub: bool = False
116
 
117
  # Stages
118
+ stage: str = "1" # 1, 2, 3a, 3b
119
 
120
 
121
+ def get_config(stage: str = "1") -> TrainingConfig:
122
  config = TrainingConfig()
123
  config.stage = stage
124
 
125
+ if stage == "1":
126
  # Stage 1: Train projector only (ViL frozen, LM frozen)
127
  config.learning_rate = 1e-3
128
  config.num_epochs = 1
129
  config.per_device_train_batch_size = 8
130
  config.gradient_accumulation_steps = 4
131
+ elif stage == "2":
132
  # Stage 2: Full model finetune (ViL + projector + LM)
133
  config.learning_rate = 1e-5
134
  config.vil_learning_rate = 2e-6
135
  config.projector_learning_rate = 1e-5
136
  config.num_epochs = 3
137
+ elif stage in {"3a", "3b"}:
138
+ # Stage 3: sparse cross-tokenizer distillation with Gemma 4
139
  config.learning_rate = 1e-5
140
  config.num_epochs = 2
141
  config.distillation.alpha_kd = 0.5
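
For reference, `get_config` is now keyed by stage strings rather than integers; a short usage sketch (the flat import path assumes the script runs from `code/`):

```python
from model_config import get_config

config = get_config("3b")                      # valid stages: "1", "2", "3a", "3b"
print(config.distillation.kd_top_k)            # 8 candidate tokens per position
print(config.distillation.teacher_cache_dir)   # ./vil-dlm-output/teacher-cache
print(config.finetune_dataset_configs[:3])     # ['ai2d', 'vqav2', 'a_okvqa']
```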
code/train_production.py CHANGED
@@ -1,42 +1,61 @@
1
  """
2
- ViL-DLM Production Training Script
3
- Runs on HF Jobs with GPU
4
 
5
- Stage 1: Train projector only (ViL frozen, LM frozen) on LLaVA-Pretrain
6
- Stage 2: Full finetune on multimodal instruction data
7
  """
8
 
9
- import os
10
- import sys
11
- import math
12
  import json
13
  import time
14
- import argparse
15
  from pathlib import Path
16
- from typing import Dict, Optional
17
18
  import torch
19
  import torch.nn as nn
20
  import torch.nn.functional as F
21
- from torch.utils.data import Dataset, DataLoader
22
  from torch.optim import AdamW
23
  from torch.optim.lr_scheduler import CosineAnnealingLR
24
 
25
- import numpy as np
26
- from PIL import Image
27
- from io import BytesIO
28
- from datasets import load_dataset
29
- from transformers import AutoTokenizer, AutoModelForMaskedLM
30
- from huggingface_hub import HfApi, snapshot_download
31
- from vision_xlstm import VisionProjector as UpstreamVisionProjector, VisionXLSTM as UpstreamVisionXLSTM
32
 
33
- import trackio
34
 
35
- # ============================================================
36
- # 1. Model Config
37
- # ============================================================
38
 
39
- from dataclasses import dataclass, field
40
 
41
  @dataclass
42
  class ViLConfig:
@@ -50,9 +69,9 @@ class ViLConfig:
50
  conv_kernel_size: int = 3
51
  bidirectional: bool = True
52
  dropout: float = 0.0
53
-
54
  @property
55
- def num_patches(self):
56
  return (self.img_size // self.patch_size) ** 2
57
 
58
 
@@ -66,10 +85,10 @@ class ProjConfig:
66
 
67
 
68
  class _TrackioShim:
69
- def __init__(self):
70
  self.enabled = False
71
 
72
- def init(self, name: str, project: str = "vil-dlm"):
73
  try:
74
  trackio.init(name=name, project=project)
75
  self.enabled = True
@@ -77,7 +96,7 @@ class _TrackioShim:
77
  self.enabled = False
78
  print(f"Trackio disabled: {exc}")
79
 
80
- def log(self, payload: dict):
81
  if not self.enabled:
82
  return
83
  try:
@@ -86,610 +105,1097 @@ class _TrackioShim:
86
  self.enabled = False
87
  print(f"Trackio logging disabled after error: {exc}")
88
 
89
- # ============================================================
90
- # 2. Vision xLSTM Implementation
91
- # ============================================================
92
-
93
- class PatchEmbedding(nn.Module):
94
- def __init__(self, img_size=224, patch_size=16, in_channels=3, embed_dim=384):
95
- super().__init__()
96
- self.num_patches = (img_size // patch_size) ** 2
97
- self.proj = nn.Conv2d(in_channels, embed_dim, kernel_size=patch_size, stride=patch_size)
98
- self.pos_embed = nn.Parameter(torch.zeros(1, self.num_patches, embed_dim))
99
- nn.init.trunc_normal_(self.pos_embed, std=0.02)
100
-
101
- def forward(self, x):
102
- x = self.proj(x).flatten(2).transpose(1, 2)
103
- return x + self.pos_embed
104
-
105
-
106
- class MLSTMCell(nn.Module):
107
- """mLSTM with matrix memory and exponential gating"""
108
- def __init__(self, input_dim, head_dim, num_heads=4):
109
- super().__init__()
110
- self.head_dim = head_dim
111
- self.num_heads = num_heads
112
- self.total_dim = head_dim * num_heads
113
- self.scale = 1.0 / math.sqrt(head_dim)
114
-
115
- self.W_q = nn.Linear(input_dim, self.total_dim, bias=True)
116
- self.W_k = nn.Linear(input_dim, self.total_dim, bias=True)
117
- self.W_v = nn.Linear(input_dim, self.total_dim, bias=True)
118
- self.w_f = nn.Linear(input_dim, num_heads, bias=True)
119
- self.w_i = nn.Linear(input_dim, num_heads, bias=True)
120
- self.w_o = nn.Linear(input_dim, self.total_dim, bias=True)
121
-
122
- def forward(self, x):
123
- B, T, D = x.shape
124
-
125
- q = self.W_q(x).view(B, T, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
126
- k = (self.W_k(x) * self.scale).view(B, T, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
127
- v = self.W_v(x).view(B, T, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
128
- o = torch.sigmoid(self.w_o(x))
129
-
130
- log_f = F.logsigmoid(self.w_f(x)).permute(0, 2, 1) # [B, H, T]
131
- log_i = self.w_i(x).permute(0, 2, 1) # [B, H, T]
132
-
133
- decay = torch.exp(log_f) # [B, H, T]
134
- gate = torch.exp(log_i) # [B, H, T]
135
-
136
- h_state = torch.zeros(B, self.num_heads, self.head_dim, self.head_dim,
137
- device=x.device, dtype=x.dtype)
138
- n_state = torch.zeros(B, self.num_heads, self.head_dim,
139
- device=x.device, dtype=x.dtype)
140
-
141
- outputs = []
142
- for t in range(T):
143
- f_t = decay[:, :, t].unsqueeze(-1)
144
- i_t = gate[:, :, t].unsqueeze(-1)
145
- k_t = k[:, :, t, :]
146
- v_t = v[:, :, t, :]
147
- q_t = q[:, :, t, :]
148
-
149
- h_state = f_t.unsqueeze(-1) * h_state + i_t.unsqueeze(-1) * torch.einsum('bhd,bhe->bhde', v_t, k_t)
150
- n_state = f_t * n_state + i_t * k_t
151
-
152
- Cq = torch.einsum('bhde,bhe->bhd', h_state, q_t)
153
- nq = torch.einsum('bhd,bhd->bh', n_state, q_t).unsqueeze(-1).abs().clamp(min=1.0)
154
- outputs.append(Cq / nq)
155
-
156
- out = torch.stack(outputs, dim=2) # [B, H, T, D]
157
- out = out.permute(0, 2, 1, 3).reshape(B, T, self.total_dim)
158
- return out * o
159
-
160
-
161
- class MLSTMBlock(nn.Module):
162
- def __init__(self, dim, conv_kernel=3, dropout=0.0):
163
- super().__init__()
164
- self.norm = nn.LayerNorm(dim)
165
- self.pre_proj = nn.Linear(dim, dim * 3)
166
- self.conv = nn.Conv2d(dim, dim, kernel_size=conv_kernel, padding=conv_kernel // 2, groups=dim)
167
- self.mlstm = MLSTMCell(dim, dim // 4, num_heads=4)
168
- self.out_proj = nn.Linear(dim, dim)
169
- self.dropout = nn.Dropout(dropout)
170
-
171
- def forward(self, x, h=None, w=None):
172
- B, T, D = x.shape
173
- residual = x
174
- x = self.norm(x)
175
- gate_b, gate_c, h_tilde = self.pre_proj(x).chunk(3, dim=-1)
176
-
177
- if h is not None and w is not None:
178
- h_2d = h_tilde.transpose(1, 2).view(B, D, h, w)
179
- h_2d = self.conv(h_2d)
180
- h_tilde = h_2d.view(B, D, T).transpose(1, 2)
181
-
182
- y = torch.sigmoid(gate_b) * h_tilde
183
- y = self.mlstm(y)
184
- y = torch.sigmoid(gate_c) * y
185
- return residual + self.dropout(self.out_proj(y))
186
-
187
-
188
- class FFNBlock(nn.Module):
189
- def __init__(self, dim, mult=4, dropout=0.0):
190
- super().__init__()
191
- hidden = int(dim * mult * 2 / 3)
192
- self.norm = nn.LayerNorm(dim)
193
- self.w1 = nn.Linear(dim, hidden)
194
- self.w2 = nn.Linear(dim, hidden)
195
- self.w3 = nn.Linear(hidden, dim)
196
- self.dropout = nn.Dropout(dropout)
197
-
198
- def forward(self, x):
199
- r = x
200
- x = self.norm(x)
201
- return r + self.dropout(self.w3(F.silu(self.w1(x)) * self.w2(x)))
202
-
203
-
204
- class VisionXLSTM(nn.Module):
205
- def __init__(self, config):
206
- super().__init__()
207
- self.config = config
208
- self.patch_embed = PatchEmbedding(config.img_size, config.patch_size, config.in_channels, config.dim)
209
- self.h = config.img_size // config.patch_size
210
- self.w = config.img_size // config.patch_size
211
-
212
- self.blocks = nn.ModuleList()
213
- self.ffns = nn.ModuleList()
214
- for _ in range(config.depth):
215
- self.blocks.append(MLSTMBlock(config.dim, config.conv_kernel_size, config.dropout))
216
- self.ffns.append(FFNBlock(config.dim, dropout=config.dropout))
217
- self.final_norm = nn.LayerNorm(config.dim)
218
-
219
- def forward_features(self, pixel_values):
220
- x = self.patch_embed(pixel_values)
221
- for i, (block, ffn) in enumerate(zip(self.blocks, self.ffns)):
222
- if self.config.bidirectional and i % 2 == 1:
223
- x = x.flip(1)
224
- x = block(x, h=self.h, w=self.w)
225
- x = ffn(x)
226
- x = x.flip(1)
227
- else:
228
- x = block(x, h=self.h, w=self.w)
229
- x = ffn(x)
230
- return self.final_norm(x)
231
-
232
-
233
- class VisionProjector(nn.Module):
234
- def __init__(self, config):
235
- super().__init__()
236
- hidden_dim = config.lm_dim * config.hidden_mult
237
- layers = [nn.Linear(config.vil_dim, hidden_dim), nn.GELU()]
238
- for _ in range(config.num_layers - 1):
239
- layers.extend([nn.Linear(hidden_dim, hidden_dim), nn.GELU()])
240
- layers.append(nn.Linear(hidden_dim, config.lm_dim))
241
- self.mlp = nn.Sequential(*layers)
242
-
243
- def forward(self, x):
244
- return self.mlp(x)
245
-
246
-
247
- # ============================================================
248
- # 3. MDLM Scheduler & ViL-DLM Model
249
- # ============================================================
250
 
251
  class MDLMScheduler:
252
- def __init__(self, mask_token_id=151643):
253
  self.mask_token_id = mask_token_id
254
-
255
- def add_noise(self, input_ids, t):
256
- B, T = input_ids.shape
257
  mask_ratio = 1.0 - torch.cos(t * math.pi / 2)
258
- mask_ratio = mask_ratio.unsqueeze(1).expand(B, T)
259
- mask = torch.rand(B, T, device=input_ids.device) < mask_ratio
260
  noisy_ids = input_ids.clone()
261
  noisy_ids[mask] = self.mask_token_id
262
  return noisy_ids, mask
263
-
264
- def sample_timesteps(self, batch_size, device):
265
  return torch.rand(batch_size, device=device)
266
 
267
 
268
  class ViLDLM(nn.Module):
269
- def __init__(self, vil_config, proj_config, lm_path):
270
  super().__init__()
271
  self.vil_config = vil_config
272
  self.vision_encoder = UpstreamVisionXLSTM(vil_config)
273
  self.projector = UpstreamVisionProjector(proj_config)
274
- self.scheduler = MDLMScheduler()
275
- self.num_patches = vil_config.num_patches
276
-
277
- # Load diffusion LM
278
- print(f"Loading diffusion LM from {lm_path}...")
279
  self.lm = AutoModelForMaskedLM.from_pretrained(
280
- lm_path, trust_remote_code=True, torch_dtype=torch.bfloat16
281
  )
282
  self.tokenizer = AutoTokenizer.from_pretrained(lm_path, trust_remote_code=True)
283
- lm_params = sum(p.numel() for p in self.lm.parameters())
284
- print(f"Loaded LM: {lm_params/1e6:.1f}M params")
285
-
286
- def forward(self, pixel_values, input_ids, attention_mask, labels=None):
287
- B, T = input_ids.shape
288
- device = input_ids.device
289
- if labels is None:
290
- labels = input_ids.clone()
291
-
292
- # Diffusion: mask tokens
293
- t = self.scheduler.sample_timesteps(B, device)
294
- noisy_ids, noise_mask = self.scheduler.add_noise(input_ids, t)
295
-
296
- # Encode image
297
  vision_features = self.vision_encoder.forward_features(pixel_values)
298
  visual_tokens = self.projector(vision_features)
299
-
300
- # Get text embeddings
301
- text_embeds = self.lm.model.embed_tokens(noisy_ids)
302
  visual_tokens = visual_tokens.to(dtype=text_embeds.dtype)
303
-
304
- # Concat [vision | text]
305
  inputs_embeds = torch.cat([visual_tokens, text_embeds], dim=1)
306
- vis_mask = torch.ones(B, self.num_patches, device=device, dtype=attention_mask.dtype)
307
- full_mask = torch.cat([vis_mask, attention_mask], dim=1)
308
-
309
- # Forward through LM
310
- outputs = self.lm(inputs_embeds=inputs_embeds, attention_mask=full_mask)
311
- text_logits = outputs.logits[:, self.num_patches:, :]
312
-
313
- # MDLM loss on masked positions only
314
- loss_mask = noise_mask.float()
315
- if loss_mask.sum() == 0:
316
  loss = torch.tensor(0.0, device=device, requires_grad=True)
317
  else:
318
  logits_flat = text_logits.reshape(-1, text_logits.shape[-1])
319
  labels_flat = labels.reshape(-1)
320
- loss_flat = F.cross_entropy(logits_flat, labels_flat, reduction='none').reshape(B, T)
321
- loss = (loss_flat * loss_mask).sum() / loss_mask.sum()
322
-
323
- return {'loss': loss, 'logits': text_logits, 'noise_mask': noise_mask, 't': t}
324
-
325
- def freeze_vision(self):
326
- for p in self.vision_encoder.parameters():
327
- p.requires_grad = False
328
-
329
- def freeze_lm(self):
330
- for p in self.lm.parameters():
331
- p.requires_grad = False
332
-
333
- def unfreeze_all(self):
334
- for p in self.parameters():
335
- p.requires_grad = True
336
-
337
- def count_params(self):
338
  vil = sum(p.numel() for p in self.vision_encoder.parameters())
339
  proj = sum(p.numel() for p in self.projector.parameters())
340
  lm = sum(p.numel() for p in self.lm.parameters())
341
- train = sum(p.numel() for p in self.parameters() if p.requires_grad)
342
- return {'vil': vil, 'proj': proj, 'lm': lm, 'total': vil+proj+lm, 'trainable': train}
343
344
 
345
- # ============================================================
346
- # 4. Dataset
347
- # ============================================================
348
 
349
- class LLaVAPretrainDataset(Dataset):
350
- def __init__(self, tokenizer, max_length=512, img_size=224, max_samples=None):
351
- print("Loading LLaVA-Pretrain dataset...")
352
- self.dataset_root = None
353
- try:
354
- self.data = load_dataset("liuhaotian/LLaVA-Pretrain", split="train")
355
- except Exception as exc:
356
- print(f"Primary dataset loader failed ({exc}). Falling back to direct JSON loading...")
357
- self.dataset_root = snapshot_download(
358
- "liuhaotian/LLaVA-Pretrain",
359
- repo_type="dataset",
360
- allow_patterns=["blip_laion_cc_sbu_558k.json", "images.zip"],
361
- )
362
- json_path = os.path.join(self.dataset_root, "blip_laion_cc_sbu_558k.json")
363
- self.data = load_dataset("json", data_files={"train": json_path}, split="train")
364
- if max_samples:
365
- self.data = self.data.select(range(min(max_samples, len(self.data))))
366
- print(f"Loaded {len(self.data)} samples")
367
  self.tokenizer = tokenizer
368
  self.max_length = max_length
369
  self.img_size = img_size
370
- self.mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
371
- self.std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
372
-
373
- def __len__(self):
374
- return len(self.data)
375
-
376
- def __getitem__(self, idx):
377
- sample = self.data[idx]
378
-
379
- # Image
380
- try:
381
- img = sample['image']
382
- if isinstance(img, str):
383
- candidate_paths = [img]
384
- if self.dataset_root and not os.path.isabs(img):
385
- candidate_paths.extend([
386
- os.path.join(self.dataset_root, img),
387
- os.path.join(self.dataset_root, "images", img),
388
- ])
389
- image_path = next((path for path in candidate_paths if os.path.exists(path)), img)
390
- img = Image.open(image_path).convert('RGB')
391
- elif isinstance(img, dict) and 'bytes' in img:
392
- img = Image.open(BytesIO(img['bytes'])).convert('RGB')
393
- elif not isinstance(img, Image.Image):
394
- img = Image.new('RGB', (self.img_size, self.img_size), (128, 128, 128))
395
- else:
396
- img = img.convert('RGB')
397
- img = img.resize((self.img_size, self.img_size), Image.BICUBIC)
398
- arr = np.array(img).astype(np.float32) / 255.0
399
- pv = torch.from_numpy(arr).permute(2, 0, 1)
400
- pv = (pv - self.mean) / self.std
401
- except Exception:
402
- pv = torch.zeros(3, self.img_size, self.img_size)
403
-
404
- # Text from conversations
405
  text = ""
406
- if 'conversations' in sample:
407
  parts = []
408
- for turn in sample['conversations']:
409
- val = turn.get('value', '').replace('<image>\n', '').replace('<image>', '').strip()
410
  if val:
411
  parts.append(val)
412
- text = ' '.join(parts)
413
- elif sample.get('blip_caption'):
414
- text = sample['blip_caption'].strip()
415
  if not text:
416
  text = "Describe this image."
417
-
418
- tokens = self.tokenizer(text, max_length=self.max_length, padding='max_length',
419
- truncation=True, return_tensors='pt')
420
-
421
  return {
422
- 'pixel_values': pv,
423
- 'input_ids': tokens['input_ids'].squeeze(0),
424
- 'attention_mask': tokens['attention_mask'].squeeze(0),
425
- 'labels': tokens['input_ids'].squeeze(0).clone(),
 
426
  }
427
 
428
 
429
- # ============================================================
430
- # 5. Training Loop
431
- # ============================================================
432
 
433
- def train(args):
434
- tracker = _TrackioShim()
435
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
436
  print(f"Device: {device}")
437
  if torch.cuda.is_available():
438
- print(f"GPU: {torch.cuda.get_device_name()}")
439
  print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
440
-
441
- # Download dLLM model
442
- print("Downloading dLLM Qwen3-0.6B diffusion model...")
443
- lm_path = snapshot_download('dllm-hub/Qwen3-0.6B-diffusion-mdlm-v0.1')
444
-
445
- # Fix the modeling file (remove dllm import in __main__)
446
- modeling_file = os.path.join(lm_path, "modeling_qwen3.py")
447
- with open(modeling_file, 'r') as f:
448
- content = f.read()
449
- # Replace the __main__ block that imports dllm
450
- content = content.replace(
451
- 'if __name__ == "__main__":\n import dllm',
452
- 'if __name__ == "__main__":\n pass\n # import dllm'
453
  )
454
- # Fix attention_type compatibility
455
- content = content.replace(
456
- 'attention_mask=causal_mask_mapping[decoder_layer.attention_type]',
457
- 'attention_mask=causal_mask_mapping.get(getattr(decoder_layer, "attention_type", "full_attention"), causal_mask_mapping.get("full_attention"))'
458
  )
459
- with open(modeling_file, 'w') as f:
460
- f.write(content)
461
- print(f"Model downloaded to {lm_path}")
462
-
463
- # Build model
464
  vil_config = ViLConfig()
465
  proj_config = ProjConfig()
466
  model = ViLDLM(vil_config, proj_config, lm_path)
467
-
468
- # Stage setup
469
- if args.stage == 1:
470
- print("\n=== STAGE 1: Projector-only training ===")
471
- model.freeze_vision()
472
- model.freeze_lm()
473
- elif args.stage == 2:
474
- print("\n=== STAGE 2: Full finetune ===")
475
- model.unfreeze_all()
476
-
477
  params = model.count_params()
478
  print(f"Parameters: Total={params['total']/1e6:.1f}M, Trainable={params['trainable']/1e6:.1f}M")
479
  print(f" ViL: {params['vil']/1e6:.1f}M, Proj: {params['proj']/1e6:.1f}M, LM: {params['lm']/1e6:.1f}M")
480
-
481
  model = model.to(device)
482
-
483
- # Enable gradient checkpointing for LM
484
- if hasattr(model.lm, 'gradient_checkpointing_enable'):
485
  model.lm.gradient_checkpointing_enable()
486
-
487
- # Dataset
488
- dataset = LLaVAPretrainDataset(
489
- tokenizer=model.tokenizer,
490
- max_length=args.max_length,
491
- img_size=224,
492
- max_samples=args.max_samples,
493
- )
494
-
495
- dataloader = DataLoader(
496
- dataset, batch_size=args.batch_size, shuffle=True,
497
- num_workers=4, pin_memory=True, drop_last=True,
498
  )
499
-
500
- # Optimizer with per-component LR
501
- param_groups = []
502
- if args.stage == 1:
503
- param_groups = [{'params': [p for p in model.projector.parameters() if p.requires_grad],
504
- 'lr': 1e-3}]
505
- else:
506
- param_groups = [
507
- {'params': [p for p in model.vision_encoder.parameters() if p.requires_grad], 'lr': 2e-6},
508
- {'params': [p for p in model.projector.parameters() if p.requires_grad], 'lr': 1e-5},
509
- {'params': [p for p in model.lm.parameters() if p.requires_grad], 'lr': 1e-5},
510
- ]
511
- param_groups = [g for g in param_groups if len(g['params']) > 0]
512
-
513
- optimizer = AdamW(param_groups, weight_decay=0.05, betas=(0.9, 0.999))
514
- total_steps = len(dataloader) * args.epochs // args.grad_accum
515
- scheduler = CosineAnnealingLR(optimizer, T_max=max(total_steps, 1), eta_min=1e-6)
516
-
517
- # Trackio
518
  tracker.init(name=f"vil-dlm-stage{args.stage}")
519
-
520
- # Training loop
 
521
  global_step = 0
522
- best_loss = float('inf')
523
-
524
  for epoch in range(args.epochs):
525
  model.train()
526
- epoch_loss = 0
527
  num_batches = 0
528
-
 
529
  for batch_idx, batch in enumerate(dataloader):
530
- pv = batch['pixel_values'].to(device)
531
- ids = batch['input_ids'].to(device)
532
- mask = batch['attention_mask'].to(device)
533
- labels = batch['labels'].to(device)
534
-
535
- outputs = model(pixel_values=pv, input_ids=ids, attention_mask=mask, labels=labels)
536
- loss = outputs['loss'] / args.grad_accum
537
  loss.backward()
538
-
539
  if (batch_idx + 1) % args.grad_accum == 0:
540
  torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
541
  optimizer.step()
542
  scheduler.step()
543
- optimizer.zero_grad()
544
  global_step += 1
545
-
546
- actual_loss = loss.item() * args.grad_accum
547
- mask_ratio = outputs['noise_mask'].float().mean().item()
548
- lr = optimizer.param_groups[0]['lr']
549
-
550
- if global_step % 5 == 0:
551
- print(f"[E{epoch}] Step {global_step}/{total_steps} | "
552
- f"Loss: {actual_loss:.4f} | LR: {lr:.2e} | Mask: {mask_ratio:.1%}")
553
-
554
- tracker.log({
555
- 'train/loss': actual_loss,
556
- 'train/lr': lr,
557
- 'train/mask_ratio': mask_ratio,
558
- 'train/epoch': epoch,
559
- 'train/step': global_step,
560
- })
561
-
562
- epoch_loss += loss.item() * args.grad_accum
563
- num_batches += 1
564
-
565
- avg_loss = epoch_loss / max(num_batches, 1)
566
- print(f"\n[Epoch {epoch}] Average Loss: {avg_loss:.4f}\n")
567
- tracker.log({'train/epoch_loss': avg_loss, 'train/epoch': epoch})
568
-
569
- # Save checkpoint
570
- if avg_loss < best_loss:
571
- best_loss = avg_loss
572
- save_dir = os.path.join(args.output_dir, f"stage{args.stage}_best")
573
- os.makedirs(save_dir, exist_ok=True)
574
- torch.save(model.vision_encoder.state_dict(), os.path.join(save_dir, "vision_encoder.pt"))
575
- torch.save(model.projector.state_dict(), os.path.join(save_dir, "projector.pt"))
576
- if args.stage >= 2:
577
- model.lm.save_pretrained(os.path.join(save_dir, "diffusion_lm"))
578
- print(f"Saved best checkpoint (loss={best_loss:.4f})")
579
-
580
- # Push to Hub
581
- print("\nPushing to Hub...")
582
- api = HfApi()
583
- repo_id = args.hub_model_id
584
-
585
- try:
586
- api.create_repo(repo_id, exist_ok=True, private=False)
587
- except Exception as e:
588
- print(f"Repo note: {e}")
589
-
590
- save_dir = os.path.join(args.output_dir, f"stage{args.stage}_best")
591
-
592
- # Save config + README
593
- config_dict = {
594
- 'architecture': 'ViL-DLM',
595
- 'components': {
596
- 'vision_encoder': 'Vision-xLSTM-S (ViL-S)',
597
- 'projector': '2-layer MLP',
598
- 'diffusion_lm': 'dLLM Qwen3-0.6B MDLM',
599
- },
600
- 'vil_dim': 384,
601
- 'lm_dim': 1024,
602
- 'num_patches': 196,
603
- 'training_stage': args.stage,
604
- 'best_loss': best_loss,
605
- 'total_params_M': params['total'] / 1e6,
606
- 'trainable_params_M': params['trainable'] / 1e6,
607
- 'based_on': [
608
- 'Vision-LSTM (arxiv:2406.04303)',
609
- 'dLLM (arxiv:2602.22661)',
610
- 'LLaDA-V (arxiv:2505.16933)',
611
- 'LFM2 (arxiv:2511.23404)',
612
- ],
613
- 'teacher': 'google/gemma-4-E2B-it (planned for stage 3)',
614
- }
615
- with open(os.path.join(save_dir, "model_config.json"), 'w') as f:
616
- json.dump(config_dict, f, indent=2)
617
-
618
- readme = f"""---
619
- license: apache-2.0
620
- tags:
621
- - vision-language
622
- - diffusion
623
- - xlstm
624
- - vision-lstm
625
- - masked-diffusion
626
- - mdlm
627
- language: en
628
- pipeline_tag: image-text-to-text
629
- ---
630
-
631
- # ViL-DLM: Vision xLSTM Diffusion Language Model
632
-
633
- **The first vision-language model combining Vision xLSTM with a diffusion language backbone.**
634
-
635
- ## Architecture
636
-
637
- | Component | Model | Params |
638
- |-----------|-------|--------|
639
- | Vision Encoder | **Vision-xLSTM-S (ViL-S)** | ~57M |
640
- | Projector | 2-layer MLP (GELU) | ~7M |
641
- | Language Backbone | **dLLM Qwen3-0.6B (MDLM)** | ~596M |
642
- | **Total** | | **~660M** |
643
-
644
- ### Why This Combination?
645
 
646
- 1. **ViL (Vision xLSTM)** — O(N) linear complexity vision encoder vs ViT's O(N²). Uses alternating bidirectional mLSTM blocks with exponential gating and Conv2D for spatial context. Based on [arxiv:2406.04303](https://arxiv.org/abs/2406.04303).
647
 
648
- 2. **Diffusion Language Model** — Non-autoregressive text generation via masked denoising. Bidirectional attention enables richer contextual understanding. Based on [dLLM/MDLM](https://arxiv.org/abs/2602.22661).
649
 
650
- 3. **Knowledge Distillation** (Stage 3) — Planned distillation from [Gemma 4 E2B](https://huggingface.co/google/gemma-4-E2B-it) using LFM2-style Decoupled Top-K distillation.
651
-
652
- ## Training Recipe
653
-
654
- Inspired by LLaDA-V, LaViDa, LFM2, and Mistral/Pixtral:
655
-
656
- | Stage | What's Trained | Dataset | LR |
657
- |-------|---------------|---------|-----|
658
- | 1 | Projector only | LLaVA-Pretrain (558K) | 1e-3 |
659
- | 2 | Full model | The Cauldron (multimodal) | ViL:2e-6, Proj:1e-5, LM:1e-5 |
660
- | 3 | + KD from Gemma 4 E2B | Mixed | + Top-K KD (α=0.5, T=2, K=32) |
661
 
662
- **Current stage: {args.stage} | Best loss: {best_loss:.4f}**
 
663
 
664
- ## Novelty
665
 
666
- This is (to our knowledge) the **first published model** combining:
667
- - Vision xLSTM as a vision encoder in a VLM
668
- - A discrete masked diffusion language model backbone
669
- - Multi-stage training with knowledge distillation from an AR multimodal teacher
670
 
671
- ## References
672
 
673
- - [Vision-LSTM](https://arxiv.org/abs/2406.04303) — Alkin et al., 2024
674
- - [dLLM](https://arxiv.org/abs/2602.22661) — Berkeley, 2025
675
- - [MDLM](https://arxiv.org/abs/2406.07524) — Kuleshov group, NeurIPS 2024
676
- - [LLaDA-V](https://arxiv.org/abs/2505.16933) — GSAI-ML, 2025
677
- - [LFM2](https://arxiv.org/abs/2511.23404) — Liquid AI, 2025
678
- - [Gemma 4](https://huggingface.co/google/gemma-4-E2B-it) — Google, 2026
679
- """
680
-
681
- with open(os.path.join(save_dir, "README.md"), 'w') as f:
682
- f.write(readme)
683
-
684
- api.upload_folder(folder_path=save_dir, repo_id=repo_id,
685
- commit_message=f"Stage {args.stage} training (loss={best_loss:.4f})")
686
- print(f"\n✅ Model pushed to https://huggingface.co/{repo_id}")
687
  print("Training complete!")
688
 
689
 
690
- if __name__ == "__main__":
691
  parser = argparse.ArgumentParser()
692
- parser.add_argument("--stage", type=int, default=1)
693
  parser.add_argument("--epochs", type=int, default=2)
694
  parser.add_argument("--batch_size", type=int, default=4)
695
  parser.add_argument("--grad_accum", type=int, default=8)
@@ -697,6 +1203,26 @@ if __name__ == "__main__":
697
  parser.add_argument("--max_samples", type=int, default=None)
698
  parser.add_argument("--output_dir", type=str, default="./vil-dlm-output")
699
  parser.add_argument("--hub_model_id", type=str, default="omar-ah/ViL-DLM-0.6B")
700
  args = parser.parse_args()
701
-
702
- train(args)
 
 
1
  """
2
+ ViL-DLM production training script.
 
3
 
4
+ Stages:
5
+ 1 - projector-only alignment on LLaVA-Pretrain
6
+ 2 - full-model finetune on The Cauldron
7
+ 3a - offline teacher candidate-bank preparation with Gemma 4 E2B-it
8
+ 3b - sparse cross-tokenizer distillation training using cached teacher targets
9
  """
10
 
11
+ import argparse
12
+ import hashlib
 
13
  import json
14
+ import math
15
+ import os
16
  import time
17
+ from collections import defaultdict
18
+ from dataclasses import dataclass
19
+ from io import BytesIO
20
  from pathlib import Path
21
+ from typing import Dict, Iterable, List, Optional, Sequence, Tuple
22
 
23
+ import numpy as np
24
  import torch
25
  import torch.nn as nn
26
  import torch.nn.functional as F
27
+ import trackio
28
+ from datasets import Dataset as HFDataset
29
+ from datasets import concatenate_datasets, load_dataset
30
+ from huggingface_hub import HfApi, snapshot_download
31
+ from PIL import Image
32
  from torch.optim import AdamW
33
  from torch.optim.lr_scheduler import CosineAnnealingLR
34
+ from torch.utils.data import DataLoader, Dataset
35
+ from transformers import (
36
+ AutoModelForImageTextToText,
37
+ AutoModelForMaskedLM,
38
+ AutoProcessor,
39
+ AutoTokenizer,
40
+ )
41
 
42
+ from vision_xlstm import (
43
+ VisionProjector as UpstreamVisionProjector,
44
+ VisionXLSTM as UpstreamVisionXLSTM,
45
+ )
46
 
 
47
 
48
+ DEFAULT_CAULDRON_CONFIGS = [
49
+ "ai2d",
50
+ "vqav2",
51
+ "a_okvqa",
52
+ "textvqa",
53
+ "docvqa",
54
+ "chartqa",
55
+ "textcaps",
56
+ "screen2words",
57
+ ]
58
 
 
59
 
60
  @dataclass
61
  class ViLConfig:
 
69
  conv_kernel_size: int = 3
70
  bidirectional: bool = True
71
  dropout: float = 0.0
72
+
73
  @property
74
+ def num_patches(self) -> int:
75
  return (self.img_size // self.patch_size) ** 2
76
 
77
 
 
85
 
86
 
87
  class _TrackioShim:
88
+ def __init__(self) -> None:
89
  self.enabled = False
90
 
91
+ def init(self, name: str, project: str = "vil-dlm") -> None:
92
  try:
93
  trackio.init(name=name, project=project)
94
  self.enabled = True
 
96
  self.enabled = False
97
  print(f"Trackio disabled: {exc}")
98
 
99
+ def log(self, payload: dict) -> None:
100
  if not self.enabled:
101
  return
102
  try:
 
105
  self.enabled = False
106
  print(f"Trackio logging disabled after error: {exc}")
107
 
108
 
109
  class MDLMScheduler:
110
+ def __init__(self, mask_token_id: int) -> None:
111
  self.mask_token_id = mask_token_id
112
+
113
+ def add_noise(self, input_ids: torch.Tensor, t: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
114
+ batch, length = input_ids.shape
115
  mask_ratio = 1.0 - torch.cos(t * math.pi / 2)
116
+ mask_ratio = mask_ratio.unsqueeze(1).expand(batch, length)
117
+ mask = torch.rand(batch, length, device=input_ids.device) < mask_ratio
118
  noisy_ids = input_ids.clone()
119
  noisy_ids[mask] = self.mask_token_id
120
  return noisy_ids, mask
121
+
122
+ def sample_timesteps(self, batch_size: int, device: torch.device) -> torch.Tensor:
123
  return torch.rand(batch_size, device=device)
124
 
125
 
126
  class ViLDLM(nn.Module):
127
+ def __init__(self, vil_config: ViLConfig, proj_config: ProjConfig, lm_path: str) -> None:
128
  super().__init__()
129
  self.vil_config = vil_config
130
  self.vision_encoder = UpstreamVisionXLSTM(vil_config)
131
  self.projector = UpstreamVisionProjector(proj_config)
132
  self.lm = AutoModelForMaskedLM.from_pretrained(
133
+ lm_path,
134
+ trust_remote_code=True,
135
+ torch_dtype=torch.bfloat16,
136
  )
137
  self.tokenizer = AutoTokenizer.from_pretrained(lm_path, trust_remote_code=True)
138
+ if self.tokenizer.pad_token_id is None:
139
+ self.tokenizer.pad_token = self.tokenizer.eos_token
140
+ self.scheduler = MDLMScheduler(mask_token_id=self.tokenizer.pad_token_id)
141
+
142
+ @property
143
+ def num_patches(self) -> int:
144
+ return self.vil_config.num_patches
145
+
146
+ def prepare_multimodal_inputs(
147
+ self,
148
+ pixel_values: torch.Tensor,
149
+ input_ids: torch.Tensor,
150
+ attention_mask: torch.Tensor,
151
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
152
  vision_features = self.vision_encoder.forward_features(pixel_values)
153
  visual_tokens = self.projector(vision_features)
154
+ text_embeds = self.lm.model.embed_tokens(input_ids)
155
  visual_tokens = visual_tokens.to(dtype=text_embeds.dtype)
156
  inputs_embeds = torch.cat([visual_tokens, text_embeds], dim=1)
157
+ vis_mask = torch.ones(
158
+ pixel_values.shape[0],
159
+ self.num_patches,
160
+ device=attention_mask.device,
161
+ dtype=attention_mask.dtype,
162
+ )
163
+ full_attention_mask = torch.cat([vis_mask, attention_mask], dim=1)
164
+ return inputs_embeds, full_attention_mask
165
+
166
+ def predict_clean_logits(
167
+ self,
168
+ pixel_values: torch.Tensor,
169
+ input_ids: torch.Tensor,
170
+ attention_mask: torch.Tensor,
171
+ ) -> torch.Tensor:
172
+ inputs_embeds, full_attention_mask = self.prepare_multimodal_inputs(
173
+ pixel_values=pixel_values,
174
+ input_ids=input_ids,
175
+ attention_mask=attention_mask,
176
+ )
177
+ outputs = self.lm(inputs_embeds=inputs_embeds, attention_mask=full_attention_mask)
178
+ return outputs.logits[:, self.num_patches :, :]
179
+
180
+ def forward(
181
+ self,
182
+ pixel_values: torch.Tensor,
183
+ input_ids: torch.Tensor,
184
+ attention_mask: torch.Tensor,
185
+ labels: Optional[torch.Tensor] = None,
186
+ loss_mask: Optional[torch.Tensor] = None,
187
+ ) -> Dict[str, torch.Tensor]:
188
+ batch_size, seq_len = input_ids.shape
189
+ device = input_ids.device
190
+ if labels is None:
191
+ labels = input_ids.clone()
192
+ if loss_mask is None:
193
+ loss_mask = attention_mask
194
+
195
+ t = self.scheduler.sample_timesteps(batch_size, device)
196
+ noisy_ids, noise_mask = self.scheduler.add_noise(input_ids, t)
197
+ inputs_embeds, full_attention_mask = self.prepare_multimodal_inputs(
198
+ pixel_values=pixel_values,
199
+ input_ids=noisy_ids,
200
+ attention_mask=attention_mask,
201
+ )
202
+ outputs = self.lm(inputs_embeds=inputs_embeds, attention_mask=full_attention_mask)
203
+ text_logits = outputs.logits[:, self.num_patches :, :]
204
+
205
+ active_mask = noise_mask.float() * loss_mask.float()
206
+ if active_mask.sum() == 0:
207
  loss = torch.tensor(0.0, device=device, requires_grad=True)
208
  else:
209
  logits_flat = text_logits.reshape(-1, text_logits.shape[-1])
210
  labels_flat = labels.reshape(-1)
211
+ per_token = F.cross_entropy(logits_flat, labels_flat, reduction="none").reshape(batch_size, seq_len)
212
+ loss = (per_token * active_mask).sum() / active_mask.sum()
213
+
214
+ return {
215
+ "loss": loss,
216
+ "logits": text_logits,
217
+ "noise_mask": noise_mask,
218
+ "t": t,
219
+ }
220
+
221
+ def freeze_vision(self) -> None:
222
+ for param in self.vision_encoder.parameters():
223
+ param.requires_grad = False
224
+
225
+ def freeze_lm(self) -> None:
226
+ for param in self.lm.parameters():
227
+ param.requires_grad = False
228
+
229
+ def unfreeze_all(self) -> None:
230
+ for param in self.parameters():
231
+ param.requires_grad = True
232
+
233
+ def count_params(self) -> Dict[str, int]:
234
  vil = sum(p.numel() for p in self.vision_encoder.parameters())
235
  proj = sum(p.numel() for p in self.projector.parameters())
236
  lm = sum(p.numel() for p in self.lm.parameters())
237
+ trainable = sum(p.numel() for p in self.parameters() if p.requires_grad)
238
+ return {"vil": vil, "proj": proj, "lm": lm, "total": vil + proj + lm, "trainable": trainable}
239
 
240
+ def save_checkpoint(self, save_dir: Path, include_lm: bool) -> None:
241
+ save_dir.mkdir(parents=True, exist_ok=True)
242
+ torch.save(self.vision_encoder.state_dict(), save_dir / "vision_encoder.pt")
243
+ torch.save(self.projector.state_dict(), save_dir / "projector.pt")
244
+ if include_lm:
245
+ self.lm.save_pretrained(save_dir / "diffusion_lm")
246
+ self.tokenizer.save_pretrained(save_dir / "diffusion_lm")
247
 
248
+ def load_checkpoint(self, checkpoint_dir: Path, include_lm: bool) -> None:
249
+ vision_path = checkpoint_dir / "vision_encoder.pt"
250
+ projector_path = checkpoint_dir / "projector.pt"
251
+ if vision_path.exists():
252
+ self.vision_encoder.load_state_dict(torch.load(vision_path, map_location="cpu"))
253
+ if projector_path.exists():
254
+ self.projector.load_state_dict(torch.load(projector_path, map_location="cpu"))
255
+ if include_lm:
256
+ diffusion_dir = checkpoint_dir / "diffusion_lm"
257
+ if diffusion_dir.exists():
258
+ self.lm = AutoModelForMaskedLM.from_pretrained(
259
+ diffusion_dir,
260
+ trust_remote_code=True,
261
+ torch_dtype=torch.bfloat16,
262
+ )
263
+ self.tokenizer = AutoTokenizer.from_pretrained(diffusion_dir, trust_remote_code=True)
264
+ if self.tokenizer.pad_token_id is None:
265
+ self.tokenizer.pad_token = self.tokenizer.eos_token
266
+ self.scheduler = MDLMScheduler(mask_token_id=self.tokenizer.pad_token_id)
267
 
268
+
269
+ def ensure_hf_cache_root() -> None:
270
+ os.environ.setdefault("HF_HOME", "/teamspace/studios/this_studio/.cache/huggingface")
271
+
272
+
273
+ def patch_diffusion_modeling_file(lm_path: str) -> None:
274
+ modeling_file = os.path.join(lm_path, "modeling_qwen3.py")
275
+ with open(modeling_file, "r", encoding="utf-8") as handle:
276
+ content = handle.read()
277
+ content = content.replace(
278
+ 'if __name__ == "__main__":\n import dllm',
279
+ 'if __name__ == "__main__":\n pass\n # import dllm',
280
+ )
281
+ content = content.replace(
282
+ "attention_mask=causal_mask_mapping[decoder_layer.attention_type]",
283
+ 'attention_mask=causal_mask_mapping.get(getattr(decoder_layer, "attention_type", "full_attention"), causal_mask_mapping.get("full_attention"))',
284
+ )
285
+ with open(modeling_file, "w", encoding="utf-8") as handle:
286
+ handle.write(content)
287
+
288
+
289
+ def download_student_backbone() -> str:
290
+ print("Downloading dLLM Qwen3-0.6B diffusion model...")
291
+ lm_path = snapshot_download("dllm-hub/Qwen3-0.6B-diffusion-mdlm-v0.1")
292
+ patch_diffusion_modeling_file(lm_path)
293
+ print(f"Model downloaded to {lm_path}")
294
+ return lm_path
295
+
296
+
297
+ def parse_dataset_configs(dataset_configs: Optional[str]) -> List[str]:
298
+ if dataset_configs:
299
+ return [item.strip() for item in dataset_configs.split(",") if item.strip()]
300
+ return list(DEFAULT_CAULDRON_CONFIGS)
301
+
302
+
303
+ def stable_text_hash(*parts: str) -> str:
304
+ joined = "\n".join(parts)
305
+ return hashlib.sha1(joined.encode("utf-8")).hexdigest()
306
+
307
+
308
+ def build_prompt_prefix(prompt_text: str) -> str:
309
+ return f"User: {prompt_text.strip()}\nAssistant:"
310
+
311
+
312
+ def tokenize_prompt_and_target(
313
+ tokenizer: AutoTokenizer,
314
+ prompt_text: str,
315
+ target_text: str,
316
+ max_length: int,
317
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
318
+ prefix_text = build_prompt_prefix(prompt_text)
319
+ prefix_ids = tokenizer(prefix_text, add_special_tokens=True)["input_ids"]
320
+ target_ids = tokenizer(" " + target_text.strip(), add_special_tokens=False)["input_ids"]
321
+ if not target_ids:
322
+ target_ids = tokenizer(" " + "N/A", add_special_tokens=False)["input_ids"][:1]
323
+
324
+ max_prefix_len = max_length - 1
325
+ if len(prefix_ids) > max_prefix_len:
326
+ prefix_ids = prefix_ids[:max_prefix_len]
327
+
328
+ remaining = max_length - len(prefix_ids)
329
+ if remaining <= 0:
330
+ prefix_ids = prefix_ids[: max_length - 1]
331
+ remaining = 1
332
+ target_ids = target_ids[:remaining]
333
+ if not target_ids:
334
+ prefix_ids = prefix_ids[: max_length - 1]
335
+ target_ids = tokenizer(" " + target_text.strip(), add_special_tokens=False)["input_ids"][:1]
336
+
337
+ input_ids = prefix_ids + target_ids
338
+ loss_mask = [0] * len(prefix_ids) + [1] * len(target_ids)
339
+ attention_mask = [1] * len(input_ids)
340
+ labels = list(input_ids)
341
+
342
+ pad_token_id = tokenizer.pad_token_id
343
+ if pad_token_id is None:
344
+ pad_token_id = tokenizer.eos_token_id
345
+
346
+ pad_len = max_length - len(input_ids)
347
+ if pad_len > 0:
348
+ input_ids = input_ids + [pad_token_id] * pad_len
349
+ attention_mask = attention_mask + [0] * pad_len
350
+ labels = labels + [pad_token_id] * pad_len
351
+ loss_mask = loss_mask + [0] * pad_len
352
+
353
+ return (
354
+ torch.tensor(input_ids, dtype=torch.long),
355
+ torch.tensor(attention_mask, dtype=torch.long),
356
+ torch.tensor(labels, dtype=torch.long),
357
+ torch.tensor(loss_mask, dtype=torch.float32),
358
+ )
359
+
360
+
361
+ def preprocess_image_for_student(img: object, img_size: int) -> Tuple[torch.Tensor, Image.Image]:
362
+ if isinstance(img, str):
363
+ img = Image.open(img).convert("RGB")
364
+ elif isinstance(img, dict) and "bytes" in img:
365
+ img = Image.open(BytesIO(img["bytes"])).convert("RGB")
366
+ elif isinstance(img, Image.Image):
367
+ img = img.convert("RGB")
368
+ else:
369
+ img = Image.new("RGB", (img_size, img_size), (128, 128, 128))
370
+
371
+ pil_image = img
372
+ resized = pil_image.resize((img_size, img_size), Image.BICUBIC)
373
+ arr = np.array(resized).astype(np.float32) / 255.0
374
+ tensor = torch.from_numpy(arr).permute(2, 0, 1)
375
+ mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
376
+ std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
377
+ tensor = (tensor - mean) / std
378
+ return tensor, pil_image
379
+
380
+
381
+ class NormalizedVisionLanguageDataset(Dataset):
382
+ def __init__(
383
+ self,
384
+ records: HFDataset,
385
+ tokenizer: AutoTokenizer,
386
+ max_length: int,
387
+ img_size: int,
388
+ ) -> None:
389
+ self.records = records
390
  self.tokenizer = tokenizer
391
  self.max_length = max_length
392
  self.img_size = img_size
393
+
394
+ def __len__(self) -> int:
395
+ return len(self.records)
396
+
397
+ def __getitem__(self, idx: int) -> Dict[str, object]:
398
+ sample = self.records[int(idx)]
399
+ pixel_values, pil_image = preprocess_image_for_student(sample["image"], self.img_size)
400
+ input_ids, attention_mask, labels, loss_mask = tokenize_prompt_and_target(
401
+ tokenizer=self.tokenizer,
402
+ prompt_text=sample["prompt_text"],
403
+ target_text=sample["target_text"],
404
+ max_length=self.max_length,
405
+ )
406
+ return {
407
+ "pixel_values": pixel_values,
408
+ "input_ids": input_ids,
409
+ "attention_mask": attention_mask,
410
+ "labels": labels,
411
+ "loss_mask": loss_mask,
412
+ "sample_id": sample["sample_id"],
413
+ "prompt_text": sample["prompt_text"],
414
+ "target_text": sample["target_text"],
415
+ "source_config": sample.get("source_config", "unknown"),
416
+ "pil_image": pil_image,
417
+ }
418
+
419
+
420
+ def build_llava_records(max_samples: Optional[int]) -> HFDataset:
421
+ print("Loading LLaVA-Pretrain dataset...")
422
+ dataset_root = None
423
+ try:
424
+ data = load_dataset("liuhaotian/LLaVA-Pretrain", split="train")
425
+ except Exception as exc:
426
+ print(f"Primary dataset loader failed ({exc}). Falling back to direct JSON loading...")
427
+ dataset_root = snapshot_download(
428
+ "liuhaotian/LLaVA-Pretrain",
429
+ repo_type="dataset",
430
+ allow_patterns=["blip_laion_cc_sbu_558k.json", "images.zip"],
431
+ )
432
+ json_path = os.path.join(dataset_root, "blip_laion_cc_sbu_558k.json")
433
+ data = load_dataset("json", data_files={"train": json_path}, split="train")
434
+ if max_samples:
435
+ data = data.select(range(min(max_samples, len(data))))
436
+
437
+ def normalize(sample: Dict[str, object], idx: int) -> Dict[str, object]:
438
  text = ""
439
+ if "conversations" in sample:
440
  parts = []
441
+ for turn in sample["conversations"]:
442
+ val = turn.get("value", "").replace("<image>\n", "").replace("<image>", "").strip()
443
  if val:
444
  parts.append(val)
445
+ text = " ".join(parts)
446
+ elif sample.get("blip_caption"):
447
+ text = sample["blip_caption"].strip()
448
  if not text:
449
  text = "Describe this image."
450
+
451
+ image_obj = sample.get("image")
452
+ if isinstance(image_obj, str) and dataset_root and not os.path.isabs(image_obj):
453
+ candidate_paths = [
454
+ image_obj,
455
+ os.path.join(dataset_root, image_obj),
456
+ os.path.join(dataset_root, "images", image_obj),
457
+ ]
458
+ image_obj = next((path for path in candidate_paths if os.path.exists(path)), image_obj)
459
+
460
  return {
461
+ "image": image_obj,
462
+ "prompt_text": "Describe this image.",
463
+ "target_text": text,
464
+ "sample_id": f"llava-pretrain:{sample.get('id', idx)}",
465
+ "source_config": "llava_pretrain",
466
  }
467
 
468
+ records = [normalize(data[i], i) for i in range(len(data))]
469
+ normalized = HFDataset.from_list(records)
470
+ print(f"Loaded {len(normalized)} LLaVA samples")
471
+ return normalized
472
 
 
 
474
+ def build_cauldron_records(configs: Sequence[str], max_samples: Optional[int]) -> Tuple[HFDataset, Dict[str, Dict[str, int]]]:
475
+ normalized_configs: List[HFDataset] = []
476
+ skip_stats: Dict[str, Dict[str, int]] = {}
477
+ per_config_limit = None
478
+ if max_samples:
479
+ per_config_limit = max(1, max_samples // max(len(configs), 1))
480
+
481
+ for config_name in configs:
482
+ print(f"Loading The Cauldron config: {config_name}")
483
+ ds = load_dataset("HuggingFaceM4/the_cauldron", config_name, split="train")
484
+ stats = defaultdict(int)
485
+
486
+ def explode(batch: Dict[str, List[object]], indices: List[int]) -> Dict[str, List[object]]:
487
+ output = {
488
+ "image": [],
489
+ "prompt_text": [],
490
+ "target_text": [],
491
+ "sample_id": [],
492
+ "source_config": [],
493
+ }
494
+ for local_idx, row_idx in enumerate(indices):
495
+ images = batch["images"][local_idx]
496
+ texts = batch["texts"][local_idx]
497
+ if not images or len(images) != 1:
498
+ stats["multi_or_missing_image"] += 1
499
+ continue
500
+ if not texts:
501
+ stats["missing_turns"] += 1
502
+ continue
503
+ for turn_idx, turn in enumerate(texts):
504
+ user_text = (turn.get("user") or "").strip()
505
+ assistant_text = (turn.get("assistant") or "").strip()
506
+ if not user_text or not assistant_text:
507
+ stats["missing_user_or_assistant"] += 1
508
+ continue
509
+ output["image"].append(images[0])
510
+ output["prompt_text"].append(user_text)
511
+ output["target_text"].append(assistant_text)
512
+ output["sample_id"].append(f"{config_name}:{row_idx}:{turn_idx}")
513
+ output["source_config"].append(config_name)
514
+ stats["kept"] += 1
515
+ return output
516
+
517
+ exploded = ds.map(
518
+ explode,
519
+ batched=True,
520
+ with_indices=True,
521
+ remove_columns=ds.column_names,
522
+ desc=f"Normalizing {config_name}",
523
+ )
524
+ if per_config_limit is not None:
525
+ exploded = exploded.select(range(min(per_config_limit, len(exploded))))
526
+ normalized_configs.append(exploded)
527
+ skip_stats[config_name] = dict(stats)
528
+ print(f"{config_name}: kept={stats['kept']} skipped={sum(v for k, v in stats.items() if k != 'kept')}")
529
+
530
+ if not normalized_configs:
531
+ raise RuntimeError("No valid The Cauldron configs were loaded.")
532
+
533
+ combined = concatenate_datasets(normalized_configs)
534
+ if max_samples:
535
+ combined = combined.select(range(min(max_samples, len(combined))))
536
+ print(f"Loaded {len(combined)} normalized The Cauldron samples")
537
+ return combined, skip_stats
538
+
539
+
540
+ def collate_vision_language(batch: List[Dict[str, object]]) -> Dict[str, object]:
541
+ return {
542
+ "pixel_values": torch.stack([sample["pixel_values"] for sample in batch]),
543
+ "input_ids": torch.stack([sample["input_ids"] for sample in batch]),
544
+ "attention_mask": torch.stack([sample["attention_mask"] for sample in batch]),
545
+ "labels": torch.stack([sample["labels"] for sample in batch]),
546
+ "loss_mask": torch.stack([sample["loss_mask"] for sample in batch]),
547
+ "sample_id": [sample["sample_id"] for sample in batch],
548
+ "prompt_text": [sample["prompt_text"] for sample in batch],
549
+ "target_text": [sample["target_text"] for sample in batch],
550
+ "source_config": [sample["source_config"] for sample in batch],
551
+ "pil_image": [sample["pil_image"] for sample in batch],
552
+ }
553
+
554
+
555
+ def create_stage_dataset(stage: str, tokenizer: AutoTokenizer, args: argparse.Namespace) -> Tuple[NormalizedVisionLanguageDataset, Dict[str, Dict[str, int]]]:
+     if stage == "1":
+         return NormalizedVisionLanguageDataset(
+             records=build_llava_records(args.max_samples),
+             tokenizer=tokenizer,
+             max_length=args.max_length,
+             img_size=224,
+         ), {}
+
+     configs = parse_dataset_configs(args.dataset_configs)
+     records, skip_stats = build_cauldron_records(configs, args.max_samples)
+     return NormalizedVisionLanguageDataset(
+         records=records,
+         tokenizer=tokenizer,
+         max_length=args.max_length,
+         img_size=224,
+     ), skip_stats
+
+
+ def build_dataloader(
+     dataset: Dataset,
+     batch_size: int,
+     shuffle: bool,
+     num_workers: int,
+     persistent_workers: bool,
+ ) -> DataLoader:
+     return DataLoader(
+         dataset,
+         batch_size=batch_size,
+         shuffle=shuffle,
+         num_workers=num_workers,
+         pin_memory=torch.cuda.is_available(),
+         persistent_workers=persistent_workers and num_workers > 0,
+         drop_last=False,
+         collate_fn=collate_vision_language,
+     )
+
+
+ def print_device_info(device: torch.device) -> None:
      print(f"Device: {device}")
      if torch.cuda.is_available():
+         print(f"GPU: {torch.cuda.get_device_name(0)}")
          print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
+         print(f"torch.version.cuda: {torch.version.cuda}")
+
+
+ def ensure_runtime_requirements(args: argparse.Namespace) -> None:
+     if args.require_cuda and not torch.cuda.is_available():
+         raise RuntimeError("CUDA is required for this run but torch.cuda.is_available() is False.")
+     if args.stage in {"2", "3a", "3b"} and not parse_dataset_configs(args.dataset_configs):
+         raise RuntimeError("Stage 2/3 requires at least one The Cauldron config.")
+     if args.stage in {"3a", "3b"} and not args.teacher_cache_dir:
+         raise RuntimeError("Stage 3 requires --teacher_cache_dir.")
+     if args.stage in {"3a", "3b"} and not args.resume_from:
+         raise RuntimeError("Stage 3 requires --resume_from pointing to a Stage 2 checkpoint.")
+     if args.stage == "3a":
+         try:
+             import bitsandbytes  # noqa: F401
+         except ImportError as exc:
+             raise RuntimeError("Stage 3a requires bitsandbytes in the active environment.") from exc
+
+
+ def maybe_resume_model(model: ViLDLM, args: argparse.Namespace) -> None:
+     if not args.resume_from:
+         return
+     checkpoint_dir = Path(args.resume_from)
+     if not checkpoint_dir.exists():
+         raise FileNotFoundError(f"Checkpoint directory not found: {checkpoint_dir}")
+     include_lm = args.stage in {"2", "3a", "3b"}
+     print(f"Resuming from checkpoint: {checkpoint_dir}")
+     model.load_checkpoint(checkpoint_dir, include_lm=include_lm)
+
+
+ def get_optimizer(model: ViLDLM, stage: str) -> AdamW:
+     if stage == "1":
+         groups = [
+             {
+                 "params": [p for p in model.projector.parameters() if p.requires_grad],
+                 "lr": 1e-3,
+             }
+         ]
+     else:
+         groups = [
+             {
+                 "params": [p for p in model.vision_encoder.parameters() if p.requires_grad],
+                 "lr": 2e-6,
+             },
+             {
+                 "params": [p for p in model.projector.parameters() if p.requires_grad],
+                 "lr": 1e-5,
+             },
+             {
+                 "params": [p for p in model.lm.parameters() if p.requires_grad],
+                 "lr": 1e-5,
+             },
+         ]
+     groups = [group for group in groups if group["params"]]
+     return AdamW(groups, weight_decay=0.05, betas=(0.9, 0.999))
+
+
+ def setup_model_for_stage(model: ViLDLM, stage: str) -> None:
+     if stage == "1":
+         print("\n=== STAGE 1: Projector-only alignment ===")
+         model.freeze_vision()
+         model.freeze_lm()
+     elif stage in {"2", "3b"}:
+         label = "Full finetune" if stage == "2" else "Sparse KD finetune"
+         print(f"\n=== STAGE {stage.upper()}: {label} ===")
+         model.unfreeze_all()
+     elif stage == "3a":
+         print("\n=== STAGE 3A: Teacher candidate-bank preparation ===")
+         # Inference-only stage: freeze everything (no need to unfreeze first).
+         for param in model.parameters():
+             param.requires_grad = False
+     else:
+         raise ValueError(f"Unsupported stage: {stage}")
+
+
+ def compute_sparse_kd_loss(
+     student_logits: torch.Tensor,
+     noise_mask: torch.Tensor,
+     sample_ids: Sequence[str],
+     bank_map: Dict[str, List[Dict[str, object]]],
+     temperature: float,
+ ) -> Tuple[torch.Tensor, int]:
+     entries_used = 0
+     losses: List[torch.Tensor] = []
+     for batch_idx, sample_id in enumerate(sample_ids):
+         sample_entries = bank_map.get(sample_id, [])
+         for entry in sample_entries:
+             position = int(entry["position"])
+             if position >= student_logits.shape[1]:
+                 continue
+             if not bool(noise_mask[batch_idx, position].item()):
+                 continue
+             candidate_ids = torch.tensor(
+                 entry["candidate_token_ids"],
+                 device=student_logits.device,
+                 dtype=torch.long,
+             )
+             teacher_probs = torch.tensor(
+                 entry["teacher_probs"],
+                 device=student_logits.device,
+                 dtype=student_logits.dtype,
+             )
+             gathered = student_logits[batch_idx, position, candidate_ids]
+             student_log_probs = F.log_softmax(gathered / temperature, dim=-1)
+             loss = F.kl_div(
+                 student_log_probs.unsqueeze(0),
+                 teacher_probs.unsqueeze(0),
+                 reduction="batchmean",
+             ) * (temperature ** 2)
+             losses.append(loss)
+             entries_used += 1
+
+     if not losses:
+         return torch.tensor(0.0, device=student_logits.device), 0
+     return torch.stack(losses).mean(), entries_used
+
+
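+ # Per bank entry at position p with candidate set C, the term above is
+ #     τ² · KL( teacher_probs(C) ‖ softmax(student_logits[b, p, C] / τ) ),
+ # and the batch loss is the mean over all entries that landed on
+ # currently-masked positions (where noise_mask is True).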
+ def compute_teacher_logprobs(
+     teacher: AutoModelForImageTextToText,
+     processor: AutoProcessor,
+     pil_image: Image.Image,
+     prompt_text: str,
+     candidate_texts: Sequence[str],
+     teacher_batch_size: int,
+ ) -> torch.Tensor:
+     prompt_messages = [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "image", "image": pil_image},
+                 {"type": "text", "text": prompt_text},
+             ],
+         }
+     ]
+     prompt_inputs = processor.apply_chat_template(
+         prompt_messages,
+         tokenize=True,
+         return_dict=True,
+         return_tensors="pt",
+         add_generation_prompt=True,
      )
+     prompt_len = prompt_inputs["input_ids"].shape[1]
+
+     teacher_device = next(teacher.parameters()).device
+     all_logprobs = []
+     for start in range(0, len(candidate_texts), max(teacher_batch_size, 1)):
+         batch_candidates = candidate_texts[start : start + max(teacher_batch_size, 1)]
+         conversations = []
+         for candidate_text in batch_candidates:
+             conversations.append(
+                 [
+                     {
+                         "role": "user",
+                         "content": [
+                             {"type": "image", "image": pil_image},
+                             {"type": "text", "text": prompt_text},
+                         ],
+                     },
+                     {
+                         "role": "assistant",
+                         "content": [{"type": "text", "text": candidate_text}],
+                     },
+                 ]
+             )
+
+         batch_inputs = processor.apply_chat_template(
+             conversations,
+             tokenize=True,
+             return_dict=True,
+             return_tensors="pt",
+             padding=True,
+             add_generation_prompt=False,
+         )
+         batch_inputs = {key: value.to(teacher_device) for key, value in batch_inputs.items()}
+         outputs = teacher(**batch_inputs)
+         logits = outputs.logits[:, :-1, :]
+         labels = batch_inputs["input_ids"][:, 1:].clone()
+         attention_mask = batch_inputs["attention_mask"]
+
+         seq_len = batch_inputs["input_ids"].shape[1]
+         for batch_idx in range(labels.shape[0]):
+             valid_len = int(attention_mask[batch_idx].sum().item())
+             left_pad = seq_len - valid_len
+             prefix_cut = left_pad + prompt_len - 1
+             if prefix_cut > 0:
+                 labels[batch_idx, :prefix_cut] = -100
+             labels[batch_idx, attention_mask[batch_idx, 1:] == 0] = -100
+
+         per_token = F.cross_entropy(
+             logits.reshape(-1, logits.shape[-1]),
+             labels.reshape(-1),
+             ignore_index=-100,
+             reduction="none",
+         ).reshape(labels.shape)
+         token_mask = (labels != -100).float()
+         all_logprobs.append(-(per_token * token_mask).sum(dim=-1).cpu())
+
+     return torch.cat(all_logprobs, dim=0)
+
+
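+ # The score returned above is the teacher's summed log-probability of the
+ # assistant turn: positions covered by the prompt (plus left padding) are
+ # masked to -100, so only candidate answer tokens contribute. The -1 in
+ # prefix_cut accounts for labels being shifted one step right of logits.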
+ def choose_distillation_positions(
+     clean_logits: torch.Tensor,
+     labels: torch.Tensor,
+     loss_mask: torch.Tensor,
+     max_positions: int,
+ ) -> List[int]:
+     valid_positions = torch.nonzero(loss_mask > 0, as_tuple=False).flatten()
+     if valid_positions.numel() == 0:
+         return []
+     probs = F.softmax(clean_logits[valid_positions], dim=-1)
+     gold = labels[valid_positions].unsqueeze(-1)
+     gold_probs = probs.gather(-1, gold).squeeze(-1)
+     _, ranked = torch.sort(gold_probs, descending=False)
+     selected = valid_positions[ranked][:max_positions]
+     return [int(pos.item()) for pos in selected]
+
+
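+ # Illustrative: if the gold-token probabilities at the maskable positions are
+ # [0.9, 0.1, 0.4, 0.7] and max_positions=2, the positions with probs 0.1 and
+ # 0.4 are chosen, so KD effort goes where the student is least confident.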
+ def build_candidate_ids(
+     logits_at_position: torch.Tensor,
+     gold_token_id: int,
+     top_k: int,
+ ) -> List[int]:
+     candidate_ids = logits_at_position.topk(max(top_k - 1, 1)).indices.tolist()
+     if gold_token_id not in candidate_ids:
+         candidate_ids.append(gold_token_id)
+     deduped = []
+     seen = set()
+     for token_id in candidate_ids:
+         if token_id in seen:
+             continue
+         deduped.append(token_id)
+         seen.add(token_id)
+     return deduped[:top_k]
+
+
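+ # Illustrative with top_k=4: the student's top-3 ids might be [42, 7, 99];
+ # if the gold id is 7 the list stays [42, 7, 99], and if it were 13 the list
+ # becomes [42, 7, 99, 13]. Dedup preserves order, then the list is cut to top_k.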
+ def decode_assistant_text(
+     tokenizer: AutoTokenizer,
+     full_ids: torch.Tensor,
+     attention_mask: torch.Tensor,
+     loss_mask: torch.Tensor,
+ ) -> str:
+     active = (attention_mask > 0) & (loss_mask > 0)
+     assistant_ids = full_ids[active].tolist()
+     return tokenizer.decode(assistant_ids, skip_special_tokens=True).strip()
+
+
+ def prepare_teacher_bank(
+     args: argparse.Namespace,
+     model: ViLDLM,
+     dataset: NormalizedVisionLanguageDataset,
+ ) -> None:
+     if args.dry_run_batches:
+         max_items = min(args.teacher_batch_size * args.dry_run_batches, len(dataset))
+     elif args.max_samples:
+         max_items = min(args.max_samples, len(dataset))
+     else:
+         max_items = len(dataset)
+
+     try:
+         from transformers import BitsAndBytesConfig
+     except ImportError as exc:
+         raise RuntimeError("bitsandbytes/transformers quantization support is required for Stage 3a.") from exc
+
+     print(f"Loading teacher: {args.teacher_model_id}")
+     quantization_config = BitsAndBytesConfig(
+         load_in_4bit=True,
+         bnb_4bit_compute_dtype=torch.bfloat16,
+         bnb_4bit_quant_type="nf4",
      )
+     teacher = AutoModelForImageTextToText.from_pretrained(
+         args.teacher_model_id,
+         quantization_config=quantization_config,
+         device_map="auto",
+         attn_implementation="sdpa",
+     )
+     teacher.eval()
+     processor = AutoProcessor.from_pretrained(args.teacher_model_id, padding_side="left")
+
+     cache_dir = Path(args.teacher_cache_dir)
+     cache_dir.mkdir(parents=True, exist_ok=True)
+     output_path = cache_dir / "candidate_bank.jsonl"
+     seen_keys = set()
+     if output_path.exists():
+         with open(output_path, "r", encoding="utf-8") as handle:
+             for line in handle:
+                 if not line.strip():
+                     continue
+                 record = json.loads(line)
+                 seen_keys.add((record["sample_id"], int(record["position"])))
+
+     dataloader = build_dataloader(
+         dataset=dataset,
+         batch_size=1,
+         shuffle=False,
+         num_workers=0,
+         persistent_workers=False,
+     )
+
+     prepared = 0
+     processed = 0
+     with torch.no_grad(), open(output_path, "a", encoding="utf-8") as writer:
+         for batch in dataloader:
+             if processed >= max_items:
+                 break  # enforce the dry-run / max_samples cap computed above
+             processed += 1
+             sample_id = batch["sample_id"][0]
+             prompt_text = batch["prompt_text"][0]
+             target_text = batch["target_text"][0]
+             pil_image = batch["pil_image"][0]
+             pixel_values = batch["pixel_values"].to(next(model.parameters()).device)
+             input_ids = batch["input_ids"].to(pixel_values.device)
+             attention_mask = batch["attention_mask"].to(pixel_values.device)
+             labels = batch["labels"].to(pixel_values.device)
+             loss_mask = batch["loss_mask"].to(pixel_values.device)
+
+             clean_logits = model.predict_clean_logits(pixel_values, input_ids, attention_mask)[0]
+             sample_labels = labels[0]
+             sample_loss_mask = loss_mask[0]
+             positions = choose_distillation_positions(
+                 clean_logits=clean_logits,
+                 labels=sample_labels,
+                 loss_mask=sample_loss_mask,
+                 max_positions=args.kd_positions_per_sample,
+             )
+
+             for position in positions:
+                 cache_key = (sample_id, position)
+                 if cache_key in seen_keys:
+                     continue
+                 gold_token_id = int(sample_labels[position].item())
+                 candidate_token_ids = build_candidate_ids(
+                     logits_at_position=clean_logits[position],
+                     gold_token_id=gold_token_id,
+                     top_k=args.kd_top_k,
+                 )
+                 candidate_texts: List[str] = []
+                 for candidate_id in candidate_token_ids:
+                     modified_ids = input_ids[0].clone()
+                     modified_ids[position] = candidate_id
+                     candidate_texts.append(
+                         decode_assistant_text(
+                             tokenizer=model.tokenizer,
+                             full_ids=modified_ids,
+                             attention_mask=attention_mask[0],
+                             loss_mask=loss_mask[0],
+                         )
+                     )
+                 teacher_logprobs = compute_teacher_logprobs(
+                     teacher=teacher,
+                     processor=processor,
+                     pil_image=pil_image,
+                     prompt_text=prompt_text,
+                     candidate_texts=candidate_texts,
+                     teacher_batch_size=args.teacher_batch_size,
+                 )
+                 teacher_probs = F.softmax(teacher_logprobs / 2.0, dim=-1).cpu().tolist()  # τ=2.0, as in Stage 3b
+                 record = {
+                     "sample_id": sample_id,
+                     "position": position,
+                     "candidate_token_ids": candidate_token_ids,
+                     "teacher_probs": teacher_probs,
+                     "gold_token_id": gold_token_id,
+                     "source_config": batch["source_config"][0],
+                     "text_hash": stable_text_hash(sample_id, prompt_text, target_text),
+                 }
+                 writer.write(json.dumps(record) + "\n")
+                 seen_keys.add(cache_key)
+                 prepared += 1
+                 if args.dry_run_batches and prepared >= args.kd_positions_per_sample * args.dry_run_batches:
+                     break
+                 if prepared and prepared % 50 == 0:
+                     print(f"Prepared {prepared} KD entries...")
+
+     print(f"Teacher bank written to {output_path} with {prepared} new entries")
+
+
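+ # One line of candidate_bank.jsonl (token ids and probs are hypothetical):
+ #   {"sample_id": "vqav2:17:0", "position": 41,
+ #    "candidate_token_ids": [318, 262, 373, 198, 11, 286, 257, 2940],
+ #    "teacher_probs": [0.41, 0.22, 0.14, 0.09, 0.06, 0.04, 0.03, 0.01],
+ #    "gold_token_id": 318, "source_config": "vqav2", "text_hash": "..."}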
+ def load_teacher_bank(cache_dir: str) -> Dict[str, List[Dict[str, object]]]:
+     bank_path = Path(cache_dir) / "candidate_bank.jsonl"
+     if not bank_path.exists():
+         raise FileNotFoundError(f"Teacher bank not found: {bank_path}")
+     bank_map: Dict[str, List[Dict[str, object]]] = defaultdict(list)
+     with open(bank_path, "r", encoding="utf-8") as handle:
+         for line in handle:
+             if not line.strip():
+                 continue
+             record = json.loads(line)
+             bank_map[record["sample_id"]].append(record)
+     print(f"Loaded teacher bank for {len(bank_map)} samples from {bank_path}")
+     return bank_map
+
+
+ def maybe_push_to_hub(
+     args: argparse.Namespace,
+     save_dir: Path,
+     params: Dict[str, int],
+     best_loss: float,
+ ) -> None:
+     if not args.push_to_hub:
+         print("Skipping Hub push (enable with --push_to_hub).")
+         return
+
+     print("\nPushing to Hub...")
+     api = HfApi()
+     repo_id = args.hub_model_id
+     try:
+         api.create_repo(repo_id, exist_ok=True, private=False)
+     except Exception as exc:
+         print(f"Repo note: {exc}")
+
+     config_dict = {
+         "architecture": "ViL-DLM",
+         "training_stage": args.stage,
+         "best_loss": best_loss,
+         "total_params_M": params["total"] / 1e6,
+         "trainable_params_M": params["trainable"] / 1e6,
+         "teacher": args.teacher_model_id,
+         "dataset_configs": parse_dataset_configs(args.dataset_configs) if args.stage in {"2", "3a", "3b"} else ["llava_pretrain"],
+     }
+     with open(save_dir / "model_config.json", "w", encoding="utf-8") as handle:
+         json.dump(config_dict, handle, indent=2)
+
+     api.upload_folder(
+         folder_path=str(save_dir),
+         repo_id=repo_id,
+         commit_message=f"Stage {args.stage} training (loss={best_loss:.4f})",
+     )
+     print(f"\n✅ Model pushed to https://huggingface.co/{repo_id}")
+
+
+ def run_training_stage(args: argparse.Namespace) -> None:
+     tracker = _TrackioShim()
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     print_device_info(device)
+     ensure_runtime_requirements(args)
+     lm_path = download_student_backbone()
+
      vil_config = ViLConfig()
      proj_config = ProjConfig()
      model = ViLDLM(vil_config, proj_config, lm_path)
+     setup_model_for_stage(model, args.stage)
+     maybe_resume_model(model, args)
+
      params = model.count_params()
      print(f"Parameters: Total={params['total']/1e6:.1f}M, Trainable={params['trainable']/1e6:.1f}M")
      print(f" ViL: {params['vil']/1e6:.1f}M, Proj: {params['proj']/1e6:.1f}M, LM: {params['lm']/1e6:.1f}M")
+
      model = model.to(device)
+     if hasattr(model.lm, "gradient_checkpointing_enable"):
          model.lm.gradient_checkpointing_enable()
+
+     dataset, skip_stats = create_stage_dataset("1" if args.stage == "1" else "2", model.tokenizer, args)
+     if skip_stats:
+         print(f"Skip stats: {json.dumps(skip_stats)}")
+
+     if args.stage == "3a":
+         prepare_teacher_bank(args=args, model=model, dataset=dataset)
+         return
+
+     dataloader = build_dataloader(
+         dataset=dataset,
+         batch_size=args.batch_size,
+         shuffle=True,  # stage 3a already returned above; every training stage shuffles
+         num_workers=args.num_workers,
+         persistent_workers=args.persistent_workers,
      )
+
+     optimizer = get_optimizer(model, stage="1" if args.stage == "1" else "2")
+     total_steps = max(1, (len(dataloader) * args.epochs) // max(args.grad_accum, 1))
+     scheduler = CosineAnnealingLR(optimizer, T_max=total_steps, eta_min=1e-6)
      tracker.init(name=f"vil-dlm-stage{args.stage}")
+     teacher_bank = load_teacher_bank(args.teacher_cache_dir) if args.stage == "3b" else {}
+
+     best_loss = float("inf")
      global_step = 0
+     step_timer = time.time()
+
      for epoch in range(args.epochs):
          model.train()
+         epoch_loss = 0.0
+         epoch_kd_loss = 0.0
+         epoch_kd_entries = 0
          num_batches = 0
+
+         optimizer.zero_grad(set_to_none=True)
          for batch_idx, batch in enumerate(dataloader):
+             pixel_values = batch["pixel_values"].to(device)
+             input_ids = batch["input_ids"].to(device)
+             attention_mask = batch["attention_mask"].to(device)
+             labels = batch["labels"].to(device)
+             loss_mask = batch["loss_mask"].to(device)
+
+             outputs = model(
+                 pixel_values=pixel_values,
+                 input_ids=input_ids,
+                 attention_mask=attention_mask,
+                 labels=labels,
+                 loss_mask=loss_mask,
+             )
+             diffusion_loss = outputs["loss"]
+             kd_loss = torch.tensor(0.0, device=device)
+             kd_entries = 0
+             total_loss = diffusion_loss
+             if args.stage == "3b":
+                 kd_loss, kd_entries = compute_sparse_kd_loss(
+                     student_logits=outputs["logits"],
+                     noise_mask=outputs["noise_mask"],
+                     sample_ids=batch["sample_id"],
+                     bank_map=teacher_bank,
+                     temperature=2.0,
+                 )
+                 alpha_kd = 0.5  # equal weighting of diffusion and KD terms
+                 total_loss = (1.0 - alpha_kd) * diffusion_loss + alpha_kd * kd_loss
+
+             loss = total_loss / args.grad_accum
              loss.backward()
+
              if (batch_idx + 1) % args.grad_accum == 0:
                  torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                  optimizer.step()
                  scheduler.step()
+                 optimizer.zero_grad(set_to_none=True)
                  global_step += 1
+
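+             # The throughput logged below is per optimizer step; one step
+             # aggregates batch_size * grad_accum samples (4 * 8 = 32 with
+             # the parser defaults).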
+             actual_loss = float(total_loss.item())
+             actual_diffusion = float(diffusion_loss.item())
+             actual_kd = float(kd_loss.item()) if args.stage == "3b" else 0.0
+             elapsed = max(time.time() - step_timer, 1e-6)
+             samples_per_sec = (args.batch_size * args.grad_accum) / elapsed
+             step_timer = time.time()
+             gpu_mem_gb = 0.0
+             if torch.cuda.is_available():
+                 gpu_mem_gb = torch.cuda.max_memory_allocated(device) / 1e9
+
+             print(
+                 f"[E{epoch}] Step {global_step}/{total_steps} | "
+                 f"Loss: {actual_loss:.4f} | Diff: {actual_diffusion:.4f} | "
+                 f"KD: {actual_kd:.4f} | KD entries: {kd_entries} | "
+                 f"Samples/s: {samples_per_sec:.2f} | GPU mem: {gpu_mem_gb:.2f} GB"
+             )
+             tracker.log(
+                 {
+                     "train/loss": actual_loss,
+                     "train/diffusion_loss": actual_diffusion,
+                     "train/kd_loss": actual_kd,
+                     "train/kd_entries": kd_entries,
+                     "train/epoch": epoch,
+                     "train/step": global_step,
+                     "train/samples_per_sec": samples_per_sec,
+                     "train/gpu_mem_gb": gpu_mem_gb,
+                 }
+             )
+
+             epoch_loss += actual_loss  # reuse the floats computed above instead of calling .item() again
+             epoch_kd_loss += actual_kd
+             epoch_kd_entries += kd_entries
+             num_batches += 1
+
+             if args.dry_run_batches and num_batches >= args.dry_run_batches:
+                 break
+
+         remainder = num_batches % args.grad_accum
+         if remainder != 0:
+             torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+             optimizer.step()
+             scheduler.step()
+             optimizer.zero_grad(set_to_none=True)
+             global_step += 1
+
+         avg_loss = epoch_loss / max(num_batches, 1)
+         avg_kd_loss = epoch_kd_loss / max(num_batches, 1)
+         print(f"\n[Epoch {epoch}] Average Loss: {avg_loss:.4f} | Average KD: {avg_kd_loss:.4f} | KD entries: {epoch_kd_entries}\n")
+         tracker.log(
+             {
+                 "train/epoch_loss": avg_loss,
+                 "train/epoch_kd_loss": avg_kd_loss,
+                 "train/epoch_kd_entries": epoch_kd_entries,
+                 "train/epoch": epoch,
+             }
+         )
+
+         if avg_loss < best_loss:
+             best_loss = avg_loss
+             save_dir = Path(args.output_dir) / f"stage{args.stage}_best"
+             include_lm = args.stage in {"2", "3b"}
+             model.save_checkpoint(save_dir, include_lm=include_lm)
+             training_state = {
+                 "stage": args.stage,
+                 "best_loss": best_loss,
+                 "args": vars(args),
+             }
+             with open(save_dir / "training_state.json", "w", encoding="utf-8") as handle:
+                 json.dump(training_state, handle, indent=2)
+             print(f"Saved best checkpoint (loss={best_loss:.4f})")
+
+     maybe_push_to_hub(
+         args=args,
+         save_dir=Path(args.output_dir) / f"stage{args.stage}_best",
+         params=params,
+         best_loss=best_loss,
+     )
      print("Training complete!")


+ def build_parser() -> argparse.ArgumentParser:
      parser = argparse.ArgumentParser()
+     parser.add_argument("--stage", type=str, default="1", choices=["1", "2", "3a", "3b"])
      parser.add_argument("--epochs", type=int, default=2)
      parser.add_argument("--batch_size", type=int, default=4)
  parser.add_argument("--grad_accum", type=int, default=8)
 
1203
  parser.add_argument("--max_samples", type=int, default=None)
1204
  parser.add_argument("--output_dir", type=str, default="./vil-dlm-output")
1205
  parser.add_argument("--hub_model_id", type=str, default="omar-ah/ViL-DLM-0.6B")
1206
+ parser.add_argument("--push_to_hub", action="store_true")
1207
+ parser.add_argument("--require_cuda", action="store_true")
1208
+ parser.add_argument("--resume_from", type=str, default=None)
1209
+ parser.add_argument("--dataset_configs", type=str, default=",".join(DEFAULT_CAULDRON_CONFIGS))
1210
+ parser.add_argument("--num_workers", type=int, default=4)
1211
+ parser.add_argument("--persistent_workers", action="store_true")
1212
+ parser.add_argument("--dry_run_batches", type=int, default=0)
1213
+ parser.add_argument("--teacher_model_id", type=str, default="google/gemma-4-E2B-it")
1214
+ parser.add_argument("--teacher_cache_dir", type=str, default="./vil-dlm-output/teacher-cache")
1215
+ parser.add_argument("--prepare_teacher_bank", action="store_true")
1216
+ parser.add_argument("--teacher_batch_size", type=int, default=1)
1217
+ parser.add_argument("--kd_top_k", type=int, default=8)
1218
+ parser.add_argument("--kd_positions_per_sample", type=int, default=16)
1219
+ return parser
1220
+
1221
+
1222
+ if __name__ == "__main__":
1223
+ ensure_hf_cache_root()
1224
+ parser = build_parser()
1225
  args = parser.parse_args()
1226
+ if args.prepare_teacher_bank and args.stage != "3a":
1227
+ raise ValueError("--prepare_teacher_bank is only valid with --stage 3a")
1228
+ run_training_stage(args)
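+ # Illustrative Stage 3 invocations wired to the flags defined above:
+ #   python train_production.py --stage 3a --prepare_teacher_bank \
+ #     --resume_from ./vil-dlm-output/stage2_best --teacher_cache_dir ./vil-dlm-output/teacher-cache
+ #   python train_production.py --stage 3b \
+ #     --resume_from ./vil-dlm-output/stage2_best --teacher_cache_dir ./vil-dlm-output/teacher-cache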
code/vil_dlm_model.py CHANGED
@@ -393,11 +393,9 @@ class ViLDLMWithDistillation(ViLDLM):
      """
      ViL-DLM with knowledge distillation from Gemma 4 E2B teacher.

-     Distillation losses:
-     1. Response-level KD: KL(teacher_logits || student_logits) on text output
-     2. Vision feature KD: MSE(teacher_vision_features, projected_vil_features)
-
-     Uses LFM2-style Decoupled Top-K distillation for efficiency.
      """

      def __init__(self, config: TrainingConfig):
@@ -442,60 +440,54 @@ class ViLDLMWithDistillation(ViLDLM):

          print(f"Teacher loaded: {sum(p.numel() for p in self.teacher.parameters()) / 1e9:.1f}B params")

-     def compute_kd_loss(
          self,
-         student_logits: torch.Tensor,  # [B, T, student_vocab]
-         teacher_logits: torch.Tensor,  # [B, T, teacher_vocab]
-         mask: torch.Tensor,  # [B, T] where to compute loss
      ) -> torch.Tensor:
-         """
-         Decoupled Top-K KL divergence (LFM2 recipe).
-         Only align on top-K teacher logits for efficiency.
-         """
-         T = self.kd_config.temperature
-         K = self.kd_config.top_k_logits
-
-         # Get top-K teacher predictions
-         teacher_topk_vals, teacher_topk_idx = teacher_logits.topk(K, dim=-1)
-         teacher_topk_probs = F.softmax(teacher_topk_vals / T, dim=-1)
-
-         # Gather student logits at teacher's top-K positions
-         # Need to handle vocab size mismatch between student and teacher
-         # Student vocab: 151936 (Qwen3), Teacher vocab: 262144 (Gemma4)
-         # Only use indices that are valid in student vocab
-         valid_mask = teacher_topk_idx < student_logits.shape[-1]
-         teacher_topk_idx_clamped = teacher_topk_idx.clamp(0, student_logits.shape[-1] - 1)
-
-         student_topk_logits = torch.gather(student_logits, -1, teacher_topk_idx_clamped)
-         student_topk_probs = F.softmax(student_topk_logits / T, dim=-1)
-
-         # KL divergence on top-K
-         kl = F.kl_div(
-             student_topk_probs.log(),
-             teacher_topk_probs,
-             reduction='none'
-         )
-
-         # Apply valid mask and position mask
-         kl = kl * valid_mask.float()
-         kl = kl.sum(-1)  # sum over top-K
-
-         if mask.sum() > 0:
-             loss = (kl * mask.float()).sum() / mask.sum()
-         else:
-             loss = kl.mean()
-
-         return loss * (T ** 2)  # scale by T² as is standard for KD

      def forward_with_distillation(
          self,
          pixel_values: torch.Tensor,
          input_ids: torch.Tensor,
          attention_mask: torch.Tensor,
-         teacher_pixel_values: Optional[torch.Tensor] = None,  # may need different preprocessing
          labels: Optional[torch.Tensor] = None,
      ) -> Dict[str, torch.Tensor]:
-         """Forward with both diffusion loss and distillation loss"""

          # Student forward (diffusion loss)
          student_outputs = self.forward(
@@ -506,29 +498,11 @@ class ViLDLMWithDistillation(ViLDLM):
          )

          diffusion_loss = student_outputs['loss']
-
-         # Teacher forward (no grad)
-         if self.teacher is not None:
-             with torch.no_grad():
-                 # Prepare teacher inputs
-                 teacher_inputs = {
-                     'input_ids': input_ids,
-                     'attention_mask': attention_mask,
-                 }
-                 if teacher_pixel_values is not None:
-                     teacher_inputs['pixel_values'] = teacher_pixel_values
-
-                 teacher_outputs = self.teacher(**teacher_inputs)
-                 teacher_logits = teacher_outputs.logits
-
-                 # Compute KD loss
-                 kd_loss = self.compute_kd_loss(
-                     student_logits=student_outputs['logits'],
-                     teacher_logits=teacher_logits,
-                     mask=student_outputs['noise_mask'],
-                 )
-         else:
-             kd_loss = torch.tensor(0.0, device=pixel_values.device)

          # Combined loss
          alpha = self.kd_config.alpha_kd
      """
      ViL-DLM with knowledge distillation from Gemma 4 E2B teacher.

+     Real Stage 3 uses sparse cross-tokenizer KD targets that are
+     prepared offline with the teacher and cached in the student's
+     token space.
      """

      def __init__(self, config: TrainingConfig):

          print(f"Teacher loaded: {sum(p.numel() for p in self.teacher.parameters()) / 1e9:.1f}B params")

+     def compute_sparse_kd_loss(
          self,
+         student_logits: torch.Tensor,
+         noise_mask: torch.Tensor,
+         kd_targets: Optional[list[dict[str, Any]]],
      ) -> torch.Tensor:
+         """Compute sparse KL in the student's token space."""
+         if not kd_targets:
+             return torch.tensor(0.0, device=student_logits.device)
+
+         temperature = self.kd_config.temperature
+         losses = []
+         for entry in kd_targets:
+             batch_idx = int(entry["batch_idx"])
+             position = int(entry["position"])
+             if position >= student_logits.shape[1]:
+                 continue
+             if not bool(noise_mask[batch_idx, position].item()):
+                 continue
+             candidate_token_ids = torch.tensor(
+                 entry["candidate_token_ids"],
+                 device=student_logits.device,
+                 dtype=torch.long,
+             )
+             teacher_probs = torch.tensor(
+                 entry["teacher_probs"],
+                 device=student_logits.device,
+                 dtype=student_logits.dtype,
+             )
+             gathered = student_logits[batch_idx, position, candidate_token_ids]
+             student_log_probs = F.log_softmax(gathered / temperature, dim=-1)
+             losses.append(
+                 # unsqueeze so "batchmean" divides by 1, not by the candidate
+                 # count, matching the helper in train_production.py
+                 F.kl_div(student_log_probs.unsqueeze(0), teacher_probs.unsqueeze(0), reduction="batchmean") * (temperature ** 2)
+             )
+
+         if not losses:
+             return torch.tensor(0.0, device=student_logits.device)
+         return torch.stack(losses).mean()

      def forward_with_distillation(
          self,
          pixel_values: torch.Tensor,
          input_ids: torch.Tensor,
          attention_mask: torch.Tensor,
          labels: Optional[torch.Tensor] = None,
+         kd_targets: Optional[list[dict[str, Any]]] = None,
      ) -> Dict[str, torch.Tensor]:
+         """Forward with diffusion loss plus sparse cached KD targets."""

          # Student forward (diffusion loss)
          student_outputs = self.forward(
          )

          diffusion_loss = student_outputs['loss']
+         kd_loss = self.compute_sparse_kd_loss(
+             student_logits=student_outputs["logits"],
+             noise_mask=student_outputs["noise_mask"],
+             kd_targets=kd_targets,
+         )

          # Combined loss
          alpha = self.kd_config.alpha_kd
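+     # Shape of one kd_targets entry consumed by compute_sparse_kd_loss above
+     # (values hypothetical):
+     #   {"batch_idx": 0, "position": 41,
+     #    "candidate_token_ids": [318, 262, 373],
+     #    "teacher_probs": [0.6, 0.3, 0.1]}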
pyproject.toml CHANGED
@@ -18,6 +18,7 @@ dev = [
      "datasets",
      "accelerate",
      "trackio",
+     "bitsandbytes>=0.45.0; platform_system == 'Linux'",
  ]

  [tool.uv]