omar-ah committed
Commit 61d4766 · verified · 1 Parent(s): e7b234a

Upload train_production.py

Files changed (1):
  1. code/train_production.py +655 -0
code/train_production.py ADDED
@@ -0,0 +1,655 @@
"""
ViL-DLM Production Training Script
Runs on HF Jobs with GPU

Stage 1: Train projector only (ViL frozen, LM frozen) on LLaVA-Pretrain
Stage 2: Full finetune on multimodal instruction data
"""

import os
import sys
import math
import json
import time
import argparse
from pathlib import Path
from typing import Dict, Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR

import numpy as np
from PIL import Image
from io import BytesIO
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM
from huggingface_hub import HfApi, snapshot_download

import trackio

# ============================================================
# 1. Model Config
# ============================================================

from dataclasses import dataclass, field

@dataclass
class ViLConfig:
    img_size: int = 224
    patch_size: int = 16
    in_channels: int = 3
    dim: int = 384
    depth: int = 24
    conv_kernel_size: int = 3
    bidirectional: bool = True
    dropout: float = 0.0

    @property
    def num_patches(self):
        return (self.img_size // self.patch_size) ** 2


@dataclass
class ProjConfig:
    vil_dim: int = 384
    lm_dim: int = 1024
    hidden_mult: int = 2
    num_layers: int = 2
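
# Note on the defaults above (illustrative): 224 / 16 = 14 patches per side, so 14 * 14 = 196
# visual tokens, each projected from vil_dim=384 up to lm_dim=1024 to match the diffusion LM's
# embedding width.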

# ============================================================
# 2. Vision xLSTM Implementation
# ============================================================

class PatchEmbedding(nn.Module):
    def __init__(self, img_size=224, patch_size=16, in_channels=3, embed_dim=384):
        super().__init__()
        self.num_patches = (img_size // patch_size) ** 2
        self.proj = nn.Conv2d(in_channels, embed_dim, kernel_size=patch_size, stride=patch_size)
        self.pos_embed = nn.Parameter(torch.zeros(1, self.num_patches, embed_dim))
        nn.init.trunc_normal_(self.pos_embed, std=0.02)

    def forward(self, x):
        x = self.proj(x).flatten(2).transpose(1, 2)
        return x + self.pos_embed


class MLSTMCell(nn.Module):
    """mLSTM with matrix memory and exponential gating"""
    def __init__(self, input_dim, head_dim, num_heads=4):
        super().__init__()
        self.head_dim = head_dim
        self.num_heads = num_heads
        self.total_dim = head_dim * num_heads
        self.scale = 1.0 / math.sqrt(head_dim)

        self.W_q = nn.Linear(input_dim, self.total_dim, bias=True)
        self.W_k = nn.Linear(input_dim, self.total_dim, bias=True)
        self.W_v = nn.Linear(input_dim, self.total_dim, bias=True)
        self.w_f = nn.Linear(input_dim, num_heads, bias=True)
        self.w_i = nn.Linear(input_dim, num_heads, bias=True)
        self.w_o = nn.Linear(input_dim, self.total_dim, bias=True)

    def forward(self, x):
        B, T, D = x.shape

        q = self.W_q(x).view(B, T, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        k = (self.W_k(x) * self.scale).view(B, T, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        v = self.W_v(x).view(B, T, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        o = torch.sigmoid(self.w_o(x))

        log_f = F.logsigmoid(self.w_f(x)).permute(0, 2, 1)  # [B, H, T]
        log_i = self.w_i(x).permute(0, 2, 1)  # [B, H, T]

        decay = torch.exp(log_f)  # [B, H, T]
        gate = torch.exp(log_i)  # [B, H, T]

        h_state = torch.zeros(B, self.num_heads, self.head_dim, self.head_dim,
                              device=x.device, dtype=x.dtype)
        n_state = torch.zeros(B, self.num_heads, self.head_dim,
                              device=x.device, dtype=x.dtype)

        outputs = []
        for t in range(T):
            f_t = decay[:, :, t].unsqueeze(-1)
            i_t = gate[:, :, t].unsqueeze(-1)
            k_t = k[:, :, t, :]
            v_t = v[:, :, t, :]
            q_t = q[:, :, t, :]

            h_state = f_t.unsqueeze(-1) * h_state + i_t.unsqueeze(-1) * torch.einsum('bhd,bhe->bhde', v_t, k_t)
            n_state = f_t * n_state + i_t * k_t

            Cq = torch.einsum('bhde,bhe->bhd', h_state, q_t)
            nq = torch.einsum('bhd,bhd->bh', n_state, q_t).unsqueeze(-1).abs().clamp(min=1.0)
            outputs.append(Cq / nq)

        out = torch.stack(outputs, dim=2)  # [B, H, T, D]
        out = out.permute(0, 2, 1, 3).reshape(B, T, self.total_dim)
        return out * o
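
# Shape sanity check for MLSTMCell (illustrative only; mirrors how MLSTMBlock below
# instantiates the cell with dim=384, head_dim=dim//4, num_heads=4):
#   cell = MLSTMCell(input_dim=384, head_dim=96, num_heads=4)
#   y = cell(torch.randn(2, 196, 384))  # -> [2, 196, 384]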


class MLSTMBlock(nn.Module):
    def __init__(self, dim, conv_kernel=3, dropout=0.0):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.pre_proj = nn.Linear(dim, dim * 3)
        self.conv = nn.Conv2d(dim, dim, kernel_size=conv_kernel, padding=conv_kernel // 2, groups=dim)
        self.mlstm = MLSTMCell(dim, dim // 4, num_heads=4)
        self.out_proj = nn.Linear(dim, dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, h=None, w=None):
        B, T, D = x.shape
        residual = x
        x = self.norm(x)
        gate_b, gate_c, h_tilde = self.pre_proj(x).chunk(3, dim=-1)

        if h is not None and w is not None:
            h_2d = h_tilde.transpose(1, 2).view(B, D, h, w)
            h_2d = self.conv(h_2d)
            h_tilde = h_2d.view(B, D, T).transpose(1, 2)

        y = torch.sigmoid(gate_b) * h_tilde
        y = self.mlstm(y)
        y = torch.sigmoid(gate_c) * y
        return residual + self.dropout(self.out_proj(y))


class FFNBlock(nn.Module):
    def __init__(self, dim, mult=4, dropout=0.0):
        super().__init__()
        hidden = int(dim * mult * 2 / 3)
        self.norm = nn.LayerNorm(dim)
        self.w1 = nn.Linear(dim, hidden)
        self.w2 = nn.Linear(dim, hidden)
        self.w3 = nn.Linear(hidden, dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        r = x
        x = self.norm(x)
        return r + self.dropout(self.w3(F.silu(self.w1(x)) * self.w2(x)))


class VisionXLSTM(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.patch_embed = PatchEmbedding(config.img_size, config.patch_size, config.in_channels, config.dim)
        self.h = config.img_size // config.patch_size
        self.w = config.img_size // config.patch_size

        self.blocks = nn.ModuleList()
        self.ffns = nn.ModuleList()
        for _ in range(config.depth):
            self.blocks.append(MLSTMBlock(config.dim, config.conv_kernel_size, config.dropout))
            self.ffns.append(FFNBlock(config.dim, dropout=config.dropout))
        self.final_norm = nn.LayerNorm(config.dim)

    def forward_features(self, pixel_values):
        x = self.patch_embed(pixel_values)
        for i, (block, ffn) in enumerate(zip(self.blocks, self.ffns)):
            if self.config.bidirectional and i % 2 == 1:
                x = x.flip(1)
                x = block(x, h=self.h, w=self.w)
                x = ffn(x)
                x = x.flip(1)
            else:
                x = block(x, h=self.h, w=self.w)
                x = ffn(x)
        return self.final_norm(x)


class VisionProjector(nn.Module):
    def __init__(self, config):
        super().__init__()
        hidden_dim = config.lm_dim * config.hidden_mult
        layers = [nn.Linear(config.vil_dim, hidden_dim), nn.GELU()]
        for _ in range(config.num_layers - 1):
            layers.extend([nn.Linear(hidden_dim, hidden_dim), nn.GELU()])
        layers.append(nn.Linear(hidden_dim, config.lm_dim))
        self.mlp = nn.Sequential(*layers)

    def forward(self, x):
        return self.mlp(x)
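
# End-to-end vision path with the default configs (illustrative):
#   pixel_values [B, 3, 224, 224] -> PatchEmbedding -> [B, 196, 384]
#   -> 24 x (MLSTMBlock + FFNBlock) -> VisionProjector -> [B, 196, 1024]
# These 196 projected tokens are later prepended to the text embeddings in ViLDLM.forward.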


# ============================================================
# 3. MDLM Scheduler & ViL-DLM Model
# ============================================================

class MDLMScheduler:
    def __init__(self, mask_token_id=151643):
        self.mask_token_id = mask_token_id

    def add_noise(self, input_ids, t):
        B, T = input_ids.shape
        mask_ratio = 1.0 - torch.cos(t * math.pi / 2)
        mask_ratio = mask_ratio.unsqueeze(1).expand(B, T)
        mask = torch.rand(B, T, device=input_ids.device) < mask_ratio
        noisy_ids = input_ids.clone()
        noisy_ids[mask] = self.mask_token_id
        return noisy_ids, mask

    def sample_timesteps(self, batch_size, device):
        return torch.rand(batch_size, device=device)
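
# Masking schedule intuition (illustrative): with mask_ratio = 1 - cos(t * pi / 2),
# t = 0 masks ~0% of tokens and t = 1 masks ~100%; e.g. t = 0.5 gives
# 1 - cos(pi/4) ≈ 0.29, so roughly 29% of text tokens are replaced by mask_token_id.
# The MDLM loss below is computed only on these masked positions.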


class ViLDLM(nn.Module):
    def __init__(self, vil_config, proj_config, lm_path):
        super().__init__()
        self.vil_config = vil_config
        self.vision_encoder = VisionXLSTM(vil_config)
        self.projector = VisionProjector(proj_config)
        self.scheduler = MDLMScheduler()
        self.num_patches = vil_config.num_patches

        # Load diffusion LM
        print(f"Loading diffusion LM from {lm_path}...")
        self.lm = AutoModelForMaskedLM.from_pretrained(
            lm_path, trust_remote_code=True, dtype=torch.bfloat16
        )
        self.tokenizer = AutoTokenizer.from_pretrained(lm_path, trust_remote_code=True)
        lm_params = sum(p.numel() for p in self.lm.parameters())
        print(f"Loaded LM: {lm_params/1e6:.1f}M params")

    def forward(self, pixel_values, input_ids, attention_mask, labels=None):
        B, T = input_ids.shape
        device = input_ids.device
        if labels is None:
            labels = input_ids.clone()

        # Diffusion: mask tokens
        t = self.scheduler.sample_timesteps(B, device)
        noisy_ids, noise_mask = self.scheduler.add_noise(input_ids, t)

        # Encode image
        vision_features = self.vision_encoder.forward_features(pixel_values)
        visual_tokens = self.projector(vision_features)

        # Get text embeddings
        text_embeds = self.lm.model.embed_tokens(noisy_ids)
        visual_tokens = visual_tokens.to(dtype=text_embeds.dtype)

        # Concat [vision | text]
        inputs_embeds = torch.cat([visual_tokens, text_embeds], dim=1)
        vis_mask = torch.ones(B, self.num_patches, device=device, dtype=attention_mask.dtype)
        full_mask = torch.cat([vis_mask, attention_mask], dim=1)

        # Forward through LM
        outputs = self.lm(inputs_embeds=inputs_embeds, attention_mask=full_mask)
        text_logits = outputs.logits[:, self.num_patches:, :]

        # MDLM loss on masked positions only
        loss_mask = noise_mask.float()
        if loss_mask.sum() == 0:
            loss = torch.tensor(0.0, device=device, requires_grad=True)
        else:
            logits_flat = text_logits.reshape(-1, text_logits.shape[-1])
            labels_flat = labels.reshape(-1)
            loss_flat = F.cross_entropy(logits_flat, labels_flat, reduction='none').reshape(B, T)
            loss = (loss_flat * loss_mask).sum() / loss_mask.sum()

        return {'loss': loss, 'logits': text_logits, 'noise_mask': noise_mask, 't': t}

    def freeze_vision(self):
        for p in self.vision_encoder.parameters():
            p.requires_grad = False

    def freeze_lm(self):
        for p in self.lm.parameters():
            p.requires_grad = False

    def unfreeze_all(self):
        for p in self.parameters():
            p.requires_grad = True

    def count_params(self):
        vil = sum(p.numel() for p in self.vision_encoder.parameters())
        proj = sum(p.numel() for p in self.projector.parameters())
        lm = sum(p.numel() for p in self.lm.parameters())
        train = sum(p.numel() for p in self.parameters() if p.requires_grad)
        return {'vil': vil, 'proj': proj, 'lm': lm, 'total': vil+proj+lm, 'trainable': train}
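
# Sequence layout reminder (illustrative): inputs_embeds = [196 visual tokens | T text tokens],
# so the LM sees num_patches + max_length positions per sample, while the loss only covers
# the masked text positions selected by MDLMScheduler.add_noise.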


# ============================================================
# 4. Dataset
# ============================================================

class LLaVAPretrainDataset(Dataset):
    def __init__(self, tokenizer, max_length=512, img_size=224, max_samples=None):
        print("Loading LLaVA-Pretrain dataset...")
        self.data = load_dataset("liuhaotian/LLaVA-Pretrain", split="train")
        if max_samples:
            self.data = self.data.select(range(min(max_samples, len(self.data))))
        print(f"Loaded {len(self.data)} samples")
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.img_size = img_size
        self.mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
        self.std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]

        # Image
        try:
            img = sample['image']
            if isinstance(img, str):
                img = Image.open(img).convert('RGB')
            elif isinstance(img, dict) and 'bytes' in img:
                img = Image.open(BytesIO(img['bytes'])).convert('RGB')
            elif not isinstance(img, Image.Image):
                img = Image.new('RGB', (self.img_size, self.img_size), (128, 128, 128))
            else:
                img = img.convert('RGB')
            img = img.resize((self.img_size, self.img_size), Image.BICUBIC)
            arr = np.array(img).astype(np.float32) / 255.0
            pv = torch.from_numpy(arr).permute(2, 0, 1)
            pv = (pv - self.mean) / self.std
        except Exception:
            pv = torch.zeros(3, self.img_size, self.img_size)

        # Text from conversations
        text = ""
        if 'conversations' in sample:
            parts = []
            for turn in sample['conversations']:
                val = turn.get('value', '').replace('<image>\n', '').replace('<image>', '').strip()
                if val:
                    parts.append(val)
            text = ' '.join(parts)
        if not text:
            text = "Describe this image."

        tokens = self.tokenizer(text, max_length=self.max_length, padding='max_length',
                                truncation=True, return_tensors='pt')

        return {
            'pixel_values': pv,
            'input_ids': tokens['input_ids'].squeeze(0),
            'attention_mask': tokens['attention_mask'].squeeze(0),
            'labels': tokens['input_ids'].squeeze(0).clone(),
        }
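
# Each item is a dict of fixed-size tensors (illustrative summary): pixel_values [3, 224, 224]
# normalized with ImageNet mean/std, plus input_ids / attention_mask / labels of length
# max_length (512 by default), so the default DataLoader collation can stack them directly.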


# ============================================================
# 5. Training Loop
# ============================================================

def train(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")
    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name()}")
        print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

    # Download dLLM model
    print("Downloading dLLM Qwen3-0.6B diffusion model...")
    lm_path = snapshot_download('dllm-hub/Qwen3-0.6B-diffusion-mdlm-v0.1')

    # Fix the modeling file (remove dllm import in __main__)
    modeling_file = os.path.join(lm_path, "modeling_qwen3.py")
    with open(modeling_file, 'r') as f:
        content = f.read()
    # Replace the __main__ block that imports dllm
    content = content.replace(
        'if __name__ == "__main__":\n import dllm',
        'if __name__ == "__main__":\n pass\n # import dllm'
    )
    # Fix attention_type compatibility
    content = content.replace(
        'attention_mask=causal_mask_mapping[decoder_layer.attention_type]',
        'attention_mask=causal_mask_mapping.get(getattr(decoder_layer, "attention_type", "full_attention"), causal_mask_mapping.get("full_attention"))'
    )
    with open(modeling_file, 'w') as f:
        f.write(content)
    print(f"Model downloaded to {lm_path}")

    # Build model
    vil_config = ViLConfig()
    proj_config = ProjConfig()
    model = ViLDLM(vil_config, proj_config, lm_path)

    # Stage setup
    if args.stage == 1:
        print("\n=== STAGE 1: Projector-only training ===")
        model.freeze_vision()
        model.freeze_lm()
    elif args.stage == 2:
        print("\n=== STAGE 2: Full finetune ===")
        model.unfreeze_all()

    params = model.count_params()
    print(f"Parameters: Total={params['total']/1e6:.1f}M, Trainable={params['trainable']/1e6:.1f}M")
    print(f"  ViL: {params['vil']/1e6:.1f}M, Proj: {params['proj']/1e6:.1f}M, LM: {params['lm']/1e6:.1f}M")

    model = model.to(device)

    # Enable gradient checkpointing for LM
    if hasattr(model.lm, 'gradient_checkpointing_enable'):
        model.lm.gradient_checkpointing_enable()

    # Dataset
    dataset = LLaVAPretrainDataset(
        tokenizer=model.tokenizer,
        max_length=args.max_length,
        img_size=224,
        max_samples=args.max_samples,
    )

    dataloader = DataLoader(
        dataset, batch_size=args.batch_size, shuffle=True,
        num_workers=4, pin_memory=True, drop_last=True,
    )

    # Optimizer with per-component LR
    param_groups = []
    if args.stage == 1:
        param_groups = [{'params': [p for p in model.projector.parameters() if p.requires_grad],
                         'lr': 1e-3}]
    else:
        param_groups = [
            {'params': [p for p in model.vision_encoder.parameters() if p.requires_grad], 'lr': 2e-6},
            {'params': [p for p in model.projector.parameters() if p.requires_grad], 'lr': 1e-5},
            {'params': [p for p in model.lm.parameters() if p.requires_grad], 'lr': 1e-5},
        ]
    param_groups = [g for g in param_groups if len(g['params']) > 0]

    optimizer = AdamW(param_groups, weight_decay=0.05, betas=(0.9, 0.999))
    total_steps = len(dataloader) * args.epochs // args.grad_accum
    scheduler = CosineAnnealingLR(optimizer, T_max=max(total_steps, 1), eta_min=1e-6)
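
    # Step accounting note (illustrative, with the argparse defaults): the effective batch size is
    # batch_size * grad_accum = 4 * 8 = 32, and the cosine schedule spans
    # total_steps = len(dataloader) * epochs // grad_accum optimizer steps.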

    # Trackio
    trackio.init(name=f"vil-dlm-stage{args.stage}")

    # Training loop
    global_step = 0
    best_loss = float('inf')

    for epoch in range(args.epochs):
        model.train()
        epoch_loss = 0
        num_batches = 0

        for batch_idx, batch in enumerate(dataloader):
            pv = batch['pixel_values'].to(device)
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(pixel_values=pv, input_ids=ids, attention_mask=mask, labels=labels)
            loss = outputs['loss'] / args.grad_accum
            loss.backward()

            if (batch_idx + 1) % args.grad_accum == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                global_step += 1

                actual_loss = loss.item() * args.grad_accum
                mask_ratio = outputs['noise_mask'].float().mean().item()
                lr = optimizer.param_groups[0]['lr']

                if global_step % 5 == 0:
                    print(f"[E{epoch}] Step {global_step}/{total_steps} | "
                          f"Loss: {actual_loss:.4f} | LR: {lr:.2e} | Mask: {mask_ratio:.1%}")

                trackio.log({
                    'train/loss': actual_loss,
                    'train/lr': lr,
                    'train/mask_ratio': mask_ratio,
                    'train/epoch': epoch,
                    'train/step': global_step,
                })

            epoch_loss += loss.item() * args.grad_accum
            num_batches += 1

        avg_loss = epoch_loss / max(num_batches, 1)
        print(f"\n[Epoch {epoch}] Average Loss: {avg_loss:.4f}\n")
        trackio.log({'train/epoch_loss': avg_loss, 'train/epoch': epoch})

        # Save checkpoint
        if avg_loss < best_loss:
            best_loss = avg_loss
            save_dir = os.path.join(args.output_dir, f"stage{args.stage}_best")
            os.makedirs(save_dir, exist_ok=True)
            torch.save(model.vision_encoder.state_dict(), os.path.join(save_dir, "vision_encoder.pt"))
            torch.save(model.projector.state_dict(), os.path.join(save_dir, "projector.pt"))
            if args.stage >= 2:
                model.lm.save_pretrained(os.path.join(save_dir, "diffusion_lm"))
            print(f"Saved best checkpoint (loss={best_loss:.4f})")

    # Push to Hub
    print("\nPushing to Hub...")
    api = HfApi()
    repo_id = args.hub_model_id

    try:
        api.create_repo(repo_id, exist_ok=True, private=False)
    except Exception as e:
        print(f"Repo note: {e}")

    save_dir = os.path.join(args.output_dir, f"stage{args.stage}_best")

    # Save config + README
    config_dict = {
        'architecture': 'ViL-DLM',
        'components': {
            'vision_encoder': 'Vision-xLSTM-S (ViL-S)',
            'projector': '2-layer MLP',
            'diffusion_lm': 'dLLM Qwen3-0.6B MDLM',
        },
        'vil_dim': 384,
        'lm_dim': 1024,
        'num_patches': 196,
        'training_stage': args.stage,
        'best_loss': best_loss,
        'total_params_M': params['total'] / 1e6,
        'trainable_params_M': params['trainable'] / 1e6,
        'based_on': [
            'Vision-LSTM (arxiv:2406.04303)',
            'dLLM (arxiv:2602.22661)',
            'LLaDA-V (arxiv:2505.16933)',
            'LFM2 (arxiv:2511.23404)',
        ],
        'teacher': 'google/gemma-4-E2B-it (planned for stage 3)',
    }
    with open(os.path.join(save_dir, "model_config.json"), 'w') as f:
        json.dump(config_dict, f, indent=2)

    readme = f"""---
license: apache-2.0
tags:
- vision-language
- diffusion
- xlstm
- vision-lstm
- masked-diffusion
- mdlm
language: en
pipeline_tag: image-text-to-text
---

# ViL-DLM: Vision xLSTM Diffusion Language Model

**The first vision-language model combining Vision xLSTM with a diffusion language backbone.**

## Architecture

| Component | Model | Params |
|-----------|-------|--------|
| Vision Encoder | **Vision-xLSTM-S (ViL-S)** | ~57M |
| Projector | 2-layer MLP (GELU) | ~7M |
| Language Backbone | **dLLM Qwen3-0.6B (MDLM)** | ~596M |
| **Total** | | **~660M** |

### Why This Combination?

1. **ViL (Vision xLSTM)** — O(N) linear complexity vision encoder vs ViT's O(N²). Uses alternating bidirectional mLSTM blocks with exponential gating and Conv2D for spatial context. Based on [arxiv:2406.04303](https://arxiv.org/abs/2406.04303).

2. **Diffusion Language Model** — Non-autoregressive text generation via masked denoising. Bidirectional attention enables richer contextual understanding. Based on [dLLM/MDLM](https://arxiv.org/abs/2602.22661).

3. **Knowledge Distillation** (Stage 3) — Planned distillation from [Gemma 4 E2B](https://huggingface.co/google/gemma-4-E2B-it) using LFM2-style Decoupled Top-K distillation.

## Training Recipe

Inspired by LLaDA-V, LaViDa, LFM2, and Mistral/Pixtral:

| Stage | What's Trained | Dataset | LR |
|-------|---------------|---------|-----|
| 1 | Projector only | LLaVA-Pretrain (558K) | 1e-3 |
| 2 | Full model | The Cauldron (multimodal) | ViL:2e-6, Proj:1e-5, LM:1e-5 |
| 3 | + KD from Gemma 4 E2B | Mixed | + Top-K KD (α=0.5, T=2, K=32) |

**Current stage: {args.stage} | Best loss: {best_loss:.4f}**

## Novelty

This is (to our knowledge) the **first published model** combining:
- Vision xLSTM as a vision encoder in a VLM
- A discrete masked diffusion language model backbone
- Multi-stage training with knowledge distillation from an AR multimodal teacher

## References

- [Vision-LSTM](https://arxiv.org/abs/2406.04303) — Alkin et al., 2024
- [dLLM](https://arxiv.org/abs/2602.22661) — Berkeley, 2025
- [MDLM](https://arxiv.org/abs/2406.07524) — Kuleshov group, NeurIPS 2024
- [LLaDA-V](https://arxiv.org/abs/2505.16933) — GSAI-ML, 2025
- [LFM2](https://arxiv.org/abs/2511.23404) — Liquid AI, 2025
- [Gemma 4](https://huggingface.co/google/gemma-4-E2B-it) — Google, 2026
"""

    with open(os.path.join(save_dir, "README.md"), 'w') as f:
        f.write(readme)

    api.upload_folder(folder_path=save_dir, repo_id=repo_id,
                      commit_message=f"Stage {args.stage} training (loss={best_loss:.4f})")
    print(f"\n✅ Model pushed to https://huggingface.co/{repo_id}")
    print("Training complete!")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--stage", type=int, default=1)
    parser.add_argument("--epochs", type=int, default=2)
    parser.add_argument("--batch_size", type=int, default=4)
    parser.add_argument("--grad_accum", type=int, default=8)
    parser.add_argument("--max_length", type=int, default=512)
    parser.add_argument("--max_samples", type=int, default=None)
    parser.add_argument("--output_dir", type=str, default="./vil-dlm-output")
    parser.add_argument("--hub_model_id", type=str, default="omar-ah/ViL-DLM-0.6B")
    args = parser.parse_args()

    train(args)