omar-ah
/

ViL-DLM-0.6B

+"""
+ViL-DLM: Vision xLSTM + Diffusion Language Model
+Architecture Configuration
+"""
+from dataclasses import dataclass, field
+from typing import Optional, List
+@dataclass
+class ViLEncoderConfig:
+    """Vision xLSTM (ViL) encoder configuration"""
+    img_size: int = 224
+    patch_size: int = 16
+    in_channels: int = 3
+    dim: int = 384          # ViL-S default (23M params)
+    depth: int = 24         # Standard ViL depth
+    mlstm_dim_mult: int = 2  # mLSTM internal dim = 2 * dim
+    conv_kernel_size: int = 3  # QK Conv2D kernel
+    bidirectional: bool = True  # alternating scan directions
+    dropout: float = 0.0
+    @property
+    def num_patches(self):
+        return (self.img_size // self.patch_size) ** 2  # 196 for 224/16
+    @property
+    def num_params_approx(self):
+        # Rough estimate: each mLSTM block has ~4 * dim * (2*dim) params for QKV + gates
+        per_block = 4 * self.dim * (self.mlstm_dim_mult * self.dim) + self.dim * self.dim * 4
+        return self.depth * per_block
+@dataclass
+class ProjectorConfig:
+    """MLP projector: maps ViL features to LM embedding space"""
+    vil_dim: int = 384         # ViL-S output dim
+    lm_dim: int = 1024         # Qwen3-0.6B hidden_size
+    hidden_mult: int = 2       # projector hidden = lm_dim * hidden_mult
+    num_layers: int = 2        # 2-layer MLP (LaViDa/LLaDA-V standard)
+    activation: str = "gelu"
+    dropout: float = 0.0
+@dataclass
+class DiffusionConfig:
+    """Masked diffusion (MDLM) training configuration"""
+    noise_schedule: str = "cosine"  # cosine schedule (MDLM default)
+    mask_token_id: int = 151643     # Qwen3 pad/mask token
+    num_diffusion_steps: int = 1000  # training steps
+    inference_steps: int = 128       # sampling steps
+    remasking: str = "low_confidence"  # remasking strategy
+@dataclass
+class DistillationConfig:
+    """Knowledge distillation from Gemma 4 E2B teacher"""
+    teacher_model_id: str = "google/gemma-4-E2B-it"
+    teacher_quantize: bool = True  # 4-bit quantization for memory
+    temperature: float = 2.0       # KD temperature
+    alpha_kd: float = 0.5          # weight for KD loss vs diffusion loss
+    alpha_vision_kd: float = 0.3   # weight for vision feature distillation
+    top_k_logits: int = 32         # LFM2-style top-K distillation
+@dataclass
+class TrainingConfig:
+    """Full training configuration"""
+    # Model
+    vil_encoder: ViLEncoderConfig = field(default_factory=ViLEncoderConfig)
+    projector: ProjectorConfig = field(default_factory=ProjectorConfig)
+    diffusion: DiffusionConfig = field(default_factory=DiffusionConfig)
+    distillation: DistillationConfig = field(default_factory=DistillationConfig)
+    # Backbone
+    diffusion_lm_id: str = "dllm-hub/Qwen3-0.6B-diffusion-mdlm-v0.1"
+    # Training hyperparams (from dLLM + LLaDA-V + LFM2 recipes)
+    learning_rate: float = 1e-4
+    vil_learning_rate: float = 2e-6      # lower LR for vision encoder (LLaDA-V)
+    projector_learning_rate: float = 1e-3  # higher LR for projector (LLaDA-V Stage 1)
+    weight_decay: float = 0.05
+    warmup_ratio: float = 0.1
+    lr_scheduler: str = "cosine"
+    max_seq_len: int = 1024
+    per_device_train_batch_size: int = 4
+    gradient_accumulation_steps: int = 8  # effective batch = 32
+    num_epochs: int = 3
+    bf16: bool = True
+    gradient_checkpointing: bool = True
+    # Data
+    pretrain_dataset: str = "liuhaotian/LLaVA-Pretrain"  # Stage 1: 558K
+    finetune_dataset: str = "HuggingFaceM4/the_cauldron"  # Stage 2: rich multimodal
+    # Output
+    output_dir: str = "./vil-dlm-output"
+    hub_model_id: str = "omar-ah/ViL-DLM-0.6B"
+    push_to_hub: bool = True
+    # Stages
+    stage: int = 1  # 1 = projector only, 2 = full finetune, 3 = + distillation
+def get_config(stage: int = 1) -> TrainingConfig:
+    config = TrainingConfig()
+    config.stage = stage
+    if stage == 1:
+        # Stage 1: Train projector only (ViL frozen, LM frozen)
+        config.learning_rate = 1e-3
+        config.num_epochs = 1
+        config.per_device_train_batch_size = 8
+        config.gradient_accumulation_steps = 4
+    elif stage == 2:
+        # Stage 2: Full model finetune (ViL + projector + LM)
+        config.learning_rate = 1e-5
+        config.vil_learning_rate = 2e-6
+        config.projector_learning_rate = 1e-5
+        config.num_epochs = 3
+    elif stage == 3:
+        # Stage 3: + Distillation from Gemma 4
+        config.learning_rate = 1e-5
+        config.num_epochs = 2
+        config.distillation.alpha_kd = 0.5
+    return config