Tinman-Lab
/

Tinman-SmolOmni-MLA-256M

+{
+  "eval_date": "2026-04-26",
+  "model": "TinmanLabSL/SmolOmni-MLA-256M",
+  "hardware": "NVIDIA L4 (24GB)",
+  "stages": {
+    "stage2_image_generation": {
+      "status": "WORKING",
+      "checkpoint": "stage2_v2/final/model.pt",
+      "dataset": "HuggingFaceM4/the_cauldron/cocoqa",
+      "n_samples": 20,
+      "metrics": {
+        "avg_latent_mse": 2.480859,
+        "avg_latent_l1": 1.252344
+      },
+      "assessment": "Pipeline functional. Structural alignment to VAE latents exists (~2.5 MSE). Not photorealistic — needs 50M+ image-text pairs and 20K+ steps for quality.",
+      "job_id": "69ed60b9d2c8bd8662bcebf8",
+      "runtime_seconds": 175
+    },
+    "stage3_audio_asr": {
+      "status": "BROKEN",
+      "checkpoint": "stage3_v2/final/model.pt",
+      "dataset": "hf-internal-testing/librispeech_asr_demo",
+      "n_samples": 20,
+      "metrics": {
+        "avg_cer": 1.0,
+        "routing_tested": ["audio_embeds -> lm_head (direct)", "audio_embeds -> LLM layers 0-29 -> lm_head (full routing)"],
+        "both_routing_approaches": "CER=1.0 on both"
+      },
+      "root_cause": "Model never learned ASR. Training used naive MSE projection against text embeddings (70 clips only), not next-token prediction (CrossEntropy). Audio projector maps to continuous space; LM head expects discrete token distributions.",
+      "fix_required": "Retrain with: (1) an ASR cross-entropy loss, (2) at least 10K hours of audio, and (3) either a frozen audio encoder with a trainable projector + LLM LoRA, or discrete audio tokens (WavTokenizer)",
+      "job_id": "69ed634ad2c8bd8662bcec39",
+      "runtime_seconds": 118
+    },
+    "stage4_tts_mel": {
+      "status": "WORKING_BUT_UNDERTRAINED",
+      "checkpoint": "stage4_v2/final/model.pt",
+      "dataset": "hf-internal-testing/librispeech_asr_demo",
+      "n_samples": 20,
+      "metrics": {
+        "avg_mel_mse": 2.003452,
+        "avg_mel_l1": 1.137004
+      },
+      "assessment": "Mel-spectrogram shapes are structurally correct. MSE ~2.0 on 80-bin normalized mel proves the decoder architecture works. Undertrained: trained on only 8 samples. Quality TTS needs LJSpeech (13K clips) or MLS (10K hours).",
+      "job_id": "69ed60c3d2c8bd8662bcebfd",
+      "runtime_seconds": 115
+    }
+  },
+  "onnx": {
+    "files": [
+      "smolomni_256M_understanding.onnx (627MB)",
+      "smolomni_256M_flow_head_step.onnx (172MB)",
+      "smolomni_256M_gen_context.onnx (519MB)"
+    ],
+    "ode_wrapper": "ode_solver.py",
+    "assessment": "The 3 separate ONNX exports are functional. Single-step DiT velocity + text context encoder + ODE loop wrapper = full generation pipeline. Not yet latency-benchmarked on target hardware."
+  },
+  "novelty_finding": {
+    "mla_for_audio": "NO published work applies MLA or SVD initialization to audio. SmolOmni-MLA would be genuinely novel in this dimension.",
+    "closest_precedents": {
+      "unified_audio": "UniVoice (SmolLM2-360M + Whisper) and GPA (0.3B AR) prove 300-500M scale works for ASR+TTS",
+      "linear_attention_audio": "Lina-Speech (311M, GLA attention) proves low-rank attention works for TTS",
+      "svd_for_attention": "MHA2MLA and TransMLA prove SVD conversion for text attention, but NOT audio"
+    }
+  },
+  "recommendations": {
+    "256M_model": [
+      "Keep image understanding + generation (Stage 2 works)",
+      "Keep TTS mel regression as proof-of-concept (Stage 4 architecture sound)",
+      "REMOVE ASR from 256M — model is broken and scale is too small for discrete audio vocab"
+    ],
+    "500M_model": [
+      "Add full audio: Whisper-medium encoder (frozen) + WavTokenizer discrete tokens (1024 vocab)",
+      "Use UniVoice recipe: SmolLM2-360M backbone proves this scale works",
+      "ASR: audio_tokens -> next-text-token prediction (cross-entropy)",
+      "TTS: text_tokens -> next-audio-token prediction OR flow matching on mels",
+      "Data: parler-tts/mls_eng_10k (10K hours, verified on HF) minimum",
+      "MLA is MORE valuable for audio than text — audio sequences are 10-100x longer"
+    ],
+    "research_opportunity": [
+      "Apply MHA2MLA partial-RoPE + joint SVD to audio attention heads",
+      "SVD-initialize audio-to-text projector using Whisper encoder outputs x text embedding matrix",
+      "Time-aware KV cache compression for streaming ASR on edge devices",
+      "Hybrid GQA/MLA split optimized for audio spectral vs text semantic patterns"
+    ]
+  }
+}