Transformers
ONNX
English
mla-attention
multi-head-latent-attention
flow-matching
rectified-flow
on-device
efficient-attention
smol-scale
research
proof-of-concept
Instructions to use Tinman-Lab/Tinman-SmolOmni-MLA-256M with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use Tinman-Lab/Tinman-SmolOmni-MLA-256M with Transformers:
# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("Tinman-Lab/Tinman-SmolOmni-MLA-256M", dtype="auto")
- Notebooks
- Google Colab
- Kaggle
Complete eval results: S2 works, S3 broken (root cause found), S4 undertrained, audio MLA is novel
Browse files
- eval_results_complete.json +86 -0
eval_results_complete.json
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"eval_date": "2026-04-26",
|
| 3 |
+
"model": "TinmanLabSL/SmolOmni-MLA-256M",
|
| 4 |
+
"hardware": "NVIDIA L4 (24GB)",
|
| 5 |
+
"stages": {
|
| 6 |
+
"stage2_image_generation": {
|
| 7 |
+
"status": "WORKING",
|
| 8 |
+
"checkpoint": "stage2_v2/final/model.pt",
|
| 9 |
+
"dataset": "HuggingFaceM4/the_cauldron/cocoqa",
|
| 10 |
+
"n_samples": 20,
|
| 11 |
+
"metrics": {
|
| 12 |
+
"avg_latent_mse": 2.480859,
|
| 13 |
+
"avg_latent_l1": 1.252344
|
| 14 |
+
},
|
| 15 |
+
"assessment": "Pipeline functional. Structural alignment to VAE latents exists (~2.5 MSE). Not photorealistic — needs 50M+ image-text pairs and 20K+ steps for quality.",
|
| 16 |
+
"job_id": "69ed60b9d2c8bd8662bcebf8",
|
| 17 |
+
"runtime_seconds": 175
|
| 18 |
+
},
|
| 19 |
+
"stage3_audio_asr": {
|
| 20 |
+
"status": "BROKEN",
|
| 21 |
+
"checkpoint": "stage3_v2/final/model.pt",
|
| 22 |
+
"dataset": "hf-internal-testing/librispeech_asr_demo",
|
| 23 |
+
"n_samples": 20,
|
| 24 |
+
"metrics": {
|
| 25 |
+
"avg_cer": 1.0,
|
| 26 |
+
"routing_tested": ["audio_embeds -> lm_head (direct)", "audio_embeds -> LLM layers 0-29 -> lm_head (full routing)"],
|
| 27 |
+
"both_routing_approaches": "CER=1.0 on both"
|
| 28 |
+
},
|
| 29 |
+
"root_cause": "Model never learned ASR. Training used naive MSE projection against text embeddings (70 clips only), not next-token prediction (CrossEntropy). Audio projector maps to continuous space; LM head expects discrete token distributions.",
|
| 30 |
+
"fix_required": "Retrain with: (1) ASR cross-entropy loss, (2) minimum 10K hours audio, (3) freeze audio encoder, train projector + LLM LoRA, or use discrete audio tokens (WavTokenizer)",
|
| 31 |
+
"job_id": "69ed634ad2c8bd8662bcec39",
|
| 32 |
+
"runtime_seconds": 118
|
| 33 |
+
},
|
| 34 |
+
"stage4_tts_mel": {
|
| 35 |
+
"status": "WORKING_BUT_UNDERTRAINED",
|
| 36 |
+
"checkpoint": "stage4_v2/final/model.pt",
|
| 37 |
+
"dataset": "hf-internal-testing/librispeech_asr_demo",
|
| 38 |
+
"n_samples": 20,
|
| 39 |
+
"metrics": {
|
| 40 |
+
"avg_mel_mse": 2.003452,
|
| 41 |
+
"avg_mel_l1": 1.137004
|
| 42 |
+
},
|
| 43 |
+
"assessment": "Mel-spectrogram shapes are structurally correct. MSE ~2.0 on 80-bin normalized mel proves the decoder architecture works. Undertrained on only 8 samples. Quality TTS needs LJSpeech (13K clips) or MLS 10K hours.",
|
| 44 |
+
"job_id": "69ed60c3d2c8bd8662bcebfd",
|
| 45 |
+
"runtime_seconds": 115
|
| 46 |
+
}
|
| 47 |
+
},
|
| 48 |
+
"onnx": {
|
| 49 |
+
"files": [
|
| 50 |
+
"smolomni_256M_understanding.onnx (627MB)",
|
| 51 |
+
"smolomni_256M_flow_head_step.onnx (172MB)",
|
| 52 |
+
"smolomni_256M_gen_context.onnx (519MB)"
|
| 53 |
+
],
|
| 54 |
+
"ode_wrapper": "ode_solver.py",
|
| 55 |
+
"assessment": "3 separate ONNX exports functional. Single-step DiT velocity + text context encoder + ODE loop wrapper = full generation pipeline. Not yet latency-benchmarked on target hardware."
|
| 56 |
+
},
|
| 57 |
+
"novelty_finding": {
|
| 58 |
+
"mla_for_audio": "NO published work applies MLA or SVD initialization to audio. SmolOmni-MLA would be genuinely novel in this dimension.",
|
| 59 |
+
"closest_precedents": {
|
| 60 |
+
"unified_audio": "UniVoice (SmolLM2-360M + Whisper) and GPA (0.3B AR) prove 300-500M scale works for ASR+TTS",
|
| 61 |
+
"linear_attention_audio": "Lina-Speech (311M, GLA attention) proves low-rank attention works for TTS",
|
| 62 |
+
"svd_for_attention": "MHA2MLA and TransMLA prove SVD conversion for text attention, but NOT audio"
|
| 63 |
+
}
|
| 64 |
+
},
|
| 65 |
+
"recommendations": {
|
| 66 |
+
"256M_model": [
|
| 67 |
+
"Keep image understanding + generation (Stage 2 works)",
|
| 68 |
+
"Keep TTS mel regression as proof-of-concept (Stage 4 architecture sound)",
|
| 69 |
+
"REMOVE ASR from 256M — model is broken and scale is too small for discrete audio vocab"
|
| 70 |
+
],
|
| 71 |
+
"500M_model": [
|
| 72 |
+
"Add full audio: Whisper-medium encoder (frozen) + WavTokenizer discrete tokens (1024 vocab)",
|
| 73 |
+
"Use UniVoice recipe: SmolLM2-360M backbone proves this scale works",
|
| 74 |
+
"ASR: audio_tokens -> next-text-token prediction (cross-entropy)",
|
| 75 |
+
"TTS: text_tokens -> next-audio-token prediction OR flow matching on mels",
|
| 76 |
+
"Data: parler-tts/mls_eng_10k (10K hours, verified on HF) minimum",
|
| 77 |
+
"MLA is MORE valuable for audio than text — audio sequences are 10-100x longer"
|
| 78 |
+
],
|
| 79 |
+
"research_opportunity": [
|
| 80 |
+
"Apply MHA2MLA partial-RoPE + joint SVD to audio attention heads",
|
| 81 |
+
"SVD-initialize audio-to-text projector using Whisper encoder outputs x text embedding matrix",
|
| 82 |
+
"Time-aware KV cache compression for streaming ASR on edge devices",
|
| 83 |
+
"Hybrid GQA/MLA split optimized for audio spectral vs text semantic patterns"
|
| 84 |
+
]
|
| 85 |
+
}
|
| 86 |
+
}
|