TinmanLabSL commited on
Commit
b7e1cef
·
verified ·
1 Parent(s): 1e58e23

Complete eval results: S2 works, S3 broken (root cause found), S4 undertrained, audio MLA is novel

Browse files
Files changed (1) hide show
  1. eval_results_complete.json +86 -0
eval_results_complete.json ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_date": "2026-04-26",
3
+ "model": "TinmanLabSL/SmolOmni-MLA-256M",
4
+ "hardware": "NVIDIA L4 (24GB)",
5
+ "stages": {
6
+ "stage2_image_generation": {
7
+ "status": "WORKING",
8
+ "checkpoint": "stage2_v2/final/model.pt",
9
+ "dataset": "HuggingFaceM4/the_cauldron/cocoqa",
10
+ "n_samples": 20,
11
+ "metrics": {
12
+ "avg_latent_mse": 2.480859,
13
+ "avg_latent_l1": 1.252344
14
+ },
15
+ "assessment": "Pipeline functional. Structural alignment to VAE latents exists (~2.5 MSE). Not photorealistic — needs 50M+ image-text pairs and 20K+ steps for quality.",
16
+ "job_id": "69ed60b9d2c8bd8662bcebf8",
17
+ "runtime_seconds": 175
18
+ },
19
+ "stage3_audio_asr": {
20
+ "status": "BROKEN",
21
+ "checkpoint": "stage3_v2/final/model.pt",
22
+ "dataset": "hf-internal-testing/librispeech_asr_demo",
23
+ "n_samples": 20,
24
+ "metrics": {
25
+ "avg_cer": 1.0,
26
+ "routing_tested": ["audio_embeds -> lm_head (direct)", "audio_embeds -> LLM layers 0-29 -> lm_head (full routing)"],
27
+ "both_routing_approaches": "CER=1.0 on both"
28
+ },
29
+ "root_cause": "Model never learned ASR. Training used naive MSE projection against text embeddings (70 clips only), not next-token prediction (CrossEntropy). Audio projector maps to continuous space; LM head expects discrete token distributions.",
30
+ "fix_required": "Retrain with: (1) ASR cross-entropy loss, (2) minimum 10K hours audio, (3) freeze audio encoder, train projector + LLM LoRA, or use discrete audio tokens (WavTokenizer)",
31
+ "job_id": "69ed634ad2c8bd8662bcec39",
32
+ "runtime_seconds": 118
33
+ },
34
+ "stage4_tts_mel": {
35
+ "status": "WORKING_BUT_UNDERTRAINED",
36
+ "checkpoint": "stage4_v2/final/model.pt",
37
+ "dataset": "hf-internal-testing/librispeech_asr_demo",
38
+ "n_samples": 20,
39
+ "metrics": {
40
+ "avg_mel_mse": 2.003452,
41
+ "avg_mel_l1": 1.137004
42
+ },
43
+ "assessment": "Mel-spectrogram shapes are structurally correct. MSE ~2.0 on 80-bin normalized mel proves the decoder architecture works. Undertrained on only 8 samples. Quality TTS needs LJSpeech (13K clips) or MLS 10K hours.",
44
+ "job_id": "69ed60c3d2c8bd8662bcebfd",
45
+ "runtime_seconds": 115
46
+ }
47
+ },
48
+ "onnx": {
49
+ "files": [
50
+ "smolomni_256M_understanding.onnx (627MB)",
51
+ "smolomni_256M_flow_head_step.onnx (172MB)",
52
+ "smolomni_256M_gen_context.onnx (519MB)"
53
+ ],
54
+ "ode_wrapper": "ode_solver.py",
55
+ "assessment": "3 separate ONNX exports functional. Single-step DiT velocity + text context encoder + ODE loop wrapper = full generation pipeline. Not yet latency-benchmarked on target hardware."
56
+ },
57
+ "novelty_finding": {
58
+ "mla_for_audio": "NO published work applies MLA or SVD initialization to audio. SmolOmni-MLA would be genuinely novel in this dimension.",
59
+ "closest_precedents": {
60
+ "unified_audio": "UniVoice (SmolLM2-360M + Whisper) and GPA (0.3B AR) prove 300-500M scale works for ASR+TTS",
61
+ "linear_attention_audio": "Lina-Speech (311M, GLA attention) proves low-rank attention works for TTS",
62
+ "svd_for_attention": "MHA2MLA and TransMLA prove SVD conversion for text attention, but NOT audio"
63
+ }
64
+ },
65
+ "recommendations": {
66
+ "256M_model": [
67
+ "Keep image understanding + generation (Stage 2 works)",
68
+ "Keep TTS mel regression as proof-of-concept (Stage 4 architecture sound)",
69
+ "REMOVE ASR from 256M — model is broken and scale is too small for discrete audio vocab"
70
+ ],
71
+ "500M_model": [
72
+ "Add full audio: Whisper-medium encoder (frozen) + WavTokenizer discrete tokens (1024 vocab)",
73
+ "Use UniVoice recipe: SmolLM2-360M backbone proves this scale works",
74
+ "ASR: audio_tokens -> next-text-token prediction (cross-entropy)",
75
+ "TTS: text_tokens -> next-audio-token prediction OR flow matching on mels",
76
+ "Data: parler-tts/mls_eng_10k (10K hours, verified on HF) minimum",
77
+ "MLA is MORE valuable for audio than text — audio sequences are 10-100x longer"
78
+ ],
79
+ "research_opportunity": [
80
+ "Apply MHA2MLA partial-RoPE + joint SVD to audio attention heads",
81
+ "SVD-initialize audio-to-text projector using Whisper encoder outputs x text embedding matrix",
82
+ "Time-aware KV cache compression for streaming ASR on edge devices",
83
+ "Hybrid GQA/MLA split optimized for audio spectral vs text semantic patterns"
84
+ ]
85
+ }
86
+ }