{ "model": "Qwen3-ASR-0.6B", "component": "audio_encoder", "format": "tflite", "quantization": "int8", "sample_rate": 16000, "mel_frames_per_second": 100, "input_mel_frames": 1000, "input_mel_bins": 128, "output_tokens": 125, "output_dim": 1024, "encoder": { "num_layers": 18, "d_model": 896, "num_heads": 14, "ffn_dim": 3584 }, "inputs": { "mel": { "shape": [ 1, 128, 1000 ], "dtype": "float32" } }, "outputs": { "audio_embeddings": { "shape": [ 1, 125, 1024 ], "dtype": "float32" } }, "note": "This is the audio encoder only. The text decoder is a Qwen3-0.6B LLM; run it through LiteRT-LM (a separate runtime) and feed it the encoder output (audio_embeddings) as input embeddings at the audio placeholder positions. Qwen3 is a decoder-only model, so the audio is consumed through the input sequence rather than through cross-attention. The full model supports 30 languages and 22 Chinese dialects." }