{
  "model": "Qwen3-ASR-0.6B",
  "component": "audio_encoder",
  "format": "tflite",
  "quantization": "int8",
  "sample_rate": 16000,
  "mel_frames_per_second": 100,
  "input_mel_frames": 1000,
  "input_mel_bins": 128,
  "output_tokens": 125,
  "output_dim": 1024,
  "encoder": {
    "num_layers": 18,
    "d_model": 896,
    "num_heads": 14,
    "ffn_dim": 3584
  },
  "inputs": {
    "mel": {
      "shape": [
        1,
        128,
        1000
      ],
      "dtype": "float32"
    }
  },
  "outputs": {
    "audio_embeddings": {
      "shape": [
        1,
        125,
        1024
      ],
      "dtype": "float32"
    }
  },
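  "note": "This is the audio encoder only. The text decoder is a Qwen3-0.6B LLM that must be run separately through LiteRT-LM (a separate runtime), with this encoder's audio_embeddings output supplied to the decoder as input embeddings (Qwen3 is decoder-only, so the audio context is not a cross-attention input). The full model supports 30 languages and 22 Chinese dialects."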
}