Reza2kn committed on
Commit
61dfe9b
·
verified ·
1 Parent(s): a4c397e

Use INT8 encoder + INT4 decoder (91.9% accuracy); force-English prompt default

Browse files
Files changed (2) hide show
  1. index.html +8 -0
  2. mega-asr.js +21 -7
index.html CHANGED
@@ -73,6 +73,14 @@
73
  <label for="audio-file">Audio (any format)</label>
74
  <input type="file" id="audio-file" accept="audio/*" />
75
  <audio id="audio-player" controls></audio>
 
 
 
 
 
 
 
 
76
  <label for="ref-text" style="margin-top:14px">Reference transcript (optional)</label>
77
  <textarea id="ref-text" placeholder="Paste the ground-truth text for scoring."></textarea>
78
  <div style="margin-top: 12px;" class="row">
 
73
  <label for="audio-file">Audio (any format)</label>
74
  <input type="file" id="audio-file" accept="audio/*" />
75
  <audio id="audio-player" controls></audio>
76
+ <label for="lang-select" style="margin-top:14px">Force language (auto-detect can fail at INT4)</label>
77
+ <select id="lang-select" style="padding:8px 10px;border-radius:8px;border:1px solid var(--border);background:var(--panel);color:var(--fg);font-family:inherit;width:100%">
78
+ <option value="english" selected>English</option>
79
+ <option value="chinese">Chinese</option>
80
+ <option value="japanese">Japanese</option>
81
+ <option value="korean">Korean</option>
82
+ <option value="auto">Auto-detect</option>
83
+ </select>
84
  <label for="ref-text" style="margin-top:14px">Reference transcript (optional)</label>
85
  <textarea id="ref-text" placeholder="Paste the ground-truth text for scoring."></textarea>
86
  <div style="margin-top: 12px;" class="row">
mega-asr.js CHANGED
@@ -116,6 +116,16 @@ async function fetchWithCache(url, label, onProgress) {
116
  }
117
 
118
  // ---- ONNX session creation -------------------------------------------------
 
 
 
 
 
 
 
 
 
 
119
  async function createSession(graphUrl, dataUrl, label, onProgress) {
120
  // Fetch the .onnx graph and the .onnx.data weights blob, then construct
121
  // an InferenceSession from the two arrays.
@@ -206,11 +216,12 @@ async function loadAll() {
206
  setProgress(30);
207
 
208
  // 3. ONNX sessions
209
- setLoaderStatus("audio encoder (~215 MB)...");
210
- state.encoder = await createSession(
211
- `${HF_ROOT}/onnx/audio_encoder_int4.onnx`,
212
- `${HF_ROOT}/onnx/audio_encoder_int4.onnx.data`,
213
- "audio_encoder",
 
214
  p => setProgress(30 + p * 10),
215
  );
216
  setProgress(40);
@@ -289,9 +300,12 @@ async function transcribe({ mel, dims, T_mel }) {
289
  const lastChunkMel = T_mel - (realChunks - 1) * 100;
290
  const realAudioFrames = (realChunks - 1) * 13 + Math.floor((lastChunkMel + 7) / 8);
291
 
292
- // 2. build prompt + scatter audio embeds at <|audio_pad|>
 
 
293
  setStatus("building prompt ...");
294
- const promptIds = state.manifest.prompt_ids;
 
295
  const audioPadId = state.manifest.audio_pad_id;
296
  // Expand audio_pad in the prompt to realAudioFrames placeholder tokens
297
  const tokens = [];
 
116
  }
117
 
118
  // ---- ONNX session creation -------------------------------------------------
119
+ async function createSessionSimple(graphUrl, label, onProgress) {
120
+ // Single-file ONNX (weights embedded in the graph file).
121
+ const graph = await fetchWithCache(graphUrl, label, onProgress);
122
+ const sess = await ort.InferenceSession.create(graph, {
123
+ executionProviders: state.device === "webgpu" ? ["webgpu"] : ["wasm"],
124
+ });
125
+ log(`session ready: ${label} (${state.device})`);
126
+ return sess;
127
+ }
128
+
129
  async function createSession(graphUrl, dataUrl, label, onProgress) {
130
  // Fetch the .onnx graph and the .onnx.data weights blob, then construct
131
  // an InferenceSession from the two arrays.
 
216
  setProgress(30);
217
 
218
  // 3. ONNX sessions
219
+ // INT8 audio encoder + INT4 decoders gives the best size/quality tradeoff
220
+ // (91.9% vs 87.8% INT4-only on VITW). Encoder is single-file (no .data sidecar).
221
+ setLoaderStatus("audio encoder INT8 (~320 MB)...");
222
+ state.encoder = await createSessionSimple(
223
+ `${HF_ROOT}/onnx/audio_encoder_int8.onnx`,
224
+ "audio_encoder INT8",
225
  p => setProgress(30 + p * 10),
226
  );
227
  setProgress(40);
 
300
  const lastChunkMel = T_mel - (realChunks - 1) * 100;
301
  const realAudioFrames = (realChunks - 1) * 13 + Math.floor((lastChunkMel + 7) / 8);
302
 
303
+ // 2. build prompt + scatter audio embeds at <|audio_pad|>.
304
+ // Default to the forced-English prompt; the model's auto language detection
305
+ // can fail at INT4 quantization on borderline audio.
306
  setStatus("building prompt ...");
307
+ const lang = (document.getElementById("lang-select")?.value) || "english";
308
+ const promptIds = (state.manifest.prompts && state.manifest.prompts[lang]?.ids) || state.manifest.prompt_ids;
309
  const audioPadId = state.manifest.audio_pad_id;
310
  // Expand audio_pad in the prompt to realAudioFrames placeholder tokens
311
  const tokens = [];