Spaces:

Reza2kn
/

mega-asr-bench

Running

App Files Files Community

Reza2kn committed about 16 hours ago

Commit

61dfe9b

verified ·

1 Parent(s): a4c397e

Use INT8 encoder + INT4 decoder (91.9% accuracy); force-English prompt default

Browse files

Files changed (2) hide show

index.html +8 -0
mega-asr.js +21 -7

index.html CHANGED Viewed

@@ -73,6 +73,14 @@
       <label for="audio-file">Audio (any format)</label>
       <input type="file" id="audio-file" accept="audio/*" />
       <audio id="audio-player" controls></audio>
       <label for="ref-text" style="margin-top:14px">Reference transcript (optional)</label>
       <textarea id="ref-text" placeholder="Paste the ground-truth text for scoring."></textarea>
       <div style="margin-top: 12px;" class="row">

       <label for="audio-file">Audio (any format)</label>
       <input type="file" id="audio-file" accept="audio/*" />
       <audio id="audio-player" controls></audio>
+      <label for="lang-select" style="margin-top:14px">Force language (auto-detect can fail at INT4)</label>
+      <select id="lang-select" style="padding:8px 10px;border-radius:8px;border:1px solid var(--border);background:var(--panel);color:var(--fg);font-family:inherit;width:100%">
+        <option value="english" selected>English</option>
+        <option value="chinese">Chinese</option>
+        <option value="japanese">Japanese</option>
+        <option value="korean">Korean</option>
+        <option value="auto">Auto-detect</option>
+      </select>
       <label for="ref-text" style="margin-top:14px">Reference transcript (optional)</label>
       <textarea id="ref-text" placeholder="Paste the ground-truth text for scoring."></textarea>
       <div style="margin-top: 12px;" class="row">

mega-asr.js CHANGED Viewed

@@ -116,6 +116,16 @@ async function fetchWithCache(url, label, onProgress) {
 }
 // ---- ONNX session creation -------------------------------------------------
 async function createSession(graphUrl, dataUrl, label, onProgress) {
   // Fetch the .onnx graph and the .onnx.data weights blob, then construct
   // an InferenceSession from the two arrays.
@@ -206,11 +216,12 @@ async function loadAll() {
   setProgress(30);
   // 3. ONNX sessions
-  setLoaderStatus("audio encoder (~215 MB)...");
-  state.encoder = await createSession(
-    `${HF_ROOT}/onnx/audio_encoder_int4.onnx`,
-    `${HF_ROOT}/onnx/audio_encoder_int4.onnx.data`,
-    "audio_encoder",
     p => setProgress(30 + p * 10),
   );
   setProgress(40);
@@ -289,9 +300,12 @@ async function transcribe({ mel, dims, T_mel }) {
   const lastChunkMel = T_mel - (realChunks - 1) * 100;
   const realAudioFrames = (realChunks - 1) * 13 + Math.floor((lastChunkMel + 7) / 8);
-  // 2. build prompt + scatter audio embeds at <|audio_pad|>
   setStatus("building prompt ...");
-  const promptIds = state.manifest.prompt_ids;
   const audioPadId = state.manifest.audio_pad_id;
   // Expand audio_pad in the prompt to realAudioFrames placeholder tokens
   const tokens = [];

 }
 // ---- ONNX session creation -------------------------------------------------
+async function createSessionSimple(graphUrl, label, onProgress) {
+  // Single-file ONNX (weights embedded in the graph file): only graphUrl is fetched (through fetchWithCache, forwarding label and onProgress) and the result is handed to ort.InferenceSession.create; the execution provider is "webgpu" when state.device === "webgpu", otherwise "wasm". Logs a "session ready" line and returns the created session.
+  const graph = await fetchWithCache(graphUrl, label, onProgress);
+  const sess = await ort.InferenceSession.create(graph, {
+    executionProviders: state.device === "webgpu" ? ["webgpu"] : ["wasm"],
+  });
+  log(`session ready: ${label} (${state.device})`);
+  return sess;
+}
 async function createSession(graphUrl, dataUrl, label, onProgress) {
   // Fetch the .onnx graph and the .onnx.data weights blob, then construct
   // an InferenceSession from the two arrays.
   setProgress(30);
   // 3. ONNX sessions
+  // INT8 audio encoder + INT4 decoders gives the best size/quality tradeoff
+  // (91.9% vs 87.8% INT4-only on VITW). Encoder is single-file (no .data sidecar).
+  setLoaderStatus("audio encoder INT8 (~320 MB)...");
+  state.encoder = await createSessionSimple(
+    `${HF_ROOT}/onnx/audio_encoder_int8.onnx`,
+    "audio_encoder INT8",
     p => setProgress(30 + p * 10),
   );
   setProgress(40);
   const lastChunkMel = T_mel - (realChunks - 1) * 100;
   const realAudioFrames = (realChunks - 1) * 13 + Math.floor((lastChunkMel + 7) / 8);
+  // 2. build prompt + scatter audio embeds at <|audio_pad|>.
+  // Default to the forced-English prompt; the model's auto language detection
+  // can fail at INT4 quantization on borderline audio.
   setStatus("building prompt ...");
+  const lang = (document.getElementById("lang-select")?.value) || "english";
+  const promptIds = (state.manifest.prompts && state.manifest.prompts[lang]?.ids) || state.manifest.prompt_ids;
   const audioPadId = state.manifest.audio_pad_id;
   // Expand audio_pad in the prompt to realAudioFrames placeholder tokens
   const tokens = [];