Spaces:
Running
Running
Use INT8 encoder + INT4 decoder (91.9% accuracy); force-English prompt default
Browse files
- index.html +8 -0
- mega-asr.js +21 -7
index.html
CHANGED
|
@@ -73,6 +73,14 @@
|
|
| 73 |
<label for="audio-file">Audio (any format)</label>
|
| 74 |
<input type="file" id="audio-file" accept="audio/*" />
|
| 75 |
<audio id="audio-player" controls></audio>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
<label for="ref-text" style="margin-top:14px">Reference transcript (optional)</label>
|
| 77 |
<textarea id="ref-text" placeholder="Paste the ground-truth text for scoring."></textarea>
|
| 78 |
<div style="margin-top: 12px;" class="row">
|
|
|
|
| 73 |
<label for="audio-file">Audio (any format)</label>
|
| 74 |
<input type="file" id="audio-file" accept="audio/*" />
|
| 75 |
<audio id="audio-player" controls></audio>
|
| 76 |
+
<label for="lang-select" style="margin-top:14px">Force language (auto-detect can fail at INT4)</label>
|
| 77 |
+
<select id="lang-select" style="padding:8px 10px;border-radius:8px;border:1px solid var(--border);background:var(--panel);color:var(--fg);font-family:inherit;width:100%">
|
| 78 |
+
<option value="english" selected>English</option>
|
| 79 |
+
<option value="chinese">Chinese</option>
|
| 80 |
+
<option value="japanese">Japanese</option>
|
| 81 |
+
<option value="korean">Korean</option>
|
| 82 |
+
<option value="auto">Auto-detect</option>
|
| 83 |
+
</select>
|
| 84 |
<label for="ref-text" style="margin-top:14px">Reference transcript (optional)</label>
|
| 85 |
<textarea id="ref-text" placeholder="Paste the ground-truth text for scoring."></textarea>
|
| 86 |
<div style="margin-top: 12px;" class="row">
|
mega-asr.js
CHANGED
|
@@ -116,6 +116,16 @@ async function fetchWithCache(url, label, onProgress) {
|
|
| 116 |
}
|
| 117 |
|
| 118 |
// ---- ONNX session creation -------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
async function createSession(graphUrl, dataUrl, label, onProgress) {
|
| 120 |
// Fetch the .onnx graph and the .onnx.data weights blob, then construct
|
| 121 |
// an InferenceSession from the two arrays.
|
|
@@ -206,11 +216,12 @@ async function loadAll() {
|
|
| 206 |
setProgress(30);
|
| 207 |
|
| 208 |
// 3. ONNX sessions
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
|
|
|
| 214 |
p => setProgress(30 + p * 10),
|
| 215 |
);
|
| 216 |
setProgress(40);
|
|
@@ -289,9 +300,12 @@ async function transcribe({ mel, dims, T_mel }) {
|
|
| 289 |
const lastChunkMel = T_mel - (realChunks - 1) * 100;
|
| 290 |
const realAudioFrames = (realChunks - 1) * 13 + Math.floor((lastChunkMel + 7) / 8);
|
| 291 |
|
| 292 |
-
// 2. build prompt + scatter audio embeds at <|audio_pad|>
|
|
|
|
|
|
|
| 293 |
setStatus("building prompt ...");
|
| 294 |
-
const
|
|
|
|
| 295 |
const audioPadId = state.manifest.audio_pad_id;
|
| 296 |
// Expand audio_pad in the prompt to realAudioFrames placeholder tokens
|
| 297 |
const tokens = [];
|
|
|
|
| 116 |
}
|
| 117 |
|
| 118 |
// ---- ONNX session creation -------------------------------------------------
|
| 119 |
+
/**
 * Create an ONNX Runtime inference session from a single-file ONNX model
 * (weights embedded in the graph file, so there is no `.data` sidecar to fetch).
 *
 * @param {string} graphUrl - URL of the `.onnx` file; passed to `fetchWithCache`.
 * @param {string} label - Human-readable name, forwarded to `fetchWithCache`
 *   and included in the "session ready" log line.
 * @param {Function} onProgress - Progress callback, forwarded unchanged to
 *   `fetchWithCache`.
 * @returns {Promise<object>} The session returned by `ort.InferenceSession.create`.
 */
async function createSessionSimple(graphUrl, label, onProgress) {
  // NOTE(review): presumably the raw model bytes — confirm against fetchWithCache.
  const graph = await fetchWithCache(graphUrl, label, onProgress);

  // Only an explicit "webgpu" device selects the WebGPU provider; every other
  // value of state.device runs on the wasm provider.
  const executionProviders = state.device === "webgpu" ? ["webgpu"] : ["wasm"];

  const sess = await ort.InferenceSession.create(graph, { executionProviders });
  log(`session ready: ${label} (${state.device})`);
  return sess;
}
|
| 128 |
+
|
| 129 |
async function createSession(graphUrl, dataUrl, label, onProgress) {
|
| 130 |
// Fetch the .onnx graph and the .onnx.data weights blob, then construct
|
| 131 |
// an InferenceSession from the two arrays.
|
|
|
|
| 216 |
setProgress(30);
|
| 217 |
|
| 218 |
// 3. ONNX sessions
|
| 219 |
+
// INT8 audio encoder + INT4 decoders gives the best size/quality tradeoff
|
| 220 |
+
// (91.9% vs 87.8% INT4-only on VITW). Encoder is single-file (no .data sidecar).
|
| 221 |
+
setLoaderStatus("audio encoder INT8 (~320 MB)...");
|
| 222 |
+
state.encoder = await createSessionSimple(
|
| 223 |
+
`${HF_ROOT}/onnx/audio_encoder_int8.onnx`,
|
| 224 |
+
"audio_encoder INT8",
|
| 225 |
p => setProgress(30 + p * 10),
|
| 226 |
);
|
| 227 |
setProgress(40);
|
|
|
|
| 300 |
const lastChunkMel = T_mel - (realChunks - 1) * 100;
|
| 301 |
const realAudioFrames = (realChunks - 1) * 13 + Math.floor((lastChunkMel + 7) / 8);
|
| 302 |
|
| 303 |
+
// 2. build prompt + scatter audio embeds at <|audio_pad|>.
|
| 304 |
+
// Default to the forced-English prompt; the model's auto language detection
|
| 305 |
+
// can fail at INT4 quantization on borderline audio.
|
| 306 |
setStatus("building prompt ...");
|
| 307 |
+
const lang = (document.getElementById("lang-select")?.value) || "english";
|
| 308 |
+
const promptIds = (state.manifest.prompts && state.manifest.prompts[lang]?.ids) || state.manifest.prompt_ids;
|
| 309 |
const audioPadId = state.manifest.audio_pad_id;
|
| 310 |
// Expand audio_pad in the prompt to realAudioFrames placeholder tokens
|
| 311 |
const tokens = [];
|