vox-upscaler-web

Running

App Files Files Community

espinasgabriel551 committed 13 days ago

Commit

7247f76

verified ·

1 Parent(s): 97cd206

Upload index.html

Browse files

Files changed (1) hide show

index.html +18 -29

index.html CHANGED Viewed

@@ -628,8 +628,11 @@ processBtn.addEventListener('click', async () => {
   }
   const totalPadded = padded.length;
-  // Init state
-  let state = new Float32Array(meta.total_state_size);
   const outputs = [];
   const numChunks = Math.ceil(totalPadded / chunkSamples);
   let chunkIdx = 0;
@@ -637,48 +640,34 @@ processBtn.addEventListener('click', async () => {
   progressLabel.textContent = 'Processing…';
   const t0 = performance.now();
-  // sr_bin_idx is a model-internal index, NOT the literal sample rate.
-  // VoxCPM2 uses index 0 for 48 kHz output. Check meta.sr_bins if available.
-  const srBins = meta.sr_bins || [48000]; // e.g. [48000] or [16000,24000,48000]
-  let srBinIdx = srBins.indexOf(TARGET_SR);
-  if (srBinIdx < 0) srBinIdx = 0; // fallback to first bin
-  const srIdx = new Int32Array([srBinIdx]);
-  console.log('[VoxUpscaler] sr_bin_idx =', srBinIdx, '(bins:', srBins, ')');
-  // State size: prefer meta.total_state_size, fall back to session input shape
-  const stateSize = meta.total_state_size ||
-    (session.inputNames.includes('state_in')
-      ? session.inputs.find(i => i.name === 'state_in')?.dims?.reduce((a,b)=>a*b,1) || 0
-      : 0);
-  if (!stateSize) {
-    console.error('[VoxUpscaler] Could not determine state size from meta:', meta);
-  }
-  console.log('[VoxUpscaler] state size =', stateSize);
   for (let pos = 0; pos < totalPadded; pos += chunkSamples) {
     const end = Math.min(pos + chunkSamples, totalPadded);
     const chunk = padded.slice(pos, end);
-    // Shape: [1, 1, samples]
     const audioTensor = new ort.Tensor('float32', chunk, [1, 1, chunk.length]);
-    const srTensor = new ort.Tensor('int32', srIdx, [1]);
-    const stateTensor = new ort.Tensor('float32', state, [stateSize]);
     const result = await session.run({
-      audio: audioTensor,
       sr_bin_idx: srTensor,
-      state_in: stateTensor,
     });
     const audioOut = await readTensorData(result.audio_out);
     const stateOut = await readTensorData(result.state_out);
-    // Debug first chunk: log min/max to detect silence
     if (chunkIdx === 0) {
-      const arr = new Float32Array(audioOut);
-      const max = arr.reduce((m,v) => Math.max(m, Math.abs(v)), 0);
-      console.log('[VoxUpscaler] chunk 0 audio_out max amplitude:', max,
-        '— if 0.0 the model is outputting silence (check sr_bin_idx / state_size)');
     }
     outputs.push(new Float32Array(audioOut));

   }
   const totalPadded = padded.length;
+  // State: meta gives only flat size, no shape dims.
+  // [1, total_state_size] is the standard ONNX convention for a flat state.
+  const stateFlat  = meta.total_state_size; // 338502
+  const stateShape = [1, stateFlat];
+  let state = new Float32Array(stateFlat);
   const outputs = [];
   const numChunks = Math.ceil(totalPadded / chunkSamples);
   let chunkIdx = 0;
   progressLabel.textContent = 'Processing…';
   const t0 = performance.now();
+  // sr_bin_idx 0 = 48 kHz (VoxCPM2 only has one output SR)
+  const srIdx = new Int32Array([0]);
   for (let pos = 0; pos < totalPadded; pos += chunkSamples) {
     const end = Math.min(pos + chunkSamples, totalPadded);
     const chunk = padded.slice(pos, end);
     const audioTensor = new ort.Tensor('float32', chunk, [1, 1, chunk.length]);
+    const srTensor    = new ort.Tensor('int32',   srIdx, [1]);
+    const stateTensor = new ort.Tensor('float32', state, stateShape);
     const result = await session.run({
+      audio:      audioTensor,
       sr_bin_idx: srTensor,
+      state_in:   stateTensor,
     });
     const audioOut = await readTensorData(result.audio_out);
     const stateOut = await readTensorData(result.state_out);
+    // Amplitude check on first chunk — open DevTools Console to see this
     if (chunkIdx === 0) {
+      const a = new Float32Array(audioOut);
+      const maxAmp = a.reduce((m, v) => Math.max(m, Math.abs(v)), 0);
+      const outShape = result.audio_out?.dims;
+      console.log('[VoxUpscaler] chunk 0 → audio_out shape:', outShape,
+        '| max amplitude:', maxAmp.toFixed(6),
+        maxAmp < 1e-6 ? '⚠️ SILENT — model outputting zeros' : '✓ signal present');
     }
     outputs.push(new Float32Array(audioOut));