espinasgabriel551 committed on
Commit
7247f76
·
verified ·
1 Parent(s): 97cd206

Upload index.html

Browse files
Files changed (1) hide show
  1. index.html +18 -29
index.html CHANGED
@@ -628,8 +628,11 @@ processBtn.addEventListener('click', async () => {
628
  }
629
  const totalPadded = padded.length;
630
 
631
- // Init state
632
- let state = new Float32Array(meta.total_state_size);
 
 
 
633
  const outputs = [];
634
  const numChunks = Math.ceil(totalPadded / chunkSamples);
635
  let chunkIdx = 0;
@@ -637,48 +640,34 @@ processBtn.addEventListener('click', async () => {
637
  progressLabel.textContent = 'Processing…';
638
  const t0 = performance.now();
639
 
640
- // sr_bin_idx is a model-internal index, NOT the literal sample rate.
641
- // VoxCPM2 uses index 0 for 48 kHz output. Check meta.sr_bins if available.
642
- const srBins = meta.sr_bins || [48000]; // e.g. [48000] or [16000,24000,48000]
643
- let srBinIdx = srBins.indexOf(TARGET_SR);
644
- if (srBinIdx < 0) srBinIdx = 0; // fallback to first bin
645
- const srIdx = new Int32Array([srBinIdx]);
646
- console.log('[VoxUpscaler] sr_bin_idx =', srBinIdx, '(bins:', srBins, ')');
647
-
648
- // State size: prefer meta.total_state_size, fall back to session input shape
649
- const stateSize = meta.total_state_size ||
650
- (session.inputNames.includes('state_in')
651
- ? session.inputs.find(i => i.name === 'state_in')?.dims?.reduce((a,b)=>a*b,1) || 0
652
- : 0);
653
- if (!stateSize) {
654
- console.error('[VoxUpscaler] Could not determine state size from meta:', meta);
655
- }
656
- console.log('[VoxUpscaler] state size =', stateSize);
657
 
658
  for (let pos = 0; pos < totalPadded; pos += chunkSamples) {
659
  const end = Math.min(pos + chunkSamples, totalPadded);
660
  const chunk = padded.slice(pos, end);
661
 
662
- // Shape: [1, 1, samples]
663
  const audioTensor = new ort.Tensor('float32', chunk, [1, 1, chunk.length]);
664
- const srTensor = new ort.Tensor('int32', srIdx, [1]);
665
- const stateTensor = new ort.Tensor('float32', state, [stateSize]);
666
 
667
  const result = await session.run({
668
- audio: audioTensor,
669
  sr_bin_idx: srTensor,
670
- state_in: stateTensor,
671
  });
672
 
673
  const audioOut = await readTensorData(result.audio_out);
674
  const stateOut = await readTensorData(result.state_out);
675
 
676
- // Debug first chunk: log min/max to detect silence
677
  if (chunkIdx === 0) {
678
- const arr = new Float32Array(audioOut);
679
- const max = arr.reduce((m,v) => Math.max(m, Math.abs(v)), 0);
680
- console.log('[VoxUpscaler] chunk 0 audio_out max amplitude:', max,
681
- ' if 0.0 the model is outputting silence (check sr_bin_idx / state_size)');
 
 
682
  }
683
 
684
  outputs.push(new Float32Array(audioOut));
 
628
  }
629
  const totalPadded = padded.length;
630
 
631
+ // State: meta gives only flat size, no shape dims.
632
+ // [1, total_state_size] is the standard ONNX convention for a flat state.
633
+ const stateFlat = meta.total_state_size; // 338502
634
+ const stateShape = [1, stateFlat];
635
+ let state = new Float32Array(stateFlat);
636
  const outputs = [];
637
  const numChunks = Math.ceil(totalPadded / chunkSamples);
638
  let chunkIdx = 0;
 
640
  progressLabel.textContent = 'Processing…';
641
  const t0 = performance.now();
642
 
643
+ // sr_bin_idx 0 = 48 kHz (VoxCPM2 only has one output SR)
644
+ const srIdx = new Int32Array([0]);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
645
 
646
  for (let pos = 0; pos < totalPadded; pos += chunkSamples) {
647
  const end = Math.min(pos + chunkSamples, totalPadded);
648
  const chunk = padded.slice(pos, end);
649
 
 
650
  const audioTensor = new ort.Tensor('float32', chunk, [1, 1, chunk.length]);
651
+ const srTensor = new ort.Tensor('int32', srIdx, [1]);
652
+ const stateTensor = new ort.Tensor('float32', state, stateShape);
653
 
654
  const result = await session.run({
655
+ audio: audioTensor,
656
  sr_bin_idx: srTensor,
657
+ state_in: stateTensor,
658
  });
659
 
660
  const audioOut = await readTensorData(result.audio_out);
661
  const stateOut = await readTensorData(result.state_out);
662
 
663
+ // Amplitude check on first chunk — open DevTools Console to see this
664
  if (chunkIdx === 0) {
665
+ const a = new Float32Array(audioOut);
666
+ const maxAmp = a.reduce((m, v) => Math.max(m, Math.abs(v)), 0);
667
+ const outShape = result.audio_out?.dims;
668
+ console.log('[VoxUpscaler] chunk 0 audio_out shape:', outShape,
669
+ '| max amplitude:', maxAmp.toFixed(6),
670
+ maxAmp < 1e-6 ? '⚠️ SILENT — model outputting zeros' : '✓ signal present');
671
  }
672
 
673
  outputs.push(new Float32Array(audioOut));