<!DOCTYPE html>
| <html lang="en"> | |
| <head> | |
| <meta charset="UTF-8"> | |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> | |
| <title>Vox Upscaler</title> | |
| <link rel="preconnect" href="https://fonts.googleapis.com"> | |
| <link href="https://fonts.googleapis.com/css2?family=DM+Mono:wght@400;500&family=Instrument+Sans:wght@400;600;700&display=swap" rel="stylesheet"> | |
| <style> | |
| *, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; } | |
| :root { | |
| --bg: #0a0a0f; | |
| --surface: #13131a; | |
| --border: #1e1e2a; | |
| --text: #e8e8ed; | |
| --text-dim: #6b6b7b; | |
| --accent: #ff6b35; | |
| --accent-glow: rgba(255, 107, 53, 0.15); | |
| --green: #34d399; | |
| --yellow: #fbbf24; | |
| --red: #f87171; | |
| --font-body: 'Instrument Sans', sans-serif; | |
| --font-mono: 'DM Mono', monospace; | |
| } | |
| body { | |
| background: var(--bg); | |
| color: var(--text); | |
| font-family: var(--font-body); | |
| min-height: 100vh; | |
| display: flex; | |
| align-items: center; | |
| justify-content: center; | |
| } | |
| .container { | |
| width: 100%; | |
| max-width: 520px; | |
| padding: 2rem; | |
| } | |
| h1 { | |
| font-size: 1.1rem; | |
| font-weight: 700; | |
| letter-spacing: 0.08em; | |
| text-transform: uppercase; | |
| margin-bottom: 2rem; | |
| display: flex; | |
| align-items: center; | |
| gap: 0.6rem; | |
| } | |
| h1 .dot { | |
| width: 8px; height: 8px; | |
| border-radius: 50%; | |
| background: var(--accent); | |
| box-shadow: 0 0 12px var(--accent); | |
| } | |
| .drop-zone { | |
| border: 2px dashed var(--border); | |
| border-radius: 12px; | |
| padding: 3rem 2rem; | |
| text-align: center; | |
| cursor: pointer; | |
| transition: border-color 0.2s, background 0.2s; | |
| position: relative; | |
| } | |
| .drop-zone:hover, .drop-zone.dragover { | |
| border-color: var(--accent); | |
| background: var(--accent-glow); | |
| } | |
| .drop-zone.has-file { | |
| border-style: solid; | |
| border-color: var(--border); | |
| padding: 1.2rem 1.5rem; | |
| text-align: left; | |
| } | |
| .drop-zone label { | |
| font-size: 0.85rem; | |
| color: var(--text-dim); | |
| display: block; | |
| cursor: pointer; | |
| } | |
| .drop-zone .filename { | |
| font-family: var(--font-mono); | |
| font-size: 0.9rem; | |
| margin-top: 0.3rem; | |
| color: var(--text); | |
| } | |
| .drop-zone input { display: none; } | |
| .status-bar { | |
| display: flex; | |
| gap: 1.2rem; | |
| margin-top: 1.2rem; | |
| font-family: var(--font-mono); | |
| font-size: 0.75rem; | |
| color: var(--text-dim); | |
| } | |
| .status-bar .chip { | |
| display: flex; | |
| align-items: center; | |
| gap: 0.4rem; | |
| background: var(--surface); | |
| border: 1px solid var(--border); | |
| border-radius: 6px; | |
| padding: 0.35rem 0.7rem; | |
| } | |
| .chip .indicator { | |
| width: 6px; height: 6px; | |
| border-radius: 50%; | |
| background: var(--text-dim); | |
| } | |
| .chip .indicator.gpu { background: var(--green); box-shadow: 0 0 6px var(--green); } | |
| .chip .indicator.cpu { background: var(--yellow); box-shadow: 0 0 6px var(--yellow); } | |
| button#process { | |
| width: 100%; | |
| margin-top: 1.5rem; | |
| padding: 0.9rem; | |
| border: none; | |
| border-radius: 10px; | |
| background: var(--accent); | |
| color: #fff; | |
| font-family: var(--font-body); | |
| font-size: 0.9rem; | |
| font-weight: 600; | |
| cursor: pointer; | |
| transition: opacity 0.2s, transform 0.1s; | |
| } | |
| button#process:hover { opacity: 0.9; } | |
| button#process:active { transform: scale(0.98); } | |
| button#process:disabled { opacity: 0.4; cursor: not-allowed; transform: none; } | |
| .progress-wrap { | |
| margin-top: 1.5rem; | |
| display: none; | |
| } | |
| .progress-wrap.active { display: block; } | |
| .progress-bar-bg { | |
| width: 100%; | |
| height: 4px; | |
| background: var(--surface); | |
| border-radius: 2px; | |
| overflow: hidden; | |
| } | |
| .progress-bar { | |
| height: 100%; | |
| width: 0%; | |
| background: var(--accent); | |
| border-radius: 2px; | |
| transition: width 0.15s; | |
| } | |
| .progress-info { | |
| display: flex; | |
| justify-content: space-between; | |
| margin-top: 0.6rem; | |
| font-family: var(--font-mono); | |
| font-size: 0.75rem; | |
| color: var(--text-dim); | |
| } | |
| .ab-section { | |
| margin-top: 1.5rem; | |
| display: flex; | |
| flex-direction: column; | |
| gap: 1rem; | |
| } | |
| .ab-player { | |
| background: var(--surface); | |
| border: 1px solid var(--border); | |
| border-radius: 10px; | |
| padding: 1rem; | |
| } | |
| .ab-label { | |
| font-family: var(--font-mono); | |
| font-size: 0.8rem; | |
| font-weight: 500; | |
| margin-bottom: 0.5rem; | |
| color: var(--text); | |
| } | |
| .ab-label .ab-sr { | |
| color: var(--text-dim); | |
| font-weight: 400; | |
| } | |
| .ab-player audio { | |
| width: 100%; | |
| border-radius: 6px; | |
| } | |
| .ab-player a { | |
| display: inline-block; | |
| margin-top: 0.6rem; | |
| font-family: var(--font-mono); | |
| font-size: 0.8rem; | |
| color: var(--accent); | |
| text-decoration: none; | |
| } | |
| .ab-player a:hover { text-decoration: underline; } | |
| .model-loading { | |
| margin-top: 1rem; | |
| font-family: var(--font-mono); | |
| font-size: 0.75rem; | |
| color: var(--text-dim); | |
| display: none; | |
| } | |
| .model-loading.active { display: block; } | |
| footer { | |
| margin-top: 3rem; | |
| padding-top: 1.2rem; | |
| border-top: 1px solid var(--border); | |
| font-family: var(--font-mono); | |
| font-size: 0.7rem; | |
| color: var(--text-dim); | |
| display: flex; | |
| flex-direction: column; | |
| gap: 0.3rem; | |
| } | |
| footer a { | |
| color: var(--text-dim); | |
| text-decoration: none; | |
| border-bottom: 1px dotted var(--text-dim); | |
| } | |
| footer a:hover { color: var(--text); border-color: var(--text); } | |
| @keyframes pulse { 0%,100% { opacity: 1; } 50% { opacity: 0.4; } } | |
| .pulsing { animation: pulse 1.5s ease-in-out infinite; } | |
| </style> | |
| </head> | |
| <body> | |
| <div class="container"> | |
| <h1><span class="dot"></span>Vox Upscaler</h1> | |
| <div class="drop-zone" id="dropZone"> | |
| <label>Drop an audio file or click to browse</label> | |
| <div class="filename" id="fileName" style="display:none"></div> | |
| <input type="file" id="fileInput" accept="audio/*"> | |
| </div> | |
| <div class="status-bar"> | |
| <div class="chip"><span class="indicator" id="backendDot"></span><span id="backendLabel">detecting…</span></div> | |
| <div class="chip" id="rtfChip" style="display:none">RTFx: <span id="rtfValue">—</span></div> | |
| <div class="chip" id="modelChip"><span id="modelStatus">model not loaded</span></div> | |
| </div> | |
| <div class="model-loading" id="modelLoading"></div> | |
| <button id="process" disabled>Upscale to 48 kHz</button> | |
| <div class="progress-wrap" id="progressWrap"> | |
| <div class="progress-bar-bg"><div class="progress-bar" id="progressBar"></div></div> | |
| <div class="progress-info"> | |
| <span id="progressLabel">Processing…</span> | |
| <span id="progressPct">0%</span> | |
| </div> | |
| </div> | |
| <div class="ab-section" id="abSection" style="display:none"> | |
| <div class="ab-player"> | |
| <div class="ab-label">Input<span class="ab-sr" id="inputSrLabel"></span></div> | |
| <audio controls id="inputPlayer"></audio> | |
| </div> | |
| <div class="ab-player" id="outputPanel" style="display:none"> | |
| <div class="ab-label">Output<span class="ab-sr"> — 48 kHz</span></div> | |
| <audio controls id="audioPlayer"></audio> | |
| <a id="downloadLink" download>Download WAV</a> | |
| </div> | |
| </div> | |
| <footer> | |
| <span>VAE model: <a href="https://huggingface.co/openbmb/VoxCPM2/blob/main/audiovae.pth" target="_blank">VoxCPM2</a> by <a href="https://huggingface.co/openbmb" target="_blank">OpenBMB</a> · Apache-2.0</span> | |
| <span>WebGPU port by <a href="https://huggingface.co/KevinAHM" target="_blank">KevinAHM</a></span> | |
| </footer> | |
| </div> | |
| <script> | |
// ---- Model / audio constants ----
const HOP = 640; // encoder hop size in 16 kHz samples; input is padded to a multiple of this
const TARGET_SR = 48000; // output sample rate produced by the model
const INPUT_SR = 16000; // sample rate the model consumes
const META_URL = 'onnx/meta.json'; // provides total_state_size for the streaming state
const MODEL_URL = 'onnx/vae_stream.onnx';
// ---- Mutable app state ----
let session = null; // ort.InferenceSession once the model is loaded
let meta = null; // parsed meta.json
let backend = null; // 'webgpu' or 'cpu'
let fileBuffer = null; // ArrayBuffer of the selected audio file
let fileName = ''; // original filename; used to derive the download name
// ---- Cached DOM references ----
const dropZone = document.getElementById('dropZone');
const fileInput = document.getElementById('fileInput');
const fileNameEl = document.getElementById('fileName');
const processBtn = document.getElementById('process');
const progressWrap = document.getElementById('progressWrap');
const progressBar = document.getElementById('progressBar');
const progressLabel = document.getElementById('progressLabel');
const progressPct = document.getElementById('progressPct');
const audioPlayer = document.getElementById('audioPlayer');
const downloadLink = document.getElementById('downloadLink');
const backendDot = document.getElementById('backendDot');
const backendLabel = document.getElementById('backendLabel');
const rtfChip = document.getElementById('rtfChip');
const rtfValue = document.getElementById('rtfValue');
const modelStatus = document.getElementById('modelStatus');
const modelLoading = document.getElementById('modelLoading');
// -- File handling --
// Clicking the drop zone opens the picker; drag & drop is also accepted.
dropZone.addEventListener('click', () => fileInput.click());
dropZone.addEventListener('dragover', e => { e.preventDefault(); dropZone.classList.add('dragover'); });
dropZone.addEventListener('dragleave', () => dropZone.classList.remove('dragover'));
dropZone.addEventListener('drop', e => { e.preventDefault(); dropZone.classList.remove('dragover'); handleFile(e.dataTransfer.files[0]); });
fileInput.addEventListener('change', () => { if (fileInput.files[0]) handleFile(fileInput.files[0]); });
// Accept a selected/dropped file: update the UI, wire the input preview
// player, and read the bytes + sample-rate label asynchronously.
function handleFile(file) {
  fileName = file.name;
  fileNameEl.textContent = file.name;
  fileNameEl.style.display = 'block';
  dropZone.classList.add('has-file');
  dropZone.querySelector('label').textContent = 'Selected file';
  // Show input player with original file; revoke any previous blob URL so
  // reselecting files doesn't leak object URLs.
  const inputPlayer = document.getElementById('inputPlayer');
  if (inputPlayer.src) URL.revokeObjectURL(inputPlayer.src);
  inputPlayer.src = URL.createObjectURL(file);
  document.getElementById('abSection').style.display = 'flex';
  document.getElementById('outputPanel').style.display = 'none';
  file.arrayBuffer().then(buf => {
    fileBuffer = buf;
    document.getElementById('inputSrLabel').textContent = ' — ' + _describeWavRate(buf);
    updateBtn();
  }).catch(err => {
    // Surface read failures instead of silently leaving the UI stuck.
    console.error('[VoxUpscaler] could not read file:', err);
    document.getElementById('inputSrLabel').textContent = ' — unreadable';
  });
}
// Best-effort sample-rate label for WAV input. Walks the RIFF chunk list to
// locate the 'fmt ' chunk instead of assuming it sits at byte 24, so files
// with leading LIST/JUNK chunks are reported correctly. Non-WAV -> 'Original'.
function _describeWavRate(buf) {
  const view = new DataView(buf);
  const tag = o => String.fromCharCode(view.getUint8(o), view.getUint8(o + 1), view.getUint8(o + 2), view.getUint8(o + 3));
  if (buf.byteLength >= 12 && tag(0) === 'RIFF' && tag(8) === 'WAVE') {
    let offset = 12;
    while (offset + 8 <= view.byteLength) {
      const id = tag(offset);
      const size = view.getUint32(offset + 4, true);
      if (id === 'fmt ' && offset + 12 + 4 <= view.byteLength) {
        // sample rate = uint32 LE at byte 4 of the fmt chunk payload
        return (view.getUint32(offset + 12, true) / 1000) + ' kHz';
      }
      offset += 8 + size + (size % 2); // RIFF chunks are word-aligned
    }
  }
  return 'Original';
}
// Enable the process button only once both a file and a model session exist.
function updateBtn() {
  const ready = Boolean(fileBuffer) && Boolean(session);
  processBtn.disabled = !ready;
}
// Read tensor contents regardless of backend: ORT WebGPU tensors expose an
// async getData(), while WASM tensors expose a plain .data property.
async function readTensorData(tensor) {
  if (typeof tensor.getData === 'function') {
    return await tensor.getData();
  }
  return tensor.data;
}
| // -- Detect backend & load model -- | |
// Detect the best execution backend (WebGPU, else WASM), fetch model
// metadata, and create the ONNX Runtime session. Updates the status chips as
// it goes and enables the process button on success via updateBtn().
async function init() {
  // Detect WebGPU and patch device creation to raise storage buffer limits
  if (navigator.gpu) {
    try {
      const adapter = await navigator.gpu.requestAdapter();
      if (adapter) {
        // Patch requestDevice to raise limits ORT doesn't request itself
        const origRequestDevice = GPUAdapter.prototype.requestDevice;
        const adapterLimits = adapter.limits;
        GPUAdapter.prototype.requestDevice = function(desc) {
          desc = desc || {};
          desc.requiredLimits = desc.requiredLimits || {};
          const rl = desc.requiredLimits;
          // Request the adapter's maximums so large model buffers fit
          rl.maxStorageBuffersPerShaderStage = adapterLimits.maxStorageBuffersPerShaderStage;
          rl.maxBufferSize = adapterLimits.maxBufferSize;
          rl.maxStorageBufferBindingSize = adapterLimits.maxStorageBufferBindingSize;
          console.log(`[VoxUpscaler] patched requestDevice:`, JSON.stringify(rl));
          return origRequestDevice.call(this, desc);
        };
        backend = 'webgpu';
        backendDot.className = 'indicator gpu';
        backendLabel.textContent = 'WebGPU';
      }
    } catch(e) {} // best-effort probe: any failure falls through to WASM below
  }
  if (!backend) {
    backend = 'cpu';
    backendDot.className = 'indicator cpu';
    backendLabel.textContent = 'CPU (WASM)';
  }
  // Load meta
  // NOTE(review): resp.ok is not checked — a 404 here throws on .json() with
  // no user-visible error; confirm whether that's acceptable.
  modelLoading.classList.add('active');
  modelLoading.innerHTML = '<span class="pulsing">Loading model metadata…</span>';
  const resp = await fetch(META_URL);
  meta = await resp.json();
  // Load ONNX model
  modelLoading.innerHTML = '<span class="pulsing">Loading ONNX model (fp32, ~376 MB)…</span>';
  modelStatus.textContent = 'loading…';
  const ep = backend === 'webgpu' ? 'webgpu' : 'wasm';
  const opts = { executionProviders: [ep] };
  if (ep === 'webgpu') {
    // Keep model outputs on the CPU so each chunk can be read back directly
    opts.preferredOutputLocation = {
      audio_out: 'cpu',
      state_out: 'cpu',
    };
  } else {
    opts.executionProviders = [{ name: 'wasm', options: { numThreads: navigator.hardwareConcurrency || 4 } }];
  }
  try {
    // Fetch as ArrayBuffer to avoid ORT Web external data issues
    const modelResp = await fetch(MODEL_URL);
    const modelBuf = await modelResp.arrayBuffer();
    session = await ort.InferenceSession.create(modelBuf, opts);
    modelStatus.textContent = 'ready';
    modelLoading.innerHTML = '✓ Model loaded';
    modelLoading.classList.remove('active');
    setTimeout(() => { modelLoading.style.display = 'none'; }, 1500);
  } catch(e) {
    // Fallback to wasm if webgpu fails
    if (backend === 'webgpu') {
      backend = 'cpu';
      backendDot.className = 'indicator cpu';
      backendLabel.textContent = 'CPU (WASM)';
      modelLoading.innerHTML = '<span class="pulsing">WebGPU failed, falling back to CPU (fp32)…</span>';
      // NOTE(review): fallback passes MODEL_URL (not the already-fetched
      // buffer) and is itself unguarded — a second failure rejects init().
      session = await ort.InferenceSession.create(MODEL_URL, {
        executionProviders: [{ name: 'wasm', options: { numThreads: navigator.hardwareConcurrency || 4 } }]
      });
      modelStatus.textContent = 'ready';
      modelLoading.innerHTML = '✓ Model loaded (CPU fallback)';
    } else {
      modelLoading.innerHTML = 'Failed to load model: ' + e.message;
      modelStatus.textContent = 'error';
      return;
    }
  }
  updateBtn();
}
// Average all channels of a decoded AudioBuffer into one mono Float32Array.
function mixToMono(audioBuffer) {
  const frameCount = audioBuffer.length;
  const channelCount = audioBuffer.numberOfChannels;
  const mixed = new Float32Array(frameCount);
  for (let channel = 0; channel < channelCount; channel++) {
    const samples = audioBuffer.getChannelData(channel);
    for (let i = 0; i < frameCount; i++) {
      mixed[i] += samples[i];
    }
  }
  // Scale once at the end rather than dividing per-sample.
  const scale = 1 / channelCount;
  for (let i = 0; i < frameCount; i++) {
    mixed[i] *= scale;
  }
  return mixed;
}
// Return the 4-character ASCII chunk identifier stored at `offset`.
function readFourCc(view, offset) {
  return String.fromCharCode(
    view.getUint8(offset),
    view.getUint8(offset + 1),
    view.getUint8(offset + 2),
    view.getUint8(offset + 3)
  );
}
// Decode a RIFF/WAVE ArrayBuffer to mono Float32 samples.
// Supports PCM (format 1: 8/16/24/32-bit int) and IEEE float (format 3,
// 32-bit), including WAVE_FORMAT_EXTENSIBLE (0xfffe) wrappers.
// Returns { mono, sampleRate, channels, source: 'wav' }, or null when the
// buffer is not a decodable WAV (caller falls back to Web Audio decoding).
function decodeWavToMono(arrayBuffer) {
  if (arrayBuffer.byteLength < 44) return null;
  const view = new DataView(arrayBuffer);
  if (readFourCc(view, 0) !== 'RIFF' || readFourCc(view, 8) !== 'WAVE') return null;
  let offset = 12;
  let fmt = null;
  let dataOffset = 0;
  let dataSize = 0;
  while (offset + 8 <= view.byteLength) {
    const id = readFourCc(view, offset);
    const size = view.getUint32(offset + 4, true);
    const chunkStart = offset + 8;
    if (id === 'fmt ') {
      const format = view.getUint16(chunkStart, true);
      fmt = {
        // WAVE_FORMAT_EXTENSIBLE keeps the real codec in the sub-format field
        format: format === 0xfffe && size >= 40 ? view.getUint16(chunkStart + 24, true) : format,
        channels: view.getUint16(chunkStart + 2, true),
        sampleRate: view.getUint32(chunkStart + 4, true),
        blockAlign: view.getUint16(chunkStart + 12, true),
        bitsPerSample: view.getUint16(chunkStart + 14, true),
      };
    } else if (id === 'data') {
      dataOffset = chunkStart;
      // Clamp to what is actually present: truncated files and bogus chunk
      // sizes must not cause out-of-bounds DataView reads.
      dataSize = Math.min(size, view.byteLength - chunkStart);
      break;
    }
    offset = chunkStart + size + (size % 2); // chunks are padded to even sizes
  }
  if (!fmt || !dataOffset || !dataSize) return null;
  if (fmt.format !== 1 && fmt.format !== 3) return null; // PCM or IEEE float only
  if (!fmt.channels || !fmt.sampleRate) return null;
  const bytesPerSample = fmt.bitsPerSample / 8;
  if (!Number.isInteger(bytesPerSample) || bytesPerSample < 1) return null;
  // Tolerate a zero/garbage blockAlign by deriving it from the format fields.
  const blockAlign = fmt.blockAlign || fmt.channels * bytesPerSample;
  const frames = Math.floor(dataSize / blockAlign);
  const mono = new Float32Array(frames);
  // Decode one sample at byte `pos` to a float in [-1, 1).
  const readSample = (pos) => {
    if (fmt.format === 3 && fmt.bitsPerSample === 32) return view.getFloat32(pos, true);
    if (fmt.format !== 1) return 0;
    if (fmt.bitsPerSample === 8) return (view.getUint8(pos) - 128) / 128; // 8-bit PCM is unsigned
    if (fmt.bitsPerSample === 16) return view.getInt16(pos, true) / 32768;
    if (fmt.bitsPerSample === 24) {
      let v = view.getUint8(pos) | (view.getUint8(pos + 1) << 8) | (view.getUint8(pos + 2) << 16);
      if (v & 0x800000) v |= 0xff000000; // sign-extend 24 -> 32 bits
      return v / 8388608;
    }
    if (fmt.bitsPerSample === 32) return view.getInt32(pos, true) / 2147483648;
    return 0;
  };
  // Average all channels of each frame into a single mono track.
  for (let frame = 0; frame < frames; frame++) {
    const frameOffset = dataOffset + frame * blockAlign;
    let sum = 0;
    for (let ch = 0; ch < fmt.channels; ch++) {
      sum += readSample(frameOffset + ch * bytesPerSample);
    }
    mono[frame] = sum / fmt.channels;
  }
  return {
    mono,
    sampleRate: fmt.sampleRate,
    channels: fmt.channels,
    source: 'wav',
  };
}
// sinc(x) = sin(pi*x) / (pi*x), with the removable singularity at 0 handled.
function sinc(x) {
  if (Math.abs(x) < 1e-8) return 1;
  const scaled = Math.PI * x;
  return Math.sin(scaled) / scaled;
}
// Windowed-sinc resampler (12-tap radius, sinc window). Returns a new
// Float32Array at `outSr`; when no rate change is needed, a copy of the
// input is returned. The kernel is renormalized per output sample so edge
// samples keep unity gain.
function resampleSinc(input, inSr, outSr) {
  if (inSr === outSr) return new Float32Array(input);
  const outLen = Math.round(input.length * outSr / inSr);
  const output = new Float32Array(outLen);
  const step = inSr / outSr;
  // Low-pass below the narrower Nyquist, with 5% headroom.
  const cutoff = Math.min(1, outSr / inSr) * 0.95;
  const radius = 12;
  const support = radius / cutoff;
  for (let outIdx = 0; outIdx < outLen; outIdx++) {
    const center = outIdx * step;
    const first = Math.max(0, Math.ceil(center - support));
    const last = Math.min(input.length - 1, Math.floor(center + support));
    let acc = 0;
    let norm = 0;
    for (let inIdx = first; inIdx <= last; inIdx++) {
      const x = (center - inIdx) * cutoff;
      const weight = sinc(x) * sinc(x / radius);
      acc += input[inIdx] * weight;
      norm += weight;
    }
    output[outIdx] = norm ? acc / norm : 0;
  }
  return output;
}
| // -- Decode audio to mono 16kHz Float32 -- | |
// Decode any supported audio ArrayBuffer to mono Float32 at 16 kHz.
// Tries the fast in-page WAV parser first; anything else goes through Web
// Audio's decodeAudioData. Returns a Float32Array at INPUT_SR.
async function decodeToMono16k(arrayBuffer) {
  let decodedAudio = decodeWavToMono(arrayBuffer);
  if (!decodedAudio) {
    const AudioCtx = window.AudioContext || window.webkitAudioContext;
    const audioCtx = new AudioCtx();
    try {
      // decodeAudioData detaches its input, so hand it a copy
      const decoded = await audioCtx.decodeAudioData(arrayBuffer.slice(0));
      decodedAudio = {
        mono: mixToMono(decoded),
        sampleRate: decoded.sampleRate,
        channels: decoded.numberOfChannels,
        source: 'webaudio',
      };
    } finally {
      // Always release the AudioContext — even when decoding fails — so
      // repeated failures don't exhaust the browser's context limit.
      await audioCtx.close();
    }
  }
  return resampleSinc(decodedAudio.mono, decodedAudio.sampleRate, INPUT_SR);
}
| // -- Process -- | |
// Main pipeline, run on "Upscale" click: decode -> chunked streaming
// inference (model state carried between chunks) -> trim -> WAV encode.
processBtn.addEventListener('click', async () => {
  if (!fileBuffer || !session) return;
  processBtn.disabled = true;
  // NOTE(review): session.run failures are unguarded — an inference error
  // leaves the button permanently disabled; consider try/finally.
  document.getElementById('outputPanel').style.display = 'none';
  progressWrap.classList.add('active');
  progressBar.style.width = '0%';
  progressPct.textContent = '0%';
  progressLabel.textContent = 'Decoding input…';
  const audio16k = await decodeToMono16k(fileBuffer);
  const totalSamples = audio16k.length;
  const audioDuration = totalSamples / INPUT_SR; // seconds of input audio
  // Chunk sizing: CPU=1000ms, GPU=5000ms
  const chunkMs = backend === 'webgpu' ? 5000 : 1000;
  const chunkHops = Math.max(1, Math.floor(chunkMs / 1000 * INPUT_SR / HOP));
  const chunkSamples = chunkHops * HOP;
  // Pad to HOP boundary (zero-filled tail; trimmed off after inference)
  const pad = (HOP - totalSamples % HOP) % HOP;
  let padded;
  if (pad > 0) {
    padded = new Float32Array(totalSamples + pad);
    padded.set(audio16k);
  } else {
    padded = audio16k;
  }
  const totalPadded = padded.length;
  // Init state: flat state tensor (size from meta.json) threaded chunk-to-chunk
  let state = new Float32Array(meta.total_state_size);
  const outputs = [];
  const numChunks = Math.ceil(totalPadded / chunkSamples);
  let chunkIdx = 0;
  progressLabel.textContent = 'Processing…';
  const t0 = performance.now();
  const srIdx = new Int32Array([TARGET_SR]);
  for (let pos = 0; pos < totalPadded; pos += chunkSamples) {
    const end = Math.min(pos + chunkSamples, totalPadded);
    const chunk = padded.slice(pos, end);
    // Shape: [1, 1, samples]
    const audioTensor = new ort.Tensor('float32', chunk, [1, 1, chunk.length]);
    const srTensor = new ort.Tensor('int32', srIdx, [1]);
    const stateTensor = new ort.Tensor('float32', state, [meta.total_state_size]);
    const result = await session.run({
      audio: audioTensor,
      sr_bin_idx: srTensor,
      state_in: stateTensor,
    });
    const audioOut = await readTensorData(result.audio_out);
    const stateOut = await readTensorData(result.state_out);
    outputs.push(new Float32Array(audioOut));
    state = new Float32Array(stateOut); // carry state into the next chunk
    chunkIdx++;
    const pct = Math.round(chunkIdx / numChunks * 100);
    progressBar.style.width = pct + '%';
    progressPct.textContent = pct + '%';
    // RTFx = audio seconds processed per wall-clock second (>1 beats realtime)
    const elapsed = (performance.now() - t0) / 1000;
    const processedDur = end / INPUT_SR;
    const rtf = processedDur / elapsed;
    rtfChip.style.display = 'flex';
    rtfValue.textContent = rtf.toFixed(3) + 'x';
  }
  const totalElapsed = (performance.now() - t0) / 1000;
  const finalRtf = audioDuration / totalElapsed;
  rtfValue.textContent = finalRtf.toFixed(3) + 'x';
  progressLabel.textContent = `Done in ${totalElapsed.toFixed(1)}s`;
  progressPct.textContent = '100%';
  progressBar.style.width = '100%';
  // Concatenate outputs
  const totalOut = outputs.reduce((s, a) => s + a.length, 0);
  const fullOutput = new Float32Array(totalOut);
  let off = 0;
  for (const o of outputs) { fullOutput.set(o, off); off += o.length; }
  // Trim to expected length (drops output generated from the zero padding)
  const expectedLen = Math.round(audioDuration * TARGET_SR);
  const trimmed = fullOutput.slice(0, expectedLen);
  // Encode WAV
  const wav = encodeWav(trimmed, TARGET_SR);
  const blob = new Blob([wav], { type: 'audio/wav' });
  const url = URL.createObjectURL(blob);
  audioPlayer.src = url;
  const outName = fileName.replace(/\.[^.]+$/, '') + '_48k.wav';
  downloadLink.href = url;
  downloadLink.download = outName;
  downloadLink.textContent = 'Download ' + outName;
  document.getElementById('outputPanel').style.display = 'block';
  processBtn.disabled = false;
});
// Serialize mono Float32 samples into a 16-bit PCM WAV ArrayBuffer.
// Samples are clipped to [-1, 1] before quantization.
function encodeWav(samples, sr) {
  const frameCount = samples.length;
  const dataBytes = frameCount * 2; // 16-bit mono => 2 bytes per frame
  const buffer = new ArrayBuffer(44 + dataBytes);
  const view = new DataView(buffer);
  const putTag = (offset, tag) => {
    for (let i = 0; i < tag.length; i++) view.setUint8(offset + i, tag.charCodeAt(i));
  };
  putTag(0, 'RIFF');
  view.setUint32(4, 36 + dataBytes, true); // RIFF payload size
  putTag(8, 'WAVE');
  putTag(12, 'fmt ');
  view.setUint32(16, 16, true);     // fmt chunk size
  view.setUint16(20, 1, true);      // audio format: PCM
  view.setUint16(22, 1, true);      // channels: mono
  view.setUint32(24, sr, true);     // sample rate
  view.setUint32(28, sr * 2, true); // byte rate
  view.setUint16(32, 2, true);      // block align
  view.setUint16(34, 16, true);     // bits per sample
  putTag(36, 'data');
  view.setUint32(40, dataBytes, true);
  for (let i = 0; i < frameCount; i++) {
    const clipped = Math.max(-1, Math.min(1, samples[i]));
    // Asymmetric scaling so -1 maps to -32768 and +1 maps to +32767.
    const pcm = clipped < 0 ? clipped * 0x8000 : clipped * 0x7FFF;
    view.setInt16(44 + i * 2, pcm, true);
  }
  return buffer;
}
// Load ONNX Runtime Web (WebGPU build) from CDN, then initialize the app.
const script = document.createElement('script');
script.src = 'https://cdn.jsdelivr.net/npm/onnxruntime-web@1.22.0/dist/ort.webgpu.min.js';
script.crossOrigin = 'anonymous';
script.onload = () => {
  ort.env.wasm.numThreads = navigator.hardwareConcurrency || 4;
  // wasm binaries are fetched relative to this path
  ort.env.wasm.wasmPaths = 'https://cdn.jsdelivr.net/npm/onnxruntime-web@1.22.0/dist/';
  init();
};
script.onerror = () => {
  // Without ORT nothing works; report it instead of hanging at "detecting…".
  backendLabel.textContent = 'unavailable';
  modelStatus.textContent = 'error';
  modelLoading.classList.add('active');
  modelLoading.textContent = 'Failed to load ONNX Runtime from CDN — check your network.';
};
document.head.appendChild(script);
| </script> | |
| </body> | |
| </html> | |