vox-upscaler-web / index.html
KevinAHM's picture
Fix deterministic audio preprocessing on Windows
ea8d726 verified
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Vox Upscaler</title>
<link rel="preconnect" href="https://fonts.googleapis.com">
<link href="https://fonts.googleapis.com/css2?family=DM+Mono:wght@400;500&family=Instrument+Sans:wght@400;600;700&display=swap" rel="stylesheet">
<style>
*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
:root {
--bg: #0a0a0f;
--surface: #13131a;
--border: #1e1e2a;
--text: #e8e8ed;
--text-dim: #6b6b7b;
--accent: #ff6b35;
--accent-glow: rgba(255, 107, 53, 0.15);
--green: #34d399;
--yellow: #fbbf24;
--red: #f87171;
--font-body: 'Instrument Sans', sans-serif;
--font-mono: 'DM Mono', monospace;
}
body {
background: var(--bg);
color: var(--text);
font-family: var(--font-body);
min-height: 100vh;
display: flex;
align-items: center;
justify-content: center;
}
.container {
width: 100%;
max-width: 520px;
padding: 2rem;
}
h1 {
font-size: 1.1rem;
font-weight: 700;
letter-spacing: 0.08em;
text-transform: uppercase;
margin-bottom: 2rem;
display: flex;
align-items: center;
gap: 0.6rem;
}
h1 .dot {
width: 8px; height: 8px;
border-radius: 50%;
background: var(--accent);
box-shadow: 0 0 12px var(--accent);
}
.drop-zone {
border: 2px dashed var(--border);
border-radius: 12px;
padding: 3rem 2rem;
text-align: center;
cursor: pointer;
transition: border-color 0.2s, background 0.2s;
position: relative;
}
.drop-zone:hover, .drop-zone.dragover {
border-color: var(--accent);
background: var(--accent-glow);
}
.drop-zone.has-file {
border-style: solid;
border-color: var(--border);
padding: 1.2rem 1.5rem;
text-align: left;
}
.drop-zone label {
font-size: 0.85rem;
color: var(--text-dim);
display: block;
cursor: pointer;
}
.drop-zone .filename {
font-family: var(--font-mono);
font-size: 0.9rem;
margin-top: 0.3rem;
color: var(--text);
}
.drop-zone input { display: none; }
.status-bar {
display: flex;
gap: 1.2rem;
margin-top: 1.2rem;
font-family: var(--font-mono);
font-size: 0.75rem;
color: var(--text-dim);
}
.status-bar .chip {
display: flex;
align-items: center;
gap: 0.4rem;
background: var(--surface);
border: 1px solid var(--border);
border-radius: 6px;
padding: 0.35rem 0.7rem;
}
.chip .indicator {
width: 6px; height: 6px;
border-radius: 50%;
background: var(--text-dim);
}
.chip .indicator.gpu { background: var(--green); box-shadow: 0 0 6px var(--green); }
.chip .indicator.cpu { background: var(--yellow); box-shadow: 0 0 6px var(--yellow); }
button#process {
width: 100%;
margin-top: 1.5rem;
padding: 0.9rem;
border: none;
border-radius: 10px;
background: var(--accent);
color: #fff;
font-family: var(--font-body);
font-size: 0.9rem;
font-weight: 600;
cursor: pointer;
transition: opacity 0.2s, transform 0.1s;
}
button#process:hover { opacity: 0.9; }
button#process:active { transform: scale(0.98); }
button#process:disabled { opacity: 0.4; cursor: not-allowed; transform: none; }
.progress-wrap {
margin-top: 1.5rem;
display: none;
}
.progress-wrap.active { display: block; }
.progress-bar-bg {
width: 100%;
height: 4px;
background: var(--surface);
border-radius: 2px;
overflow: hidden;
}
.progress-bar {
height: 100%;
width: 0%;
background: var(--accent);
border-radius: 2px;
transition: width 0.15s;
}
.progress-info {
display: flex;
justify-content: space-between;
margin-top: 0.6rem;
font-family: var(--font-mono);
font-size: 0.75rem;
color: var(--text-dim);
}
.ab-section {
margin-top: 1.5rem;
display: flex;
flex-direction: column;
gap: 1rem;
}
.ab-player {
background: var(--surface);
border: 1px solid var(--border);
border-radius: 10px;
padding: 1rem;
}
.ab-label {
font-family: var(--font-mono);
font-size: 0.8rem;
font-weight: 500;
margin-bottom: 0.5rem;
color: var(--text);
}
.ab-label .ab-sr {
color: var(--text-dim);
font-weight: 400;
}
.ab-player audio {
width: 100%;
border-radius: 6px;
}
.ab-player a {
display: inline-block;
margin-top: 0.6rem;
font-family: var(--font-mono);
font-size: 0.8rem;
color: var(--accent);
text-decoration: none;
}
.ab-player a:hover { text-decoration: underline; }
.model-loading {
margin-top: 1rem;
font-family: var(--font-mono);
font-size: 0.75rem;
color: var(--text-dim);
display: none;
}
.model-loading.active { display: block; }
footer {
margin-top: 3rem;
padding-top: 1.2rem;
border-top: 1px solid var(--border);
font-family: var(--font-mono);
font-size: 0.7rem;
color: var(--text-dim);
display: flex;
flex-direction: column;
gap: 0.3rem;
}
footer a {
color: var(--text-dim);
text-decoration: none;
border-bottom: 1px dotted var(--text-dim);
}
footer a:hover { color: var(--text); border-color: var(--text); }
@keyframes pulse { 0%,100% { opacity: 1; } 50% { opacity: 0.4; } }
.pulsing { animation: pulse 1.5s ease-in-out infinite; }
</style>
</head>
<body>
<div class="container">
<h1><span class="dot"></span>Vox Upscaler</h1>
<div class="drop-zone" id="dropZone">
<label>Drop an audio file or click to browse</label>
<div class="filename" id="fileName" style="display:none"></div>
<input type="file" id="fileInput" accept="audio/*">
</div>
<div class="status-bar">
<div class="chip"><span class="indicator" id="backendDot"></span><span id="backendLabel">detecting…</span></div>
<div class="chip" id="rtfChip" style="display:none">RTFx: <span id="rtfValue"></span></div>
<div class="chip" id="modelChip"><span id="modelStatus">model not loaded</span></div>
</div>
<div class="model-loading" id="modelLoading"></div>
<button id="process" disabled>Upscale to 48 kHz</button>
<div class="progress-wrap" id="progressWrap">
<div class="progress-bar-bg"><div class="progress-bar" id="progressBar"></div></div>
<div class="progress-info">
<span id="progressLabel">Processing…</span>
<span id="progressPct">0%</span>
</div>
</div>
<div class="ab-section" id="abSection" style="display:none">
<div class="ab-player">
<div class="ab-label">Input<span class="ab-sr" id="inputSrLabel"></span></div>
<audio controls id="inputPlayer"></audio>
</div>
<div class="ab-player" id="outputPanel" style="display:none">
<div class="ab-label">Output<span class="ab-sr"> — 48 kHz</span></div>
<audio controls id="audioPlayer"></audio>
<a id="downloadLink" download>Download WAV</a>
</div>
</div>
<footer>
<span>VAE model: <a href="https://huggingface.co/openbmb/VoxCPM2/blob/main/audiovae.pth" target="_blank">VoxCPM2</a> by <a href="https://huggingface.co/openbmb" target="_blank">OpenBMB</a> · Apache-2.0</span>
<span>WebGPU port by <a href="https://huggingface.co/KevinAHM" target="_blank">KevinAHM</a></span>
</footer>
</div>
<script>
// --- Model / DSP constants ---
const HOP = 640;   // model hop size in 16 kHz input samples (padding granularity)
const TARGET_SR = 48000;   // output sample rate produced by the model
const INPUT_SR = 16000;    // sample rate the model expects on input
const META_URL = 'onnx/meta.json';        // sidecar with total_state_size
const MODEL_URL = 'onnx/vae_stream.onnx'; // streaming VAE ONNX graph (~376 MB fp32)
// --- Mutable app state ---
let session = null;    // ort.InferenceSession once the model is loaded
let meta = null;       // parsed meta.json (provides total_state_size)
let backend = null; // 'webgpu' or 'cpu'
let fileBuffer = null; // raw ArrayBuffer of the currently selected file
let fileName = '';     // original file name (used to build the download name)
// --- Cached DOM references ---
const dropZone = document.getElementById('dropZone');
const fileInput = document.getElementById('fileInput');
const fileNameEl = document.getElementById('fileName');
const processBtn = document.getElementById('process');
const progressWrap = document.getElementById('progressWrap');
const progressBar = document.getElementById('progressBar');
const progressLabel = document.getElementById('progressLabel');
const progressPct = document.getElementById('progressPct');
const audioPlayer = document.getElementById('audioPlayer');
const downloadLink = document.getElementById('downloadLink');
const backendDot = document.getElementById('backendDot');
const backendLabel = document.getElementById('backendLabel');
const rtfChip = document.getElementById('rtfChip');
const rtfValue = document.getElementById('rtfValue');
const modelStatus = document.getElementById('modelStatus');
const modelLoading = document.getElementById('modelLoading');
// -- File handling --
// Clicking the drop zone opens the hidden <input type="file">.
dropZone.addEventListener('click', () => fileInput.click());
// Highlight the zone while a file is dragged over it.
dropZone.addEventListener('dragover', e => { e.preventDefault(); dropZone.classList.add('dragover'); });
dropZone.addEventListener('dragleave', () => dropZone.classList.remove('dragover'));
// Dropping a file behaves the same as picking one via the input.
dropZone.addEventListener('drop', e => { e.preventDefault(); dropZone.classList.remove('dragover'); handleFile(e.dataTransfer.files[0]); });
fileInput.addEventListener('change', () => { if (fileInput.files[0]) handleFile(fileInput.files[0]); });
/**
 * Register a newly selected/dropped file: update the drop-zone UI, preview the
 * original audio, and read the raw bytes so the process step can decode them.
 * @param {File} file - audio file picked or dropped by the user
 */
function handleFile(file) {
  fileName = file.name;
  fileNameEl.textContent = file.name;
  fileNameEl.style.display = 'block';
  dropZone.classList.add('has-file');
  dropZone.querySelector('label').textContent = 'Selected file';
  // Show input player with original file. Revoke the previous preview URL
  // first so repeated selections don't leak blob objects.
  const inputPlayer = document.getElementById('inputPlayer');
  if (inputPlayer.src.startsWith('blob:')) URL.revokeObjectURL(inputPlayer.src);
  inputPlayer.src = URL.createObjectURL(file);
  document.getElementById('abSection').style.display = 'flex';
  document.getElementById('outputPanel').style.display = 'none';
  // Best-effort sample-rate sniff: for RIFF/WAV, bytes 24-27 hold the
  // sample rate (little-endian uint32). Other formats just show "Original".
  file.arrayBuffer().then(buf => {
    fileBuffer = buf;
    const view = new DataView(buf);
    let srText = 'Original';
    if (buf.byteLength > 28) {
      const riff = String.fromCharCode(view.getUint8(0), view.getUint8(1), view.getUint8(2), view.getUint8(3));
      if (riff === 'RIFF') {
        const sr = view.getUint32(24, true);
        srText = (sr / 1000) + ' kHz';
      }
    }
    document.getElementById('inputSrLabel').textContent = ' — ' + srText;
    updateBtn();
  }).catch(err => {
    // Previously a read failure was silently dropped, leaving the process
    // button disabled with no explanation.
    console.error('[VoxUpscaler] failed to read file:', err);
    fileNameEl.textContent = file.name + ' (read failed)';
  });
}
function updateBtn() { processBtn.disabled = !(fileBuffer && session); }
/**
 * Extract raw data from an ORT tensor. Newer onnxruntime-web tensors expose an
 * async getData() (required when data lives on the GPU); older ones expose a
 * plain .data property.
 * @param {object} tensor - ORT output tensor
 * @returns {Promise<*>} the tensor's backing data
 */
async function readTensorData(tensor) {
  if (typeof tensor.getData === 'function') {
    return await tensor.getData();
  }
  return tensor.data;
}
// -- Detect backend & load model --
// Chooses WebGPU when available (falling back to WASM), loads model metadata,
// creates the ONNX session, and enables the process button via updateBtn().
async function init() {
  // Detect WebGPU and patch device creation to raise storage buffer limits
  if (navigator.gpu) {
    try {
      const adapter = await navigator.gpu.requestAdapter();
      if (adapter) {
        // Patch requestDevice to raise limits ORT doesn't request itself
        const origRequestDevice = GPUAdapter.prototype.requestDevice;
        const adapterLimits = adapter.limits;
        GPUAdapter.prototype.requestDevice = function(desc) {
          desc = desc || {};
          desc.requiredLimits = desc.requiredLimits || {};
          const rl = desc.requiredLimits;
          rl.maxStorageBuffersPerShaderStage = adapterLimits.maxStorageBuffersPerShaderStage;
          rl.maxBufferSize = adapterLimits.maxBufferSize;
          rl.maxStorageBufferBindingSize = adapterLimits.maxStorageBufferBindingSize;
          console.log(`[VoxUpscaler] patched requestDevice:`, JSON.stringify(rl));
          return origRequestDevice.call(this, desc);
        };
        backend = 'webgpu';
        backendDot.className = 'indicator gpu';
        backendLabel.textContent = 'WebGPU';
      }
    } catch(e) {
      // Was a silent swallow; log why detection failed before falling back.
      console.warn('[VoxUpscaler] WebGPU detection failed:', e);
    }
  }
  if (!backend) {
    backend = 'cpu';
    backendDot.className = 'indicator cpu';
    backendLabel.textContent = 'CPU (WASM)';
  }
  // Load meta
  modelLoading.classList.add('active');
  modelLoading.innerHTML = '<span class="pulsing">Loading model metadata…</span>';
  const resp = await fetch(META_URL);
  if (!resp.ok) {
    // Fail early with a readable message instead of a JSON parse error on an
    // HTML 404 page.
    modelLoading.innerHTML = 'Failed to load model metadata (HTTP ' + resp.status + ')';
    modelStatus.textContent = 'error';
    return;
  }
  meta = await resp.json();
  // Load ONNX model
  modelLoading.innerHTML = '<span class="pulsing">Loading ONNX model (fp32, ~376 MB)…</span>';
  modelStatus.textContent = 'loading…';
  const ep = backend === 'webgpu' ? 'webgpu' : 'wasm';
  const opts = { executionProviders: [ep] };
  if (ep === 'webgpu') {
    // Keep outputs on the CPU so readTensorData gets plain typed arrays.
    opts.preferredOutputLocation = {
      audio_out: 'cpu',
      state_out: 'cpu',
    };
  } else {
    opts.executionProviders = [{ name: 'wasm', options: { numThreads: navigator.hardwareConcurrency || 4 } }];
  }
  try {
    // Fetch as ArrayBuffer to avoid ORT Web external data issues
    const modelResp = await fetch(MODEL_URL);
    const modelBuf = await modelResp.arrayBuffer();
    session = await ort.InferenceSession.create(modelBuf, opts);
    modelStatus.textContent = 'ready';
    modelLoading.innerHTML = '✓ Model loaded';
    modelLoading.classList.remove('active');
    setTimeout(() => { modelLoading.style.display = 'none'; }, 1500);
  } catch(e) {
    // Fallback to wasm if webgpu fails
    if (backend === 'webgpu') {
      backend = 'cpu';
      backendDot.className = 'indicator cpu';
      backendLabel.textContent = 'CPU (WASM)';
      modelLoading.innerHTML = '<span class="pulsing">WebGPU failed, falling back to CPU (fp32)…</span>';
      try {
        session = await ort.InferenceSession.create(MODEL_URL, {
          executionProviders: [{ name: 'wasm', options: { numThreads: navigator.hardwareConcurrency || 4 } }]
        });
      } catch (fallbackErr) {
        // Previously a fallback failure escaped as an unhandled rejection and
        // left the UI stuck on "loading…" forever.
        modelLoading.innerHTML = 'Failed to load model: ' + fallbackErr.message;
        modelStatus.textContent = 'error';
        return;
      }
      modelStatus.textContent = 'ready';
      modelLoading.innerHTML = '✓ Model loaded (CPU fallback)';
    } else {
      modelLoading.innerHTML = 'Failed to load model: ' + e.message;
      modelStatus.textContent = 'error';
      return;
    }
  }
  updateBtn();
}
/**
 * Downmix an AudioBuffer to mono by averaging all channels.
 * @param {AudioBuffer} audioBuffer - decoded WebAudio buffer
 * @returns {Float32Array} mono samples, same length as the input buffer
 */
function mixToMono(audioBuffer) {
  const { length, numberOfChannels } = audioBuffer;
  const mono = new Float32Array(length);
  // Accumulate every channel, then scale once by 1/channels.
  for (let channel = 0; channel < numberOfChannels; channel++) {
    const samples = audioBuffer.getChannelData(channel);
    for (let i = 0; i < length; i++) {
      mono[i] += samples[i];
    }
  }
  const scale = 1 / numberOfChannels;
  for (let i = 0; i < length; i++) {
    mono[i] *= scale;
  }
  return mono;
}
/**
 * Read a 4-character ASCII chunk tag (FourCC) from a DataView.
 * @param {DataView} view - view over the WAV bytes
 * @param {number} offset - byte offset of the tag
 * @returns {string} the 4-character tag, e.g. 'RIFF' or 'fmt '
 */
function readFourCc(view, offset) {
  let tag = '';
  for (let i = 0; i < 4; i++) {
    tag += String.fromCharCode(view.getUint8(offset + i));
  }
  return tag;
}
/**
 * Minimal RIFF/WAVE parser producing averaged-to-mono float samples.
 * Handles PCM (format 1: 8/16/24/32-bit) and IEEE float (format 3: 32-bit),
 * including WAVE_FORMAT_EXTENSIBLE wrappers.
 * @param {ArrayBuffer} arrayBuffer - raw file bytes
 * @returns {?{mono: Float32Array, sampleRate: number, channels: number, source: string}}
 *   decoded result, or null when the buffer isn't a WAV we can decode
 *   (the caller then falls back to WebAudio decoding).
 */
function decodeWavToMono(arrayBuffer) {
  if (arrayBuffer.byteLength < 44) return null;
  const view = new DataView(arrayBuffer);
  // Local FourCC reader so this parser is self-contained.
  const fourCc = (at) => String.fromCharCode(
    view.getUint8(at),
    view.getUint8(at + 1),
    view.getUint8(at + 2),
    view.getUint8(at + 3)
  );
  if (fourCc(0) !== 'RIFF' || fourCc(8) !== 'WAVE') return null;
  let offset = 12;
  let fmt = null;
  let dataOffset = 0;
  let dataSize = 0;
  while (offset + 8 <= view.byteLength) {
    const id = fourCc(offset);
    const size = view.getUint32(offset + 4, true);
    const chunkStart = offset + 8;
    if (id === 'fmt ') {
      // Guard against a lying chunk size on a truncated file.
      if (chunkStart + 16 > view.byteLength) return null;
      const format = view.getUint16(chunkStart, true);
      fmt = {
        // For WAVE_FORMAT_EXTENSIBLE (0xfffe) the real format code is the
        // first 2 bytes of the SubFormat GUID at chunk offset 24.
        format: format === 0xfffe && size >= 40 ? view.getUint16(chunkStart + 24, true) : format,
        channels: view.getUint16(chunkStart + 2, true),
        sampleRate: view.getUint32(chunkStart + 4, true),
        blockAlign: view.getUint16(chunkStart + 12, true),
        bitsPerSample: view.getUint16(chunkStart + 14, true),
      };
    } else if (id === 'data') {
      dataOffset = chunkStart;
      // Clamp to the real buffer size so truncated files don't trigger
      // out-of-bounds DataView reads below.
      dataSize = Math.min(size, view.byteLength - chunkStart);
      break;
    }
    // Chunks are word-aligned: odd sizes carry one padding byte.
    offset = chunkStart + size + (size % 2);
  }
  if (!fmt || !dataOffset || dataSize <= 0) return null;
  if (fmt.format !== 1 && fmt.format !== 3) return null;
  if (fmt.blockAlign < 1 || fmt.channels < 1) return null; // avoid divide-by-zero
  const bytesPerSample = fmt.bitsPerSample / 8;
  if (!Number.isInteger(bytesPerSample) || bytesPerSample < 1) return null;
  const frames = Math.floor(dataSize / fmt.blockAlign);
  const mono = new Float32Array(frames);
  // Decode one sample at byte position `pos` into [-1, 1].
  const readSample = (pos) => {
    if (fmt.format === 3 && fmt.bitsPerSample === 32) return view.getFloat32(pos, true);
    if (fmt.format !== 1) return 0;
    if (fmt.bitsPerSample === 8) return (view.getUint8(pos) - 128) / 128; // 8-bit PCM is unsigned
    if (fmt.bitsPerSample === 16) return view.getInt16(pos, true) / 32768;
    if (fmt.bitsPerSample === 24) {
      let v = view.getUint8(pos) | (view.getUint8(pos + 1) << 8) | (view.getUint8(pos + 2) << 16);
      if (v & 0x800000) v |= 0xff000000; // sign-extend 24 -> 32 bits
      return v / 8388608;
    }
    if (fmt.bitsPerSample === 32) return view.getInt32(pos, true) / 2147483648;
    return 0;
  };
  // Average all channels of each frame into one mono sample.
  for (let frame = 0; frame < frames; frame++) {
    const frameOffset = dataOffset + frame * fmt.blockAlign;
    let sum = 0;
    for (let ch = 0; ch < fmt.channels; ch++) {
      sum += readSample(frameOffset + ch * bytesPerSample);
    }
    mono[frame] = sum / fmt.channels;
  }
  return {
    mono,
    sampleRate: fmt.sampleRate,
    channels: fmt.channels,
    source: 'wav',
  };
}
/**
 * Normalized sinc: sin(pi*x) / (pi*x), with the removable singularity at 0.
 * @param {number} x
 * @returns {number}
 */
function sinc(x) {
  if (Math.abs(x) < 1e-8) {
    return 1;
  }
  const scaled = Math.PI * x;
  return Math.sin(scaled) / scaled;
}
/**
 * Windowed-sinc resampler (Lanczos-style window, 12-tap radius).
 * @param {Float32Array} input - source samples
 * @param {number} inSr - source sample rate
 * @param {number} outSr - destination sample rate
 * @returns {Float32Array} resampled signal (a copy even when rates match)
 */
function resampleSinc(input, inSr, outSr) {
  if (inSr === outSr) return new Float32Array(input);
  const outLen = Math.round(input.length * outSr / inSr);
  const result = new Float32Array(outLen);
  const step = inSr / outSr;
  // Lowpass slightly below Nyquist to leave an anti-aliasing margin.
  const cutoff = Math.min(1, outSr / inSr) * 0.95;
  const radius = 12;
  const support = radius / cutoff;
  for (let outIdx = 0; outIdx < outLen; outIdx++) {
    const center = outIdx * step;
    const first = Math.max(0, Math.ceil(center - support));
    const last = Math.min(input.length - 1, Math.floor(center + support));
    let acc = 0;
    let norm = 0;
    for (let tap = first; tap <= last; tap++) {
      const x = (center - tap) * cutoff;
      const w = sinc(x) * sinc(x / radius);
      acc += input[tap] * w;
      norm += w;
    }
    // Normalize by the window sum so edge samples keep unity gain.
    result[outIdx] = norm ? acc / norm : 0;
  }
  return result;
}
// -- Decode audio to mono 16kHz Float32 --
/**
 * Decode arbitrary audio bytes to mono 16 kHz Float32 samples.
 * Tries the deterministic in-house WAV parser first; anything else goes
 * through WebAudio's decodeAudioData.
 * @param {ArrayBuffer} arrayBuffer - raw file bytes
 * @returns {Promise<Float32Array>} mono samples at INPUT_SR (16 kHz)
 */
async function decodeToMono16k(arrayBuffer) {
  let decodedAudio = decodeWavToMono(arrayBuffer);
  if (!decodedAudio) {
    const AudioCtx = window.AudioContext || window.webkitAudioContext;
    const audioCtx = new AudioCtx();
    try {
      // slice(0): decodeAudioData detaches the buffer it's given.
      const decoded = await audioCtx.decodeAudioData(arrayBuffer.slice(0));
      decodedAudio = {
        mono: mixToMono(decoded),
        sampleRate: decoded.sampleRate,
        channels: decoded.numberOfChannels,
        source: 'webaudio',
      };
    } finally {
      // Close the context even when decoding fails; previously a decode
      // error leaked a live AudioContext.
      await audioCtx.close();
    }
  }
  return resampleSinc(decodedAudio.mono, decodedAudio.sampleRate, INPUT_SR);
}
// -- Process --
// Runs the streaming VAE over the selected file in fixed-size chunks,
// threading the recurrent state tensor between chunks, then offers the
// concatenated 48 kHz result as a playable/downloadable WAV.
processBtn.addEventListener('click', async () => {
  if (!fileBuffer || !session) return;
  processBtn.disabled = true;
  document.getElementById('outputPanel').style.display = 'none';
  progressWrap.classList.add('active');
  progressBar.style.width = '0%';
  progressPct.textContent = '0%';
  progressLabel.textContent = 'Decoding input…';
  try {
    const audio16k = await decodeToMono16k(fileBuffer);
    const totalSamples = audio16k.length;
    const audioDuration = totalSamples / INPUT_SR;
    // Chunk sizing: CPU=1000ms, GPU=5000ms
    const chunkMs = backend === 'webgpu' ? 5000 : 1000;
    const chunkHops = Math.max(1, Math.floor(chunkMs / 1000 * INPUT_SR / HOP));
    const chunkSamples = chunkHops * HOP;
    // Pad to HOP boundary
    const pad = (HOP - totalSamples % HOP) % HOP;
    let padded;
    if (pad > 0) {
      padded = new Float32Array(totalSamples + pad);
      padded.set(audio16k);
    } else {
      padded = audio16k;
    }
    const totalPadded = padded.length;
    // Init recurrent state (zeros); shape comes from meta.json.
    let state = new Float32Array(meta.total_state_size);
    const outputs = [];
    const numChunks = Math.ceil(totalPadded / chunkSamples);
    let chunkIdx = 0;
    progressLabel.textContent = 'Processing…';
    const t0 = performance.now();
    const srIdx = new Int32Array([TARGET_SR]);
    for (let pos = 0; pos < totalPadded; pos += chunkSamples) {
      const end = Math.min(pos + chunkSamples, totalPadded);
      const chunk = padded.slice(pos, end);
      // Shape: [1, 1, samples]
      const audioTensor = new ort.Tensor('float32', chunk, [1, 1, chunk.length]);
      const srTensor = new ort.Tensor('int32', srIdx, [1]);
      const stateTensor = new ort.Tensor('float32', state, [meta.total_state_size]);
      const result = await session.run({
        audio: audioTensor,
        sr_bin_idx: srTensor,
        state_in: stateTensor,
      });
      const audioOut = await readTensorData(result.audio_out);
      const stateOut = await readTensorData(result.state_out);
      outputs.push(new Float32Array(audioOut));
      state = new Float32Array(stateOut);
      chunkIdx++;
      const pct = Math.round(chunkIdx / numChunks * 100);
      progressBar.style.width = pct + '%';
      progressPct.textContent = pct + '%';
      // Live real-time-factor readout (audio seconds per wall-clock second).
      const elapsed = (performance.now() - t0) / 1000;
      const processedDur = end / INPUT_SR;
      const rtf = processedDur / elapsed;
      rtfChip.style.display = 'flex';
      rtfValue.textContent = rtf.toFixed(3) + 'x';
    }
    const totalElapsed = (performance.now() - t0) / 1000;
    const finalRtf = audioDuration / totalElapsed;
    rtfValue.textContent = finalRtf.toFixed(3) + 'x';
    progressLabel.textContent = `Done in ${totalElapsed.toFixed(1)}s`;
    progressPct.textContent = '100%';
    progressBar.style.width = '100%';
    // Concatenate outputs
    const totalOut = outputs.reduce((s, a) => s + a.length, 0);
    const fullOutput = new Float32Array(totalOut);
    let off = 0;
    for (const o of outputs) { fullOutput.set(o, off); off += o.length; }
    // Trim the tail generated from HOP-boundary padding.
    const expectedLen = Math.round(audioDuration * TARGET_SR);
    const trimmed = fullOutput.slice(0, expectedLen);
    // Encode WAV; revoke the previous run's blob URL to avoid leaking it.
    const wav = encodeWav(trimmed, TARGET_SR);
    const blob = new Blob([wav], { type: 'audio/wav' });
    if (audioPlayer.src.startsWith('blob:')) URL.revokeObjectURL(audioPlayer.src);
    const url = URL.createObjectURL(blob);
    audioPlayer.src = url;
    const outName = fileName.replace(/\.[^.]+$/, '') + '_48k.wav';
    downloadLink.href = url;
    downloadLink.download = outName;
    downloadLink.textContent = 'Download ' + outName;
    document.getElementById('outputPanel').style.display = 'block';
  } catch (err) {
    // Previously any decode/inference failure left the button disabled and the
    // progress bar stuck with no message.
    console.error('[VoxUpscaler] processing failed:', err);
    progressLabel.textContent = 'Error: ' + (err && err.message ? err.message : String(err));
  } finally {
    processBtn.disabled = false;
  }
});
/**
 * Serialize mono float samples as a 16-bit PCM RIFF/WAVE file.
 * @param {Float32Array} samples - mono samples in [-1, 1] (clipped if outside)
 * @param {number} sr - sample rate to write into the header
 * @returns {ArrayBuffer} complete WAV file bytes (44-byte header + PCM data)
 */
function encodeWav(samples, sr) {
  const sampleCount = samples.length;
  const dataBytes = sampleCount * 2;
  const buffer = new ArrayBuffer(44 + dataBytes);
  const view = new DataView(buffer);
  const putTag = (offset, tag) => {
    for (let i = 0; i < tag.length; i++) view.setUint8(offset + i, tag.charCodeAt(i));
  };
  putTag(0, 'RIFF');
  view.setUint32(4, 36 + dataBytes, true);  // RIFF payload size
  putTag(8, 'WAVE');
  putTag(12, 'fmt ');
  view.setUint32(16, 16, true);             // fmt chunk size
  view.setUint16(20, 1, true);              // format: PCM
  view.setUint16(22, 1, true);              // channels: mono
  view.setUint32(24, sr, true);             // sample rate
  view.setUint32(28, sr * 2, true);         // byte rate = sr * blockAlign
  view.setUint16(32, 2, true);              // block align
  view.setUint16(34, 16, true);             // bits per sample
  putTag(36, 'data');
  view.setUint32(40, dataBytes, true);
  // Asymmetric scaling: -1 maps to -32768, +1 maps to +32767.
  for (let i = 0; i < sampleCount; i++) {
    const clipped = Math.max(-1, Math.min(1, samples[i]));
    view.setInt16(44 + i * 2, clipped < 0 ? clipped * 0x8000 : clipped * 0x7FFF, true);
  }
  return buffer;
}
// Load ORT and init
// Injects onnxruntime-web from the CDN, then kicks off backend detection and
// model loading. Both the script load and init() now report failures.
const script = document.createElement('script');
script.src = 'https://cdn.jsdelivr.net/npm/onnxruntime-web@1.22.0/dist/ort.webgpu.min.js';
script.crossOrigin = 'anonymous';
script.onload = () => {
  ort.env.wasm.numThreads = navigator.hardwareConcurrency || 4;
  ort.env.wasm.wasmPaths = 'https://cdn.jsdelivr.net/npm/onnxruntime-web@1.22.0/dist/';
  // init() was a floating promise before; surface any rejection in the UI.
  init().catch(err => {
    console.error('[VoxUpscaler] init failed:', err);
    modelStatus.textContent = 'error';
    modelLoading.classList.add('active');
    modelLoading.textContent = 'Initialization failed: ' + (err && err.message ? err.message : String(err));
  });
};
script.onerror = () => {
  // Without this, a CDN failure left the page silently stuck on "detecting…".
  modelStatus.textContent = 'error';
  modelLoading.classList.add('active');
  modelLoading.textContent = 'Failed to load onnxruntime-web from CDN. Check your network and reload.';
};
document.head.appendChild(script);
</script>
</body>
</html>