fix: re-encode reference voices as PCM WAV in browser; expand turbo tag list to 9
Browse files
- server/models/chatterbox_turbo.py +13 -2
- web/src/components/VoiceComposer.tsx +4 -1
- web/src/lib/wav.ts +57 -0
server/models/chatterbox_turbo.py
CHANGED
|
@@ -13,10 +13,21 @@ class Adapter:
|
|
| 13 |
id: ClassVar[str] = "chatterbox-turbo"
|
| 14 |
label: ClassVar[str] = "Chatterbox Turbo"
|
| 15 |
description: ClassVar[str] = (
|
| 16 |
-
"Faster, lower-VRAM English variant. Supports
|
|
|
|
| 17 |
)
|
| 18 |
languages: ClassVar[list[Lang]] = [Lang(code="en", label="English")]
|
| 19 |
-
paralinguistic_tags: ClassVar[list[str]] = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
supports_voice_clone: ClassVar[bool] = True
|
| 21 |
params: ClassVar[list[ParamSpec]] = [
|
| 22 |
ParamSpec(name="cfg_weight", label="CFG weight", type="float",
|
|
|
|
| 13 |
id: ClassVar[str] = "chatterbox-turbo"
|
| 14 |
label: ClassVar[str] = "Chatterbox Turbo"
|
| 15 |
description: ClassVar[str] = (
|
| 16 |
+
"Faster, lower-VRAM English variant. Supports event tags: "
|
| 17 |
+
"[laugh] [chuckle] [sigh] [gasp] [cough] [sniff] [groan] [clear throat] [shush]."
|
| 18 |
)
|
| 19 |
languages: ClassVar[list[Lang]] = [Lang(code="en", label="English")]
|
| 20 |
+
paralinguistic_tags: ClassVar[list[str]] = [
|
| 21 |
+
"[laugh]",
|
| 22 |
+
"[chuckle]",
|
| 23 |
+
"[sigh]",
|
| 24 |
+
"[gasp]",
|
| 25 |
+
"[cough]",
|
| 26 |
+
"[sniff]",
|
| 27 |
+
"[groan]",
|
| 28 |
+
"[clear throat]",
|
| 29 |
+
"[shush]",
|
| 30 |
+
]
|
| 31 |
supports_voice_clone: ClassVar[bool] = True
|
| 32 |
params: ClassVar[list[ParamSpec]] = [
|
| 33 |
ParamSpec(name="cfg_weight", label="CFG weight", type="float",
|
web/src/components/VoiceComposer.tsx
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import { useRef, useState } from "react";
|
| 2 |
import { Recorder } from "@/lib/audio";
|
| 3 |
import { addVoice } from "@/lib/idb";
|
|
|
|
| 4 |
|
| 5 |
type Props = {
|
| 6 |
onSaved: () => void;
|
|
@@ -16,9 +17,11 @@ export default function VoiceComposer({ onSaved }: Props) {
|
|
| 16 |
const arr = new Uint8Array(await blob.arrayBuffer());
|
| 17 |
const ctx = new AudioContext();
|
| 18 |
const buf = await ctx.decodeAudioData(arr.buffer.slice(0));
|
|
|
|
|
|
|
| 19 |
await addVoice({
|
| 20 |
name: name || defaultName || `voice-${Date.now()}`,
|
| 21 |
-
blob,
|
| 22 |
sampleRate: buf.sampleRate,
|
| 23 |
durationMs: Math.round(buf.duration * 1000),
|
| 24 |
});
|
|
|
|
| 1 |
import { useRef, useState } from "react";
|
| 2 |
import { Recorder } from "@/lib/audio";
|
| 3 |
import { addVoice } from "@/lib/idb";
|
| 4 |
+
import { encodeWav } from "@/lib/wav";
|
| 5 |
|
| 6 |
type Props = {
|
| 7 |
onSaved: () => void;
|
|
|
|
| 17 |
const arr = new Uint8Array(await blob.arrayBuffer());
|
| 18 |
const ctx = new AudioContext();
|
| 19 |
const buf = await ctx.decodeAudioData(arr.buffer.slice(0));
|
| 20 |
+
// Re-encode as 16-bit PCM mono WAV so the server (libsndfile) can decode it.
|
| 21 |
+
const wav = encodeWav(buf);
|
| 22 |
await addVoice({
|
| 23 |
name: name || defaultName || `voice-${Date.now()}`,
|
| 24 |
+
blob: wav,
|
| 25 |
sampleRate: buf.sampleRate,
|
| 26 |
durationMs: Math.round(buf.duration * 1000),
|
| 27 |
});
|
web/src/lib/wav.ts
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
 * Encode a decoded AudioBuffer as a 16-bit PCM mono WAV Blob.
 * Used at upload/record time so the server (libsndfile) can always decode it.
 *
 * All input channels are averaged into a single mono channel, so buffers with
 * more than two channels (e.g. surround uploads) keep the content of every
 * channel instead of only the first two. The buffer's sample rate is kept
 * as-is; no resampling is performed.
 *
 * @param buf - Decoded audio to encode.
 * @returns A Blob of type "audio/wav" holding a 44-byte RIFF/WAVE header
 *   followed by `buf.length` little-endian signed 16-bit samples.
 */
export function encodeWav(buf: AudioBuffer): Blob {
  const sr = buf.sampleRate;
  const length = buf.length;
  const numCh = 1; // output is always mono
  const bytesPerSample = 2; // 16-bit PCM
  const headerSize = 44;

  // Mix down to mono in float32 by averaging every input channel.
  const channels: Float32Array[] = [];
  for (let c = 0; c < buf.numberOfChannels; c++) {
    channels.push(buf.getChannelData(c));
  }
  const mono = new Float32Array(length);
  if (channels.length === 1) {
    mono.set(channels[0]);
  } else {
    for (let i = 0; i < length; i++) {
      let sum = 0;
      for (const ch of channels) sum += ch[i];
      mono[i] = sum / channels.length;
    }
  }

  // Allocate header + sample data and set up sequential little-endian writers.
  const dataSize = length * numCh * bytesPerSample;
  const out = new ArrayBuffer(headerSize + dataSize);
  const view = new DataView(out);
  let p = 0;
  const writeStr = (s: string) => {
    for (let i = 0; i < s.length; i++) view.setUint8(p++, s.charCodeAt(i));
  };
  const writeU32 = (n: number) => {
    view.setUint32(p, n, true);
    p += 4;
  };
  const writeU16 = (n: number) => {
    view.setUint16(p, n, true);
    p += 2;
  };

  // RIFF container header.
  writeStr("RIFF");
  writeU32(36 + dataSize); // bytes remaining after this size field
  writeStr("WAVE");

  // "fmt " chunk describing the sample layout.
  writeStr("fmt ");
  writeU32(16); // size of the fmt chunk body
  writeU16(1); // PCM
  writeU16(numCh);
  writeU32(sr);
  writeU32(sr * numCh * bytesPerSample); // byte rate
  writeU16(numCh * bytesPerSample); // block align
  writeU16(16); // bits per sample

  // "data" chunk header; the samples follow immediately.
  writeStr("data");
  writeU32(dataSize);

  // Convert to int16: clamp to [-1, 1], then scale asymmetrically so that
  // -1 maps to -32768 and +1 maps to 32767.
  for (let i = 0; i < length; i++) {
    const s = Math.max(-1, Math.min(1, mono[i]));
    view.setInt16(
      headerSize + i * bytesPerSample,
      s < 0 ? s * 0x8000 : s * 0x7fff,
      true,
    );
  }

  return new Blob([out], { type: "audio/wav" });
}
|