Spaces:

techfreakworm
/

chatterbox-voice-studio

Running

App Files Files Community

techfreakworm committed on 23 days ago

Commit

ded73f2

unverified ·

1 Parent(s): 108b9f8

fix: re-encode reference voices as PCM WAV in browser; expand turbo tag list to 9

Browse files

Files changed (3) hide show

server/models/chatterbox_turbo.py +13 -2
web/src/components/VoiceComposer.tsx +4 -1
web/src/lib/wav.ts +57 -0

server/models/chatterbox_turbo.py CHANGED Viewed

@@ -13,10 +13,21 @@ class Adapter:
     id: ClassVar[str] = "chatterbox-turbo"
     label: ClassVar[str] = "Chatterbox Turbo"
     description: ClassVar[str] = (
-        "Faster, lower-VRAM English variant. Supports [laugh], [cough], [chuckle] tags."
     )
     languages: ClassVar[list[Lang]] = [Lang(code="en", label="English")]
-    paralinguistic_tags: ClassVar[list[str]] = ["[laugh]", "[cough]", "[chuckle]"]
     supports_voice_clone: ClassVar[bool] = True
     params: ClassVar[list[ParamSpec]] = [
         ParamSpec(name="cfg_weight", label="CFG weight", type="float",

     id: ClassVar[str] = "chatterbox-turbo"
     label: ClassVar[str] = "Chatterbox Turbo"
     description: ClassVar[str] = (
+        "Faster, lower-VRAM English variant. Supports event tags: "
+        "[laugh] [chuckle] [sigh] [gasp] [cough] [sniff] [groan] [clear throat] [shush]."
     )
     languages: ClassVar[list[Lang]] = [Lang(code="en", label="English")]
+    paralinguistic_tags: ClassVar[list[str]] = [
+        "[laugh]",
+        "[chuckle]",
+        "[sigh]",
+        "[gasp]",
+        "[cough]",
+        "[sniff]",
+        "[groan]",
+        "[clear throat]",
+        "[shush]",
+    ]
     supports_voice_clone: ClassVar[bool] = True
     params: ClassVar[list[ParamSpec]] = [
         ParamSpec(name="cfg_weight", label="CFG weight", type="float",

web/src/components/VoiceComposer.tsx CHANGED Viewed

@@ -1,6 +1,7 @@
 import { useRef, useState } from "react";
 import { Recorder } from "@/lib/audio";
 import { addVoice } from "@/lib/idb";
 type Props = {
   onSaved: () => void;
@@ -16,9 +17,11 @@ export default function VoiceComposer({ onSaved }: Props) {
     const arr = new Uint8Array(await blob.arrayBuffer());
     const ctx = new AudioContext();
     const buf = await ctx.decodeAudioData(arr.buffer.slice(0));
     await addVoice({
       name: name || defaultName || `voice-${Date.now()}`,
-      blob,
       sampleRate: buf.sampleRate,
       durationMs: Math.round(buf.duration * 1000),
     });

 import { useRef, useState } from "react";
 import { Recorder } from "@/lib/audio";
 import { addVoice } from "@/lib/idb";
+import { encodeWav } from "@/lib/wav";
 type Props = {
   onSaved: () => void;
     const arr = new Uint8Array(await blob.arrayBuffer());
     const ctx = new AudioContext();
     const buf = await ctx.decodeAudioData(arr.buffer.slice(0));
+    // Re-encode as 16-bit PCM mono WAV so the server (libsndfile) can decode it.
+    const wav = encodeWav(buf);
     await addVoice({
       name: name || defaultName || `voice-${Date.now()}`,
+      blob: wav,
       sampleRate: buf.sampleRate,
       durationMs: Math.round(buf.duration * 1000),
     });

web/src/lib/wav.ts ADDED Viewed

	@@ -0,0 +1,57 @@

/**
 * Encode a decoded AudioBuffer as a 16-bit PCM mono WAV Blob.
 *
 * Used at upload/record time so the server (libsndfile) can always decode it.
 * Every input channel contributes equally to the mono mix-down, so buffers
 * with more than two channels are no longer reduced to just their first pair.
 * The buffer's own sample rate is written to the header; no resampling is done.
 *
 * @param buf - Decoded audio, e.g. the result of `decodeAudioData`.
 * @returns A Blob of type `audio/wav`: a 44-byte RIFF/WAVE header followed by
 *   `buf.length` little-endian signed 16-bit samples.
 */
export function encodeWav(buf: AudioBuffer): Blob {
  const headerBytes = 44; // RIFF header + "fmt " chunk + "data" chunk header
  const bytesPerSample = 2; // 16-bit PCM
  const numCh = 1; // output is always mono
  const sr = buf.sampleRate;
  const length = buf.length;

  // Mix down to mono in float32 by averaging all input channels.
  const channels: Float32Array[] = [];
  for (let c = 0; c < buf.numberOfChannels; c++) {
    channels.push(buf.getChannelData(c));
  }
  const mono = new Float32Array(length);
  if (channels.length === 1) {
    mono.set(channels[0]);
  } else {
    for (let i = 0; i < length; i++) {
      let sum = 0;
      for (const ch of channels) sum += ch[i];
      mono[i] = sum / channels.length;
    }
  }

  const dataSize = length * numCh * bytesPerSample;
  const out = new ArrayBuffer(headerBytes + dataSize);
  const view = new DataView(out);

  // Sequential little-endian writers; `p` is the running header offset.
  let p = 0;
  const writeStr = (s: string) => {
    for (let i = 0; i < s.length; i++) view.setUint8(p++, s.charCodeAt(i));
  };
  const writeU32 = (n: number) => {
    view.setUint32(p, n, true);
    p += 4;
  };
  const writeU16 = (n: number) => {
    view.setUint16(p, n, true);
    p += 2;
  };

  // RIFF container header.
  writeStr("RIFF");
  writeU32(36 + dataSize); // bytes remaining after this size field
  writeStr("WAVE");
  // Format chunk.
  writeStr("fmt ");
  writeU32(16); // size of the fmt chunk body
  writeU16(1); // PCM
  writeU16(numCh);
  writeU32(sr);
  writeU32(sr * numCh * bytesPerSample); // byte rate
  writeU16(numCh * bytesPerSample); // block align (bytes per frame)
  writeU16(16); // bits per sample
  // Data chunk.
  writeStr("data");
  writeU32(dataSize);

  // Convert to int16: clamp to [-1, 1], then scale asymmetrically so that
  // -1 maps to -32768 and +1 maps to 32767.
  for (let i = 0; i < length; i++) {
    const s = Math.max(-1, Math.min(1, mono[i]));
    view.setInt16(
      headerBytes + i * bytesPerSample,
      s < 0 ? s * 0x8000 : s * 0x7fff,
      true,
    );
  }
  return new Blob([out], { type: "audio/wav" });
}