Spaces:

shreyask
/

KittenTTS-WebGPU

Running

App Files Files Community

shreyask committed on about 1 month ago

Commit

5c97b55

verified ·

1 Parent(s): 6525f03

Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

src/App.tsx +21 -15
src/WaveformPlayer.tsx +173 -0
src/index.css +52 -8
src/worker.ts +22 -11

src/App.tsx CHANGED Viewed

@@ -1,26 +1,32 @@
 import { useState, useRef, useCallback, useEffect } from "react";
 const MODELS: Record<string, string> = {
-  "Nano Int8 (15M · Fastest)": "KittenML/kitten-tts-nano-0.8-int8",
-  "Nano FP32 (15M)": "KittenML/kitten-tts-nano-0.8-fp32",
-  "Micro (40M · Balanced)": "KittenML/kitten-tts-micro-0.8",
-  "Mini (80M · Best Quality)": "KittenML/kitten-tts-mini-0.8",
 };
-const DEFAULT_MODEL = "Nano FP32 (15M)";
 const EXAMPLES = [
   {
     text: "Space is a three-dimensional continuum containing positions and directions.",
     voice: "Jasper",
   },
   {
     text: "She picked up her coffee and walked toward the window.",
     voice: "Luna",
   },
   {
-    text: "The sun set slowly over the calm, quiet lake.",
     voice: "Bella",
   },
 ];
@@ -140,6 +146,10 @@ export default function App() {
   const handleExample = (ex: (typeof EXAMPLES)[0]) => {
     setText(ex.text);
     setVoice(ex.voice);
   };
   const handleKeyDown = (e: React.KeyboardEvent<HTMLTextAreaElement>) => {
@@ -243,14 +253,7 @@ export default function App() {
         <div className="output-section">
           <label>Output</label>
           {audioUrl ? (
-            <div className="audio-result">
-              <audio controls src={audioUrl} className="audio-player" />
-              {duration !== null && (
-                <span className="duration">
-                  Generated in {(duration / 1000).toFixed(1)}s
-                </span>
-              )}
-            </div>
           ) : (
             <div className="audio-placeholder">
               {status === "loading" || status === "generating"
@@ -268,10 +271,13 @@ export default function App() {
                 key={i}
                 className="example-btn"
                 onClick={() => handleExample(ex)}
-                disabled={status !== "ready"}
               >
                 <span className="example-voice">{ex.voice}</span>
                 <span className="example-text">{ex.text}</span>
               </button>
             ))}
           </div>

 import { useState, useRef, useCallback, useEffect } from "react";
+import WaveformPlayer from "./WaveformPlayer";
 const MODELS: Record<string, string> = {
+  "Nano (15M - Fastest)": "onnx-community/KittenTTS-Nano-v0.8-ONNX",
+  "Micro (40M - Balanced)": "onnx-community/KittenTTS-Micro-v0.8-ONNX",
+  "Mini (80M - Best Quality)": "onnx-community/KittenTTS-Mini-v0.8-ONNX",
 };
+const DEFAULT_MODEL = "Micro (40M - Balanced)";
 const EXAMPLES = [
   {
     text: "Space is a three-dimensional continuum containing positions and directions.",
+    model: "Micro (40M - Balanced)",
     voice: "Jasper",
+    speed: 1.0,
   },
   {
     text: "She picked up her coffee and walked toward the window.",
+    model: "Mini (80M - Best Quality)",
     voice: "Luna",
+    speed: 1.0,
   },
   {
+    text: "The sun set slowly over the calm, quiet lake",
+    model: "Nano (15M - Fastest)",
     voice: "Bella",
+    speed: 1.1,
   },
 ];
   const handleExample = (ex: (typeof EXAMPLES)[0]) => {
     setText(ex.text);
     setVoice(ex.voice);
+    setSpeed(ex.speed);
+    if (ex.model !== model) {
+      handleModelChange(ex.model);
+    }
   };
   const handleKeyDown = (e: React.KeyboardEvent<HTMLTextAreaElement>) => {
         <div className="output-section">
           <label>Output</label>
           {audioUrl ? (
+            <WaveformPlayer audioUrl={audioUrl} duration={duration} />
           ) : (
             <div className="audio-placeholder">
               {status === "loading" || status === "generating"
                 key={i}
                 className="example-btn"
                 onClick={() => handleExample(ex)}
+                disabled={status === "loading" || status === "generating"}
               >
                 <span className="example-voice">{ex.voice}</span>
                 <span className="example-text">{ex.text}</span>
+                <span className="example-meta">
+                  {ex.model.split(" (")[0]}{ex.speed !== 1.0 ? ` · ${ex.speed}x` : ""}
+                </span>
               </button>
             ))}
           </div>

src/WaveformPlayer.tsx ADDED Viewed

	@@ -0,0 +1,173 @@

+import { useRef, useEffect, useState, useCallback } from "react";
+/** Props accepted by the WaveformPlayer component. */
+interface WaveformPlayerProps {
+  /** URL of the clip; it is fetched for waveform decoding and used as the <audio> source. */
+  audioUrl: string;
+  /** Optional time in milliseconds; rendered as seconds in the badge after the time readout. */
+  duration?: number | null;
+}
+/** Upper bound on the number of bars drawn in the waveform. */
+const MAX_WAVEFORM_BARS = 100;
+/**
+ * Reduces raw samples to at most `maxBars` mean-absolute-amplitude values,
+ * normalized so the loudest bar is 1 (with a 0.01 floor to avoid dividing by ~0).
+ * Returns an empty array for empty input; clips shorter than `maxBars` samples
+ * get one bar per sample instead of dividing by a zero block size.
+ */
+function computePeaks(samples: Float32Array, maxBars: number): number[] {
+  const bars = Math.min(maxBars, samples.length);
+  if (bars <= 0) return [];
+  const blockSize = Math.floor(samples.length / bars);
+  const peaks: number[] = [];
+  for (let i = 0; i < bars; i++) {
+    const start = i * blockSize;
+    let sum = 0;
+    for (let j = 0; j < blockSize; j++) {
+      sum += Math.abs(samples[start + j] ?? 0);
+    }
+    peaks.push(sum / blockSize);
+  }
+  const max = Math.max(...peaks, 0.01);
+  return peaks.map((p) => p / max);
+}
+/**
+ * Audio player that draws a clickable bar waveform on a canvas.
+ *
+ * The clip at `audioUrl` is decoded once per URL to compute the bars; a
+ * requestAnimationFrame loop repaints the canvas and mirrors the element's
+ * playback position into React state for the time readout.
+ */
+export default function WaveformPlayer({ audioUrl, duration }: WaveformPlayerProps) {
+  const canvasRef = useRef<HTMLCanvasElement>(null);
+  const audioRef = useRef<HTMLAudioElement>(null);
+  const animRef = useRef<number>(0);
+  const waveformRef = useRef<number[]>([]);
+  // Hover only affects canvas painting, so it lives in a ref: mouse moves then
+  // neither re-render the component nor restart the animation loop.
+  const hoverRef = useRef<{ active: boolean; x: number }>({ active: false, x: 0 });
+  const [playing, setPlaying] = useState(false);
+  const [currentTime, setCurrTime] = useState(0);
+  const [totalDuration, setTotalDuration] = useState(0);
+  // Decode audio and compute waveform peaks
+  useEffect(() => {
+    // A new source stops the old playback, so drop the stale bars and UI state.
+    waveformRef.current = [];
+    setPlaying(false);
+    setCurrTime(0);
+    if (!audioUrl) return undefined;
+    let cancelled = false;
+    const ctx = new AudioContext();
+    fetch(audioUrl)
+      .then((r) => r.arrayBuffer())
+      .then((buf) => ctx.decodeAudioData(buf))
+      .then((decoded) => {
+        // Ignore results that arrive after unmount or after the URL changed.
+        if (cancelled) return;
+        waveformRef.current = computePeaks(decoded.getChannelData(0), MAX_WAVEFORM_BARS);
+      })
+      .catch((err: unknown) => {
+        // The waveform is decorative; playback still works, so log and carry on.
+        if (!cancelled) console.warn("[WaveformPlayer] Could not decode audio for waveform", err);
+      })
+      .finally(() => {
+        // Always release the context, including on fetch/decode failure.
+        void ctx.close().catch(() => {});
+      });
+    return () => {
+      cancelled = true;
+    };
+  }, [audioUrl]);
+  /** Paints the bars, colouring the played portion and the hovered portion. */
+  const drawWaveform = useCallback(() => {
+    const canvas = canvasRef.current;
+    if (!canvas) return;
+    const ctx = canvas.getContext("2d");
+    if (!ctx) return;
+    const dpr = window.devicePixelRatio || 1;
+    const rect = canvas.getBoundingClientRect();
+    const w = rect.width;
+    const h = rect.height;
+    const pixelW = Math.floor(w * dpr);
+    const pixelH = Math.floor(h * dpr);
+    // Assigning width/height reallocates the backing store, so only do it on resize.
+    if (canvas.width !== pixelW || canvas.height !== pixelH) {
+      canvas.width = pixelW;
+      canvas.height = pixelH;
+    }
+    // setTransform replaces the matrix, so the DPR factor cannot compound across frames.
+    ctx.setTransform(dpr, 0, 0, dpr, 0, 0);
+    ctx.clearRect(0, 0, w, h);
+    const peaks = waveformRef.current;
+    const bars = peaks.length || 1;
+    const audio = audioRef.current;
+    const progress =
+      audio && Number.isFinite(audio.duration) && audio.duration > 0
+        ? audio.currentTime / audio.duration
+        : 0;
+    const { active: hovering, x: hoverX } = hoverRef.current;
+    const barWidth = (w / bars) * 0.7;
+    const gap = (w / bars) * 0.3;
+    const mid = h / 2;
+    for (let i = 0; i < bars; i++) {
+      const x = (i / bars) * w;
+      const barH = Math.max(2, (peaks[i] || 0) * mid * 0.9);
+      const isPlayed = i / bars < progress;
+      ctx.fillStyle = isPlayed ? "#c084fc" : hovering && x < hoverX ? "rgba(192,132,252,0.4)" : "#444";
+      ctx.beginPath();
+      if (typeof ctx.roundRect === "function") {
+        ctx.roundRect(x + gap / 2, mid - barH, barWidth, barH * 2, 1.5);
+      } else {
+        // Fallback for browsers without roundRect: square corners.
+        ctx.rect(x + gap / 2, mid - barH, barWidth, barH * 2);
+      }
+      ctx.fill();
+    }
+  }, []);
+  // Animation loop
+  useEffect(() => {
+    const tick = () => {
+      const audio = audioRef.current;
+      if (audio) setCurrTime(audio.currentTime);
+      drawWaveform();
+      animRef.current = requestAnimationFrame(tick);
+    };
+    animRef.current = requestAnimationFrame(tick);
+    return () => cancelAnimationFrame(animRef.current);
+  }, [drawWaveform]);
+  /** Toggles playback; `playing` itself is updated by the element's play/pause events. */
+  const togglePlay = () => {
+    const audio = audioRef.current;
+    if (!audio) return;
+    if (audio.paused) {
+      // play() rejects when playback cannot start (e.g. blocked or unsupported source).
+      audio.play().catch((err: unknown) => {
+        console.warn("[WaveformPlayer] Playback failed", err);
+        setPlaying(false);
+      });
+    } else {
+      audio.pause();
+    }
+  };
+  /** Seeks to the clicked horizontal position of the canvas. */
+  const seek = (e: React.MouseEvent<HTMLCanvasElement>) => {
+    const audio = audioRef.current;
+    const canvas = canvasRef.current;
+    if (!audio || !canvas) return;
+    // duration is NaN before metadata loads and may be Infinity for unbounded sources.
+    if (!Number.isFinite(audio.duration) || audio.duration <= 0) return;
+    const rect = canvas.getBoundingClientRect();
+    if (rect.width <= 0) return;
+    const ratio = Math.min(1, Math.max(0, (e.clientX - rect.left) / rect.width));
+    audio.currentTime = ratio * audio.duration;
+  };
+  /** Records the pointer's x offset inside the canvas for the hover highlight. */
+  const handleMouseMove = (e: React.MouseEvent<HTMLCanvasElement>) => {
+    const canvas = canvasRef.current;
+    if (!canvas) return;
+    const rect = canvas.getBoundingClientRect();
+    hoverRef.current.x = e.clientX - rect.left;
+  };
+  /** Formats seconds as m:ss; non-finite or negative input renders as 0:00. */
+  const fmt = (s: number) => {
+    if (!Number.isFinite(s) || s < 0) return "0:00";
+    const m = Math.floor(s / 60);
+    const sec = Math.floor(s % 60);
+    return `${m}:${sec.toString().padStart(2, "0")}`;
+  };
+  return (
+    <div className="waveform-player">
+      <audio
+        ref={audioRef}
+        src={audioUrl}
+        onLoadedMetadata={() => setTotalDuration(audioRef.current?.duration || 0)}
+        onPlay={() => setPlaying(true)}
+        onPause={() => setPlaying(false)}
+        onEnded={() => setPlaying(false)}
+      />
+      <button
+        type="button"
+        className="waveform-play"
+        onClick={togglePlay}
+        aria-label={playing ? "Pause" : "Play"}
+      >
+        {playing ? (
+          <svg width="16" height="16" viewBox="0 0 16 16" fill="currentColor">
+            <rect x="3" y="2" width="4" height="12" rx="1" />
+            <rect x="9" y="2" width="4" height="12" rx="1" />
+          </svg>
+        ) : (
+          <svg width="16" height="16" viewBox="0 0 16 16" fill="currentColor">
+            <path d="M4 2.5v11l9-5.5z" />
+          </svg>
+        )}
+      </button>
+      <canvas
+        ref={canvasRef}
+        className="waveform-canvas"
+        onClick={seek}
+        onMouseEnter={() => {
+          hoverRef.current.active = true;
+        }}
+        onMouseLeave={() => {
+          hoverRef.current.active = false;
+        }}
+        onMouseMove={handleMouseMove}
+      />
+      <span className="waveform-time">
+        {fmt(currentTime)} / {fmt(totalDuration)}
+      </span>
+      {duration != null && (
+        <span className="waveform-gen-time">
+          {(duration / 1000).toFixed(1)}s
+        </span>
+      )}
+    </div>
+  );
+}

src/index.css CHANGED Viewed

@@ -216,21 +216,57 @@ input[type="range"] {
   padding: 1rem;
 }
-.audio-result {
   display: flex;
-  flex-direction: column;
-  gap: 0.5rem;
 }
-.audio-player {
-  width: 100%;
-  border-radius: var(--radius);
 }
-.duration {
-  font-size: 0.75rem;
   color: var(--text-muted);
   font-family: var(--mono);
 }
 .audio-placeholder {
@@ -287,6 +323,14 @@ input[type="range"] {
 .example-text {
   color: var(--text-muted);
 }
 /* Error */

   padding: 1rem;
 }
+/* Waveform player */
+.waveform-player {
   display: flex;
+  align-items: center;
+  gap: 0.75rem;
 }
+.waveform-play {
+  flex-shrink: 0;
+  width: 36px;
+  height: 36px;
+  border-radius: 50%;
+  border: none;
+  background: var(--accent);
+  color: #111;
+  cursor: pointer;
+  display: flex;
+  align-items: center;
+  justify-content: center;
+  transition: opacity 0.15s;
 }
+.waveform-play:hover {
+  opacity: 0.85;
+}
+.waveform-canvas {
+  flex: 1;
+  height: 48px;
+  cursor: pointer;
+  border-radius: 4px;
+}
+.waveform-time {
+  flex-shrink: 0;
+  font-family: var(--mono);
+  font-size: 0.7rem;
   color: var(--text-muted);
+  min-width: 5.5em;
+  text-align: right;
+}
+.waveform-gen-time {
+  flex-shrink: 0;
   font-family: var(--mono);
+  font-size: 0.65rem;
+  color: #555;
+  padding: 0.15rem 0.4rem;
+  background: var(--surface-2);
+  border-radius: 4px;
 }
 .audio-placeholder {
 .example-text {
   color: var(--text-muted);
+  flex: 1;
+}
+.example-meta {
+  flex-shrink: 0;
+  font-family: var(--mono);
+  font-size: 0.7rem;
+  color: #555;
 }
 /* Error */

src/worker.ts CHANGED Viewed

@@ -16,8 +16,8 @@ let ort: any;
 const HF_BASE = "https://huggingface.co";
 const SAMPLE_RATE = 24000;
-// Int8 quantized models produce NaN on WebGPU; all fp32 models should be fine
-const WEBGPU_BLOCKED_PATTERNS = ["int8"];
 interface ModelConfig {
   name: string;
@@ -63,19 +63,22 @@ async function loadModel(repoId: string) {
   ort = ortModule;
   phonemize = phonemizerModule.phonemize;
-  // Load config
   self.postMessage({ type: "status", message: "Loading config..." });
-  const configUrl = resolveUrl(repoId, "config.json");
-  const configResp = await fetch(configUrl);
   config = (await configResp.json()) as ModelConfig;
-  // Int8 quantized models produce NaN on WebGPU — only block those
   const modelName = config.model || repoId.split("/").pop() || "";
-  const isBlocked = WEBGPU_BLOCKED_PATTERNS.some((p) => modelName.includes(p));
-  currentDevice = hasWebGPU && !isBlocked ? "webgpu" : "wasm";
-  if (hasWebGPU && isBlocked) {
-    console.log(`[KittenTTS] Using WASM for "${modelName}" (int8 models produce NaN on WebGPU)`);
   }
   self.postMessage({ type: "device", device: currentDevice });
@@ -83,7 +86,9 @@ async function loadModel(repoId: string) {
   // Load voices (.npz) and ONNX model in parallel
   self.postMessage({ type: "status", message: "Downloading model & voices..." });
-  const modelUrl = resolveUrl(repoId, config.model_file);
   const modelPromise = (async () => {
     const resp = await fetch(modelUrl);
@@ -244,6 +249,12 @@ async function generateChunk(
   const outputKey = session.outputNames[0];
   const audioData = results[outputKey].data as Float32Array;
   // Trim trailing silence (matching Python: audio[..., :-5000])
   return audioData.slice(0, Math.max(0, audioData.length - 5000));
 }

 const HF_BASE = "https://huggingface.co";
 const SAMPLE_RATE = 24000;
+// Only nano (fp32) confirmed working on WebGPU; micro/mini are int8 quantized
+const WEBGPU_SAFE_MODELS = ["Nano", "nano", "fp32"];
 interface ModelConfig {
   name: string;
   ort = ortModule;
   phonemize = phonemizerModule.phonemize;
+  // Load config (onnx-community repos use kitten_config.json for the TTS config)
   self.postMessage({ type: "status", message: "Loading config..." });
+  let configResp = await fetch(resolveUrl(repoId, "kitten_config.json"));
+  if (!configResp.ok) {
+    // Fallback to config.json for original KittenML repos
+    configResp = await fetch(resolveUrl(repoId, "config.json"));
+  }
   config = (await configResp.json()) as ModelConfig;
+  // Only use WebGPU for models confirmed to work (nano-fp32)
   const modelName = config.model || repoId.split("/").pop() || "";
+  const isSafe = WEBGPU_SAFE_MODELS.some((m) => modelName.includes(m));
+  currentDevice = hasWebGPU && isSafe ? "webgpu" : "wasm";
+  if (hasWebGPU && !isSafe) {
+    console.log(`[KittenTTS] Using WASM for "${modelName}" (WebGPU only confirmed for nano-fp32)`);
   }
   self.postMessage({ type: "device", device: currentDevice });
   // Load voices (.npz) and ONNX model in parallel
   self.postMessage({ type: "status", message: "Downloading model & voices..." });
+  // onnx-community repos have model at onnx/model.onnx, original repos use config.model_file
+  const modelFile = config.model_file || "onnx/model.onnx";
+  const modelUrl = resolveUrl(repoId, modelFile);
   const modelPromise = (async () => {
     const resp = await fetch(modelUrl);
   const outputKey = session.outputNames[0];
   const audioData = results[outputKey].data as Float32Array;
+  // Check for NaN — if detected, the model doesn't work on this backend
+  const hasNaN = audioData.length > 0 && isNaN(audioData[0]);
+  if (hasNaN) {
+    console.warn(`[KittenTTS] Model produced NaN audio — this model may not be compatible with ${currentDevice.toUpperCase()}`);
+  }
   // Trim trailing silence (matching Python: audio[..., :-5000])
   return audioData.slice(0, Math.max(0, audioData.length - 5000));
 }