Spaces:

shreyask
/

KittenTTS-WebGPU

Running

App Files Community

shreyask committed about 1 month ago

Commit

92754e9

verified ·

1 Parent(s): 4c16b13

Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

src/App.tsx +14 -49
src/index.css +14 -0
src/worker.ts +26 -3

src/App.tsx CHANGED Viewed

@@ -40,7 +40,7 @@ export default function App() {
   const [voices, setVoices] = useState<string[]>([]);
   const [status, setStatus] = useState<Status>("idle");
   const [statusMsg, setStatusMsg] = useState("");
-  const [device, setDevice] = useState("");
   const [progress, setProgress] = useState({ current: 0, total: 0 });
   const [audioUrl, setAudioUrl] = useState<string | null>(null);
   const [error, setError] = useState<string | null>(null);
@@ -163,18 +163,13 @@ export default function App() {
     <div className="container">
       <header>
         <h1>
-          <span className="logo">🐱</span> KittenTTS
         </h1>
         <p className="subtitle">
           Text-to-speech running entirely in your browser
         </p>
-        {device && (
-          <span
-            className={`badge ${device === "webgpu" ? "badge-gpu" : "badge-wasm"}`}
-          >
-            {device.toUpperCase()}
-          </span>
-        )}
       </header>
       <main>
@@ -287,54 +282,24 @@ export default function App() {
       </main>
       <footer>
         <p>
           Models by{" "}
-          <a
-            href="https://huggingface.co/KittenML"
-            target="_blank"
-            rel="noopener"
-          >
-            KittenML
-          </a>
           {" · "}
-          Original demo:{" "}
-          <a
-            href="https://huggingface.co/spaces/KittenML/KittenTTS-Demo"
-            target="_blank"
-            rel="noopener"
-          >
-            KittenTTS-Demo
-          </a>
         </p>
         <p>
           Powered by{" "}
-          <a
-            href="https://github.com/huggingface/transformers.js"
-            target="_blank"
-            rel="noopener"
-          >
-            Transformers.js v4
-          </a>
           {" · "}
-          <a
-            href="https://github.com/xenova/phonemizer.js"
-            target="_blank"
-            rel="noopener"
-          >
-            phonemizer.js
-          </a>{" "}
-          by{" "}
-          <a
-            href="https://github.com/xenova"
-            target="_blank"
-            rel="noopener"
-          >
-            Xenova
-          </a>
           {" · "}
-          <a href="https://onnxruntime.ai" target="_blank" rel="noopener">
-            ONNX Runtime Web
-          </a>
         </p>
       </footer>
     </div>

   const [voices, setVoices] = useState<string[]>([]);
   const [status, setStatus] = useState<Status>("idle");
   const [statusMsg, setStatusMsg] = useState("");
+  const [, setDevice] = useState("");
   const [progress, setProgress] = useState({ current: 0, total: 0 });
   const [audioUrl, setAudioUrl] = useState<string | null>(null);
   const [error, setError] = useState<string | null>(null);
     <div className="container">
       <header>
         <h1>
+          <a href="https://huggingface.co/KittenML" target="_blank" rel="noopener" className="title-link">
+            <span className="logo">🐱</span> KittenTTS
+          </a>
         </h1>
         <p className="subtitle">
           Text-to-speech running entirely in your browser
         </p>
       </header>
       <main>
       </main>
       <footer>
+        <p className="footer-note">
+          * Nano runs on WebGPU for faster inference. Micro and Mini use WASM (int8 quantized).
+        </p>
         <p>
           Models by{" "}
+          <a href="https://huggingface.co/KittenML" target="_blank" rel="noopener">KittenML</a>
           {" · "}
+          <a href="https://huggingface.co/spaces/KittenML/KittenTTS-Demo" target="_blank" rel="noopener">Original demo</a>
         </p>
         <p>
           Powered by{" "}
+          <a href="https://github.com/huggingface/transformers.js" target="_blank" rel="noopener">Transformers.js</a>
           {" · "}
+          <a href="https://github.com/xenova/phonemizer.js" target="_blank" rel="noopener">phonemizer.js</a>
+          {" by "}
+          <a href="https://github.com/xenova" target="_blank" rel="noopener">Xenova</a>
           {" · "}
+          <a href="https://onnxruntime.ai" target="_blank" rel="noopener">ONNX Runtime Web</a>
         </p>
       </footer>
     </div>

src/index.css CHANGED Viewed

@@ -56,6 +56,15 @@ header h1 {
   margin-bottom: 0.25rem;
 }
 .logo {
   font-size: 1.5rem;
 }
@@ -358,6 +367,11 @@ footer {
   gap: 0.25rem;
 }
 footer a {
   color: var(--text-muted);
   text-decoration: underline;

   margin-bottom: 0.25rem;
 }
+.title-link {
+  color: var(--text) !important;
+  text-decoration: none !important;
+}
+.title-link:hover {
+  opacity: 0.8;
+}
 .logo {
   font-size: 1.5rem;
 }
   gap: 0.25rem;
 }
+.footer-note {
+  font-style: italic;
+  margin-bottom: 0.5rem;
+}
 footer a {
   color: var(--text-muted);
   text-decoration: underline;

src/worker.ts CHANGED Viewed

@@ -218,12 +218,35 @@ async function generateChunk(
     speed = speed * config.speed_priors[voiceId];
   }
-  // Phonemize text using espeak-ng WASM
-  const phonemesList = await phonemize(text, "en-us");
-  const phonemesRaw = phonemesList[0] || "";
   const phonemeTokens = basicTokenize(phonemesRaw);
   const phonemesJoined = phonemeTokens.join(" ");
   const inputIds = tokenize(phonemesJoined);
   // Select voice style reference based on text length (matches Python logic)
   const refId = Math.min(text.length, voiceData.shape[0] - 1);

     speed = speed * config.speed_priors[voiceId];
   }
+  // Phonemize text preserving punctuation (matching Python's preserve_punctuation=True).
+  // Split on punctuation, phonemize non-punctuation segments, rejoin with punctuation.
+  const PUNCT_RE = /(\s*[;:,.!?¡¿—…"«»""()\[\]{}]+\s*)+/g;
+  const sections: { match: boolean; text: string }[] = [];
+  let lastIdx = 0;
+  for (const m of text.matchAll(PUNCT_RE)) {
+    if (lastIdx < m.index!) {
+      sections.push({ match: false, text: text.slice(lastIdx, m.index!) });
+    }
+    sections.push({ match: true, text: m[0] });
+    lastIdx = m.index! + m[0].length;
+  }
+  if (lastIdx < text.length) {
+    sections.push({ match: false, text: text.slice(lastIdx) });
+  }
+  // Phonemize only non-punctuation sections
+  const phonemeParts = await Promise.all(
+    sections.map(async (s) => {
+      if (s.match) return s.text; // keep punctuation as-is
+      const result = await phonemize(s.text, "en-us");
+      return result.join(" ");
+    })
+  );
+  const phonemesRaw = phonemeParts.join("");
   const phonemeTokens = basicTokenize(phonemesRaw);
   const phonemesJoined = phonemeTokens.join(" ");
   const inputIds = tokenize(phonemesJoined);
+  console.log(`[KittenTTS] Text: "${text}" → Phonemes: "${phonemesJoined}" (${inputIds.length} tokens)`);
   // Select voice style reference based on text length (matches Python logic)
   const refId = Math.min(text.length, voiceData.shape[0] - 1);