shreyask committed on
Commit
92754e9
·
verified ·
1 Parent(s): 4c16b13

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. src/App.tsx +14 -49
  2. src/index.css +14 -0
  3. src/worker.ts +26 -3
src/App.tsx CHANGED
@@ -40,7 +40,7 @@ export default function App() {
40
  const [voices, setVoices] = useState<string[]>([]);
41
  const [status, setStatus] = useState<Status>("idle");
42
  const [statusMsg, setStatusMsg] = useState("");
43
- const [device, setDevice] = useState("");
44
  const [progress, setProgress] = useState({ current: 0, total: 0 });
45
  const [audioUrl, setAudioUrl] = useState<string | null>(null);
46
  const [error, setError] = useState<string | null>(null);
@@ -163,18 +163,13 @@ export default function App() {
163
  <div className="container">
164
  <header>
165
  <h1>
166
- <span className="logo">🐱</span> KittenTTS
 
 
167
  </h1>
168
  <p className="subtitle">
169
  Text-to-speech running entirely in your browser
170
  </p>
171
- {device && (
172
- <span
173
- className={`badge ${device === "webgpu" ? "badge-gpu" : "badge-wasm"}`}
174
- >
175
- {device.toUpperCase()}
176
- </span>
177
- )}
178
  </header>
179
 
180
  <main>
@@ -287,54 +282,24 @@ export default function App() {
287
  </main>
288
 
289
  <footer>
 
 
 
290
  <p>
291
  Models by{" "}
292
- <a
293
- href="https://huggingface.co/KittenML"
294
- target="_blank"
295
- rel="noopener"
296
- >
297
- KittenML
298
- </a>
299
  {" · "}
300
- Original demo:{" "}
301
- <a
302
- href="https://huggingface.co/spaces/KittenML/KittenTTS-Demo"
303
- target="_blank"
304
- rel="noopener"
305
- >
306
- KittenTTS-Demo
307
- </a>
308
  </p>
309
  <p>
310
  Powered by{" "}
311
- <a
312
- href="https://github.com/huggingface/transformers.js"
313
- target="_blank"
314
- rel="noopener"
315
- >
316
- Transformers.js v4
317
- </a>
318
  {" · "}
319
- <a
320
- href="https://github.com/xenova/phonemizer.js"
321
- target="_blank"
322
- rel="noopener"
323
- >
324
- phonemizer.js
325
- </a>{" "}
326
- by{" "}
327
- <a
328
- href="https://github.com/xenova"
329
- target="_blank"
330
- rel="noopener"
331
- >
332
- Xenova
333
- </a>
334
  {" · "}
335
- <a href="https://onnxruntime.ai" target="_blank" rel="noopener">
336
- ONNX Runtime Web
337
- </a>
338
  </p>
339
  </footer>
340
  </div>
 
40
  const [voices, setVoices] = useState<string[]>([]);
41
  const [status, setStatus] = useState<Status>("idle");
42
  const [statusMsg, setStatusMsg] = useState("");
43
+ const [, setDevice] = useState("");
44
  const [progress, setProgress] = useState({ current: 0, total: 0 });
45
  const [audioUrl, setAudioUrl] = useState<string | null>(null);
46
  const [error, setError] = useState<string | null>(null);
 
163
  <div className="container">
164
  <header>
165
  <h1>
166
+ <a href="https://huggingface.co/KittenML" target="_blank" rel="noopener" className="title-link">
167
+ <span className="logo">🐱</span> KittenTTS
168
+ </a>
169
  </h1>
170
  <p className="subtitle">
171
  Text-to-speech running entirely in your browser
172
  </p>
 
 
 
 
 
 
 
173
  </header>
174
 
175
  <main>
 
282
  </main>
283
 
284
  <footer>
285
+ <p className="footer-note">
286
+ * Nano runs on WebGPU for faster inference. Micro and Mini use WASM (int8 quantized).
287
+ </p>
288
  <p>
289
  Models by{" "}
290
+ <a href="https://huggingface.co/KittenML" target="_blank" rel="noopener">KittenML</a>
 
 
 
 
 
 
291
  {" · "}
292
+ <a href="https://huggingface.co/spaces/KittenML/KittenTTS-Demo" target="_blank" rel="noopener">Original demo</a>
 
 
 
 
 
 
 
293
  </p>
294
  <p>
295
  Powered by{" "}
296
+ <a href="https://github.com/huggingface/transformers.js" target="_blank" rel="noopener">Transformers.js</a>
 
 
 
 
 
 
297
  {" · "}
298
+ <a href="https://github.com/xenova/phonemizer.js" target="_blank" rel="noopener">phonemizer.js</a>
299
+ {" by "}
300
+ <a href="https://github.com/xenova" target="_blank" rel="noopener">Xenova</a>
 
 
 
 
 
 
 
 
 
 
 
 
301
  {" · "}
302
+ <a href="https://onnxruntime.ai" target="_blank" rel="noopener">ONNX Runtime Web</a>
 
 
303
  </p>
304
  </footer>
305
  </div>
src/index.css CHANGED
@@ -56,6 +56,15 @@ header h1 {
56
  margin-bottom: 0.25rem;
57
  }
58
 
 
 
 
 
 
 
 
 
 
59
  .logo {
60
  font-size: 1.5rem;
61
  }
@@ -358,6 +367,11 @@ footer {
358
  gap: 0.25rem;
359
  }
360
 
 
 
 
 
 
361
  footer a {
362
  color: var(--text-muted);
363
  text-decoration: underline;
 
56
  margin-bottom: 0.25rem;
57
  }
58
 
59
+ .title-link {
60
+ color: var(--text) !important;
61
+ text-decoration: none !important;
62
+ }
63
+
64
+ .title-link:hover {
65
+ opacity: 0.8;
66
+ }
67
+
68
  .logo {
69
  font-size: 1.5rem;
70
  }
 
367
  gap: 0.25rem;
368
  }
369
 
370
+ .footer-note {
371
+ font-style: italic;
372
+ margin-bottom: 0.5rem;
373
+ }
374
+
375
  footer a {
376
  color: var(--text-muted);
377
  text-decoration: underline;
src/worker.ts CHANGED
@@ -218,12 +218,35 @@ async function generateChunk(
218
  speed = speed * config.speed_priors[voiceId];
219
  }
220
 
221
- // Phonemize text using espeak-ng WASM
222
- const phonemesList = await phonemize(text, "en-us");
223
- const phonemesRaw = phonemesList[0] || "";
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
  const phonemeTokens = basicTokenize(phonemesRaw);
225
  const phonemesJoined = phonemeTokens.join(" ");
226
  const inputIds = tokenize(phonemesJoined);
 
227
 
228
  // Select voice style reference based on text length (matches Python logic)
229
  const refId = Math.min(text.length, voiceData.shape[0] - 1);
 
218
  speed = speed * config.speed_priors[voiceId];
219
  }
220
 
221
+ // Phonemize text preserving punctuation (matching Python's preserve_punctuation=True).
222
+ // Split on punctuation, phonemize non-punctuation segments, rejoin with punctuation.
223
+ const PUNCT_RE = /(\s*[;:,.!?¡¿—…"«»""()\[\]{}]+\s*)+/g;
224
+ const sections: { match: boolean; text: string }[] = [];
225
+ let lastIdx = 0;
226
+ for (const m of text.matchAll(PUNCT_RE)) {
227
+ if (lastIdx < m.index!) {
228
+ sections.push({ match: false, text: text.slice(lastIdx, m.index!) });
229
+ }
230
+ sections.push({ match: true, text: m[0] });
231
+ lastIdx = m.index! + m[0].length;
232
+ }
233
+ if (lastIdx < text.length) {
234
+ sections.push({ match: false, text: text.slice(lastIdx) });
235
+ }
236
+
237
+ // Phonemize only non-punctuation sections
238
+ const phonemeParts = await Promise.all(
239
+ sections.map(async (s) => {
240
+ if (s.match) return s.text; // keep punctuation as-is
241
+ const result = await phonemize(s.text, "en-us");
242
+ return result.join(" ");
243
+ })
244
+ );
245
+ const phonemesRaw = phonemeParts.join("");
246
  const phonemeTokens = basicTokenize(phonemesRaw);
247
  const phonemesJoined = phonemeTokens.join(" ");
248
  const inputIds = tokenize(phonemesJoined);
249
+ console.log(`[KittenTTS] Text: "${text}" → Phonemes: "${phonemesJoined}" (${inputIds.length} tokens)`);
250
 
251
  // Select voice style reference based on text length (matches Python logic)
252
  const refId = Math.min(text.length, voiceData.shape[0] - 1);