Spaces:

shreyask
/

svara-tts-webgpu

Running

App Files Files Community

shreyask committed 19 days ago

Commit

c69c1ee

verified ·

1 Parent(s): e546297

Upload Svāra TTS WebGPU app

Browse files

Files changed (19) hide show

.gitignore +28 -0
README.md +57 -5
eslint.config.js +38 -0
index.html +17 -17
package-lock.json +0 -0
package.json +35 -0
postcss.config.js +6 -0
public/hf-logo.svg +8 -0
public/warli-strip.svg +1117 -0
public/wave.svg +9 -0
src/App.jsx +569 -0
src/assets/react.svg +1 -0
src/index.css +756 -0
src/main.jsx +10 -0
src/worker.js +626 -0
style.css +0 -28
tailwind.config.js +8 -0
tools/run_svara_onnx_local.py +248 -0
vite.config.js +25 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,28 @@

+# Logs
+logs
+*.log
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+pnpm-debug.log*
+lerna-debug.log*
+node_modules
+dist
+dist-ssr
+*.local
+.hf-models
+.venv-onnx
+.venv-onnx314
+validate-output*.wav
+# Editor directories and files
+.vscode/*
+!.vscode/extensions.json
+.idea
+.DS_Store
+*.suo
+*.ntvs*
+*.njsproj
+*.sln
+*.sw?

README.md CHANGED Viewed

@@ -1,10 +1,62 @@
 ---
-title: Svara Tts Webgpu
-emoji: 🏃
-colorFrom: gray
-colorTo: purple
 sdk: static
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Svāra TTS WebGPU
+emoji: 🗣️
+colorFrom: yellow
+colorTo: red
 sdk: static
+app_build_command: npm run build
+app_file: dist/index.html
 pinned: false
+license: apache-2.0
+short_description: Multilingual Indic TTS in your browser, via WebGPU
 ---
+# Svāra TTS · WebGPU
+Browser-native multilingual TTS for **19 Indian languages** powered by [Svara](https://huggingface.co/kenpath/svara-tts-v1), [SNAC](https://huggingface.co/hubertsiuzdak/snac_24khz), and [Transformers.js v4](https://huggingface.co/docs/transformers.js). Runs 100% locally in the browser after the one-time model download.
+This build adds an explicit model load step, browser-side caching, multilingual voice switching, prompt presets, and a WebGPU worker tuned around the ONNX-exported Svāra model.
+## Architecture
+```
+text → tokenizer → Llama-3.2-3B (q4f16, transformers.js v4 + WebGPU) →
+  audio token IDs in [128266, 156938) →
+  group every 7 → SNAC frame (3 hierarchical levels) →
+  SNAC decoder ONNX (q4f16/fp16 from onnx-community/snac_24khz-ONNX) →
+  24 kHz mono PCM → WAV blob → <audio>
+```
+## Models
+| Repo | Size | Notes |
+|------|------|-------|
+| [`shreyask/svara-tts-v1-ONNX`](https://huggingface.co/shreyask/svara-tts-v1-ONNX) | ~1.95 GB | Llama-3.2-3B q4f16, GQA, KV-cache |
+| [`onnx-community/snac_24khz-ONNX`](https://huggingface.co/onnx-community/snac_24khz-ONNX) | ~26 MB (fp16) | SNAC decoder |
+## Run locally
+```sh
+npm install
+npm run dev   # http://localhost:5173
+```
+First run downloads the selected model into the browser cache (LM + codec + tokenizer). Subsequent runs reuse the cached weights.
+## Voices
+Select a voice with a string of the form `"<Language Name> (<Gender>)"`, for example `"Hindi (Female)"`. **38 voices across 19 languages**: Hindi, Bengali, Marathi, Telugu, Kannada, Tamil, Malayalam, Gujarati, Punjabi, Assamese, Bhojpuri, Magahi, Maithili, Chhattisgarhi, Bodo, Dogri, Nepali, Sanskrit, English (Indian) — male + female each.
+## Notes
+- `q4f16` (~1.95 GB download) is the fastest cold-start option and works well for short prompts.
+- `q8` (~4.32 GB download) is heavier but can sound cleaner on more difficult prompts.
+- Emotion tags such as `<happy>` and `<sad>` can be appended at the end of a line.
+- Everything stays local to the browser after the model has loaded.
+## Credits
+- [Kenpath](https://huggingface.co/kenpath) — Svara TTS v1 base model.
+- [Canopy Labs](https://huggingface.co/canopylabs) — Orpheus 3B Hindi base.
+- [Hugging Face](https://github.com/huggingface/transformers.js-examples/tree/main/text-to-speech-webgpu) — original `text-to-speech-webgpu` scaffold that this project was forked from.
+- License: Apache 2.0.

eslint.config.js ADDED Viewed

	@@ -0,0 +1,38 @@

+import js from "@eslint/js";
+import globals from "globals";
+import react from "eslint-plugin-react";
+import reactHooks from "eslint-plugin-react-hooks";
+import reactRefresh from "eslint-plugin-react-refresh";
+// ESLint flat config: skip the build output, then lint every JS/JSX file with
+// the recommended core, React, JSX-runtime and React Hooks rule sets.
+export default [
+  { ignores: ["dist"] },
+  {
+    files: ["**/*.{js,jsx}"],
+    languageOptions: {
+      // NOTE(review): this is 2020 while parserOptions.ecmaVersion below is
+      // "latest", and src/App.jsx uses `??=` (ES2021) — confirm which value
+      // ESLint honours and align the two.
+      ecmaVersion: 2020,
+      globals: globals.browser,
+      parserOptions: {
+        ecmaVersion: "latest",
+        ecmaFeatures: { jsx: true },
+        sourceType: "module",
+      },
+    },
+    settings: { react: { version: "18.3" } },
+    plugins: {
+      react,
+      "react-hooks": reactHooks,
+      "react-refresh": reactRefresh,
+    },
+    rules: {
+      // Later spreads override earlier ones on any rule they share.
+      ...js.configs.recommended.rules,
+      ...react.configs.recommended.rules,
+      ...react.configs["jsx-runtime"].rules,
+      ...reactHooks.configs.recommended.rules,
+      // The app intentionally opens external links with target="_blank".
+      "react/jsx-no-target-blank": "off",
+      "react-refresh/only-export-components": [
+        "warn",
+        { allowConstantExport: true },
+      ],
+    },
+  },
+];

index.html CHANGED Viewed

@@ -1,19 +1,19 @@
 <!doctype html>
-<html>
-	<head>
-		<meta charset="utf-8" />
-		<meta name="viewport" content="width=device-width" />
-		<title>My static Space</title>
-		<link rel="stylesheet" href="style.css" />
-	</head>
-	<body>
-		<div class="card">
-			<h1>Welcome to your static Space!</h1>
-			<p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
-			<p>
-				Also don't forget to check the
-				<a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
-			</p>
-		</div>
-	</body>
 </html>

 <!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="UTF-8" />
+    <link rel="icon" type="image/svg+xml" href="/hf-logo.svg" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <link rel="preconnect" href="https://fonts.googleapis.com" />
+    <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
+    <link
+      href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=Tiro+Devanagari+Marathi:ital@0;1&family=Yatra+One&display=swap"
+      rel="stylesheet"
+    />
+    <title>Svāra TTS · WebGPU</title>
+  </head>
+  <body>
+    <div id="root"></div>
+    <script type="module" src="/src/main.jsx"></script>
+  </body>
 </html>

package-lock.json ADDED Viewed

The diff for this file is too large to render. See raw diff

package.json ADDED Viewed

	@@ -0,0 +1,35 @@

+{
+  "name": "svara-tts-webgpu",
+  "private": true,
+  "version": "0.0.1",
+  "type": "module",
+  "scripts": {
+    "dev": "vite",
+    "build": "vite build",
+    "lint": "eslint .",
+    "preview": "vite preview"
+  },
+  "dependencies": {
+    "@huggingface/transformers": "^4.0.0",
+    "motion": "^11.12.0",
+    "onnxruntime-web": "^1.20.0",
+    "react": "^18.3.1",
+    "react-dom": "^18.3.1"
+  },
+  "devDependencies": {
+    "@eslint/js": "^9.15.0",
+    "@types/react": "^18.3.12",
+    "@types/react-dom": "^18.3.1",
+    "@vitejs/plugin-react": "^4.3.4",
+    "autoprefixer": "^10.4.20",
+    "eslint": "^9.15.0",
+    "eslint-plugin-react": "^7.37.2",
+    "eslint-plugin-react-hooks": "^5.0.0",
+    "eslint-plugin-react-refresh": "^0.4.14",
+    "globals": "^15.12.0",
+    "postcss": "^8.4.49",
+    "tailwindcss": "^3.4.15",
+    "vite": "^6.0.1",
+    "vite-plugin-static-copy": "^2.1.0"
+  }
+}

postcss.config.js ADDED Viewed

	@@ -0,0 +1,6 @@

+// PostCSS configuration: enables the tailwindcss and autoprefixer plugins,
+// each with an empty options object.
+export default {
+  plugins: {
+    tailwindcss: {},
+    autoprefixer: {},
+  },
+};

public/hf-logo.svg ADDED Viewed

public/warli-strip.svg ADDED Viewed

public/wave.svg ADDED Viewed

src/App.jsx ADDED Viewed

	@@ -0,0 +1,569 @@

+import { useEffect, useMemo, useRef, useState } from "react";
+import { motion } from "motion/react";
+const LANGUAGES = [
+  ["Hindi", "नमस्ते, आप कैसे हैं?"],
+  ["Bengali", "নমস্কার, আপনি কেমন আছেন?"],
+  ["Marathi", "नमस्कार, तुम्ही कसे आहात?"],
+  ["Telugu", "నమస్కారం, మీరు ఎలా ఉన్నారు?"],
+  ["Kannada", "ನಮಸ್ಕಾರ, ನೀವು ಹೇಗಿದ್ದೀರಿ?"],
+  ["Tamil", "வணக்கம், நீங்கள் எப்படி இருக்கிறீர்கள்?"],
+  ["Malayalam", "നമസ്കാരം, സുഖമാണോ?"],
+  ["Gujarati", "નમસ્તે, તમે કેમ છો?"],
+  ["Punjabi", "ਸਤ ਸ੍ਰੀ ਅਕਾਲ, ਤੁਸੀਂ ਕਿਵੇਂ ਹੋ?"],
+  ["Assamese", "নমস্কাৰ, আপুনি কেনে আছে?"],
+  ["Bhojpuri", "नमस्कार, राउर का हाल बा?"],
+  ["Magahi", "नमस्कार, तू कैसन हे?"],
+  ["Maithili", "नमस्कार, अहाँ कोना छी?"],
+  ["Chhattisgarhi", "नमस्कार, आप कइसन हन?"],
+  ["Bodo", "नमस्कार, नोँ बेसेबा डंनो?"],
+  ["Dogri", "नमस्ते, तुसें कि’यां ओ?"],
+  ["Nepali", "नमस्ते, तपाईं कस्तो हुनुहुन्छ?"],
+  ["Sanskrit", "नमस्कारः, भवान् कथमस्ति?"],
+  ["English (Indian)", "Hello, how are you?"],
+];
+const LANGUAGE_DETAILS = {
+  Hindi: { script: "Devanagari", region: "North India" },
+  Bengali: { script: "Bengali", region: "Eastern India" },
+  Marathi: { script: "Devanagari", region: "Maharashtra" },
+  Telugu: { script: "Telugu", region: "Andhra Pradesh + Telangana" },
+  Kannada: { script: "Kannada", region: "Karnataka" },
+  Tamil: { script: "Tamil", region: "Tamil Nadu" },
+  Malayalam: { script: "Malayalam", region: "Kerala" },
+  Gujarati: { script: "Gujarati", region: "Gujarat" },
+  Punjabi: { script: "Gurmukhi", region: "Punjab" },
+  Assamese: { script: "Assamese", region: "Assam" },
+  Bhojpuri: { script: "Devanagari", region: "Bihar + Eastern UP" },
+  Magahi: { script: "Devanagari", region: "Bihar" },
+  Maithili: { script: "Devanagari", region: "Mithila" },
+  Chhattisgarhi: { script: "Devanagari", region: "Chhattisgarh" },
+  Bodo: { script: "Devanagari", region: "Northeast India" },
+  Dogri: { script: "Devanagari", region: "Jammu" },
+  Nepali: { script: "Devanagari", region: "Nepal + India" },
+  Sanskrit: { script: "Devanagari", region: "Classical Indic" },
+  "English (Indian)": { script: "Latin", region: "Indian English" },
+};
+// Two voice ids per language, Female first, in the "<Language> (<Gender>)"
+// form. The selected id is posted to the worker as `speaker_id`.
+const VOICES = LANGUAGES.flatMap(([lang]) => [
+  `${lang} (Female)`,
+  `${lang} (Male)`,
+]);
+// Selectable quantization variants. `value` is sent to the worker as `dtype`;
+// `note` is shown under the Quantization select, and the part before "·"
+// (the download size) is reused in the "Ready to load model" status text.
+const DTYPES = [
+  { value: "q4f16", label: "q4f16", note: "~1.95 GB · fastest cold start" },
+  { value: "q8", label: "q8", note: "~4.32 GB · cleaner, slower preload" },
+];
+const STACK_FACTS = [
+  { label: "Model", value: "Svāra-TTS v1" },
+  { label: "Codec", value: "SNAC 24 kHz" },
+  { label: "Runtime", value: "WebGPU + Transformers.js" },
+];
+/**
+ * Append an emotion tag to a prompt, replacing a tag that is already at the end.
+ *
+ * One trailing `<...>` token (and the whitespace around it) is removed, the
+ * remainder is trimmed, and `tag` is appended after a single space.
+ *
+ * @param {string} text - Prompt text, optionally ending in a `<tag>`.
+ * @param {string} tag - Tag to append, e.g. `"<sad>"`.
+ * @returns {string} The trimmed prompt followed by a space and `tag`.
+ */
+function withEmotionTag(text, tag) {
+  return `${text.replace(/\s*<[^>]+>\s*$/u, "").trim()} ${tag}`;
+}
+export default function App() {
+  const worker = useRef(null);
+  const runtimeReadyRef = useRef(false);
+  const loadedDtypesRef = useRef([]);
+  const [selectedVoice, setSelectedVoice] = useState("Hindi (Female)");
+  const [inputText, setInputText] = useState(LANGUAGES[0][1]);
+  const [dtype, setDtype] = useState("q4f16");
+  const [status, setStatus] = useState(null);
+  const [error, setError] = useState(null);
+  const [runtimeReady, setRuntimeReady] = useState(false);
+  const [loadingDtype, setLoadingDtype] = useState(null);
+  const [loadedDtypes, setLoadedDtypes] = useState([]);
+  const [loadingMessage, setLoadingMessage] = useState(
+    "Detecting WebGPU support...",
+  );
+  const [results, setResults] = useState([]);
+  const selectedLanguage = selectedVoice.split(" (")[0];
+  const selectedGender = selectedVoice.includes("(Male)") ? "Male" : "Female";
+  const languageDetail = LANGUAGE_DETAILS[selectedLanguage] ?? {
+    script: "Indic",
+    region: "South Asia",
+  };
+  const currentSample =
+    LANGUAGES.find(([lang]) => lang === selectedLanguage)?.[1] ?? inputText;
+  const currentDtype = DTYPES.find((entry) => entry.value === dtype) ?? DTYPES[0];
+  const isCurrentDtypeLoaded = loadedDtypes.includes(dtype);
+  const isLoadingCurrentDtype =
+    status === "loading" && loadingDtype === currentDtype.value;
+  const promptChips = useMemo(
+    () => [
+      { label: "Sample line", value: currentSample },
+      { label: "Sample + <sad>", value: withEmotionTag(currentSample, "<sad>") },
+      {
+        label: "Sample + <happy>",
+        value: withEmotionTag(currentSample, "<happy>"),
+      },
+    ],
+    [currentSample],
+  );
+  useEffect(() => {
+    runtimeReadyRef.current = runtimeReady;
+  }, [runtimeReady]);
+  useEffect(() => {
+    loadedDtypesRef.current = loadedDtypes;
+  }, [loadedDtypes]);
+  useEffect(() => {
+    worker.current ??= new Worker(new URL("./worker.js", import.meta.url), {
+      type: "module",
+    });
+    const onMessageReceived = (e) => {
+      switch (e.data.status) {
+        case "feature-success":
+          runtimeReadyRef.current = true;
+          setRuntimeReady(true);
+          setError(null);
+          setStatus("idle");
+          setLoadingMessage(
+            "WebGPU is available. Load a model when you want to start local synthesis.",
+          );
+          break;
+        case "feature-error":
+          setError(e.data.data);
+          break;
+        case "loading":
+          setError(null);
+          if (loadedDtypesRef.current.includes(e.data.dtype)) {
+            setLoadingDtype(null);
+            setStatus("running");
+          } else {
+            setLoadingDtype(e.data.dtype);
+            setLoadingMessage(
+              e.data.dtype === "q8"
+                ? "Loading q8 weights (~4.32 GB, sharded). First run can take a minute..."
+                : "Loading q4f16 weights (~1.95 GB). First run downloads once, then stays cached...",
+            );
+            setStatus("loading");
+          }
+          break;
+        case "ready":
+          setLoadingDtype(null);
+          setError(null);
+          setLoadedDtypes((prev) => {
+            if (prev.includes(e.data.dtype)) return prev;
+            const next = [...prev, e.data.dtype];
+            loadedDtypesRef.current = next;
+            return next;
+          });
+          setStatus("ready");
+          break;
+        case "complete":
+          setResults((prev) => [
+            {
+              text: e.data.text,
+              src: e.data.audio,
+              voice: e.data.voice,
+              dtype: e.data.dtype,
+              createdAt: new Date().toLocaleTimeString([], {
+                hour: "numeric",
+                minute: "2-digit",
+              }),
+            },
+            ...prev,
+          ]);
+          setError(null);
+          setStatus("ready");
+          break;
+        case "error":
+          setLoadingDtype(null);
+          setError(e.data.data);
+          setStatus(
+            loadedDtypesRef.current.includes(e.data.dtype)
+              ? "ready"
+              : runtimeReadyRef.current
+                ? "idle"
+                : null,
+          );
+          break;
+      }
+    };
+    worker.current.addEventListener("message", onMessageReceived);
+    worker.current.addEventListener("error", (event) => console.error(event));
+    return () => {
+      worker.current.removeEventListener("message", onMessageReceived);
+    };
+  }, []);
+  const handleSubmit = (event) => {
+    event.preventDefault();
+    if (!isCurrentDtypeLoaded) return;
+    setStatus("running");
+    setError(null);
+    worker.current.postMessage({
+      type: "generate",
+      text: inputText.trim(),
+      speaker_id: selectedVoice,
+      dtype,
+    });
+  };
+  const handleLoadModel = () => {
+    if (!runtimeReady || isCurrentDtypeLoaded) return;
+    setError(null);
+    setLoadingDtype(dtype);
+    setLoadingMessage(
+      dtype === "q8"
+        ? "Loading q8 weights (~4.32 GB, sharded). First run can take a minute..."
+        : "Loading q4f16 weights (~1.95 GB). First run downloads once, then stays cached...",
+    );
+    setStatus("loading");
+    worker.current?.postMessage({ type: "preload", dtype });
+  };
+  const onLanguageChange = (lang) => {
+    const sample = LANGUAGES.find(([entry]) => entry === lang)?.[1] ?? inputText;
+    setInputText(sample);
+    setSelectedVoice(`${lang} (Female)`);
+  };
+  const onDtypeChange = (next) => {
+    if (next === dtype) return;
+    setDtype(next);
+    setError(null);
+    setLoadingDtype(null);
+    setStatus(loadedDtypesRef.current.includes(next) ? "ready" : "idle");
+  };
+  let statusHeadline = "Checking browser runtime";
+  let statusBody = loadingMessage;
+  if (error) {
+    statusHeadline = runtimeReady ? "Load issue" : "Startup issue";
+    statusBody = error;
+  } else if (status === "running") {
+    statusHeadline = "Rendering speech locally";
+    statusBody = `Synthesizing with ${selectedVoice} on ${currentDtype.label}.`;
+  } else if (status === "loading") {
+    statusHeadline = "Loading model weights";
+  } else if (runtimeReady && !isCurrentDtypeLoaded) {
+    statusHeadline = "Ready to load model";
+    statusBody = `${currentDtype.label} is a one-time ${currentDtype.note.split("·")[0].trim()} download. Tap Load model to cache it in this browser.`;
+  } else if (isCurrentDtypeLoaded) {
+    statusHeadline = "Model ready in this browser";
+    statusBody = `${selectedVoice} is ready on ${currentDtype.label}. Everything runs locally after the one-time model download.`;
+  }
+  const statusActivityLabel = status === "running"
+    ? "Generating audio..."
+    : status === "loading"
+      ? "Loading in the background"
+      : null;
+  const statusCardBusy = !error && (
+    status === "loading" || status === "running" || status === null
+  );
+  const loadButtonLabel = isLoadingCurrentDtype
+    ? `Loading ${currentDtype.label}...`
+    : `Load ${currentDtype.label}`;
+  return (
+    <div className="app-shell">
+      <div className="ornament ornament-top" aria-hidden="true">
+        <img src="/warli-strip.svg" alt="" />
+      </div>
+      <main className="app-main">
+        <header className="hero">
+          <span className="hero-kicker">Svāra TTS · WebGPU</span>
+          <h1 className="hero-title">Svāra</h1>
+          <span className="hero-subline">स्वरा · Indic text-to-speech in the browser</span>
+          <p className="hero-copy">
+            A warmer frontend for the same local synthesis engine: 19 languages,
+            38 voices, SNAC decoding, and no server round-trip once the model is
+            cached in this browser.
+          </p>
+          <div className="hero-links">
+            <a
+              href="https://huggingface.co/kenpath/svara-tts-v1"
+              target="_blank"
+              rel="noreferrer"
+            >
+              Base model
+            </a>
+            <a
+              href="https://huggingface.co/shreyask/svara-tts-v1-ONNX"
+              target="_blank"
+              rel="noreferrer"
+            >
+              ONNX export
+            </a>
+            <a
+              href="https://huggingface.co/onnx-community/snac_24khz-ONNX"
+              target="_blank"
+              rel="noreferrer"
+            >
+              SNAC codec
+            </a>
+          </div>
+        </header>
+        <section
+          className={`card status-card ${statusCardBusy ? "is-busy" : ""}`}
+        >
+          <div className="status-main">
+            <p className="section-kicker">Session</p>
+            <h2>{statusHeadline}</h2>
+            <p className={`status-copy ${error ? "is-error" : ""}`}>
+              {statusBody}
+            </p>
+            {statusActivityLabel && !error && (
+              <div className="inline-loader" aria-hidden="true">
+                <span className="inline-loader-dot"></span>
+                <span className="inline-loader-label">{statusActivityLabel}</span>
+              </div>
+            )}
+            {runtimeReady && !isCurrentDtypeLoaded && status !== "loading" && (
+              <div className="model-gate">
+                <div>
+                  <p className="model-gate-copy">
+                    Model load is explicit in this build.
+                  </p>
+                  <span className="model-gate-sub">
+                    {loadedDtypes.length > 0
+                      ? `Cached here: ${loadedDtypes.join(", ")}`
+                      : "Nothing cached in this browser session yet."}
+                  </span>
+                </div>
+                <button
+                  type="button"
+                  className="primary-button load-button"
+                  onClick={handleLoadModel}
+                  disabled={!runtimeReady || status === "running"}
+                >
+                  {loadButtonLabel}
+                </button>
+              </div>
+            )}
+          </div>
+          <div className="pill-row">
+            <span className="pill">19 languages</span>
+            <span className="pill">38 voices</span>
+            <span className="pill">24 kHz mono</span>
+            <span className="pill">Runs locally</span>
+          </div>
+        </section>
+        <div className="workspace">
+          <section className="card composer-card">
+            <div className="card-header">
+              <div>
+                <p className="section-kicker">Compose</p>
+                <h2>Switch language, adjust voice, synthesize</h2>
+              </div>
+              <button
+                type="button"
+                className="ghost-button"
+                onClick={() => setInputText(currentSample)}
+              >
+                Use sample
+              </button>
+            </div>
+            <form onSubmit={handleSubmit} className="composer-form">
+              <div className="control-grid">
+                <label className="field">
+                  <span className="field-label">Language</span>
+                  <select
+                    value={selectedLanguage}
+                    onChange={(event) => onLanguageChange(event.target.value)}
+                  >
+                    {LANGUAGES.map(([lang]) => (
+                      <option key={lang} value={lang}>
+                        {lang}
+                      </option>
+                    ))}
+                  </select>
+                </label>
+                <label className="field">
+                  <span className="field-label">Voice</span>
+                  <select
+                    value={selectedVoice}
+                    onChange={(event) => setSelectedVoice(event.target.value)}
+                  >
+                    {VOICES.filter((voice) => voice.startsWith(`${selectedLanguage} (`)).map(
+                      (voice) => (
+                        <option key={voice} value={voice}>
+                          {voice.split("(")[1].replace(")", "")}
+                        </option>
+                      ),
+                    )}
+                  </select>
+                </label>
+                <label className="field field-wide">
+                  <span className="field-label">Quantization</span>
+                  <select
+                    value={dtype}
+                    onChange={(event) => onDtypeChange(event.target.value)}
+                    disabled={status === "running" || status === "loading"}
+                  >
+                    {DTYPES.map((entry) => (
+                      <option key={entry.value} value={entry.value}>
+                        {entry.label}
+                      </option>
+                    ))}
+                  </select>
+                  <small className="field-note">{currentDtype.note}</small>
+                </label>
+              </div>
+              <label className="field">
+                <div className="label-row">
+                  <span className="field-label">Prompt</span>
+                  <span className="field-meta">
+                    {languageDetail.script} · {languageDetail.region}
+                  </span>
+                </div>
+                <textarea
+                  placeholder="Enter text in any supported Indian language..."
+                  value={inputText}
+                  onChange={(event) => setInputText(event.target.value)}
+                  rows={Math.min(8, Math.max(4, inputText.split("\n").length))}
+                />
+              </label>
+              <div className="chip-bar">
+                {promptChips.map((chip) => (
+                  <button
+                    key={chip.label}
+                    type="button"
+                    className="utility-chip"
+                    onClick={() => setInputText(chip.value)}
+                  >
+                    {chip.label}
+                  </button>
+                ))}
+              </div>
+              <div className="composer-footer">
+                <p className="helper-copy">
+                  Emotion tags can be appended at the end of the sentence, for
+                  example <code>&lt;sad&gt;</code> or <code>&lt;happy&gt;</code>.
+                  Use <code>q8</code> if you want the cleanest output and can
+                  afford the larger one-time download.
+                </p>
+                <button
+                  type="submit"
+                  className="primary-button"
+                  disabled={
+                    status !== "ready" ||
+                    !isCurrentDtypeLoaded ||
+                    inputText.trim() === ""
+                  }
+                >
+                  {status === "running"
+                    ? "Generating audio..."
+                    : isLoadingCurrentDtype
+                      ? `Loading ${currentDtype.label}...`
+                      : !isCurrentDtypeLoaded
+                        ? "Load model to continue"
+                      : "Generate speech"}
+                </button>
+              </div>
+            </form>
+          </section>
+          <aside className="sidebar">
+            <section className="card inspector-card">
+              <p className="section-kicker">Inspector</p>
+              <h3 className="inspector-title">{selectedVoice}</h3>
+              <dl className="compact-meta-grid">
+                <div className="compact-meta">
+                  <dt>Script</dt>
+                  <dd>{languageDetail.script}</dd>
+                </div>
+                <div className="compact-meta">
+                  <dt>Region</dt>
+                  <dd>{languageDetail.region}</dd>
+                </div>
+                <div className="compact-meta">
+                  <dt>Type</dt>
+                  <dd>{selectedGender}</dd>
+                </div>
+                <div className="compact-meta">
+                  <dt>Quant</dt>
+                  <dd>{currentDtype.label}</dd>
+                </div>
+              </dl>
+              <div className="stack-chip-list">
+                {STACK_FACTS.map((fact) => (
+                  <div key={fact.label} className="stack-chip">
+                    <span>{fact.label}</span>
+                    <strong>{fact.value}</strong>
+                  </div>
+                ))}
+              </div>
+              <details className="debug-notes">
+                <summary>Usage notes</summary>
+                <ul className="note-list note-list-compact">
+                  <li>Model and codec are browser-cached after the first load.</li>
+                  <li>Short prompts are the best way to compare voices and quant levels.</li>
+                  <li>The results archive below preserves each render with the actual voice used.</li>
+                </ul>
+              </details>
+            </section>
+          </aside>
+        </div>
+        {results.length > 0 && (
+          <section className="results-section">
+            <div className="results-header">
+              <div>
+                <p className="section-kicker">Archive</p>
+                <h2>Generated takes</h2>
+              </div>
+              <span className="results-meta">Newest first</span>
+            </div>
+            <div className="results-grid">
+              {results.map((result, index) => (
+                <motion.article
+                  key={`${result.voice}-${result.createdAt}-${index}`}
+                  initial={{ y: 24, opacity: 0 }}
+                  animate={{ y: 0, opacity: 1 }}
+                  transition={{ duration: 0.35, delay: index * 0.04 }}
+                  className="card result-card"
+                >
+                  <div className="result-head">
+                    <div>
+                      <h3>{result.voice}</h3>
+                      <p>{result.createdAt}</p>
+                    </div>
+                    <span className="result-pill">{result.dtype}</span>
+                  </div>
+                  <p className="result-text">{result.text}</p>
+                  <audio controls src={result.src} className="result-audio">
+                    Your browser does not support the audio element.
+                  </audio>
+                </motion.article>
+              ))}
+            </div>
+          </section>
+        )}
+      </main>
+      <div className="ornament ornament-bottom" aria-hidden="true">
+        <img src="/warli-strip.svg" alt="" />
+      </div>
+    </div>
+  );
+}

src/assets/react.svg ADDED Viewed

src/index.css ADDED Viewed

	@@ -0,0 +1,756 @@

+@tailwind base;
+@tailwind components;
+@tailwind utilities;
+/* Design tokens shared by the whole stylesheet, plus the default text colour
+   and font stack inherited by the page. */
+:root {
+  /* Page background gradient stops. */
+  --bg: #faf3e7;
+  --bg-shadow: #efe2c6;
+  /* Card surface and border colours. */
+  --card: rgba(242, 232, 211, 0.88);
+  --card-border: #d8c4a3;
+  --card-strong: #eadcc0;
+  /* Accent colour with a darker and a translucent variant. */
+  --accent: #c34a19;
+  --accent-dark: #962f12;
+  --accent-soft: rgba(195, 74, 25, 0.12);
+  /* Text colours: primary, secondary, and the red used for titles and errors. */
+  --ink: #2f1d12;
+  --muted: #816047;
+  --red: #862e2e;
+  --warli: #fff8ec;
+  --line: rgba(47, 29, 18, 0.1);
+  --shadow: 0 18px 48px rgba(90, 47, 16, 0.08);
+  color: var(--ink);
+  font-family: "Inter", "Segoe UI", sans-serif;
+}
+*,
+*:before,
+*:after {
+  box-sizing: border-box;
+}
+html,
+body,
+#root {
+  min-height: 100%;
+}
+body {
+  margin: 0;
+  background:
+    radial-gradient(circle at top, rgba(255, 248, 236, 0.85), transparent 32%),
+    radial-gradient(circle at bottom left, rgba(195, 74, 25, 0.08), transparent 28%),
+    linear-gradient(180deg, var(--bg), var(--bg-shadow));
+  color: var(--ink);
+}
+button,
+select,
+textarea,
+audio {
+  font: inherit;
+}
+a {
+  color: var(--accent-dark);
+  text-decoration-color: rgba(150, 47, 18, 0.35);
+  text-underline-offset: 0.18em;
+}
+a:hover {
+  color: var(--accent);
+}
+#root {
+  position: relative;
+}
+.app-shell {
+  position: relative;
+  min-height: 100vh;
+  overflow: hidden;
+}
+.app-shell:before {
+  content: "";
+  position: fixed;
+  inset: 0;
+  background:
+    linear-gradient(90deg, rgba(255, 255, 255, 0.14), transparent 20%, transparent 80%, rgba(255, 255, 255, 0.12)),
+    radial-gradient(circle at 20% 15%, rgba(195, 74, 25, 0.08), transparent 18%);
+  mix-blend-mode: multiply;
+  pointer-events: none;
+}
+.app-main {
+  position: relative;
+  z-index: 1;
+  width: min(1160px, calc(100% - 32px));
+  margin: 0 auto;
+  padding: 36px 0 72px;
+}
+.ornament {
+  position: relative;
+  z-index: 1;
+  display: flex;
+  justify-content: center;
+  width: 100%;
+}
+.ornament img {
+  width: min(1600px, 100%);
+  height: auto;
+  display: block;
+}
+.ornament-top {
+  padding-top: 24px;
+}
+.ornament-bottom {
+  padding-bottom: 24px;
+}
+.hero {
+  text-align: center;
+  margin: 18px auto 28px;
+  max-width: 760px;
+}
+.hero-kicker,
+.section-kicker {
+  display: inline-flex;
+  align-items: center;
+  gap: 8px;
+  margin: 0 0 10px;
+  color: var(--accent-dark);
+  font-size: 0.78rem;
+  font-weight: 700;
+  letter-spacing: 0.16em;
+  text-transform: uppercase;
+}
+.hero-title {
+  margin: 0;
+  color: var(--red);
+  font-family: "Yatra One", "Tiro Devanagari Marathi", serif;
+  font-size: clamp(3.6rem, 8vw, 5.8rem);
+  line-height: 0.92;
+}
+.hero-subline {
+  display: block;
+  margin-top: 10px;
+  color: var(--accent-dark);
+  font-family: "Tiro Devanagari Marathi", serif;
+  font-size: clamp(1.1rem, 2.4vw, 1.5rem);
+}
+.hero-copy {
+  margin: 18px auto 0;
+  max-width: 700px;
+  color: var(--muted);
+  font-size: 1.06rem;
+  line-height: 1.75;
+}
+.hero-links {
+  display: flex;
+  flex-wrap: wrap;
+  justify-content: center;
+  gap: 14px 18px;
+  margin-top: 18px;
+  font-size: 0.95rem;
+}
+.card {
+  background: var(--card);
+  border: 1px solid var(--card-border);
+  border-radius: 18px;
+  box-shadow: var(--shadow);
+  backdrop-filter: blur(12px);
+}
+.status-card,
+.composer-card,
+.info-card,
+.result-card {
+  padding: 22px 24px;
+}
+.status-card h2,
+.composer-card h2,
+.info-card h2,
+.results-header h2 {
+  margin: 0;
+  font-family: "Tiro Devanagari Marathi", serif;
+  font-size: 1.75rem;
+  line-height: 1.2;
+}
+.status-card {
+  display: grid;
+  grid-template-columns: minmax(0, 1fr) auto;
+  gap: 20px;
+  align-items: start;
+  margin-bottom: 22px;
+}
+.status-card.is-busy {
+  border-color: rgba(195, 74, 25, 0.35);
+}
+.status-main {
+  min-width: 0;
+}
+.status-copy {
+  margin: 10px 0 0;
+  color: var(--muted);
+  line-height: 1.7;
+}
+.status-copy.is-error {
+  color: var(--red);
+}
+.inline-loader {
+  display: inline-flex;
+  align-items: center;
+  gap: 10px;
+  margin-top: 14px;
+  color: var(--accent-dark);
+  font-size: 0.9rem;
+  font-weight: 600;
+}
+.inline-loader-dot {
+  width: 12px;
+  height: 12px;
+  border-radius: 999px;
+  background: linear-gradient(180deg, var(--accent), var(--accent-dark));
+  box-shadow: 0 0 0 0 rgba(195, 74, 25, 0.3);
+  animation: pulse-dot 1.6s ease-out infinite;
+}
+.inline-loader-label {
+  color: var(--muted);
+  font-weight: 500;
+}
+.model-gate {
+  display: flex;
+  justify-content: space-between;
+  align-items: center;
+  gap: 16px;
+  margin-top: 16px;
+  padding: 14px 16px;
+  border: 1px solid var(--card-border);
+  border-left: 3px solid var(--accent);
+  border-radius: 14px;
+  background: rgba(255, 248, 236, 0.72);
+}
+.model-gate-copy {
+  margin: 0;
+  color: var(--ink);
+  font-size: 0.95rem;
+  font-weight: 600;
+}
+.model-gate-sub {
+  display: block;
+  margin-top: 4px;
+  color: var(--muted);
+  font-size: 0.82rem;
+  line-height: 1.5;
+}
+.load-button {
+  flex: 0 0 auto;
+  min-width: 154px;
+  padding-inline: 16px;
+}
+.pill-row {
+  display: flex;
+  flex-wrap: wrap;
+  justify-content: flex-end;
+  gap: 10px;
+}
+.pill {
+  display: inline-flex;
+  align-items: center;
+  padding: 8px 12px;
+  border: 1px solid var(--card-border);
+  border-radius: 999px;
+  background: rgba(255, 248, 236, 0.9);
+  color: var(--accent-dark);
+  font-size: 0.86rem;
+  font-weight: 600;
+}
+.workspace {
+  display: grid;
+  grid-template-columns: minmax(0, 1.7fr) minmax(260px, 0.72fr);
+  gap: 20px;
+  align-items: start;
+}
+.card-header {
+  display: flex;
+  justify-content: space-between;
+  gap: 16px;
+  align-items: start;
+}
+.ghost-button,
+.utility-chip {
+  border: 1px solid var(--card-border);
+  background: rgba(255, 248, 236, 0.82);
+  color: var(--ink);
+  cursor: pointer;
+  transition:
+    border-color 140ms ease,
+    background 140ms ease,
+    color 140ms ease,
+    transform 140ms ease;
+}
+.ghost-button {
+  border-radius: 999px;
+  padding: 8px 13px;
+  font-size: 0.92rem;
+  font-weight: 600;
+}
+.ghost-button:hover,
+.utility-chip:hover {
+  background: var(--accent-soft);
+  border-color: var(--accent);
+  color: var(--accent-dark);
+  transform: translateY(-1px);
+}
+.composer-form {
+  margin-top: 18px;
+}
+.control-grid {
+  display: grid;
+  grid-template-columns: repeat(2, minmax(0, 1fr));
+  gap: 16px;
+}
+.field {
+  display: block;
+}
+.field-wide {
+  grid-column: 1 / -1;
+}
+.field-label {
+  display: block;
+  margin-bottom: 8px;
+  font-size: 0.9rem;
+  font-weight: 700;
+  color: var(--accent-dark);
+}
+.label-row {
+  display: flex;
+  justify-content: space-between;
+  align-items: baseline;
+  gap: 12px;
+  margin-bottom: 8px;
+}
+.field-meta,
+.field-note {
+  color: var(--muted);
+  font-size: 0.84rem;
+}
+.field select,
+.field textarea {
+  width: 100%;
+  border: 1px solid var(--card-border);
+  border-radius: 14px;
+  background: var(--warli);
+  color: var(--ink);
+}
+.field select {
+  min-height: 48px;
+  padding: 0 14px;
+}
+.field textarea {
+  min-height: 168px;
+  padding: 16px 18px;
+  resize: vertical;
+  line-height: 1.7;
+  font-size: 1.18rem;
+  font-family: "Inter", "Tiro Devanagari Marathi", "Noto Sans Devanagari", serif;
+}
+.field select:focus,
+.field textarea:focus,
+.primary-button:focus,
+.ghost-button:focus,
+.utility-chip:focus {
+  outline: 2px solid rgba(195, 74, 25, 0.36);
+  outline-offset: 2px;
+}
+.chip-bar {
+  display: flex;
+  flex-wrap: wrap;
+  gap: 10px;
+  margin-top: 14px;
+}
+.utility-chip {
+  border-radius: 999px;
+  padding: 7px 12px;
+  font-size: 0.9rem;
+}
+.composer-footer {
+  display: flex;
+  justify-content: space-between;
+  gap: 18px;
+  align-items: end;
+  margin-top: 18px;
+}
+.helper-copy {
+  margin: 0;
+  color: var(--muted);
+  max-width: 600px;
+  line-height: 1.7;
+}
+.helper-copy code {
+  background: rgba(255, 248, 236, 0.95);
+  border: 1px solid var(--card-border);
+  border-radius: 6px;
+  padding: 0.12rem 0.4rem;
+  font-family: ui-monospace, SFMono-Regular, Menlo, monospace;
+  font-size: 0.85rem;
+  color: var(--accent-dark);
+}
+.primary-button {
+  border: none;
+  border-radius: 14px;
+  padding: 13px 18px;
+  min-width: 180px;
+  background: linear-gradient(180deg, var(--accent), var(--accent-dark));
+  color: #fff;
+  font-size: 1rem;
+  font-weight: 700;
+  cursor: pointer;
+  box-shadow: 0 10px 20px rgba(150, 47, 18, 0.18);
+  transition:
+    transform 140ms ease,
+    box-shadow 140ms ease,
+    opacity 140ms ease;
+}
+.primary-button:hover:not(:disabled) {
+  transform: translateY(-1px);
+  box-shadow: 0 14px 24px rgba(150, 47, 18, 0.22);
+}
+.primary-button:disabled {
+  opacity: 0.56;
+  cursor: default;
+}
+.sidebar {
+  display: grid;
+  gap: 12px;
+}
+.inspector-card {
+  padding: 16px 18px;
+}
+.inspector-title {
+  margin: 4px 0 0;
+  font-family: "Tiro Devanagari Marathi", serif;
+  font-size: 1.85rem;
+  line-height: 1.1;
+}
+.compact-meta-grid {
+  display: grid;
+  grid-template-columns: repeat(2, minmax(0, 1fr));
+  gap: 10px;
+  margin: 14px 0 0;
+}
+.compact-meta {
+  padding: 10px 12px;
+  border: 1px solid var(--line);
+  border-radius: 12px;
+  background: rgba(255, 248, 236, 0.58);
+}
+.compact-meta dt {
+  margin: 0;
+  color: var(--muted);
+  font-size: 0.76rem;
+  font-weight: 700;
+  letter-spacing: 0.08em;
+  text-transform: uppercase;
+}
+.compact-meta dd {
+  margin: 6px 0 0;
+  color: var(--ink);
+  font-size: 1rem;
+  font-weight: 700;
+  line-height: 1.35;
+}
+.stack-chip-list {
+  display: grid;
+  gap: 8px;
+  margin-top: 12px;
+}
+.stack-chip {
+  display: flex;
+  justify-content: space-between;
+  gap: 10px;
+  align-items: baseline;
+  padding: 10px 12px;
+  border-radius: 12px;
+  background: rgba(255, 248, 236, 0.72);
+  border: 1px solid var(--line);
+}
+.stack-chip span {
+  color: var(--muted);
+  font-size: 0.8rem;
+  font-weight: 700;
+  letter-spacing: 0.08em;
+  text-transform: uppercase;
+}
+.stack-chip strong {
+  color: var(--ink);
+  font-size: 0.98rem;
+  font-weight: 700;
+  text-align: right;
+}
+.debug-notes {
+  margin-top: 12px;
+  border-top: 1px solid var(--line);
+  padding-top: 10px;
+}
+.debug-notes summary {
+  cursor: pointer;
+  color: var(--accent-dark);
+  font-size: 0.88rem;
+  font-weight: 700;
+  list-style: none;
+}
+.debug-notes summary::-webkit-details-marker {
+  display: none;
+}
+.debug-notes summary:after {
+  content: " +";
+}
+.debug-notes[open] summary:after {
+  content: " -";
+}
+.note-list {
+  margin: 16px 0 0;
+  padding: 0;
+  list-style: none;
+}
+.note-list li {
+  position: relative;
+  padding-left: 18px;
+  color: var(--muted);
+  line-height: 1.7;
+}
+.note-list li + li {
+  margin-top: 10px;
+}
+.note-list-compact {
+  margin-top: 10px;
+}
+.note-list-compact li {
+  font-size: 0.93rem;
+  line-height: 1.55;
+}
+.note-list li:before {
+  content: "";
+  position: absolute;
+  left: 0;
+  top: 0.78em;
+  width: 7px;
+  height: 7px;
+  border-radius: 50%;
+  background: var(--accent);
+}
+.results-section {
+  margin-top: 28px;
+}
+.results-header {
+  display: flex;
+  justify-content: space-between;
+  gap: 16px;
+  align-items: end;
+  margin-bottom: 14px;
+}
+.results-meta {
+  color: var(--muted);
+  font-size: 0.92rem;
+}
+.results-grid {
+  display: grid;
+  gap: 14px;
+}
+.result-head {
+  display: flex;
+  justify-content: space-between;
+  gap: 16px;
+  align-items: start;
+}
+.result-head h3 {
+  margin: 0;
+  font-family: "Tiro Devanagari Marathi", serif;
+  font-size: 1.28rem;
+}
+.result-head p {
+  margin: 6px 0 0;
+  color: var(--muted);
+  font-size: 0.88rem;
+}
+.result-pill {
+  display: inline-flex;
+  align-items: center;
+  padding: 7px 10px;
+  border-radius: 999px;
+  background: var(--accent-soft);
+  color: var(--accent-dark);
+  font-size: 0.8rem;
+  font-weight: 700;
+}
+.result-text {
+  margin: 14px 0;
+  font-size: 1.02rem;
+  line-height: 1.72;
+}
+.result-audio {
+  width: 100%;
+}
+@keyframes pulse-dot {
+  0% {
+    transform: scale(0.92);
+    box-shadow: 0 0 0 0 rgba(195, 74, 25, 0.24);
+  }
+  45% {
+    transform: scale(1);
+    box-shadow: 0 0 0 10px rgba(195, 74, 25, 0);
+  }
+  100% {
+    transform: scale(0.92);
+    box-shadow: 0 0 0 0 rgba(195, 74, 25, 0);
+  }
+}
+@media (max-width: 960px) {
+  .workspace {
+    grid-template-columns: 1fr;
+  }
+  .status-card {
+    grid-template-columns: 1fr;
+  }
+  .model-gate {
+    flex-direction: column;
+    align-items: stretch;
+  }
+  .load-button {
+    width: 100%;
+  }
+  .pill-row {
+    justify-content: flex-start;
+  }
+}
+@media (max-width: 720px) {
+  .app-main {
+    width: min(100%, calc(100% - 20px));
+    padding-top: 24px;
+  }
+  .status-card,
+  .composer-card,
+  .info-card,
+  .result-card {
+    padding: 18px;
+  }
+  .card-header,
+  .composer-footer,
+  .results-header {
+    flex-direction: column;
+    align-items: stretch;
+  }
+  .control-grid {
+    grid-template-columns: 1fr;
+  }
+  .compact-meta-grid {
+    grid-template-columns: 1fr;
+  }
+  .field-wide {
+    grid-column: auto;
+  }
+  .hero-copy,
+  .helper-copy {
+    font-size: 0.96rem;
+  }
+  .primary-button {
+    width: 100%;
+  }
+}

src/main.jsx ADDED Viewed

	@@ -0,0 +1,10 @@

+import { StrictMode } from "react";
+import { createRoot } from "react-dom/client";
+import "./index.css";
+import App from "./App.jsx";
+createRoot(document.getElementById("root")).render(
+  <StrictMode>
+    <App />
+  </StrictMode>,
+);

src/worker.js ADDED Viewed

	@@ -0,0 +1,626 @@

+// Svara TTS WebGPU worker.
+//
+// Architecture:
+//   1) Llama-3.2-3B causal LM (loaded via @huggingface/transformers v4) emits
+//      audio token IDs in the range [128266, 156938).
+//   2) We group every 7-token bundle into a SNAC frame.
+//   3) Offline decode mirrors Kenpath's streaming path: decode a sliding
+//      4-frame SNAC window and keep samples [2048:4096] from each window.
+//      That matches the codec's stable synthesis region and avoids the
+//      "behind a fan" smear seen when decoding the whole sequence in one shot.
+import {
+  AutoTokenizer,
+  AutoModelForCausalLM,
+  LogitsProcessor,
+  LogitsProcessorList,
+  Tensor,
+} from "@huggingface/transformers";
+import * as ort from "onnxruntime-web/webgpu";
+// ORT-Web's .wasm/.mjs files aren't served by Vite by default; vite.config.js
+// copies them from node_modules to /ort-wasm/ via vite-plugin-static-copy.
+ort.env.wasm.wasmPaths = "/ort-wasm/";
+// --- WebGPU feature detection -----------------------------------------------
+let fp16_supported = false;
+try {
+  const adapter = await navigator.gpu.requestAdapter();
+  if (!adapter) throw new Error("WebGPU is not supported (no adapter found)");
+  fp16_supported = adapter.features.has("shader-f16");
+  self.postMessage({ status: "feature-success", fp16: fp16_supported });
+} catch (e) {
+  self.postMessage({ status: "feature-error", data: e.toString() });
+  throw e;
+}
+// --- Constants matching upstream Svara inference -----------------------------
+const EOT = 128009;
+const SOS = 128257, EOS = 128258;
+const SOH = 128259, EOH = 128260;
+const SOAI = 128261;
+const AUDIO_OFFSET = 128266;
+const AUDIO_END = AUDIO_OFFSET + 7 * 4096;
+const WINDOW_FRAMES = 4;
+const WINDOW_AUDIO_START = 2048;
+const WINDOW_AUDIO_END = 4096;
+const SAMPLE_RATE = 24000;
+const SVARA_REPO = "shreyask/svara-tts-v1-ONNX";
+const SNAC_REPO = "onnx-community/snac_24khz-ONNX";
+const SUPPORTED_DTYPES = new Set(["q4f16", "q8"]);
+// Lazy load the tokenizer once -- it's the same across dtypes.
+let tokenizerPromise = null;
+function getTokenizer() {
+  return (tokenizerPromise ??= AutoTokenizer.from_pretrained(SVARA_REPO));
+}
+// SNAC decoder is small (~26 MB at fp16); load once, share across LM dtypes.
+let snacPromise = null;
+function getSnac() {
+  return (snacPromise ??= (async () => {
+    const url = `https://huggingface.co/${SNAC_REPO}/resolve/main/onnx/decoder_model${fp16_supported ? "_fp16" : ""}.onnx`;
+    return ort.InferenceSession.create(url, { executionProviders: ["webgpu"] });
+  })());
+}
+// LM is per-dtype. Cache by dtype string so switching back is instant.
+const lmCache = new Map();
+function getLM(dtype) {
+  if (!lmCache.has(dtype)) {
+    lmCache.set(
+      dtype,
+      AutoModelForCausalLM.from_pretrained(SVARA_REPO, {
+        dtype,
+        device: "webgpu",
+        // Number of external data chunks to fetch alongside the .onnx graph.
+        // q4f16 is one .onnx_data file; q8 is sharded into 3 chunks
+        // (.onnx_data, _data_1, _data_2) to stay under the ~2 GB browser
+        // ArrayBuffer ceiling. transformers.js v4 accepts a number here per
+        // its types: `false` | `true` (=1) | <number of chunks>.
+        use_external_data_format: dtype === "q8" ? 3 : true,
+      }),
+    );
+  }
+  return lmCache.get(dtype);
+}
+// --- Generation guards ------------------------------------------------------
+// Svara should only emit 7-band audio tokens followed by END_OF_SPEECH. If we
+// let the sampler wander into the text/control vocab, the rest of the clip
+// turns phasey/robotic because frame alignment is lost.
+//
+// The processor rewrites each row of logits in place:
+//   step 0 -> only SOAI is allowed
+//   step 1 -> only SOS is allowed
+//   later  -> only ids in [AUDIO_OFFSET, AUDIO_END) plus EOS are allowed
+// where `step` is the row's current length minus the prompt length.
+class SvaraLogitsProcessor extends LogitsProcessor {
+  // promptLength: number of prompt tokens, used to derive how many tokens
+  // have been generated so far (assumes each inputIds row includes the
+  // prompt -- TODO confirm against the transformers.js generate loop).
+  constructor(promptLength) {
+    super();
+    this.promptLength = promptLength;
+  }
+  // Masks disallowed token ids with -Infinity and returns the same `logits`
+  // object that was passed in.
+  _call(inputIds, logits) {
+    for (let i = 0; i < inputIds.length; i++) {
+      const data = logits[i].data;
+      const step = inputIds[i].length - this.promptLength;
+      if (step === 0) {
+        // First generated token: force SOAI.
+        data.fill(-Infinity);
+        data[SOAI] = 0;
+        continue;
+      }
+      if (step === 1) {
+        // Second generated token: force SOS.
+        data.fill(-Infinity);
+        data[SOS] = 0;
+        continue;
+      }
+      // EOS sits below AUDIO_OFFSET, so save its logit before masking the
+      // low range and restore it afterwards.
+      const eosLogit = data[EOS];
+      data.subarray(0, AUDIO_OFFSET).fill(-Infinity);
+      data.subarray(AUDIO_END).fill(-Infinity);
+      data[EOS] = eosLogit;
+    }
+    return logits;
+  }
+}
+function buildLogitsProcessor(promptLength) {
+  const list = new LogitsProcessorList();
+  list.push(new SvaraLogitsProcessor(promptLength));
+  return list;
+}
+function estimateAudioTokenBudget(text) {
+  const spokenText = stripTrailingEmotionTag(text);
+  const graphemeCount = Array.from(
+    new Intl.Segmenter(undefined, { granularity: "grapheme" }).segment(spokenText),
+    ({ segment }) => segment,
+  ).filter((segment) => /\S/u.test(segment)).length;
+  const punctuationGroups = Array.from(
+    spokenText.matchAll(/[.,!?;:।॥…\-—]+/gu),
+  ).length;
+  const wordCount = spokenText.split(/\s+/u).filter(Boolean).length;
+  const roughBudget = graphemeCount * 12 + wordCount * 20 + punctuationGroups * 28 + 84;
+  const clampedBudget = Math.max(224, Math.min(1120, roughBudget));
+  return Math.ceil(clampedBudget / 7) * 7;
+}
+function getTrailingEmotionTag(text) {
+  return text.match(/\s*(<[^>]+>)\s*$/u)?.[1] ?? "";
+}
+function stripTrailingEmotionTag(text) {
+  return text.replace(/\s*<[^>]+>\s*$/u, "").trim();
+}
+function normalizeTextForSvara(text) {
+  return text
+    .replace(/\.{2,}/gu, ",")
+    .replace(/…+/gu, ",")
+    .replace(/[—–]+/gu, ",")
+    .replace(/\s+/gu, " ")
+    .replace(/\s*([,.;!?।॥])\s*/gu, "$1 ")
+    .trim();
+}
+function countChunkGraphemes(chunk) {
+  return Array.from(
+    new Intl.Segmenter(undefined, { granularity: "grapheme" }).segment(chunk),
+    ({ segment }) => segment,
+  ).filter((segment) => /\S/u.test(segment)).length;
+}
+function countChunkWords(chunk) {
+  return chunk.split(/\s+/u).filter(Boolean).length;
+}
+function splitLongChunk(chunk) {
+  const graphemeCount = countChunkGraphemes(chunk);
+  const wordCount = countChunkWords(chunk);
+  if (graphemeCount <= 28 || wordCount <= 5) return [chunk];
+  const parts = chunk.split(/\s*,\s*/u).map((part) => part.trim()).filter(Boolean);
+  return parts.length > 1 ? parts : [chunk];
+}
+function mergeTinyChunks(chunks) {
+  const merged = [];
+  for (const chunk of chunks) {
+    const graphemeCount = countChunkGraphemes(chunk);
+    const wordCount = countChunkWords(chunk);
+    const shouldAttach =
+      merged.length > 0 &&
+      !/[.!?।॥]$/u.test(merged.at(-1)) &&
+      (graphemeCount < 10 || wordCount < 3);
+    if (shouldAttach) {
+      merged[merged.length - 1] = `${merged.at(-1)}, ${chunk}`;
+      continue;
+    }
+    merged.push(chunk);
+  }
+  return merged;
+}
+function splitTextForSvara(text) {
+  const emotionTag = getTrailingEmotionTag(text);
+  const spokenText = normalizeTextForSvara(stripTrailingEmotionTag(text));
+  if (!spokenText) return [];
+  const chunks = mergeTinyChunks(
+    spokenText
+      .match(/[^.!?।॥]+[.!?।॥]?/gu)
+      ?.map((part) => part.trim())
+      .filter(Boolean)
+      .flatMap(splitLongChunk) ?? [],
+  );
+  if (!emotionTag) return chunks;
+  return chunks.map((chunk, index) =>
+    index === chunks.length - 1 ? `${chunk} ${emotionTag}` : chunk,
+  );
+}
+function mergeTinyLeadingChunks(chunks) {
+  const merged = [];
+  for (let i = 0; i < chunks.length; i++) {
+    const chunk = chunks[i];
+    const graphemeCount = countChunkGraphemes(chunk);
+    const wordCount = countChunkWords(chunk);
+    if (graphemeCount < 10 && wordCount < 3) {
+      if (i + 1 < chunks.length) {
+        chunks[i + 1] = `${chunk}, ${chunks[i + 1]}`;
+        continue;
+      }
+      if (merged.length > 0) {
+        merged[merged.length - 1] = `${merged.at(-1)}, ${chunk}`;
+        continue;
+      }
+    }
+    merged.push(chunk);
+  }
+  return merged;
+}
+function splitEmotionSafeTextForSvara(text) {
+  const emotionTag = getTrailingEmotionTag(text);
+  const spokenText = normalizeTextForSvara(stripTrailingEmotionTag(text));
+  if (!spokenText) return [];
+  const chunks = spokenText
+    .match(/[^.!?।॥]+[.!?।॥]?/gu)
+    ?.map((part) => part.trim())
+    .filter(Boolean)
+    .flatMap((sentence) => {
+      const commaParts = sentence
+        .split(/\s*,\s*/u)
+        .map((part) => part.trim())
+        .filter(Boolean);
+      return mergeTinyLeadingChunks(commaParts);
+    }) ?? [];
+  if (!emotionTag) return chunks;
+  return chunks.map((chunk, index) =>
+    index === chunks.length - 1 ? `${chunk} ${emotionTag}` : chunk,
+  );
+}
+function splitFinalEmotionClauseTextForSvara(text) {
+  const emotionTag = getTrailingEmotionTag(text);
+  const spokenText = normalizeTextForSvara(stripTrailingEmotionTag(text));
+  if (!spokenText) return [];
+  const chunks = mergeTinyLeadingChunks(
+    spokenText.split(/\s*,\s*/u).map((part) => part.trim()).filter(Boolean),
+  );
+  if (!emotionTag) return chunks;
+  return chunks.map((chunk, index) =>
+    index === chunks.length - 1 ? `${chunk} ${emotionTag}` : chunk,
+  );
+}
+function buildPromptVariants(text) {
+  const rawText = text.trim();
+  const spokenText = normalizeTextForSvara(stripTrailingEmotionTag(text));
+  if (!rawText && !spokenText) return [];
+  const variants = rawText ? [[rawText]] : [];
+  variants.push(
+    splitTextForSvara(text),
+    splitEmotionSafeTextForSvara(text),
+    splitFinalEmotionClauseTextForSvara(text),
+  );
+  if (getTrailingEmotionTag(text)) {
+    variants.push([spokenText]);
+    variants.push(splitEmotionSafeTextForSvara(spokenText));
+  }
+  const seen = new Set();
+  return variants.filter((chunks) => {
+    if (chunks.length === 0) return false;
+    const key = chunks.join("\u241e");
+    if (seen.has(key)) return false;
+    seen.add(key);
+    return true;
+  });
+}
+function pauseDurationForChunk(chunk, isLast) {
+  if (isLast) return 0;
+  const trimmed = chunk.trim();
+  if (/[!?]$/u.test(trimmed)) return 0.26;
+  if (/[.]$/u.test(trimmed)) return 0.3;
+  return 0.18;
+}
+function concatFloat32Arrays(chunks) {
+  const totalLength = chunks.reduce((sum, chunk) => sum + chunk.length, 0);
+  const merged = new Float32Array(totalLength);
+  let offset = 0;
+  for (const chunk of chunks) {
+    merged.set(chunk, offset);
+    offset += chunk.length;
+  }
+  return merged;
+}
+function pcmStats(samples) {
+  let peak = 0;
+  let sumSquares = 0;
+  for (let i = 0; i < samples.length; i++) {
+    const value = Math.abs(samples[i]);
+    if (value > peak) peak = value;
+    sumSquares += value * value;
+  }
+  const rms = samples.length > 0 ? Math.sqrt(sumSquares / samples.length) : 0;
+  return { peak, rms };
+}
+function isNearlySilent(samples) {
+  const { peak, rms } = pcmStats(samples);
+  return peak < 0.006 && rms < 0.0015;
+}
+function isComplexQ4Prompt(text) {
+  const spokenText = stripTrailingEmotionTag(text);
+  const wordCount = countChunkWords(spokenText);
+  const punctuationGroups = Array.from(
+    spokenText.matchAll(/[.,!?;:।॥…\-—]+/gu),
+  ).length;
+  return punctuationGroups >= 3 || wordCount >= 8 || (
+    getTrailingEmotionTag(text) && punctuationGroups >= 1 && wordCount >= 5
+  );
+}
+// Synthesizes each text chunk in order and returns one Float32Array holding
+// the decoded audio of all chunks, with silence inserted between them.
+//   tokenizer  - passed to buildPrompt to encode "<voice>: <text>"
+//   lm         - model whose generate() produces the token stream
+//   speaker_id - voice name placed in the prompt
+//   chunks     - array of text chunks to speak
+//   generation - extra generate() options (sampling settings)
+// Throws if any chunk produces no audio tokens.
+async function synthesizeChunks(tokenizer, lm, speaker_id, chunks, generation) {
+  const pcmChunks = [];
+  for (let index = 0; index < chunks.length; index++) {
+    const chunk = chunks[index];
+    const promptIds = buildPrompt(tokenizer, chunk, speaker_id);
+    // Shape [1, promptLength]: a single-row batch of int64 token ids.
+    const inputIds = new Tensor(
+      "int64",
+      BigInt64Array.from(promptIds.map(BigInt)),
+      [1, promptIds.length],
+    );
+    const maxAudioTokens = estimateAudioTokenBudget(chunk);
+    const out = await lm.generate({
+      inputs: inputIds,
+      // +3 leaves room for the non-audio tokens around the audio stream
+      // (presumably SOAI, SOS and EOS -- TODO confirm).
+      max_new_tokens: maxAudioTokens + 3,
+      logits_processor: buildLogitsProcessor(promptIds.length),
+      ...generation,
+      // Listed after the spread so these two always override `generation`.
+      repetition_penalty: 1.0,
+      eos_token_id: EOS,
+    });
+    const allIds = Array.from(out.data, (x) => Number(x));
+    const audioIds = extractAudioTokens(allIds, promptIds.length);
+    if (audioIds.length === 0) {
+      throw new Error(`LM produced no audio tokens for chunk ${index + 1}/${chunks.length}.`);
+    }
+    const pcm = await decodeSnacStable(audioIds);
+    pcmChunks.push(pcm);
+    // Zero-filled samples act as the pause; none is added after the last chunk.
+    const pauseSeconds = pauseDurationForChunk(chunk, index === chunks.length - 1);
+    if (pauseSeconds > 0) {
+      pcmChunks.push(new Float32Array(Math.round(SAMPLE_RATE * pauseSeconds)));
+    }
+  }
+  return concatFloat32Arrays(pcmChunks);
+}
+// --- Token-stream → SNAC code conversion ------------------------------------
+// Reference: mlx_audio/tts/models/llama/llama.py:codes_to_layers
+//   layer_1 (band 0):           [c0]                — 1 code per coarse frame
+//   layer_2 (bands 1, 4):       [c1, c4]            — 2 codes per coarse frame
+//   layer_3 (bands 2, 3, 5, 6): [c2, c3, c5, c6]    — 4 codes per coarse frame
+function codesToLayers(audioTokenIds) {
+  const N = Math.floor(audioTokenIds.length / 7);
+  const l1 = new BigInt64Array(N);
+  const l2 = new BigInt64Array(N * 2);
+  const l3 = new BigInt64Array(N * 4);
+  for (let i = 0; i < N; i++) {
+    const base = i * 7;
+    l1[i]         = BigInt(audioTokenIds[base    ] - AUDIO_OFFSET - 0 * 4096);
+    l2[2 * i + 0] = BigInt(audioTokenIds[base + 1] - AUDIO_OFFSET - 1 * 4096);
+    l3[4 * i + 0] = BigInt(audioTokenIds[base + 2] - AUDIO_OFFSET - 2 * 4096);
+    l3[4 * i + 1] = BigInt(audioTokenIds[base + 3] - AUDIO_OFFSET - 3 * 4096);
+    l2[2 * i + 1] = BigInt(audioTokenIds[base + 4] - AUDIO_OFFSET - 4 * 4096);
+    l3[4 * i + 2] = BigInt(audioTokenIds[base + 5] - AUDIO_OFFSET - 5 * 4096);
+    l3[4 * i + 3] = BigInt(audioTokenIds[base + 6] - AUDIO_OFFSET - 6 * 4096);
+  }
+  return { l1, l2, l3, N };
+}
+async function decodeSnacWindow(audioTokenIds) {
+  const snac = await getSnac();
+  const { l1, l2, l3, N } = codesToLayers(audioTokenIds);
+  const feeds = {
+    [snac.inputNames[0]]: new ort.Tensor("int64", l1, [1, N]),
+    [snac.inputNames[1]]: new ort.Tensor("int64", l2, [1, N * 2]),
+    [snac.inputNames[2]]: new ort.Tensor("int64", l3, [1, N * 4]),
+  };
+  const out = await snac.run(feeds);
+  return out[snac.outputNames[0]].data;
+}
+async function decodeSnacStable(audioTokenIds) {
+  const numFrames = Math.floor(audioTokenIds.length / 7);
+  if (numFrames === 0) return new Float32Array(0);
+  if (numFrames < WINDOW_FRAMES) {
+    return await decodeSnacWindow(audioTokenIds);
+  }
+  const chunks = [];
+  let totalLength = 0;
+  for (let start = 0; start <= numFrames - WINDOW_FRAMES; start++) {
+    const windowIds = audioTokenIds.slice(start * 7, (start + WINDOW_FRAMES) * 7);
+    const decoded = await decodeSnacWindow(windowIds);
+    const stable = decoded.slice(WINDOW_AUDIO_START, WINDOW_AUDIO_END);
+    chunks.push(stable);
+    totalLength += stable.length;
+  }
+  const merged = new Float32Array(totalLength);
+  let offset = 0;
+  for (const chunk of chunks) {
+    merged.set(chunk, offset);
+    offset += chunk.length;
+  }
+  return merged;
+}
+// Match the exported ONNX repo README:
+//   [SOH, BOS, "<voice>: <text>" tokens, EOT, EOH]
+// The model predicts SOAI -> SOS -> audio tokens -> EOS itself.
+function buildPrompt(tokenizer, text, voice) {
+  const body = tokenizer.encode(`${voice}: ${text}`, { add_special_tokens: false });
+  return [SOH, tokenizer.bos_token_id, ...body, EOT, EOH];
+}
+// Keep audio tokens after the first START_OF_SPEECH emitted by the model.
+function extractAudioTokens(allTokenIds, promptLength) {
+  let sosIdx = -1;
+  for (let i = promptLength; i < allTokenIds.length; i++) {
+    if (allTokenIds[i] === SOS) {
+      sosIdx = i;
+      break;
+    }
+  }
+  if (sosIdx === -1) return [];
+  const audio = [];
+  for (let i = sosIdx + 1; i < allTokenIds.length; i++) {
+    const tokenId = allTokenIds[i];
+    if (tokenId === EOS) break;
+    if (tokenId >= AUDIO_OFFSET && tokenId < AUDIO_END) {
+      audio.push(tokenId);
+    }
+  }
+  return audio.slice(0, audio.length - (audio.length % 7));
+}
+// --- WAV encoder (24 kHz, mono, PCM16) --------------------------------------
+function pcmFloat32ToWav(samples, sampleRate) {
+  const bufLen = 44 + samples.length * 2;
+  const buf = new ArrayBuffer(bufLen);
+  const v = new DataView(buf);
+  let p = 0;
+  const w = (s) => { for (let i = 0; i < s.length; i++) v.setUint8(p++, s.charCodeAt(i)); };
+  w("RIFF");
+  v.setUint32(p, 36 + samples.length * 2, true); p += 4;
+  w("WAVEfmt ");
+  v.setUint32(p, 16, true); p += 4;
+  v.setUint16(p, 1, true); p += 2;
+  v.setUint16(p, 1, true); p += 2;
+  v.setUint32(p, sampleRate, true); p += 4;
+  v.setUint32(p, sampleRate * 2, true); p += 4;
+  v.setUint16(p, 2, true); p += 2;
+  v.setUint16(p, 16, true); p += 2;
+  w("data");
+  v.setUint32(p, samples.length * 2, true); p += 4;
+  for (let i = 0; i < samples.length; i++) {
+    const s = Math.max(-1, Math.min(1, samples[i]));
+    v.setInt16(p, s < 0 ? s * 0x8000 : s * 0x7fff, true);
+    p += 2;
+  }
+  return buf;
+}
+// --- Sampling defaults per dtype --------------------------------------------
+// Transformers.js v4 currently ignores top-k/top-p on this path, so unconstrained
+// sampling drifts badly on quantized Svara and turns later words robotic. Use
+// greedy decoding by default for stability; q8 can tolerate a little sampling.
+function generationFor(dtype) {
+  return dtype === "q8"
+    ? { do_sample: true, temperature: 0.35, min_new_tokens: 30 }
+    : { do_sample: false, min_new_tokens: 30 };
+}
+function generationPlansFor(dtype, text) {
+  const base = generationFor(dtype);
+  if (dtype !== "q4f16" || !isComplexQ4Prompt(text)) {
+    return [base];
+  }
+  return [
+    {
+      do_sample: true,
+      temperature: 0.6,
+      top_k: 40,
+      top_p: 0.9,
+      min_new_tokens: 30,
+    },
+    base,
+  ];
+}
+// --- Message handler --------------------------------------------------------
+// Handles requests from the main thread. Message fields read here:
+//   type       - "preload" only loads the models; anything else synthesizes
+//   text       - text to speak (may end with an emotion tag such as <happy>)
+//   speaker_id - voice name placed in the prompt
+//   dtype      - requested quantisation; unsupported values fall back to q4f16
+// Status messages posted back: "loading", "ready", "complete", "error".
+self.addEventListener("message", async (e) => {
+  const { type, text, speaker_id, dtype: requested } = e.data;
+  const dtype = SUPPORTED_DTYPES.has(requested) ? requested : "q4f16";
+  try {
+    if (type === "preload") {
+      // Triggered by the explicit "Load model" action in the UI.
+      self.postMessage({ status: "loading", dtype });
+      await Promise.all([getTokenizer(), getSnac(), getLM(dtype)]);
+      self.postMessage({ status: "ready", dtype });
+      return;
+    }
+    self.postMessage({ status: "loading", dtype });
+    const [tokenizer, lm] = await Promise.all([getTokenizer(), getLM(dtype)]);
+    await getSnac(); // warm
+    const variants = buildPromptVariants(text);
+    if (variants.length === 0) {
+      throw new Error("No speakable text found after normalization.");
+    }
+    const generations = generationPlansFor(dtype, text);
+    let mergedPcm = null;
+    let lastError = null;
+    // Try every chunking variant under each generation plan, in order, and
+    // stop at the first result that is not near-silent.
+    for (const generation of generations) {
+      for (const chunks of variants) {
+        try {
+          const candidate = await synthesizeChunks(
+            tokenizer,
+            lm,
+            speaker_id,
+            chunks,
+            generation,
+          );
+          if (isNearlySilent(candidate)) {
+            lastError = new Error("Generated near-silent audio.");
+            continue;
+          }
+          mergedPcm = candidate;
+          break;
+        } catch (err) {
+          // Remember the failure and move on to the next variant.
+          lastError = err;
+        }
+      }
+      if (mergedPcm) {
+        break;
+      }
+    }
+    if (!mergedPcm) {
+      // Surface the most recent failure when nothing produced usable audio.
+      throw lastError ?? new Error("Synthesis failed for all prompt variants.");
+    }
+    const wav = pcmFloat32ToWav(mergedPcm, SAMPLE_RATE);
+    const blob = new Blob([wav], { type: "audio/wav" });
+    // The audio is handed over as an object URL rather than raw bytes.
+    self.postMessage({
+      status: "complete",
+      audio: URL.createObjectURL(blob),
+      text,
+      voice: speaker_id,
+      dtype,
+    });
+  } catch (err) {
+    self.postMessage({ status: "error", data: String(err), dtype });
+    console.error(err);
+  }
+});

style.css DELETED Viewed

@@ -1,28 +0,0 @@
-body {
-	padding: 2rem;
-	font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
-}
-h1 {
-	font-size: 16px;
-	margin-top: 0;
-}
-p {
-	color: rgb(107, 114, 128);
-	font-size: 15px;
-	margin-bottom: 10px;
-	margin-top: 5px;
-}
-.card {
-	max-width: 620px;
-	margin: 0 auto;
-	padding: 16px;
-	border: 1px solid lightgray;
-	border-radius: 16px;
-}
-.card p:last-child {
-	margin-bottom: 0;
-}

tailwind.config.js ADDED Viewed

	@@ -0,0 +1,8 @@

+/** @type {import('tailwindcss').Config} */
+export default {
+  content: ["./index.html", "./src/**/*.{js,ts,jsx,tsx}"], // the HTML entry point plus every JS/TS/JSX/TSX file under src/
+  theme: {
+    extend: {}, // no additions to the default theme
+  },
+  plugins: [], // no Tailwind plugins registered
+};

tools/run_svara_onnx_local.py ADDED Viewed

	@@ -0,0 +1,248 @@

+#!/usr/bin/env python3
+import argparse
+import math
+import wave
+from pathlib import Path
+import numpy as np
+import onnxruntime as ort
+import torch
+from optimum.onnxruntime import ORTModelForCausalLM
+from transformers import AutoTokenizer, LogitsProcessor, LogitsProcessorList
+EOT = 128009  # appended right after the encoded "voice: text" body in build_prompt (presumably the tokenizer's end-of-turn id - confirm)
+SOS = 128257  # forced as the second generated token; audio extraction starts after it
+EOS = 128258  # stops generation (eos_token_id) and ends audio extraction
+SOH = 128259  # first token of every prompt built by build_prompt
+EOH = 128260  # last token of every prompt built by build_prompt
+SOAI = 128261  # forced as the first generated token
+AUDIO_OFFSET = 128266  # lowest token id treated as an audio code
+AUDIO_END = AUDIO_OFFSET + 7 * 4096  # exclusive upper bound: 7 frame slots of 4096 codes each
+WINDOW_FRAMES = 4  # frames (7 tokens each) decoded together in decode_snac_stable
+WINDOW_AUDIO_START = 2048  # first decoded sample kept from each window
+WINDOW_AUDIO_END = 4096  # end (exclusive) of the samples kept from each window
+SAMPLE_RATE = 24000  # frame rate written to the WAV header and used for the printed duration
+class SvaraLogitsProcessor(LogitsProcessor):
+    def __init__(self, prompt_length: int) -> None:
+        self.prompt_length = prompt_length
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        for row in range(scores.shape[0]):
+            step = input_ids[row].shape[0] - self.prompt_length
+            if step == 0:
+                scores[row].fill_(-float("inf"))
+                scores[row, SOAI] = 0
+                continue
+            if step == 1:
+                scores[row].fill_(-float("inf"))
+                scores[row, SOS] = 0
+                continue
+            eos_logit = scores[row, EOS].item()
+            scores[row, :AUDIO_OFFSET] = -float("inf")
+            scores[row, AUDIO_END:] = -float("inf")
+            scores[row, EOS] = eos_logit
+        return scores
+def build_prompt(tokenizer: AutoTokenizer, text: str, voice: str) -> list[int]:
+    body = tokenizer.encode(f"{voice}: {text}", add_special_tokens=False)
+    return [SOH, tokenizer.bos_token_id, *body, EOT, EOH]
+def extract_audio_tokens(all_token_ids: list[int], prompt_length: int) -> list[int]:
+    try:
+        sos_idx = next(i for i in range(prompt_length, len(all_token_ids)) if all_token_ids[i] == SOS)
+    except StopIteration:
+        return []
+    audio = []
+    for token_id in all_token_ids[sos_idx + 1 :]:
+        if token_id == EOS:
+            break
+        if AUDIO_OFFSET <= token_id < AUDIO_END:
+            audio.append(token_id)
+    return audio[: len(audio) - (len(audio) % 7)]
+def codes_to_layers(audio_token_ids: list[int]) -> tuple[np.ndarray, np.ndarray, np.ndarray, int]:
+    n = len(audio_token_ids) // 7
+    layer_1 = np.zeros((1, n), dtype=np.int64)
+    layer_2 = np.zeros((1, n * 2), dtype=np.int64)
+    layer_3 = np.zeros((1, n * 4), dtype=np.int64)
+    for i in range(n):
+        base = i * 7
+        layer_1[0, i] = audio_token_ids[base] - AUDIO_OFFSET
+        layer_2[0, 2 * i] = audio_token_ids[base + 1] - AUDIO_OFFSET - 1 * 4096
+        layer_3[0, 4 * i] = audio_token_ids[base + 2] - AUDIO_OFFSET - 2 * 4096
+        layer_3[0, 4 * i + 1] = audio_token_ids[base + 3] - AUDIO_OFFSET - 3 * 4096
+        layer_2[0, 2 * i + 1] = audio_token_ids[base + 4] - AUDIO_OFFSET - 4 * 4096
+        layer_3[0, 4 * i + 2] = audio_token_ids[base + 5] - AUDIO_OFFSET - 5 * 4096
+        layer_3[0, 4 * i + 3] = audio_token_ids[base + 6] - AUDIO_OFFSET - 6 * 4096
+    return layer_1, layer_2, layer_3, n
+def decode_snac_window(session: ort.InferenceSession, audio_token_ids: list[int]) -> np.ndarray:
+    layer_1, layer_2, layer_3, n = codes_to_layers(audio_token_ids)
+    outputs = session.run(
+        None,
+        {
+            session.get_inputs()[0].name: layer_1,
+            session.get_inputs()[1].name: layer_2,
+            session.get_inputs()[2].name: layer_3,
+        },
+    )
+    return outputs[0].reshape(-1).astype(np.float32, copy=False)
+def decode_snac_stable(session: ort.InferenceSession, audio_token_ids: list[int]) -> np.ndarray:
+    num_frames = len(audio_token_ids) // 7
+    if num_frames == 0:
+        return np.zeros(0, dtype=np.float32)
+    if num_frames < WINDOW_FRAMES:
+        return decode_snac_window(session, audio_token_ids)
+    chunks = []
+    for start in range(0, num_frames - WINDOW_FRAMES + 1):
+        window_ids = audio_token_ids[start * 7 : (start + WINDOW_FRAMES) * 7]
+        decoded = decode_snac_window(session, window_ids)
+        chunks.append(decoded[WINDOW_AUDIO_START:WINDOW_AUDIO_END])
+    return np.concatenate(chunks, axis=0)
+def write_wav(path: Path, samples: np.ndarray) -> None:
+    pcm = np.clip(samples, -1.0, 1.0)
+    pcm16 = np.where(pcm < 0, pcm * 32768.0, pcm * 32767.0).astype(np.int16)
+    with wave.open(str(path), "wb") as handle:
+        handle.setnchannels(1)
+        handle.setsampwidth(2)
+        handle.setframerate(SAMPLE_RATE)
+        handle.writeframes(pcm16.tobytes())
+def audio_stats(samples: np.ndarray) -> tuple[float, float, float, float]:
+    if samples.size == 0:
+        return 0.0, 0.0, -float("inf"), -float("inf")
+    peak = float(np.max(np.abs(samples)))
+    rms = float(np.sqrt(np.mean(np.square(samples, dtype=np.float64))))
+    peak_db = 20.0 * math.log10(max(peak, 1e-12))
+    rms_db = 20.0 * math.log10(max(rms, 1e-12))
+    return peak, rms, peak_db, rms_db
+def generation_kwargs(dtype: str) -> dict:
+    if dtype == "q8":
+        return {"do_sample": True, "temperature": 0.35, "min_new_tokens": 30}
+    return {"do_sample": False, "min_new_tokens": 30}
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model-dir", default=".hf-models/svara-tts-v1-ONNX")
+    parser.add_argument("--snac-dir", default=".hf-models/snac_24khz-ONNX/onnx")
+    parser.add_argument("--dtype", choices=["q4f16", "q8"], default="q4f16")
+    parser.add_argument("--provider", default="CPUExecutionProvider")
+    parser.add_argument("--voice", default="Hindi (Female)")
+    parser.add_argument("--text", required=True)
+    parser.add_argument("--out", required=True)
+    parser.add_argument("--max-new-tokens", type=int, default=2048)
+    parser.add_argument("--fix-mistral-regex", action="store_true")
+    parser.add_argument("--do-sample", action="store_true")
+    parser.add_argument("--temperature", type=float, default=None)
+    parser.add_argument("--top-k", type=int, default=None)
+    parser.add_argument("--top-p", type=float, default=None)
+    args = parser.parse_args()
+    model_dir = Path(args.model_dir)
+    snac_dir = Path(args.snac_dir)
+    model_file = "model_q4f16.onnx" if args.dtype == "q4f16" else "model_quantized.onnx"
+    print(f"loading tokenizer from {model_dir}")
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_dir,
+        local_files_only=True,
+        fix_mistral_regex=args.fix_mistral_regex,
+    )
+    print(f"loading model {model_file} on {args.provider}")
+    model = ORTModelForCausalLM.from_pretrained(
+        model_dir,
+        subfolder="onnx",
+        file_name=model_file,
+        provider=args.provider,
+        use_io_binding=False,
+        local_files_only=True,
+    )
+    decoder_file = "decoder_model.onnx"
+    print(f"loading snac decoder {decoder_file}")
+    snac = ort.InferenceSession(
+        str(snac_dir / decoder_file),
+        providers=[args.provider, "CPUExecutionProvider"],
+    )
+    prompt_ids = build_prompt(tokenizer, args.text, args.voice)
+    input_ids = torch.tensor([prompt_ids], dtype=torch.long)
+    logits_processor = LogitsProcessorList([SvaraLogitsProcessor(len(prompt_ids))])
+    print(f"prompt_length={len(prompt_ids)} max_new_tokens={args.max_new_tokens}")
+    print(f'prompt={args.voice}: {args.text}')
+    gen_kwargs = generation_kwargs(args.dtype)
+    if args.do_sample:
+        gen_kwargs["do_sample"] = True
+    if args.temperature is not None:
+        gen_kwargs["temperature"] = args.temperature
+    if args.top_k is not None:
+        gen_kwargs["top_k"] = args.top_k
+    if args.top_p is not None:
+        gen_kwargs["top_p"] = args.top_p
+    output = model.generate(
+        input_ids=input_ids,
+        max_new_tokens=args.max_new_tokens,
+        logits_processor=logits_processor,
+        repetition_penalty=1.0,
+        eos_token_id=EOS,
+        **gen_kwargs,
+    )
+    if isinstance(output, torch.Tensor):
+        all_ids = output[0].tolist()
+    else:
+        all_ids = output.sequences[0].tolist()
+    audio_ids = extract_audio_tokens(all_ids, len(prompt_ids))
+    print(f"total_tokens={len(all_ids)} audio_tokens={len(audio_ids)} frames={len(audio_ids) // 7}")
+    if not audio_ids:
+        raise RuntimeError("no audio tokens produced")
+    pcm = decode_snac_stable(snac, audio_ids)
+    peak, rms, peak_db, rms_db = audio_stats(pcm)
+    print(
+        "samples="
+        f"{pcm.size} duration_s={pcm.size / SAMPLE_RATE:.3f} "
+        f"peak={peak:.6f} peak_db={peak_db:.2f} rms={rms:.6f} rms_db={rms_db:.2f}"
+    )
+    out_path = Path(args.out)
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    write_wav(out_path, pcm)
+    print(f"wrote {out_path}")
+if __name__ == "__main__":
+    main()

vite.config.js ADDED Viewed

	@@ -0,0 +1,25 @@

+import { defineConfig } from "vite";
+import react from "@vitejs/plugin-react";
+import { viteStaticCopy } from "vite-plugin-static-copy";
+// Vite configuration for the app; option reference: https://vite.dev/config/
+export default defineConfig({
+  plugins: [
+    react(),
+    // ORT-Web's .wasm/.mjs runtime files aren't served by Vite by default.
+    // Copy them from node_modules into the dev server + build output so the
+    // worker can load them via /ort-wasm/<file>.
+    viteStaticCopy({
+      targets: [
+        {
+          src: "node_modules/onnxruntime-web/dist/*.{wasm,mjs}",
+          dest: "ort-wasm",
+        },
+      ],
+    }),
+  ],
+  worker: { format: "es" }, // bundle web workers as ES modules
+  build: {
+    target: "esnext", // build for the newest syntax level instead of down-levelling
+  },
+});