shreyask committed on
Commit
c69c1ee
·
verified ·
1 Parent(s): e546297

Upload Svāra TTS WebGPU app

Browse files
.gitignore ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Logs
2
+ logs
3
+ *.log
4
+ npm-debug.log*
5
+ yarn-debug.log*
6
+ yarn-error.log*
7
+ pnpm-debug.log*
8
+ lerna-debug.log*
9
+
10
+ node_modules
11
+ dist
12
+ dist-ssr
13
+ *.local
14
+ .hf-models
15
+ .venv-onnx
16
+ .venv-onnx314
17
+ validate-output*.wav
18
+
19
+ # Editor directories and files
20
+ .vscode/*
21
+ !.vscode/extensions.json
22
+ .idea
23
+ .DS_Store
24
+ *.suo
25
+ *.ntvs*
26
+ *.njsproj
27
+ *.sln
28
+ *.sw?
README.md CHANGED
@@ -1,10 +1,62 @@
1
  ---
2
- title: Svara Tts Webgpu
3
- emoji: 🏃
4
- colorFrom: gray
5
- colorTo: purple
6
  sdk: static
 
 
7
  pinned: false
 
 
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Svāra TTS WebGPU
3
+ emoji: 🗣️
4
+ colorFrom: yellow
5
+ colorTo: red
6
  sdk: static
7
+ app_build_command: npm run build
8
+ app_file: dist/index.html
9
  pinned: false
10
+ license: apache-2.0
11
+ short_description: Multilingual Indic TTS in your browser, via WebGPU
12
  ---
13
 
14
+ # Svāra TTS · WebGPU
15
+
16
+ Browser-native multilingual TTS for **19 Indian languages** powered by [Svara](https://huggingface.co/kenpath/svara-tts-v1), [SNAC](https://huggingface.co/hubertsiuzdak/snac_24khz), and [Transformers.js v4](https://huggingface.co/docs/transformers.js). Runs 100% locally in the browser after the one-time model download.
17
+
18
+ This build adds an explicit model load step, browser-side caching, multilingual voice switching, prompt presets, and a WebGPU worker tuned around the ONNX-exported Svāra model.
19
+
20
+ ## Architecture
21
+
22
+ ```
23
+ text → tokenizer → Llama-3.2-3B (q4f16, transformers.js v4 + WebGPU) →
24
+ audio token IDs in [128266, 156938) →
25
+ group every 7 → SNAC frame (3 hierarchical levels) →
26
+ SNAC decoder ONNX (q4f16/fp16 from onnx-community/snac_24khz-ONNX) →
27
+ 24 kHz mono PCM → WAV blob → <audio>
28
+ ```
29
+
30
+ ## Models
31
+
32
+ | Repo | Size | Notes |
33
+ |------|------|-------|
34
+ | [`shreyask/svara-tts-v1-ONNX`](https://huggingface.co/shreyask/svara-tts-v1-ONNX) | ~1.95 GB | Llama-3.2-3B q4f16, GQA, KV-cache |
35
+ | [`onnx-community/snac_24khz-ONNX`](https://huggingface.co/onnx-community/snac_24khz-ONNX) | ~26 MB (fp16) | SNAC decoder |
36
+
37
+ ## Run locally
38
+
39
+ ```sh
40
+ npm install
41
+ npm run dev # http://localhost:5173
42
+ ```
43
+
44
+ First run downloads the selected model into the browser cache (LM + codec + tokenizer). Subsequent runs reuse the cached weights.
45
+
46
+ ## Voices
47
+
48
+ Each voice is identified by a string of the form `"<Language Name> (<Gender>)"`, for example `"Hindi (Female)"`. **38 voices across 19 languages**: Hindi, Bengali, Marathi, Telugu, Kannada, Tamil, Malayalam, Gujarati, Punjabi, Assamese, Bhojpuri, Magahi, Maithili, Chhattisgarhi, Bodo, Dogri, Nepali, Sanskrit, English (Indian) — male + female each.
49
+
50
+ ## Notes
51
+
52
+ - `q4f16` is the fastest cold-start option and works well for short prompts.
53
+ - `q8` is heavier (~4.32 GB download) but can sound cleaner on more difficult prompts.
54
+ - Emotion tags such as `<happy>` and `<sad>` can be appended at the end of a line.
55
+ - Everything stays local to the browser after the model has loaded.
56
+
57
+ ## Credits
58
+
59
+ - [Kenpath](https://huggingface.co/kenpath) — Svara TTS v1 base model.
60
+ - [Canopy Labs](https://huggingface.co/canopylabs) — Orpheus 3B Hindi base.
61
+ - [Hugging Face](https://github.com/huggingface/transformers.js-examples/tree/main/text-to-speech-webgpu) — original `text-to-speech-webgpu` scaffold this project forked from.
62
+ - License: Apache 2.0.
eslint.config.js ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import js from "@eslint/js";
2
+ import globals from "globals";
3
+ import react from "eslint-plugin-react";
4
+ import reactHooks from "eslint-plugin-react-hooks";
5
+ import reactRefresh from "eslint-plugin-react-refresh";
6
+
7
// Generated build output is never linted.
const ignoredPaths = { ignores: ["dist"] };

// Parser and environment settings for application sources.
const languageOptions = {
  ecmaVersion: 2020,
  globals: globals.browser,
  parserOptions: {
    ecmaVersion: "latest",
    ecmaFeatures: { jsx: true },
    sourceType: "module",
  },
};

// Plugins are keyed by the prefix their rules are referenced with below.
const plugins = {
  react,
  "react-hooks": reactHooks,
  "react-refresh": reactRefresh,
};

// Presets are spread first so that the explicit entries after them take
// precedence when the same rule name appears twice.
const rules = {
  ...js.configs.recommended.rules,
  ...react.configs.recommended.rules,
  ...react.configs["jsx-runtime"].rules,
  ...reactHooks.configs.recommended.rules,
  "react/jsx-no-target-blank": "off",
  "react-refresh/only-export-components": [
    "warn",
    { allowConstantExport: true },
  ],
};

// Configuration applied to every .js / .jsx file in the project.
const sourceConfig = {
  files: ["**/*.{js,jsx}"],
  languageOptions,
  settings: { react: { version: "18.3" } },
  plugins,
  rules,
};

export default [ignoredPaths, sourceConfig];
index.html CHANGED
@@ -1,19 +1,19 @@
1
  <!doctype html>
2
- <html>
3
- <head>
4
- <meta charset="utf-8" />
5
- <meta name="viewport" content="width=device-width" />
6
- <title>My static Space</title>
7
- <link rel="stylesheet" href="style.css" />
8
- </head>
9
- <body>
10
- <div class="card">
11
- <h1>Welcome to your static Space!</h1>
12
- <p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
13
- <p>
14
- Also don't forget to check the
15
- <a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
16
- </p>
17
- </div>
18
- </body>
19
  </html>
 
1
  <!doctype html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <link rel="icon" type="image/svg+xml" href="/hf-logo.svg" />
6
+ <meta name="viewport" content="width=device-width, initial-scale=1.0" />
7
+ <link rel="preconnect" href="https://fonts.googleapis.com" />
8
+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
9
+ <link
10
+ href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=Tiro+Devanagari+Marathi:ital@0;1&family=Yatra+One&display=swap"
11
+ rel="stylesheet"
12
+ />
13
+ <title>Svāra TTS · WebGPU</title>
14
+ </head>
15
+ <body>
16
+ <div id="root"></div>
17
+ <script type="module" src="/src/main.jsx"></script>
18
+ </body>
19
  </html>
package-lock.json ADDED
The diff for this file is too large to render. See raw diff
 
package.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "svara-tts-webgpu",
3
+ "private": true,
4
+ "version": "0.0.1",
5
+ "type": "module",
6
+ "scripts": {
7
+ "dev": "vite",
8
+ "build": "vite build",
9
+ "lint": "eslint .",
10
+ "preview": "vite preview"
11
+ },
12
+ "dependencies": {
13
+ "@huggingface/transformers": "^4.0.0",
14
+ "motion": "^11.12.0",
15
+ "onnxruntime-web": "^1.20.0",
16
+ "react": "^18.3.1",
17
+ "react-dom": "^18.3.1"
18
+ },
19
+ "devDependencies": {
20
+ "@eslint/js": "^9.15.0",
21
+ "@types/react": "^18.3.12",
22
+ "@types/react-dom": "^18.3.1",
23
+ "@vitejs/plugin-react": "^4.3.4",
24
+ "autoprefixer": "^10.4.20",
25
+ "eslint": "^9.15.0",
26
+ "eslint-plugin-react": "^7.37.2",
27
+ "eslint-plugin-react-hooks": "^5.0.0",
28
+ "eslint-plugin-react-refresh": "^0.4.14",
29
+ "globals": "^15.12.0",
30
+ "postcss": "^8.4.49",
31
+ "tailwindcss": "^3.4.15",
32
+ "vite": "^6.0.1",
33
+ "vite-plugin-static-copy": "^2.1.0"
34
+ }
35
+ }
postcss.config.js ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
// PostCSS plugins, each with its default options. Key order is kept as-is:
// tailwindcss is listed before autoprefixer.
const plugins = {
  tailwindcss: {},
  autoprefixer: {},
};

export default { plugins };
public/hf-logo.svg ADDED
public/warli-strip.svg ADDED
public/wave.svg ADDED
src/App.jsx ADDED
@@ -0,0 +1,569 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useEffect, useMemo, useRef, useState } from "react";
2
+ import { motion } from "motion/react";
3
+
4
+ const LANGUAGES = [
5
+ ["Hindi", "नमस्ते, आप कैसे हैं?"],
6
+ ["Bengali", "নমস্কার, আপনি কেমন আছেন?"],
7
+ ["Marathi", "नमस्कार, तुम्ही कसे आहात?"],
8
+ ["Telugu", "నమస్కారం, మీరు ఎలా ఉన్నారు?"],
9
+ ["Kannada", "ನಮಸ್ಕಾರ, ನೀವು ಹೇಗಿದ್ದೀರಿ?"],
10
+ ["Tamil", "வணக்கம், நீங்கள் எப்படி இருக்கிறீர்கள்?"],
11
+ ["Malayalam", "നമസ്കാരം, സുഖമാണോ?"],
12
+ ["Gujarati", "નમસ્તે, તમે કેમ છો?"],
13
+ ["Punjabi", "ਸਤ ਸ੍ਰੀ ਅਕਾਲ, ਤੁਸੀਂ ਕਿਵੇਂ ਹੋ?"],
14
+ ["Assamese", "নমস্কাৰ, আপুনি কেনে আছে?"],
15
+ ["Bhojpuri", "नमस्कार, राउर का हाल बा?"],
16
+ ["Magahi", "नमस्कार, तू कैसन हे?"],
17
+ ["Maithili", "नमस्कार, अहाँ कोना छी?"],
18
+ ["Chhattisgarhi", "नमस्कार, आप कइसन हन?"],
19
+ ["Bodo", "नमस्कार, नोँ बेसेबा डंनो?"],
20
+ ["Dogri", "नमस्ते, तुसें कि’यां ओ?"],
21
+ ["Nepali", "नमस्ते, तपाईं कस्तो हुनुहुन्छ?"],
22
+ ["Sanskrit", "नमस्कारः, भवान् कथमस्ति?"],
23
+ ["English (Indian)", "Hello, how are you?"],
24
+ ];
25
+
26
+ const LANGUAGE_DETAILS = {
27
+ Hindi: { script: "Devanagari", region: "North India" },
28
+ Bengali: { script: "Bengali", region: "Eastern India" },
29
+ Marathi: { script: "Devanagari", region: "Maharashtra" },
30
+ Telugu: { script: "Telugu", region: "Andhra Pradesh + Telangana" },
31
+ Kannada: { script: "Kannada", region: "Karnataka" },
32
+ Tamil: { script: "Tamil", region: "Tamil Nadu" },
33
+ Malayalam: { script: "Malayalam", region: "Kerala" },
34
+ Gujarati: { script: "Gujarati", region: "Gujarat" },
35
+ Punjabi: { script: "Gurmukhi", region: "Punjab" },
36
+ Assamese: { script: "Assamese", region: "Assam" },
37
+ Bhojpuri: { script: "Devanagari", region: "Bihar + Eastern UP" },
38
+ Magahi: { script: "Devanagari", region: "Bihar" },
39
+ Maithili: { script: "Devanagari", region: "Mithila" },
40
+ Chhattisgarhi: { script: "Devanagari", region: "Chhattisgarh" },
41
+ Bodo: { script: "Devanagari", region: "Northeast India" },
42
+ Dogri: { script: "Devanagari", region: "Jammu" },
43
+ Nepali: { script: "Devanagari", region: "Nepal + India" },
44
+ Sanskrit: { script: "Devanagari", region: "Classical Indic" },
45
+ "English (Indian)": { script: "Latin", region: "Indian English" },
46
+ };
47
+
48
// Every voice identifier, two per language in LANGUAGES order:
// "<Language> (Female)" followed by "<Language> (Male)".
const VOICES = [];
for (const [languageName] of LANGUAGES) {
  VOICES.push(`${languageName} (Female)`, `${languageName} (Male)`);
}
52
+
53
+ const DTYPES = [
54
+ { value: "q4f16", label: "q4f16", note: "~1.95 GB · fastest cold start" },
55
+ { value: "q8", label: "q8", note: "~4.32 GB · cleaner, slower preload" },
56
+ ];
57
+
58
+ const STACK_FACTS = [
59
+ { label: "Model", value: "Svāra-TTS v1" },
60
+ { label: "Codec", value: "SNAC 24 kHz" },
61
+ { label: "Runtime", value: "WebGPU + Transformers.js" },
62
+ ];
63
+
64
/**
 * Returns `text` with one trailing `<...>` tag (and the whitespace around it)
 * removed, trimmed, and followed by a single space and `tag`.
 *
 * @param {string} text - Prompt text, optionally already ending in a tag.
 * @param {string} tag - Tag to append, e.g. "<sad>".
 * @returns {string} The prompt ending in exactly the given tag.
 */
function withEmotionTag(text, tag) {
  const withoutTrailingTag = text.replace(/\s*<[^>]+>\s*$/u, "");
  const base = withoutTrailingTag.trim();
  return [base, tag].join(" ");
}
67
+
68
+ export default function App() {
69
+ const worker = useRef(null);
70
+ const runtimeReadyRef = useRef(false);
71
+ const loadedDtypesRef = useRef([]);
72
+
73
+ const [selectedVoice, setSelectedVoice] = useState("Hindi (Female)");
74
+ const [inputText, setInputText] = useState(LANGUAGES[0][1]);
75
+ const [dtype, setDtype] = useState("q4f16");
76
+ const [status, setStatus] = useState(null);
77
+ const [error, setError] = useState(null);
78
+ const [runtimeReady, setRuntimeReady] = useState(false);
79
+ const [loadingDtype, setLoadingDtype] = useState(null);
80
+ const [loadedDtypes, setLoadedDtypes] = useState([]);
81
+ const [loadingMessage, setLoadingMessage] = useState(
82
+ "Detecting WebGPU support...",
83
+ );
84
+ const [results, setResults] = useState([]);
85
+
86
+ const selectedLanguage = selectedVoice.split(" (")[0];
87
+ const selectedGender = selectedVoice.includes("(Male)") ? "Male" : "Female";
88
+ const languageDetail = LANGUAGE_DETAILS[selectedLanguage] ?? {
89
+ script: "Indic",
90
+ region: "South Asia",
91
+ };
92
+ const currentSample =
93
+ LANGUAGES.find(([lang]) => lang === selectedLanguage)?.[1] ?? inputText;
94
+ const currentDtype = DTYPES.find((entry) => entry.value === dtype) ?? DTYPES[0];
95
+ const isCurrentDtypeLoaded = loadedDtypes.includes(dtype);
96
+ const isLoadingCurrentDtype =
97
+ status === "loading" && loadingDtype === currentDtype.value;
98
+
99
+ const promptChips = useMemo(
100
+ () => [
101
+ { label: "Sample line", value: currentSample },
102
+ { label: "Sample + <sad>", value: withEmotionTag(currentSample, "<sad>") },
103
+ {
104
+ label: "Sample + <happy>",
105
+ value: withEmotionTag(currentSample, "<happy>"),
106
+ },
107
+ ],
108
+ [currentSample],
109
+ );
110
+
111
+ useEffect(() => {
112
+ runtimeReadyRef.current = runtimeReady;
113
+ }, [runtimeReady]);
114
+
115
+ useEffect(() => {
116
+ loadedDtypesRef.current = loadedDtypes;
117
+ }, [loadedDtypes]);
118
+
119
+ useEffect(() => {
120
+ worker.current ??= new Worker(new URL("./worker.js", import.meta.url), {
121
+ type: "module",
122
+ });
123
+
124
+ const onMessageReceived = (e) => {
125
+ switch (e.data.status) {
126
+ case "feature-success":
127
+ runtimeReadyRef.current = true;
128
+ setRuntimeReady(true);
129
+ setError(null);
130
+ setStatus("idle");
131
+ setLoadingMessage(
132
+ "WebGPU is available. Load a model when you want to start local synthesis.",
133
+ );
134
+ break;
135
+ case "feature-error":
136
+ setError(e.data.data);
137
+ break;
138
+ case "loading":
139
+ setError(null);
140
+ if (loadedDtypesRef.current.includes(e.data.dtype)) {
141
+ setLoadingDtype(null);
142
+ setStatus("running");
143
+ } else {
144
+ setLoadingDtype(e.data.dtype);
145
+ setLoadingMessage(
146
+ e.data.dtype === "q8"
147
+ ? "Loading q8 weights (~4.32 GB, sharded). First run can take a minute..."
148
+ : "Loading q4f16 weights (~1.95 GB). First run downloads once, then stays cached...",
149
+ );
150
+ setStatus("loading");
151
+ }
152
+ break;
153
+ case "ready":
154
+ setLoadingDtype(null);
155
+ setError(null);
156
+ setLoadedDtypes((prev) => {
157
+ if (prev.includes(e.data.dtype)) return prev;
158
+ const next = [...prev, e.data.dtype];
159
+ loadedDtypesRef.current = next;
160
+ return next;
161
+ });
162
+ setStatus("ready");
163
+ break;
164
+ case "complete":
165
+ setResults((prev) => [
166
+ {
167
+ text: e.data.text,
168
+ src: e.data.audio,
169
+ voice: e.data.voice,
170
+ dtype: e.data.dtype,
171
+ createdAt: new Date().toLocaleTimeString([], {
172
+ hour: "numeric",
173
+ minute: "2-digit",
174
+ }),
175
+ },
176
+ ...prev,
177
+ ]);
178
+ setError(null);
179
+ setStatus("ready");
180
+ break;
181
+ case "error":
182
+ setLoadingDtype(null);
183
+ setError(e.data.data);
184
+ setStatus(
185
+ loadedDtypesRef.current.includes(e.data.dtype)
186
+ ? "ready"
187
+ : runtimeReadyRef.current
188
+ ? "idle"
189
+ : null,
190
+ );
191
+ break;
192
+ }
193
+ };
194
+
195
+ worker.current.addEventListener("message", onMessageReceived);
196
+ worker.current.addEventListener("error", (event) => console.error(event));
197
+
198
+ return () => {
199
+ worker.current.removeEventListener("message", onMessageReceived);
200
+ };
201
+ }, []);
202
+
203
+ const handleSubmit = (event) => {
204
+ event.preventDefault();
205
+ if (!isCurrentDtypeLoaded) return;
206
+ setStatus("running");
207
+ setError(null);
208
+ worker.current.postMessage({
209
+ type: "generate",
210
+ text: inputText.trim(),
211
+ speaker_id: selectedVoice,
212
+ dtype,
213
+ });
214
+ };
215
+
216
+ const handleLoadModel = () => {
217
+ if (!runtimeReady || isCurrentDtypeLoaded) return;
218
+ setError(null);
219
+ setLoadingDtype(dtype);
220
+ setLoadingMessage(
221
+ dtype === "q8"
222
+ ? "Loading q8 weights (~4.32 GB, sharded). First run can take a minute..."
223
+ : "Loading q4f16 weights (~1.95 GB). First run downloads once, then stays cached...",
224
+ );
225
+ setStatus("loading");
226
+ worker.current?.postMessage({ type: "preload", dtype });
227
+ };
228
+
229
+ const onLanguageChange = (lang) => {
230
+ const sample = LANGUAGES.find(([entry]) => entry === lang)?.[1] ?? inputText;
231
+ setInputText(sample);
232
+ setSelectedVoice(`${lang} (Female)`);
233
+ };
234
+
235
+ const onDtypeChange = (next) => {
236
+ if (next === dtype) return;
237
+ setDtype(next);
238
+ setError(null);
239
+ setLoadingDtype(null);
240
+ setStatus(loadedDtypesRef.current.includes(next) ? "ready" : "idle");
241
+ };
242
+
243
+ let statusHeadline = "Checking browser runtime";
244
+ let statusBody = loadingMessage;
245
+
246
+ if (error) {
247
+ statusHeadline = runtimeReady ? "Load issue" : "Startup issue";
248
+ statusBody = error;
249
+ } else if (status === "running") {
250
+ statusHeadline = "Rendering speech locally";
251
+ statusBody = `Synthesizing with ${selectedVoice} on ${currentDtype.label}.`;
252
+ } else if (status === "loading") {
253
+ statusHeadline = "Loading model weights";
254
+ } else if (runtimeReady && !isCurrentDtypeLoaded) {
255
+ statusHeadline = "Ready to load model";
256
+ statusBody = `${currentDtype.label} is a one-time ${currentDtype.note.split("·")[0].trim()} download. Tap Load model to cache it in this browser.`;
257
+ } else if (isCurrentDtypeLoaded) {
258
+ statusHeadline = "Model ready in this browser";
259
+ statusBody = `${selectedVoice} is ready on ${currentDtype.label}. Everything runs locally after the one-time model download.`;
260
+ }
261
+
262
+ const statusActivityLabel = status === "running"
263
+ ? "Generating audio..."
264
+ : status === "loading"
265
+ ? "Loading in the background"
266
+ : null;
267
+ const statusCardBusy = !error && (
268
+ status === "loading" || status === "running" || status === null
269
+ );
270
+ const loadButtonLabel = isLoadingCurrentDtype
271
+ ? `Loading ${currentDtype.label}...`
272
+ : `Load ${currentDtype.label}`;
273
+
274
+ return (
275
+ <div className="app-shell">
276
+ <div className="ornament ornament-top" aria-hidden="true">
277
+ <img src="/warli-strip.svg" alt="" />
278
+ </div>
279
+
280
+ <main className="app-main">
281
+ <header className="hero">
282
+ <span className="hero-kicker">Svāra TTS · WebGPU</span>
283
+ <h1 className="hero-title">Svāra</h1>
284
+ <span className="hero-subline">स्वरा · Indic text-to-speech in the browser</span>
285
+ <p className="hero-copy">
286
+ A warmer frontend for the same local synthesis engine: 19 languages,
287
+ 38 voices, SNAC decoding, and no server round-trip once the model is
288
+ cached in this browser.
289
+ </p>
290
+ <div className="hero-links">
291
+ <a
292
+ href="https://huggingface.co/kenpath/svara-tts-v1"
293
+ target="_blank"
294
+ rel="noreferrer"
295
+ >
296
+ Base model
297
+ </a>
298
+ <a
299
+ href="https://huggingface.co/shreyask/svara-tts-v1-ONNX"
300
+ target="_blank"
301
+ rel="noreferrer"
302
+ >
303
+ ONNX export
304
+ </a>
305
+ <a
306
+ href="https://huggingface.co/onnx-community/snac_24khz-ONNX"
307
+ target="_blank"
308
+ rel="noreferrer"
309
+ >
310
+ SNAC codec
311
+ </a>
312
+ </div>
313
+ </header>
314
+
315
+ <section
316
+ className={`card status-card ${statusCardBusy ? "is-busy" : ""}`}
317
+ >
318
+ <div className="status-main">
319
+ <p className="section-kicker">Session</p>
320
+ <h2>{statusHeadline}</h2>
321
+ <p className={`status-copy ${error ? "is-error" : ""}`}>
322
+ {statusBody}
323
+ </p>
324
+ {statusActivityLabel && !error && (
325
+ <div className="inline-loader" aria-hidden="true">
326
+ <span className="inline-loader-dot"></span>
327
+ <span className="inline-loader-label">{statusActivityLabel}</span>
328
+ </div>
329
+ )}
330
+ {runtimeReady && !isCurrentDtypeLoaded && status !== "loading" && (
331
+ <div className="model-gate">
332
+ <div>
333
+ <p className="model-gate-copy">
334
+ Model load is explicit in this build.
335
+ </p>
336
+ <span className="model-gate-sub">
337
+ {loadedDtypes.length > 0
338
+ ? `Cached here: ${loadedDtypes.join(", ")}`
339
+ : "Nothing cached in this browser session yet."}
340
+ </span>
341
+ </div>
342
+ <button
343
+ type="button"
344
+ className="primary-button load-button"
345
+ onClick={handleLoadModel}
346
+ disabled={!runtimeReady || status === "running"}
347
+ >
348
+ {loadButtonLabel}
349
+ </button>
350
+ </div>
351
+ )}
352
+ </div>
353
+ <div className="pill-row">
354
+ <span className="pill">19 languages</span>
355
+ <span className="pill">38 voices</span>
356
+ <span className="pill">24 kHz mono</span>
357
+ <span className="pill">Runs locally</span>
358
+ </div>
359
+ </section>
360
+
361
+ <div className="workspace">
362
+ <section className="card composer-card">
363
+ <div className="card-header">
364
+ <div>
365
+ <p className="section-kicker">Compose</p>
366
+ <h2>Switch language, adjust voice, synthesize</h2>
367
+ </div>
368
+ <button
369
+ type="button"
370
+ className="ghost-button"
371
+ onClick={() => setInputText(currentSample)}
372
+ >
373
+ Use sample
374
+ </button>
375
+ </div>
376
+
377
+ <form onSubmit={handleSubmit} className="composer-form">
378
+ <div className="control-grid">
379
+ <label className="field">
380
+ <span className="field-label">Language</span>
381
+ <select
382
+ value={selectedLanguage}
383
+ onChange={(event) => onLanguageChange(event.target.value)}
384
+ >
385
+ {LANGUAGES.map(([lang]) => (
386
+ <option key={lang} value={lang}>
387
+ {lang}
388
+ </option>
389
+ ))}
390
+ </select>
391
+ </label>
392
+
393
+ <label className="field">
394
+ <span className="field-label">Voice</span>
395
+ <select
396
+ value={selectedVoice}
397
+ onChange={(event) => setSelectedVoice(event.target.value)}
398
+ >
399
+ {VOICES.filter((voice) => voice.startsWith(`${selectedLanguage} (`)).map(
400
+ (voice) => (
401
+ <option key={voice} value={voice}>
402
+ {voice.split("(")[1].replace(")", "")}
403
+ </option>
404
+ ),
405
+ )}
406
+ </select>
407
+ </label>
408
+
409
+ <label className="field field-wide">
410
+ <span className="field-label">Quantization</span>
411
+ <select
412
+ value={dtype}
413
+ onChange={(event) => onDtypeChange(event.target.value)}
414
+ disabled={status === "running" || status === "loading"}
415
+ >
416
+ {DTYPES.map((entry) => (
417
+ <option key={entry.value} value={entry.value}>
418
+ {entry.label}
419
+ </option>
420
+ ))}
421
+ </select>
422
+ <small className="field-note">{currentDtype.note}</small>
423
+ </label>
424
+ </div>
425
+
426
+ <label className="field">
427
+ <div className="label-row">
428
+ <span className="field-label">Prompt</span>
429
+ <span className="field-meta">
430
+ {languageDetail.script} · {languageDetail.region}
431
+ </span>
432
+ </div>
433
+ <textarea
434
+ placeholder="Enter text in any supported Indian language..."
435
+ value={inputText}
436
+ onChange={(event) => setInputText(event.target.value)}
437
+ rows={Math.min(8, Math.max(4, inputText.split("\n").length))}
438
+ />
439
+ </label>
440
+
441
+ <div className="chip-bar">
442
+ {promptChips.map((chip) => (
443
+ <button
444
+ key={chip.label}
445
+ type="button"
446
+ className="utility-chip"
447
+ onClick={() => setInputText(chip.value)}
448
+ >
449
+ {chip.label}
450
+ </button>
451
+ ))}
452
+ </div>
453
+
454
+ <div className="composer-footer">
455
+ <p className="helper-copy">
456
+ Emotion tags can be appended at the end of the sentence, for
457
+ example <code>&lt;sad&gt;</code> or <code>&lt;happy&gt;</code>.
458
+ Use <code>q8</code> if you want the cleanest output and can
459
+ afford the larger one-time download.
460
+ </p>
461
+ <button
462
+ type="submit"
463
+ className="primary-button"
464
+ disabled={
465
+ status !== "ready" ||
466
+ !isCurrentDtypeLoaded ||
467
+ inputText.trim() === ""
468
+ }
469
+ >
470
+ {status === "running"
471
+ ? "Generating audio..."
472
+ : isLoadingCurrentDtype
473
+ ? `Loading ${currentDtype.label}...`
474
+ : !isCurrentDtypeLoaded
475
+ ? "Load model to continue"
476
+ : "Generate speech"}
477
+ </button>
478
+ </div>
479
+ </form>
480
+ </section>
481
+
482
+ <aside className="sidebar">
483
+ <section className="card inspector-card">
484
+ <p className="section-kicker">Inspector</p>
485
+ <h3 className="inspector-title">{selectedVoice}</h3>
486
+
487
+ <dl className="compact-meta-grid">
488
+ <div className="compact-meta">
489
+ <dt>Script</dt>
490
+ <dd>{languageDetail.script}</dd>
491
+ </div>
492
+ <div className="compact-meta">
493
+ <dt>Region</dt>
494
+ <dd>{languageDetail.region}</dd>
495
+ </div>
496
+ <div className="compact-meta">
497
+ <dt>Type</dt>
498
+ <dd>{selectedGender}</dd>
499
+ </div>
500
+ <div className="compact-meta">
501
+ <dt>Quant</dt>
502
+ <dd>{currentDtype.label}</dd>
503
+ </div>
504
+ </dl>
505
+
506
+ <div className="stack-chip-list">
507
+ {STACK_FACTS.map((fact) => (
508
+ <div key={fact.label} className="stack-chip">
509
+ <span>{fact.label}</span>
510
+ <strong>{fact.value}</strong>
511
+ </div>
512
+ ))}
513
+ </div>
514
+
515
+ <details className="debug-notes">
516
+ <summary>Usage notes</summary>
517
+ <ul className="note-list note-list-compact">
518
+ <li>Model and codec are browser-cached after the first load.</li>
519
+ <li>Short prompts are the best way to compare voices and quant levels.</li>
520
+ <li>The results archive below preserves each render with the actual voice used.</li>
521
+ </ul>
522
+ </details>
523
+ </section>
524
+ </aside>
525
+ </div>
526
+
527
+ {results.length > 0 && (
528
+ <section className="results-section">
529
+ <div className="results-header">
530
+ <div>
531
+ <p className="section-kicker">Archive</p>
532
+ <h2>Generated takes</h2>
533
+ </div>
534
+ <span className="results-meta">Newest first</span>
535
+ </div>
536
+
537
+ <div className="results-grid">
538
+ {results.map((result, index) => (
539
+ <motion.article
540
+ key={`${result.voice}-${result.createdAt}-${index}`}
541
+ initial={{ y: 24, opacity: 0 }}
542
+ animate={{ y: 0, opacity: 1 }}
543
+ transition={{ duration: 0.35, delay: index * 0.04 }}
544
+ className="card result-card"
545
+ >
546
+ <div className="result-head">
547
+ <div>
548
+ <h3>{result.voice}</h3>
549
+ <p>{result.createdAt}</p>
550
+ </div>
551
+ <span className="result-pill">{result.dtype}</span>
552
+ </div>
553
+ <p className="result-text">{result.text}</p>
554
+ <audio controls src={result.src} className="result-audio">
555
+ Your browser does not support the audio element.
556
+ </audio>
557
+ </motion.article>
558
+ ))}
559
+ </div>
560
+ </section>
561
+ )}
562
+ </main>
563
+
564
+ <div className="ornament ornament-bottom" aria-hidden="true">
565
+ <img src="/warli-strip.svg" alt="" />
566
+ </div>
567
+ </div>
568
+ );
569
+ }
src/assets/react.svg ADDED
src/index.css ADDED
@@ -0,0 +1,756 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ @tailwind base;
2
+ @tailwind components;
3
+ @tailwind utilities;
4
+
5
+ :root {
6
+ --bg: #faf3e7;
7
+ --bg-shadow: #efe2c6;
8
+ --card: rgba(242, 232, 211, 0.88);
9
+ --card-border: #d8c4a3;
10
+ --card-strong: #eadcc0;
11
+ --accent: #c34a19;
12
+ --accent-dark: #962f12;
13
+ --accent-soft: rgba(195, 74, 25, 0.12);
14
+ --ink: #2f1d12;
15
+ --muted: #816047;
16
+ --red: #862e2e;
17
+ --warli: #fff8ec;
18
+ --line: rgba(47, 29, 18, 0.1);
19
+ --shadow: 0 18px 48px rgba(90, 47, 16, 0.08);
20
+ color: var(--ink);
21
+ font-family: "Inter", "Segoe UI", sans-serif;
22
+ }
23
+
24
+ *,
25
+ *:before,
26
+ *:after {
27
+ box-sizing: border-box;
28
+ }
29
+
30
+ html,
31
+ body,
32
+ #root {
33
+ min-height: 100%;
34
+ }
35
+
36
+ body {
37
+ margin: 0;
38
+ background:
39
+ radial-gradient(circle at top, rgba(255, 248, 236, 0.85), transparent 32%),
40
+ radial-gradient(circle at bottom left, rgba(195, 74, 25, 0.08), transparent 28%),
41
+ linear-gradient(180deg, var(--bg), var(--bg-shadow));
42
+ color: var(--ink);
43
+ }
44
+
45
+ button,
46
+ select,
47
+ textarea,
48
+ audio {
49
+ font: inherit;
50
+ }
51
+
52
+ a {
53
+ color: var(--accent-dark);
54
+ text-decoration-color: rgba(150, 47, 18, 0.35);
55
+ text-underline-offset: 0.18em;
56
+ }
57
+
58
+ a:hover {
59
+ color: var(--accent);
60
+ }
61
+
62
+ #root {
63
+ position: relative;
64
+ }
65
+
66
+ .app-shell {
67
+ position: relative;
68
+ min-height: 100vh;
69
+ overflow: hidden;
70
+ }
71
+
72
+ .app-shell:before {
73
+ content: "";
74
+ position: fixed;
75
+ inset: 0;
76
+ background:
77
+ linear-gradient(90deg, rgba(255, 255, 255, 0.14), transparent 20%, transparent 80%, rgba(255, 255, 255, 0.12)),
78
+ radial-gradient(circle at 20% 15%, rgba(195, 74, 25, 0.08), transparent 18%);
79
+ mix-blend-mode: multiply;
80
+ pointer-events: none;
81
+ }
82
+
83
+ .app-main {
84
+ position: relative;
85
+ z-index: 1;
86
+ width: min(1160px, calc(100% - 32px));
87
+ margin: 0 auto;
88
+ padding: 36px 0 72px;
89
+ }
90
+
91
+ .ornament {
92
+ position: relative;
93
+ z-index: 1;
94
+ display: flex;
95
+ justify-content: center;
96
+ width: 100%;
97
+ }
98
+
99
+ .ornament img {
100
+ width: min(1600px, 100%);
101
+ height: auto;
102
+ display: block;
103
+ }
104
+
105
+ .ornament-top {
106
+ padding-top: 24px;
107
+ }
108
+
109
+ .ornament-bottom {
110
+ padding-bottom: 24px;
111
+ }
112
+
113
+ .hero {
114
+ text-align: center;
115
+ margin: 18px auto 28px;
116
+ max-width: 760px;
117
+ }
118
+
119
+ .hero-kicker,
120
+ .section-kicker {
121
+ display: inline-flex;
122
+ align-items: center;
123
+ gap: 8px;
124
+ margin: 0 0 10px;
125
+ color: var(--accent-dark);
126
+ font-size: 0.78rem;
127
+ font-weight: 700;
128
+ letter-spacing: 0.16em;
129
+ text-transform: uppercase;
130
+ }
131
+
132
+ .hero-title {
133
+ margin: 0;
134
+ color: var(--red);
135
+ font-family: "Yatra One", "Tiro Devanagari Marathi", serif;
136
+ font-size: clamp(3.6rem, 8vw, 5.8rem);
137
+ line-height: 0.92;
138
+ }
139
+
140
+ .hero-subline {
141
+ display: block;
142
+ margin-top: 10px;
143
+ color: var(--accent-dark);
144
+ font-family: "Tiro Devanagari Marathi", serif;
145
+ font-size: clamp(1.1rem, 2.4vw, 1.5rem);
146
+ }
147
+
148
+ .hero-copy {
149
+ margin: 18px auto 0;
150
+ max-width: 700px;
151
+ color: var(--muted);
152
+ font-size: 1.06rem;
153
+ line-height: 1.75;
154
+ }
155
+
156
+ .hero-links {
157
+ display: flex;
158
+ flex-wrap: wrap;
159
+ justify-content: center;
160
+ gap: 14px 18px;
161
+ margin-top: 18px;
162
+ font-size: 0.95rem;
163
+ }
164
+
165
+ .card {
166
+ background: var(--card);
167
+ border: 1px solid var(--card-border);
168
+ border-radius: 18px;
169
+ box-shadow: var(--shadow);
170
+ backdrop-filter: blur(12px);
171
+ }
172
+
173
+ .status-card,
174
+ .composer-card,
175
+ .info-card,
176
+ .result-card {
177
+ padding: 22px 24px;
178
+ }
179
+
180
+ .status-card h2,
181
+ .composer-card h2,
182
+ .info-card h2,
183
+ .results-header h2 {
184
+ margin: 0;
185
+ font-family: "Tiro Devanagari Marathi", serif;
186
+ font-size: 1.75rem;
187
+ line-height: 1.2;
188
+ }
189
+
190
+ .status-card {
191
+ display: grid;
192
+ grid-template-columns: minmax(0, 1fr) auto;
193
+ gap: 20px;
194
+ align-items: start;
195
+ margin-bottom: 22px;
196
+ }
197
+
198
+ .status-card.is-busy {
199
+ border-color: rgba(195, 74, 25, 0.35);
200
+ }
201
+
202
+ .status-main {
203
+ min-width: 0;
204
+ }
205
+
206
+ .status-copy {
207
+ margin: 10px 0 0;
208
+ color: var(--muted);
209
+ line-height: 1.7;
210
+ }
211
+
212
+ .status-copy.is-error {
213
+ color: var(--red);
214
+ }
215
+
216
+ .inline-loader {
217
+ display: inline-flex;
218
+ align-items: center;
219
+ gap: 10px;
220
+ margin-top: 14px;
221
+ color: var(--accent-dark);
222
+ font-size: 0.9rem;
223
+ font-weight: 600;
224
+ }
225
+
226
+ .inline-loader-dot {
227
+ width: 12px;
228
+ height: 12px;
229
+ border-radius: 999px;
230
+ background: linear-gradient(180deg, var(--accent), var(--accent-dark));
231
+ box-shadow: 0 0 0 0 rgba(195, 74, 25, 0.3);
232
+ animation: pulse-dot 1.6s ease-out infinite;
233
+ }
234
+
235
+ .inline-loader-label {
236
+ color: var(--muted);
237
+ font-weight: 500;
238
+ }
239
+
240
+ .model-gate {
241
+ display: flex;
242
+ justify-content: space-between;
243
+ align-items: center;
244
+ gap: 16px;
245
+ margin-top: 16px;
246
+ padding: 14px 16px;
247
+ border: 1px solid var(--card-border);
248
+ border-left: 3px solid var(--accent);
249
+ border-radius: 14px;
250
+ background: rgba(255, 248, 236, 0.72);
251
+ }
252
+
253
+ .model-gate-copy {
254
+ margin: 0;
255
+ color: var(--ink);
256
+ font-size: 0.95rem;
257
+ font-weight: 600;
258
+ }
259
+
260
+ .model-gate-sub {
261
+ display: block;
262
+ margin-top: 4px;
263
+ color: var(--muted);
264
+ font-size: 0.82rem;
265
+ line-height: 1.5;
266
+ }
267
+
268
+ .load-button {
269
+ flex: 0 0 auto;
270
+ min-width: 154px;
271
+ padding-inline: 16px;
272
+ }
273
+
274
+ .pill-row {
275
+ display: flex;
276
+ flex-wrap: wrap;
277
+ justify-content: flex-end;
278
+ gap: 10px;
279
+ }
280
+
281
+ .pill {
282
+ display: inline-flex;
283
+ align-items: center;
284
+ padding: 8px 12px;
285
+ border: 1px solid var(--card-border);
286
+ border-radius: 999px;
287
+ background: rgba(255, 248, 236, 0.9);
288
+ color: var(--accent-dark);
289
+ font-size: 0.86rem;
290
+ font-weight: 600;
291
+ }
292
+
293
+ .workspace {
294
+ display: grid;
295
+ grid-template-columns: minmax(0, 1.7fr) minmax(260px, 0.72fr);
296
+ gap: 20px;
297
+ align-items: start;
298
+ }
299
+
300
+ .card-header {
301
+ display: flex;
302
+ justify-content: space-between;
303
+ gap: 16px;
304
+ align-items: start;
305
+ }
306
+
307
+ .ghost-button,
308
+ .utility-chip {
309
+ border: 1px solid var(--card-border);
310
+ background: rgba(255, 248, 236, 0.82);
311
+ color: var(--ink);
312
+ cursor: pointer;
313
+ transition:
314
+ border-color 140ms ease,
315
+ background 140ms ease,
316
+ color 140ms ease,
317
+ transform 140ms ease;
318
+ }
319
+
320
+ .ghost-button {
321
+ border-radius: 999px;
322
+ padding: 8px 13px;
323
+ font-size: 0.92rem;
324
+ font-weight: 600;
325
+ }
326
+
327
+ .ghost-button:hover,
328
+ .utility-chip:hover {
329
+ background: var(--accent-soft);
330
+ border-color: var(--accent);
331
+ color: var(--accent-dark);
332
+ transform: translateY(-1px);
333
+ }
334
+
335
+ .composer-form {
336
+ margin-top: 18px;
337
+ }
338
+
339
+ .control-grid {
340
+ display: grid;
341
+ grid-template-columns: repeat(2, minmax(0, 1fr));
342
+ gap: 16px;
343
+ }
344
+
345
+ .field {
346
+ display: block;
347
+ }
348
+
349
+ .field-wide {
350
+ grid-column: 1 / -1;
351
+ }
352
+
353
+ .field-label {
354
+ display: block;
355
+ margin-bottom: 8px;
356
+ font-size: 0.9rem;
357
+ font-weight: 700;
358
+ color: var(--accent-dark);
359
+ }
360
+
361
+ .label-row {
362
+ display: flex;
363
+ justify-content: space-between;
364
+ align-items: baseline;
365
+ gap: 12px;
366
+ margin-bottom: 8px;
367
+ }
368
+
369
+ .field-meta,
370
+ .field-note {
371
+ color: var(--muted);
372
+ font-size: 0.84rem;
373
+ }
374
+
375
+ .field select,
376
+ .field textarea {
377
+ width: 100%;
378
+ border: 1px solid var(--card-border);
379
+ border-radius: 14px;
380
+ background: var(--warli);
381
+ color: var(--ink);
382
+ }
383
+
384
+ .field select {
385
+ min-height: 48px;
386
+ padding: 0 14px;
387
+ }
388
+
389
+ .field textarea {
390
+ min-height: 168px;
391
+ padding: 16px 18px;
392
+ resize: vertical;
393
+ line-height: 1.7;
394
+ font-size: 1.18rem;
395
+ font-family: "Inter", "Tiro Devanagari Marathi", "Noto Sans Devanagari", serif;
396
+ }
397
+
398
+ .field select:focus,
399
+ .field textarea:focus,
400
+ .primary-button:focus,
401
+ .ghost-button:focus,
402
+ .utility-chip:focus {
403
+ outline: 2px solid rgba(195, 74, 25, 0.36);
404
+ outline-offset: 2px;
405
+ }
406
+
407
+ .chip-bar {
408
+ display: flex;
409
+ flex-wrap: wrap;
410
+ gap: 10px;
411
+ margin-top: 14px;
412
+ }
413
+
414
+ .utility-chip {
415
+ border-radius: 999px;
416
+ padding: 7px 12px;
417
+ font-size: 0.9rem;
418
+ }
419
+
420
+ .composer-footer {
421
+ display: flex;
422
+ justify-content: space-between;
423
+ gap: 18px;
424
+ align-items: end;
425
+ margin-top: 18px;
426
+ }
427
+
428
+ .helper-copy {
429
+ margin: 0;
430
+ color: var(--muted);
431
+ max-width: 600px;
432
+ line-height: 1.7;
433
+ }
434
+
435
+ .helper-copy code {
436
+ background: rgba(255, 248, 236, 0.95);
437
+ border: 1px solid var(--card-border);
438
+ border-radius: 6px;
439
+ padding: 0.12rem 0.4rem;
440
+ font-family: ui-monospace, SFMono-Regular, Menlo, monospace;
441
+ font-size: 0.85rem;
442
+ color: var(--accent-dark);
443
+ }
444
+
445
+ .primary-button {
446
+ border: none;
447
+ border-radius: 14px;
448
+ padding: 13px 18px;
449
+ min-width: 180px;
450
+ background: linear-gradient(180deg, var(--accent), var(--accent-dark));
451
+ color: #fff;
452
+ font-size: 1rem;
453
+ font-weight: 700;
454
+ cursor: pointer;
455
+ box-shadow: 0 10px 20px rgba(150, 47, 18, 0.18);
456
+ transition:
457
+ transform 140ms ease,
458
+ box-shadow 140ms ease,
459
+ opacity 140ms ease;
460
+ }
461
+
462
+ .primary-button:hover:not(:disabled) {
463
+ transform: translateY(-1px);
464
+ box-shadow: 0 14px 24px rgba(150, 47, 18, 0.22);
465
+ }
466
+
467
+ .primary-button:disabled {
468
+ opacity: 0.56;
469
+ cursor: default;
470
+ }
471
+
472
+ .sidebar {
473
+ display: grid;
474
+ gap: 12px;
475
+ }
476
+
477
+ .inspector-card {
478
+ padding: 16px 18px;
479
+ }
480
+
481
+ .inspector-title {
482
+ margin: 4px 0 0;
483
+ font-family: "Tiro Devanagari Marathi", serif;
484
+ font-size: 1.85rem;
485
+ line-height: 1.1;
486
+ }
487
+
488
+ .compact-meta-grid {
489
+ display: grid;
490
+ grid-template-columns: repeat(2, minmax(0, 1fr));
491
+ gap: 10px;
492
+ margin: 14px 0 0;
493
+ }
494
+
495
+ .compact-meta {
496
+ padding: 10px 12px;
497
+ border: 1px solid var(--line);
498
+ border-radius: 12px;
499
+ background: rgba(255, 248, 236, 0.58);
500
+ }
501
+
502
+ .compact-meta dt {
503
+ margin: 0;
504
+ color: var(--muted);
505
+ font-size: 0.76rem;
506
+ font-weight: 700;
507
+ letter-spacing: 0.08em;
508
+ text-transform: uppercase;
509
+ }
510
+
511
+ .compact-meta dd {
512
+ margin: 6px 0 0;
513
+ color: var(--ink);
514
+ font-size: 1rem;
515
+ font-weight: 700;
516
+ line-height: 1.35;
517
+ }
518
+
519
+ .stack-chip-list {
520
+ display: grid;
521
+ gap: 8px;
522
+ margin-top: 12px;
523
+ }
524
+
525
+ .stack-chip {
526
+ display: flex;
527
+ justify-content: space-between;
528
+ gap: 10px;
529
+ align-items: baseline;
530
+ padding: 10px 12px;
531
+ border-radius: 12px;
532
+ background: rgba(255, 248, 236, 0.72);
533
+ border: 1px solid var(--line);
534
+ }
535
+
536
+ .stack-chip span {
537
+ color: var(--muted);
538
+ font-size: 0.8rem;
539
+ font-weight: 700;
540
+ letter-spacing: 0.08em;
541
+ text-transform: uppercase;
542
+ }
543
+
544
+ .stack-chip strong {
545
+ color: var(--ink);
546
+ font-size: 0.98rem;
547
+ font-weight: 700;
548
+ text-align: right;
549
+ }
550
+
551
+ .debug-notes {
552
+ margin-top: 12px;
553
+ border-top: 1px solid var(--line);
554
+ padding-top: 10px;
555
+ }
556
+
557
+ .debug-notes summary {
558
+ cursor: pointer;
559
+ color: var(--accent-dark);
560
+ font-size: 0.88rem;
561
+ font-weight: 700;
562
+ list-style: none;
563
+ }
564
+
565
+ .debug-notes summary::-webkit-details-marker {
566
+ display: none;
567
+ }
568
+
569
+ .debug-notes summary:after {
570
+ content: " +";
571
+ }
572
+
573
+ .debug-notes[open] summary:after {
574
+ content: " -";
575
+ }
576
+
577
+ .note-list {
578
+ margin: 16px 0 0;
579
+ padding: 0;
580
+ list-style: none;
581
+ }
582
+
583
+ .note-list li {
584
+ position: relative;
585
+ padding-left: 18px;
586
+ color: var(--muted);
587
+ line-height: 1.7;
588
+ }
589
+
590
+ .note-list li + li {
591
+ margin-top: 10px;
592
+ }
593
+
594
+ .note-list-compact {
595
+ margin-top: 10px;
596
+ }
597
+
598
+ .note-list-compact li {
599
+ font-size: 0.93rem;
600
+ line-height: 1.55;
601
+ }
602
+
603
+ .note-list li:before {
604
+ content: "";
605
+ position: absolute;
606
+ left: 0;
607
+ top: 0.78em;
608
+ width: 7px;
609
+ height: 7px;
610
+ border-radius: 50%;
611
+ background: var(--accent);
612
+ }
613
+
614
+ .results-section {
615
+ margin-top: 28px;
616
+ }
617
+
618
+ .results-header {
619
+ display: flex;
620
+ justify-content: space-between;
621
+ gap: 16px;
622
+ align-items: end;
623
+ margin-bottom: 14px;
624
+ }
625
+
626
+ .results-meta {
627
+ color: var(--muted);
628
+ font-size: 0.92rem;
629
+ }
630
+
631
+ .results-grid {
632
+ display: grid;
633
+ gap: 14px;
634
+ }
635
+
636
+ .result-head {
637
+ display: flex;
638
+ justify-content: space-between;
639
+ gap: 16px;
640
+ align-items: start;
641
+ }
642
+
643
+ .result-head h3 {
644
+ margin: 0;
645
+ font-family: "Tiro Devanagari Marathi", serif;
646
+ font-size: 1.28rem;
647
+ }
648
+
649
+ .result-head p {
650
+ margin: 6px 0 0;
651
+ color: var(--muted);
652
+ font-size: 0.88rem;
653
+ }
654
+
655
+ .result-pill {
656
+ display: inline-flex;
657
+ align-items: center;
658
+ padding: 7px 10px;
659
+ border-radius: 999px;
660
+ background: var(--accent-soft);
661
+ color: var(--accent-dark);
662
+ font-size: 0.8rem;
663
+ font-weight: 700;
664
+ }
665
+
666
+ .result-text {
667
+ margin: 14px 0;
668
+ font-size: 1.02rem;
669
+ line-height: 1.72;
670
+ }
671
+
672
+ .result-audio {
673
+ width: 100%;
674
+ }
675
+
676
+ @keyframes pulse-dot {
677
+ 0% {
678
+ transform: scale(0.92);
679
+ box-shadow: 0 0 0 0 rgba(195, 74, 25, 0.24);
680
+ }
681
+
682
+ 45% {
683
+ transform: scale(1);
684
+ box-shadow: 0 0 0 10px rgba(195, 74, 25, 0);
685
+ }
686
+
687
+ 100% {
688
+ transform: scale(0.92);
689
+ box-shadow: 0 0 0 0 rgba(195, 74, 25, 0);
690
+ }
691
+ }
692
+
693
+ @media (max-width: 960px) {
694
+ .workspace {
695
+ grid-template-columns: 1fr;
696
+ }
697
+
698
+ .status-card {
699
+ grid-template-columns: 1fr;
700
+ }
701
+
702
+ .model-gate {
703
+ flex-direction: column;
704
+ align-items: stretch;
705
+ }
706
+
707
+ .load-button {
708
+ width: 100%;
709
+ }
710
+
711
+ .pill-row {
712
+ justify-content: flex-start;
713
+ }
714
+ }
715
+
716
+ @media (max-width: 720px) {
717
+ .app-main {
718
+ width: min(100%, calc(100% - 20px));
719
+ padding-top: 24px;
720
+ }
721
+
722
+ .status-card,
723
+ .composer-card,
724
+ .info-card,
725
+ .result-card {
726
+ padding: 18px;
727
+ }
728
+
729
+ .card-header,
730
+ .composer-footer,
731
+ .results-header {
732
+ flex-direction: column;
733
+ align-items: stretch;
734
+ }
735
+
736
+ .control-grid {
737
+ grid-template-columns: 1fr;
738
+ }
739
+
740
+ .compact-meta-grid {
741
+ grid-template-columns: 1fr;
742
+ }
743
+
744
+ .field-wide {
745
+ grid-column: auto;
746
+ }
747
+
748
+ .hero-copy,
749
+ .helper-copy {
750
+ font-size: 0.96rem;
751
+ }
752
+
753
+ .primary-button {
754
+ width: 100%;
755
+ }
756
+ }
src/main.jsx ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ import { StrictMode } from "react";
2
+ import { createRoot } from "react-dom/client";
3
+ import "./index.css";
4
+ import App from "./App.jsx";
5
+
6
+ createRoot(document.getElementById("root")).render(
7
+ <StrictMode>
8
+ <App />
9
+ </StrictMode>,
10
+ );
src/worker.js ADDED
@@ -0,0 +1,626 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Svara TTS WebGPU worker.
2
+ //
3
+ // Architecture:
4
+ // 1) Llama-3.2-3B causal LM (loaded via @huggingface/transformers v4) emits
5
+ // audio token IDs in the range [128266, 156938).
6
+ // 2) We group every 7-token bundle into a SNAC frame.
7
+ // 3) Offline decode mirrors Kenpath's streaming path: decode a sliding
8
+ // 4-frame SNAC window and keep samples [2048:4096] from each window.
9
+ // That matches the codec's stable synthesis region and avoids the
10
+ // "behind a fan" smear seen when decoding the whole sequence in one shot.
11
+
12
+ import {
13
+ AutoTokenizer,
14
+ AutoModelForCausalLM,
15
+ LogitsProcessor,
16
+ LogitsProcessorList,
17
+ Tensor,
18
+ } from "@huggingface/transformers";
19
+ import * as ort from "onnxruntime-web/webgpu";
20
+
21
+ // ORT-Web's .wasm/.mjs files aren't served by Vite by default; vite.config.js
22
+ // copies them from node_modules to /ort-wasm/ via vite-plugin-static-copy.
23
+ ort.env.wasm.wasmPaths = "/ort-wasm/";
24
+
25
// --- WebGPU feature detection -----------------------------------------------
// Runs once at module load. Posts "feature-success" (with the shader-f16
// capability) or "feature-error" to the page, and rethrows on failure so the
// rest of the worker module does not initialise.
let fp16_supported = false;
try {
  // `navigator.gpu` is undefined in browsers without WebGPU. Optional chaining
  // lets that case reach the descriptive error below instead of surfacing as
  // "Cannot read properties of undefined (reading 'requestAdapter')".
  const adapter = await navigator.gpu?.requestAdapter();
  if (!adapter) throw new Error("WebGPU is not supported (no adapter found)");
  fp16_supported = adapter.features.has("shader-f16");
  self.postMessage({ status: "feature-success", fp16: fp16_supported });
} catch (e) {
  self.postMessage({ status: "feature-error", data: e.toString() });
  throw e;
}
36
+
37
// --- Constants matching upstream Svara inference -----------------------------
// Control token ids used to frame the prompt and the generated stream.
const EOT = 128009; // closes the text body of the prompt
const SOS = 128257; // START_OF_SPEECH
const EOS = 128258; // END_OF_SPEECH
const SOH = 128259; // first prompt token
const EOH = 128260; // last prompt token
const SOAI = 128261; // first token the model is expected to emit

// Audio tokens occupy [AUDIO_OFFSET, AUDIO_END): 7 bands of 4096 codes each.
const AUDIO_OFFSET = 128266;
const AUDIO_END = AUDIO_OFFSET + 7 * 4096;

// Sliding-window decode: 4 frames per window, keep samples [2048, 4096).
const WINDOW_FRAMES = 4;
const WINDOW_AUDIO_START = 2048;
const WINDOW_AUDIO_END = 4096;
const SAMPLE_RATE = 24000;

// Model repositories and the LM quantizations this worker accepts.
const SVARA_REPO = "shreyask/svara-tts-v1-ONNX";
const SNAC_REPO = "onnx-community/snac_24khz-ONNX";
const SUPPORTED_DTYPES = new Set(["q4f16", "q8"]);
52
+
53
// Lazy load the tokenizer once -- it's the same across dtypes.
let tokenizerPromise = null;

/**
 * Returns the shared tokenizer promise, starting the load on first use.
 *
 * A failed load is evicted from the cache so a later call (for example the
 * user pressing "Load model" again after a network error) starts a fresh
 * download instead of replaying the same rejection forever.
 *
 * @returns {Promise<object>} Promise for the tokenizer of SVARA_REPO.
 */
function getTokenizer() {
  if (tokenizerPromise === null) {
    const pending = AutoTokenizer.from_pretrained(SVARA_REPO);
    tokenizerPromise = pending;
    // The caller still observes the rejection through `pending`; this handler
    // only clears the cache slot, and only if it still holds this attempt.
    pending.catch(() => {
      if (tokenizerPromise === pending) tokenizerPromise = null;
    });
  }
  return tokenizerPromise;
}
58
+
59
// SNAC decoder is small (~26 MB at fp16); load once, share across LM dtypes.
let snacPromise = null;

/**
 * Returns the shared SNAC decoder session promise, creating it on first use.
 *
 * The fp16 decoder file is selected when the adapter reported shader-f16
 * support. A failed creation is evicted from the cache so the next call
 * retries instead of returning the same rejected promise.
 *
 * @returns {Promise<object>} Promise for an ORT InferenceSession on WebGPU.
 */
function getSnac() {
  if (snacPromise === null) {
    const pending = (async () => {
      const url = `https://huggingface.co/${SNAC_REPO}/resolve/main/onnx/decoder_model${fp16_supported ? "_fp16" : ""}.onnx`;
      return ort.InferenceSession.create(url, { executionProviders: ["webgpu"] });
    })();
    snacPromise = pending;
    // Only clear the slot if it still holds this attempt.
    pending.catch(() => {
      if (snacPromise === pending) snacPromise = null;
    });
  }
  return snacPromise;
}
67
+
68
// LM is per-dtype. Cache by dtype string so switching back is instant.
const lmCache = new Map();

/**
 * Returns the cached load promise for the language model at `dtype`,
 * starting the load on first request for that dtype.
 *
 * A rejected load is removed from the cache so the same dtype can be retried;
 * otherwise one transient download failure would poison it for the lifetime
 * of the worker.
 *
 * @param {string} dtype - Quantization to load ("q4f16" or "q8").
 * @returns {Promise<object>} Promise for the loaded causal LM.
 */
function getLM(dtype) {
  const cached = lmCache.get(dtype);
  if (cached !== undefined) return cached;

  const pending = AutoModelForCausalLM.from_pretrained(SVARA_REPO, {
    dtype,
    device: "webgpu",
    // Number of external data chunks to fetch alongside the .onnx graph.
    // q4f16 is one .onnx_data file; q8 is sharded into 3 chunks
    // (.onnx_data, _data_1, _data_2) to stay under the ~2 GB browser
    // ArrayBuffer ceiling. transformers.js v4 accepts a number here per
    // its types: `false` | `true` (=1) | <number of chunks>.
    use_external_data_format: dtype === "q8" ? 3 : true,
  });
  lmCache.set(dtype, pending);
  // Only evict if the slot still holds this attempt.
  pending.catch(() => {
    if (lmCache.get(dtype) === pending) lmCache.delete(dtype);
  });
  return pending;
}
88
+
89
// --- Generation guards ------------------------------------------------------
// Svara should only emit 7-band audio tokens followed by END_OF_SPEECH. If we
// let the sampler wander into the text/control vocab, the rest of the clip
// turns phasey/robotic because frame alignment is lost.

/**
 * Masks logits so each sequence can only produce SOAI, then SOS, then audio
 * tokens or EOS. Logits are modified in place.
 */
class SvaraLogitsProcessor extends LogitsProcessor {
  /**
   * @param {number} promptLength - Number of prompt tokens; used to work out
   *   how many tokens have been generated so far.
   */
  constructor(promptLength) {
    super();
    this.promptLength = promptLength;
  }

  /**
   * @param {Array} inputIds - One token sequence per batch row.
   * @param {object} logits - Indexable by row; each row exposes `.data`.
   * @returns {object} The same `logits` object after masking.
   */
  _call(inputIds, logits) {
    for (let row = 0; row < inputIds.length; row += 1) {
      const scores = logits[row].data;
      const generatedSoFar = inputIds[row].length - this.promptLength;

      // The first two generated tokens are forced: SOAI, then SOS.
      if (generatedSoFar === 0 || generatedSoFar === 1) {
        const forcedToken = generatedSoFar === 0 ? SOAI : SOS;
        scores.fill(-Infinity);
        scores[forcedToken] = 0;
        continue;
      }

      // Afterwards only the audio range survives, plus EOS, which sits below
      // AUDIO_OFFSET and therefore has to be saved and restored.
      const savedEos = scores[EOS];
      scores.fill(-Infinity, 0, AUDIO_OFFSET);
      scores.fill(-Infinity, AUDIO_END);
      scores[EOS] = savedEos;
    }
    return logits;
  }
}
124
+
125
/**
 * Wraps a single SvaraLogitsProcessor in the list type that generation takes.
 *
 * @param {number} promptLength - Prompt token count for this request.
 * @returns {LogitsProcessorList} List holding one Svara processor.
 */
function buildLogitsProcessor(promptLength) {
  const processors = new LogitsProcessorList();
  const svaraGuard = new SvaraLogitsProcessor(promptLength);
  processors.push(svaraGuard);
  return processors;
}
130
+
131
/**
 * Heuristic cap on how many audio tokens a piece of text may generate.
 *
 * The estimate grows with visible graphemes, words and punctuation groups of
 * the spoken text (trailing emotion tag excluded), is clamped to [224, 1120],
 * and is rounded up to a whole number of 7-token frames.
 *
 * @param {string} text - Prompt text, optionally ending in an emotion tag.
 * @returns {number} Token budget, always a multiple of 7.
 */
function estimateAudioTokenBudget(text) {
  const spoken = stripTrailingEmotionTag(text);

  const graphemes = countChunkGraphemes(spoken);
  const words = countChunkWords(spoken);
  const punctuationRuns = [...spoken.matchAll(/[.,!?;:।॥…\-—]+/gu)].length;

  const estimate = graphemes * 12 + words * 20 + punctuationRuns * 28 + 84;
  const bounded = Math.min(1120, estimate) < 224 ? 224 : Math.min(1120, estimate);
  const frames = Math.ceil(bounded / 7);
  return frames * 7;
}
146
+
147
/**
 * Extracts an angle-bracket tag sitting at the very end of the text.
 *
 * @param {string} text - Raw prompt text.
 * @returns {string} The tag including its brackets, or "" when absent.
 */
function getTrailingEmotionTag(text) {
  const found = text.match(/\s*(<[^>]+>)\s*$/u);
  if (found === null) return "";
  return found[1] ?? "";
}
150
+
151
/**
 * Removes a trailing angle-bracket tag and trims surrounding whitespace.
 *
 * @param {string} text - Raw prompt text.
 * @returns {string} The text without its trailing tag.
 */
function stripTrailingEmotionTag(text) {
  const withoutTag = text.replace(/\s*<[^>]+>\s*$/u, "");
  return withoutTag.trim();
}
154
+
155
/**
 * Normalizes punctuation and spacing before the text is split into chunks.
 *
 * Runs of dots, ellipsis characters and dashes become commas, whitespace is
 * collapsed, and each punctuation mark is followed by exactly one space.
 *
 * @param {string} text - Spoken text (emotion tag already removed).
 * @returns {string} Normalized, trimmed text.
 */
function normalizeTextForSvara(text) {
  // Order matters: later rules operate on the output of earlier ones.
  const rules = [
    [/\.{2,}/gu, ","],
    [/…+/gu, ","],
    [/[—–]+/gu, ","],
    [/\s+/gu, " "],
    [/\s*([,.;!?।॥])\s*/gu, "$1 "],
  ];
  let normalized = text;
  for (const [pattern, replacement] of rules) {
    normalized = normalized.replace(pattern, replacement);
  }
  return normalized.trim();
}
164
+
165
/**
 * Counts grapheme clusters in `chunk` that contain a non-whitespace character.
 *
 * @param {string} chunk - Text to measure.
 * @returns {number} Number of visible graphemes.
 */
function countChunkGraphemes(chunk) {
  const segmenter = new Intl.Segmenter(undefined, { granularity: "grapheme" });
  let visible = 0;
  for (const { segment } of segmenter.segment(chunk)) {
    if (/\S/u.test(segment)) visible += 1;
  }
  return visible;
}
171
+
172
/**
 * Counts whitespace-separated words in `chunk`.
 *
 * @param {string} chunk - Text to measure.
 * @returns {number} Number of maximal non-whitespace runs.
 */
function countChunkWords(chunk) {
  const words = chunk.match(/\S+/gu);
  return words === null ? 0 : words.length;
}
175
+
176
/**
 * Splits an over-long sentence at its commas.
 *
 * A chunk is left alone when it has at most 28 visible graphemes or at most
 * 5 words, or when it contains no comma to split on.
 *
 * @param {string} chunk - One sentence-level chunk.
 * @returns {string[]} The comma-separated clauses, or `[chunk]` unchanged.
 */
function splitLongChunk(chunk) {
  const isCompact = countChunkGraphemes(chunk) <= 28 || countChunkWords(chunk) <= 5;
  if (isCompact) return [chunk];

  const clauses = chunk
    .split(/\s*,\s*/u)
    .map((clause) => clause.trim())
    .filter((clause) => clause.length > 0);

  if (clauses.length <= 1) return [chunk];
  return clauses;
}
185
+
186
/**
 * Glues short chunks onto the chunk before them.
 *
 * A chunk is attached (joined with ", ") when it has fewer than 10 visible
 * graphemes or fewer than 3 words, and the previous output chunk does not
 * already end in a sentence terminator.
 *
 * @param {string[]} chunks - Chunks in reading order.
 * @returns {string[]} A new array with tiny chunks merged backwards.
 */
function mergeTinyChunks(chunks) {
  return chunks.reduce((output, piece) => {
    const previous = output.at(-1);
    const isTiny = countChunkGraphemes(piece) < 10 || countChunkWords(piece) < 3;
    const previousIsOpen = output.length > 0 && !/[.!?।॥]$/u.test(previous);

    if (previousIsOpen && isTiny) {
      output[output.length - 1] = `${previous}, ${piece}`;
    } else {
      output.push(piece);
    }
    return output;
  }, []);
}
207
+
208
/**
 * Splits text into sentence-level chunks for separate synthesis.
 *
 * Sentences are cut at terminators, long ones are further split at commas,
 * tiny pieces are merged backwards, and a trailing emotion tag (if any) is
 * re-attached to the last chunk only.
 *
 * @param {string} text - Raw prompt text, optionally ending in an emotion tag.
 * @returns {string[]} Chunks in reading order; empty when nothing is speakable.
 */
function splitTextForSvara(text) {
  const tag = getTrailingEmotionTag(text);
  const spoken = normalizeTextForSvara(stripTrailingEmotionTag(text));
  if (!spoken) return [];

  const sentences = spoken.match(/[^.!?।॥]+[.!?।॥]?/gu);
  let pieces = [];
  if (sentences) {
    pieces = sentences
      .map((sentence) => sentence.trim())
      .filter(Boolean)
      .flatMap(splitLongChunk);
  }
  const chunks = mergeTinyChunks(pieces);

  if (!tag) return chunks;
  const lastIndex = chunks.length - 1;
  return chunks.map((chunk, index) => (index === lastIndex ? `${chunk} ${tag}` : chunk));
}
227
+
228
/**
 * Merges tiny chunks forward into the chunk that follows them.
 *
 * A chunk is tiny when it has fewer than 10 visible graphemes AND fewer than
 * 3 words. A tiny chunk is prefixed (joined with ", ") onto the next chunk;
 * a tiny chunk at the very end is appended to the previous output chunk
 * instead, or kept as-is when it is the only chunk.
 *
 * The input array is never modified: the pending prefix is carried in a local
 * rather than written back into `chunks`, so frozen or shared arrays are safe
 * to pass.
 *
 * @param {string[]} chunks - Chunks in reading order.
 * @returns {string[]} A new array with tiny chunks merged.
 */
function mergeTinyLeadingChunks(chunks) {
  const merged = [];
  // Text of a tiny chunk waiting to be prefixed onto the next one. `null`
  // (not "") means nothing is pending, so empty-string chunks still carry.
  let carried = null;

  for (let i = 0; i < chunks.length; i++) {
    const chunk = carried === null ? chunks[i] : `${carried}, ${chunks[i]}`;
    carried = null;

    // Size is measured on the combined text, so several tiny chunks in a row
    // keep accumulating until the result is big enough.
    const isTiny = countChunkGraphemes(chunk) < 10 && countChunkWords(chunk) < 3;

    if (isTiny) {
      if (i + 1 < chunks.length) {
        carried = chunk;
        continue;
      }
      if (merged.length > 0) {
        merged[merged.length - 1] = `${merged.at(-1)}, ${chunk}`;
        continue;
      }
    }

    merged.push(chunk);
  }

  return merged;
}
252
+
253
/**
 * Splits text at sentence terminators and then at every comma, merging tiny
 * clauses forward within each sentence.
 *
 * A trailing emotion tag, if present, is re-attached to the last chunk only.
 *
 * @param {string} text - Raw prompt text, optionally ending in an emotion tag.
 * @returns {string[]} Chunks in reading order; empty when nothing is speakable.
 */
function splitEmotionSafeTextForSvara(text) {
  const tag = getTrailingEmotionTag(text);
  const spoken = normalizeTextForSvara(stripTrailingEmotionTag(text));
  if (!spoken) return [];

  // Breaks one sentence into comma clauses and merges the tiny ones.
  const splitSentence = (sentence) => {
    const clauses = sentence
      .split(/\s*,\s*/u)
      .map((clause) => clause.trim())
      .filter(Boolean);
    return mergeTinyLeadingChunks(clauses);
  };

  const sentences = spoken.match(/[^.!?।॥]+[.!?।॥]?/gu);
  let chunks = [];
  if (sentences) {
    chunks = sentences
      .map((sentence) => sentence.trim())
      .filter(Boolean)
      .flatMap((sentence) => splitSentence(sentence));
  }

  if (!tag) return chunks;
  const lastIndex = chunks.length - 1;
  return chunks.map((chunk, index) => (index === lastIndex ? `${chunk} ${tag}` : chunk));
}
276
+
277
/**
 * Splits the whole text at commas only (sentence terminators are ignored),
 * merging tiny clauses forward.
 *
 * A trailing emotion tag, if present, is re-attached to the last chunk only.
 *
 * @param {string} text - Raw prompt text, optionally ending in an emotion tag.
 * @returns {string[]} Chunks in reading order; empty when nothing is speakable.
 */
function splitFinalEmotionClauseTextForSvara(text) {
  const tag = getTrailingEmotionTag(text);
  const spoken = normalizeTextForSvara(stripTrailingEmotionTag(text));
  if (!spoken) return [];

  const clauses = spoken
    .split(/\s*,\s*/u)
    .map((clause) => clause.trim())
    .filter(Boolean);
  const chunks = mergeTinyLeadingChunks(clauses);

  if (!tag) return chunks;
  const lastIndex = chunks.length - 1;
  return chunks.map((chunk, index) => (index === lastIndex ? `${chunk} ${tag}` : chunk));
}
292
+
293
/**
 * Builds the ordered list of chunkings to try for one request.
 *
 * Order: the raw text as a single chunk, then the three splitting strategies,
 * then (only when the text ends in an emotion tag) tag-free fallbacks.
 * Empty and duplicate variants are removed; the first occurrence wins.
 *
 * @param {string} text - Raw prompt text, optionally ending in an emotion tag.
 * @returns {string[][]} Variants, each a list of chunks in reading order.
 */
function buildPromptVariants(text) {
  const rawText = text.trim();
  const spokenText = normalizeTextForSvara(stripTrailingEmotionTag(text));
  if (!rawText && !spokenText) return [];

  const variants = rawText ? [[rawText]] : [];
  variants.push(
    splitTextForSvara(text),
    splitEmotionSafeTextForSvara(text),
    splitFinalEmotionClauseTextForSvara(text),
  );

  // Tag-free fallbacks. When the input is nothing but a tag, spokenText is ""
  // and pushing [""] would schedule synthesis of an empty prompt, so the
  // fallbacks are only added when there is something to speak.
  if (getTrailingEmotionTag(text) && spokenText) {
    variants.push([spokenText]);
    variants.push(splitEmotionSafeTextForSvara(spokenText));
  }

  const seen = new Set();
  return variants.filter((chunks) => {
    if (chunks.length === 0) return false;
    // U+241E keeps ["a", "b"] and ["ab"] from producing the same key.
    const key = chunks.join("\u241e");
    if (seen.has(key)) return false;
    seen.add(key);
    return true;
  });
}
319
+
320
/**
 * Chooses how much silence to insert after a synthesized chunk.
 *
 * Sentence-final full stops get the longest pause. That includes the danda
 * (।) and double danda (॥), which the splitting code in this file already
 * treats as sentence terminators alongside ".", "!" and "?"; without them
 * here, sentences in danda-punctuated scripts fell through to the shorter
 * mid-sentence pause.
 *
 * @param {string} chunk - Text of the chunk that was just synthesized.
 * @param {boolean} isLast - True for the final chunk, which gets no pause.
 * @returns {number} Pause length in seconds.
 */
function pauseDurationForChunk(chunk, isLast) {
  if (isLast) return 0;
  const trimmed = chunk.trim();
  if (/[!?]$/u.test(trimmed)) return 0.26;
  if (/[.।॥]$/u.test(trimmed)) return 0.3;
  return 0.18;
}
327
+
328
/**
 * Joins several sample buffers end to end.
 *
 * @param {ArrayLike<number>[]} chunks - Buffers in playback order.
 * @returns {Float32Array} One buffer containing every chunk in order.
 */
function concatFloat32Arrays(chunks) {
  let sampleCount = 0;
  for (const part of chunks) {
    sampleCount += part.length;
  }

  const joined = new Float32Array(sampleCount);
  chunks.reduce((writeAt, part) => {
    joined.set(part, writeAt);
    return writeAt + part.length;
  }, 0);
  return joined;
}
338
+
339
/**
 * Measures the loudness of a sample buffer.
 *
 * @param {ArrayLike<number>} samples - PCM samples.
 * @returns {{peak: number, rms: number}} Largest absolute sample and the
 *   root-mean-square level; both 0 for an empty buffer.
 */
function pcmStats(samples) {
  const count = samples.length;
  let peak = 0;
  let energy = 0;

  let index = 0;
  while (index < count) {
    const magnitude = Math.abs(samples[index]);
    peak = magnitude > peak ? magnitude : peak;
    energy += magnitude * magnitude;
    index += 1;
  }

  if (count === 0) return { peak, rms: 0 };
  return { peak, rms: Math.sqrt(energy / count) };
}
352
+
353
/**
 * Reports whether a buffer is quiet enough to count as failed synthesis.
 *
 * @param {ArrayLike<number>} samples - PCM samples.
 * @returns {boolean} True when peak is below 0.006 and RMS is below 0.0015.
 */
function isNearlySilent(samples) {
  const loudness = pcmStats(samples);
  const peakIsQuiet = loudness.peak < 0.006;
  const rmsIsQuiet = loudness.rms < 0.0015;
  return peakIsQuiet && rmsIsQuiet;
}
357
+
358
/**
 * Decides whether a prompt is complex enough that q4f16 should first try a
 * sampled generation plan.
 *
 * A prompt is complex when its spoken text has at least 3 punctuation groups
 * or at least 8 words, or when it carries an emotion tag together with at
 * least 1 punctuation group and at least 5 words.
 *
 * Always returns a real boolean. The previous `a || b || (tag && ...)` form
 * returned the tag string machinery's "" for simple untagged prompts.
 *
 * @param {string} text - Raw prompt text, optionally ending in an emotion tag.
 * @returns {boolean} True when the prompt counts as complex.
 */
function isComplexQ4Prompt(text) {
  const spokenText = stripTrailingEmotionTag(text);
  const wordCount = countChunkWords(spokenText);
  const punctuationGroups = Array.from(
    spokenText.matchAll(/[.,!?;:।॥…\-—]+/gu),
  ).length;

  if (punctuationGroups >= 3 || wordCount >= 8) return true;

  const hasEmotionTag = getTrailingEmotionTag(text) !== "";
  return hasEmotionTag && punctuationGroups >= 1 && wordCount >= 5;
}
368
+
369
/**
 * Synthesizes each chunk in order and joins the audio with short pauses.
 *
 * Per chunk: build the prompt, generate tokens with the Svara logits guard,
 * extract the audio tokens, decode them with SNAC, then append silence whose
 * length depends on the chunk's final punctuation.
 *
 * @param {object} tokenizer - Loaded tokenizer.
 * @param {object} lm - Loaded causal LM exposing `generate`.
 * @param {string} speaker_id - Voice name placed at the start of the prompt.
 * @param {string[]} chunks - Text chunks in reading order.
 * @param {object} generation - Sampling options spread into `generate`;
 *   `repetition_penalty` and `eos_token_id` are always overridden here.
 * @returns {Promise<Float32Array>} Concatenated PCM for all chunks.
 * @throws {Error} When a chunk yields no audio tokens.
 */
async function synthesizeChunks(tokenizer, lm, speaker_id, chunks, generation) {
  const segments = [];
  const finalPosition = chunks.length - 1;

  for (const [position, chunkText] of chunks.entries()) {
    const promptIds = buildPrompt(tokenizer, chunkText, speaker_id);
    const promptLength = promptIds.length;
    const promptTensor = new Tensor(
      "int64",
      BigInt64Array.from(promptIds.map((id) => BigInt(id))),
      [1, promptLength],
    );

    // +3 leaves room for SOAI, SOS and EOS around the audio tokens.
    const tokenLimit = estimateAudioTokenBudget(chunkText) + 3;
    const generated = await lm.generate({
      inputs: promptTensor,
      max_new_tokens: tokenLimit,
      logits_processor: buildLogitsProcessor(promptLength),
      ...generation,
      repetition_penalty: 1.0,
      eos_token_id: EOS,
    });

    const generatedIds = Array.from(generated.data, (value) => Number(value));
    const audioIds = extractAudioTokens(generatedIds, promptLength);
    if (audioIds.length === 0) {
      throw new Error(`LM produced no audio tokens for chunk ${position + 1}/${chunks.length}.`);
    }

    segments.push(await decodeSnacStable(audioIds));

    const pauseSeconds = pauseDurationForChunk(chunkText, position === finalPosition);
    if (pauseSeconds > 0) {
      // A zero-filled buffer is silence of the requested duration.
      segments.push(new Float32Array(Math.round(SAMPLE_RATE * pauseSeconds)));
    }
  }

  return concatFloat32Arrays(segments);
}
408
+
409
// --- Token-stream → SNAC code conversion ------------------------------------
// Reference: mlx_audio/tts/models/llama/llama.py:codes_to_layers
//   layer_1 (band 0):          [c0]             — 1 code per coarse frame
//   layer_2 (bands 1, 4):      [c1, c4]         — 2 codes per coarse frame
//   layer_3 (bands 2, 3, 5, 6): [c2, c3, c5, c6] — 4 codes per coarse frame

/**
 * De-interleaves 7-token frames into the three SNAC code layers.
 *
 * Trailing tokens that do not fill a whole frame are ignored.
 *
 * @param {ArrayLike<number>} audioTokenIds - Audio token ids, 7 per frame.
 * @returns {{l1: BigInt64Array, l2: BigInt64Array, l3: BigInt64Array, N: number}}
 *   The three layers and the number of whole frames.
 */
function codesToLayers(audioTokenIds) {
  const frameCount = Math.floor(audioTokenIds.length / 7);
  const l1 = new BigInt64Array(frameCount);
  const l2 = new BigInt64Array(frameCount * 2);
  const l3 = new BigInt64Array(frameCount * 4);

  // Each band has its own 4096-wide slice of the audio vocabulary; removing
  // the band's base leaves the raw codebook index.
  const codeAt = (frame, band) =>
    BigInt(audioTokenIds[frame * 7 + band] - AUDIO_OFFSET - band * 4096);

  for (let frame = 0; frame < frameCount; frame += 1) {
    l1[frame] = codeAt(frame, 0);
    l2.set([codeAt(frame, 1), codeAt(frame, 4)], frame * 2);
    l3.set(
      [codeAt(frame, 2), codeAt(frame, 3), codeAt(frame, 5), codeAt(frame, 6)],
      frame * 4,
    );
  }

  return { l1, l2, l3, N: frameCount };
}
431
+
432
/**
 * Decodes one run of audio tokens to PCM with the SNAC decoder.
 *
 * The three code layers are fed to the session's first three inputs in
 * order, and the data of its first output is returned.
 *
 * @param {ArrayLike<number>} audioTokenIds - Audio token ids, 7 per frame.
 * @returns {Promise<object>} Sample data of the decoder's first output.
 */
async function decodeSnacWindow(audioTokenIds) {
  const session = await getSnac();
  const layers = codesToLayers(audioTokenIds);
  const frameCount = layers.N;

  const [coarseName, midName, fineName] = session.inputNames;
  const feeds = {};
  feeds[coarseName] = new ort.Tensor("int64", layers.l1, [1, frameCount]);
  feeds[midName] = new ort.Tensor("int64", layers.l2, [1, frameCount * 2]);
  feeds[fineName] = new ort.Tensor("int64", layers.l3, [1, frameCount * 4]);

  const results = await session.run(feeds);
  const [audioName] = session.outputNames;
  return results[audioName].data;
}
443
+
444
/**
 * Decodes audio tokens with a sliding window and keeps only the stable
 * middle of each window.
 *
 * Fewer than WINDOW_FRAMES frames are decoded in a single call and returned
 * whole. Otherwise a WINDOW_FRAMES-frame window advances one frame at a time
 * and samples [WINDOW_AUDIO_START, WINDOW_AUDIO_END) of each decode are
 * concatenated.
 *
 * @param {number[]} audioTokenIds - Audio token ids, 7 per frame.
 * @returns {Promise<Float32Array>} Decoded PCM; empty when there is no whole frame.
 */
async function decodeSnacStable(audioTokenIds) {
  const frameCount = Math.floor(audioTokenIds.length / 7);
  if (frameCount === 0) return new Float32Array(0);
  if (frameCount < WINDOW_FRAMES) return await decodeSnacWindow(audioTokenIds);

  const stableSlices = [];
  const lastWindowStart = frameCount - WINDOW_FRAMES;

  // Windows are decoded one after another, in playback order.
  for (let firstFrame = 0; firstFrame <= lastWindowStart; firstFrame += 1) {
    const fromToken = firstFrame * 7;
    const toToken = (firstFrame + WINDOW_FRAMES) * 7;
    const windowPcm = await decodeSnacWindow(audioTokenIds.slice(fromToken, toToken));
    stableSlices.push(windowPcm.slice(WINDOW_AUDIO_START, WINDOW_AUDIO_END));
  }

  return concatFloat32Arrays(stableSlices);
}
471
+
472
// Match the exported ONNX repo README:
//   [SOH, BOS, "<voice>: <text>" tokens, EOT, EOH]
// The model predicts SOAI -> SOS -> audio tokens -> EOS itself.

/**
 * Builds the prompt token ids for one chunk of text.
 *
 * @param {object} tokenizer - Tokenizer exposing `encode` and `bos_token_id`.
 * @param {string} text - Text to speak.
 * @param {string} voice - Voice name prefixed to the text.
 * @returns {number[]} Prompt token ids framed by the control tokens.
 */
function buildPrompt(tokenizer, text, voice) {
  const utterance = `${voice}: ${text}`;
  const bodyIds = tokenizer.encode(utterance, { add_special_tokens: false });
  const head = [SOH, tokenizer.bos_token_id];
  const tail = [EOT, EOH];
  return head.concat(Array.from(bodyIds), tail);
}
479
+
480
// Keep audio tokens after the first START_OF_SPEECH emitted by the model.

/**
 * Pulls the audio tokens out of a generated sequence.
 *
 * Scanning starts at the first SOS at or after `promptLength` and stops at
 * the first EOS. Ids outside [AUDIO_OFFSET, AUDIO_END) are skipped, and the
 * result is truncated to a whole number of 7-token frames.
 *
 * @param {number[]} allTokenIds - Prompt plus generated token ids.
 * @param {number} promptLength - Number of leading prompt tokens to ignore.
 * @returns {number[]} Audio token ids; empty when no SOS was generated.
 */
function extractAudioTokens(allTokenIds, promptLength) {
  const sosIndex = allTokenIds.findIndex(
    (tokenId, position) => position >= promptLength && tokenId === SOS,
  );
  if (sosIndex === -1) return [];

  const afterSos = allTokenIds.slice(sosIndex + 1);
  const eosIndex = afterSos.indexOf(EOS);
  const speech = eosIndex === -1 ? afterSos : afterSos.slice(0, eosIndex);

  const audio = speech.filter((tokenId) => tokenId >= AUDIO_OFFSET && tokenId < AUDIO_END);
  const wholeFrameLength = audio.length - (audio.length % 7);
  return audio.slice(0, wholeFrameLength);
}
501
+
502
// --- WAV encoder (24 kHz, mono, PCM16) --------------------------------------

/**
 * Encodes float samples as a mono 16-bit PCM WAV file.
 *
 * Samples are clamped to [-1, 1] and scaled asymmetrically so that -1 maps to
 * -32768 and +1 maps to 32767.
 *
 * @param {ArrayLike<number>} samples - PCM samples.
 * @param {number} sampleRate - Sample rate written into the header, in Hz.
 * @returns {ArrayBuffer} Complete WAV file: 44-byte header plus sample data.
 */
function pcmFloat32ToWav(samples, sampleRate) {
  const dataBytes = samples.length * 2;
  const buffer = new ArrayBuffer(44 + dataBytes);
  const view = new DataView(buffer);
  let cursor = 0;

  // Sequential little-endian writers that advance the shared cursor.
  const putAscii = (tag) => {
    for (let i = 0; i < tag.length; i += 1) {
      view.setUint8(cursor, tag.charCodeAt(i));
      cursor += 1;
    }
  };
  const putUint32 = (value) => {
    view.setUint32(cursor, value, true);
    cursor += 4;
  };
  const putUint16 = (value) => {
    view.setUint16(cursor, value, true);
    cursor += 2;
  };

  putAscii("RIFF");
  putUint32(36 + dataBytes); // RIFF chunk size
  putAscii("WAVEfmt ");
  putUint32(16); // fmt chunk size
  putUint16(1); // audio format: PCM
  putUint16(1); // channels: mono
  putUint32(sampleRate);
  putUint32(sampleRate * 2); // byte rate
  putUint16(2); // block align
  putUint16(16); // bits per sample
  putAscii("data");
  putUint32(dataBytes);

  for (let i = 0; i < samples.length; i += 1) {
    const clamped = Math.max(-1, Math.min(1, samples[i]));
    const scaled = clamped < 0 ? clamped * 0x8000 : clamped * 0x7fff;
    view.setInt16(cursor, scaled, true);
    cursor += 2;
  }

  return buffer;
}
528
+
529
// --- Sampling defaults per dtype --------------------------------------------
// Transformers.js v4 currently ignores top-k/top-p on this path, so unconstrained
// sampling drifts badly on quantized Svara and turns later words robotic. Use
// greedy decoding by default for stability; q8 can tolerate a little sampling.

/**
 * Returns the default generation options for a quantization.
 *
 * @param {string} dtype - LM quantization.
 * @returns {object} Low-temperature sampling for "q8", greedy decoding
 *   otherwise; both require at least 30 new tokens.
 */
function generationFor(dtype) {
  if (dtype === "q8") {
    return { do_sample: true, temperature: 0.35, min_new_tokens: 30 };
  }
  return { do_sample: false, min_new_tokens: 30 };
}
538
+
539
/**
 * Lists the generation option sets to try, in order, for one request.
 *
 * Normally that is just the dtype default. For q4f16 with a complex prompt a
 * sampled plan is tried first and the default is kept as the fallback.
 *
 * @param {string} dtype - LM quantization.
 * @param {string} text - Raw prompt text.
 * @returns {object[]} One or two generation option objects.
 */
function generationPlansFor(dtype, text) {
  const fallbackPlan = generationFor(dtype);

  const wantsSampledFirst = dtype === "q4f16" && isComplexQ4Prompt(text);
  if (!wantsSampledFirst) return [fallbackPlan];

  const sampledPlan = {
    do_sample: true,
    temperature: 0.6,
    top_k: 40,
    top_p: 0.9,
    min_new_tokens: 30,
  };
  return [sampledPlan, fallbackPlan];
}
556
+
557
// --- Message handler --------------------------------------------------------
// Handles two kinds of request from the page:
//   - type "preload": load tokenizer, SNAC decoder and LM, then post "ready".
//   - anything else:  synthesize `text` with `speaker_id`, then post "complete"
//                     with an object URL for the WAV.
// Unsupported dtypes fall back to "q4f16". Any failure is posted as "error".
self.addEventListener("message", async (event) => {
  const { type, text, speaker_id, dtype: requested } = event.data;
  const dtype = SUPPORTED_DTYPES.has(requested) ? requested : "q4f16";

  try {
    // Both request kinds announce loading before touching any model.
    self.postMessage({ status: "loading", dtype });

    if (type === "preload") {
      // Triggered by the explicit "Load model" action in the UI.
      await Promise.all([getTokenizer(), getSnac(), getLM(dtype)]);
      self.postMessage({ status: "ready", dtype });
      return;
    }

    const [tokenizer, lm] = await Promise.all([getTokenizer(), getLM(dtype)]);
    await getSnac(); // warm

    const variants = buildPromptVariants(text);
    if (variants.length === 0) {
      throw new Error("No speakable text found after normalization.");
    }

    // Try every generation plan against every chunking, in priority order,
    // and keep the first take that is not near-silent. The most recent
    // failure is remembered so it can be reported if nothing works.
    let mergedPcm = null;
    let lastError = null;

    search: for (const generation of generationPlansFor(dtype, text)) {
      for (const chunks of variants) {
        try {
          const take = await synthesizeChunks(tokenizer, lm, speaker_id, chunks, generation);
          if (!isNearlySilent(take)) {
            mergedPcm = take;
            break search;
          }
          lastError = new Error("Generated near-silent audio.");
        } catch (err) {
          lastError = err;
        }
      }
    }

    if (!mergedPcm) {
      throw lastError ?? new Error("Synthesis failed for all prompt variants.");
    }

    const wavBytes = pcmFloat32ToWav(mergedPcm, SAMPLE_RATE);
    const wavBlob = new Blob([wavBytes], { type: "audio/wav" });
    self.postMessage({
      status: "complete",
      audio: URL.createObjectURL(wavBlob),
      text,
      voice: speaker_id,
      dtype,
    });
  } catch (err) {
    self.postMessage({ status: "error", data: String(err), dtype });
    console.error(err);
  }
});
style.css DELETED
@@ -1,28 +0,0 @@
1
- body {
2
- padding: 2rem;
3
- font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
4
- }
5
-
6
- h1 {
7
- font-size: 16px;
8
- margin-top: 0;
9
- }
10
-
11
- p {
12
- color: rgb(107, 114, 128);
13
- font-size: 15px;
14
- margin-bottom: 10px;
15
- margin-top: 5px;
16
- }
17
-
18
- .card {
19
- max-width: 620px;
20
- margin: 0 auto;
21
- padding: 16px;
22
- border: 1px solid lightgray;
23
- border-radius: 16px;
24
- }
25
-
26
- .card p:last-child {
27
- margin-bottom: 0;
28
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tailwind.config.js ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
/**
 * Tailwind CSS configuration.
 *
 * @type {import('tailwindcss').Config}
 */
const tailwindConfig = {
  // Files scanned for class names.
  content: ["./index.html", "./src/**/*.{js,ts,jsx,tsx}"],
  // No theme overrides: the default theme is used as-is.
  theme: { extend: {} },
  plugins: [],
};

export default tailwindConfig;
tools/run_svara_onnx_local.py ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ import argparse
4
+ import math
5
+ import wave
6
+ from pathlib import Path
7
+
8
+ import numpy as np
9
+ import onnxruntime as ort
10
+ import torch
11
+ from optimum.onnxruntime import ORTModelForCausalLM
12
+ from transformers import AutoTokenizer, LogitsProcessor, LogitsProcessorList
13
+
14
+
15
# --- Special token ids -------------------------------------------------------
# Prompt layout built by build_prompt(): [SOH, <bos>, *text, EOT, EOH].
EOT = 128009   # appended right after the encoded "voice: text" body
SOS = 128257   # forced as the 2nd generated token; audio tokens follow it
EOS = 128258   # stops generation and ends audio-token extraction
SOH = 128259   # first token of the prompt
EOH = 128260   # last token of the prompt
SOAI = 128261  # forced as the 1st generated token

# --- Audio token range -------------------------------------------------------
# Audio token ids live in [AUDIO_OFFSET, AUDIO_END): 7 slots per frame, each
# slot owning its own band of 4096 ids.
AUDIO_OFFSET = 128266
AUDIO_END = AUDIO_OFFSET + 7 * 4096

# --- Sliding-window decoding -------------------------------------------------
# decode_snac_stable() decodes WINDOW_FRAMES frames at a time and keeps only
# samples [WINDOW_AUDIO_START, WINDOW_AUDIO_END) of each decoded window.
WINDOW_FRAMES = 4
WINDOW_AUDIO_START = 2048
WINDOW_AUDIO_END = 4096

# Frame rate (Hz) written into the output WAV header.
SAMPLE_RATE = 24000
27
+
28
+
29
class SvaraLogitsProcessor(LogitsProcessor):
    """Constrain generation to the token sequence the audio decoder expects.

    The first generated token is forced to ``SOAI`` and the second to ``SOS``.
    At every later step only ids in ``[AUDIO_OFFSET, AUDIO_END)`` and ``EOS``
    keep their logits; everything else is set to ``-inf``.
    """

    def __init__(self, prompt_length: int) -> None:
        # Prompt size in tokens; subtracting it from the current sequence
        # length gives the number of tokens generated so far.
        self.prompt_length = prompt_length

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        """Mask ``scores`` in place for the current step and return it."""
        blocked = -float("inf")

        # Every row of the batch has the same sequence length, so the step
        # index is the same for all of them.
        generated = input_ids.shape[1] - self.prompt_length

        forced_token = {0: SOAI, 1: SOS}.get(generated)
        if forced_token is not None:
            # Leave exactly one token selectable.
            scores.fill_(blocked)
            scores[:, forced_token] = 0
            return scores

        # EOS sits below AUDIO_OFFSET, so save its logits before masking the
        # low range and put them back afterwards.
        eos_logits = scores[:, EOS].clone()
        scores[:, :AUDIO_OFFSET] = blocked
        scores[:, AUDIO_END:] = blocked
        scores[:, EOS] = eos_logits
        return scores
53
+
54
+
55
def build_prompt(tokenizer: AutoTokenizer, text: str, voice: str) -> list[int]:
    """Return the prompt token ids for ``text`` spoken by ``voice``.

    Layout: ``[SOH, <bos>, *encode("{voice}: {text}"), EOT, EOH]``. The body is
    encoded without the tokenizer's own special tokens.
    """
    utterance = f"{voice}: {text}"
    body_ids = tokenizer.encode(utterance, add_special_tokens=False)

    prompt = [SOH, tokenizer.bos_token_id]
    prompt.extend(body_ids)
    prompt.extend((EOT, EOH))
    return prompt
58
+
59
+
60
def extract_audio_tokens(all_token_ids: list[int], prompt_length: int) -> list[int]:
    """Pull the audio token ids out of a full generated sequence.

    Looks for the first ``SOS`` at or after ``prompt_length``, then keeps every
    id in ``[AUDIO_OFFSET, AUDIO_END)`` that follows it, stopping at the first
    ``EOS``. The result is trimmed to a whole number of 7-token frames.
    Returns an empty list when no ``SOS`` was generated.
    """
    try:
        sos_idx = all_token_ids.index(SOS, prompt_length)
    except ValueError:
        return []

    tail = all_token_ids[sos_idx + 1 :]
    if EOS in tail:
        tail = tail[: tail.index(EOS)]

    audio = [token_id for token_id in tail if AUDIO_OFFSET <= token_id < AUDIO_END]

    # Drop a trailing partial frame, if any.
    whole_frames = len(audio) // 7
    return audio[: whole_frames * 7]
74
+
75
+
76
def codes_to_layers(audio_token_ids: list[int]) -> tuple[np.ndarray, np.ndarray, np.ndarray, int]:
    """Split flat 7-token frames into the three code layers.

    Each frame contributes 1 code to layer 1 (slot 0), 2 codes to layer 2
    (slots 1 and 4) and 4 codes to layer 3 (slots 2, 3, 5 and 6). Slot ``k`` is
    converted from token id to code by subtracting
    ``AUDIO_OFFSET + k * 4096``.

    Returns ``(layer_1, layer_2, layer_3, n)`` where ``n`` is the number of
    whole frames and the layers are int64 arrays of shape ``(1, n)``,
    ``(1, 2n)`` and ``(1, 4n)``. Tokens beyond the last whole frame are ignored.
    """
    n = len(audio_token_ids) // 7

    # One row per frame, one column per slot.
    frames = np.asarray(audio_token_ids[: n * 7], dtype=np.int64).reshape(n, 7)
    slot_offsets = AUDIO_OFFSET + 4096 * np.arange(7, dtype=np.int64)
    codes = frames - slot_offsets

    layer_1 = np.ascontiguousarray(codes[:, [0]].reshape(1, n))
    layer_2 = np.ascontiguousarray(codes[:, [1, 4]].reshape(1, n * 2))
    layer_3 = np.ascontiguousarray(codes[:, [2, 3, 5, 6]].reshape(1, n * 4))

    return layer_1, layer_2, layer_3, n
93
+
94
+
95
def decode_snac_window(session: ort.InferenceSession, audio_token_ids: list[int]) -> np.ndarray:
    """Run the decoder session on one run of frames and return flat float32 audio.

    The three code layers are bound to the session's first three inputs by
    position. The first output is flattened and returned as float32.
    """
    layers = codes_to_layers(audio_token_ids)[:3]
    session_inputs = session.get_inputs()
    feeds = {session_inputs[position].name: layer for position, layer in enumerate(layers)}

    waveform = session.run(None, feeds)[0]
    return waveform.reshape(-1).astype(np.float32, copy=False)
106
+
107
+
108
def decode_snac_stable(session: ort.InferenceSession, audio_token_ids: list[int]) -> np.ndarray:
    """Decode audio tokens with a sliding window of ``WINDOW_FRAMES`` frames.

    The window advances one frame at a time. From each decoded window only
    samples ``[WINDOW_AUDIO_START, WINDOW_AUDIO_END)`` are kept, and the kept
    slices are concatenated. With fewer than ``WINDOW_FRAMES`` frames the whole
    input is decoded in one call; with no whole frame an empty float32 array is
    returned.
    """
    num_frames = len(audio_token_ids) // 7
    if num_frames == 0:
        return np.zeros(0, dtype=np.float32)
    if num_frames < WINDOW_FRAMES:
        return decode_snac_window(session, audio_token_ids)

    tokens_per_window = WINDOW_FRAMES * 7
    window_count = num_frames - WINDOW_FRAMES + 1

    pieces = []
    for first_frame in range(window_count):
        first_token = first_frame * 7
        window_ids = audio_token_ids[first_token : first_token + tokens_per_window]
        window_audio = decode_snac_window(session, window_ids)
        pieces.append(window_audio[WINDOW_AUDIO_START:WINDOW_AUDIO_END])

    return np.concatenate(pieces, axis=0)
123
+
124
+
125
def write_wav(path: Path, samples: np.ndarray) -> None:
    """Write ``samples`` to ``path`` as a mono 16-bit PCM WAV at ``SAMPLE_RATE``.

    Samples are clipped to [-1, 1] first. Negative values are scaled by 32768
    and non-negative values by 32767 so both ends of the int16 range are
    reachable without overflow.
    """
    clipped = np.clip(samples, -1.0, 1.0)
    scaled = np.where(clipped < 0, clipped * 32768.0, clipped * 32767.0)
    frames = scaled.astype(np.int16).tobytes()

    with wave.open(str(path), "wb") as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)
        wav_file.setframerate(SAMPLE_RATE)
        wav_file.writeframes(frames)
133
+
134
+
135
def audio_stats(samples: np.ndarray) -> tuple[float, float, float, float]:
    """Return ``(peak, rms, peak_db, rms_db)`` for ``samples``.

    Empty input yields ``(0.0, 0.0, -inf, -inf)``. For non-empty input the dB
    values are ``20 * log10(x)`` with ``x`` floored at 1e-12, so silence is
    reported as a finite value rather than ``-inf``.
    """
    if samples.size == 0:
        return 0.0, 0.0, -float("inf"), -float("inf")

    def to_db(level: float) -> float:
        return 20.0 * math.log10(max(level, 1e-12))

    peak = float(np.abs(samples).max())
    # Accumulate the squares in float64 regardless of the input dtype.
    mean_square = np.mean(np.square(samples, dtype=np.float64))
    rms = float(np.sqrt(mean_square))

    return peak, rms, to_db(peak), to_db(rms)
144
+
145
+
146
def generation_kwargs(dtype: str) -> dict:
    """Return the default ``generate()`` keyword arguments for a model dtype.

    ``"q8"`` samples at temperature 0.35; every other dtype decodes without
    sampling. Both require at least 30 new tokens. A fresh dict is returned on
    each call, so callers may mutate it.
    """
    if dtype == "q8":
        sampling = {"do_sample": True, "temperature": 0.35}
    else:
        sampling = {"do_sample": False}
    return {**sampling, "min_new_tokens": 30}
150
+
151
+
152
def main() -> None:
    """Command-line entry point: synthesize ``--text`` and write it to ``--out``.

    Steps:
      1. Load the tokenizer and the ONNX language model from ``--model-dir``
         (local files only) and the SNAC decoder from ``--snac-dir``.
      2. Build the prompt and generate under ``SvaraLogitsProcessor``.
      3. Extract the audio tokens, decode them to PCM, print level statistics
         and write a WAV file.

    Raises:
        RuntimeError: if generation produced no audio tokens.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-dir", default=".hf-models/svara-tts-v1-ONNX")
    parser.add_argument("--snac-dir", default=".hf-models/snac_24khz-ONNX/onnx")
    parser.add_argument("--dtype", choices=["q4f16", "q8"], default="q4f16")
    parser.add_argument("--provider", default="CPUExecutionProvider")
    parser.add_argument("--voice", default="Hindi (Female)")
    parser.add_argument("--text", required=True)
    parser.add_argument("--out", required=True)
    parser.add_argument("--max-new-tokens", type=int, default=2048)
    parser.add_argument("--fix-mistral-regex", action="store_true")
    parser.add_argument("--do-sample", action="store_true")
    parser.add_argument("--temperature", type=float, default=None)
    parser.add_argument("--top-k", type=int, default=None)
    parser.add_argument("--top-p", type=float, default=None)
    args = parser.parse_args()

    model_dir = Path(args.model_dir)
    snac_dir = Path(args.snac_dir)
    model_file = "model_q4f16.onnx" if args.dtype == "q4f16" else "model_quantized.onnx"

    print(f"loading tokenizer from {model_dir}")
    tokenizer = AutoTokenizer.from_pretrained(
        model_dir,
        local_files_only=True,
        fix_mistral_regex=args.fix_mistral_regex,
    )

    print(f"loading model {model_file} on {args.provider}")
    model = ORTModelForCausalLM.from_pretrained(
        model_dir,
        subfolder="onnx",
        file_name=model_file,
        provider=args.provider,
        use_io_binding=False,
        local_files_only=True,
    )

    decoder_file = "decoder_model.onnx"
    print(f"loading snac decoder {decoder_file}")
    # Requested provider first, CPU as fallback. De-duplicate (order kept) so
    # the default "--provider CPUExecutionProvider" does not list CPU twice.
    snac_providers = list(dict.fromkeys((args.provider, "CPUExecutionProvider")))
    snac = ort.InferenceSession(
        str(snac_dir / decoder_file),
        providers=snac_providers,
    )

    prompt_ids = build_prompt(tokenizer, args.text, args.voice)
    input_ids = torch.tensor([prompt_ids], dtype=torch.long)
    logits_processor = LogitsProcessorList([SvaraLogitsProcessor(len(prompt_ids))])

    print(f"prompt_length={len(prompt_ids)} max_new_tokens={args.max_new_tokens}")
    print(f'prompt={args.voice}: {args.text}')

    # Start from the per-dtype defaults, then apply explicit CLI overrides.
    gen_kwargs = generation_kwargs(args.dtype)
    if args.do_sample:
        gen_kwargs["do_sample"] = True
    if args.temperature is not None:
        gen_kwargs["temperature"] = args.temperature
    if args.top_k is not None:
        gen_kwargs["top_k"] = args.top_k
    if args.top_p is not None:
        gen_kwargs["top_p"] = args.top_p

    output = model.generate(
        input_ids=input_ids,
        max_new_tokens=args.max_new_tokens,
        logits_processor=logits_processor,
        repetition_penalty=1.0,
        eos_token_id=EOS,
        **gen_kwargs,
    )

    # generate() may return a bare tensor or an object exposing `.sequences`.
    if isinstance(output, torch.Tensor):
        all_ids = output[0].tolist()
    else:
        all_ids = output.sequences[0].tolist()

    audio_ids = extract_audio_tokens(all_ids, len(prompt_ids))
    print(f"total_tokens={len(all_ids)} audio_tokens={len(audio_ids)} frames={len(audio_ids) // 7}")
    if not audio_ids:
        raise RuntimeError("no audio tokens produced")

    pcm = decode_snac_stable(snac, audio_ids)
    peak, rms, peak_db, rms_db = audio_stats(pcm)
    print(
        "samples="
        f"{pcm.size} duration_s={pcm.size / SAMPLE_RATE:.3f} "
        f"peak={peak:.6f} peak_db={peak_db:.2f} rms={rms:.6f} rms_db={rms_db:.2f}"
    )

    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    write_wav(out_path, pcm)
    print(f"wrote {out_path}")


if __name__ == "__main__":
    main()
vite.config.js ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { defineConfig } from "vite";
2
+ import react from "@vitejs/plugin-react";
3
+ import { viteStaticCopy } from "vite-plugin-static-copy";
4
+
5
const reactPlugin = react();

// ORT-Web's .wasm/.mjs runtime files aren't served by Vite by default.
// Copy them from node_modules into the dev server + build output so the
// worker can load them via /ort-wasm/<file>.
const ortRuntimeCopy = viteStaticCopy({
  targets: [
    {
      src: "node_modules/onnxruntime-web/dist/*.{wasm,mjs}",
      dest: "ort-wasm",
    },
  ],
});

// https://vite.dev/config/
export default defineConfig({
  plugins: [reactPlugin, ortRuntimeCopy],
  // Emit workers as ES modules.
  worker: { format: "es" },
  build: { target: "esnext" },
});