himahande45 committed on
Commit
dffe2a3
·
verified ·
1 Parent(s): 0e9e909

Make app.py forward to VM frontend

Browse files
Files changed (1) hide show
  1. app.py +1 -448
app.py CHANGED
@@ -1,451 +1,4 @@
1
- from __future__ import annotations
2
-
3
- import json
4
- import os
5
- import sys
6
- import threading
7
- import time
8
- import traceback
9
- from pathlib import Path
10
-
11
- import gradio as gr
12
- import numpy as np
13
- import torch
14
- from huggingface_hub import snapshot_download
15
-
16
- APP_DIR = Path(__file__).resolve().parent
17
-
18
-
19
- def resolve_persist_root() -> Path:
20
- data_root = Path("/data")
21
- if data_root.exists() and os.access(data_root, os.W_OK):
22
- return data_root
23
-
24
- local_root = APP_DIR / ".cache"
25
- local_root.mkdir(parents=True, exist_ok=True)
26
- return local_root
27
-
28
-
29
- PERSIST_ROOT = resolve_persist_root()
30
- HF_HOME = PERSIST_ROOT / "huggingface"
31
- HF_HOME.mkdir(parents=True, exist_ok=True)
32
- os.environ.setdefault("HF_HOME", str(HF_HOME))
33
- os.environ.setdefault("HF_HUB_CACHE", str(HF_HOME / "hub"))
34
-
35
- sys.path.insert(0, str(APP_DIR))
36
-
37
- from voxcpm import VoxCPM
38
- from voxcpm.model.voxcpm import LoRAConfig
39
-
40
- SPACE_TITLE = "IndicVox: Hindi & Tamil Code-Switching TTS"
41
- MODEL_REPO_ID = "himahande45/multilingual-tts"
42
- PROMPTS_FILE = APP_DIR / "code_switch_prompts.json"
43
- VOICE_DIR = APP_DIR / "assets" / "voices"
44
- DEFAULT_PROFILE = "Tamil Focus"
45
- DEFAULT_VOICE = "Tamil Female Research Voice"
46
- DEFAULT_TEXT = "இந்த experimentக்கு clean reference audio use பண்ணணும், இல்லனா output quality drop ஆகும்."
47
-
48
- MODEL_PATTERNS = [
49
- "VoxCPM2_local/*",
50
- "finetune_checkpoints/step_0000500/lora_config.json",
51
- "finetune_checkpoints/step_0000500/lora_weights.safetensors",
52
- "finetune_checkpoints/step_0001000/lora_config.json",
53
- "finetune_checkpoints/step_0001000/lora_weights.safetensors",
54
- ]
55
-
56
- PROFILES = {
57
- "Tamil Focus": {
58
- "description": "Best for Tamil and Tamil-English code-switched prompts.",
59
- "checkpoint_dir": "finetune_checkpoints/step_0001000",
60
- },
61
- "Hindi Focus": {
62
- "description": "Best for Hindi and Hindi-English code-switched prompts.",
63
- "checkpoint_dir": "finetune_checkpoints/step_0000500",
64
- },
65
- "Research Baseline": {
66
- "description": "Base multilingual checkpoint without paper fine-tuning.",
67
- "checkpoint_dir": None,
68
- },
69
- }
70
-
71
- VOICE_PRESETS = {
72
- "Hindi Research Voice": {
73
- "path": VOICE_DIR / "hin_m_ref_00.wav",
74
- "transcript": "लेकिन क्या यह हम सभी कार्यक्रमों के साथ कर सकते?",
75
- "summary": "Short Hindi reference used for sharper Hindi + English prompting.",
76
- },
77
- "Tamil Female Research Voice": {
78
- "path": VOICE_DIR / "tam_f_ref_00.wav",
79
- "transcript": "விக்கற நேரத்தையும் லாபத்தையும் பொறுத்து, இந்த டேக்ஸை ஷார்ட் டேர்ம் இல்ல லாங் டேர்ம்னு பிரிப்பாங்க.",
80
- "summary": "Clear Tamil reference with stable conversational prosody.",
81
- },
82
- "Tamil Male Research Voice": {
83
- "path": VOICE_DIR / "tam_m_ref_00.wav",
84
- "transcript": "கொரோனா பாதிப்பு காலத்தில் எண்பது கோடி மக்களுக்கு உணவு தானியம் வழங்கப்பட்டதாகவும் அவர் தெரிவித்தார்.",
85
- "summary": "Tamil male reference that holds rhythm well on longer prompts.",
86
- },
87
- "Text Only": {
88
- "path": None,
89
- "transcript": None,
90
- "summary": "Zero-shot generation without a reference voice clip.",
91
- },
92
- }
93
-
94
- CUSTOM_CSS = """
95
- #app-shell {
96
- max-width: 1180px;
97
- margin: 0 auto;
98
- }
99
- #hero {
100
- padding: 24px 26px 12px 26px;
101
- border: 1px solid rgba(255, 255, 255, 0.08);
102
- border-radius: 22px;
103
- background:
104
- radial-gradient(circle at top right, rgba(99, 102, 241, 0.16), transparent 34%),
105
- radial-gradient(circle at bottom left, rgba(16, 185, 129, 0.14), transparent 30%),
106
- rgba(15, 23, 42, 0.74);
107
- }
108
- .stat-chip {
109
- display: inline-block;
110
- margin: 6px 8px 0 0;
111
- padding: 8px 12px;
112
- border-radius: 999px;
113
- background: rgba(255, 255, 255, 0.06);
114
- font-size: 0.92rem;
115
- }
116
- .footnote {
117
- opacity: 0.78;
118
- font-size: 0.94rem;
119
- }
120
- footer {
121
- visibility: hidden;
122
- }
123
- """
124
-
125
- if torch.cuda.is_available():
126
- torch.backends.cuda.matmul.allow_tf32 = True
127
- torch.backends.cudnn.allow_tf32 = True
128
- torch.set_float32_matmul_precision("high")
129
-
130
- THEME = gr.themes.Soft(primary_hue="indigo", secondary_hue="emerald")
131
-
132
-
133
- def load_examples() -> list[list[str]]:
134
- with PROMPTS_FILE.open("r", encoding="utf-8") as f:
135
- prompt_bank = json.load(f)
136
-
137
- return [
138
- [prompt_bank["hi_en"][0]["text"], "Hindi Focus", "Hindi Research Voice"],
139
- [prompt_bank["hi_en"][9]["text"], "Hindi Focus", "Hindi Research Voice"],
140
- [prompt_bank["hi_en"][16]["text"], "Hindi Focus", "Hindi Research Voice"],
141
- [prompt_bank["ta_en"][0]["text"], "Tamil Focus", "Tamil Female Research Voice"],
142
- [prompt_bank["ta_en"][9]["text"], "Tamil Focus", "Tamil Female Research Voice"],
143
- [prompt_bank["ta_en"][14]["text"], "Tamil Focus", "Tamil Male Research Voice"],
144
- ]
145
-
146
-
147
- def profile_markdown(profile_name: str) -> str:
148
- description = PROFILES[profile_name]["description"]
149
- return f"**{profile_name}** \n{description}"
150
-
151
-
152
- def voice_markdown(voice_name: str) -> str:
153
- voice = VOICE_PRESETS[voice_name]
154
- if voice["path"] is None:
155
- return f"**{voice_name}** \n{voice['summary']}"
156
- transcript = voice["transcript"]
157
- return f"**{voice_name}** \n{voice['summary']} \nReference transcript: `{transcript}`"
158
-
159
-
160
- def dynamic_max_len(text: str) -> int:
161
- char_count = max(len(text.strip()), 1)
162
- return max(280, min(900, int(char_count * 7.5)))
163
-
164
-
165
- class ModelManager:
166
- def __init__(self) -> None:
167
- self.lock = threading.Lock()
168
- self.repo_dir = self._resolve_repo_dir()
169
- self.base_dir = self.repo_dir / "VoxCPM2_local"
170
- self.loaded_profile: str | None = None
171
- self.active_profile: str | None = None
172
- self.model = self._load_model()
173
- self.activate_profile(DEFAULT_PROFILE)
174
-
175
- def _resolve_repo_dir(self) -> Path:
176
- local_repo = os.getenv("INDICVOX_LOCAL_MODEL_REPO")
177
- if local_repo:
178
- path = Path(local_repo).expanduser().resolve()
179
- if path.exists():
180
- return path
181
- raise FileNotFoundError(f"INDICVOX_LOCAL_MODEL_REPO does not exist: {path}")
182
-
183
- token = os.getenv("HF_TOKEN")
184
- snapshot_path = snapshot_download(
185
- repo_id=MODEL_REPO_ID,
186
- repo_type="model",
187
- allow_patterns=MODEL_PATTERNS,
188
- token=token,
189
- )
190
- return Path(snapshot_path)
191
-
192
- def _load_lora_config(self, checkpoint_dir: Path) -> LoRAConfig:
193
- payload = json.loads((checkpoint_dir / "lora_config.json").read_text(encoding="utf-8"))
194
- return LoRAConfig(**payload["lora_config"])
195
-
196
- def _load_model(self) -> VoxCPM:
197
- if not torch.cuda.is_available():
198
- raise RuntimeError("A GPU runtime is required. Request an A10G/L4 Space and restart.")
199
-
200
- checkpoint_dir = self.repo_dir / PROFILES[DEFAULT_PROFILE]["checkpoint_dir"]
201
- lora_config = self._load_lora_config(checkpoint_dir)
202
- model = VoxCPM.from_pretrained(
203
- hf_model_id=str(self.base_dir),
204
- load_denoiser=False,
205
- optimize=False,
206
- lora_config=lora_config,
207
- )
208
- return model
209
-
210
- def activate_profile(self, profile_name: str) -> None:
211
- spec = PROFILES[profile_name]
212
- checkpoint_dir = spec["checkpoint_dir"]
213
-
214
- if checkpoint_dir is None:
215
- self.model.set_lora_enabled(False)
216
- self.active_profile = profile_name
217
- return
218
-
219
- if self.loaded_profile != profile_name:
220
- if self.loaded_profile is not None:
221
- self.model.unload_lora()
222
- self.model.load_lora(str(self.repo_dir / checkpoint_dir))
223
- self.loaded_profile = profile_name
224
-
225
- self.model.set_lora_enabled(True)
226
- self.active_profile = profile_name
227
-
228
- def synthesize(
229
- self,
230
- text: str,
231
- profile_name: str,
232
- voice_name: str,
233
- cfg_value: float,
234
- inference_steps: int,
235
- ) -> tuple[tuple[int, np.ndarray], str]:
236
- clean_text = text.strip()
237
- if not clean_text:
238
- raise gr.Error("Enter a prompt first.")
239
-
240
- start = time.perf_counter()
241
- with self.lock:
242
- self.activate_profile(profile_name)
243
- kwargs = {
244
- "text": clean_text,
245
- "cfg_value": float(cfg_value),
246
- "inference_timesteps": int(inference_steps),
247
- "max_len": dynamic_max_len(clean_text),
248
- }
249
-
250
- voice = VOICE_PRESETS[voice_name]
251
- if voice["path"] is not None:
252
- kwargs["prompt_wav_path"] = str(voice["path"])
253
- kwargs["prompt_text"] = voice["transcript"]
254
-
255
- wav = self.model.generate(**kwargs)
256
- sample_rate = int(self.model.tts_model.sample_rate)
257
-
258
- if isinstance(wav, torch.Tensor):
259
- wav = wav.detach().cpu().numpy()
260
- wav = np.asarray(wav, dtype=np.float32).squeeze()
261
- wav = np.clip(wav, -1.0, 1.0)
262
-
263
- elapsed = time.perf_counter() - start
264
- duration = float(wav.shape[-1]) / sample_rate if wav.size else 0.0
265
- rtf = elapsed / duration if duration > 0 else float("nan")
266
- speed_line = f"RTF {rtf:.2f}x" if np.isfinite(rtf) else "RTF n/a"
267
- status = (
268
- f"**Ready** \n"
269
- f"Profile: `{profile_name}` \n"
270
- f"Voice: `{voice_name}` \n"
271
- f"Audio length: `{duration:.2f}s` \n"
272
- f"Generation time: `{elapsed:.2f}s` ({speed_line})"
273
- )
274
- return (sample_rate, wav), status
275
-
276
- def boot_markdown(self) -> str:
277
- gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU"
278
- active_profile = self.active_profile or DEFAULT_PROFILE
279
- return (
280
- f"**GPU Ready** \n"
281
- f"Runtime: `{gpu_name}` \n"
282
- f"Warm profile: `{active_profile}` \n"
283
- f"Model source: `{MODEL_REPO_ID}`"
284
- )
285
-
286
-
287
- BOOT_ERROR: str | None = None
288
- MODEL_MANAGER: ModelManager | None = None
289
-
290
- try:
291
- MODEL_MANAGER = ModelManager()
292
- except Exception:
293
- BOOT_ERROR = traceback.format_exc()
294
-
295
- EXAMPLES = load_examples()
296
-
297
-
298
- def synthesize(text: str, profile_name: str, voice_name: str, cfg_value: float, inference_steps: int):
299
- if MODEL_MANAGER is None:
300
- raise gr.Error(f"Model initialization failed.\n\n{BOOT_ERROR}")
301
- return MODEL_MANAGER.synthesize(text, profile_name, voice_name, cfg_value, inference_steps)
302
-
303
-
304
- def voice_preview(voice_name: str):
305
- voice = VOICE_PRESETS[voice_name]
306
- preview_path = str(voice["path"]) if voice["path"] is not None else None
307
- return preview_path, voice_markdown(voice_name)
308
-
309
-
310
- def clear_prompt() -> str:
311
- return ""
312
-
313
-
314
- def boot_status() -> str:
315
- if MODEL_MANAGER is not None:
316
- return MODEL_MANAGER.boot_markdown()
317
- return f"**Startup Error** \n```text\n{BOOT_ERROR}\n```"
318
-
319
-
320
- with gr.Blocks() as demo:
321
- with gr.Column(elem_id="app-shell"):
322
- gr.HTML(
323
- """
324
- <div id="hero">
325
- <h1>IndicVox</h1>
326
- <p>Research demo for multilingual TTS across Hindi, Tamil, and code-switched prompts.</p>
327
- <div>
328
- <span class="stat-chip">GPU-backed Space</span>
329
- <span class="stat-chip">Warm-loaded model</span>
330
- <span class="stat-chip">Hindi + Tamil + English prompts</span>
331
- </div>
332
- </div>
333
- """
334
- )
335
-
336
- with gr.Row():
337
- with gr.Column(scale=5):
338
- prompt = gr.Textbox(
339
- label="Prompt",
340
- value=DEFAULT_TEXT,
341
- lines=5,
342
- max_lines=8,
343
- placeholder="Type Hindi, Tamil, or code-switched text here...",
344
- )
345
-
346
- with gr.Row():
347
- profile = gr.Dropdown(
348
- choices=list(PROFILES.keys()),
349
- value=DEFAULT_PROFILE,
350
- label="Model Profile",
351
- info="Switch between the Hindi-tuned and Tamil-tuned research profiles.",
352
- )
353
- voice = gr.Dropdown(
354
- choices=list(VOICE_PRESETS.keys()),
355
- value=DEFAULT_VOICE,
356
- label="Voice Preset",
357
- info="Built-in research voices plus a zero-shot option.",
358
- )
359
-
360
- with gr.Accordion("Advanced Settings", open=False):
361
- with gr.Row():
362
- cfg_value = gr.Slider(
363
- minimum=1.0,
364
- maximum=4.0,
365
- value=2.0,
366
- step=0.1,
367
- label="CFG",
368
- info="Higher values usually sound more guided but less relaxed.",
369
- )
370
- inference_steps = gr.Slider(
371
- minimum=6,
372
- maximum=16,
373
- value=10,
374
- step=1,
375
- label="Diffusion Steps",
376
- info="10 is the paper demo default.",
377
- )
378
-
379
- with gr.Row():
380
- generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")
381
- clear_btn = gr.Button("Clear Prompt")
382
-
383
- with gr.Row():
384
- profile_info = gr.Markdown(profile_markdown(DEFAULT_PROFILE))
385
- voice_info = gr.Markdown(voice_markdown(DEFAULT_VOICE))
386
-
387
- with gr.Column(scale=4):
388
- status = gr.Markdown(boot_status())
389
- output_audio = gr.Audio(
390
- label="Synthesized Audio",
391
- autoplay=False,
392
- format="wav",
393
- )
394
- voice_preview_audio = gr.Audio(
395
- label="Voice Preset Preview",
396
- value=str(VOICE_PRESETS[DEFAULT_VOICE]["path"]),
397
- interactive=False,
398
- autoplay=False,
399
- format="wav",
400
- )
401
- gr.Markdown(
402
- "The demo keeps the base model resident on GPU and swaps paper checkpoints on demand.",
403
- elem_classes=["footnote"],
404
- )
405
-
406
- with gr.Tabs():
407
- with gr.Tab("Hindi + English Examples"):
408
- gr.Examples(
409
- examples=[row for row in EXAMPLES if row[1] == "Hindi Focus"],
410
- inputs=[prompt, profile, voice],
411
- cache_examples=False,
412
- )
413
- with gr.Tab("Tamil + English Examples"):
414
- gr.Examples(
415
- examples=[row for row in EXAMPLES if row[1] == "Tamil Focus"],
416
- inputs=[prompt, profile, voice],
417
- cache_examples=False,
418
- )
419
-
420
- gr.Markdown(
421
- """
422
- **Demo notes**
423
-
424
- - `Hindi Focus` maps to the Hindi-strong checkpoint from the paper experiments.
425
- - `Tamil Focus` maps to the Tamil + code-switch checkpoint and is the default for the Space.
426
- - `Text Only` skips the reference clip and runs zero-shot synthesis.
427
- """,
428
- elem_classes=["footnote"],
429
- )
430
-
431
- generate_btn.click(
432
- fn=synthesize,
433
- inputs=[prompt, profile, voice, cfg_value, inference_steps],
434
- outputs=[output_audio, status],
435
- api_name="synthesize",
436
- )
437
- prompt.submit(
438
- fn=synthesize,
439
- inputs=[prompt, profile, voice, cfg_value, inference_steps],
440
- outputs=[output_audio, status],
441
- api_name=False,
442
- )
443
- profile.change(fn=profile_markdown, inputs=profile, outputs=profile_info, api_name=False)
444
- voice.change(fn=voice_preview, inputs=voice, outputs=[voice_preview_audio, voice_info], api_name=False)
445
- clear_btn.click(fn=clear_prompt, outputs=prompt, api_name=False)
446
-
447
-
448
- demo.queue(default_concurrency_limit=1, max_size=16)
449
 
450
  if __name__ == "__main__":
451
  demo.launch(theme=THEME, css=CUSTOM_CSS)
 
1
+ from frontend_app import CUSTOM_CSS, THEME, demo
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  if __name__ == "__main__":
4
  demo.launch(theme=THEME, css=CUSTOM_CSS)