| from __future__ import annotations |
|
|
| import json |
| import os |
| import tempfile |
| from pathlib import Path |
|
|
| import gradio as gr |
| import requests |
|
|
# Filesystem layout: the prompt bank and reference voices are resolved
# relative to the directory containing this file.
APP_DIR = Path(__file__).resolve().parent
VOICE_DIR = APP_DIR.joinpath("assets", "voices")
PROMPTS_FILE = APP_DIR.joinpath("code_switch_prompts.json")

# Backend connection settings are read from the environment; both fall back
# to "" when unset. Any trailing "/" is stripped from the URL so request
# paths can be appended with a single slash.
API_URL = os.environ.get("INDICVOX_API_URL", "").rstrip("/")
BACKEND_TOKEN = os.environ.get("INDICVOX_BACKEND_TOKEN", "")
TIMEOUT_S = 600  # timeout passed to the /synthesize request
SESSION = requests.Session()  # single session shared by all backend calls

# Initial selections shown when the UI first renders.
DEFAULT_PROFILE = "Tamil Focus"
DEFAULT_VOICE = "Tamil Female Research Voice"
DEFAULT_TEXT = "இந்த experimentக்கு clean reference audio use பண்ணணும், இல்லனா output quality drop ஆகும்."
|
|
# Model profiles offered in the "Model Profile" dropdown, keyed by display
# name. Each entry carries the one-line description shown beside the dropdown.
PROFILES = {
    label: {"description": blurb}
    for label, blurb in (
        ("Tamil Focus", "Best for Tamil and Tamil-English code-switched prompts."),
        ("Hindi Focus", "Best for Hindi and Hindi-English code-switched prompts."),
        ("Research Baseline", "Base multilingual checkpoint without paper fine-tuning."),
    )
}
|
|
# Voice presets offered in the "Voice Preset" dropdown, keyed by display name.
# Each entry holds the reference clip path, the clip's transcript and a short
# summary; "Text Only" has no clip, so its path and transcript are None.
VOICE_PRESETS = {
    label: {
        "path": (VOICE_DIR / wav_name) if wav_name is not None else None,
        "transcript": transcript,
        "summary": summary,
    }
    for label, wav_name, transcript, summary in (
        (
            "Hindi Research Voice",
            "hin_m_ref_00.wav",
            "लेकिन क्या यह हम सभी कार्यक्रमों के साथ कर सकते?",
            "Short Hindi reference used for sharper Hindi + English prompting.",
        ),
        (
            "Tamil Female Research Voice",
            "tam_f_ref_00.wav",
            "விக்கற நேரத்தையும் லாபத்தையும் பொறுத்து, இந்த டேக்ஸை ஷார்ட் டேர்ம் இல்ல லாங் டேர்ம்னு பிரிப்பாங்க.",
            "Clear Tamil reference with stable conversational prosody.",
        ),
        (
            "Tamil Male Research Voice",
            "tam_m_ref_00.wav",
            "கொரோனா பாதிப்பு காலத்தில் எண்பது கோடி மக்களுக்கு உணவு தானியம் வழங்கப்பட்டதாகவும் அவர் தெரிவித்தார்.",
            "Tamil male reference that holds rhythm well on longer prompts.",
        ),
        (
            "Text Only",
            None,
            None,
            "Zero-shot generation without a reference voice clip.",
        ),
    )
}
|
|
# Stylesheet passed to ``demo.launch(css=...)``. The selectors correspond to
# names used in the layout: ``#app-shell`` (outer column elem_id), ``#hero``
# and ``.stat-chip`` (hero banner HTML), and ``.footnote`` (elem_classes on
# the note Markdown blocks). The final rule hides any ``footer`` element --
# presumably Gradio's default page footer; confirm against the rendered page.
CUSTOM_CSS = """
#app-shell {
max-width: 1180px;
margin: 0 auto;
}
#hero {
padding: 24px 26px 12px 26px;
border: 1px solid rgba(255, 255, 255, 0.08);
border-radius: 22px;
background:
radial-gradient(circle at top right, rgba(99, 102, 241, 0.16), transparent 34%),
radial-gradient(circle at bottom left, rgba(16, 185, 129, 0.14), transparent 30%),
rgba(15, 23, 42, 0.74);
}
.stat-chip {
display: inline-block;
margin: 6px 8px 0 0;
padding: 8px 12px;
border-radius: 999px;
background: rgba(255, 255, 255, 0.06);
font-size: 0.92rem;
}
.footnote {
opacity: 0.78;
font-size: 0.94rem;
}
footer {
visibility: hidden;
}
"""


# Gradio "Soft" theme with indigo/emerald hues; passed to ``demo.launch(theme=...)``.
THEME = gr.themes.Soft(primary_hue="indigo", secondary_hue="emerald")
|
|
|
|
def load_examples() -> list[list[str]]:
    """Build the example rows shown in the example tabs.

    Reads ``PROMPTS_FILE`` as UTF-8 JSON and picks a fixed set of entries from
    its ``"hi_en"`` and ``"ta_en"`` lists, pairing each prompt text with the
    profile and voice preset it should be demonstrated with.

    Returns:
        Rows of ``[prompt text, profile name, voice preset name]``: three
        Hindi rows followed by three Tamil rows.

    Raises:
        KeyError, IndexError: If a selected language list or index is missing
            from the prompt bank.
    """
    prompt_bank = json.loads(PROMPTS_FILE.read_text(encoding="utf-8"))

    # (language list, index within that list, profile name, voice preset name)
    selections = (
        ("hi_en", 0, "Hindi Focus", "Hindi Research Voice"),
        ("hi_en", 9, "Hindi Focus", "Hindi Research Voice"),
        ("hi_en", 16, "Hindi Focus", "Hindi Research Voice"),
        ("ta_en", 0, "Tamil Focus", "Tamil Female Research Voice"),
        ("ta_en", 9, "Tamil Focus", "Tamil Female Research Voice"),
        ("ta_en", 14, "Tamil Focus", "Tamil Male Research Voice"),
    )
    return [
        [prompt_bank[bank][position]["text"], profile_name, voice_name]
        for bank, position, profile_name, voice_name in selections
    ]


# Example rows are loaded once, at import time.
EXAMPLES = load_examples()
|
|
|
|
def profile_markdown(profile_name: str) -> str:
    """Return the Markdown blurb for a model profile.

    Args:
        profile_name: A key of ``PROFILES``.

    Returns:
        The profile name in bold, followed on a new line by its description.

    Raises:
        KeyError: If ``profile_name`` is not a key of ``PROFILES``.
    """
    description = PROFILES[profile_name]["description"]
    # Two trailing spaces before "\n" make a Markdown hard line break; with a
    # single space the renderer joins both lines into one.
    return f"**{profile_name}**  \n{description}"
|
|
|
|
def voice_markdown(voice_name: str) -> str:
    """Return the Markdown blurb for a voice preset.

    Args:
        voice_name: A key of ``VOICE_PRESETS``.

    Returns:
        The preset name in bold and its summary on separate lines. Presets
        that have a reference clip (``path`` is not None) get a third line
        with the reference transcript.

    Raises:
        KeyError: If ``voice_name`` is not a key of ``VOICE_PRESETS``.
    """
    voice = VOICE_PRESETS[voice_name]
    # Two trailing spaces before "\n" make a Markdown hard line break; with a
    # single space the renderer joins the lines into one.
    header = f"**{voice_name}**  \n{voice['summary']}"
    if voice["path"] is None:
        return header
    return f"{header}  \nReference transcript: `{voice['transcript']}`"
|
|
|
|
def auth_headers() -> dict[str, str]:
    """Return the HTTP headers used to authenticate against the backend.

    Returns:
        ``{"x-api-key": BACKEND_TOKEN}`` when a token is configured, otherwise
        an empty dict. A fresh dict is returned on every call.
    """
    return {"x-api-key": BACKEND_TOKEN} if BACKEND_TOKEN else {}
|
|
|
|
def backend_status() -> str:
    """Return a Markdown summary of the backend's ``/health`` endpoint.

    Problems (missing configuration, unreachable backend, HTTP error status,
    malformed health payload) are reported in the returned text instead of
    being raised, so the status panel can always be rendered.

    Returns:
        Markdown text: a bold headline followed by one detail per line.
    """
    if not API_URL:
        return "**Backend Not Configured**  \nSet `INDICVOX_API_URL` in Space secrets."

    try:
        response = SESSION.get(f"{API_URL}/health", headers=auth_headers(), timeout=10)
        response.raise_for_status()
        payload = response.json()
        # The fields below are read with ``.get``; reject any other JSON shape
        # here so it is reported like every other failure.
        if not isinstance(payload, dict):
            raise ValueError(f"expected a JSON object, got {type(payload).__name__}")
    except Exception as exc:  # best-effort status panel: report, never crash the UI
        return (
            f"**Backend Unreachable**  \n"
            f"Endpoint: `{API_URL}`  \n"
            f"Error: `{type(exc).__name__}: {exc}`"
        )

    # Two trailing spaces before "\n" make Markdown hard line breaks.
    return (
        f"**VM Backend Ready**  \n"
        f"Endpoint: `{API_URL}`  \n"
        f"GPU: `{payload.get('gpu', 'unknown')}`  \n"
        f"Warm profile: `{payload.get('active_profile', 'unknown')}`  \n"
        f"Uptime: `{payload.get('uptime_s', 'unknown')}s`"
    )
|
|
|
|
def synthesize(text: str, profile_name: str, voice_name: str, cfg_value: float, inference_steps: int):
    """Request speech for ``text`` from the backend and save it as a WAV file.

    Args:
        text: Prompt to synthesize; surrounding whitespace is stripped.
        profile_name: Model profile name forwarded to the backend.
        voice_name: Voice preset name forwarded to the backend.
        cfg_value: CFG value; sent as a float.
        inference_steps: Number of inference steps; sent as an int.

    Returns:
        ``(audio_path, status)``: the path of a temporary ``.wav`` file holding
        the response body, and a Markdown summary built from the
        ``X-IndicVox-*`` response headers.

    Raises:
        gr.Error: If the prompt is empty, the backend URL is not configured,
            the request fails, the backend answers with an error status, or
            the response body is empty.
    """
    # Guard against a None prompt so the empty-prompt error is raised instead
    # of an AttributeError.
    clean_text = (text or "").strip()
    if not clean_text:
        raise gr.Error("Enter a prompt first.")
    if not API_URL:
        raise gr.Error("`INDICVOX_API_URL` is not configured on the Space.")

    try:
        response = SESSION.post(
            f"{API_URL}/synthesize",
            headers=auth_headers(),
            json={
                "text": clean_text,
                "profile_name": profile_name,
                "voice_name": voice_name,
                "cfg_value": float(cfg_value),
                "inference_steps": int(inference_steps),
            },
            timeout=TIMEOUT_S,
        )
    except requests.RequestException as exc:
        # Connection failures and timeouts become a readable UI error.
        raise gr.Error(f"Backend request failed: {type(exc).__name__}: {exc}") from exc

    if not response.ok:
        detail = response.text
        try:
            error_body = response.json()
        except ValueError:
            error_body = None  # non-JSON error body: fall back to the raw text
        if isinstance(error_body, dict):
            detail = error_body.get("detail", detail)
        raise gr.Error(f"Backend error {response.status_code}: {detail}")

    if not response.content:
        raise gr.Error("Backend returned an empty audio response.")

    # delete=False keeps the file on disk after the handle is closed, so the
    # returned path stays readable by the caller.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
        f.write(response.content)
        audio_path = f.name

    audio_seconds = response.headers.get("X-IndicVox-Audio-Seconds", "n/a")
    generation_seconds = response.headers.get("X-IndicVox-Generation-Seconds", "n/a")
    rtf = response.headers.get("X-IndicVox-RTF", "n/a")
    gpu = response.headers.get("X-IndicVox-GPU", "unknown")
    # Two trailing spaces before "\n" make Markdown hard line breaks.
    status = (
        f"**Ready**  \n"
        f"Profile: `{profile_name}`  \n"
        f"Voice: `{voice_name}`  \n"
        f"GPU backend: `{gpu}`  \n"
        f"Audio length: `{audio_seconds}s`  \n"
        f"Generation time: `{generation_seconds}s`  \n"
        f"RTF: `{rtf}`"
    )
    return audio_path, status
|
|
|
|
def voice_preview(voice_name: str):
    """Return the preview clip and Markdown blurb for a voice preset.

    Args:
        voice_name: A key of ``VOICE_PRESETS``.

    Returns:
        ``(preview_path, markdown)``: the reference clip path as a string, or
        None when the preset has no clip, plus the text produced by
        ``voice_markdown``.
    """
    reference_clip = VOICE_PRESETS[voice_name]["path"]
    if reference_clip is None:
        preview_path = None
    else:
        preview_path = str(reference_clip)
    return preview_path, voice_markdown(voice_name)
|
|
|
|
def clear_prompt() -> str:
    """Return the empty string used to reset the prompt textbox."""
    empty_prompt = ""
    return empty_prompt
|
|
|
|
# UI layout and event wiring. Components and listeners are created inside the
# Blocks context so they are registered on ``demo``.
with gr.Blocks() as demo:
    with gr.Column(elem_id="app-shell"):
        # Hero banner; its id and classes are styled by CUSTOM_CSS.
        gr.HTML(
            """
<div id="hero">
<h1>IndicVox</h1>
<p>Research demo for multilingual TTS across Hindi, Tamil, and code-switched prompts.</p>
<div>
<span class="stat-chip">HF Space frontend</span>
<span class="stat-chip">VM-hosted H100 backend</span>
<span class="stat-chip">Hindi + Tamil + English prompts</span>
</div>
</div>
"""
        )

        with gr.Row():
            # Left column: prompt, selectors, advanced settings, actions.
            with gr.Column(scale=5):
                prompt = gr.Textbox(
                    label="Prompt",
                    value=DEFAULT_TEXT,
                    lines=5,
                    max_lines=8,
                    placeholder="Type Hindi, Tamil, or code-switched text here...",
                )

                with gr.Row():
                    profile = gr.Dropdown(
                        choices=list(PROFILES.keys()),
                        value=DEFAULT_PROFILE,
                        label="Model Profile",
                        info="Switch between the Hindi-tuned and Tamil-tuned research profiles.",
                    )
                    voice = gr.Dropdown(
                        choices=list(VOICE_PRESETS.keys()),
                        value=DEFAULT_VOICE,
                        label="Voice Preset",
                        info="Built-in research voices plus a zero-shot option.",
                    )

                with gr.Accordion("Advanced Settings", open=False):
                    with gr.Row():
                        cfg_value = gr.Slider(
                            minimum=1.0,
                            maximum=4.0,
                            value=2.0,
                            step=0.1,
                            label="CFG",
                        )
                        inference_steps = gr.Slider(
                            minimum=6,
                            maximum=16,
                            value=10,
                            step=1,
                            label="Diffusion Steps",
                        )

                with gr.Row():
                    generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")
                    clear_btn = gr.Button("Clear Prompt")
                    refresh_btn = gr.Button("Refresh Backend Status")

                with gr.Row():
                    profile_info = gr.Markdown(profile_markdown(DEFAULT_PROFILE))
                    voice_info = gr.Markdown(voice_markdown(DEFAULT_VOICE))

            # Right column: backend status, synthesized audio, preset preview.
            with gr.Column(scale=4):
                # Start with a placeholder instead of calling backend_status()
                # here: that would block import on a network request, and the
                # demo.load handler below fills the panel on every page load.
                backend_info = gr.Markdown("Checking backend status...")
                output_audio = gr.Audio(
                    label="Synthesized Audio",
                    autoplay=False,
                    format="wav",
                )
                generation_info = gr.Markdown("Generate a sample to see timing details.")
                voice_preview_audio = gr.Audio(
                    label="Voice Preset Preview",
                    # voice_preview yields None (not the string "None") for a
                    # preset without a reference clip.
                    value=voice_preview(DEFAULT_VOICE)[0],
                    interactive=False,
                    autoplay=False,
                    format="wav",
                )
                gr.Markdown(
                    "Inference runs on the external VM GPU; the Space only provides the paper demo UI.",
                    elem_classes=["footnote"],
                )

        # One tab per language pair; EXAMPLES rows are split by profile name.
        with gr.Tabs():
            with gr.Tab("Hindi + English Examples"):
                gr.Examples(
                    examples=[row for row in EXAMPLES if row[1] == "Hindi Focus"],
                    inputs=[prompt, profile, voice],
                    cache_examples=False,
                )
            with gr.Tab("Tamil + English Examples"):
                gr.Examples(
                    examples=[row for row in EXAMPLES if row[1] == "Tamil Focus"],
                    inputs=[prompt, profile, voice],
                    cache_examples=False,
                )

        gr.Markdown(
            """
**Demo notes**

- `Hindi Focus` maps to the Hindi-strong checkpoint from the paper experiments.
- `Tamil Focus` maps to the Tamil + code-switch checkpoint and is the default for the demo.
- `Text Only` skips the reference clip and runs zero-shot synthesis.
""",
            elem_classes=["footnote"],
        )

    # Event wiring. Only the button-triggered synthesis is given an API name.
    demo.load(fn=backend_status, outputs=backend_info, api_name=False)
    generate_btn.click(
        fn=synthesize,
        inputs=[prompt, profile, voice, cfg_value, inference_steps],
        outputs=[output_audio, generation_info],
        api_name="synthesize",
    )
    prompt.submit(
        fn=synthesize,
        inputs=[prompt, profile, voice, cfg_value, inference_steps],
        outputs=[output_audio, generation_info],
        api_name=False,
    )
    profile.change(fn=profile_markdown, inputs=profile, outputs=profile_info, api_name=False)
    voice.change(fn=voice_preview, inputs=voice, outputs=[voice_preview_audio, voice_info], api_name=False)
    clear_btn.click(fn=clear_prompt, outputs=prompt, api_name=False)
    refresh_btn.click(fn=backend_status, outputs=backend_info, api_name=False)
|
|
# Configure the request queue. Per the Gradio docs, default_concurrency_limit
# is the concurrency limit for event listeners that do not set their own, and
# max_size caps how many events may wait in the queue.
demo.queue(default_concurrency_limit=2, max_size=32)


# Start the server only when this file is run as a script.
if __name__ == "__main__":
    # Theme and stylesheet are supplied at launch time.
    demo.launch(theme=THEME, css=CUSTOM_CSS)
|
|