"""Gradio frontend for the IndicVox multilingual TTS research demo.

The UI collects a prompt, a model profile and a voice preset, forwards them to
the HTTP backend configured through ``INDICVOX_API_URL`` and plays back the
returned WAV audio. No inference happens in this process.
"""

from __future__ import annotations

import json
import os
import tempfile
from pathlib import Path

import gradio as gr
import requests

APP_DIR = Path(__file__).resolve().parent
PROMPTS_FILE = APP_DIR / "code_switch_prompts.json"
VOICE_DIR = APP_DIR / "assets" / "voices"

# Backend location and optional shared secret come from the environment.
API_URL = os.getenv("INDICVOX_API_URL", "").rstrip("/")
BACKEND_TOKEN = os.getenv("INDICVOX_BACKEND_TOKEN", "")

DEFAULT_PROFILE = "Tamil Focus"
DEFAULT_VOICE = "Tamil Female Research Voice"
DEFAULT_TEXT = "இந்த experimentக்கு clean reference audio use பண்ணணும், இல்லனா output quality drop ஆகும்."

# Timeout (seconds) for the synthesis request; the health probe uses its own 10 s.
TIMEOUT_S = 600

# One shared session so repeated backend calls can reuse connections.
SESSION = requests.Session()

PROFILES = {
    "Tamil Focus": {
        "description": "Best for Tamil and Tamil-English code-switched prompts.",
    },
    "Hindi Focus": {
        "description": "Best for Hindi and Hindi-English code-switched prompts.",
    },
    "Research Baseline": {
        "description": "Base multilingual checkpoint without paper fine-tuning.",
    },
}

# "path"/"transcript" are None for the zero-shot preset that has no reference clip.
VOICE_PRESETS = {
    "Hindi Research Voice": {
        "path": VOICE_DIR / "hin_m_ref_00.wav",
        "transcript": "लेकिन क्या यह हम सभी कार्यक्रमों के साथ कर सकते?",
        "summary": "Short Hindi reference used for sharper Hindi + English prompting.",
    },
    "Tamil Female Research Voice": {
        "path": VOICE_DIR / "tam_f_ref_00.wav",
        "transcript": "விக்கற நேரத்தையும் லாபத்தையும் பொறுத்து, இந்த டேக்ஸை ஷார்ட் டேர்ம் இல்ல லாங் டேர்ம்னு பிரிப்பாங்க.",
        "summary": "Clear Tamil reference with stable conversational prosody.",
    },
    "Tamil Male Research Voice": {
        "path": VOICE_DIR / "tam_m_ref_00.wav",
        "transcript": "கொரோனா பாதிப்பு காலத்தில் எண்பது கோடி மக்களுக்கு உணவு தானியம் வழங்கப்பட்டதாகவும் அவர் தெரிவித்தார்.",
        "summary": "Tamil male reference that holds rhythm well on longer prompts.",
    },
    "Text Only": {
        "path": None,
        "transcript": None,
        "summary": "Zero-shot generation without a reference voice clip.",
    },
}

# Stylesheet handed to Gradio. Adjacent literals are concatenated so the
# runtime value is unchanged while the source stays within sane line lengths.
CUSTOM_CSS = (
    " #app-shell { max-width: 1180px; margin: 0 auto; }"
    " #hero { padding: 24px 26px 12px 26px;"
    " border: 1px solid rgba(255, 255, 255, 0.08);"
    " border-radius: 22px;"
    " background: \n"
    "radial-gradient(circle at top right, rgba(99, 102, 241, 0.16), transparent 34%),"
    " radial-gradient(circle at bottom left, rgba(16, 185, 129, 0.14), transparent 30%),"
    " rgba(15, 23, 42, 0.74); }"
    " .stat-chip { display: inline-block; margin: 6px 8px 0 0; padding: 8px 12px;"
    " border-radius: 999px; background: rgba(255, 255, 255, 0.06); font-size: 0.92rem; }"
    " .footnote { opacity: 0.78; font-size: 0.94rem; }"
    " footer { visibility: hidden; } "
)

THEME = gr.themes.Soft(primary_hue="indigo", secondary_hue="emerald")

# Hero banner. The element id and chip class are what the `#hero` and
# `.stat-chip` rules in CUSTOM_CSS select; without this markup those rules
# match nothing and the three chips run together as one sentence.
# NOTE(review): the heading/paragraph tags are a reconstruction; confirm
# against the intended design.
HERO_HTML = """
<div id="hero">
  <h1>IndicVox</h1>
  <p>Research demo for multilingual TTS across Hindi, Tamil, and code-switched prompts.</p>
  <div>
    <span class="stat-chip">HF Space frontend</span>
    <span class="stat-chip">VM-hosted H100 backend</span>
    <span class="stat-chip">Hindi + Tamil + English prompts</span>
  </div>
</div>
"""

# Kept at column 0 with one bullet per line: Markdown only recognises list
# items that start a line, and indented text could be read as a code block.
DEMO_NOTES_MD = """
**Demo notes**

- `Hindi Focus` maps to the Hindi-strong checkpoint from the paper experiments.
- `Tamil Focus` maps to the Tamil + code-switch checkpoint and is the default for the demo.
- `Text Only` skips the reference clip and runs zero-shot synthesis.
"""

# (prompt-bank key, index into that list, model profile, voice preset)
_EXAMPLE_SPECS = (
    ("hi_en", 0, "Hindi Focus", "Hindi Research Voice"),
    ("hi_en", 9, "Hindi Focus", "Hindi Research Voice"),
    ("hi_en", 16, "Hindi Focus", "Hindi Research Voice"),
    ("ta_en", 0, "Tamil Focus", "Tamil Female Research Voice"),
    ("ta_en", 9, "Tamil Focus", "Tamil Female Research Voice"),
    ("ta_en", 14, "Tamil Focus", "Tamil Male Research Voice"),
)


def _md_lines(*lines: str) -> str:
    """Join *lines* with Markdown hard line breaks.

    A hard break is two trailing spaces followed by a newline; a single
    trailing space is only a soft break and the lines would be rendered as one
    paragraph.
    """
    return "  \n".join(lines)


def load_examples() -> list[list[str]]:
    """Read the prompt bank and build the ``[text, profile, voice]`` example rows.

    Raises:
        OSError: if ``PROMPTS_FILE`` cannot be opened.
        KeyError, IndexError: if the prompt bank lacks an entry listed in
            ``_EXAMPLE_SPECS``.
    """
    with PROMPTS_FILE.open("r", encoding="utf-8") as f:
        prompt_bank = json.load(f)
    return [
        [prompt_bank[bank_key][index]["text"], profile_name, voice_name]
        for bank_key, index, profile_name, voice_name in _EXAMPLE_SPECS
    ]


EXAMPLES = load_examples()


def profile_markdown(profile_name: str) -> str:
    """Return the Markdown card (bold name + description) for a model profile."""
    return _md_lines(f"**{profile_name}**", PROFILES[profile_name]["description"])


def voice_markdown(voice_name: str) -> str:
    """Return the Markdown card for a voice preset.

    Presets without a reference clip show only name and summary; the others
    also show the reference transcript.
    """
    voice = VOICE_PRESETS[voice_name]
    card = [f"**{voice_name}**", voice["summary"]]
    if voice["path"] is not None:
        card.append(f"Reference transcript: `{voice['transcript']}`")
    return _md_lines(*card)


def auth_headers() -> dict[str, str]:
    """Return request headers, including ``x-api-key`` when a token is configured."""
    if BACKEND_TOKEN:
        return {"x-api-key": BACKEND_TOKEN}
    return {}


def backend_status() -> str:
    """Probe ``{API_URL}/health`` and return a Markdown status card.

    Never raises: any failure is reported in the returned text so the status
    panel can always render.
    """
    if not API_URL:
        return _md_lines(
            "**Backend Not Configured**",
            "Set `INDICVOX_API_URL` in Space secrets.",
        )

    try:
        response = SESSION.get(f"{API_URL}/health", headers=auth_headers(), timeout=10)
        response.raise_for_status()
        payload = response.json()
    except Exception as exc:  # deliberate catch-all: this is a best-effort UI probe
        return _md_lines(
            "**Backend Unreachable**",
            f"Endpoint: `{API_URL}`",
            f"Error: `{type(exc).__name__}: {exc}`",
        )

    # A health endpoint that answers with non-object JSON must not crash the
    # panel; fall back to the "unknown" placeholders instead.
    if not isinstance(payload, dict):
        payload = {}

    return _md_lines(
        "**VM Backend Ready**",
        f"Endpoint: `{API_URL}`",
        f"GPU: `{payload.get('gpu', 'unknown')}`",
        f"Warm profile: `{payload.get('active_profile', 'unknown')}`",
        f"Uptime: `{payload.get('uptime_s', 'unknown')}s`",
    )


def _error_detail(response: requests.Response) -> str:
    """Extract a human-readable error detail from a failed backend response.

    Uses the ``detail`` field of a JSON object body when present and falls back
    to the raw response text otherwise.
    """
    try:
        payload = response.json()
    except ValueError:  # body is not JSON (requests' JSONDecodeError is a ValueError)
        return response.text
    if isinstance(payload, dict):
        return str(payload.get("detail", response.text))
    return response.text


def synthesize(
    text: str,
    profile_name: str,
    voice_name: str,
    cfg_value: float,
    inference_steps: int,
) -> tuple[str, str]:
    """Send a synthesis request to the backend and return the audio plus a status card.

    Args:
        text: Prompt to speak; surrounding whitespace is ignored.
        profile_name: Model profile name forwarded to the backend.
        voice_name: Voice preset name forwarded to the backend.
        cfg_value: CFG slider value, sent as a float.
        inference_steps: Diffusion step count, sent as an int.

    Returns:
        ``(audio_path, status_markdown)`` where ``audio_path`` is a temporary
        ``.wav`` file holding the response body.

    Raises:
        gr.Error: if the prompt is empty, the backend URL is not configured,
            the request cannot be completed, or the backend answers with an
            error status or an empty body.
    """
    clean_text = (text or "").strip()
    if not clean_text:
        raise gr.Error("Enter a prompt first.")
    if not API_URL:
        raise gr.Error("`INDICVOX_API_URL` is not configured on the Space.")

    request_body = {
        "text": clean_text,
        "profile_name": profile_name,
        "voice_name": voice_name,
        "cfg_value": float(cfg_value),
        "inference_steps": int(inference_steps),
    }
    try:
        response = SESSION.post(
            f"{API_URL}/synthesize",
            headers=auth_headers(),
            json=request_body,
            timeout=TIMEOUT_S,
        )
    except requests.RequestException as exc:
        # Connection failures and timeouts should reach the user as a readable
        # message rather than an unhandled traceback.
        raise gr.Error(f"Backend request failed: {type(exc).__name__}: {exc}") from exc

    if not response.ok:
        raise gr.Error(f"Backend error {response.status_code}: {_error_detail(response)}")
    if not response.content:
        raise gr.Error("Backend returned an empty audio response.")

    # delete=False: the path is handed to the audio component after this
    # function returns, so the file must outlive the `with` block.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
        f.write(response.content)
        audio_path = f.name

    # Timing metadata is read from response headers, with placeholders when absent.
    headers = response.headers
    status = _md_lines(
        "**Ready**",
        f"Profile: `{profile_name}`",
        f"Voice: `{voice_name}`",
        f"GPU backend: `{headers.get('X-IndicVox-GPU', 'unknown')}`",
        f"Audio length: `{headers.get('X-IndicVox-Audio-Seconds', 'n/a')}s`",
        f"Generation time: `{headers.get('X-IndicVox-Generation-Seconds', 'n/a')}s`",
        f"RTF: `{headers.get('X-IndicVox-RTF', 'n/a')}`",
    )
    return audio_path, status


def _voice_preview_path(voice_name: str) -> str | None:
    """Return the preset's reference clip path as a string, or None if it has none."""
    clip = VOICE_PRESETS[voice_name]["path"]
    return None if clip is None else str(clip)


def voice_preview(voice_name: str) -> tuple[str | None, str]:
    """Return ``(preview_audio_path_or_None, voice_card_markdown)`` for a preset."""
    return _voice_preview_path(voice_name), voice_markdown(voice_name)


def clear_prompt() -> str:
    """Return the empty string used to reset the prompt box."""
    return ""


# NOTE(review): the layout nesting below is reconstructed from the order of the
# component calls; confirm it matches the intended design.
with gr.Blocks() as demo:
    with gr.Column(elem_id="app-shell"):
        gr.HTML(HERO_HTML)

        with gr.Row():
            with gr.Column(scale=5):
                prompt = gr.Textbox(
                    label="Prompt",
                    value=DEFAULT_TEXT,
                    lines=5,
                    max_lines=8,
                    placeholder="Type Hindi, Tamil, or code-switched text here...",
                )
                with gr.Row():
                    profile = gr.Dropdown(
                        choices=list(PROFILES.keys()),
                        value=DEFAULT_PROFILE,
                        label="Model Profile",
                        info="Switch between the Hindi-tuned and Tamil-tuned research profiles.",
                    )
                    voice = gr.Dropdown(
                        choices=list(VOICE_PRESETS.keys()),
                        value=DEFAULT_VOICE,
                        label="Voice Preset",
                        info="Built-in research voices plus a zero-shot option.",
                    )
                with gr.Accordion("Advanced Settings", open=False):
                    with gr.Row():
                        cfg_value = gr.Slider(
                            minimum=1.0,
                            maximum=4.0,
                            value=2.0,
                            step=0.1,
                            label="CFG",
                        )
                        inference_steps = gr.Slider(
                            minimum=6,
                            maximum=16,
                            value=10,
                            step=1,
                            label="Diffusion Steps",
                        )
                with gr.Row():
                    generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")
                    clear_btn = gr.Button("Clear Prompt")
                    refresh_btn = gr.Button("Refresh Backend Status")
                with gr.Row():
                    profile_info = gr.Markdown(profile_markdown(DEFAULT_PROFILE))
                    voice_info = gr.Markdown(voice_markdown(DEFAULT_VOICE))

            with gr.Column(scale=4):
                # Static placeholder: probing the backend here would block
                # module import on a network call, and `demo.load` below
                # fills this panel on every page load anyway.
                backend_info = gr.Markdown("Checking backend status...")
                output_audio = gr.Audio(
                    label="Synthesized Audio",
                    autoplay=False,
                    format="wav",
                )
                generation_info = gr.Markdown("Generate a sample to see timing details.")
                voice_preview_audio = gr.Audio(
                    label="Voice Preset Preview",
                    # Same helper as the change handler, so a preset without a
                    # clip yields None rather than the string "None".
                    value=_voice_preview_path(DEFAULT_VOICE),
                    interactive=False,
                    autoplay=False,
                    format="wav",
                )
                gr.Markdown(
                    "Inference runs on the external VM GPU; the Space only provides the paper demo UI.",
                    elem_classes=["footnote"],
                )

        with gr.Tabs():
            with gr.Tab("Hindi + English Examples"):
                gr.Examples(
                    examples=[row for row in EXAMPLES if row[1] == "Hindi Focus"],
                    inputs=[prompt, profile, voice],
                    cache_examples=False,
                )
            with gr.Tab("Tamil + English Examples"):
                gr.Examples(
                    examples=[row for row in EXAMPLES if row[1] == "Tamil Focus"],
                    inputs=[prompt, profile, voice],
                    cache_examples=False,
                )

        gr.Markdown(DEMO_NOTES_MD, elem_classes=["footnote"])

    # Event wiring. Only the button click is exposed as a named API endpoint.
    synth_inputs = [prompt, profile, voice, cfg_value, inference_steps]
    synth_outputs = [output_audio, generation_info]

    demo.load(fn=backend_status, outputs=backend_info, api_name=False)
    generate_btn.click(
        fn=synthesize,
        inputs=synth_inputs,
        outputs=synth_outputs,
        api_name="synthesize",
    )
    prompt.submit(
        fn=synthesize,
        inputs=synth_inputs,
        outputs=synth_outputs,
        api_name=False,
    )
    profile.change(fn=profile_markdown, inputs=profile, outputs=profile_info, api_name=False)
    voice.change(
        fn=voice_preview,
        inputs=voice,
        outputs=[voice_preview_audio, voice_info],
        api_name=False,
    )
    clear_btn.click(fn=clear_prompt, outputs=prompt, api_name=False)
    refresh_btn.click(fn=backend_status, outputs=backend_info, api_name=False)

demo.queue(default_concurrency_limit=2, max_size=32)

if __name__ == "__main__":
    # NOTE(review): theme/css are passed to launch() exactly as before; this
    # presumes a Gradio release whose launch() accepts them — confirm against
    # the pinned version.
    demo.launch(theme=THEME, css=CUSTOM_CSS)