from __future__ import annotations import json import os import tempfile from pathlib import Path import gradio as gr import requests APP_DIR = Path(__file__).resolve().parent PROMPTS_FILE = APP_DIR / "code_switch_prompts.json" VOICE_DIR = APP_DIR / "assets" / "voices" API_URL = os.getenv("INDICVOX_API_URL", "").rstrip("/") BACKEND_TOKEN = os.getenv("INDICVOX_BACKEND_TOKEN", "") DEFAULT_PROFILE = "Tamil Focus" DEFAULT_VOICE = "Tamil Female Research Voice" DEFAULT_TEXT = "இந்த experimentக்கு clean reference audio use பண்ணணும், இல்லனா output quality drop ஆகும்." TIMEOUT_S = 600 SESSION = requests.Session() PROFILES = { "Tamil Focus": { "description": "Best for Tamil and Tamil-English code-switched prompts.", }, "Hindi Focus": { "description": "Best for Hindi and Hindi-English code-switched prompts.", }, "Research Baseline": { "description": "Base multilingual checkpoint without paper fine-tuning.", }, } VOICE_PRESETS = { "Hindi Research Voice": { "path": VOICE_DIR / "hin_m_ref_00.wav", "transcript": "लेकिन क्या यह हम सभी कार्यक्रमों के साथ कर सकते?", "summary": "Short Hindi reference used for sharper Hindi + English prompting.", }, "Tamil Female Research Voice": { "path": VOICE_DIR / "tam_f_ref_00.wav", "transcript": "விக்கற நேரத்தையும் லாபத்தையும் பொறுத்து, இந்த டேக்ஸை ஷார்ட் டேர்ம் இல்ல லாங் டேர்ம்னு பிரிப்பாங்க.", "summary": "Clear Tamil reference with stable conversational prosody.", }, "Tamil Male Research Voice": { "path": VOICE_DIR / "tam_m_ref_00.wav", "transcript": "கொரோனா பாதிப்பு காலத்தில் எண்பது கோடி மக்களுக்கு உணவு தானியம் வழங்கப்பட்டதாகவும் அவர் தெரிவித்தார்.", "summary": "Tamil male reference that holds rhythm well on longer prompts.", }, "Text Only": { "path": None, "transcript": None, "summary": "Zero-shot generation without a reference voice clip.", }, } CUSTOM_CSS = """ #app-shell { max-width: 1180px; margin: 0 auto; } #hero { padding: 24px 26px 12px 26px; border: 1px solid rgba(255, 255, 255, 0.08); border-radius: 22px; background: radial-gradient(circle at top right, rgba(99, 102, 241, 0.16), transparent 34%), radial-gradient(circle at bottom left, rgba(16, 185, 129, 0.14), transparent 30%), rgba(15, 23, 42, 0.74); } .stat-chip { display: inline-block; margin: 6px 8px 0 0; padding: 8px 12px; border-radius: 999px; background: rgba(255, 255, 255, 0.06); font-size: 0.92rem; } .footnote { opacity: 0.78; font-size: 0.94rem; } footer { visibility: hidden; } """ THEME = gr.themes.Soft(primary_hue="indigo", secondary_hue="emerald") def load_examples() -> list[list[str]]: with PROMPTS_FILE.open("r", encoding="utf-8") as f: prompt_bank = json.load(f) return [ [prompt_bank["hi_en"][0]["text"], "Hindi Focus", "Hindi Research Voice"], [prompt_bank["hi_en"][9]["text"], "Hindi Focus", "Hindi Research Voice"], [prompt_bank["hi_en"][16]["text"], "Hindi Focus", "Hindi Research Voice"], [prompt_bank["ta_en"][0]["text"], "Tamil Focus", "Tamil Female Research Voice"], [prompt_bank["ta_en"][9]["text"], "Tamil Focus", "Tamil Female Research Voice"], [prompt_bank["ta_en"][14]["text"], "Tamil Focus", "Tamil Male Research Voice"], ] EXAMPLES = load_examples() def profile_markdown(profile_name: str) -> str: return f"**{profile_name}** \n{PROFILES[profile_name]['description']}" def voice_markdown(voice_name: str) -> str: voice = VOICE_PRESETS[voice_name] if voice["path"] is None: return f"**{voice_name}** \n{voice['summary']}" return ( f"**{voice_name}** \n" f"{voice['summary']} \n" f"Reference transcript: `{voice['transcript']}`" ) def auth_headers() -> dict[str, str]: headers: dict[str, str] = {} if BACKEND_TOKEN: headers["x-api-key"] = BACKEND_TOKEN return headers def backend_status() -> str: if not API_URL: return "**Backend Not Configured** \nSet `INDICVOX_API_URL` in Space secrets." try: response = SESSION.get(f"{API_URL}/health", headers=auth_headers(), timeout=10) response.raise_for_status() payload = response.json() except Exception as exc: return ( f"**Backend Unreachable** \n" f"Endpoint: `{API_URL}` \n" f"Error: `{type(exc).__name__}: {exc}`" ) return ( f"**VM Backend Ready** \n" f"Endpoint: `{API_URL}` \n" f"GPU: `{payload.get('gpu', 'unknown')}` \n" f"Warm profile: `{payload.get('active_profile', 'unknown')}` \n" f"Uptime: `{payload.get('uptime_s', 'unknown')}s`" ) def synthesize(text: str, profile_name: str, voice_name: str, cfg_value: float, inference_steps: int): clean_text = text.strip() if not clean_text: raise gr.Error("Enter a prompt first.") if not API_URL: raise gr.Error("`INDICVOX_API_URL` is not configured on the Space.") response = SESSION.post( f"{API_URL}/synthesize", headers=auth_headers(), json={ "text": clean_text, "profile_name": profile_name, "voice_name": voice_name, "cfg_value": float(cfg_value), "inference_steps": int(inference_steps), }, timeout=TIMEOUT_S, ) if not response.ok: detail = response.text try: detail = response.json().get("detail", detail) except Exception: pass raise gr.Error(f"Backend error {response.status_code}: {detail}") with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f: f.write(response.content) audio_path = f.name audio_seconds = response.headers.get("X-IndicVox-Audio-Seconds", "n/a") generation_seconds = response.headers.get("X-IndicVox-Generation-Seconds", "n/a") rtf = response.headers.get("X-IndicVox-RTF", "n/a") gpu = response.headers.get("X-IndicVox-GPU", "unknown") status = ( f"**Ready** \n" f"Profile: `{profile_name}` \n" f"Voice: `{voice_name}` \n" f"GPU backend: `{gpu}` \n" f"Audio length: `{audio_seconds}s` \n" f"Generation time: `{generation_seconds}s` \n" f"RTF: `{rtf}`" ) return audio_path, status def voice_preview(voice_name: str): voice = VOICE_PRESETS[voice_name] preview_path = str(voice["path"]) if voice["path"] is not None else None return preview_path, voice_markdown(voice_name) def clear_prompt() -> str: return "" with gr.Blocks() as demo: with gr.Column(elem_id="app-shell"): gr.HTML( """
Research demo for multilingual TTS across Hindi, Tamil, and code-switched prompts.