# himahande45's picture
# Switch Space to VM-backed frontend
# 0e9e909 verified
from __future__ import annotations
import json
import os
import tempfile
from pathlib import Path
import gradio as gr
import requests
APP_DIR = Path(__file__).resolve().parent
PROMPTS_FILE = APP_DIR / "code_switch_prompts.json"
VOICE_DIR = APP_DIR / "assets" / "voices"
API_URL = os.getenv("INDICVOX_API_URL", "").rstrip("/")
BACKEND_TOKEN = os.getenv("INDICVOX_BACKEND_TOKEN", "")
DEFAULT_PROFILE = "Tamil Focus"
DEFAULT_VOICE = "Tamil Female Research Voice"
DEFAULT_TEXT = "இந்த experimentக்கு clean reference audio use பண்ணணும், இல்லனா output quality drop ஆகும்."
TIMEOUT_S = 600
SESSION = requests.Session()
PROFILES = {
"Tamil Focus": {
"description": "Best for Tamil and Tamil-English code-switched prompts.",
},
"Hindi Focus": {
"description": "Best for Hindi and Hindi-English code-switched prompts.",
},
"Research Baseline": {
"description": "Base multilingual checkpoint without paper fine-tuning.",
},
}
VOICE_PRESETS = {
"Hindi Research Voice": {
"path": VOICE_DIR / "hin_m_ref_00.wav",
"transcript": "लेकिन क्या यह हम सभी कार्यक्रमों के साथ कर सकते?",
"summary": "Short Hindi reference used for sharper Hindi + English prompting.",
},
"Tamil Female Research Voice": {
"path": VOICE_DIR / "tam_f_ref_00.wav",
"transcript": "விக்கற நேரத்தையும் லாபத்தையும் பொறுத்து, இந்த டேக்ஸை ஷார்ட் டேர்ம் இல்ல லாங் டேர்ம்னு பிரிப்பாங்க.",
"summary": "Clear Tamil reference with stable conversational prosody.",
},
"Tamil Male Research Voice": {
"path": VOICE_DIR / "tam_m_ref_00.wav",
"transcript": "கொரோனா பாதிப்பு காலத்தில் எண்பது கோடி மக்களுக்கு உணவு தானியம் வழங்கப்பட்டதாகவும் அவர் தெரிவித்தார்.",
"summary": "Tamil male reference that holds rhythm well on longer prompts.",
},
"Text Only": {
"path": None,
"transcript": None,
"summary": "Zero-shot generation without a reference voice clip.",
},
}
CUSTOM_CSS = """
#app-shell {
max-width: 1180px;
margin: 0 auto;
}
#hero {
padding: 24px 26px 12px 26px;
border: 1px solid rgba(255, 255, 255, 0.08);
border-radius: 22px;
background:
radial-gradient(circle at top right, rgba(99, 102, 241, 0.16), transparent 34%),
radial-gradient(circle at bottom left, rgba(16, 185, 129, 0.14), transparent 30%),
rgba(15, 23, 42, 0.74);
}
.stat-chip {
display: inline-block;
margin: 6px 8px 0 0;
padding: 8px 12px;
border-radius: 999px;
background: rgba(255, 255, 255, 0.06);
font-size: 0.92rem;
}
.footnote {
opacity: 0.78;
font-size: 0.94rem;
}
footer {
visibility: hidden;
}
"""
THEME = gr.themes.Soft(primary_hue="indigo", secondary_hue="emerald")
def load_examples() -> list[list[str]]:
with PROMPTS_FILE.open("r", encoding="utf-8") as f:
prompt_bank = json.load(f)
return [
[prompt_bank["hi_en"][0]["text"], "Hindi Focus", "Hindi Research Voice"],
[prompt_bank["hi_en"][9]["text"], "Hindi Focus", "Hindi Research Voice"],
[prompt_bank["hi_en"][16]["text"], "Hindi Focus", "Hindi Research Voice"],
[prompt_bank["ta_en"][0]["text"], "Tamil Focus", "Tamil Female Research Voice"],
[prompt_bank["ta_en"][9]["text"], "Tamil Focus", "Tamil Female Research Voice"],
[prompt_bank["ta_en"][14]["text"], "Tamil Focus", "Tamil Male Research Voice"],
]
EXAMPLES = load_examples()
def profile_markdown(profile_name: str) -> str:
return f"**{profile_name}** \n{PROFILES[profile_name]['description']}"
def voice_markdown(voice_name: str) -> str:
voice = VOICE_PRESETS[voice_name]
if voice["path"] is None:
return f"**{voice_name}** \n{voice['summary']}"
return (
f"**{voice_name}** \n"
f"{voice['summary']} \n"
f"Reference transcript: `{voice['transcript']}`"
)
def auth_headers() -> dict[str, str]:
headers: dict[str, str] = {}
if BACKEND_TOKEN:
headers["x-api-key"] = BACKEND_TOKEN
return headers
def backend_status() -> str:
if not API_URL:
return "**Backend Not Configured** \nSet `INDICVOX_API_URL` in Space secrets."
try:
response = SESSION.get(f"{API_URL}/health", headers=auth_headers(), timeout=10)
response.raise_for_status()
payload = response.json()
except Exception as exc:
return (
f"**Backend Unreachable** \n"
f"Endpoint: `{API_URL}` \n"
f"Error: `{type(exc).__name__}: {exc}`"
)
return (
f"**VM Backend Ready** \n"
f"Endpoint: `{API_URL}` \n"
f"GPU: `{payload.get('gpu', 'unknown')}` \n"
f"Warm profile: `{payload.get('active_profile', 'unknown')}` \n"
f"Uptime: `{payload.get('uptime_s', 'unknown')}s`"
)
def synthesize(text: str, profile_name: str, voice_name: str, cfg_value: float, inference_steps: int):
clean_text = text.strip()
if not clean_text:
raise gr.Error("Enter a prompt first.")
if not API_URL:
raise gr.Error("`INDICVOX_API_URL` is not configured on the Space.")
response = SESSION.post(
f"{API_URL}/synthesize",
headers=auth_headers(),
json={
"text": clean_text,
"profile_name": profile_name,
"voice_name": voice_name,
"cfg_value": float(cfg_value),
"inference_steps": int(inference_steps),
},
timeout=TIMEOUT_S,
)
if not response.ok:
detail = response.text
try:
detail = response.json().get("detail", detail)
except Exception:
pass
raise gr.Error(f"Backend error {response.status_code}: {detail}")
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
f.write(response.content)
audio_path = f.name
audio_seconds = response.headers.get("X-IndicVox-Audio-Seconds", "n/a")
generation_seconds = response.headers.get("X-IndicVox-Generation-Seconds", "n/a")
rtf = response.headers.get("X-IndicVox-RTF", "n/a")
gpu = response.headers.get("X-IndicVox-GPU", "unknown")
status = (
f"**Ready** \n"
f"Profile: `{profile_name}` \n"
f"Voice: `{voice_name}` \n"
f"GPU backend: `{gpu}` \n"
f"Audio length: `{audio_seconds}s` \n"
f"Generation time: `{generation_seconds}s` \n"
f"RTF: `{rtf}`"
)
return audio_path, status
def voice_preview(voice_name: str):
voice = VOICE_PRESETS[voice_name]
preview_path = str(voice["path"]) if voice["path"] is not None else None
return preview_path, voice_markdown(voice_name)
def clear_prompt() -> str:
return ""
with gr.Blocks() as demo:
with gr.Column(elem_id="app-shell"):
gr.HTML(
"""
<div id="hero">
<h1>IndicVox</h1>
<p>Research demo for multilingual TTS across Hindi, Tamil, and code-switched prompts.</p>
<div>
<span class="stat-chip">HF Space frontend</span>
<span class="stat-chip">VM-hosted H100 backend</span>
<span class="stat-chip">Hindi + Tamil + English prompts</span>
</div>
</div>
"""
)
with gr.Row():
with gr.Column(scale=5):
prompt = gr.Textbox(
label="Prompt",
value=DEFAULT_TEXT,
lines=5,
max_lines=8,
placeholder="Type Hindi, Tamil, or code-switched text here...",
)
with gr.Row():
profile = gr.Dropdown(
choices=list(PROFILES.keys()),
value=DEFAULT_PROFILE,
label="Model Profile",
info="Switch between the Hindi-tuned and Tamil-tuned research profiles.",
)
voice = gr.Dropdown(
choices=list(VOICE_PRESETS.keys()),
value=DEFAULT_VOICE,
label="Voice Preset",
info="Built-in research voices plus a zero-shot option.",
)
with gr.Accordion("Advanced Settings", open=False):
with gr.Row():
cfg_value = gr.Slider(
minimum=1.0,
maximum=4.0,
value=2.0,
step=0.1,
label="CFG",
)
inference_steps = gr.Slider(
minimum=6,
maximum=16,
value=10,
step=1,
label="Diffusion Steps",
)
with gr.Row():
generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")
clear_btn = gr.Button("Clear Prompt")
refresh_btn = gr.Button("Refresh Backend Status")
with gr.Row():
profile_info = gr.Markdown(profile_markdown(DEFAULT_PROFILE))
voice_info = gr.Markdown(voice_markdown(DEFAULT_VOICE))
with gr.Column(scale=4):
backend_info = gr.Markdown(backend_status())
output_audio = gr.Audio(
label="Synthesized Audio",
autoplay=False,
format="wav",
)
generation_info = gr.Markdown("Generate a sample to see timing details.")
voice_preview_audio = gr.Audio(
label="Voice Preset Preview",
value=str(VOICE_PRESETS[DEFAULT_VOICE]["path"]),
interactive=False,
autoplay=False,
format="wav",
)
gr.Markdown(
"Inference runs on the external VM GPU; the Space only provides the paper demo UI.",
elem_classes=["footnote"],
)
with gr.Tabs():
with gr.Tab("Hindi + English Examples"):
gr.Examples(
examples=[row for row in EXAMPLES if row[1] == "Hindi Focus"],
inputs=[prompt, profile, voice],
cache_examples=False,
)
with gr.Tab("Tamil + English Examples"):
gr.Examples(
examples=[row for row in EXAMPLES if row[1] == "Tamil Focus"],
inputs=[prompt, profile, voice],
cache_examples=False,
)
gr.Markdown(
"""
**Demo notes**
- `Hindi Focus` maps to the Hindi-strong checkpoint from the paper experiments.
- `Tamil Focus` maps to the Tamil + code-switch checkpoint and is the default for the demo.
- `Text Only` skips the reference clip and runs zero-shot synthesis.
""",
elem_classes=["footnote"],
)
demo.load(fn=backend_status, outputs=backend_info, api_name=False)
generate_btn.click(
fn=synthesize,
inputs=[prompt, profile, voice, cfg_value, inference_steps],
outputs=[output_audio, generation_info],
api_name="synthesize",
)
prompt.submit(
fn=synthesize,
inputs=[prompt, profile, voice, cfg_value, inference_steps],
outputs=[output_audio, generation_info],
api_name=False,
)
profile.change(fn=profile_markdown, inputs=profile, outputs=profile_info, api_name=False)
voice.change(fn=voice_preview, inputs=voice, outputs=[voice_preview_audio, voice_info], api_name=False)
clear_btn.click(fn=clear_prompt, outputs=prompt, api_name=False)
refresh_btn.click(fn=backend_status, outputs=backend_info, api_name=False)
demo.queue(default_concurrency_limit=2, max_size=32)
if __name__ == "__main__":
demo.launch(theme=THEME, css=CUSTOM_CSS)