# himahande45's picture
# Switch Space to VM-backed frontend
# 0e9e909 verified
from __future__ import annotations
import json
import os
import tempfile
from pathlib import Path
import gradio as gr
import requests
APP_DIR = Path(__file__).resolve().parent
PROMPTS_FILE = APP_DIR / "code_switch_prompts.json"
VOICE_DIR = APP_DIR / "assets" / "voices"
API_URL = os.getenv("INDICVOX_API_URL", "").rstrip("/")
BACKEND_TOKEN = os.getenv("INDICVOX_BACKEND_TOKEN", "")
DEFAULT_PROFILE = "Tamil Focus"
DEFAULT_VOICE = "Tamil Female Research Voice"
DEFAULT_TEXT = "இந்த experimentக்கு clean reference audio use பண்ணணும், இல்லனா output quality drop ஆகும்."
TIMEOUT_S = 600
SESSION = requests.Session()
PROFILES = {
"Tamil Focus": {
"description": "Best for Tamil and Tamil-English code-switched prompts.",
},
"Hindi Focus": {
"description": "Best for Hindi and Hindi-English code-switched prompts.",
},
"Research Baseline": {
"description": "Base multilingual checkpoint without paper fine-tuning.",
},
}
VOICE_PRESETS = {
"Hindi Research Voice": {
"path": VOICE_DIR / "hin_m_ref_00.wav",
"transcript": "लेकिन क्या यह हम सभी कार्यक्रमों के साथ कर सकते?",
"summary": "Short Hindi reference used for sharper Hindi + English prompting.",
},
"Tamil Female Research Voice": {
"path": VOICE_DIR / "tam_f_ref_00.wav",
"transcript": "விக்கற நேரத்தையும் லாபத்தையும் பொறுத்து, இந்த டேக்ஸை ஷார்ட் டேர்ம் இல்ல லாங் டேர்ம்னு பிரிப்பாங்க.",
"summary": "Clear Tamil reference with stable conversational prosody.",
},
"Tamil Male Research Voice": {
"path": VOICE_DIR / "tam_m_ref_00.wav",
"transcript": "கொரோனா பாதிப்பு காலத்தில் எண்பது கோடி மக்களுக்கு உணவு தானியம் வழங்கப்பட்டதாகவும் அவர் தெரிவித்தார்.",
"summary": "Tamil male reference that holds rhythm well on longer prompts.",
},
"Text Only": {
"path": None,
"transcript": None,
"summary": "Zero-shot generation without a reference voice clip.",
},
}
CUSTOM_CSS = """
#app-shell {
max-width: 1180px;
margin: 0 auto;
}
#hero {
padding: 24px 26px 12px 26px;
border: 1px solid rgba(255, 255, 255, 0.08);
border-radius: 22px;
background:
radial-gradient(circle at top right, rgba(99, 102, 241, 0.16), transparent 34%),
radial-gradient(circle at bottom left, rgba(16, 185, 129, 0.14), transparent 30%),
rgba(15, 23, 42, 0.74);
}
.stat-chip {
display: inline-block;
margin: 6px 8px 0 0;
padding: 8px 12px;
border-radius: 999px;
background: rgba(255, 255, 255, 0.06);
font-size: 0.92rem;
}
.footnote {
opacity: 0.78;
font-size: 0.94rem;
}
footer {
visibility: hidden;
}
"""
THEME = gr.themes.Soft(primary_hue="indigo", secondary_hue="emerald")
def load_examples() -> list[list[str]]:
with PROMPTS_FILE.open("r", encoding="utf-8") as f:
prompt_bank = json.load(f)
return [
[prompt_bank["hi_en"][0]["text"], "Hindi Focus", "Hindi Research Voice"],
[prompt_bank["hi_en"][9]["text"], "Hindi Focus", "Hindi Research Voice"],
[prompt_bank["hi_en"][16]["text"], "Hindi Focus", "Hindi Research Voice"],
[prompt_bank["ta_en"][0]["text"], "Tamil Focus", "Tamil Female Research Voice"],
[prompt_bank["ta_en"][9]["text"], "Tamil Focus", "Tamil Female Research Voice"],
[prompt_bank["ta_en"][14]["text"], "Tamil Focus", "Tamil Male Research Voice"],
]
EXAMPLES = load_examples()
def profile_markdown(profile_name: str) -> str:
return f"**{profile_name}** \n{PROFILES[profile_name]['description']}"
def voice_markdown(voice_name: str) -> str:
voice = VOICE_PRESETS[voice_name]
if voice["path"] is None:
return f"**{voice_name}** \n{voice['summary']}"
return (
f"**{voice_name}** \n"
f"{voice['summary']} \n"
f"Reference transcript: `{voice['transcript']}`"
)
def auth_headers() -> dict[str, str]:
headers: dict[str, str] = {}
if BACKEND_TOKEN:
headers["x-api-key"] = BACKEND_TOKEN
return headers
def backend_status() -> str:
if not API_URL:
return "**Backend Not Configured** \nSet `INDICVOX_API_URL` in Space secrets."
try:
response = SESSION.get(f"{API_URL}/health", headers=auth_headers(), timeout=10)
response.raise_for_status()
payload = response.json()
except Exception as exc:
return (
f"**Backend Unreachable** \n"
f"Endpoint: `{API_URL}` \n"
f"Error: `{type(exc).__name__}: {exc}`"
)
return (
f"**VM Backend Ready** \n"
f"Endpoint: `{API_URL}` \n"
f"GPU: `{payload.get('gpu', 'unknown')}` \n"
f"Warm profile: `{payload.get('active_profile', 'unknown')}` \n"
f"Uptime: `{payload.get('uptime_s', 'unknown')}s`"
)
def synthesize(text: str, profile_name: str, voice_name: str, cfg_value: float, inference_steps: int):
clean_text = text.strip()
if not clean_text:
raise gr.Error("Enter a prompt first.")
if not API_URL:
raise gr.Error("`INDICVOX_API_URL` is not configured on the Space.")
response = SESSION.post(
f"{API_URL}/synthesize",
headers=auth_headers(),
json={
"text": clean_text,
"profile_name": profile_name,
"voice_name": voice_name,
"cfg_value": float(cfg_value),
"inference_steps": int(inference_steps),
},
timeout=TIMEOUT_S,
)
if not response.ok:
detail = response.text
try:
detail = response.json().get("detail", detail)
except Exception:
pass
raise gr.Error(f"Backend error {response.status_code}: {detail}")
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
f.write(response.content)
audio_path = f.name
audio_seconds = response.headers.get("X-IndicVox-Audio-Seconds", "n/a")
generation_seconds = response.headers.get("X-IndicVox-Generation-Seconds", "n/a")
rtf = response.headers.get("X-IndicVox-RTF", "n/a")
gpu = response.headers.get("X-IndicVox-GPU", "unknown")
status = (
f"**Ready** \n"
f"Profile: `{profile_name}` \n"
f"Voice: `{voice_name}` \n"
f"GPU backend: `{gpu}` \n"
f"Audio length: `{audio_seconds}s` \n"
f"Generation time: `{generation_seconds}s` \n"
f"RTF: `{rtf}`"
)
return audio_path, status
def voice_preview(voice_name: str):
voice = VOICE_PRESETS[voice_name]
preview_path = str(voice["path"]) if voice["path"] is not None else None
return preview_path, voice_markdown(voice_name)
def clear_prompt() -> str:
return ""
with gr.Blocks() as demo:
with gr.Column(elem_id="app-shell"):
gr.HTML(
"""
<div id="hero">
<h1>IndicVox</h1>
<p>Research demo for multilingual TTS across Hindi, Tamil, and code-switched prompts.</p>
<div>
<span class="stat-chip">HF Space frontend</span>
<span class="stat-chip">VM-hosted H100 backend</span>
<span class="stat-chip">Hindi + Tamil + English prompts</span>
</div>
</div>
"""
)
with gr.Row():
with gr.Column(scale=5):
prompt = gr.Textbox(
label="Prompt",
value=DEFAULT_TEXT,
lines=5,
max_lines=8,
placeholder="Type Hindi, Tamil, or code-switched text here...",
)
with gr.Row():
profile = gr.Dropdown(
choices=list(PROFILES.keys()),
value=DEFAULT_PROFILE,
label="Model Profile",
info="Switch between the Hindi-tuned and Tamil-tuned research profiles.",
)
voice = gr.Dropdown(
choices=list(VOICE_PRESETS.keys()),
value=DEFAULT_VOICE,
label="Voice Preset",
info="Built-in research voices plus a zero-shot option.",
)
with gr.Accordion("Advanced Settings", open=False):
with gr.Row():
cfg_value = gr.Slider(
minimum=1.0,
maximum=4.0,
value=2.0,
step=0.1,
label="CFG",
)
inference_steps = gr.Slider(
minimum=6,
maximum=16,
value=10,
step=1,
label="Diffusion Steps",
)
with gr.Row():
generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")
clear_btn = gr.Button("Clear Prompt")
refresh_btn = gr.Button("Refresh Backend Status")
with gr.Row():
profile_info = gr.Markdown(profile_markdown(DEFAULT_PROFILE))
voice_info = gr.Markdown(voice_markdown(DEFAULT_VOICE))
with gr.Column(scale=4):
backend_info = gr.Markdown(backend_status())
output_audio = gr.Audio(
label="Synthesized Audio",
autoplay=False,
format="wav",
)
generation_info = gr.Markdown("Generate a sample to see timing details.")
voice_preview_audio = gr.Audio(
label="Voice Preset Preview",
value=str(VOICE_PRESETS[DEFAULT_VOICE]["path"]),
interactive=False,
autoplay=False,
format="wav",
)
gr.Markdown(
"Inference runs on the external VM GPU; the Space only provides the paper demo UI.",
elem_classes=["footnote"],
)
with gr.Tabs():
with gr.Tab("Hindi + English Examples"):
gr.Examples(
examples=[row for row in EXAMPLES if row[1] == "Hindi Focus"],
inputs=[prompt, profile, voice],
cache_examples=False,
)
with gr.Tab("Tamil + English Examples"):
gr.Examples(
examples=[row for row in EXAMPLES if row[1] == "Tamil Focus"],
inputs=[prompt, profile, voice],
cache_examples=False,
)
gr.Markdown(
"""
**Demo notes**
- `Hindi Focus` maps to the Hindi-strong checkpoint from the paper experiments.
- `Tamil Focus` maps to the Tamil + code-switch checkpoint and is the default for the demo.
- `Text Only` skips the reference clip and runs zero-shot synthesis.
""",
elem_classes=["footnote"],
)
demo.load(fn=backend_status, outputs=backend_info, api_name=False)
generate_btn.click(
fn=synthesize,
inputs=[prompt, profile, voice, cfg_value, inference_steps],
outputs=[output_audio, generation_info],
api_name="synthesize",
)
prompt.submit(
fn=synthesize,
inputs=[prompt, profile, voice, cfg_value, inference_steps],
outputs=[output_audio, generation_info],
api_name=False,
)
profile.change(fn=profile_markdown, inputs=profile, outputs=profile_info, api_name=False)
voice.change(fn=voice_preview, inputs=voice, outputs=[voice_preview_audio, voice_info], api_name=False)
clear_btn.click(fn=clear_prompt, outputs=prompt, api_name=False)
refresh_btn.click(fn=backend_status, outputs=backend_info, api_name=False)
demo.queue(default_concurrency_limit=2, max_size=32)
if __name__ == "__main__":
demo.launch(theme=THEME, css=CUSTOM_CSS)