Spaces:

himahande45
/

indicvox-hindi-tamil-codeswitching-tts

Running

App Files Files Community

himahande45 committed 11 days ago

Commit

0e9e909

verified ·

1 Parent(s): 402a61f

Switch Space to VM-backed frontend

Browse files

Files changed (3) hide show

README.md +6 -4
frontend_app.py +344 -0
requirements.txt +1 -12

README.md CHANGED Viewed

@@ -5,15 +5,15 @@ colorFrom: indigo
 colorTo: green
 sdk: gradio
 sdk_version: 6.12.0
-app_file: app.py
 pinned: false
 python_version: "3.10.16"
-suggested_hardware: a10g-small
 ---
 # IndicVox
-IndicVox is a GPU-backed research demo for multilingual text-to-speech across Hindi, Tamil, and code-switched prompts. The Space exposes the paper checkpoints through a clean Gradio UI with built-in voice presets and example prompts.
 ## What it includes
@@ -22,6 +22,7 @@ IndicVox is a GPU-backed research demo for multilingual text-to-speech across Hi
 - `Research Baseline` for direct comparison against the untuned multilingual model
 - Built-in research voice presets for fast demo playback
 - Zero-shot `Text Only` mode if you want to skip reference conditioning
 ## Usage
@@ -32,5 +33,6 @@ IndicVox is a GPU-backed research demo for multilingual text-to-speech across Hi
 ## Notes
-- The base multilingual model stays resident on GPU memory and the paper checkpoints are swapped on demand.
 - The Space is meant for inference/demo usage, not batch evaluation.

 colorTo: green
 sdk: gradio
 sdk_version: 6.12.0
+app_file: frontend_app.py
 pinned: false
 python_version: "3.10.16"
+suggested_hardware: cpu-basic
 ---
 # IndicVox
+IndicVox is a research demo for multilingual text-to-speech across Hindi, Tamil, and code-switched prompts. The Space hosts the frontend UI, while inference runs on an external GPU VM backend.
 ## What it includes
 - `Research Baseline` for direct comparison against the untuned multilingual model
 - Built-in research voice presets for fast demo playback
 - Zero-shot `Text Only` mode if you want to skip reference conditioning
+- VM-backed inference on a dedicated GPU server
 ## Usage
 ## Notes
+- The frontend expects `INDICVOX_API_URL` to point at the VM backend.
+- If the backend is token-protected, set `INDICVOX_BACKEND_TOKEN` in Space secrets too.
 - The Space is meant for inference/demo usage, not batch evaluation.

frontend_app.py ADDED Viewed

	@@ -0,0 +1,344 @@

+from __future__ import annotations
+import json
+import os
+import tempfile
+from pathlib import Path
+import gradio as gr
+import requests
+# Filesystem locations, all resolved relative to the directory holding this file.
+APP_DIR = Path(__file__).resolve().parent
+PROMPTS_FILE = APP_DIR / "code_switch_prompts.json"  # JSON prompt bank read by load_examples()
+VOICE_DIR = APP_DIR / "assets" / "voices"  # reference clips used by VOICE_PRESETS
+# Backend endpoint taken from the environment; trailing slashes are stripped so
+# paths can be appended with a single "/". Empty string when the variable is unset.
+API_URL = os.getenv("INDICVOX_API_URL", "").rstrip("/")
+# Optional token; when non-empty, auth_headers() sends it as the "x-api-key" header.
+BACKEND_TOKEN = os.getenv("INDICVOX_BACKEND_TOKEN", "")
+# Initial values for the profile dropdown, voice dropdown, and prompt textbox.
+DEFAULT_PROFILE = "Tamil Focus"
+DEFAULT_VOICE = "Tamil Female Research Voice"
+DEFAULT_TEXT = "இந்த experimentக்கு clean reference audio use பண்ணணும், இல்லனா output quality drop ஆகும்."
+# Timeout, in seconds, passed to the /synthesize POST request.
+TIMEOUT_S = 600
+# One requests session shared by every backend call in this module.
+SESSION = requests.Session()
+# Model profiles offered in the "Model Profile" dropdown. The keys are the
+# profile names sent to the backend as "profile_name"; "description" is the
+# text rendered by profile_markdown().
+PROFILES = {
+    "Tamil Focus": {
+        "description": "Best for Tamil and Tamil-English code-switched prompts.",
+    },
+    "Hindi Focus": {
+        "description": "Best for Hindi and Hindi-English code-switched prompts.",
+    },
+    "Research Baseline": {
+        "description": "Base multilingual checkpoint without paper fine-tuning.",
+    },
+}
+# Voice presets offered in the "Voice Preset" dropdown. Each entry has:
+#   "path"       - reference clip under VOICE_DIR, or None for no reference clip
+#   "transcript" - transcript of the reference clip, or None
+#   "summary"    - short description shown by voice_markdown()
+# The keys are sent to the backend as "voice_name".
+VOICE_PRESETS = {
+    "Hindi Research Voice": {
+        "path": VOICE_DIR / "hin_m_ref_00.wav",
+        "transcript": "लेकिन क्या यह हम सभी कार्यक्रमों के साथ कर सकते?",
+        "summary": "Short Hindi reference used for sharper Hindi + English prompting.",
+    },
+    "Tamil Female Research Voice": {
+        "path": VOICE_DIR / "tam_f_ref_00.wav",
+        "transcript": "விக்கற நேரத்தையும் லாபத்தையும் பொறுத்து, இந்த டேக்ஸை ஷார்ட் டேர்ம் இல்ல லாங் டேர்ம்னு பிரிப்பாங்க.",
+        "summary": "Clear Tamil reference with stable conversational prosody.",
+    },
+    "Tamil Male Research Voice": {
+        "path": VOICE_DIR / "tam_m_ref_00.wav",
+        "transcript": "கொரோனா பாதிப்பு காலத்தில் எண்பது கோடி மக்களுக்கு  உணவு தானியம் வழங்கப்பட்டதாகவும் அவர் தெரிவித்தார்.",
+        "summary": "Tamil male reference that holds rhythm well on longer prompts.",
+    },
+    # Preset without a reference clip: path and transcript are both None.
+    "Text Only": {
+        "path": None,
+        "transcript": None,
+        "summary": "Zero-shot generation without a reference voice clip.",
+    },
+}
+# Stylesheet passed to demo.launch(). It targets the "app-shell" column, the
+# "hero" banner div, the "stat-chip" spans, and the "footnote" Markdown
+# blocks created in the layout, and hides the page footer element.
+CUSTOM_CSS = """
+#app-shell {
+    max-width: 1180px;
+    margin: 0 auto;
+}
+#hero {
+    padding: 24px 26px 12px 26px;
+    border: 1px solid rgba(255, 255, 255, 0.08);
+    border-radius: 22px;
+    background:
+        radial-gradient(circle at top right, rgba(99, 102, 241, 0.16), transparent 34%),
+        radial-gradient(circle at bottom left, rgba(16, 185, 129, 0.14), transparent 30%),
+        rgba(15, 23, 42, 0.74);
+}
+.stat-chip {
+    display: inline-block;
+    margin: 6px 8px 0 0;
+    padding: 8px 12px;
+    border-radius: 999px;
+    background: rgba(255, 255, 255, 0.06);
+    font-size: 0.92rem;
+}
+.footnote {
+    opacity: 0.78;
+    font-size: 0.94rem;
+}
+footer {
+    visibility: hidden;
+}
+"""
+# Gradio theme object handed to demo.launch() in the __main__ guard.
+THEME = gr.themes.Soft(primary_hue="indigo", secondary_hue="emerald")
+def load_examples() -> list[list[str]]:
+    with PROMPTS_FILE.open("r", encoding="utf-8") as f:
+        prompt_bank = json.load(f)
+    return [
+        [prompt_bank["hi_en"][0]["text"], "Hindi Focus", "Hindi Research Voice"],
+        [prompt_bank["hi_en"][9]["text"], "Hindi Focus", "Hindi Research Voice"],
+        [prompt_bank["hi_en"][16]["text"], "Hindi Focus", "Hindi Research Voice"],
+        [prompt_bank["ta_en"][0]["text"], "Tamil Focus", "Tamil Female Research Voice"],
+        [prompt_bank["ta_en"][9]["text"], "Tamil Focus", "Tamil Female Research Voice"],
+        [prompt_bank["ta_en"][14]["text"], "Tamil Focus", "Tamil Male Research Voice"],
+    ]
+# Example rows are loaded once at import time; the example tabs in the layout
+# filter this list by profile name.
+EXAMPLES = load_examples()
+def profile_markdown(profile_name: str) -> str:
+    return f"**{profile_name}**  \n{PROFILES[profile_name]['description']}"
+def voice_markdown(voice_name: str) -> str:
+    voice = VOICE_PRESETS[voice_name]
+    if voice["path"] is None:
+        return f"**{voice_name}**  \n{voice['summary']}"
+    return (
+        f"**{voice_name}**  \n"
+        f"{voice['summary']}  \n"
+        f"Reference transcript: `{voice['transcript']}`"
+    )
+def auth_headers() -> dict[str, str]:
+    headers: dict[str, str] = {}
+    if BACKEND_TOKEN:
+        headers["x-api-key"] = BACKEND_TOKEN
+    return headers
+def backend_status() -> str:
+    if not API_URL:
+        return "**Backend Not Configured**  \nSet `INDICVOX_API_URL` in Space secrets."
+    try:
+        response = SESSION.get(f"{API_URL}/health", headers=auth_headers(), timeout=10)
+        response.raise_for_status()
+        payload = response.json()
+    except Exception as exc:
+        return (
+            f"**Backend Unreachable**  \n"
+            f"Endpoint: `{API_URL}`  \n"
+            f"Error: `{type(exc).__name__}: {exc}`"
+        )
+    return (
+        f"**VM Backend Ready**  \n"
+        f"Endpoint: `{API_URL}`  \n"
+        f"GPU: `{payload.get('gpu', 'unknown')}`  \n"
+        f"Warm profile: `{payload.get('active_profile', 'unknown')}`  \n"
+        f"Uptime: `{payload.get('uptime_s', 'unknown')}s`"
+    )
+def synthesize(text: str, profile_name: str, voice_name: str, cfg_value: float, inference_steps: int):
+    clean_text = text.strip()
+    if not clean_text:
+        raise gr.Error("Enter a prompt first.")
+    if not API_URL:
+        raise gr.Error("`INDICVOX_API_URL` is not configured on the Space.")
+    response = SESSION.post(
+        f"{API_URL}/synthesize",
+        headers=auth_headers(),
+        json={
+            "text": clean_text,
+            "profile_name": profile_name,
+            "voice_name": voice_name,
+            "cfg_value": float(cfg_value),
+            "inference_steps": int(inference_steps),
+        },
+        timeout=TIMEOUT_S,
+    )
+    if not response.ok:
+        detail = response.text
+        try:
+            detail = response.json().get("detail", detail)
+        except Exception:
+            pass
+        raise gr.Error(f"Backend error {response.status_code}: {detail}")
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
+        f.write(response.content)
+        audio_path = f.name
+    audio_seconds = response.headers.get("X-IndicVox-Audio-Seconds", "n/a")
+    generation_seconds = response.headers.get("X-IndicVox-Generation-Seconds", "n/a")
+    rtf = response.headers.get("X-IndicVox-RTF", "n/a")
+    gpu = response.headers.get("X-IndicVox-GPU", "unknown")
+    status = (
+        f"**Ready**  \n"
+        f"Profile: `{profile_name}`  \n"
+        f"Voice: `{voice_name}`  \n"
+        f"GPU backend: `{gpu}`  \n"
+        f"Audio length: `{audio_seconds}s`  \n"
+        f"Generation time: `{generation_seconds}s`  \n"
+        f"RTF: `{rtf}`"
+    )
+    return audio_path, status
+def voice_preview(voice_name: str):
+    voice = VOICE_PRESETS[voice_name]
+    preview_path = str(voice["path"]) if voice["path"] is not None else None
+    return preview_path, voice_markdown(voice_name)
+def clear_prompt() -> str:
+    return ""
+# Build the UI. Components are created in layout order: hero banner, a row
+# with the input column and the output column, the example tabs, and the
+# demo notes. Event handlers are wired at the end of the block.
+with gr.Blocks() as demo:
+    with gr.Column(elem_id="app-shell"):
+        # Static banner; the ids/classes used here are styled by CUSTOM_CSS.
+        gr.HTML(
+            """
+            <div id="hero">
+              <h1>IndicVox</h1>
+              <p>Research demo for multilingual TTS across Hindi, Tamil, and code-switched prompts.</p>
+              <div>
+                <span class="stat-chip">HF Space frontend</span>
+                <span class="stat-chip">VM-hosted H100 backend</span>
+                <span class="stat-chip">Hindi + Tamil + English prompts</span>
+              </div>
+            </div>
+            """
+        )
+        with gr.Row():
+            # Left column: prompt, profile/voice selection, settings, buttons.
+            with gr.Column(scale=5):
+                prompt = gr.Textbox(
+                    label="Prompt",
+                    value=DEFAULT_TEXT,
+                    lines=5,
+                    max_lines=8,
+                    placeholder="Type Hindi, Tamil, or code-switched text here...",
+                )
+                with gr.Row():
+                    profile = gr.Dropdown(
+                        choices=list(PROFILES.keys()),
+                        value=DEFAULT_PROFILE,
+                        label="Model Profile",
+                        info="Switch between the Hindi-tuned and Tamil-tuned research profiles.",
+                    )
+                    voice = gr.Dropdown(
+                        choices=list(VOICE_PRESETS.keys()),
+                        value=DEFAULT_VOICE,
+                        label="Voice Preset",
+                        info="Built-in research voices plus a zero-shot option.",
+                    )
+                # These two sliders feed synthesize() as cfg_value and
+                # inference_steps.
+                with gr.Accordion("Advanced Settings", open=False):
+                    with gr.Row():
+                        cfg_value = gr.Slider(
+                            minimum=1.0,
+                            maximum=4.0,
+                            value=2.0,
+                            step=0.1,
+                            label="CFG",
+                        )
+                        inference_steps = gr.Slider(
+                            minimum=6,
+                            maximum=16,
+                            value=10,
+                            step=1,
+                            label="Diffusion Steps",
+                        )
+                with gr.Row():
+                    generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")
+                    clear_btn = gr.Button("Clear Prompt")
+                    refresh_btn = gr.Button("Refresh Backend Status")
+                with gr.Row():
+                    profile_info = gr.Markdown(profile_markdown(DEFAULT_PROFILE))
+                    voice_info = gr.Markdown(voice_markdown(DEFAULT_VOICE))
+            # Right column: backend status, synthesized audio, voice preview.
+            with gr.Column(scale=4):
+                # backend_status() is called here while the layout is built to
+                # provide the initial text; demo.load below calls it again.
+                backend_info = gr.Markdown(backend_status())
+                output_audio = gr.Audio(
+                    label="Synthesized Audio",
+                    autoplay=False,
+                    format="wav",
+                )
+                generation_info = gr.Markdown("Generate a sample to see timing details.")
+                # The initial value is str() of the default preset's path, so
+                # DEFAULT_VOICE needs to name a preset whose path is not None.
+                voice_preview_audio = gr.Audio(
+                    label="Voice Preset Preview",
+                    value=str(VOICE_PRESETS[DEFAULT_VOICE]["path"]),
+                    interactive=False,
+                    autoplay=False,
+                    format="wav",
+                )
+                gr.Markdown(
+                    "Inference runs on the external VM GPU; the Space only provides the paper demo UI.",
+                    elem_classes=["footnote"],
+                )
+        # Example tabs: EXAMPLES rows are split by their profile column and
+        # fill the prompt, profile and voice inputs when selected.
+        with gr.Tabs():
+            with gr.Tab("Hindi + English Examples"):
+                gr.Examples(
+                    examples=[row for row in EXAMPLES if row[1] == "Hindi Focus"],
+                    inputs=[prompt, profile, voice],
+                    cache_examples=False,
+                )
+            with gr.Tab("Tamil + English Examples"):
+                gr.Examples(
+                    examples=[row for row in EXAMPLES if row[1] == "Tamil Focus"],
+                    inputs=[prompt, profile, voice],
+                    cache_examples=False,
+                )
+        gr.Markdown(
+            """
+            **Demo notes**
+            - `Hindi Focus` maps to the Hindi-strong checkpoint from the paper experiments.
+            - `Tamil Focus` maps to the Tamil + code-switch checkpoint and is the default for the demo.
+            - `Text Only` skips the reference clip and runs zero-shot synthesis.
+            """,
+            elem_classes=["footnote"],
+        )
+    # Event wiring. Only the generate button's handler is given an API name
+    # ("synthesize"); every other handler passes api_name=False.
+    demo.load(fn=backend_status, outputs=backend_info, api_name=False)
+    generate_btn.click(
+        fn=synthesize,
+        inputs=[prompt, profile, voice, cfg_value, inference_steps],
+        outputs=[output_audio, generation_info],
+        api_name="synthesize",
+    )
+    # Submitting the prompt textbox runs the same handler as the button.
+    prompt.submit(
+        fn=synthesize,
+        inputs=[prompt, profile, voice, cfg_value, inference_steps],
+        outputs=[output_audio, generation_info],
+        api_name=False,
+    )
+    # Keep the info panels and preview clip in sync with the dropdowns.
+    profile.change(fn=profile_markdown, inputs=profile, outputs=profile_info, api_name=False)
+    voice.change(fn=voice_preview, inputs=voice, outputs=[voice_preview_audio, voice_info], api_name=False)
+    clear_btn.click(fn=clear_prompt, outputs=prompt, api_name=False)
+    refresh_btn.click(fn=backend_status, outputs=backend_info, api_name=False)
+# Configure Gradio's request queue at import time (before any launch):
+# a default per-event concurrency limit of 2 and at most 32 queued requests.
+demo.queue(default_concurrency_limit=2, max_size=32)
+# Launch only when executed as a script; the theme and stylesheet defined
+# above are supplied here rather than to gr.Blocks().
+if __name__ == "__main__":
+    demo.launch(theme=THEME, css=CUSTOM_CSS)

requirements.txt CHANGED Viewed

@@ -1,13 +1,2 @@
 gradio>=6,<7
-huggingface_hub>=1.0
-numpy<3
-torch>=2.5.0
-torchaudio>=2.5.0
-transformers>=4.36.2
-einops>=0.8.0
-inflect>=7.0.0
-wetext
-librosa>=0.10.2
-soundfile>=0.12.1
-pydantic>=2
-safetensors>=0.4.5


1	gradio>=6,<7
2	+ requests>=2.31.0