Spaces:

himahande45
/

indicvox-hindi-tamil-codeswitching-tts

Running

App Files Files Community

himahande45 committed 11 days ago

Commit

dffe2a3

verified ·

1 Parent(s): 0e9e909

Make app.py forward to VM frontend

Browse files

Files changed (1) hide show

app.py +1 -448

app.py CHANGED Viewed

@@ -1,451 +1,4 @@
-from __future__ import annotations
-import json
-import os
-import sys
-import threading
-import time
-import traceback
-from pathlib import Path
-import gradio as gr
-import numpy as np
-import torch
-from huggingface_hub import snapshot_download
-APP_DIR = Path(__file__).resolve().parent
-def resolve_persist_root() -> Path:
-    data_root = Path("/data")
-    if data_root.exists() and os.access(data_root, os.W_OK):
-        return data_root
-    local_root = APP_DIR / ".cache"
-    local_root.mkdir(parents=True, exist_ok=True)
-    return local_root
-PERSIST_ROOT = resolve_persist_root()
-HF_HOME = PERSIST_ROOT / "huggingface"
-HF_HOME.mkdir(parents=True, exist_ok=True)
-os.environ.setdefault("HF_HOME", str(HF_HOME))
-os.environ.setdefault("HF_HUB_CACHE", str(HF_HOME / "hub"))
-sys.path.insert(0, str(APP_DIR))
-from voxcpm import VoxCPM
-from voxcpm.model.voxcpm import LoRAConfig
-SPACE_TITLE = "IndicVox: Hindi & Tamil Code-Switching TTS"
-MODEL_REPO_ID = "himahande45/multilingual-tts"
-PROMPTS_FILE = APP_DIR / "code_switch_prompts.json"
-VOICE_DIR = APP_DIR / "assets" / "voices"
-DEFAULT_PROFILE = "Tamil Focus"
-DEFAULT_VOICE = "Tamil Female Research Voice"
-DEFAULT_TEXT = "இந்த experimentக்கு clean reference audio use பண்ணணும், இல்லனா output quality drop ஆகும்."
-MODEL_PATTERNS = [
-    "VoxCPM2_local/*",
-    "finetune_checkpoints/step_0000500/lora_config.json",
-    "finetune_checkpoints/step_0000500/lora_weights.safetensors",
-    "finetune_checkpoints/step_0001000/lora_config.json",
-    "finetune_checkpoints/step_0001000/lora_weights.safetensors",
-]
-PROFILES = {
-    "Tamil Focus": {
-        "description": "Best for Tamil and Tamil-English code-switched prompts.",
-        "checkpoint_dir": "finetune_checkpoints/step_0001000",
-    },
-    "Hindi Focus": {
-        "description": "Best for Hindi and Hindi-English code-switched prompts.",
-        "checkpoint_dir": "finetune_checkpoints/step_0000500",
-    },
-    "Research Baseline": {
-        "description": "Base multilingual checkpoint without paper fine-tuning.",
-        "checkpoint_dir": None,
-    },
-}
-VOICE_PRESETS = {
-    "Hindi Research Voice": {
-        "path": VOICE_DIR / "hin_m_ref_00.wav",
-        "transcript": "लेकिन क्या यह हम सभी कार्यक्रमों के साथ कर सकते?",
-        "summary": "Short Hindi reference used for sharper Hindi + English prompting.",
-    },
-    "Tamil Female Research Voice": {
-        "path": VOICE_DIR / "tam_f_ref_00.wav",
-        "transcript": "விக்கற நேரத்தையும் லாபத்தையும் பொறுத்து, இந்த டேக்ஸை ஷார்ட் டேர்ம் இல்ல லாங் டேர்ம்னு பிரிப்பாங்க.",
-        "summary": "Clear Tamil reference with stable conversational prosody.",
-    },
-    "Tamil Male Research Voice": {
-        "path": VOICE_DIR / "tam_m_ref_00.wav",
-        "transcript": "கொரோனா பாதிப்பு காலத்தில் எண்பது கோடி மக்களுக்கு  உணவு தானியம் வழங்கப்பட்டதாகவும் அவர் தெரிவித்தார்.",
-        "summary": "Tamil male reference that holds rhythm well on longer prompts.",
-    },
-    "Text Only": {
-        "path": None,
-        "transcript": None,
-        "summary": "Zero-shot generation without a reference voice clip.",
-    },
-}
-CUSTOM_CSS = """
-#app-shell {
-    max-width: 1180px;
-    margin: 0 auto;
-}
-#hero {
-    padding: 24px 26px 12px 26px;
-    border: 1px solid rgba(255, 255, 255, 0.08);
-    border-radius: 22px;
-    background:
-        radial-gradient(circle at top right, rgba(99, 102, 241, 0.16), transparent 34%),
-        radial-gradient(circle at bottom left, rgba(16, 185, 129, 0.14), transparent 30%),
-        rgba(15, 23, 42, 0.74);
-}
-.stat-chip {
-    display: inline-block;
-    margin: 6px 8px 0 0;
-    padding: 8px 12px;
-    border-radius: 999px;
-    background: rgba(255, 255, 255, 0.06);
-    font-size: 0.92rem;
-}
-.footnote {
-    opacity: 0.78;
-    font-size: 0.94rem;
-}
-footer {
-    visibility: hidden;
-}
-"""
-if torch.cuda.is_available():
-    torch.backends.cuda.matmul.allow_tf32 = True
-    torch.backends.cudnn.allow_tf32 = True
-    torch.set_float32_matmul_precision("high")
-THEME = gr.themes.Soft(primary_hue="indigo", secondary_hue="emerald")
-def load_examples() -> list[list[str]]:
-    with PROMPTS_FILE.open("r", encoding="utf-8") as f:
-        prompt_bank = json.load(f)
-    return [
-        [prompt_bank["hi_en"][0]["text"], "Hindi Focus", "Hindi Research Voice"],
-        [prompt_bank["hi_en"][9]["text"], "Hindi Focus", "Hindi Research Voice"],
-        [prompt_bank["hi_en"][16]["text"], "Hindi Focus", "Hindi Research Voice"],
-        [prompt_bank["ta_en"][0]["text"], "Tamil Focus", "Tamil Female Research Voice"],
-        [prompt_bank["ta_en"][9]["text"], "Tamil Focus", "Tamil Female Research Voice"],
-        [prompt_bank["ta_en"][14]["text"], "Tamil Focus", "Tamil Male Research Voice"],
-    ]
-def profile_markdown(profile_name: str) -> str:
-    description = PROFILES[profile_name]["description"]
-    return f"**{profile_name}**  \n{description}"
-def voice_markdown(voice_name: str) -> str:
-    voice = VOICE_PRESETS[voice_name]
-    if voice["path"] is None:
-        return f"**{voice_name}**  \n{voice['summary']}"
-    transcript = voice["transcript"]
-    return f"**{voice_name}**  \n{voice['summary']}  \nReference transcript: `{transcript}`"
-def dynamic_max_len(text: str) -> int:
-    char_count = max(len(text.strip()), 1)
-    return max(280, min(900, int(char_count * 7.5)))
-class ModelManager:
-    def __init__(self) -> None:
-        self.lock = threading.Lock()
-        self.repo_dir = self._resolve_repo_dir()
-        self.base_dir = self.repo_dir / "VoxCPM2_local"
-        self.loaded_profile: str | None = None
-        self.active_profile: str | None = None
-        self.model = self._load_model()
-        self.activate_profile(DEFAULT_PROFILE)
-    def _resolve_repo_dir(self) -> Path:
-        local_repo = os.getenv("INDICVOX_LOCAL_MODEL_REPO")
-        if local_repo:
-            path = Path(local_repo).expanduser().resolve()
-            if path.exists():
-                return path
-            raise FileNotFoundError(f"INDICVOX_LOCAL_MODEL_REPO does not exist: {path}")
-        token = os.getenv("HF_TOKEN")
-        snapshot_path = snapshot_download(
-            repo_id=MODEL_REPO_ID,
-            repo_type="model",
-            allow_patterns=MODEL_PATTERNS,
-            token=token,
-        )
-        return Path(snapshot_path)
-    def _load_lora_config(self, checkpoint_dir: Path) -> LoRAConfig:
-        payload = json.loads((checkpoint_dir / "lora_config.json").read_text(encoding="utf-8"))
-        return LoRAConfig(**payload["lora_config"])
-    def _load_model(self) -> VoxCPM:
-        if not torch.cuda.is_available():
-            raise RuntimeError("A GPU runtime is required. Request an A10G/L4 Space and restart.")
-        checkpoint_dir = self.repo_dir / PROFILES[DEFAULT_PROFILE]["checkpoint_dir"]
-        lora_config = self._load_lora_config(checkpoint_dir)
-        model = VoxCPM.from_pretrained(
-            hf_model_id=str(self.base_dir),
-            load_denoiser=False,
-            optimize=False,
-            lora_config=lora_config,
-        )
-        return model
-    def activate_profile(self, profile_name: str) -> None:
-        spec = PROFILES[profile_name]
-        checkpoint_dir = spec["checkpoint_dir"]
-        if checkpoint_dir is None:
-            self.model.set_lora_enabled(False)
-            self.active_profile = profile_name
-            return
-        if self.loaded_profile != profile_name:
-            if self.loaded_profile is not None:
-                self.model.unload_lora()
-            self.model.load_lora(str(self.repo_dir / checkpoint_dir))
-            self.loaded_profile = profile_name
-        self.model.set_lora_enabled(True)
-        self.active_profile = profile_name
-    def synthesize(
-        self,
-        text: str,
-        profile_name: str,
-        voice_name: str,
-        cfg_value: float,
-        inference_steps: int,
-    ) -> tuple[tuple[int, np.ndarray], str]:
-        clean_text = text.strip()
-        if not clean_text:
-            raise gr.Error("Enter a prompt first.")
-        start = time.perf_counter()
-        with self.lock:
-            self.activate_profile(profile_name)
-            kwargs = {
-                "text": clean_text,
-                "cfg_value": float(cfg_value),
-                "inference_timesteps": int(inference_steps),
-                "max_len": dynamic_max_len(clean_text),
-            }
-            voice = VOICE_PRESETS[voice_name]
-            if voice["path"] is not None:
-                kwargs["prompt_wav_path"] = str(voice["path"])
-                kwargs["prompt_text"] = voice["transcript"]
-            wav = self.model.generate(**kwargs)
-            sample_rate = int(self.model.tts_model.sample_rate)
-        if isinstance(wav, torch.Tensor):
-            wav = wav.detach().cpu().numpy()
-        wav = np.asarray(wav, dtype=np.float32).squeeze()
-        wav = np.clip(wav, -1.0, 1.0)
-        elapsed = time.perf_counter() - start
-        duration = float(wav.shape[-1]) / sample_rate if wav.size else 0.0
-        rtf = elapsed / duration if duration > 0 else float("nan")
-        speed_line = f"RTF {rtf:.2f}x" if np.isfinite(rtf) else "RTF n/a"
-        status = (
-            f"**Ready**  \n"
-            f"Profile: `{profile_name}`  \n"
-            f"Voice: `{voice_name}`  \n"
-            f"Audio length: `{duration:.2f}s`  \n"
-            f"Generation time: `{elapsed:.2f}s` ({speed_line})"
-        )
-        return (sample_rate, wav), status
-    def boot_markdown(self) -> str:
-        gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU"
-        active_profile = self.active_profile or DEFAULT_PROFILE
-        return (
-            f"**GPU Ready**  \n"
-            f"Runtime: `{gpu_name}`  \n"
-            f"Warm profile: `{active_profile}`  \n"
-            f"Model source: `{MODEL_REPO_ID}`"
-        )
-BOOT_ERROR: str | None = None
-MODEL_MANAGER: ModelManager | None = None
-try:
-    MODEL_MANAGER = ModelManager()
-except Exception:
-    BOOT_ERROR = traceback.format_exc()
-EXAMPLES = load_examples()
-def synthesize(text: str, profile_name: str, voice_name: str, cfg_value: float, inference_steps: int):
-    if MODEL_MANAGER is None:
-        raise gr.Error(f"Model initialization failed.\n\n{BOOT_ERROR}")
-    return MODEL_MANAGER.synthesize(text, profile_name, voice_name, cfg_value, inference_steps)
-def voice_preview(voice_name: str):
-    voice = VOICE_PRESETS[voice_name]
-    preview_path = str(voice["path"]) if voice["path"] is not None else None
-    return preview_path, voice_markdown(voice_name)
-def clear_prompt() -> str:
-    return ""
-def boot_status() -> str:
-    if MODEL_MANAGER is not None:
-        return MODEL_MANAGER.boot_markdown()
-    return f"**Startup Error**  \n```text\n{BOOT_ERROR}\n```"
-with gr.Blocks() as demo:
-    with gr.Column(elem_id="app-shell"):
-        gr.HTML(
-            """
-            <div id="hero">
-              <h1>IndicVox</h1>
-              <p>Research demo for multilingual TTS across Hindi, Tamil, and code-switched prompts.</p>
-              <div>
-                <span class="stat-chip">GPU-backed Space</span>
-                <span class="stat-chip">Warm-loaded model</span>
-                <span class="stat-chip">Hindi + Tamil + English prompts</span>
-              </div>
-            </div>
-            """
-        )
-        with gr.Row():
-            with gr.Column(scale=5):
-                prompt = gr.Textbox(
-                    label="Prompt",
-                    value=DEFAULT_TEXT,
-                    lines=5,
-                    max_lines=8,
-                    placeholder="Type Hindi, Tamil, or code-switched text here...",
-                )
-                with gr.Row():
-                    profile = gr.Dropdown(
-                        choices=list(PROFILES.keys()),
-                        value=DEFAULT_PROFILE,
-                        label="Model Profile",
-                        info="Switch between the Hindi-tuned and Tamil-tuned research profiles.",
-                    )
-                    voice = gr.Dropdown(
-                        choices=list(VOICE_PRESETS.keys()),
-                        value=DEFAULT_VOICE,
-                        label="Voice Preset",
-                        info="Built-in research voices plus a zero-shot option.",
-                    )
-                with gr.Accordion("Advanced Settings", open=False):
-                    with gr.Row():
-                        cfg_value = gr.Slider(
-                            minimum=1.0,
-                            maximum=4.0,
-                            value=2.0,
-                            step=0.1,
-                            label="CFG",
-                            info="Higher values usually sound more guided but less relaxed.",
-                        )
-                        inference_steps = gr.Slider(
-                            minimum=6,
-                            maximum=16,
-                            value=10,
-                            step=1,
-                            label="Diffusion Steps",
-                            info="10 is the paper demo default.",
-                        )
-                with gr.Row():
-                    generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")
-                    clear_btn = gr.Button("Clear Prompt")
-                with gr.Row():
-                    profile_info = gr.Markdown(profile_markdown(DEFAULT_PROFILE))
-                    voice_info = gr.Markdown(voice_markdown(DEFAULT_VOICE))
-            with gr.Column(scale=4):
-                status = gr.Markdown(boot_status())
-                output_audio = gr.Audio(
-                    label="Synthesized Audio",
-                    autoplay=False,
-                    format="wav",
-                )
-                voice_preview_audio = gr.Audio(
-                    label="Voice Preset Preview",
-                    value=str(VOICE_PRESETS[DEFAULT_VOICE]["path"]),
-                    interactive=False,
-                    autoplay=False,
-                    format="wav",
-                )
-                gr.Markdown(
-                    "The demo keeps the base model resident on GPU and swaps paper checkpoints on demand.",
-                    elem_classes=["footnote"],
-                )
-        with gr.Tabs():
-            with gr.Tab("Hindi + English Examples"):
-                gr.Examples(
-                    examples=[row for row in EXAMPLES if row[1] == "Hindi Focus"],
-                    inputs=[prompt, profile, voice],
-                    cache_examples=False,
-                )
-            with gr.Tab("Tamil + English Examples"):
-                gr.Examples(
-                    examples=[row for row in EXAMPLES if row[1] == "Tamil Focus"],
-                    inputs=[prompt, profile, voice],
-                    cache_examples=False,
-                )
-        gr.Markdown(
-            """
-            **Demo notes**
-            - `Hindi Focus` maps to the Hindi-strong checkpoint from the paper experiments.
-            - `Tamil Focus` maps to the Tamil + code-switch checkpoint and is the default for the Space.
-            - `Text Only` skips the reference clip and runs zero-shot synthesis.
-            """,
-            elem_classes=["footnote"],
-        )
-    generate_btn.click(
-        fn=synthesize,
-        inputs=[prompt, profile, voice, cfg_value, inference_steps],
-        outputs=[output_audio, status],
-        api_name="synthesize",
-    )
-    prompt.submit(
-        fn=synthesize,
-        inputs=[prompt, profile, voice, cfg_value, inference_steps],
-        outputs=[output_audio, status],
-        api_name=False,
-    )
-    profile.change(fn=profile_markdown, inputs=profile, outputs=profile_info, api_name=False)
-    voice.change(fn=voice_preview, inputs=voice, outputs=[voice_preview_audio, voice_info], api_name=False)
-    clear_btn.click(fn=clear_prompt, outputs=prompt, api_name=False)
-demo.queue(default_concurrency_limit=1, max_size=16)
 if __name__ == "__main__":
     demo.launch(theme=THEME, css=CUSTOM_CSS)

+from frontend_app import CUSTOM_CSS, THEME, demo
 if __name__ == "__main__":
     demo.launch(theme=THEME, css=CUSTOM_CSS)