add the second accent feature
app.py CHANGED
@@ -8,6 +8,7 @@ lifetime of the Space instance.
 """
 
 import os
+import json
 import tempfile
 
 import gradio as gr
@@ -15,6 +16,7 @@ import torch
 from huggingface_hub import snapshot_download
 
 from accent_task_vectors.inference import load_xtts_model, attach_lora_adapter
+from accent_task_vectors.inference.inference import _scale_lora
 
 # ---------------------------------------------------------------------------
 # Model registry (mirrors download_checkpoints.py)
@@ -54,10 +56,9 @@ ACCENTS_BY_LANGUAGE = {
 # Paths
 # ---------------------------------------------------------------------------
 
-CACHE_DIR
+CACHE_DIR = os.environ.get("MODEL_CACHE_DIR", "model_cache")
 PRETRAINED_DIR = os.path.join(CACHE_DIR, "pretrained")
 
-# Keys in config.json that hold pretrained model paths
 _PRETRAINED_PATH_FIELDS = {
     "mel_norm_file": "mel_stats.pth",
     "dvae_checkpoint": "dvae.pth",
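The cache root is now overridable via the `MODEL_CACHE_DIR` environment variable. On a Space with persistent storage, pointing it at the `/data` mount keeps downloaded checkpoints across restarts; a minimal sketch of that setup (the path choice is an assumption, not part of this commit):

```python
# Hypothetical: route the cache to the Space's persistent /data mount so
# restarts do not re-download the base model and adapters.
import os

os.environ.setdefault("MODEL_CACHE_DIR", "/data/model_cache")
```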
@@ -66,17 +67,17 @@ _PRETRAINED_PATH_FIELDS = {
 }
 
 # ---------------------------------------------------------------------------
-# In-memory model cache
+# In-memory model cache
+# _model_cache: (language, accent1, accent2|None) -> tts
+# _current_coeffs: same key -> (coeff1, coeff2)
 # ---------------------------------------------------------------------------
 
-_model_cache:
+_model_cache: dict = {}
+_current_coeffs: dict = {}
 _device = "cuda" if torch.cuda.is_available() else "cpu"
 
 
 def _patch_config(config_path: str, pretrained_dir: str) -> None:
-    """Rewrite pretrained model paths in config.json to point to local dir."""
-    import json
-
     with open(config_path) as f:
         config = json.load(f)
 
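Note that `_model_cache` now keeps one full XTTS model per `(language, accent1, accent2)` key, so memory grows with every combination tried. If that becomes an issue, a bounded LRU cache is one option; an illustrative sketch (`MAX_CACHED` and `_cache_put` are hypothetical names, not from the repo):

```python
# Illustrative LRU-bounded variant of the model cache; not part of the commit.
from collections import OrderedDict

MAX_CACHED = 2  # hypothetical cap on resident models
_model_cache: OrderedDict = OrderedDict()

def _cache_put(key: tuple, model: object) -> None:
    _model_cache[key] = model
    _model_cache.move_to_end(key)         # mark as most recently used
    while len(_model_cache) > MAX_CACHED:
        _model_cache.popitem(last=False)  # evict the least recently used
```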
@@ -103,7 +104,6 @@ def _patch_config(config_path: str, pretrained_dir: str) -> None:
 
 
 def _ensure_pretrained() -> None:
-    """Download the base pretrained XTTS model if not already cached."""
     if not os.path.isdir(PRETRAINED_DIR):
         print(f"Downloading pretrained model from {PRETRAINED_REPO} …")
         snapshot_download(
@@ -113,18 +113,11 @@ def _ensure_pretrained() -> None:
     )
 
 
-def _load_model(language: str, accent: str) -> object:
-    """
-    key = (language, accent)
-    if key in _model_cache:
-        return _model_cache[key]
-
-    _ensure_pretrained()
-
-    repo_id = MODELS[key]
+def _download_lora(language: str, accent: str) -> str:
+    """Download a LoRA adapter if needed; return its local directory."""
     lora_dir = os.path.join(CACHE_DIR, f"{accent.lower()}-accent-{language.lower()}")
-
     if not os.path.isdir(lora_dir):
+        repo_id = MODELS[(language, accent)]
         print(f"Downloading LoRA adapter from {repo_id} …")
         snapshot_download(
             repo_id=repo_id,
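The refactored `_download_lora` relies on `snapshot_download`'s `allow_patterns` to fetch only the adapter config and weights rather than the whole repo. A standalone usage example (the repo id below is illustrative):

```python
# Selective download, as in _download_lora above; the repo id is hypothetical.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="someuser/hindi-accent-english",
    local_dir="model_cache/hindi-accent-english",
    allow_patterns=["config.json", "lora/best_model/**"],  # everything else is skipped
)
```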
@@ -133,15 +126,33 @@ def _load_model(language: str, accent: str) -> object:
             allow_patterns=["config.json", "lora/best_model/**"],
         )
         _patch_config(os.path.join(lora_dir, "config.json"), PRETRAINED_DIR)
+    return lora_dir
+
+
+def _load_model(language: str, accent1: str, accent2: str | None):
+    """Return a cached TTS model with adapter(s) loaded at coeff=1.0."""
+    key = (language, accent1, accent2)
+    if key in _model_cache:
+        return _model_cache[key]
 
+    _ensure_pretrained()
+
+    lora_dir1 = _download_lora(language, accent1)
     checkpoint_path = os.path.join(PRETRAINED_DIR, "checkpoint_0.pth")
-    config_path = os.path.join(
-
+    config_path = os.path.join(lora_dir1, "config.json")
+    lora_path1 = os.path.join(lora_dir1, "lora", "best_model")
 
     tts = load_xtts_model(checkpoint_path, config_path, device=_device)
-    tts = attach_lora_adapter(tts, lora_path=
+    tts = attach_lora_adapter(tts, lora_path=lora_path1, adapter_name="default", scaling_coef=1.0)
+
+    if accent2 is not None:
+        lora_dir2 = _download_lora(language, accent2)
+        lora_path2 = os.path.join(lora_dir2, "lora", "best_model")
+        tts = attach_lora_adapter(tts, lora_path=lora_path2, adapter_name="other", scaling_coef=1.0)
+        tts.synthesizer.tts_model.set_adapter(["default", "other"])
 
-    _model_cache[key]
+    _model_cache[key] = tts
+    _current_coeffs[key] = (1.0, 1.0)
     return tts
 
 
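When both adapters are active, their low-rank updates compose additively on each adapted weight; the second accent adds its own delta rather than replacing the first. A self-contained illustration of the arithmetic with plain tensors (names are illustrative; the real composition happens inside `attach_lora_adapter` and the adapter framework):

```python
# How two active LoRA adapters combine on one frozen weight W:
#   y = W x + c1 * B1 A1 x + c2 * B2 A2 x
import torch

d, r = 8, 2
W = torch.randn(d, d)                          # frozen base weight
A1, B1 = torch.randn(r, d), torch.randn(d, r)  # adapter "default"
A2, B2 = torch.randn(r, d), torch.randn(d, r)  # adapter "other"
c1, c2 = 1.0, 0.5                              # per-adapter scaling coefficients

x = torch.randn(d)
y = W @ x + c1 * (B1 @ (A1 @ x)) + c2 * (B2 @ (A2 @ x))
```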
@@ -149,22 +160,37 @@ def _load_model(language: str, accent: str) -> object:
 # Inference function called by Gradio
 # ---------------------------------------------------------------------------
 
-def synthesise(
+def synthesise(
+    text: str,
+    speaker_audio: str,
+    language: str,
+    accent1: str,
+    coeff1: float,
+    enable_second: bool,
+    accent2: str,
+    coeff2: float,
+):
     if not text.strip():
         raise gr.Error("Please enter some text to synthesise.")
     if speaker_audio is None:
         raise gr.Error("Please upload a reference speaker audio file.")
-    if (language,
-        raise gr.Error(f"Unsupported combination: language={language}, accent={
+    if (language, accent1) not in MODELS:
+        raise gr.Error(f"Unsupported combination: language={language}, accent={accent1}.")
 
-
+    accent2_key = accent2 if enable_second else None
 
-
-
-
-
-
-
+    if enable_second and (language, accent2) not in MODELS:
+        raise gr.Error(f"Unsupported combination: language={language}, accent={accent2}.")
+
+    tts = _load_model(language, accent1, accent2_key)
+    key = (language, accent1, accent2_key)
+
+    # Rescale adapters from their current cached coefficients to the desired ones
+    prev_coeff1, prev_coeff2 = _current_coeffs[key]
+    _scale_lora(tts, coeff1 / prev_coeff1, adapter_name="default")
+    if accent2_key is not None:
+        _scale_lora(tts, coeff2 / prev_coeff2, adapter_name="other")
+    _current_coeffs[key] = (coeff1, coeff2 if accent2_key else 1.0)
 
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
         output_path = tmp.name
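Rescaling adapters in place by the ratio of the requested coefficient to the cached one avoids reloading the model between requests. One edge case to watch: a coefficient of exactly 0 zeroes the adapter weights irreversibly, and the next request then divides by the cached 0. A defensive sketch that clamps applied coefficients away from zero (`EPS`, `clamp_coeff`, and `rescale_factor` are hypothetical, not from the repo):

```python
# Hypothetical guard for the ratio-based rescaling in synthesise().
EPS = 1e-6  # smallest coefficient ever applied

def clamp_coeff(coeff: float) -> float:
    # Never scale fully to zero, so the weights stay recoverable.
    return max(coeff, EPS)

def rescale_factor(new_coeff: float, prev_coeff: float) -> float:
    """In-place factor that takes an adapter from prev_coeff to new_coeff."""
    return clamp_coeff(new_coeff) / clamp_coeff(prev_coeff)

# A slider move 1.0 -> 0.0 scales by 1e-6 instead of 0; moving back up
# later applies the inverse factor and recovers the original weights.
```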
@@ -211,34 +237,52 @@ the speaker's **accent**, upload a short reference audio clip, and type your tex
                 label="Reference speaker audio (3–10 s)",
                 type="filepath",
             )
+
             with gr.Row():
                 language_dd = gr.Dropdown(
                     label="Output language",
                     choices=list(ACCENTS_BY_LANGUAGE.keys()),
                     value="English",
                 )
-
+                accent1_dd = gr.Dropdown(
                     label="Speaker accent",
                     choices=ACCENTS_BY_LANGUAGE["English"],
                     value="English",
                 )
-
-                    label="Accent strength
-                    minimum=0.0,
-                    maximum=2.0,
-                    step=0.05,
-                    value=1.0,
+                coeff1_slider = gr.Slider(
+                    label="Accent strength",
+                    minimum=0.0, maximum=1.0, step=0.05, value=1.0,
                 )
+
+            with gr.Accordion("Mix a second accent (optional)", open=False):
+                enable_second = gr.Checkbox(label="Enable second accent", value=False)
+                accent2_dd = gr.Dropdown(
+                    label="Second accent",
+                    choices=ACCENTS_BY_LANGUAGE["English"],
+                    value="Hindi",
+                    interactive=True,
+                )
+                coeff2_slider = gr.Slider(
+                    label="Second accent strength",
+                    minimum=0.0, maximum=1.0, step=0.05, value=0.5,
+                )
+
             generate_btn = gr.Button("Generate", variant="primary")
 
         with gr.Column():
             audio_output = gr.Audio(label="Generated speech", type="filepath")
 
-
+    # Update both accent dropdowns when language changes
+    language_dd.change(fn=update_accent_choices, inputs=language_dd, outputs=accent1_dd)
+    language_dd.change(fn=update_accent_choices, inputs=language_dd, outputs=accent2_dd)
 
     generate_btn.click(
         fn=synthesise,
-        inputs=[
+        inputs=[
+            text_input, speaker_audio,
+            language_dd, accent1_dd, coeff1_slider,
+            enable_second, accent2_dd, coeff2_slider,
+        ],
         outputs=audio_output,
     )
 
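`update_accent_choices` is wired to both accent dropdowns here but defined outside this hunk. A plausible minimal shape for it, assuming it only swaps the dropdown's choice list when the language changes (the body below is a guess, not the repo's code):

```python
# Plausible implementation of the handler referenced above; illustrative only.
import gradio as gr

ACCENTS_BY_LANGUAGE = {"English": ["English", "Hindi"]}  # stub; defined earlier in app.py

def update_accent_choices(language: str):
    choices = ACCENTS_BY_LANGUAGE[language]
    return gr.update(choices=choices, value=choices[0])
```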
@@ -250,8 +294,9 @@ the speaker's **accent**, upload a short reference audio clip, and type your tex
 2. **Speaker accent** — the L1 accent of the target speaker style.
 3. **Reference audio** — a clean 3–10 second clip of any speaker; the model
    clones the voice while applying the chosen accent.
-4. **Accent strength** —
-
+4. **Accent strength** — LoRA adapter contribution (0 = no accent effect, 1 = full).
+5. **Mix a second accent** — optionally blend two accents together by enabling
+   a second adapter and setting its strength independently.
 
 Models are downloaded automatically on first use.
 """