Spaces:

NewGame
/

AccentVector

Running

App Files Files Community

NewGame committed 6 days ago

Commit

2b90282

1 Parent(s): 46364f7

Add Gradio demo

Browse files

Files changed (4) hide show

README.md +11 -6
app.py +261 -0
packages.txt +2 -0
requirements.txt +16 -0

README.md CHANGED Viewed

@@ -1,12 +1,17 @@
 ---
-title: AccentVector
-emoji: ⚡
-colorFrom: indigo
-colorTo: green
 sdk: gradio
-sdk_version: 6.12.0
 app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Accent Vectors
+emoji: 🗣️
+colorFrom: blue
+colorTo: indigo
 sdk: gradio
+sdk_version: "4.44.0"
 app_file: app.py
 pinned: false
+license: apache-2.0
 ---
+# Accent Vectors
+Synthesise speech with a controllable accent using task arithmetic on XTTS v2.
+See the [main repository](https://github.com/NewGamezzz/AccentVector) for training code and details.

app.py ADDED Viewed

	@@ -0,0 +1,261 @@

+"""Gradio demo for Accent Vectors.
+Lets users synthesise speech with a controllable accent directly in the
+browser — no local setup required.
+Models are downloaded from Hugging Face on first use and cached for the
+lifetime of the Space instance.
+"""
+import os
+import tempfile
+import gradio as gr
+import torch
+from huggingface_hub import snapshot_download
+from accent_task_vectors.inference import load_xtts_model, attach_lora_adapter
+# ---------------------------------------------------------------------------
+# Model registry (mirrors download_checkpoints.py)
+# ---------------------------------------------------------------------------
+# Hub repository downloaded by `_ensure_pretrained` into PRETRAINED_DIR.
+PRETRAINED_REPO = "NewGame/pretrained-xtts"
+# Maps (output language, accent) -> Hub repo id of the adapter checkpoint.
+# `synthesise` rejects any combination that is not a key of this table.
+MODELS = {
+    ("English",  "English"):  "NewGame/english-accent-english-xtts",
+    ("English",  "Hindi"):    "NewGame/hindi-accent-english-xtts",
+    ("English",  "German"):   "NewGame/german-accent-english-xtts",
+    ("English",  "French"):   "NewGame/french-accent-english-xtts",
+    ("English",  "Spanish"):  "NewGame/spanish-accent-english-xtts",
+    ("English",  "Mandarin"): "NewGame/mandarin-accent-english-xtts",
+    ("Spanish",  "English"):  "NewGame/english-accent-spanish-xtts",
+    ("German",   "English"):  "NewGame/english-accent-german-xtts",
+    ("Mandarin", "English"):  "NewGame/english-accent-mandarin-xtts",
+}
+# Maps the UI language name to the `language=` code given to `tts.tts_to_file`.
+LANGUAGE_CODES = {
+    "English":  "en",
+    "Spanish":  "es",
+    "German":   "de",
+    "Mandarin": "zh-cn",
+}
+# Accents offered in the UI for each output language. Keep this in sync with
+# the keys of MODELS: a pair listed here but missing there is rejected at
+# synthesis time.
+ACCENTS_BY_LANGUAGE = {
+    "English":  ["English", "Hindi", "German", "French", "Spanish", "Mandarin"],
+    "Spanish":  ["English"],
+    "German":   ["English"],
+    "Mandarin": ["English"],
+}
+# ---------------------------------------------------------------------------
+# Paths
+# ---------------------------------------------------------------------------
+# Root directory for all downloaded files; override with the MODEL_CACHE_DIR
+# environment variable (defaults to a relative "model_cache" directory).
+CACHE_DIR     = os.environ.get("MODEL_CACHE_DIR", "model_cache")
+PRETRAINED_DIR = os.path.join(CACHE_DIR, "pretrained")
+# Maps a config.json key to the file name (inside the pretrained directory)
+# that `_patch_config` rewrites that key to point at.
+_PRETRAINED_PATH_FIELDS = {
+    "mel_norm_file":   "mel_stats.pth",
+    "dvae_checkpoint": "dvae.pth",
+    "xtts_checkpoint": "model.pth",
+    "tokenizer_file":  "vocab.json",
+}
+# ---------------------------------------------------------------------------
+# In-memory model cache  {(language, accent): tts}
+# ---------------------------------------------------------------------------
+# Filled by `_load_model` so each combination is loaded at most once per process.
+_model_cache: dict = {}
+# Device string handed to `load_xtts_model`; CUDA is used whenever available.
+_device = "cuda" if torch.cuda.is_available() else "cpu"
+def _patch_config(config_path: str, pretrained_dir: str) -> None:
+    """Rewrite pretrained model paths in config.json to point to local dir."""
+    import json
+    with open(config_path) as f:
+        config = json.load(f)
+    abs_pretrained = os.path.abspath(pretrained_dir)
+    changed = False
+    def _patch(obj):
+        nonlocal changed
+        if isinstance(obj, dict):
+            for key, filename in _PRETRAINED_PATH_FIELDS.items():
+                if key in obj:
+                    new_val = os.path.join(abs_pretrained, filename)
+                    if obj[key] != new_val:
+                        obj[key] = new_val
+                        changed = True
+            for v in obj.values():
+                _patch(v)
+    _patch(config)
+    if changed:
+        with open(config_path, "w") as f:
+            json.dump(config, f, indent=2)
+def _ensure_pretrained() -> None:
+    """Download the base pretrained XTTS model if not already cached."""
+    if not os.path.isdir(PRETRAINED_DIR):
+        print(f"Downloading pretrained model from {PRETRAINED_REPO} …")
+        snapshot_download(
+            repo_id=PRETRAINED_REPO,
+            repo_type="model",
+            local_dir=PRETRAINED_DIR,
+        )
+def _load_model(language: str, accent: str) -> object:
+    """Return a cached (or freshly loaded) TTS model for the given combination."""
+    key = (language, accent)
+    if key in _model_cache:
+        return _model_cache[key]
+    _ensure_pretrained()
+    repo_id = MODELS[key]
+    lora_dir = os.path.join(CACHE_DIR, f"{accent.lower()}-accent-{language.lower()}")
+    if not os.path.isdir(lora_dir):
+        print(f"Downloading LoRA adapter from {repo_id} …")
+        snapshot_download(
+            repo_id=repo_id,
+            repo_type="model",
+            local_dir=lora_dir,
+            allow_patterns=["config.json", "lora/best_model/**"],
+        )
+        _patch_config(os.path.join(lora_dir, "config.json"), PRETRAINED_DIR)
+    checkpoint_path = os.path.join(PRETRAINED_DIR, "checkpoint_0.pth")
+    config_path     = os.path.join(lora_dir, "config.json")
+    lora_path       = os.path.join(lora_dir, "lora", "best_model")
+    tts = load_xtts_model(checkpoint_path, config_path, device=_device)
+    tts = attach_lora_adapter(tts, lora_path=lora_path)
+    _model_cache[key] = tts
+    return tts
+# ---------------------------------------------------------------------------
+# Inference function called by Gradio
+# ---------------------------------------------------------------------------
+def synthesise(text: str, speaker_audio: str, language: str, accent: str, lora_coeff: float):
+    if not text.strip():
+        raise gr.Error("Please enter some text to synthesise.")
+    if speaker_audio is None:
+        raise gr.Error("Please upload a reference speaker audio file.")
+    if (language, accent) not in MODELS:
+        raise gr.Error(f"Unsupported combination: language={language}, accent={accent}.")
+    tts = _load_model(language, accent)
+    # Scale LoRA if needed
+    if lora_coeff != 1.0:
+        from accent_task_vectors.inference.inference import _scale_lora
+        # Reset to 1.0 first, then apply desired coefficient
+        _scale_lora(tts, lora_coeff / getattr(tts, "_last_lora_coeff", 1.0))
+    tts._last_lora_coeff = lora_coeff
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+        output_path = tmp.name
+    tts.tts_to_file(
+        text=text,
+        speaker_wav=speaker_audio,
+        language=LANGUAGE_CODES[language],
+        file_path=output_path,
+    )
+    return output_path
+# ---------------------------------------------------------------------------
+# Gradio UI
+# ---------------------------------------------------------------------------
+def update_accent_choices(language: str):
+    accents = ACCENTS_BY_LANGUAGE.get(language, [])
+    return gr.update(choices=accents, value=accents[0])
+# Build the page: a header, a two-column row (inputs on the left, generated
+# audio on the right), the event wiring, and a usage guide at the bottom.
+with gr.Blocks(title="Accent Vectors") as demo:
+    gr.Markdown(
+        """
+# Accent Vectors
+Synthesise speech with a controllable accent — pick the output **language**,
+the speaker's **accent**, upload a short reference audio clip, and type your text.
+> **Paper:** *Accent Vector: Controllable Accent Manipulation for Multilingual TTS
+> Without Accented Data* (Interspeech 2026)
+"""
+    )
+    with gr.Row():
+        # Left column: every input consumed by `synthesise`.
+        with gr.Column():
+            text_input = gr.Textbox(
+                label="Text to synthesise",
+                placeholder="Type something here…",
+                lines=3,
+            )
+            # Passed to `synthesise` as `speaker_audio`.
+            speaker_audio = gr.Audio(
+                label="Reference speaker audio (3–10 s)",
+                type="filepath",
+            )
+            with gr.Row():
+                language_dd = gr.Dropdown(
+                    label="Output language",
+                    choices=list(ACCENTS_BY_LANGUAGE.keys()),
+                    value="English",
+                )
+                # Starts with the English accent list; refreshed by the
+                # `language_dd.change` handler wired up below.
+                accent_dd = gr.Dropdown(
+                    label="Speaker accent",
+                    choices=ACCENTS_BY_LANGUAGE["English"],
+                    value="English",
+                )
+            # Defaults to 1.0; `synthesise` receives the value as `lora_coeff`.
+            lora_coeff = gr.Slider(
+                label="Accent strength (LoRA coefficient)",
+                minimum=0.0,
+                maximum=2.0,
+                step=0.05,
+                value=1.0,
+            )
+            generate_btn = gr.Button("Generate", variant="primary")
+        # Right column: receives the file path returned by `synthesise`.
+        with gr.Column():
+            audio_output = gr.Audio(label="Generated speech", type="filepath")
+    # Changing the language replaces the accent dropdown's choices.
+    language_dd.change(fn=update_accent_choices, inputs=language_dd, outputs=accent_dd)
+    # The order of `inputs` must match the parameter order of `synthesise`.
+    generate_btn.click(
+        fn=synthesise,
+        inputs=[text_input, speaker_audio, language_dd, accent_dd, lora_coeff],
+        outputs=audio_output,
+    )
+    gr.Markdown(
+        """
+---
+### How to use
+1. **Output language** — the language the model will speak in.
+2. **Speaker accent** — the L1 accent of the target speaker style.
+3. **Reference audio** — a clean 3–10 second clip of any speaker; the model
+   clones the voice while applying the chosen accent.
+4. **Accent strength** — scale the LoRA adapter contribution (1.0 = default,
+   0 = no accent modification, >1 = stronger accent).
+Models are downloaded automatically on first use.
+"""
+    )
+# Launch the app only when this file is executed as a script.
+if __name__ == "__main__":
+    demo.launch()

packages.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ libsndfile1
2	+ ffmpeg

requirements.txt ADDED Viewed

	@@ -0,0 +1,16 @@

+# Install the accent_task_vectors package and modified Coqui TTS from GitHub
+git+https://github.com/NewGamezzz/AccentVector.git
+git+https://github.com/NewGamezzz/AccentVector.git#subdirectory=TTS
+# Runtime dependencies (versions match setup.py)
+torch==2.5.0
+torchaudio==2.5.0
+numpy
+pandas
+pyyaml
+tqdm
+soundfile
+safetensors
+peft==0.10.0
+huggingface_hub
+gradio