"""Gradio demo for Accent Vectors. Lets users synthesise speech with a controllable accent directly in the browser — no local setup required. Models are downloaded from Hugging Face on first use and cached for the lifetime of the Space instance. """ import os import json import tempfile import gradio as gr import torch from huggingface_hub import snapshot_download from accent_task_vectors.inference import load_xtts_model, attach_lora_adapter from accent_task_vectors.inference.inference import _scale_lora # --------------------------------------------------------------------------- # Model registry (mirrors download_checkpoints.py) # --------------------------------------------------------------------------- PRETRAINED_REPO = "NewGame/pretrained-xtts" MODELS = { ("English", "English"): "NewGame/english-accent-english-xtts", ("English", "Hindi"): "NewGame/hindi-accent-english-xtts", ("English", "German"): "NewGame/german-accent-english-xtts", ("English", "French"): "NewGame/french-accent-english-xtts", ("English", "Spanish"): "NewGame/spanish-accent-english-xtts", ("English", "Mandarin"): "NewGame/mandarin-accent-english-xtts", ("Spanish", "English"): "NewGame/english-accent-spanish-xtts", ("German", "English"): "NewGame/english-accent-german-xtts", ("Mandarin", "English"): "NewGame/english-accent-mandarin-xtts", } # Language code passed to the TTS model LANGUAGE_CODES = { "English": "en", "Spanish": "es", "German": "de", "Mandarin": "zh-cn", } # Accents available for each output language ACCENTS_BY_LANGUAGE = { "English": ["English", "Hindi", "German", "French", "Spanish", "Mandarin"], "Spanish": ["English"], "German": ["English"], "Mandarin": ["English"], } # --------------------------------------------------------------------------- # Paths # --------------------------------------------------------------------------- CACHE_DIR = os.environ.get("MODEL_CACHE_DIR", "model_cache") PRETRAINED_DIR = os.path.join(CACHE_DIR, "pretrained") _PRETRAINED_PATH_FIELDS = { "mel_norm_file": "mel_stats.pth", "dvae_checkpoint": "dvae.pth", "xtts_checkpoint": "model.pth", "tokenizer_file": "vocab.json", } # --------------------------------------------------------------------------- # In-memory model cache # _model_cache: (language, accent1, accent2|None) -> tts # _current_coeffs: same key -> (coeff1, coeff2) # --------------------------------------------------------------------------- _model_cache: dict = {} _current_coeffs: dict = {} _device = "cuda" if torch.cuda.is_available() else "cpu" def _patch_config(config_path: str, pretrained_dir: str) -> None: with open(config_path) as f: config = json.load(f) abs_pretrained = os.path.abspath(pretrained_dir) changed = False def _patch(obj): nonlocal changed if isinstance(obj, dict): for key, filename in _PRETRAINED_PATH_FIELDS.items(): if key in obj: new_val = os.path.join(abs_pretrained, filename) if obj[key] != new_val: obj[key] = new_val changed = True for v in obj.values(): _patch(v) _patch(config) if changed: with open(config_path, "w") as f: json.dump(config, f, indent=2) def _ensure_pretrained() -> None: if not os.path.isdir(PRETRAINED_DIR): print(f"Downloading pretrained model from {PRETRAINED_REPO} …") snapshot_download( repo_id=PRETRAINED_REPO, repo_type="model", local_dir=PRETRAINED_DIR, ) def _download_lora(language: str, accent: str) -> str: """Download a LoRA adapter if needed; return its local directory.""" lora_dir = os.path.join(CACHE_DIR, f"{accent.lower()}-accent-{language.lower()}") if not os.path.isdir(lora_dir): repo_id = MODELS[(language, accent)] print(f"Downloading LoRA adapter from {repo_id} …") snapshot_download( repo_id=repo_id, repo_type="model", local_dir=lora_dir, allow_patterns=["config.json", "lora/best_model/**"], ) _patch_config(os.path.join(lora_dir, "config.json"), PRETRAINED_DIR) return lora_dir def _load_model(language: str, accent1: str, accent2: str | None): """Return a cached TTS model with adapter(s) loaded at coeff=1.0.""" key = (language, accent1, accent2) if key in _model_cache: return _model_cache[key] _ensure_pretrained() lora_dir1 = _download_lora(language, accent1) checkpoint_path = os.path.join(PRETRAINED_DIR, "checkpoint_0.pth") config_path = os.path.join(lora_dir1, "config.json") lora_path1 = os.path.join(lora_dir1, "lora", "best_model") tts = load_xtts_model(checkpoint_path, config_path, device=_device) tts = attach_lora_adapter(tts, lora_path=lora_path1, adapter_name="default", scaling_coef=1.0) if accent2 is not None: lora_dir2 = _download_lora(language, accent2) lora_path2 = os.path.join(lora_dir2, "lora", "best_model") tts = attach_lora_adapter(tts, lora_path=lora_path2, adapter_name="other", scaling_coef=1.0) tts.synthesizer.tts_model.set_adapter(["default", "other"]) _model_cache[key] = tts _current_coeffs[key] = (1.0, 1.0) return tts # --------------------------------------------------------------------------- # Inference function called by Gradio # --------------------------------------------------------------------------- def synthesise( text: str, speaker_audio: str, language: str, accent1: str, coeff1: float, enable_second: bool, accent2: str, coeff2: float, ): if not text.strip(): raise gr.Error("Please enter some text to synthesise.") if speaker_audio is None: raise gr.Error("Please upload a reference speaker audio file.") if (language, accent1) not in MODELS: raise gr.Error(f"Unsupported combination: language={language}, accent={accent1}.") accent2_key = accent2 if enable_second else None if enable_second and (language, accent2) not in MODELS: raise gr.Error(f"Unsupported combination: language={language}, accent={accent2}.") tts = _load_model(language, accent1, accent2_key) key = (language, accent1, accent2_key) # Rescale adapters from their current cached coefficients to the desired ones prev_coeff1, prev_coeff2 = _current_coeffs[key] _scale_lora(tts, coeff1 / prev_coeff1, adapter_name="default") if accent2_key is not None: _scale_lora(tts, coeff2 / prev_coeff2, adapter_name="other") _current_coeffs[key] = (coeff1, coeff2 if accent2_key else 1.0) with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp: output_path = tmp.name tts.tts_to_file( text=text, speaker_wav=speaker_audio, language=LANGUAGE_CODES[language], file_path=output_path, ) return output_path # --------------------------------------------------------------------------- # Gradio UI # --------------------------------------------------------------------------- def update_accent_choices(language: str): accents = ACCENTS_BY_LANGUAGE.get(language, []) return gr.update(choices=accents, value=accents[0]) with gr.Blocks(title="Accent Vectors") as demo: gr.Markdown( """ # Accent Vectors Synthesise speech with a controllable accent — pick the output **language**, the speaker's **accent**, upload a short reference audio clip, and type your text. > **Paper:** *Accent Vector: Controllable Accent Manipulation for Multilingual TTS > Without Accented Data* (submitted to Interspeech 2026) """ ) with gr.Row(): with gr.Column(): text_input = gr.Textbox( label="Text to synthesise", placeholder="Type something here…", lines=3, ) speaker_audio = gr.Audio( label="Reference speaker audio (3–10 s)", type="filepath", ) with gr.Row(): language_dd = gr.Dropdown( label="Output language", choices=list(ACCENTS_BY_LANGUAGE.keys()), value="English", ) accent1_dd = gr.Dropdown( label="Speaker accent", choices=ACCENTS_BY_LANGUAGE["English"], value="English", ) coeff1_slider = gr.Slider( label="Accent strength", minimum=0.0, maximum=1.0, step=0.05, value=1.0, ) with gr.Accordion("Mix a second accent (optional)", open=False): enable_second = gr.Checkbox(label="Enable second accent", value=False) accent2_dd = gr.Dropdown( label="Second accent", choices=ACCENTS_BY_LANGUAGE["English"], value="Hindi", interactive=True, ) coeff2_slider = gr.Slider( label="Second accent strength", minimum=0.0, maximum=1.0, step=0.05, value=0.5, ) generate_btn = gr.Button("Generate", variant="primary") with gr.Column(): audio_output = gr.Audio(label="Generated speech", type="filepath") # Update both accent dropdowns when language changes language_dd.change(fn=update_accent_choices, inputs=language_dd, outputs=accent1_dd) language_dd.change(fn=update_accent_choices, inputs=language_dd, outputs=accent2_dd) generate_btn.click( fn=synthesise, inputs=[ text_input, speaker_audio, language_dd, accent1_dd, coeff1_slider, enable_second, accent2_dd, coeff2_slider, ], outputs=audio_output, ) gr.Markdown( """ --- ### How to use 1. **Output language** — the language the model will speak in. 2. **Speaker accent** — the L1 accent of the target speaker style. 3. **Reference audio** — a clean 3–10 second clip of any speaker; the model clones the voice while applying the chosen accent. 4. **Accent strength** — LoRA adapter contribution (0 = no accent effect, 1 = full). 5. **Mix a second accent** — optionally blend two accents together by enabling a second adapter and setting its strength independently. Models are downloaded automatically on first use. """ ) if __name__ == "__main__": demo.launch()