"""Gradio demo for Accent Vectors.
Lets users synthesise speech with a controllable accent directly in the
browser — no local setup required.
Models are downloaded from Hugging Face on first use and cached for the
lifetime of the Space instance.
"""

import json
import os
import tempfile

import gradio as gr
import torch
from huggingface_hub import snapshot_download

from accent_task_vectors.inference import load_xtts_model, attach_lora_adapter
from accent_task_vectors.inference.inference import _scale_lora

# ---------------------------------------------------------------------------
# Model registry (mirrors download_checkpoints.py)
# ---------------------------------------------------------------------------
PRETRAINED_REPO = "NewGame/pretrained-xtts"

# (output language, accent) -> Hugging Face repo id of the LoRA adapter
MODELS = {
    ("English", "English"): "NewGame/english-accent-english-xtts",
    ("English", "Hindi"): "NewGame/hindi-accent-english-xtts",
    ("English", "German"): "NewGame/german-accent-english-xtts",
    ("English", "French"): "NewGame/french-accent-english-xtts",
    ("English", "Spanish"): "NewGame/spanish-accent-english-xtts",
    ("English", "Mandarin"): "NewGame/mandarin-accent-english-xtts",
    ("Spanish", "English"): "NewGame/english-accent-spanish-xtts",
    ("German", "English"): "NewGame/english-accent-german-xtts",
    ("Mandarin", "English"): "NewGame/english-accent-mandarin-xtts",
}

# Language code passed to the TTS model
LANGUAGE_CODES = {
    "English": "en",
    "Spanish": "es",
    "German": "de",
    "Mandarin": "zh-cn",
}

# Accents available for each output language
ACCENTS_BY_LANGUAGE = {
    "English": ["English", "Hindi", "German", "French", "Spanish", "Mandarin"],
    "Spanish": ["English"],
    "German": ["English"],
    "Mandarin": ["English"],
}

# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
CACHE_DIR = os.environ.get("MODEL_CACHE_DIR", "model_cache")
PRETRAINED_DIR = os.path.join(CACHE_DIR, "pretrained")

# config.json fields that must be rewritten to point at files in PRETRAINED_DIR
_PRETRAINED_PATH_FIELDS = {
    "mel_norm_file": "mel_stats.pth",
    "dvae_checkpoint": "dvae.pth",
    "xtts_checkpoint": "model.pth",
    "tokenizer_file": "vocab.json",
}

# ---------------------------------------------------------------------------
# In-memory model cache
#   _model_cache:    (language, accent1, accent2 | None) -> tts
#   _current_coeffs: same key -> (coeff1, coeff2)
# ---------------------------------------------------------------------------
_model_cache: dict = {}
_current_coeffs: dict = {}

_device = "cuda" if torch.cuda.is_available() else "cpu"


def _patch_config(config_path: str, pretrained_dir: str) -> None:
    """Rewrite pretrained-asset paths in a LoRA config.json to absolute local paths."""
    with open(config_path) as f:
        config = json.load(f)

    abs_pretrained = os.path.abspath(pretrained_dir)
    changed = False

    def _patch(obj):
        nonlocal changed
        if isinstance(obj, dict):
            for key, filename in _PRETRAINED_PATH_FIELDS.items():
                if key in obj:
                    new_val = os.path.join(abs_pretrained, filename)
                    if obj[key] != new_val:
                        obj[key] = new_val
                        changed = True
            for v in obj.values():
                _patch(v)

    _patch(config)
    if changed:
        with open(config_path, "w") as f:
            json.dump(config, f, indent=2)
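
# For illustration: _patch_config turns a repo-relative entry such as
#     {"model_args": {"xtts_checkpoint": "model.pth"}}
# into an absolute local path like
#     {"model_args": {"xtts_checkpoint": "<CACHE_DIR>/pretrained/model.pth"}}
# ("model_args" is a hypothetical nesting level here; only the four keys in
# _PRETRAINED_PATH_FIELDS are matched, wherever they appear in the config).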


def _ensure_pretrained() -> None:
    if not os.path.isdir(PRETRAINED_DIR):
        print(f"Downloading pretrained model from {PRETRAINED_REPO} …")
        snapshot_download(
            repo_id=PRETRAINED_REPO,
            repo_type="model",
            local_dir=PRETRAINED_DIR,
        )


def _download_lora(language: str, accent: str) -> str:
    """Download a LoRA adapter if needed; return its local directory."""
    lora_dir = os.path.join(CACHE_DIR, f"{accent.lower()}-accent-{language.lower()}")
    if not os.path.isdir(lora_dir):
        repo_id = MODELS[(language, accent)]
        print(f"Downloading LoRA adapter from {repo_id} …")
        snapshot_download(
            repo_id=repo_id,
            repo_type="model",
            local_dir=lora_dir,
            allow_patterns=["config.json", "lora/best_model/**"],
        )
        _patch_config(os.path.join(lora_dir, "config.json"), PRETRAINED_DIR)
    return lora_dir
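
# After a download the cache holds, e.g. for ("English", "Hindi"):
#
#   model_cache/hindi-accent-english/
#   ├── config.json        # patched to reference model_cache/pretrained/
#   └── lora/best_model/   # adapter weights read by attach_lora_adapter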


def _load_model(language: str, accent1: str, accent2: str | None):
    """Return a cached TTS model with adapter(s) loaded at coeff=1.0."""
    key = (language, accent1, accent2)
    if key in _model_cache:
        return _model_cache[key]

    _ensure_pretrained()
    lora_dir1 = _download_lora(language, accent1)

    checkpoint_path = os.path.join(PRETRAINED_DIR, "checkpoint_0.pth")
    config_path = os.path.join(lora_dir1, "config.json")
    lora_path1 = os.path.join(lora_dir1, "lora", "best_model")

    tts = load_xtts_model(checkpoint_path, config_path, device=_device)
    tts = attach_lora_adapter(tts, lora_path=lora_path1, adapter_name="default", scaling_coef=1.0)

    if accent2 is not None:
        lora_dir2 = _download_lora(language, accent2)
        lora_path2 = os.path.join(lora_dir2, "lora", "best_model")
        tts = attach_lora_adapter(tts, lora_path=lora_path2, adapter_name="other", scaling_coef=1.0)
        # Activate both adapters so their contributions are applied together
        tts.synthesizer.tts_model.set_adapter(["default", "other"])

    _model_cache[key] = tts
    _current_coeffs[key] = (1.0, 1.0)
    return tts
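
# With both adapters active, the update to a wrapped weight is additive.
# Assuming _scale_lora multiplies an adapter's LoRA scaling in place, an
# adapted layer weight W behaves as
#
#     W' = W + coeff1 * (B1 @ A1) + coeff2 * (B2 @ A2)
#
# so the two accent strengths can be blended independently.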


# ---------------------------------------------------------------------------
# Inference function called by Gradio
# ---------------------------------------------------------------------------
def synthesise(
    text: str,
    speaker_audio: str,
    language: str,
    accent1: str,
    coeff1: float,
    enable_second: bool,
    accent2: str,
    coeff2: float,
):
    """Validate the inputs, then synthesise speech and return the WAV path."""
    if not text.strip():
        raise gr.Error("Please enter some text to synthesise.")
    if speaker_audio is None:
        raise gr.Error("Please upload a reference speaker audio file.")
    if (language, accent1) not in MODELS:
        raise gr.Error(f"Unsupported combination: language={language}, accent={accent1}.")

    accent2_key = accent2 if enable_second else None
    if enable_second and (language, accent2) not in MODELS:
        raise gr.Error(f"Unsupported combination: language={language}, accent={accent2}.")

    tts = _load_model(language, accent1, accent2_key)
    key = (language, accent1, accent2_key)
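
    # _scale_lora applies a multiplicative factor, so moving an adapter from
    # its cached coefficient to the requested one means scaling by the ratio.
    # Worked example: loaded at 1.0, request 0.4 -> scale by 0.4; a later
    # request of 0.8 then scales by 0.8 / 0.4 = 2.0.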
    # Rescale adapters from their current cached coefficients to the desired
    # ones. Clamp the coefficients away from zero: scaling the weights to
    # exactly 0 could not be undone by a later multiplicative rescale.
    eps = 1e-6
    coeff1 = max(coeff1, eps)
    coeff2 = max(coeff2, eps)
    prev_coeff1, prev_coeff2 = _current_coeffs[key]
    _scale_lora(tts, coeff1 / prev_coeff1, adapter_name="default")
    if accent2_key is not None:
        _scale_lora(tts, coeff2 / prev_coeff2, adapter_name="other")
    _current_coeffs[key] = (coeff1, coeff2 if accent2_key else 1.0)

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = tmp.name
    tts.tts_to_file(
        text=text,
        speaker_wav=speaker_audio,
        language=LANGUAGE_CODES[language],
        file_path=output_path,
    )
    return output_path


# ---------------------------------------------------------------------------
# Gradio UI
# ---------------------------------------------------------------------------
def update_accent_choices(language: str):
    accents = ACCENTS_BY_LANGUAGE.get(language, [])
    return gr.update(choices=accents, value=accents[0] if accents else None)


with gr.Blocks(title="Accent Vectors") as demo:
    gr.Markdown(
        """
        # Accent Vectors

        Synthesise speech with a controllable accent — pick the output **language**,
        the speaker's **accent**, upload a short reference audio clip, and type your text.

        > **Paper:** *Accent Vector: Controllable Accent Manipulation for Multilingual TTS
        > Without Accented Data* (submitted to Interspeech 2026)
        """
    )

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Text to synthesise",
                placeholder="Type something here…",
                lines=3,
            )
            speaker_audio = gr.Audio(
                label="Reference speaker audio (3–10 s)",
                type="filepath",
            )
            with gr.Row():
                language_dd = gr.Dropdown(
                    label="Output language",
                    choices=list(ACCENTS_BY_LANGUAGE.keys()),
                    value="English",
                )
                accent1_dd = gr.Dropdown(
                    label="Speaker accent",
                    choices=ACCENTS_BY_LANGUAGE["English"],
                    value="English",
                )
            coeff1_slider = gr.Slider(
                label="Accent strength",
                minimum=0.0, maximum=1.0, step=0.05, value=1.0,
            )
            with gr.Accordion("Mix a second accent (optional)", open=False):
                enable_second = gr.Checkbox(label="Enable second accent", value=False)
                accent2_dd = gr.Dropdown(
                    label="Second accent",
                    choices=ACCENTS_BY_LANGUAGE["English"],
                    value="Hindi",
                    interactive=True,
                )
                coeff2_slider = gr.Slider(
                    label="Second accent strength",
                    minimum=0.0, maximum=1.0, step=0.05, value=0.5,
                )
            generate_btn = gr.Button("Generate", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(label="Generated speech", type="filepath")

    # Update both accent dropdowns when the language changes
    language_dd.change(fn=update_accent_choices, inputs=language_dd, outputs=accent1_dd)
    language_dd.change(fn=update_accent_choices, inputs=language_dd, outputs=accent2_dd)

    generate_btn.click(
        fn=synthesise,
        inputs=[
            text_input, speaker_audio,
            language_dd, accent1_dd, coeff1_slider,
            enable_second, accent2_dd, coeff2_slider,
        ],
        outputs=audio_output,
    )

    gr.Markdown(
        """
        ---
        ### How to use

        1. **Output language** — the language the model will speak in.
        2. **Speaker accent** — the L1 accent of the target speaker style.
        3. **Reference audio** — a clean 3–10 second clip of any speaker; the model
           clones the voice while applying the chosen accent.
        4. **Accent strength** — LoRA adapter contribution (0 = no accent effect, 1 = full).
        5. **Mix a second accent** — optionally blend two accents together by enabling
           a second adapter and setting its strength independently.

        Models are downloaded automatically on first use.
        """
    )
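
# Minimal programmatic smoke test, bypassing the UI ("reference.wav" is a
# hypothetical local clip; substitute any short recording):
#
#   wav_path = synthesise(
#       text="Hello from Accent Vectors.",
#       speaker_audio="reference.wav",
#       language="English",
#       accent1="Hindi",
#       coeff1=0.8,
#       enable_second=False,
#       accent2="English",
#       coeff2=0.0,
#   )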


if __name__ == "__main__":
    demo.launch()