NewGame committed on
Commit
2b90282
·
1 Parent(s): 46364f7

Add Gradio demo

Browse files
Files changed (4) hide show
  1. README.md +11 -6
  2. app.py +261 -0
  3. packages.txt +2 -0
  4. requirements.txt +16 -0
README.md CHANGED
@@ -1,12 +1,17 @@
1
  ---
2
- title: AccentVector
3
- emoji:
4
- colorFrom: indigo
5
- colorTo: green
6
  sdk: gradio
7
- sdk_version: 6.12.0
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
1
  ---
2
+ title: Accent Vectors
3
+ emoji: 🗣️
4
+ colorFrom: blue
5
+ colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: "4.44.0"
8
  app_file: app.py
9
  pinned: false
10
+ license: apache-2.0
11
  ---
12
 
13
+ # Accent Vectors
14
+
15
+ Synthesise speech with a controllable accent using task arithmetic on XTTS v2.
16
+
17
+ See the [main repository](https://github.com/NewGamezzz/AccentVector) for training code and details.
app.py ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Gradio demo for Accent Vectors.
2
+
3
+ Lets users synthesise speech with a controllable accent directly in the
4
+ browser — no local setup required.
5
+
6
+ Models are downloaded from Hugging Face on first use and cached for the
7
+ lifetime of the Space instance.
8
+ """
9
+
10
+ import os
11
+ import tempfile
12
+
13
+ import gradio as gr
14
+ import torch
15
+ from huggingface_hub import snapshot_download
16
+
17
+ from accent_task_vectors.inference import load_xtts_model, attach_lora_adapter
18
+
19
# ---------------------------------------------------------------------------
# Model registry (mirrors download_checkpoints.py)
# ---------------------------------------------------------------------------

# Hugging Face repo holding the base (pretrained, non-fine-tuned) XTTS model.
PRETRAINED_REPO = "NewGame/pretrained-xtts"

# (output language, speaker accent) -> HF repo containing the fine-tuned
# LoRA adapter for that combination. Keys here must stay in sync with
# ACCENTS_BY_LANGUAGE below, which drives the UI dropdowns.
MODELS = {
    ("English", "English"): "NewGame/english-accent-english-xtts",
    ("English", "Hindi"): "NewGame/hindi-accent-english-xtts",
    ("English", "German"): "NewGame/german-accent-english-xtts",
    ("English", "French"): "NewGame/french-accent-english-xtts",
    ("English", "Spanish"): "NewGame/spanish-accent-english-xtts",
    ("English", "Mandarin"): "NewGame/mandarin-accent-english-xtts",
    ("Spanish", "English"): "NewGame/english-accent-spanish-xtts",
    ("German", "English"): "NewGame/english-accent-german-xtts",
    ("Mandarin", "English"): "NewGame/english-accent-mandarin-xtts",
}

# Language code passed to the TTS model
LANGUAGE_CODES = {
    "English": "en",
    "Spanish": "es",
    "German": "de",
    "Mandarin": "zh-cn",
}

# Accents available for each output language
# (each listed accent must have a (language, accent) entry in MODELS).
ACCENTS_BY_LANGUAGE = {
    "English": ["English", "Hindi", "German", "French", "Spanish", "Mandarin"],
    "Spanish": ["English"],
    "German": ["English"],
    "Mandarin": ["English"],
}
52
+
53
# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------

# Root directory for all downloaded checkpoints; override with the
# MODEL_CACHE_DIR environment variable.
CACHE_DIR = os.environ.get("MODEL_CACHE_DIR", "model_cache")
PRETRAINED_DIR = os.path.join(CACHE_DIR, "pretrained")

# Keys in config.json that hold pretrained model paths
# (rewritten by _patch_config to point at files inside PRETRAINED_DIR).
_PRETRAINED_PATH_FIELDS = {
    "mel_norm_file": "mel_stats.pth",
    "dvae_checkpoint": "dvae.pth",
    "xtts_checkpoint": "model.pth",
    "tokenizer_file": "vocab.json",
}

# ---------------------------------------------------------------------------
# In-memory model cache {(language, accent): tts}
# ---------------------------------------------------------------------------

# Loaded models are kept for the lifetime of the process; nothing evicts them,
# so memory grows with the number of distinct (language, accent) pairs used.
_model_cache: dict = {}
_device = "cuda" if torch.cuda.is_available() else "cpu"
74
+
75
+
76
def _patch_config(config_path: str, pretrained_dir: str) -> None:
    """Rewrite pretrained model paths in config.json to point to local dir.

    Walks the entire config tree and, for every key listed in
    ``_PRETRAINED_PATH_FIELDS``, replaces the value with the absolute path of
    the corresponding file inside *pretrained_dir*. The file is rewritten
    only when at least one value actually changed.
    """
    import json

    with open(config_path) as f:
        config = json.load(f)

    abs_pretrained = os.path.abspath(pretrained_dir)
    changed = False

    def _patch(obj):
        nonlocal changed
        if isinstance(obj, dict):
            for key, filename in _PRETRAINED_PATH_FIELDS.items():
                if key in obj:
                    new_val = os.path.join(abs_pretrained, filename)
                    if obj[key] != new_val:
                        obj[key] = new_val
                        changed = True
            for v in obj.values():
                _patch(v)
        elif isinstance(obj, list):
            # Also descend into lists: the original walk only recursed into
            # dict values, so path fields inside lists of sub-configs were
            # silently left unpatched.
            for v in obj:
                _patch(v)

    _patch(config)

    if changed:
        with open(config_path, "w") as f:
            json.dump(config, f, indent=2)
103
+
104
+
105
def _ensure_pretrained() -> None:
    """Download the base pretrained XTTS model if not already cached."""
    # Guard clause: a populated cache directory means nothing to do.
    if os.path.isdir(PRETRAINED_DIR):
        return
    print(f"Downloading pretrained model from {PRETRAINED_REPO} …")
    snapshot_download(
        repo_id=PRETRAINED_REPO,
        repo_type="model",
        local_dir=PRETRAINED_DIR,
    )
114
+
115
+
116
def _load_model(language: str, accent: str) -> object:
    """Return a cached (or freshly loaded) TTS model for the given combination."""
    cache_key = (language, accent)
    cached = _model_cache.get(cache_key)
    if cached is not None:
        return cached

    # The LoRA adapter is applied on top of the base model, so make sure the
    # base checkpoint exists first.
    _ensure_pretrained()

    repo_id = MODELS[cache_key]
    adapter_dir = os.path.join(
        CACHE_DIR, f"{accent.lower()}-accent-{language.lower()}"
    )

    if not os.path.isdir(adapter_dir):
        print(f"Downloading LoRA adapter from {repo_id} …")
        snapshot_download(
            repo_id=repo_id,
            repo_type="model",
            local_dir=adapter_dir,
            # Only the config and the best adapter weights are needed.
            allow_patterns=["config.json", "lora/best_model/**"],
        )
        # Redirect the config's pretrained-model paths at the local cache.
        _patch_config(os.path.join(adapter_dir, "config.json"), PRETRAINED_DIR)

    tts = load_xtts_model(
        os.path.join(PRETRAINED_DIR, "checkpoint_0.pth"),
        os.path.join(adapter_dir, "config.json"),
        device=_device,
    )
    tts = attach_lora_adapter(
        tts, lora_path=os.path.join(adapter_dir, "lora", "best_model")
    )

    _model_cache[cache_key] = tts
    return tts
146
+
147
+
148
+ # ---------------------------------------------------------------------------
149
+ # Inference function called by Gradio
150
+ # ---------------------------------------------------------------------------
151
+
152
def synthesise(text: str, speaker_audio: str, language: str, accent: str, lora_coeff: float):
    """Synthesise *text* in *language* with the chosen *accent*.

    Args:
        text: Sentence to speak; must be non-empty after stripping.
        speaker_audio: Path to a reference clip whose voice is cloned.
        language: Output language (a key of ``LANGUAGE_CODES``).
        accent: Speaker accent; ``(language, accent)`` must be in ``MODELS``.
        lora_coeff: Scale of the LoRA adapter contribution (1.0 = as trained).

    Returns:
        Path to the generated wav file.

    Raises:
        gr.Error: On missing text, missing reference audio, or an
            unsupported language/accent combination.
    """
    if not text.strip():
        raise gr.Error("Please enter some text to synthesise.")
    if speaker_audio is None:
        raise gr.Error("Please upload a reference speaker audio file.")
    if (language, accent) not in MODELS:
        raise gr.Error(f"Unsupported combination: language={language}, accent={accent}.")

    tts = _load_model(language, accent)

    # The cached model keeps whatever LoRA scale the previous request applied,
    # so rescale whenever the requested coefficient differs from the current
    # one. (The previous guard `if lora_coeff != 1.0` skipped rescaling when
    # the user moved the slider back to 1.0, leaving a stale scale active.)
    current = getattr(tts, "_last_lora_coeff", 1.0)
    if lora_coeff != current:
        if current == 0.0:
            # A coefficient of 0 zeroed the LoRA weights in place; no ratio
            # can recover them, so drop the cached model and reload fresh.
            _model_cache.pop((language, accent), None)
            tts = _load_model(language, accent)
            current = 1.0
        if lora_coeff != current:
            from accent_task_vectors.inference.inference import _scale_lora
            # Scale relative to the currently applied coefficient.
            _scale_lora(tts, lora_coeff / current)
        tts._last_lora_coeff = lora_coeff

    # delete=False: Gradio serves the file after this handler returns, so the
    # temp file must outlive the context manager.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = tmp.name

    tts.tts_to_file(
        text=text,
        speaker_wav=speaker_audio,
        language=LANGUAGE_CODES[language],
        file_path=output_path,
    )

    return output_path
180
+
181
+
182
+ # ---------------------------------------------------------------------------
183
+ # Gradio UI
184
+ # ---------------------------------------------------------------------------
185
+
186
def update_accent_choices(language: str):
    """Refresh the accent dropdown to match the selected output language."""
    accents = ACCENTS_BY_LANGUAGE.get(language, [])
    # Guard against an unknown language: `accents` may be empty, in which
    # case `accents[0]` would raise IndexError inside the event handler.
    return gr.update(choices=accents, value=accents[0] if accents else None)
189
+
190
+
191
# Build the Gradio UI: a two-column row — inputs (text, reference clip,
# language/accent dropdowns, accent-strength slider, generate button) on the
# left, the generated audio on the right — followed by usage notes.
with gr.Blocks(title="Accent Vectors") as demo:
    gr.Markdown(
        """
        # Accent Vectors
        Synthesise speech with a controllable accent — pick the output **language**,
        the speaker's **accent**, upload a short reference audio clip, and type your text.

        > **Paper:** *Accent Vector: Controllable Accent Manipulation for Multilingual TTS
        > Without Accented Data* (Interspeech 2026)
        """
    )

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Text to synthesise",
                placeholder="Type something here…",
                lines=3,
            )
            speaker_audio = gr.Audio(
                label="Reference speaker audio (3–10 s)",
                type="filepath",
            )
            with gr.Row():
                language_dd = gr.Dropdown(
                    label="Output language",
                    choices=list(ACCENTS_BY_LANGUAGE.keys()),
                    value="English",
                )
                accent_dd = gr.Dropdown(
                    label="Speaker accent",
                    # Initial choices match the default language ("English").
                    choices=ACCENTS_BY_LANGUAGE["English"],
                    value="English",
                )
            lora_coeff = gr.Slider(
                label="Accent strength (LoRA coefficient)",
                minimum=0.0,
                maximum=2.0,
                step=0.05,
                value=1.0,
            )
            generate_btn = gr.Button("Generate", variant="primary")

        with gr.Column():
            audio_output = gr.Audio(label="Generated speech", type="filepath")

    # Keep the accent dropdown consistent with the selected output language.
    language_dd.change(fn=update_accent_choices, inputs=language_dd, outputs=accent_dd)

    generate_btn.click(
        fn=synthesise,
        inputs=[text_input, speaker_audio, language_dd, accent_dd, lora_coeff],
        outputs=audio_output,
    )

    gr.Markdown(
        """
        ---
        ### How to use
        1. **Output language** — the language the model will speak in.
        2. **Speaker accent** — the L1 accent of the target speaker style.
        3. **Reference audio** — a clean 3–10 second clip of any speaker; the model
           clones the voice while applying the chosen accent.
        4. **Accent strength** — scale the LoRA adapter contribution (1.0 = default,
           0 = no accent modification, >1 = stronger accent).

        Models are downloaded automatically on first use.
        """
    )

# Script entry point (guards against launching on bare import).
if __name__ == "__main__":
    demo.launch()
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ libsndfile1
2
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Install the accent_task_vectors package and modified Coqui TTS from GitHub
2
+ git+https://github.com/NewGamezzz/AccentVector.git
3
+ git+https://github.com/NewGamezzz/AccentVector.git#subdirectory=TTS
4
+
5
+ # Runtime dependencies (versions match setup.py)
6
+ torch==2.5.0
7
+ torchaudio==2.5.0
8
+ numpy
9
+ pandas
10
+ pyyaml
11
+ tqdm
12
+ soundfile
13
+ safetensors
14
+ peft==0.10.0
15
+ huggingface_hub
16
+ gradio