| import os |
|
|
| import gradio as gr |
| import torch |
| from styletts2importable import compute_style, device, inference |
| from txtsplit import txtsplit |
| import numpy as np |
| import phonemizer |
|
|
|
|
# Shared UI theme: Libre Franklin with Public Sans and generic fallbacks.
theme = gr.themes.Base(
    font=[
        gr.themes.GoogleFont("Libre Franklin"),
        gr.themes.GoogleFont("Public Sans"),
        "system-ui",
        "sans-serif",
    ],
)

# Names of the bundled reference voices (female/male US-English speakers).
voicelist = [
    "f-us-1",
    "f-us-2",
    "f-us-3",
    "f-us-4",
    "m-us-1",
    "m-us-2",
    "m-us-3",
    "m-us-4",
]

# Maps voice name -> style embedding tensor on `device`, filled below.
voices = {}

# Grapheme-to-phoneme backend used by the synthesis pipeline.
global_phonemizer = phonemizer.backend.EspeakBackend(
    language="en-us", preserve_punctuation=True, with_stress=True
)

# Load each voice's style embedding, computing and caching it on first use
# so later startups skip the expensive compute_style() call.
for voice_name in voicelist:
    cached = f"voices/{voice_name}.wav.npy"
    if os.path.exists(cached):
        # Cache hit: restore the saved embedding and move it to `device`.
        voices[voice_name] = torch.from_numpy(np.load(cached)).to(device)
    else:
        # Cache miss: derive the style from the reference WAV, keep it in
        # memory, and persist a CPU copy for the next startup.
        embedding = compute_style(f"voices/{voice_name}.wav")
        voices[voice_name] = embedding
        np.save(cached, embedding.cpu().numpy())
|
|
|
|
def synthesize(text, voice, lngsteps, alpha=0.3, beta=0.7, embedding_scale=1):
    """Synthesize speech for *text* using one of the bundled voices.

    Args:
        text: Text to read aloud. Split into chunks via ``txtsplit`` and
            synthesized chunk by chunk.
        voice: Voice name (case-insensitive key into ``voices``).
        lngsteps: Number of diffusion steps forwarded to ``inference``.
        alpha: Style blend weight forwarded to ``inference`` (previously
            hard-coded; default preserves the original behavior).
        beta: Style blend weight forwarded to ``inference`` (previously
            hard-coded; default preserves the original behavior).
        embedding_scale: Guidance scale forwarded to ``inference``
            (previously hard-coded; default preserves the original behavior).

    Returns:
        A ``(sample_rate, waveform)`` tuple — 24 kHz and the concatenated
        numpy audio of all chunks — as expected by ``gr.Audio``.

    Raises:
        gr.Error: If *text* is empty/whitespace-only or exceeds 50k chars.
    """
    if not text.strip():
        raise gr.Error("You must enter some text")
    if len(text) > 50000:
        raise gr.Error("Text must be <50k characters")
    print("*** saying ***")
    print(text)
    print("*** end ***")
    texts = txtsplit(text)
    # Hoist the style lookup out of the loop; it is invariant per call.
    style = voices[voice.lower()]
    audios = [
        inference(
            t,
            style,
            alpha=alpha,
            beta=beta,
            diffusion_steps=lngsteps,
            embedding_scale=embedding_scale,
        )
        for t in texts
    ]
    return (24000, np.concatenate(audios))
|
|
|
|
# "Multi-Voice" tab: enter text, pick a bundled voice, and synthesize it.
with gr.Blocks() as vctk:
    with gr.Row():
        # Left column: all user inputs.
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="Text",
                info="What would you like StyleTTS 2 to read? It works better on full sentences.",
                interactive=True,
            )
            voice_dropdown = gr.Dropdown(
                voicelist,
                label="Voice",
                info="Select a default voice.",
                value="m-us-2",
                interactive=True,
            )
            steps_slider = gr.Slider(
                minimum=3,
                maximum=15,
                value=3,
                step=1,
                label="Diffusion Steps",
                info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster",
                interactive=True,
            )
        # Right column: trigger button and resulting audio.
        with gr.Column(scale=1):
            synth_button = gr.Button("Synthesize", variant="primary")
            audio_output = gr.Audio(
                interactive=False,
                label="Synthesized Audio",
                waveform_options={"waveform_progress_color": "#3C82F6"},
            )
            synth_button.click(
                synthesize,
                inputs=[text_input, voice_dropdown, steps_slider],
                outputs=[audio_output],
                concurrency_limit=4,
            )
|
|
# Top-level page: hosts the available tabs under the shared theme.
with gr.Blocks(
    title="StyleTTS 2", css="footer{display:none !important}", theme=theme
) as demo:
    # Bug fix: the original passed four tab labels ("Multi-Voice",
    # "Voice Cloning", "LJSpeech", "Long Text [Beta]") for a single
    # interface; TabbedInterface zips interfaces with labels, so the
    # extra labels were dead config. Keep only the label that has a tab.
    gr.TabbedInterface([vctk], ["Multi-Voice"])

if __name__ == "__main__":
    print("Launching")
    # api_open=True keeps the queue's API endpoints usable while
    # show_api=False hides the API docs link from the page footer.
    demo.queue(api_open=True, max_size=None).launch(show_api=False)
    print("Launched")
|
|