| import gradio as gr |
| import torch |
| import soundfile as sf |
| import tempfile |
| import os |
| import traceback |
|
|
| from io import BytesIO |
| from zipvoice.luxvoice import LuxTTS |
|
|
| |
| |
| |
# Process-wide environment flags, set before the model is constructed.
# NOTE(review): these are assigned after the third-party imports at the top of
# the file; confirm the libraries that read them do so lazily rather than at
# import time.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Single shared model instance used by every request handler in this module.
lux_tts = LuxTTS("YatharthS/LuxTTS", device=device)
|
|
| |
| |
| |
def prepare_audio(file_obj):
    """Return a filesystem path for the given reference audio.

    Args:
        file_obj: Either a path string, which is returned untouched, or an
            object exposing ``read()`` whose contents are copied to disk.

    Returns:
        The original string, or the name of a newly created ``.wav``
        temporary file holding the data read from ``file_obj``. The
        temporary file is not deleted automatically; the caller owns it.
    """
    if not isinstance(file_obj, str):
        payload = file_obj.read()
        # delete=False keeps the file on disk after the handle is closed so
        # that it can be reopened by path later.
        spooled = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        with spooled:
            spooled.write(payload)
        return spooled.name

    return file_obj
|
|
| |
| |
| |
def generate_speech(
    ref_audio_file,
    reference_transcript,
    text,
    rms=0.01,
    t_shift=0.9,
    num_steps=4,
    speed=1.0,
    ref_duration=6.0,
):
    """Synthesize ``text`` in the voice of a reference recording.

    The reference audio is encoded with the module-level ``lux_tts`` model
    and the resulting prompt is used to generate speech for ``text``.

    Args:
        ref_audio_file: Path string, or an object with ``read()``, for the
            reference recording (resolved through ``prepare_audio``).
        reference_transcript: Transcript of the reference audio. ``None`` or
            blank text is forwarded to the model as ``prompt_text=None``.
        text: Text to synthesize. Must contain non-whitespace characters.
        rms: Forwarded to ``lux_tts.encode_prompt`` as ``rms``.
        t_shift: Forwarded to ``lux_tts.generate_speech`` as ``t_shift``.
        num_steps: Forwarded to ``lux_tts.generate_speech`` as ``num_steps``.
        speed: Forwarded to ``lux_tts.generate_speech`` as ``speed``.
        ref_duration: Forwarded to ``lux_tts.encode_prompt`` as ``duration``.

    Returns:
        A ``(audio, status)`` pair. On success ``audio`` is
        ``(48000, samples)`` with ``samples`` a squeezed NumPy array and
        ``status`` is ``"Success"``. On failure ``audio`` is ``None`` and
        ``status`` is a short message or the formatted traceback. This
        function never raises for ordinary errors.
    """
    prompt_path = None

    try:
        if not ref_audio_file:
            return None, "No reference audio"

        # Reject empty requests up front instead of sending them to the model.
        if not text or not text.strip():
            return None, "No text to generate"

        # The transcript may be None (e.g. a cleared field or an API call);
        # treat that the same as an empty string.
        prompt_text = (reference_transcript or "").strip() or None

        prompt_path = prepare_audio(ref_audio_file)

        encoded = lux_tts.encode_prompt(
            prompt_path,
            duration=ref_duration,
            rms=rms,
            prompt_text=prompt_text,
        )

        waveform = lux_tts.generate_speech(
            text,
            encoded,
            num_steps=num_steps,
            t_shift=t_shift,
            speed=speed,
            return_smooth=False,
        )
        audio = waveform.cpu().numpy().squeeze()

        # NOTE(review): the sample rate is hard-coded; confirm it matches the
        # model's output rate.
        return (48000, audio), "Success"

    except Exception:
        # Boundary handler: surface the failure in the status box rather than
        # letting the request crash.
        return None, traceback.format_exc()

    finally:
        # Only remove the file when prepare_audio created a temporary copy;
        # a string input is the caller's own path and must be left alone.
        if prompt_path and not isinstance(ref_audio_file, str):
            try:
                os.remove(prompt_path)
            except OSError:
                # Best-effort cleanup: the file may already be gone or locked.
                pass
|
|
| |
| |
| |
# Gradio UI: reference inputs on the left, target text on the right, tuning
# sliders in a collapsed accordion, then the trigger button and outputs.
# NOTE(review): the nesting below was reconstructed from a flattened source;
# confirm the layout matches the intended design.
with gr.Blocks(title="LuxTTS Voice Cloning") as demo:
    gr.Markdown("# 🎤 LuxTTS Voice Cloning")

    with gr.Row():
        with gr.Column():
            # type="filepath" hands generate_speech a path string.
            ref_audio = gr.Audio(type="filepath", label="Reference Audio")
            ref_text = gr.Textbox(label="Reference Transcript")

        with gr.Column():
            text = gr.Textbox(lines=5, label="Text to Generate")

    # Every control is labelled so the sliders can be told apart in the UI.
    with gr.Accordion("Advanced", open=False):
        rms = gr.Slider(0.001, 0.05, value=0.01, label="RMS")
        t_shift = gr.Slider(0.1, 1.5, value=0.9, label="T-Shift")
        steps = gr.Slider(1, 10, value=4, step=1, label="Steps")
        speed = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
        duration = gr.Slider(1.0, 20.0, value=6.0, label="Reference Duration")

    btn = gr.Button("Generate")
    out_audio = gr.Audio(type="numpy", label="Generated Audio")
    status = gr.Textbox(label="Status")

    # Input order must match generate_speech's positional parameters.
    btn.click(
        generate_speech,
        inputs=[ref_audio, ref_text, text, rms, t_shift, steps, speed, duration],
        outputs=[out_audio, status],
    )

demo.launch()