"""Gradio front-end for LuxTTS voice cloning.

Loads the LuxTTS model once at import time, then serves a small web UI that
takes a reference recording (plus an optional transcript) and synthesizes new
speech in the same voice.
"""

import os
import tempfile
import traceback
from io import BytesIO

import gradio as gr
import soundfile as sf
import torch
from zipvoice.luxvoice import LuxTTS

# -----------------------------
# ENV OPTIMIZATION
# -----------------------------
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# -----------------------------
# LOAD MODEL ON START
# -----------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
lux_tts = LuxTTS("YatharthS/LuxTTS", device=device)

# Sample rate reported to Gradio alongside the generated waveform.
OUTPUT_SAMPLE_RATE = 48000


# -----------------------------
# AUDIO HANDLING
# -----------------------------
def prepare_audio(file_obj):
    """Return a filesystem path for the reference audio.

    Args:
        file_obj: Either a path string (returned unchanged) or a readable
            binary file-like object whose bytes are spooled to a temporary
            ``.wav`` file.

    Returns:
        Path to the audio file. When a temporary file was created, the caller
        is responsible for deleting it.
    """
    if isinstance(file_obj, str):
        return file_obj

    data = file_obj.read()
    # delete=False: the path must outlive this function so the model can
    # open it; the caller removes it afterwards.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
        f.write(data)
    # Returned after the `with` so the data is flushed and the handle closed.
    return f.name


# -----------------------------
# MAIN FUNCTION
# -----------------------------
def generate_speech(
    ref_audio_file,
    reference_transcript,
    text,
    rms=0.01,
    t_shift=0.9,
    num_steps=4,
    speed=1.0,
    ref_duration=6.0,
):
    """Clone the reference voice and synthesize ``text``.

    Args:
        ref_audio_file: Path (or binary file-like object) of the reference
            recording.
        reference_transcript: Transcript of the reference recording. Empty,
            blank, or ``None`` is forwarded to the model as ``None``.
        text: Text to synthesize.
        rms: Forwarded to ``LuxTTS.encode_prompt`` as ``rms``.
        t_shift: Forwarded to ``LuxTTS.generate_speech`` as ``t_shift``.
        num_steps: Forwarded to ``LuxTTS.generate_speech`` as ``num_steps``
            (coerced to ``int``).
        speed: Forwarded to ``LuxTTS.generate_speech`` as ``speed``.
        ref_duration: Forwarded to ``LuxTTS.encode_prompt`` as ``duration``.

    Returns:
        A ``(audio, status)`` pair. On success ``audio`` is
        ``(OUTPUT_SAMPLE_RATE, numpy_array)`` and ``status`` is ``"Success"``.
        On failure ``audio`` is ``None`` and ``status`` holds either a short
        message or the formatted traceback.
    """
    if not ref_audio_file:
        return None, "No reference audio"
    if not text or not text.strip():
        # Mirror the reference-audio guard instead of sending empty input to
        # the model.
        return None, "No text to generate"

    # A cleared textbox or a direct API call may deliver None here.
    transcript = (reference_transcript or "").strip() or None

    prompt_path = None
    try:
        prompt_path = prepare_audio(ref_audio_file)

        encoded = lux_tts.encode_prompt(
            prompt_path,
            duration=ref_duration,
            rms=rms,
            prompt_text=transcript,
        )

        audio = lux_tts.generate_speech(
            text,
            encoded,
            # Sliders and API clients can hand over a float; a step count
            # must be integral.
            num_steps=int(num_steps),
            t_shift=t_shift,
            speed=speed,
            return_smooth=False,
        ).cpu().numpy().squeeze()

        return (OUTPUT_SAMPLE_RATE, audio), "Success"

    except Exception:
        # UI boundary: surface the failure in the status box rather than
        # crashing the request.
        return None, traceback.format_exc()

    finally:
        # Only delete files this function created; a path supplied by the
        # caller is left untouched.
        created_temp = not isinstance(ref_audio_file, str)
        if prompt_path and created_temp and os.path.exists(prompt_path):
            try:
                os.remove(prompt_path)
            except OSError:
                # Best-effort cleanup: a leftover temp file is not worth
                # failing the request over.
                pass


# -----------------------------
# UI
# -----------------------------
# NOTE(review): the nesting below was reconstructed from a whitespace-collapsed
# source; confirm the "Advanced" accordion is meant to sit below the row
# rather than inside the second column.
with gr.Blocks(title="LuxTTS Voice Cloning") as demo:
    gr.Markdown("# 🎤 LuxTTS Voice Cloning")

    with gr.Row():
        with gr.Column():
            ref_audio = gr.Audio(type="filepath", label="Reference Audio")
            ref_text = gr.Textbox(label="Reference Transcript")
        with gr.Column():
            text = gr.Textbox(lines=5, label="Text to Generate")

    with gr.Accordion("Advanced", open=False):
        rms = gr.Slider(0.001, 0.05, value=0.01)
        t_shift = gr.Slider(0.1, 1.5, value=0.9)
        steps = gr.Slider(1, 10, value=4, step=1)
        speed = gr.Slider(0.5, 2.0, value=1.0)
        duration = gr.Slider(1.0, 20.0, value=6.0)

    btn = gr.Button("Generate")
    out_audio = gr.Audio(type="numpy")
    status = gr.Textbox()

    btn.click(
        generate_speech,
        inputs=[ref_audio, ref_text, text, rms, t_shift, steps, speed, duration],
        outputs=[out_audio, status],
    )

demo.launch()