# VoxMorph / app.py
# Hugging Face page header captured with the file:
#   author avatar: Robobyte; last commit: "Update app.py" (e1bcdd0, verified)
import gradio as gr
import torch
import soundfile as sf
import tempfile
import os
import traceback
from io import BytesIO
from zipvoice.luxvoice import LuxTTS
# -----------------------------
# ENV OPTIMIZATION
# -----------------------------
# Hugging Face related environment flags, set in one place.
os.environ.update(
    {
        "HF_HUB_DISABLE_SYMLINKS_WARNING": "1",
        "TOKENIZERS_PARALLELISM": "false",
    }
)
# -----------------------------
# LOAD MODEL ON START
# -----------------------------
# Use the GPU when torch reports one as available; fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
# Instantiated once at import time; the request handler reuses this object.
lux_tts = LuxTTS("YatharthS/LuxTTS", device=device)
# -----------------------------
# AUDIO HANDLING
# -----------------------------
def prepare_audio(file_obj):
    """Return a filesystem path for the given reference audio.

    Args:
        file_obj: Either a ``str`` (treated as an existing path and returned
            unchanged) or an object exposing ``read()`` whose result is
            written verbatim to a new temporary ``.wav`` file.

    Returns:
        The path as a string. When a temporary file is created it is opened
        with ``delete=False``, so removing it is the caller's responsibility.
    """
    if not isinstance(file_obj, str):
        payload = file_obj.read()
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        # Leaving the ``with`` closes the handle so the data is on disk
        # before the path is handed back.
        with tmp:
            tmp.write(payload)
        return tmp.name
    return file_obj
# -----------------------------
# MAIN FUNCTION
# -----------------------------
def generate_speech(
    ref_audio_file,
    reference_transcript,
    text,
    rms=0.01,
    t_shift=0.9,
    num_steps=4,
    speed=1.0,
    ref_duration=6.0,
):
    """Synthesize ``text`` using a voice prompt built from a reference clip.

    Args:
        ref_audio_file: Reference audio, either a path string or an object
            with ``read()`` (see ``prepare_audio``).
        reference_transcript: Optional transcript of the reference clip.
            ``None``, empty, or whitespace-only values are passed to the
            model as ``None``.
        text: Text to synthesize; must contain non-whitespace characters.
        rms: Forwarded to ``lux_tts.encode_prompt`` as ``rms``.
        t_shift: Forwarded to ``lux_tts.generate_speech`` as ``t_shift``.
        num_steps: Forwarded to ``lux_tts.generate_speech`` as ``num_steps``.
        speed: Forwarded to ``lux_tts.generate_speech`` as ``speed``.
        ref_duration: Forwarded to ``lux_tts.encode_prompt`` as ``duration``.

    Returns:
        A ``(audio, status)`` pair. On success ``audio`` is
        ``(48000, samples)`` and ``status`` is ``"Success"``. On failure
        ``audio`` is ``None`` and ``status`` is either a short validation
        message or the formatted traceback of the exception. This function
        does not raise for ordinary errors.
    """
    prompt_path = None
    try:
        if not ref_audio_file:
            return None, "No reference audio"
        # Reject empty requests up front instead of sending them to the model.
        if not text or not text.strip():
            return None, "No text to generate"

        # The transcript is optional and may arrive as None; normalise every
        # "nothing provided" variant to None without calling .strip() on None.
        transcript = (reference_transcript or "").strip() or None

        prompt_path = prepare_audio(ref_audio_file)
        encoded = lux_tts.encode_prompt(
            prompt_path,
            duration=ref_duration,
            rms=rms,
            prompt_text=transcript,
        )
        waveform = lux_tts.generate_speech(
            text,
            encoded,
            num_steps=num_steps,
            t_shift=t_shift,
            speed=speed,
            return_smooth=False,
        )
        audio = waveform.cpu().numpy().squeeze()
        # NOTE(review): sample rate is hard-coded; assumes the model emits
        # 48 kHz audio - confirm against the LuxTTS documentation.
        return (48000, audio), "Success"
    except Exception:
        # UI boundary: report the failure in the status box rather than raise.
        return None, traceback.format_exc()
    finally:
        # Only remove files created by prepare_audio; a string input is a
        # path we were given and must be left in place.
        if prompt_path and not isinstance(ref_audio_file, str):
            try:
                os.remove(prompt_path)
            except OSError:
                # Best-effort cleanup (file already gone, permissions, ...).
                pass
# -----------------------------
# UI
# -----------------------------
# NOTE(review): the nesting below was reconstructed from a source whose
# indentation had been stripped - confirm the layout against the original.
with gr.Blocks(title="LuxTTS Voice Cloning") as demo:
    gr.Markdown("# 🎤 LuxTTS Voice Cloning")
    with gr.Row():
        # Left column: the voice to clone and, optionally, what it says.
        with gr.Column():
            ref_audio = gr.Audio(type="filepath", label="Reference Audio")
            ref_text = gr.Textbox(label="Reference Transcript")
        # Right column: the text to speak plus tuning controls.
        with gr.Column():
            text = gr.Textbox(lines=5, label="Text to Generate")
            with gr.Accordion("Advanced", open=False):
                # Every control is labelled so users can tell the sliders apart.
                rms = gr.Slider(0.001, 0.05, value=0.01, label="RMS")
                t_shift = gr.Slider(0.1, 1.5, value=0.9, label="T-Shift")
                steps = gr.Slider(1, 10, value=4, step=1, label="Num Steps")
                speed = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
                duration = gr.Slider(
                    1.0, 20.0, value=6.0, label="Reference Duration"
                )
    btn = gr.Button("Generate")
    out_audio = gr.Audio(type="numpy", label="Generated Audio")
    status = gr.Textbox(label="Status")
    # Input order must match the positional parameters of generate_speech.
    btn.click(
        generate_speech,
        inputs=[ref_audio, ref_text, text, rms, t_shift, steps, speed, duration],
        outputs=[out_audio, status],
    )
demo.launch()