import gradio as gr
import torch
import soundfile as sf
import tempfile
import os
import traceback

from io import BytesIO
from zipvoice.luxvoice import LuxTTS

# -----------------------------
# ENV OPTIMIZATION
# -----------------------------
# NOTE(review): these variables are set after the imports at the top of the
# file have already executed. If a library reads them at import time they may
# need to move above those imports -- confirm.
os.environ.update(
    {
        "HF_HUB_DISABLE_SYMLINKS_WARNING": "1",
        "TOKENIZERS_PARALLELISM": "false",
    }
)

# -----------------------------
# LOAD MODEL ON START
# -----------------------------
# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# One shared model instance, created once at start-up and reused by every
# request handled by generate_speech.
lux_tts = LuxTTS("YatharthS/LuxTTS", device=device)

# -----------------------------
# AUDIO HANDLING
# -----------------------------
def prepare_audio(file_obj):
    """Return a filesystem path for the given reference audio.

    Args:
        file_obj: Either a ``str`` path, which is returned unchanged, or an
            object with a ``read()`` method returning the audio bytes.

    Returns:
        The original path for ``str`` input; otherwise the path of a newly
        created ``.wav`` temporary file holding the bytes that were read.
        The temporary file is not deleted automatically -- the caller is
        responsible for removing it.

    Raises:
        Whatever ``file_obj.read()`` or the temporary-file write raises. If
        the write fails, the partially written temporary file is removed
        before the exception propagates.
    """
    if isinstance(file_obj, str):
        return file_obj

    data = file_obj.read()

    # delete=False so the file outlives this function and can be reopened by
    # path; that also means we must clean it up ourselves if writing fails.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    try:
        with tmp:
            tmp.write(data)
    except Exception:
        # The handle is closed by the ``with`` above before we get here, so
        # the removal also works on platforms that lock open files.
        try:
            os.remove(tmp.name)
        except OSError:
            pass
        raise

    return tmp.name

# -----------------------------
# MAIN FUNCTION
# -----------------------------
def generate_speech(
    ref_audio_file,
    reference_transcript,
    text,
    rms=0.01,
    t_shift=0.9,
    num_steps=4,
    speed=1.0,
    ref_duration=6.0
):
    """Synthesise ``text`` using the voice from a reference recording.

    Args:
        ref_audio_file: Reference audio, either a ``str`` path or any other
            object accepted by ``prepare_audio``.
        reference_transcript: Transcript of the reference audio. ``None`` or
            a blank string is passed to the model as ``None``.
        text: The text to speak. Must contain non-whitespace characters.
        rms: Forwarded to ``lux_tts.encode_prompt`` as ``rms``.
        t_shift: Forwarded to ``lux_tts.generate_speech`` as ``t_shift``.
        num_steps: Forwarded to ``lux_tts.generate_speech`` as ``num_steps``.
        speed: Forwarded to ``lux_tts.generate_speech`` as ``speed``.
        ref_duration: Forwarded to ``lux_tts.encode_prompt`` as ``duration``.

    Returns:
        A ``(audio, status)`` pair. On success ``audio`` is
        ``(48000, samples)`` with ``samples`` a NumPy array and ``status`` is
        ``"Success"``. On failure ``audio`` is ``None`` and ``status`` is a
        short validation message or the formatted traceback of the error.
        This function never raises for ordinary errors.
    """
    prompt_path = None

    try:
        if not ref_audio_file:
            return None, "No reference audio"

        # Reject blank text up front instead of sending it to the model.
        if not (text or "").strip():
            return None, "No text to generate"

        prompt_path = prepare_audio(ref_audio_file)

        # The transcript may arrive as None (e.g. from a programmatic caller),
        # so normalise before stripping; blank collapses to None.
        transcript = (reference_transcript or "").strip() or None

        encoded = lux_tts.encode_prompt(
            prompt_path,
            duration=ref_duration,
            rms=rms,
            prompt_text=transcript
        )

        audio = lux_tts.generate_speech(
            text,
            encoded,
            num_steps=num_steps,
            t_shift=t_shift,
            speed=speed,
            return_smooth=False
        ).cpu().numpy().squeeze()

        return (48000, audio), "Success"

    except Exception:
        # Boundary handler: surface the failure in the status output rather
        # than letting it propagate to the UI framework.
        return None, traceback.format_exc()

    finally:
        # Only delete files that prepare_audio created; a str input is the
        # caller's own file and is returned by prepare_audio unchanged.
        made_temp_copy = prompt_path is not None and not isinstance(ref_audio_file, str)
        if made_temp_copy:
            try:
                os.remove(prompt_path)
            except OSError:
                # Best-effort cleanup: already gone or not removable.
                pass

# -----------------------------
# UI
# -----------------------------
# Two-column layout: reference voice on the left, text, tuning controls and
# results on the right.
with gr.Blocks(title="LuxTTS Voice Cloning") as demo:
    gr.Markdown("# 🎤 LuxTTS Voice Cloning")

    with gr.Row():
        with gr.Column():
            # type="filepath" hands generate_speech a str path.
            ref_audio = gr.Audio(type="filepath", label="Reference Audio")
            ref_text = gr.Textbox(label="Reference Transcript")

        with gr.Column():
            text = gr.Textbox(lines=5, label="Text to Generate")

            # Each slider is labelled after the generate_speech parameter it
            # feeds; defaults mirror that function's keyword defaults.
            with gr.Accordion("Advanced", open=False):
                rms = gr.Slider(0.001, 0.05, value=0.01, label="RMS")
                t_shift = gr.Slider(0.1, 1.5, value=0.9, label="T-Shift")
                steps = gr.Slider(1, 10, value=4, step=1, label="Steps")
                speed = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
                duration = gr.Slider(1.0, 20.0, value=6.0, label="Reference Duration")

            btn = gr.Button("Generate")
            out_audio = gr.Audio(type="numpy", label="Generated Audio")
            status = gr.Textbox(label="Status")

    # Input order must match generate_speech's positional parameters.
    btn.click(
        generate_speech,
        inputs=[ref_audio, ref_text, text, rms, t_shift, steps, speed, duration],
        outputs=[out_audio, status]
    )

# NOTE(review): launches whenever this module is executed or imported; there
# is no ``if __name__ == "__main__"`` guard. Left as-is to preserve behaviour.
demo.launch()