import gradio as gr
import torch
import torchaudio
import soundfile as sf
import os
import tempfile
import spaces
from datetime import datetime
from omnivoice import OmniVoice

# ─── Language selection ───
# Default Kabyle text, also used as the fallback example sentence.
DEFAULT_TEXT = (
    'Awal n "Uṛdinatur" neqqar-as "Aselkim" s teqbaylit. '
    'Ma yella d "Linux" d Anagraw n Wammud.'
)

# Single source of truth for the supported languages, one row per language:
# (dropdown label, language code passed to the model, example sentence).
# Row order is the order shown in the language dropdowns.
_LANGUAGE_TABLE = (
    ("Kabyle (default)", "kab", DEFAULT_TEXT),
    (
        "Standard Moroccan Tamazight",
        "zgh",
        "ⴰⵣⵓⵍ ⵎⴰⵙⵙⴰ ⵎⵎⵉ ⵏⵏⵓⵏ. ⵎⴰⵏⵉⴽ ⵜⵍⵍⵉⴷ? ⴰⴷ ⵏⵏⵓⵖ ⵏⵏⴰⵖ ⴰⵙⵙⴰ.",
    ),
    (
        "Tahaggart Tamahaq",
        "thv",
        "ⵎⴰⵙⵙⴰ ⵏⵏⵓⵏ, ⵎⴰⵏⵉⴽ ⵜⵏⵏⴰⵍⴰⵎ? ⴰⴷⴰⵖ ⵏⴰⵔⴰ ⵙ ⵓⵖⵔⵎ ⵏⵏⵖ.",
    ),
    (
        "Algerian Arabic",
        "arq",
        "شحال شْبَابْ ليوم. ليوما رانا حابين نروحو للبحر. تحب تجي معانا ولٌا لا؟",
    ),
)

# Labels offered in the language dropdowns.
LANGUAGE_CHOICES = [label for label, _code, _example in _LANGUAGE_TABLE]

# Dropdown label -> language code.
LANG_CODE_MAP = {label: code for label, code, _example in _LANGUAGE_TABLE}

# Dropdown label -> example sentence shown when that language is selected.
EXAMPLE_SENTENCES = {label: example for label, _code, example in _LANGUAGE_TABLE}

# ─── Pre‑loaded cloned voices ───
# Dropdown label -> reference-audio path; ``None`` marks the entry that
# makes the user upload their own recording instead.
PRELOADED_VOICES = {
    "Upload my own": None,
    "Muhya (pre‑loaded)": "assets/muhya.mp3",
}

# ─── Model ───
# Loaded once at import time and shared by every request handler.
print("Loading model...")
if torch.cuda.is_available():
    device, dtype = "cuda", torch.float16
else:
    device, dtype = "cpu", torch.float32
model = OmniVoice.from_pretrained(
    "k2-fsa/OmniVoice",
    device_map=device,
    dtype=dtype,
)
print(f"Model loaded ({device})")

# Upper bound on input length (in whitespace-separated words) per request.
MAX_WORDS = 50

def _count_words(text):
    """Return how many whitespace-separated words ``text`` contains.

    Empty strings and ``None`` count as zero words.
    """
    # str.split() with no separator already ignores leading/trailing blanks.
    return len(text.split()) if text else 0

def _build_instruct(gender, age, pitch, style):
    """Compose the comma-separated voice description from the UI selections.

    Each attribute is skipped when it is empty or set to "Auto"; the rest are
    lower-cased, and the pitch value gets a " pitch" suffix.

    Returns:
        The joined description, or ``None`` when no attribute was chosen.
    """
    # (selected value, template) in the order the fragments must appear.
    attributes = (
        (gender, "{}"),
        (age, "{}"),
        (pitch, "{} pitch"),
        (style, "{}"),
    )
    fragments = [
        template.format(value.lower())
        for value, template in attributes
        if value and value != "Auto"
    ]
    return ", ".join(fragments) or None

def _save_audio(audio_tensor, sample_rate=24000):
    """Write audio to a temporary WAV file and return the file's path.

    Args:
        audio_tensor: A ``torch.Tensor`` or anything ``torch.tensor`` accepts.
            Leading singleton dimensions beyond 2-D are squeezed away. A 2-D
            input is treated as ``[channels, samples]`` when its first axis
            has at most 4 entries and is shorter than the second; otherwise
            it is treated as ``[samples, channels]``. 0-D/1-D input is mono.
        sample_rate: Sample rate in Hz passed to ``sf.write``.

    Returns:
        Path of the written ``.wav`` file. The file is created with
        ``delete=False``, so it outlives this call.

    Raises:
        RuntimeError: If conversion or writing fails. The original exception
            is chained as ``__cause__``.
    """
    out_path = None
    try:
        if not isinstance(audio_tensor, torch.Tensor):
            audio_tensor = torch.tensor(audio_tensor)
        # detach(): Tensor.numpy() refuses tensors that still track gradients.
        audio_tensor = audio_tensor.detach().cpu()

        # The model may run in float16; soundfile cannot write half-precision
        # arrays, so upcast those (integer PCM and float32/64 are left as-is).
        if audio_tensor.dtype in (torch.float16, torch.bfloat16):
            audio_tensor = audio_tensor.to(torch.float32)

        # Drop leading batch dimensions. squeeze(0) is a no-op on a
        # non-singleton axis, so looping on it unconditionally would never
        # terminate; reject such input explicitly instead.
        while audio_tensor.dim() > 2:
            if audio_tensor.shape[0] != 1:
                raise ValueError(
                    "expected a single audio clip, got shape "
                    f"{tuple(audio_tensor.shape)}"
                )
            audio_tensor = audio_tensor.squeeze(0)

        if audio_tensor.dim() == 2:
            first, second = audio_tensor.shape
            if first <= 4 and second > first:
                # Likely [channels, samples] -> [samples, channels]
                audio_tensor = audio_tensor.T
            audio_np = audio_tensor.numpy()
        else:
            # Mono (or scalar): soundfile gets a [samples, 1] array.
            audio_np = audio_tensor.reshape(-1, 1).numpy()

        # Reserve a unique name, then write after the handle is closed:
        # an open NamedTemporaryFile cannot be reopened by name on Windows.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            out_path = f.name
        sf.write(out_path, audio_np, sample_rate)
        return out_path
    except Exception as e:
        # Do not leave an empty/partial temp file behind on failure.
        if out_path is not None:
            try:
                os.remove(out_path)
            except OSError:
                pass
        raise RuntimeError(f"Failed to save audio: {e}") from e

def update_example_text(lang_choice):
    """Return the example sentence for ``lang_choice``.

    Falls back to ``DEFAULT_TEXT`` when the label has no entry in
    ``EXAMPLE_SENTENCES``.
    """
    try:
        return EXAMPLE_SENTENCES[lang_choice]
    except KeyError:
        return DEFAULT_TEXT

# ─── Helper to force gender and switch to Voice Design mode ───
def set_male():
    """Return updates that set the gender to "Male" and the mode to "Voice Design"."""
    gender_update = gr.update(value="Male")
    mode_update = gr.update(value="Voice Design")
    return [gender_update, mode_update]

def set_female():
    """Return updates that set the gender to "Female" and the mode to "Voice Design"."""
    gender_update = gr.update(value="Female")
    mode_update = gr.update(value="Voice Design")
    return [gender_update, mode_update]

# ─── Voice Design / Auto ───
@spaces.GPU
def generate_design(text, mode, lang_choice, gender, age, pitch, style,
                    speed, duration, num_step, guidance_scale, denoise, postprocess):
    """Synthesize ``text`` with an automatic or attribute-designed voice.

    Args:
        text: Text to speak; rejected when blank or longer than ``MAX_WORDS``.
        mode: Voice attributes are only applied when this is "Voice Design".
        lang_choice: Language label, mapped through ``LANG_CODE_MAP``
            (unknown labels fall back to "kab").
        gender, age, pitch, style: Attribute selections handed to
            ``_build_instruct``.
        speed: Forwarded as ``speed`` only when no positive duration is given.
        duration: When positive, forwarded as ``duration`` instead of ``speed``.
        num_step, guidance_scale, denoise: Forwarded to ``model.generate``
            (``num_step`` is cast to ``int``).
        postprocess: When truthy, ``postprocess_output=True`` is forwarded.

    Returns:
        ``(wav_path, status_message)``; ``wav_path`` is ``None`` whenever
        validation or generation fails.
    """
    if not text or not text.strip():
        return None, "Please enter text."

    word_count = _count_words(text)
    if word_count > MAX_WORDS:
        return None, f"Text too long: {word_count} words (max {MAX_WORDS}). Please shorten your input."

    gen_options = {
        "num_step": int(num_step),
        "guidance_scale": guidance_scale,
        "denoise": denoise,
        "language": LANG_CODE_MAP.get(lang_choice, "kab"),
    }

    voice_prompt = _build_instruct(gender, age, pitch, style) if mode == "Voice Design" else None
    if voice_prompt:
        gen_options["instruct"] = voice_prompt

    # An explicit positive duration takes precedence over the speed slider.
    if duration and duration > 0:
        pacing_key, pacing_value = "duration", duration
    else:
        pacing_key, pacing_value = "speed", speed
    gen_options[pacing_key] = pacing_value

    if postprocess:
        gen_options["postprocess_output"] = True

    try:
        waveform = model.generate(text=text, **gen_options)[0]
        wav_path = _save_audio(waveform, 24000)
        seconds = waveform.shape[-1] / 24000 if hasattr(waveform, 'shape') else 0
        return wav_path, f"Generation complete ({seconds:.1f}s)"
    except Exception as exc:
        # UI boundary: surface the failure in the status box instead of crashing.
        return None, f"Error: {exc}"

# ─── Voice Clone ───
@spaces.GPU
def generate_clone(text, voice_choice, ref_audio, ref_text, lang_choice, speed, duration,
                   num_step, guidance_scale, denoise, postprocess):
    """Synthesize ``text`` in the voice of a reference recording.

    Args:
        text: Text to speak; rejected when blank or longer than ``MAX_WORDS``.
        voice_choice: Key into ``PRELOADED_VOICES``; when it maps to a path,
            that path replaces ``ref_audio``.
        ref_audio: Uploaded reference audio: a file path, a
            ``(sample_rate, samples)`` tuple, or ``None``.
        ref_text: Optional transcription of the reference audio; blank values
            are passed to the model as ``None``.
        lang_choice: Language label, mapped through ``LANG_CODE_MAP``
            (unknown labels fall back to "kab").
        speed: Forwarded as ``speed`` only when no positive duration is given.
        duration: When positive, forwarded as ``duration`` instead of ``speed``.
        num_step, guidance_scale, denoise: Forwarded to ``model.generate``
            (``num_step`` is cast to ``int``).
        postprocess: When truthy, ``postprocess_output=True`` is forwarded.

    Returns:
        ``(wav_path, status_message)``; ``wav_path`` is ``None`` whenever
        validation or generation fails.
    """
    if not text or not text.strip():
        return None, "Please enter text."

    word_count = _count_words(text)
    if word_count > MAX_WORDS:
        return None, f"Text too long: {word_count} words (max {MAX_WORDS}). Please shorten your input."

    # Determine the actual reference audio: a pre-loaded voice wins over an upload.
    preloaded_path = PRELOADED_VOICES.get(voice_choice)
    if preloaded_path:
        ref_audio = preloaded_path
    elif ref_audio is None:
        return None, "Please upload reference audio or select a pre‑loaded voice."

    lang_code = LANG_CODE_MAP.get(lang_choice, "kab")
    kwargs = dict(num_step=int(num_step), guidance_scale=guidance_scale, denoise=denoise)
    kwargs["language"] = lang_code
    if duration and duration > 0:
        kwargs["duration"] = duration
    else:
        kwargs["speed"] = speed
    if postprocess:
        kwargs["postprocess_output"] = True

    temp_ref_path = None
    try:
        if isinstance(ref_audio, tuple):
            # A (sample_rate, samples) tuple carries no path: element 0 is the
            # integer sample rate. Materialize it as a WAV file so the model
            # always receives a file path.
            # assumes samples is laid out [frames] or [frames, channels] — TODO confirm
            ref_sample_rate, ref_samples = ref_audio
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                temp_ref_path = tmp.name
            sf.write(temp_ref_path, ref_samples, ref_sample_rate)
            ref_audio = temp_ref_path

        audio = model.generate(
            text=text,
            ref_audio=ref_audio,
            ref_text=ref_text if ref_text and ref_text.strip() else None,
            **kwargs,
        )
        path = _save_audio(audio[0], 24000)
        duration_sec = audio[0].shape[-1] / 24000 if hasattr(audio[0], 'shape') else 0
        return path, f"Generation complete ({duration_sec:.1f}s)"
    except Exception as e:
        # UI boundary: surface the failure in the status box instead of crashing.
        return None, f"Error: {e}"
    finally:
        # The temporary reference file is only needed for this one call.
        if temp_ref_path is not None:
            try:
                os.remove(temp_ref_path)
            except OSError:
                pass

def toggle_ref_audio(voice_choice):
    """Return an update that shows the upload field only for "Upload my own"."""
    wants_upload = voice_choice == "Upload my own"
    return gr.update(visible=wants_upload)

# ─── UI ───
# Custom stylesheet passed to app.launch(): centers the title and subtitle,
# hides <footer> elements, and styles the word counter. The "over-limit"
# class is the one update_word_counter emits once the text exceeds MAX_WORDS.
CSS = """
.main-title { text-align: center; font-size: 1.8em; font-weight: 800; margin-bottom: 0; }
.subtitle { text-align: center; color: #888; font-size: 0.9em; margin-bottom: 1em; }
footer { display: none !important; }
.word-counter { text-align: right; font-size: 0.85em; color: #666; margin-top: -0.5em; }
.word-counter.over-limit { color: #d32f2f; font-weight: bold; }
"""

# Application layout: a single Blocks app with two tabs ("Voice Design" and
# "Voice Clone"). Each tab has an input column (text, language, generation
# settings) and an output column (audio player + status box), and wires its
# own "Generate Audio" button to generate_design / generate_clone.
with gr.Blocks(title="OmniVoice") as app:
    gr.HTML("<h1 class='main-title'>OmniVoice</h1>")
    gr.HTML("<p class='subtitle'>AI Voice Generator — Kabyle + Regional Languages</p>")

    with gr.Tabs():
        # ── Voice Design / Auto ──
        with gr.Tab("Voice Design"):
            with gr.Row():
                with gr.Column(scale=1):
                    d_text = gr.Textbox(
                        label="Text to speak", lines=6,
                        placeholder=f"Enter text in the selected language... (max {MAX_WORDS} words)",
                        value=DEFAULT_TEXT
                    )
                    # Initial counter markup; kept in sync by update_word_counter below.
                    d_word_counter = gr.HTML(
                        value=f'<div class="word-counter">{_count_words(DEFAULT_TEXT)} / {MAX_WORDS} words</div>'
                    )
                    d_mode = gr.Radio(["Auto", "Voice Design"], value="Auto", label="Mode")
                    d_lang = gr.Dropdown(choices=LANGUAGE_CHOICES, value="Kabyle (default)",
                                         label="Language", info="Select the language of the input text")

                    # Update example text when language changes
                    # (this replaces whatever is currently in the text box)
                    d_lang.change(fn=update_example_text, inputs=d_lang, outputs=d_text)

                    # ── Always visible gender buttons ──
                    with gr.Row():
                        male_btn = gr.Button("Masculine Voice", variant="secondary")
                        female_btn = gr.Button("Feminine Voice", variant="secondary")
                    gr.Markdown("*These buttons switch to Voice Design mode and set the gender. Fine‑tune other attributes below.*")

                    # Voice design attributes (visible only when mode == "Voice Design")
                    with gr.Group(visible=False) as d_voice_opts:
                        with gr.Row():
                            d_gender = gr.Dropdown(["Auto", "Female", "Male"],
                                                    value="Auto", label="Gender")
                            d_age = gr.Dropdown(["Auto", "Child", "Young", "Middle-aged", "Elderly"],
                                                 value="Auto", label="Age")
                        with gr.Row():
                            d_pitch = gr.Dropdown(
                                ["Auto", "Very low", "Low", "Moderate", "High", "Very high"],
                                value="Auto", label="Pitch")
                            d_style = gr.Dropdown(["Auto", "Whisper"],
                                                   value="Auto", label="Style")

                    d_speed = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Speed")

                    with gr.Accordion("Advanced Settings", open=False):
                        d_duration = gr.Number(value=0, label="Duration (seconds)",
                                               info="0 for auto. If set, Speed is ignored")
                        d_steps = gr.Slider(4, 64, value=32, step=1, label="Inference Steps")
                        d_cfg = gr.Slider(0.5, 5.0, value=2.0, step=0.1, label="Guidance Scale")
                        d_denoise = gr.Checkbox(value=True, label="Denoise")
                        d_postprocess = gr.Checkbox(value=True, label="Postprocess (silence removal)")

                    d_btn = gr.Button("Generate Audio", variant="primary", size="lg")

                with gr.Column(scale=1):
                    d_audio = gr.Audio(label="Generated Audio")
                    d_status = gr.Textbox(label="Status", interactive=False)

            # Live word counter update
            # `with` blocks do not open a new scope, so this function is a
            # module-level name and is reused by the Voice Clone tab as well.
            def update_word_counter(text):
                count = _count_words(text)
                css_class = "word-counter over-limit" if count > MAX_WORDS else "word-counter"
                return f'<div class="{css_class}">{count} / {MAX_WORDS} words</div>'
            
            d_text.change(fn=update_word_counter, inputs=d_text, outputs=d_word_counter)

            # Button events
            # Each handler returns two updates, matched positionally to
            # [d_gender, d_mode].
            male_btn.click(fn=set_male, inputs=[], outputs=[d_gender, d_mode])
            female_btn.click(fn=set_female, inputs=[], outputs=[d_gender, d_mode])

            # Show/hide detailed voice options based on mode
            # NOTE(review): the gender buttons rely on this change event also
            # firing when d_mode is set programmatically — confirm for the
            # installed Gradio version.
            d_mode.change(
                fn=lambda m: gr.update(visible=m == "Voice Design"),
                inputs=d_mode, outputs=d_voice_opts,
            )
            # Input order must match generate_design's positional parameters.
            d_btn.click(
                fn=generate_design,
                inputs=[d_text, d_mode, d_lang, d_gender, d_age, d_pitch, d_style,
                        d_speed, d_duration, d_steps, d_cfg, d_denoise, d_postprocess],
                outputs=[d_audio, d_status],
            )

        # ── Voice Clone ──
        with gr.Tab("Voice Clone"):
            with gr.Row():
                with gr.Column(scale=1):
                    c_text = gr.Textbox(
                        label="Text to speak", lines=6,
                        placeholder=f"Enter text in the selected language... (max {MAX_WORDS} words)",
                        value=DEFAULT_TEXT
                    )
                    c_word_counter = gr.HTML(
                        value=f'<div class="word-counter">{_count_words(DEFAULT_TEXT)} / {MAX_WORDS} words</div>'
                    )
                    
                    # Pre-loaded voice selector
                    c_voice_choice = gr.Dropdown(
                        choices=list(PRELOADED_VOICES.keys()),
                        value="Upload my own",
                        label="Voice Source",
                        info="Choose a pre‑loaded voice or upload your own"
                    )
                    
                    # Manual upload (hidden when a pre-loaded voice is selected)
                    c_ref = gr.Audio(
                        label="Reference Audio (3–15 seconds)", 
                        type="filepath",
                        visible=True
                    )
                    
                    c_ref_text = gr.Textbox(label="Transcription (optional)", lines=2,
                                             placeholder="Leave empty for auto-transcription")
                    c_lang = gr.Dropdown(choices=LANGUAGE_CHOICES, value="Kabyle (default)",
                                         label="Language", info="Select the language of the input text")

                    # Update example text when language changes
                    # (this replaces whatever is currently in the text box)
                    c_lang.change(fn=update_example_text, inputs=c_lang, outputs=c_text)

                    c_speed = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Speed")

                    with gr.Accordion("Advanced Settings", open=False):
                        # As in the other tab, generate_clone ignores Speed when
                        # this is positive (no info text is shown here, though).
                        c_duration = gr.Number(value=0, label="Duration (seconds)")
                        c_steps = gr.Slider(4, 64, value=32, step=1, label="Inference Steps")
                        c_cfg = gr.Slider(0.5, 5.0, value=2.0, step=0.1, label="Guidance Scale")
                        c_denoise = gr.Checkbox(value=True, label="Denoise")
                        c_postprocess = gr.Checkbox(value=True, label="Postprocess (silence removal)")

                    c_btn = gr.Button("Generate Audio", variant="primary", size="lg")

                with gr.Column(scale=1):
                    c_audio = gr.Audio(label="Generated Audio")
                    c_status = gr.Textbox(label="Status", interactive=False)

            # Live word counter update
            # (update_word_counter is the function defined in the Voice Design tab)
            c_text.change(fn=update_word_counter, inputs=c_text, outputs=c_word_counter)

            # Toggle upload field visibility
            c_voice_choice.change(
                fn=toggle_ref_audio,
                inputs=c_voice_choice,
                outputs=c_ref
            )

            # Input order must match generate_clone's positional parameters.
            c_btn.click(
                fn=generate_clone,
                inputs=[c_text, c_voice_choice, c_ref, c_ref_text, c_lang, c_speed,
                        c_duration, c_steps, c_cfg, c_denoise, c_postprocess],
                outputs=[c_audio, c_status],
            )

# Start the Gradio server only when this file is executed as a script.
# NOTE(review): the stylesheet is passed to launch() rather than gr.Blocks();
# which of the two accepts `css` depends on the installed Gradio version —
# confirm against the pinned version.
if __name__ == "__main__":
    app.launch(css=CSS)