import os

# Allow loading the CMU Arctic xvectors dataset script on HF.
# Must be set before `datasets` is imported, hence the early placement.
os.environ["HF_DATASETS_TRUST_REMOTE_CODE"] = "1"

import gradio as gr
import torch
import numpy as np
from transformers import pipeline, AutoProcessor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset

# ---------------------------
# CPU-only global handles
# ---------------------------
# Models are loaded lazily on first request and cached in these module-level
# slots so repeated Gradio calls do not re-download / re-instantiate them.
_captioner = None
_tts_processor = None
_tts_model = None
_tts_vocoder = None
_speaker_embeddings = None  # required by SpeechT5 generate_speech; shape (1, 512)


def load_models_cpu():
    """Load BLIP-2 (image captioning) and SpeechT5 (text-to-speech) on CPU.

    Idempotent: each component is loaded only if its global slot is still
    ``None``. Falls back to a random speaker embedding if the xvector
    dataset cannot be fetched.
    """
    global _captioner, _tts_processor, _tts_model, _tts_vocoder, _speaker_embeddings
    if _captioner is None:
        print("Loading BLIP-2 image captioning model (CPU)...")
        _captioner = pipeline(
            task="image-to-text",
            model="Salesforce/blip2-flan-t5-xl",
            dtype=torch.float32,  # CPU dtype (alias of torch_dtype)
            device_map=None,      # ensure CPU
        )
    if _tts_processor is None:
        print("Loading SpeechT5 processor...")
        _tts_processor = AutoProcessor.from_pretrained("microsoft/speecht5_tts")
    if _tts_model is None:
        print("Loading SpeechT5 TTS model (CPU)...")
        _tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to("cpu").eval()
    if _tts_vocoder is None:
        print("Loading SpeechT5 HiFiGAN vocoder (CPU)...")
        _tts_vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to("cpu").eval()
    if _speaker_embeddings is None:
        print("Loading default speaker embeddings for SpeechT5...")
        try:
            emb_ds = load_dataset(
                "Matthijs/cmu-arctic-xvectors",
                split="validation",
                trust_remote_code=True,  # needed with modern datasets
            )
            # HF examples commonly use index 7306 (speaker "slt")
            emb = emb_ds[7306]["xvector"]
            _speaker_embeddings = torch.tensor(emb, dtype=torch.float32).unsqueeze(0)
        except Exception as e:
            # Fixed: the original source had this string literal broken across
            # a physical line (a SyntaxError); message reassembled on one line.
            print(f"Speaker embedding load failed: {e}. Using a random voice embedding.")
            # SpeechT5 expects shape (1, 512)
            _speaker_embeddings = torch.randn(1, 512, dtype=torch.float32)


def describe_and_speak(image):
    """Generate an English caption for the image and speak it aloud (defaults only).

    Args:
        image: PIL image from the Gradio ``Image`` input.

    Returns:
        tuple: ``(caption_text, (sample_rate, waveform ndarray))`` suitable
        for the ``Textbox`` and ``Audio`` outputs. On TTS failure the audio
        is one second of silence and the error is appended to the caption.
    """
    load_models_cpu()

    # --- 1) Caption (defaults; no beams/tokens passed) ---
    result = _captioner(image)
    caption = (result[0].get("generated_text", "") if result else "").strip()
    if not caption:
        caption = "A description could not be generated for this image."

    # --- 2) Text → Speech (SpeechT5) ---
    try:
        inputs = _tts_processor(text=caption, return_tensors="pt")
        with torch.no_grad():
            speech = _tts_model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings=_speaker_embeddings,
                vocoder=_tts_vocoder,
            )
        sr = 16000  # SpeechT5 HiFiGAN outputs 16 kHz mono
        audio = np.asarray(speech.numpy(), dtype=np.float32)
    except Exception as e:
        # Best-effort: surface the error in the caption rather than crashing
        # the UI, and return silent audio so the Audio component stays valid.
        caption += f"\n\n[TTS error: {e}]"
        sr = 22050
        audio = np.zeros(sr, dtype=np.float32)

    return caption, (sr, audio)


# ---------------------------
# Gradio UI
# ---------------------------
with gr.Blocks(title="Image → Speech (HF models, CPU)") as demo:
    gr.Markdown(
        """
        # 🖼️ Image → 🎙️ Speech (CPU)
        1) Caption with **BLIP-2** → 2) Speak with **SpeechT5** (HiFiGAN vocoder).
        *First run downloads models and speaker embeddings — please wait.*
        """
    )
    inp_image = gr.Image(type="pil", label="Upload an image (JPG/PNG)")
    out_text = gr.Textbox(label="Generated Caption", lines=3)
    out_audio = gr.Audio(label="Spoken Caption", type="numpy")
    btn = gr.Button("Generate")
    btn.click(describe_and_speak, [inp_image], [out_text, out_audio])


if __name__ == "__main__":
    demo.launch()