# Trial / app.py
# (HF Space page residue, preserved as comments: uploader "Maria604",
#  commit 50b13cb "fix" — these lines are not Python and must not execute.)
import os

# allow loading the CMU Arctic xvectors dataset script on HF
# NOTE(review): set before `datasets` is imported — presumably so the flag is
# visible at import time; verify against the datasets version in use.
os.environ["HF_DATASETS_TRUST_REMOTE_CODE"] = "1"
import gradio as gr
import torch
import numpy as np
from transformers import pipeline, AutoProcessor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
# ---------------------------
# CPU-only global handles
# ---------------------------
# Lazily-initialized singletons; they stay None until load_models_cpu()
# populates them on the first request.
_captioner = None  # transformers image-to-text pipeline (BLIP-2)
_tts_processor = None  # SpeechT5 processor (text tokenization)
_tts_model = None  # SpeechT5 text-to-speech model
_tts_vocoder = None  # HiFiGAN vocoder (spectrogram -> waveform)
_speaker_embeddings = None # required by SpeechT5
def load_models_cpu():
    """Lazily initialize the BLIP-2 captioner and the SpeechT5 TTS stack on CPU.

    Each component is created at most once and cached in the module-level
    globals; subsequent calls are cheap no-ops. If the speaker-embedding
    dataset cannot be fetched, a random (1, 512) voice embedding is used
    as a fallback so TTS still works.
    """
    global _captioner, _tts_processor, _tts_model, _tts_vocoder, _speaker_embeddings

    if _captioner is None:
        print("Loading BLIP-2 image captioning model (CPU)...")
        _captioner = pipeline(
            task="image-to-text",
            model="Salesforce/blip2-flan-t5-xl",
            dtype=torch.float32,  # CPU dtype (alias of torch_dtype)
            device_map=None,  # ensure CPU
        )

    if _tts_processor is None:
        print("Loading SpeechT5 processor...")
        _tts_processor = AutoProcessor.from_pretrained("microsoft/speecht5_tts")

    if _tts_model is None:
        print("Loading SpeechT5 TTS model (CPU)...")
        tts = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
        _tts_model = tts.to("cpu").eval()

    if _tts_vocoder is None:
        print("Loading SpeechT5 HiFiGAN vocoder (CPU)...")
        voc = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
        _tts_vocoder = voc.to("cpu").eval()

    if _speaker_embeddings is None:
        print("Loading default speaker embeddings for SpeechT5...")
        try:
            ds = load_dataset(
                "Matthijs/cmu-arctic-xvectors",
                split="validation",
                trust_remote_code=True,  # needed with modern datasets
            )
            # Index 7306 is the "slt" speaker used throughout the HF examples.
            xvec = ds[7306]["xvector"]
            _speaker_embeddings = torch.tensor(xvec, dtype=torch.float32).unsqueeze(0)
        except Exception as e:
            print(f"Speaker embedding load failed: {e}. Using a random voice embedding.")
            # SpeechT5 expects a (1, 512) embedding tensor.
            _speaker_embeddings = torch.randn(1, 512, dtype=torch.float32)
def describe_and_speak(image):
    """Caption *image* in English and synthesize the caption as speech.

    Parameters
    ----------
    image : PIL.Image.Image or None
        Image from the Gradio input; ``None`` when the user clicks the
        button without uploading anything.

    Returns
    -------
    tuple[str, tuple[int, numpy.ndarray]]
        ``(caption, (sample_rate, waveform))`` — the second element is the
        ``(sr, float32 array)`` pair expected by ``gr.Audio(type="numpy")``.
    """
    # Guard clause: without an image the captioner would crash with an
    # opaque error. Return a friendly message and one second of silence.
    if image is None:
        sr = 16000
        return "Please upload an image first.", (sr, np.zeros(sr, dtype=np.float32))

    load_models_cpu()

    # --- 1) Caption (defaults; no beams/tokens passed) ---
    result = _captioner(image)
    caption = (result[0].get("generated_text", "") if result else "").strip()
    if not caption:
        caption = "A description could not be generated for this image."

    # --- 2) Text -> Speech (SpeechT5 + HiFiGAN) ---
    try:
        inputs = _tts_processor(text=caption, return_tensors="pt")
        with torch.no_grad():
            speech = _tts_model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings=_speaker_embeddings,
                vocoder=_tts_vocoder,
            )
        sr = 16000  # SpeechT5 HiFiGAN outputs 16 kHz mono
        audio = np.asarray(speech.numpy(), dtype=np.float32)
    except Exception as e:
        # Best effort: surface the TTS failure in the caption and return a
        # second of silence so the Audio component still renders.
        caption += f"\n\n[TTS error: {e}]"
        sr = 22050
        audio = np.zeros(sr, dtype=np.float32)

    return caption, (sr, audio)
# ---------------------------
# Gradio UI
# ---------------------------
# NOTE: user-facing strings below restore the intended UTF-8 characters
# ("→", 🖼️, 🎙️, "—") that were mojibake-corrupted in the previous revision.
with gr.Blocks(title="Image → Speech (HF models, CPU)") as demo:
    gr.Markdown(
        """
        # 🖼️ Image → 🎙️ Speech (CPU)
        1) Caption with **BLIP-2** → 2) Speak with **SpeechT5** (HiFiGAN vocoder).
        *First run downloads models and speaker embeddings — please wait.*
        """
    )
    inp_image = gr.Image(type="pil", label="Upload an image (JPG/PNG)")
    out_text = gr.Textbox(label="Generated Caption", lines=3)
    out_audio = gr.Audio(label="Spoken Caption", type="numpy")
    btn = gr.Button("Generate")
    btn.click(describe_and_speak, [inp_image], [out_text, out_audio])

if __name__ == "__main__":
    demo.launch()