import os

# Allow loading the CMU Arctic xvectors dataset script on HF.
# Must be set before `datasets` is imported, hence the early placement.
os.environ["HF_DATASETS_TRUST_REMOTE_CODE"] = "1"

import gradio as gr
import torch
import numpy as np
from transformers import pipeline, AutoProcessor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset

# ---------------------------
# CPU-only global handles
# ---------------------------
# Models are loaded lazily on first request and cached in these module-level
# slots so repeated Gradio calls do not re-download / re-instantiate them.
_captioner = None
_tts_processor = None
_tts_model = None
_tts_vocoder = None
_speaker_embeddings = None  # required by SpeechT5 generate_speech; shape (1, 512)


def load_models_cpu():
    """Load BLIP-2 (image captioning) and SpeechT5 (text-to-speech) on CPU.

    Idempotent: each component is loaded only if its global slot is still
    ``None``. Falls back to a random speaker embedding if the xvector
    dataset cannot be fetched.
    """
    global _captioner, _tts_processor, _tts_model, _tts_vocoder, _speaker_embeddings
    if _captioner is None:
        print("Loading BLIP-2 image captioning model (CPU)...")
        _captioner = pipeline(
            task="image-to-text",
            model="Salesforce/blip2-flan-t5-xl",
            dtype=torch.float32,  # CPU dtype (alias of torch_dtype)
            device_map=None,      # ensure CPU
        )
    if _tts_processor is None:
        print("Loading SpeechT5 processor...")
        _tts_processor = AutoProcessor.from_pretrained("microsoft/speecht5_tts")
    if _tts_model is None:
        print("Loading SpeechT5 TTS model (CPU)...")
        _tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to("cpu").eval()
    if _tts_vocoder is None:
        print("Loading SpeechT5 HiFiGAN vocoder (CPU)...")
        _tts_vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to("cpu").eval()
    if _speaker_embeddings is None:
        print("Loading default speaker embeddings for SpeechT5...")
        try:
            emb_ds = load_dataset(
                "Matthijs/cmu-arctic-xvectors",
                split="validation",
                trust_remote_code=True,  # needed with modern datasets
            )
            # HF examples commonly use index 7306 (speaker "slt")
            emb = emb_ds[7306]["xvector"]
            _speaker_embeddings = torch.tensor(emb, dtype=torch.float32).unsqueeze(0)
        except Exception as e:
            # Fixed: the original source had this string literal broken across
            # a physical line (a SyntaxError); message reassembled on one line.
            print(f"Speaker embedding load failed: {e}. Using a random voice embedding.")
            # SpeechT5 expects shape (1, 512)
            _speaker_embeddings = torch.randn(1, 512, dtype=torch.float32)


def describe_and_speak(image):
    """Generate an English caption for the image and speak it aloud (defaults only).

    Args:
        image: PIL image from the Gradio ``Image`` input.

    Returns:
        tuple: ``(caption_text, (sample_rate, waveform ndarray))`` suitable
        for the ``Textbox`` and ``Audio`` outputs. On TTS failure the audio
        is one second of silence and the error is appended to the caption.
    """
    load_models_cpu()

    # --- 1) Caption (defaults; no beams/tokens passed) ---
    result = _captioner(image)
    caption = (result[0].get("generated_text", "") if result else "").strip()
    if not caption:
        caption = "A description could not be generated for this image."

    # --- 2) Text → Speech (SpeechT5) ---
    try:
        inputs = _tts_processor(text=caption, return_tensors="pt")
        with torch.no_grad():
            speech = _tts_model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings=_speaker_embeddings,
                vocoder=_tts_vocoder,
            )
        sr = 16000  # SpeechT5 HiFiGAN outputs 16 kHz mono
        audio = np.asarray(speech.numpy(), dtype=np.float32)
    except Exception as e:
        # Best-effort: surface the error in the caption rather than crashing
        # the UI, and return silent audio so the Audio component stays valid.
        caption += f"\n\n[TTS error: {e}]"
        sr = 22050
        audio = np.zeros(sr, dtype=np.float32)

    return caption, (sr, audio)


# ---------------------------
# Gradio UI
# ---------------------------
with gr.Blocks(title="Image → Speech (HF models, CPU)") as demo:
    gr.Markdown(
        """
        # 🖼️ Image → 🎙️ Speech (CPU)
        1) Caption with **BLIP-2** → 2) Speak with **SpeechT5** (HiFiGAN vocoder).
        *First run downloads models and speaker embeddings — please wait.*
        """
    )
    inp_image = gr.Image(type="pil", label="Upload an image (JPG/PNG)")
    out_text = gr.Textbox(label="Generated Caption", lines=3)
    out_audio = gr.Audio(label="Spoken Caption", type="numpy")
    btn = gr.Button("Generate")
    btn.click(describe_and_speak, [inp_image], [out_text, out_audio])


if __name__ == "__main__":
    demo.launch()