import os

# Must be set BEFORE `datasets` is imported below: opts in to running
# dataset loading scripts from the Hub (needed for the CMU-Arctic
# x-vector dataset loaded in load_models_cpu).
os.environ["HF_DATASETS_TRUST_REMOTE_CODE"] = "1"
|
|
| import gradio as gr |
| import torch |
| import numpy as np |
| from transformers import pipeline, AutoProcessor, SpeechT5ForTextToSpeech, SpeechT5HifiGan |
| from datasets import load_dataset |
|
|
| |
| |
| |
# Lazily-initialized module-level singletons. They are populated exactly
# once by load_models_cpu() and reused across Gradio requests, so the
# (multi-GB) model downloads and loads happen only on the first request.
_captioner = None           # transformers image-to-text pipeline (BLIP-2)
_tts_processor = None       # SpeechT5 text processor/tokenizer
_tts_model = None           # SpeechT5 text-to-speech model
_tts_vocoder = None         # HiFiGAN vocoder turning spectrograms into audio
_speaker_embeddings = None  # (1, 512) float32 x-vector selecting the voice
|
|
|
|
def load_models_cpu():
    """Load BLIP-2 (image captioning) and SpeechT5 (text-to-speech) on CPU.

    Idempotent: each component is loaded only while its module-level
    global is still None, so repeated calls (one per UI request) are
    cheap after the first. The first call downloads several GB of
    weights from the Hugging Face Hub.
    """
    global _captioner, _tts_processor, _tts_model, _tts_vocoder, _speaker_embeddings

    if _captioner is None:
        print("Loading BLIP-2 image captioning model (CPU)...")
        # NOTE(review): the `dtype` kwarg is only accepted by recent
        # transformers releases; older versions spell it `torch_dtype`.
        # Confirm against the pinned transformers version.
        _captioner = pipeline(
            task="image-to-text",
            model="Salesforce/blip2-flan-t5-xl",
            dtype=torch.float32,  # full precision; no half-float on CPU
            device_map=None,      # keep the whole model on CPU
        )

    if _tts_processor is None:
        print("Loading SpeechT5 processor...")
        _tts_processor = AutoProcessor.from_pretrained("microsoft/speecht5_tts")

    if _tts_model is None:
        print("Loading SpeechT5 TTS model (CPU)...")
        _tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to("cpu").eval()

    if _tts_vocoder is None:
        print("Loading SpeechT5 HiFiGAN vocoder (CPU)...")
        _tts_vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to("cpu").eval()

    if _speaker_embeddings is None:
        print("Loading default speaker embeddings for SpeechT5...")
        try:
            # x-vector speaker embeddings; requires trust_remote_code
            # (dataset has a custom loading script).
            emb_ds = load_dataset(
                "Matthijs/cmu-arctic-xvectors",
                split="validation",
                trust_remote_code=True,
            )
            # Index 7306 picks one fixed speaker (the index used in the
            # HF SpeechT5 examples) — presumably a US-English voice;
            # TODO confirm which speaker this actually is.
            emb = emb_ds[7306]["xvector"]
            # unsqueeze(0) -> shape (1, 512): generate_speech expects a batch dim.
            _speaker_embeddings = torch.tensor(emb, dtype=torch.float32).unsqueeze(0)
        except Exception as e:
            print(f"Speaker embedding load failed: {e}. Using a random voice embedding.")
            # Best effort: the demo still speaks, just with an arbitrary voice.
            _speaker_embeddings = torch.randn(1, 512, dtype=torch.float32)
|
|
|
|
def describe_and_speak(image):
    """Generate an English caption for *image* and synthesize it as speech.

    Args:
        image: PIL image from the Gradio input, or None when the user
            clicks Generate without uploading anything.

    Returns:
        tuple: ``(caption, (sample_rate, waveform))`` where ``waveform``
        is a 1-D float32 numpy array. On TTS failure the caption carries
        an error note and the audio is one second of silence, so the UI
        always receives a playable result.
    """
    # Guard: nothing uploaded yet. Bail out BEFORE touching the models —
    # previously a bare click crashed the captioning pipeline with None.
    if image is None:
        sr = 16000
        return "Please upload an image first.", (sr, np.zeros(sr, dtype=np.float32))

    load_models_cpu()

    # --- Captioning ---
    result = _captioner(image)
    caption = (result[0].get("generated_text", "") if result else "").strip()
    if not caption:
        caption = "A description could not be generated for this image."

    # --- Text-to-speech (best effort: text is still returned on failure) ---
    try:
        inputs = _tts_processor(text=caption, return_tensors="pt")
        with torch.no_grad():  # inference only; skip autograd bookkeeping
            speech = _tts_model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings=_speaker_embeddings,
                vocoder=_tts_vocoder,
            )
        sr = 16000  # SpeechT5 synthesizes 16 kHz audio
        audio = np.asarray(speech.numpy(), dtype=np.float32)
    except Exception as e:
        # Surface the error in the caption and fall back to 1 s of silence.
        caption += f"\n\n[TTS error: {e}]"
        sr = 22050
        audio = np.zeros(sr, dtype=np.float32)

    return caption, (sr, audio)
|
|
|
|
| |
| |
| |
# ---------------------------------------------------------------------------
# Gradio UI. User-facing strings were mojibake (UTF-8 arrows/emoji decoded
# through the wrong codec); restored to the intended characters.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Image → Speech (HF models, CPU)") as demo:
    gr.Markdown(
        """
        # 🖼️ Image → 🎙️ Speech (CPU)
        1) Caption with **BLIP-2** → 2) Speak with **SpeechT5** (HiFiGAN vocoder).
        *First run downloads models and speaker embeddings — please wait.*
        """
    )

    # Single image in; caption text and spoken audio out.
    inp_image = gr.Image(type="pil", label="Upload an image (JPG/PNG)")
    out_text = gr.Textbox(label="Generated Caption", lines=3)
    out_audio = gr.Audio(label="Spoken Caption", type="numpy")

    btn = gr.Button("Generate")
    btn.click(describe_and_speak, [inp_image], [out_text, out_audio])


if __name__ == "__main__":
    demo.launch()
|
|