Maria604 committed on
Commit
819778a
·
1 Parent(s): a02db7d
Files changed (2) hide show
  1. app.py +32 -19
  2. requirements.txt +1 -0
app.py CHANGED
@@ -2,33 +2,44 @@ import gradio as gr
2
  import torch
3
  import numpy as np
4
  from transformers import pipeline
 
5
 
6
  # ---------------------------
7
  # CPU-only model loaders
8
  # ---------------------------
9
  _captioner = None
10
  _tts = None
 
11
 
12
  def load_models_cpu():
13
- """Load BLIP-2 (image captioning) and ESPnet VITS (text-to-speech) on CPU."""
14
- global _captioner, _tts
15
 
16
  if _captioner is None:
17
  print("Loading BLIP-2 image captioning model...")
18
  _captioner = pipeline(
19
  task="image-to-text",
20
- model="Salesforce/blip2-flan-t5-xl", # high-quality public model
21
- torch_dtype=torch.float32,
22
- device_map=None, # CPU only
23
  )
24
 
25
  if _tts is None:
26
- print("Loading ESPnet VITS text-to-speech model...")
 
27
  _tts = pipeline(
28
  task="text-to-speech",
29
- model="espnet/kan-bayashi_ljspeech_vits", # English-only TTS
 
30
  )
31
 
 
 
 
 
 
 
 
32
  def describe_and_speak(image, beams, max_tokens):
33
  """Generate an English caption for the image and read it aloud."""
34
  load_models_cpu()
@@ -40,11 +51,14 @@ def describe_and_speak(image, beams, max_tokens):
40
  if not caption:
41
  caption = "A description could not be generated for this image."
42
 
43
- # --- Step 2: Convert text to speech ---
44
  try:
45
- tts_output = _tts(caption)
46
- audio = np.array(tts_output["audio"], dtype=np.float32)
47
- sr = tts_output["sampling_rate"]
 
 
 
48
  except Exception as e:
49
  caption += f"\n\n[TTS error: {e}]"
50
  sr = 22050
@@ -59,19 +73,18 @@ with gr.Blocks(title="Image → Speech (Hugging Face models, CPU)") as demo:
59
  gr.Markdown(
60
  """
61
  # 🖼️ Image → 🎙️ Speech
62
- Upload an image, and the app will:
63
- 1. Generate a caption using **BLIP-2**
64
- 2. Read it aloud using **ESPnet VITS**
65
 
66
- *Runs fully on CPU (Hugging Face public models).
67
- First run may take a few minutes while models download.*
68
  """
69
  )
70
 
71
  with gr.Row():
72
- inp_image = gr.Image(type="pil", label="Upload an image (JPG or PNG)")
73
  with gr.Column():
74
- beams = gr.Slider(1, 4, value=2, step=1, label="Caption beams (quality vs. speed)")
75
  max_tokens = gr.Slider(10, 60, value=30, step=5, label="Max caption tokens")
76
 
77
  with gr.Row():
@@ -79,7 +92,7 @@ with gr.Blocks(title="Image → Speech (Hugging Face models, CPU)") as demo:
79
  out_audio = gr.Audio(label="Spoken Caption", type="numpy")
80
 
81
  btn = gr.Button("Generate")
82
- btn.click(fn=describe_and_speak, inputs=[inp_image, beams, max_tokens], outputs=[out_text, out_audio])
83
 
84
  if __name__ == "__main__":
85
  demo.launch()
 
2
  import torch
3
  import numpy as np
4
  from transformers import pipeline
5
+ from datasets import load_dataset
6
 
7
# ---------------------------
# CPU-only model loaders
# ---------------------------
# Lazily-populated module-level singletons: all start as None and are
# filled in by load_models_cpu() on first use.
_captioner = None           # image-to-text pipeline (BLIP-2)
_tts = None                 # text-to-speech pipeline (SpeechT5)
_speaker_embeddings = None  # x-vector tensor required by SpeechT5
 
14
def load_models_cpu():
    """Lazily load BLIP-2 (image captioning) and SpeechT5 (text-to-speech) on CPU.

    Populates the module-level singletons ``_captioner``, ``_tts`` and
    ``_speaker_embeddings``; each section is a no-op once its global is set,
    so repeated calls are cheap.

    Side effects:
        Downloads model weights and the CMU Arctic x-vector dataset from the
        Hugging Face Hub on first call (may take minutes on a cold cache).
    """
    global _captioner, _tts, _speaker_embeddings

    if _captioner is None:
        print("Loading BLIP-2 image captioning model...")
        _captioner = pipeline(
            task="image-to-text",
            model="Salesforce/blip2-flan-t5-xl",  # quality; CPU-friendly (just slower)
            dtype=torch.float32,  # CPU dtype (torch_dtype is the deprecated alias)
            device_map=None,      # ensure CPU
        )

    if _tts is None:
        print("Loading SpeechT5 TTS + vocoder...")
        # FIX: do NOT pass vocoder="microsoft/speecht5_hifigan" here.
        # TextToAudioPipeline expects a vocoder *model instance* (it calls
        # self.vocoder(spectrogram) at inference), so a plain string would
        # raise a TypeError on the first synthesis. For SpeechT5 the pipeline
        # auto-loads that exact HiFi-GAN checkpoint when vocoder is omitted,
        # so behavior (and the vocoder used) is unchanged.
        _tts = pipeline(
            task="text-to-speech",
            model="microsoft/speecht5_tts",
        )

    if _speaker_embeddings is None:
        print("Loading default speaker embeddings for SpeechT5...")
        # Standard HF demo speaker from CMU Arctic xvectors (female speaker "slt");
        # index 7306 is the embedding used throughout the official examples.
        emb_ds = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
        _speaker_embeddings = torch.tensor(emb_ds[7306]["xvector"]).unsqueeze(0)
43
  def describe_and_speak(image, beams, max_tokens):
44
  """Generate an English caption for the image and read it aloud."""
45
  load_models_cpu()
 
51
  if not caption:
52
  caption = "A description could not be generated for this image."
53
 
54
+ # --- Step 2: Convert text to speech (SpeechT5 needs speaker embeddings) ---
55
  try:
56
+ tts_out = _tts(
57
+ caption,
58
+ forward_params={"speaker_embeddings": _speaker_embeddings}
59
+ )
60
+ audio = np.asarray(tts_out["audio"], dtype=np.float32)
61
+ sr = int(tts_out["sampling_rate"])
62
  except Exception as e:
63
  caption += f"\n\n[TTS error: {e}]"
64
  sr = 22050
 
73
  gr.Markdown(
74
  """
75
  # 🖼️ Image → 🎙️ Speech
76
+ Upload an image. The app will:
77
+ 1) Caption it with **BLIP-2**
78
+ 2) Speak the caption with **SpeechT5** (HF), CPU-only
79
 
80
+ *First run may take a few minutes while models & speaker embeddings download.*
 
81
  """
82
  )
83
 
84
  with gr.Row():
85
+ inp_image = gr.Image(type="pil", label="Upload an image (JPG/PNG)")
86
  with gr.Column():
87
+ beams = gr.Slider(1, 4, value=2, step=1, label="Caption beams (quality vs speed)")
88
  max_tokens = gr.Slider(10, 60, value=30, step=5, label="Max caption tokens")
89
 
90
  with gr.Row():
 
92
  out_audio = gr.Audio(label="Spoken Caption", type="numpy")
93
 
94
  btn = gr.Button("Generate")
95
+ btn.click(describe_and_speak, [inp_image, beams, max_tokens], [out_text, out_audio])
96
 
97
  if __name__ == "__main__":
98
  demo.launch()
requirements.txt CHANGED
@@ -9,3 +9,4 @@ safetensors
9
  timm
10
  scipy
11
  numpy<2.0
 
 
9
  timm
10
  scipy
11
  numpy<2.0
12
+ datasets