Maria604 committed on
Commit
819778a
·
1 Parent(s): a02db7d
Files changed (2) hide show
  1. app.py +32 -19
  2. requirements.txt +1 -0
app.py CHANGED
@@ -2,33 +2,44 @@ import gradio as gr
2
  import torch
3
  import numpy as np
4
  from transformers import pipeline
 
5
 
6
  # ---------------------------
7
  # CPU-only model loaders
8
  # ---------------------------
9
  _captioner = None
10
  _tts = None
 
11
 
12
  def load_models_cpu():
13
- """Load BLIP-2 (image captioning) and ESPnet VITS (text-to-speech) on CPU."""
14
- global _captioner, _tts
15
 
16
  if _captioner is None:
17
  print("Loading BLIP-2 image captioning model...")
18
  _captioner = pipeline(
19
  task="image-to-text",
20
- model="Salesforce/blip2-flan-t5-xl", # high-quality public model
21
- torch_dtype=torch.float32,
22
- device_map=None, # CPU only
23
  )
24
 
25
  if _tts is None:
26
- print("Loading ESPnet VITS text-to-speech model...")
 
27
  _tts = pipeline(
28
  task="text-to-speech",
29
- model="espnet/kan-bayashi_ljspeech_vits", # English-only TTS
 
30
  )
31
 
 
 
 
 
 
 
 
32
  def describe_and_speak(image, beams, max_tokens):
33
  """Generate an English caption for the image and read it aloud."""
34
  load_models_cpu()
@@ -40,11 +51,14 @@ def describe_and_speak(image, beams, max_tokens):
40
  if not caption:
41
  caption = "A description could not be generated for this image."
42
 
43
- # --- Step 2: Convert text to speech ---
44
  try:
45
- tts_output = _tts(caption)
46
- audio = np.array(tts_output["audio"], dtype=np.float32)
47
- sr = tts_output["sampling_rate"]
 
 
 
48
  except Exception as e:
49
  caption += f"\n\n[TTS error: {e}]"
50
  sr = 22050
@@ -59,19 +73,18 @@ with gr.Blocks(title="Image → Speech (Hugging Face models, CPU)") as demo:
59
  gr.Markdown(
60
  """
61
  # 🖼️ Image → 🎙️ Speech
62
- Upload an image, and the app will:
63
- 1. Generate a caption using **BLIP-2**
64
- 2. Read it aloud using **ESPnet VITS**
65
 
66
- *Runs fully on CPU (Hugging Face public models).
67
- First run may take a few minutes while models download.*
68
  """
69
  )
70
 
71
  with gr.Row():
72
- inp_image = gr.Image(type="pil", label="Upload an image (JPG or PNG)")
73
  with gr.Column():
74
- beams = gr.Slider(1, 4, value=2, step=1, label="Caption beams (quality vs. speed)")
75
  max_tokens = gr.Slider(10, 60, value=30, step=5, label="Max caption tokens")
76
 
77
  with gr.Row():
@@ -79,7 +92,7 @@ with gr.Blocks(title="Image → Speech (Hugging Face models, CPU)") as demo:
79
  out_audio = gr.Audio(label="Spoken Caption", type="numpy")
80
 
81
  btn = gr.Button("Generate")
82
- btn.click(fn=describe_and_speak, inputs=[inp_image, beams, max_tokens], outputs=[out_text, out_audio])
83
 
84
  if __name__ == "__main__":
85
  demo.launch()
 
2
  import torch
3
  import numpy as np
4
  from transformers import pipeline
5
+ from datasets import load_dataset
6
 
7
# ---------------------------
# CPU-only model loaders
# ---------------------------
# Lazily-populated module-level singletons: all start as None and are
# filled in by load_models_cpu() on first use.
_captioner = None           # image-to-text pipeline (BLIP-2)
_tts = None                 # text-to-speech pipeline (SpeechT5)
_speaker_embeddings = None  # x-vector tensor required by SpeechT5
 
14
def load_models_cpu():
    """Lazily load BLIP-2 (image captioning) and SpeechT5 (text-to-speech) on CPU.

    Populates the module-level singletons ``_captioner``, ``_tts`` and
    ``_speaker_embeddings``; each section is a no-op once its global is set,
    so repeated calls are cheap.

    Side effects:
        Downloads model weights and the CMU Arctic x-vector dataset from the
        Hugging Face Hub on first call (may take minutes on a cold cache).
    """
    global _captioner, _tts, _speaker_embeddings

    if _captioner is None:
        print("Loading BLIP-2 image captioning model...")
        _captioner = pipeline(
            task="image-to-text",
            model="Salesforce/blip2-flan-t5-xl",  # quality; CPU-friendly (just slower)
            dtype=torch.float32,  # CPU dtype (torch_dtype is the deprecated alias)
            device_map=None,      # ensure CPU
        )

    if _tts is None:
        print("Loading SpeechT5 TTS + vocoder...")
        # FIX: do NOT pass vocoder="microsoft/speecht5_hifigan" here.
        # TextToAudioPipeline expects a vocoder *model instance* (it calls
        # self.vocoder(spectrogram) at inference), so a plain string would
        # raise a TypeError on the first synthesis. For SpeechT5 the pipeline
        # auto-loads that exact HiFi-GAN checkpoint when vocoder is omitted,
        # so behavior (and the vocoder used) is unchanged.
        _tts = pipeline(
            task="text-to-speech",
            model="microsoft/speecht5_tts",
        )

    if _speaker_embeddings is None:
        print("Loading default speaker embeddings for SpeechT5...")
        # Standard HF demo speaker from CMU Arctic xvectors (female speaker "slt");
        # index 7306 is the embedding used throughout the official examples.
        emb_ds = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
        _speaker_embeddings = torch.tensor(emb_ds[7306]["xvector"]).unsqueeze(0)
43
  def describe_and_speak(image, beams, max_tokens):
44
  """Generate an English caption for the image and read it aloud."""
45
  load_models_cpu()
 
51
  if not caption:
52
  caption = "A description could not be generated for this image."
53
 
54
+ # --- Step 2: Convert text to speech (SpeechT5 needs speaker embeddings) ---
55
  try:
56
+ tts_out = _tts(
57
+ caption,
58
+ forward_params={"speaker_embeddings": _speaker_embeddings}
59
+ )
60
+ audio = np.asarray(tts_out["audio"], dtype=np.float32)
61
+ sr = int(tts_out["sampling_rate"])
62
  except Exception as e:
63
  caption += f"\n\n[TTS error: {e}]"
64
  sr = 22050
 
73
  gr.Markdown(
74
  """
75
  # 🖼️ Image → 🎙️ Speech
76
+ Upload an image. The app will:
77
+ 1) Caption it with **BLIP-2**
78
+ 2) Speak the caption with **SpeechT5** (HF), CPU-only
79
 
80
+ *First run may take a few minutes while models & speaker embeddings download.*
 
81
  """
82
  )
83
 
84
  with gr.Row():
85
+ inp_image = gr.Image(type="pil", label="Upload an image (JPG/PNG)")
86
  with gr.Column():
87
+ beams = gr.Slider(1, 4, value=2, step=1, label="Caption beams (quality vs speed)")
88
  max_tokens = gr.Slider(10, 60, value=30, step=5, label="Max caption tokens")
89
 
90
  with gr.Row():
 
92
  out_audio = gr.Audio(label="Spoken Caption", type="numpy")
93
 
94
  btn = gr.Button("Generate")
95
+ btn.click(describe_and_speak, [inp_image, beams, max_tokens], [out_text, out_audio])
96
 
97
  if __name__ == "__main__":
98
  demo.launch()
requirements.txt CHANGED
@@ -9,3 +9,4 @@ safetensors
9
  timm
10
  scipy
11
  numpy<2.0
 
 
9
  timm
10
  scipy
11
  numpy<2.0
12
+ datasets