# Trial / app.py
# (HF Space page residue, preserved as comments: uploader "Maria604",
#  commit 50b13cb "fix" — these lines are not Python and must not execute.)
import os

# allow loading the CMU Arctic xvectors dataset script on HF
# NOTE(review): set before `datasets` is imported — presumably so the flag is
# visible at import time; verify against the datasets version in use.
os.environ["HF_DATASETS_TRUST_REMOTE_CODE"] = "1"
import gradio as gr
import torch
import numpy as np
from transformers import pipeline, AutoProcessor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
# ---------------------------
# CPU-only global handles
# ---------------------------
# Lazily-initialized singletons; they stay None until load_models_cpu()
# populates them on the first request.
_captioner = None  # transformers image-to-text pipeline (BLIP-2)
_tts_processor = None  # SpeechT5 processor (text tokenization)
_tts_model = None  # SpeechT5 text-to-speech model
_tts_vocoder = None  # HiFiGAN vocoder (spectrogram -> waveform)
_speaker_embeddings = None # required by SpeechT5
def load_models_cpu():
    """Lazily initialize the BLIP-2 captioner and the SpeechT5 TTS stack on CPU.

    Each component is created at most once and cached in the module-level
    globals; subsequent calls are cheap no-ops. If the speaker-embedding
    dataset cannot be fetched, a random (1, 512) voice embedding is used
    as a fallback so TTS still works.
    """
    global _captioner, _tts_processor, _tts_model, _tts_vocoder, _speaker_embeddings

    if _captioner is None:
        print("Loading BLIP-2 image captioning model (CPU)...")
        _captioner = pipeline(
            task="image-to-text",
            model="Salesforce/blip2-flan-t5-xl",
            dtype=torch.float32,  # CPU dtype (alias of torch_dtype)
            device_map=None,  # ensure CPU
        )

    if _tts_processor is None:
        print("Loading SpeechT5 processor...")
        _tts_processor = AutoProcessor.from_pretrained("microsoft/speecht5_tts")

    if _tts_model is None:
        print("Loading SpeechT5 TTS model (CPU)...")
        tts = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
        _tts_model = tts.to("cpu").eval()

    if _tts_vocoder is None:
        print("Loading SpeechT5 HiFiGAN vocoder (CPU)...")
        voc = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
        _tts_vocoder = voc.to("cpu").eval()

    if _speaker_embeddings is None:
        print("Loading default speaker embeddings for SpeechT5...")
        try:
            ds = load_dataset(
                "Matthijs/cmu-arctic-xvectors",
                split="validation",
                trust_remote_code=True,  # needed with modern datasets
            )
            # Index 7306 is the "slt" speaker used throughout the HF examples.
            xvec = ds[7306]["xvector"]
            _speaker_embeddings = torch.tensor(xvec, dtype=torch.float32).unsqueeze(0)
        except Exception as e:
            print(f"Speaker embedding load failed: {e}. Using a random voice embedding.")
            # SpeechT5 expects a (1, 512) embedding tensor.
            _speaker_embeddings = torch.randn(1, 512, dtype=torch.float32)
def describe_and_speak(image):
    """Caption *image* in English and synthesize the caption as speech.

    Parameters
    ----------
    image : PIL.Image.Image or None
        Image from the Gradio input; ``None`` when the user clicks the
        button without uploading anything.

    Returns
    -------
    tuple[str, tuple[int, numpy.ndarray]]
        ``(caption, (sample_rate, waveform))`` — the second element is the
        ``(sr, float32 array)`` pair expected by ``gr.Audio(type="numpy")``.
    """
    # Guard clause: without an image the captioner would crash with an
    # opaque error. Return a friendly message and one second of silence.
    if image is None:
        sr = 16000
        return "Please upload an image first.", (sr, np.zeros(sr, dtype=np.float32))

    load_models_cpu()

    # --- 1) Caption (defaults; no beams/tokens passed) ---
    result = _captioner(image)
    caption = (result[0].get("generated_text", "") if result else "").strip()
    if not caption:
        caption = "A description could not be generated for this image."

    # --- 2) Text -> Speech (SpeechT5 + HiFiGAN) ---
    try:
        inputs = _tts_processor(text=caption, return_tensors="pt")
        with torch.no_grad():
            speech = _tts_model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings=_speaker_embeddings,
                vocoder=_tts_vocoder,
            )
        sr = 16000  # SpeechT5 HiFiGAN outputs 16 kHz mono
        audio = np.asarray(speech.numpy(), dtype=np.float32)
    except Exception as e:
        # Best effort: surface the TTS failure in the caption and return a
        # second of silence so the Audio component still renders.
        caption += f"\n\n[TTS error: {e}]"
        sr = 22050
        audio = np.zeros(sr, dtype=np.float32)

    return caption, (sr, audio)
# ---------------------------
# Gradio UI
# ---------------------------
# NOTE: user-facing strings below restore the intended UTF-8 characters
# ("→", 🖼️, 🎙️, "—") that were mojibake-corrupted in the previous revision.
with gr.Blocks(title="Image → Speech (HF models, CPU)") as demo:
    gr.Markdown(
        """
        # 🖼️ Image → 🎙️ Speech (CPU)
        1) Caption with **BLIP-2** → 2) Speak with **SpeechT5** (HiFiGAN vocoder).
        *First run downloads models and speaker embeddings — please wait.*
        """
    )
    inp_image = gr.Image(type="pil", label="Upload an image (JPG/PNG)")
    out_text = gr.Textbox(label="Generated Caption", lines=3)
    out_audio = gr.Audio(label="Spoken Caption", type="numpy")
    btn = gr.Button("Generate")
    btn.click(describe_and_speak, [inp_image], [out_text, out_audio])

if __name__ == "__main__":
    demo.launch()