import streamlit as st
from transformers import pipeline
from PIL import Image


@st.cache_resource
def _load_pipelines():
    """Load the image-captioning and text-to-speech pipelines exactly once.

    Returns:
        tuple: (image_to_text, text_to_speech) Hugging Face pipelines.

    Why: Streamlit re-executes the whole script on every widget interaction.
    Without st.cache_resource both models would be re-instantiated on every
    rerun, which is extremely slow and memory-hungry. The decorator keeps a
    single shared instance per server process.
    """
    captioner = pipeline(
        "image-to-text", model="nlpconnect/vit-gpt2-image-captioning"
    )
    tts = pipeline("text-to-speech", model="facebook/mms-tts-eng")
    return captioner, tts


# Module-level names preserved so the rest of the script works unchanged.
image_to_text, text_to_speech = _load_pipelines()
|
|
# Page header and the image-input widget.
st.title("Image-to-Text and Text-to-Speech App")

# Accepted upload formats for the captioning model.
_ACCEPTED_TYPES = ["jpg", "png", "jpeg"]
uploaded_image = st.file_uploader("Upload an image", type=_ACCEPTED_TYPES)
|
|
if uploaded_image:
    # Normalize to RGB: PNG uploads may be RGBA or palette-mode, which the
    # ViT captioning model does not accept.
    image = Image.open(uploaded_image).convert("RGB")
    st.image(image, caption="Uploaded Image", use_container_width=True)

    # Generate a caption for the image (model inference can take a while,
    # so give the user feedback instead of a frozen page).
    with st.spinner("Generating caption..."):
        text_output = image_to_text(image)[0]['generated_text']
    st.write("### Extracted Text:")
    st.write(text_output)

    # Only synthesize speech when the model actually produced a caption;
    # feeding an empty string to the TTS pipeline is pointless and may error.
    if text_output.strip():
        with st.spinner("Synthesizing speech..."):
            speech_output = text_to_speech(text_output)
        st.write("### Listen to Speech Output:")
        # speech_output carries a raw waveform plus its sampling rate; pass
        # both so st.audio can render the numpy array correctly.
        st.audio(speech_output['audio'],
                 format="audio/wav",
                 start_time=0,
                 sample_rate=speech_output['sampling_rate'])
    else:
        st.warning("No caption could be generated for this image.")