"""Streamlit app that captions an image and reads the caption aloud.

The user provides an image (webcam snapshot or file upload); a Hugging Face
image-to-text pipeline generates a caption, which is displayed and converted
to speech with gTTS, then played back through the browser.
"""

import io
import time  # NOTE(review): unused — kept only to avoid changing import side effects

import streamlit as st
from gtts import gTTS
from PIL import Image
from playsound import playsound  # NOTE(review): unused — playback goes through st.audio
from transformers import pipeline


# NOTE(review): st.cache is deprecated in Streamlit >= 1.18; migrate to
# st.cache_resource once the deployed Streamlit version is confirmed.
@st.cache(allow_output_mutation=True)
def load_model():
    """Load and cache the image-to-text captioning pipeline.

    Returns:
        A transformers ``pipeline`` object that maps a PIL image to a list
        of ``{'generated_text': ...}`` dicts.
    """
    return pipeline('image-to-text')


def main():
    """Run the Streamlit UI: acquire an image, caption it, speak the caption."""
    caption = load_model()

    st.title('Welcome to image to speech app')
    instructions = """Click an image using inbuilt camera or upload an image file"""
    st.write(instructions)

    img = None
    picture_cam = st.camera_input('Take a picture')
    picture_upload = st.file_uploader('Upload An Image')

    # Camera input takes precedence when both sources are provided.
    if picture_cam:
        st.write('clicked image from webcam')
        st.image(picture_cam)
        img = Image.open(picture_cam)
    elif picture_upload:
        st.write('uploaded image from device')
        st.image(picture_upload)
        img = Image.open(picture_upload)

    if img is not None:
        # The pipeline returns a list of dicts; take the first caption.
        description = caption(img)
        generated_text = description[0]['generated_text']
        st.write(generated_text)

        # Synthesize speech entirely in memory. Fixes two defects in the
        # original: an open file handle on 'demo.mp3' was never closed
        # (resource leak), and the temp file was left on disk.
        audio_buffer = io.BytesIO()
        gTTS(generated_text).write_to_fp(audio_buffer)
        audio_buffer.seek(0)
        # Fix: gTTS produces MP3, so the MIME type must be audio/mp3
        # (the original incorrectly declared audio/ogg).
        st.audio(audio_buffer.read(), format='audio/mp3', start_time=0)


if __name__ == '__main__':
    main()