# https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/get-started-speech-to-text?tabs=linux%2Cterminal&pivots=programming-language-python
"""Gradio demo that streams microphone audio into an Azure push audio stream.

Audio chunks delivered by Gradio are written to a ``PushAudioInputStream``;
text reported through the recognizer callback is collected in
``chat_history`` and rendered in a ``gr.Chatbot`` component.
"""
import os
import time
from itertools import zip_longest

import gradio as gr
import numpy as np
from azure.cognitiveservices.speech.audio import AudioStreamFormat, PushAudioInputStream

from utils import recognize_from_stream

# The stream format is fixed at 48000 samples per second; the sample rate is
# important.
# NOTE(review): presumably this has to match the rate Gradio reports for the
# microphone (printed in `transcribe`) -- confirm on the target browser.
stream = PushAudioInputStream(AudioStreamFormat(48000,))  # sample rate is important

msg_queue = []  # not used in this file; kept for backward compatibility

# Conversation log: dicts of the form {'role': ..., 'content': ...}.
chat_history = []


def rec_cb(evt):
    """Record the text of a recognition event as a user message.

    Passed to ``recognize_from_stream`` as its callback. Events whose
    ``result.text`` is empty are printed but not stored.

    Args:
        evt: Event object exposing ``evt.result.text``.
    """
    print("##########################")
    print(evt.result.text)
    if evt.result.text:
        chat_history.append({'role': 'user', 'content': evt.result.text})


# `recognize_from_stream` is defined in `utils` (not visible here); presumably
# it starts recognition on `stream` and calls `rec_cb` with the results.
speech_recognizer = recognize_from_stream(stream, rec_cb)


def transcribe(speech):
    """Push one microphone chunk to the recognizer and return the chat so far.

    Args:
        speech: ``(sample_rate, samples)`` tuple as delivered by the
            ``gr.Audio(type="numpy")`` input; ``samples`` must provide
            ``tobytes()``.

    Returns:
        A list of ``(user_message, assistant_message)`` tuples for
        ``gr.Chatbot``. A user message that has no assistant reply yet is
        paired with ``None`` so that it is still included.
    """
    sample_rate, samples = speech
    print(time.time(), (sample_rate, len(samples)))
    # NOTE(review): the raw buffer is forwarded unchanged; assumes its dtype
    # and channel layout already match the stream format -- TODO confirm.
    stream.write(samples.tobytes())

    user_msg = [m['content'] for m in chat_history if m['role'] == 'user']
    box_msg = [m['content'] for m in chat_history if m['role'] == 'assistant']
    # A plain zip() truncates to the shorter list. Nothing in this file ever
    # appends an 'assistant' entry, so zip() always produced an empty chat and
    # recognized speech was never displayed. zip_longest() keeps unanswered
    # messages, filling the missing side with None.
    return list(zip_longest(user_msg, box_msg))


demo = gr.Interface(
    title="实时语音识别",
    description="使用Azure的语音识别服务,实时识别麦克风输入的语音。",
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="numpy", streaming=True),
    ],
    outputs=[
        gr.Chatbot(),
    ],
    live=True)


if __name__ == '__main__':
    demo.launch(share=True, show_error=True)