Spaces:

mintyeatech
/

lingobot

No application file

App Files Community

gent committed on Jun 6, 2023

Commit

f96a97b

1 Parent(s): ab9cb62

realtime asr

Browse files

Files changed (10) hide show

.gitignore +2 -1
README.md +1 -1
app.py +1 -1
demo.py +0 -0
demos/character.py +60 -0
demos/chat.py +56 -0
demos/example.py +24 -0
demos/realtime_asr.py +53 -0
demos/scene.py +60 -0
utils/asr.py +40 -12

.gitignore CHANGED Viewed

@@ -1,5 +1,6 @@
 apps
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

 apps
+audio_samples
+flagged
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

README.md CHANGED Viewed

@@ -5,7 +5,7 @@ colorFrom: green
 colorTo: yellow
 sdk: gradio
 sdk_version: 3.32.0
-app_file: app.py
 pinned: false
 license: apache-2.0
 ---

 colorTo: yellow
 sdk: gradio
 sdk_version: 3.32.0
+app_file: demos/realtime_asr.py
 pinned: false
 license: apache-2.0
 ---

app.py CHANGED Viewed

@@ -111,7 +111,7 @@ def main():
         btn.click(export, None, [ chatbot, outputs])
-    demo.launch()
 if __name__ == "__main__":
     main()

         btn.click(export, None, [ chatbot, outputs])
+    demo.launch(show_error=True)
 if __name__ == "__main__":
     main()

demo.py ADDED Viewed

File without changes

demos/character.py ADDED Viewed

	@@ -0,0 +1,60 @@

+import gradio as gr
+import time
+from utils.asr import recognize_from_file
+from utils.tts import tts
+from utils.llm import generate_response
+chat_history = [
+]
+def convert_chatbox(chat_history):
+    return [f"{i['role']}: {i['content']}" for i in chat_history]
+with gr.Blocks() as demo:
+    character = gr.Textbox(lines=5, label="Character", placeholder="Enter your character name")
+    def init(character):
+        global chat_history
+        chat_history=[
+            {"role": "system", "content": "Pretend to be a character and have a causal conversation with the user. Respond to the user's questions and ask questions back. The responses start with 'character:' if you are the character and 'GPT:' if you are ChatGPT. You are the character and never respond as the user. You are a character: "+character},
+        ]
+    init_bt=gr.Button("Init", type="button", label="Start")
+    init_bt.click(init, [character], )
+    chatbot = gr.Chatbot()
+    with gr.Row():
+        msg = gr.Textbox()
+        audio = gr.Audio(source="microphone", type="filepath", streaming=False)
+    def respond(message):
+        # TODO: replace this with real GPT model
+        chat_history.append({'role': 'user', 'content': message})
+        result = generate_response(chat_history)
+        mesg=result['choices'][0]['message']
+        print("recv: ", mesg)
+        response = mesg['content']
+        chat_history.append(mesg)
+        chatbot.value.append((message,response))
+        print("chat_history: ", chatbot.value)
+        return None, chatbot.value
+    msg.submit(respond, [msg], [msg,chatbot])
+    def transcribe(audio_file):
+        print("start transcribe, ", audio_file)
+        start = time.time()
+        text = recognize_from_file(audio_file)
+        print("use ", time.time()-start)
+        print("transcribe done, ", text)
+        return respond(text)
+    audio.change(transcribe, [audio], [audio, chatbot])
+demo.launch()

demos/chat.py ADDED Viewed

	@@ -0,0 +1,56 @@

+import gradio as gr
+import time
+from utils.asr import recognize_from_file
+from utils.tts import tts
+from utils.llm import generate_response
+chat_history = []
+def convert_chatbox(chat_history):
+    return [f"{i['role']}: {i['content']}" for i in chat_history]
+with gr.Blocks() as demo:
+    chatbot = gr.Chatbot()
+    with gr.Row():
+        msg = gr.Textbox()
+        audio = gr.Audio(source="microphone", type="filepath", streaming=False)
+    player = gr.Audio( type="filepath", label="Speaker",interactive=False)
+    clear = gr.Button("Clear")
+    def respond(message):
+        # TODO: replace this with real GPT model
+        chat_history.append({'role': 'user', 'content': message})
+        result = generate_response(chat_history)
+        mesg=result['choices'][0]['message']
+        print("recv: ", mesg)
+        response = mesg['content']
+        chat_history.append(mesg)
+        # write to file
+        result = tts(response)
+        with open("temp.wav", "wb") as audio_file:
+            audio_file.write(result.audio_data)
+        print("write to temp.wav")
+        chatbot.value.append((message,response))
+        print("chat_history: ", chatbot.value)
+        return None, "temp.wav", chatbot.value
+    msg.submit(respond, [msg], [msg, player,chatbot])
+    def transcribe(audio_file):
+        print("start transcribe, ", audio_file)
+        start = time.time()
+        text = recognize_from_file(audio_file)
+        print("use ", time.time()-start)
+        print("transcribe done, ", text)
+        return respond(text)
+    audio.change(transcribe, [audio], [audio, player, chatbot])
+    clear.click(lambda: None, None, chatbot, queue=False)
+demo.launch()

demos/example.py ADDED Viewed

	@@ -0,0 +1,24 @@

+from transformers import pipeline
+import gradio as gr
+import time
+p = pipeline("automatic-speech-recognition")
+def transcribe(audio, state=""):
+    print(time.time(),audio)
+    time.sleep(2)
+    text = p(audio)["text"]
+    state += text + " "
+    return state, state
+gr.Interface(
+    fn=transcribe,
+    inputs=[
+        gr.Audio(source="microphone", type="filepath", streaming=True),
+        "state"
+    ],
+    outputs=[
+        "textbox",
+        "state"
+    ],
+    live=True).launch()

demos/realtime_asr.py ADDED Viewed

	@@ -0,0 +1,53 @@

+import numpy as np
+from utils import recognize_from_stream, generate_response
+from azure.cognitiveservices.speech.audio import PushAudioInputStream, AudioStreamFormat
+import gradio as gr
+import os
+import time
+stream = PushAudioInputStream(AudioStreamFormat(48000,)) # sample rate is important
+msg_queue = []
+chat_history = [
+]
+def rec_cb(evt):
+    print("##########################")
+    print(evt.result.text)
+    if evt.result.text:
+        chat_history.append({'role':'user', 'content':evt.result.text})
+        response = generate_response(chat_history)
+        chat_history.append(response['choices'][0]['message'])
+speech_recognizer = recognize_from_stream(stream,rec_cb)
+def transcribe(speech):
+    sample_rate, speech = speech
+    # print(time.time(), (sample_rate, len(speech)))
+    stream.write(speech.tobytes())
+    user_msg = [i['content'] for i in chat_history if i['role']=='user']
+    box_msg = [i['content'] for i in chat_history if i['role']=='assistant']
+    return list(zip(user_msg,box_msg))
+demo = gr.Interface(
+    title="实时语音识别",
+    description="使用Azure的语音识别服务，实时识别麦克风输入的语音。",
+    fn=transcribe,
+    inputs=[
+        gr.Audio(source="microphone", type="numpy",streaming=True),
+    ],
+    outputs= [
+        gr.Chatbot(),
+    ],
+    flagging_callback=gr.SimpleCSVLogger(),
+    live=True)
+if __name__ == '__main__':
+    demo.launch()

demos/scene.py ADDED Viewed

	@@ -0,0 +1,60 @@

+import gradio as gr
+import time
+from utils.asr import recognize_from_file
+from utils.tts import tts
+from utils.llm import generate_response
+chat_history = [
+]
+def convert_chatbox(chat_history):
+    return [f"{i['role']}: {i['content']}" for i in chat_history]
+with gr.Blocks() as demo:
+    scene = gr.Textbox(lines=5, label="scene", placeholder="Enter your scene name")
+    def init(scene):
+        global chat_history
+        chat_history=[
+            {"role": "system", "content": "Pretend to be a character and have a causal conversation with the user.  Respond to the user's questions and ask questions back. The responses start with 'character:' if you are the character and 'GPT:' if you are ChatGPT. You are the character and never respond as the user. The conversation happens in the scene: " + scene},
+        ]
+    init_bt=gr.Button("Init", type="button", label="Start")
+    init_bt.click(init, [scene], )
+    chatbot = gr.Chatbot()
+    with gr.Row():
+        msg = gr.Textbox()
+        audio = gr.Audio(source="microphone", type="filepath", streaming=False)
+    def respond(message):
+        # TODO: replace this with real GPT model
+        chat_history.append({'role': 'user', 'content': message})
+        result = generate_response(chat_history)
+        mesg=result['choices'][0]['message']
+        print("recv: ", mesg)
+        response = mesg['content']
+        chat_history.append(mesg)
+        chatbot.value.append((message,response))
+        print("chat_history: ", chatbot.value)
+        return None, chatbot.value
+    msg.submit(respond, [msg], [msg,chatbot])
+    def transcribe(audio_file):
+        print("start transcribe, ", audio_file)
+        start = time.time()
+        text = recognize_from_file(audio_file)
+        print("use ", time.time()-start)
+        print("transcribe done, ", text)
+        return respond(text)
+    audio.change(transcribe, [audio], [audio, chatbot])
+demo.launch()

utils/asr.py CHANGED Viewed

@@ -1,19 +1,15 @@
 import azure.cognitiveservices.speech as speechsdk
 import os
 import time
 # Replace with your own subscription key and service region
-def get_recoginizer(**kwargs):
-    speech_config = speechsdk.SpeechConfig(subscription=os.environ.get('SPEECH_KEY'), region=os.environ.get('SPEECH_REGION'))
-    speech_config.speech_recognition_language="en-US"
-    audio_config = speechsdk.audio.AudioConfig(**kwargs)
-    return speechsdk.SpeechRecognizer(speech_config=speech_config,audio_config=audio_config)
 def recognize_from_file(file=None):
     # This example requires environment variables named "SPEECH_KEY" and "SPEECH_REGION"
-    speech_config = speechsdk.SpeechConfig(subscription=os.environ.get('SPEECH_KEY',''), region=os.environ.get('SPEECH_REGION',''))
-    speech_config.speech_recognition_language="en-US"
     audio_config = speechsdk.audio.AudioConfig(filename=file)
     speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
@@ -22,8 +18,40 @@ def recognize_from_file(file=None):
     return result.text
 if __name__ == '__main__':
-    for audio_file in os.listdir("audio_samples"):
-        start = time.time()
-        print(recognize_from_file(f"audio_samples/{audio_file}"), " in ", time.time()-start)

 import azure.cognitiveservices.speech as speechsdk
+from azure.cognitiveservices.speech.audio import PushAudioInputStream
 import os
 import time
 # Replace with your own subscription key and service region
+speech_config = speechsdk.SpeechConfig(subscription=os.environ.get('SPEECH_KEY',''), region=os.environ.get('SPEECH_REGION',''))
+speech_config.speech_recognition_language="en-US"
 def recognize_from_file(file=None):
     # This example requires environment variables named "SPEECH_KEY" and "SPEECH_REGION"
     audio_config = speechsdk.audio.AudioConfig(filename=file)
     speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
     return result.text
+def recognize_from_stream(stream,rec_cb):
+    audio_config = speechsdk.audio.AudioConfig(stream=stream)
+    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
+    speech_recognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt)))
+    speech_recognizer.recognized.connect(rec_cb)
+    speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
+    speech_recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
+    speech_recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))
+    def stop_cb(evt):
+        print('CLOSING on {}'.format(evt))
+        speech_recognizer.stop_continuous_recognition()
+    speech_recognizer.session_stopped.connect(stop_cb)
+    speech_recognizer.canceled.connect(stop_cb)
+    speech_recognizer.start_continuous_recognition_async()
+    return speech_recognizer
 if __name__ == '__main__':
+    ## Recognize from file
+    # for audio_file in os.listdir("audio_samples"):
+    #     start = time.time()
+    #     print(recognize_from_file(f"audio_samples/{audio_file}"), " in ", time.time()-start)
+    ## real-time recognition
+    def rec_cb(evt):
+        print("##########################")
+        print(evt.result.text)
+    stream = PushAudioInputStream()
+    stream.write(open("audio_samples/audo_0.wav","rb").read())
+    recognize_from_stream(stream,rec_cb)
+    time.sleep(10)