Spaces:
No application file
No application file
gent committed on
Commit ·
f96a97b
1
Parent(s): ab9cb62
realtime asr
Browse files- .gitignore +2 -1
- README.md +1 -1
- app.py +1 -1
- demo.py +0 -0
- demos/character.py +60 -0
- demos/chat.py +56 -0
- demos/example.py +24 -0
- demos/realtime_asr.py +53 -0
- demos/scene.py +60 -0
- utils/asr.py +40 -12
.gitignore
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
apps
|
| 2 |
-
|
|
|
|
| 3 |
# Byte-compiled / optimized / DLL files
|
| 4 |
__pycache__/
|
| 5 |
*.py[cod]
|
|
|
|
| 1 |
apps
|
| 2 |
+
audio_samples
|
| 3 |
+
flagged
|
| 4 |
# Byte-compiled / optimized / DLL files
|
| 5 |
__pycache__/
|
| 6 |
*.py[cod]
|
README.md
CHANGED
|
@@ -5,7 +5,7 @@ colorFrom: green
|
|
| 5 |
colorTo: yellow
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 3.32.0
|
| 8 |
-
app_file:
|
| 9 |
pinned: false
|
| 10 |
license: apache-2.0
|
| 11 |
---
|
|
|
|
| 5 |
colorTo: yellow
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 3.32.0
|
| 8 |
+
app_file: demos/realtime_asr.py
|
| 9 |
pinned: false
|
| 10 |
license: apache-2.0
|
| 11 |
---
|
app.py
CHANGED
|
@@ -111,7 +111,7 @@ def main():
|
|
| 111 |
|
| 112 |
btn.click(export, None, [ chatbot, outputs])
|
| 113 |
|
| 114 |
-
demo.launch()
|
| 115 |
|
| 116 |
if __name__ == "__main__":
|
| 117 |
main()
|
|
|
|
| 111 |
|
| 112 |
btn.click(export, None, [ chatbot, outputs])
|
| 113 |
|
| 114 |
+
demo.launch(show_error=True)
|
| 115 |
|
| 116 |
if __name__ == "__main__":
|
| 117 |
main()
|
demo.py
ADDED
|
File without changes
|
demos/character.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import time
|
| 3 |
+
from utils.asr import recognize_from_file
|
| 4 |
+
from utils.tts import tts
|
| 5 |
+
from utils.llm import generate_response
|
| 6 |
+
|
| 7 |
+
chat_history = [
|
| 8 |
+
]
|
| 9 |
+
def convert_chatbox(chat_history):
    """Render every chat message dict as a ``"role: content"`` string.

    Args:
        chat_history: Iterable of dicts that each have ``'role'`` and
            ``'content'`` keys.

    Returns:
        A list with one formatted string per message, in the same order.
    """
    rendered = []
    for entry in chat_history:
        rendered.append(f"{entry['role']}: {entry['content']}")
    return rendered
|
| 11 |
+
|
| 12 |
+
with gr.Blocks() as demo:
|
| 13 |
+
character = gr.Textbox(lines=5, label="Character", placeholder="Enter your character name")
|
| 14 |
+
|
| 15 |
+
def init(character):
    """Reset the conversation and seed it with a role-play system prompt.

    Rebinds the module-level ``chat_history`` to a list holding a single
    system message whose text ends with ``character``.

    Args:
        character: Text appended to the end of the system prompt.
    """
    global chat_history
    # The prompt previously read "causal conversation"; "casual" is what was meant.
    system_prompt = (
        "Pretend to be a character and have a casual conversation with the user. "
        "Respond to the user's questions and ask questions back. "
        "The responses start with 'character:' if you are the character and 'GPT:' if you are ChatGPT. "
        "You are the character and never respond as the user. "
        "You are a character: " + character
    )
    chat_history = [
        {"role": "system", "content": system_prompt},
    ]
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
init_bt=gr.Button("Init", type="button", label="Start")
|
| 23 |
+
init_bt.click(init, [character], )
|
| 24 |
+
|
| 25 |
+
chatbot = gr.Chatbot()
|
| 26 |
+
with gr.Row():
|
| 27 |
+
msg = gr.Textbox()
|
| 28 |
+
audio = gr.Audio(source="microphone", type="filepath", streaming=False)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def respond(message):
    """Run one chat turn and return the values for ``[msg, chatbot]``.

    Appends ``message`` to the module-level ``chat_history`` as a user turn,
    calls ``generate_response`` on the history, stores the message found at
    ``result['choices'][0]['message']`` and appends the (user, reply) pair to
    ``chatbot.value``.

    Args:
        message: The user's text. Empty or ``None`` input is ignored.

    Returns:
        ``(None, chatbot.value)`` - ``None`` for the first output component
        and the accumulated list of (user, reply) pairs for the chatbot.
    """
    if not message:
        # Nothing to send; leave the history untouched.
        return None, chatbot.value

    user_turn = {'role': 'user', 'content': message}
    chat_history.append(user_turn)
    try:
        result = generate_response(chat_history)
        mesg = result['choices'][0]['message']
    except Exception:
        # Roll back so a failed call does not leave a user turn with no reply
        # in the history that is sent on the next request.
        if chat_history and chat_history[-1] is user_turn:
            chat_history.pop()
        raise
    print("recv: ", mesg)

    response = mesg['content']
    chat_history.append(mesg)

    # NOTE(review): chatbot.value is a single list on the component object, so
    # it looks shared by every browser session - confirm this is intended.
    chatbot.value.append((message, response))
    print("chat_history: ", chatbot.value)

    return None, chatbot.value
|
| 45 |
+
|
| 46 |
+
msg.submit(respond, [msg], [msg,chatbot])
|
| 47 |
+
|
| 48 |
+
def transcribe(audio_file):
    """Transcribe a recorded audio file and feed the text to ``respond``.

    Args:
        audio_file: Path of the recording, or ``None`` when the audio
            component has no value.

    Returns:
        Whatever ``respond`` returns for the recognized text, or two no-op
        ``gr.update()`` values when ``audio_file`` is ``None``.
    """
    if audio_file is None:
        # This handler clears the audio component itself (respond's first
        # return value is None). In Gradio 3.x that output update presumably
        # fires `change` again with None, which must not reach the recognizer.
        return gr.update(), gr.update()

    print("start transcribe, ", audio_file)

    started = time.time()
    text = recognize_from_file(audio_file)
    print("use ", time.time() - started)

    print("transcribe done, ", text)
    return respond(text)
|
| 57 |
+
|
| 58 |
+
audio.change(transcribe, [audio], [audio, chatbot])
|
| 59 |
+
|
| 60 |
+
demo.launch()
|
demos/chat.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import time
|
| 3 |
+
from utils.asr import recognize_from_file
|
| 4 |
+
from utils.tts import tts
|
| 5 |
+
from utils.llm import generate_response
|
| 6 |
+
|
| 7 |
+
chat_history = []
|
| 8 |
+
def convert_chatbox(chat_history):
    """Render every chat message dict as a ``"role: content"`` string.

    Args:
        chat_history: Iterable of dicts that each have ``'role'`` and
            ``'content'`` keys.

    Returns:
        A list with one formatted string per message, in the same order.
    """
    rendered = []
    for entry in chat_history:
        rendered.append(f"{entry['role']}: {entry['content']}")
    return rendered
|
| 10 |
+
|
| 11 |
+
with gr.Blocks() as demo:
|
| 12 |
+
chatbot = gr.Chatbot()
|
| 13 |
+
with gr.Row():
|
| 14 |
+
msg = gr.Textbox()
|
| 15 |
+
audio = gr.Audio(source="microphone", type="filepath", streaming=False)
|
| 16 |
+
player = gr.Audio( type="filepath", label="Speaker",interactive=False)
|
| 17 |
+
clear = gr.Button("Clear")
|
| 18 |
+
|
| 19 |
+
def respond(message):
    """Run one chat turn, synthesize the reply and return the UI values.

    Appends ``message`` to the module-level ``chat_history`` as a user turn,
    calls ``generate_response`` on the history, stores the message found at
    ``result['choices'][0]['message']``, writes the ``tts`` output for the
    reply text to ``temp.wav`` and appends the (user, reply) pair to
    ``chatbot.value``.

    Args:
        message: The user's text. Empty or ``None`` input is ignored.

    Returns:
        ``(None, "temp.wav", chatbot.value)`` for ``[msg, player, chatbot]``.
        For empty input the player slot is a no-op ``gr.update()`` instead.
    """
    if not message:
        # Nothing to send; keep the history and whatever the player holds.
        return None, gr.update(), chatbot.value

    user_turn = {'role': 'user', 'content': message}
    chat_history.append(user_turn)
    try:
        result = generate_response(chat_history)
        mesg = result['choices'][0]['message']
    except Exception:
        # Roll back so a failed call does not leave a user turn with no reply
        # in the history that is sent on the next request.
        if chat_history and chat_history[-1] is user_turn:
            chat_history.pop()
        raise
    print("recv: ", mesg)

    response = mesg['content']
    chat_history.append(mesg)

    # write to file
    # NOTE(review): every call overwrites the same temp.wav, so concurrent
    # requests would race on it - confirm single-user use is intended.
    speech = tts(response)
    with open("temp.wav", "wb") as audio_file:
        audio_file.write(speech.audio_data)

    print("write to temp.wav")

    chatbot.value.append((message, response))
    print("chat_history: ", chatbot.value)

    return None, "temp.wav", chatbot.value
|
| 40 |
+
|
| 41 |
+
msg.submit(respond, [msg], [msg, player,chatbot])
|
| 42 |
+
|
| 43 |
+
def transcribe(audio_file):
    """Transcribe a recorded audio file and feed the text to ``respond``.

    Args:
        audio_file: Path of the recording, or ``None`` when the audio
            component has no value.

    Returns:
        Whatever ``respond`` returns for the recognized text, or three no-op
        ``gr.update()`` values when ``audio_file`` is ``None``.
    """
    if audio_file is None:
        # This handler clears the audio component itself (respond's first
        # return value is None). In Gradio 3.x that output update presumably
        # fires `change` again with None, which must not reach the recognizer.
        # No-op updates are used so the player keeps the reply it just got.
        return gr.update(), gr.update(), gr.update()

    print("start transcribe, ", audio_file)

    started = time.time()
    text = recognize_from_file(audio_file)
    print("use ", time.time() - started)

    print("transcribe done, ", text)
    return respond(text)
|
| 52 |
+
|
| 53 |
+
audio.change(transcribe, [audio], [audio, player, chatbot])
|
| 54 |
+
clear.click(lambda: None, None, chatbot, queue=False)
|
| 55 |
+
|
| 56 |
+
demo.launch()
|
demos/example.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import pipeline
|
| 2 |
+
import gradio as gr
|
| 3 |
+
import time
|
| 4 |
+
|
| 5 |
+
p = pipeline("automatic-speech-recognition")
|
| 6 |
+
|
| 7 |
+
def transcribe(audio, state=""):
    """Transcribe one audio chunk and append it to the running transcript.

    Prints the current timestamp and ``audio``, sleeps for two seconds, runs
    the module-level pipeline ``p`` on ``audio`` and appends its ``"text"``
    plus a trailing space to ``state``.

    Args:
        audio: Value passed straight to the pipeline ``p``.
        state: Transcript accumulated so far.

    Returns:
        The updated transcript twice: once for the textbox output and once
        for the state output.
    """
    print(time.time(), audio)
    time.sleep(2)
    recognized = p(audio)["text"]
    transcript = state + (recognized + " ")
    return transcript, transcript
|
| 13 |
+
|
| 14 |
+
gr.Interface(
|
| 15 |
+
fn=transcribe,
|
| 16 |
+
inputs=[
|
| 17 |
+
gr.Audio(source="microphone", type="filepath", streaming=True),
|
| 18 |
+
"state"
|
| 19 |
+
],
|
| 20 |
+
outputs=[
|
| 21 |
+
"textbox",
|
| 22 |
+
"state"
|
| 23 |
+
],
|
| 24 |
+
live=True).launch()
|
demos/realtime_asr.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
from utils import recognize_from_stream, generate_response
|
| 3 |
+
from azure.cognitiveservices.speech.audio import PushAudioInputStream, AudioStreamFormat
|
| 4 |
+
|
| 5 |
+
import gradio as gr
|
| 6 |
+
import os
|
| 7 |
+
|
| 8 |
+
import time
|
| 9 |
+
|
| 10 |
+
stream = PushAudioInputStream(AudioStreamFormat(48000,)) # sample rate is important
|
| 11 |
+
|
| 12 |
+
msg_queue = []
|
| 13 |
+
|
| 14 |
+
chat_history = [
|
| 15 |
+
]
|
| 16 |
+
|
| 17 |
+
def rec_cb(evt):
    """Handle a recognition event by running one chat turn.

    Prints a separator and the recognized text. When the text is non-empty it
    is appended to the module-level ``chat_history`` as a user turn,
    ``generate_response`` is called on the history, and the message found at
    ``response['choices'][0]['message']`` is appended as well.

    Args:
        evt: Event object exposing the recognized text as ``evt.result.text``.
    """
    text = evt.result.text
    print("##########################")
    print(text)
    if not text:
        return

    user_turn = {'role': 'user', 'content': text}
    chat_history.append(user_turn)
    try:
        response = generate_response(chat_history)
        reply = response['choices'][0]['message']
    except Exception:
        # Roll back so a failed call does not leave a user turn without a
        # reply; the UI pairs user and assistant turns in order.
        if chat_history and chat_history[-1] is user_turn:
            chat_history.pop()
        raise
    chat_history.append(reply)
|
| 24 |
+
|
| 25 |
+
speech_recognizer = recognize_from_stream(stream,rec_cb)
|
| 26 |
+
|
| 27 |
+
def transcribe(speech):
    """Push one microphone chunk to the recognizer stream and return the chat.

    Args:
        speech: ``(sample_rate, samples)`` pair where ``samples`` provides
            ``tobytes()``, or ``None`` when there is no audio for this call.

    Returns:
        A list of ``(user_text, assistant_text)`` tuples built from the
        module-level ``chat_history``. ``assistant_text`` is ``None`` for a
        user turn that has no reply yet.
    """
    if speech is not None:  # unpacking None would raise TypeError
        _sample_rate, samples = speech
        # NOTE(review): the sample rate is ignored; the stream was created
        # with a fixed 48000 format - confirm the microphone matches it.
        stream.write(samples.tobytes())

    # Pair each user turn with the assistant turn that follows it. Unlike
    # zipping two filtered lists, this shows a user turn that is still waiting
    # for its reply and cannot shift later pairs if a reply is missing.
    pairs = []
    for turn in list(chat_history):  # iterate a copy; rec_cb appends to the list
        role = turn['role']
        if role == 'user':
            pairs.append((turn['content'], None))
        elif role == 'assistant' and pairs and pairs[-1][1] is None:
            pairs[-1] = (pairs[-1][0], turn['content'])

    return pairs
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
demo = gr.Interface(
|
| 40 |
+
title="实时语音识别",
|
| 41 |
+
description="使用Azure的语音识别服务,实时识别麦克风输入的语音。",
|
| 42 |
+
fn=transcribe,
|
| 43 |
+
inputs=[
|
| 44 |
+
gr.Audio(source="microphone", type="numpy",streaming=True),
|
| 45 |
+
],
|
| 46 |
+
outputs= [
|
| 47 |
+
gr.Chatbot(),
|
| 48 |
+
],
|
| 49 |
+
flagging_callback=gr.SimpleCSVLogger(),
|
| 50 |
+
live=True)
|
| 51 |
+
|
| 52 |
+
if __name__ == '__main__':
|
| 53 |
+
demo.launch()
|
demos/scene.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import time
|
| 3 |
+
from utils.asr import recognize_from_file
|
| 4 |
+
from utils.tts import tts
|
| 5 |
+
from utils.llm import generate_response
|
| 6 |
+
|
| 7 |
+
chat_history = [
|
| 8 |
+
]
|
| 9 |
+
def convert_chatbox(chat_history):
    """Render every chat message dict as a ``"role: content"`` string.

    Args:
        chat_history: Iterable of dicts that each have ``'role'`` and
            ``'content'`` keys.

    Returns:
        A list with one formatted string per message, in the same order.
    """
    rendered = []
    for entry in chat_history:
        rendered.append(f"{entry['role']}: {entry['content']}")
    return rendered
|
| 11 |
+
|
| 12 |
+
with gr.Blocks() as demo:
|
| 13 |
+
scene = gr.Textbox(lines=5, label="scene", placeholder="Enter your scene name")
|
| 14 |
+
|
| 15 |
+
def init(scene):
    """Reset the conversation and seed it with a scene-based system prompt.

    Rebinds the module-level ``chat_history`` to a list holding a single
    system message whose text ends with ``scene``.

    Args:
        scene: Text appended to the end of the system prompt.
    """
    global chat_history
    # The prompt previously read "causal conversation"; "casual" is what was meant.
    system_prompt = (
        "Pretend to be a character and have a casual conversation with the user. "
        "Respond to the user's questions and ask questions back. "
        "The responses start with 'character:' if you are the character and 'GPT:' if you are ChatGPT. "
        "You are the character and never respond as the user. "
        "The conversation happens in the scene: " + scene
    )
    chat_history = [
        {"role": "system", "content": system_prompt},
    ]
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
init_bt=gr.Button("Init", type="button", label="Start")
|
| 23 |
+
init_bt.click(init, [scene], )
|
| 24 |
+
|
| 25 |
+
chatbot = gr.Chatbot()
|
| 26 |
+
with gr.Row():
|
| 27 |
+
msg = gr.Textbox()
|
| 28 |
+
audio = gr.Audio(source="microphone", type="filepath", streaming=False)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def respond(message):
    """Run one chat turn and return the values for ``[msg, chatbot]``.

    Appends ``message`` to the module-level ``chat_history`` as a user turn,
    calls ``generate_response`` on the history, stores the message found at
    ``result['choices'][0]['message']`` and appends the (user, reply) pair to
    ``chatbot.value``.

    Args:
        message: The user's text. Empty or ``None`` input is ignored.

    Returns:
        ``(None, chatbot.value)`` - ``None`` for the first output component
        and the accumulated list of (user, reply) pairs for the chatbot.
    """
    if not message:
        # Nothing to send; leave the history untouched.
        return None, chatbot.value

    user_turn = {'role': 'user', 'content': message}
    chat_history.append(user_turn)
    try:
        result = generate_response(chat_history)
        mesg = result['choices'][0]['message']
    except Exception:
        # Roll back so a failed call does not leave a user turn with no reply
        # in the history that is sent on the next request.
        if chat_history and chat_history[-1] is user_turn:
            chat_history.pop()
        raise
    print("recv: ", mesg)

    response = mesg['content']
    chat_history.append(mesg)

    # NOTE(review): chatbot.value is a single list on the component object, so
    # it looks shared by every browser session - confirm this is intended.
    chatbot.value.append((message, response))
    print("chat_history: ", chatbot.value)

    return None, chatbot.value
|
| 45 |
+
|
| 46 |
+
msg.submit(respond, [msg], [msg,chatbot])
|
| 47 |
+
|
| 48 |
+
def transcribe(audio_file):
    """Transcribe a recorded audio file and feed the text to ``respond``.

    Args:
        audio_file: Path of the recording, or ``None`` when the audio
            component has no value.

    Returns:
        Whatever ``respond`` returns for the recognized text, or two no-op
        ``gr.update()`` values when ``audio_file`` is ``None``.
    """
    if audio_file is None:
        # This handler clears the audio component itself (respond's first
        # return value is None). In Gradio 3.x that output update presumably
        # fires `change` again with None, which must not reach the recognizer.
        return gr.update(), gr.update()

    print("start transcribe, ", audio_file)

    started = time.time()
    text = recognize_from_file(audio_file)
    print("use ", time.time() - started)

    print("transcribe done, ", text)
    return respond(text)
|
| 57 |
+
|
| 58 |
+
audio.change(transcribe, [audio], [audio, chatbot])
|
| 59 |
+
|
| 60 |
+
demo.launch()
|
utils/asr.py
CHANGED
|
@@ -1,19 +1,15 @@
|
|
| 1 |
import azure.cognitiveservices.speech as speechsdk
|
|
|
|
| 2 |
import os
|
| 3 |
import time
|
| 4 |
# Replace with your own subscription key and service region
|
| 5 |
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
audio_config = speechsdk.audio.AudioConfig(**kwargs)
|
| 10 |
-
return speechsdk.SpeechRecognizer(speech_config=speech_config,audio_config=audio_config)
|
| 11 |
|
| 12 |
def recognize_from_file(file=None):
|
| 13 |
# This example requires environment variables named "SPEECH_KEY" and "SPEECH_REGION"
|
| 14 |
-
speech_config = speechsdk.SpeechConfig(subscription=os.environ.get('SPEECH_KEY',''), region=os.environ.get('SPEECH_REGION',''))
|
| 15 |
-
speech_config.speech_recognition_language="en-US"
|
| 16 |
-
|
| 17 |
audio_config = speechsdk.audio.AudioConfig(filename=file)
|
| 18 |
|
| 19 |
speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
|
|
@@ -22,8 +18,40 @@ def recognize_from_file(file=None):
|
|
| 22 |
return result.text
|
| 23 |
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
if __name__ == '__main__':
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import azure.cognitiveservices.speech as speechsdk
|
| 2 |
+
from azure.cognitiveservices.speech.audio import PushAudioInputStream
|
| 3 |
import os
|
| 4 |
import time
|
| 5 |
# Replace with your own subscription key and service region
|
| 6 |
|
| 7 |
+
speech_config = speechsdk.SpeechConfig(subscription=os.environ.get('SPEECH_KEY',''), region=os.environ.get('SPEECH_REGION',''))
|
| 8 |
+
speech_config.speech_recognition_language="en-US"
|
| 9 |
+
|
|
|
|
|
|
|
| 10 |
|
| 11 |
def recognize_from_file(file=None):
|
| 12 |
# This example requires environment variables named "SPEECH_KEY" and "SPEECH_REGION"
|
|
|
|
|
|
|
|
|
|
| 13 |
audio_config = speechsdk.audio.AudioConfig(filename=file)
|
| 14 |
|
| 15 |
speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
|
|
|
|
| 18 |
return result.text
|
| 19 |
|
| 20 |
|
| 21 |
+
def recognize_from_stream(stream, rec_cb):
    """Start continuous recognition on ``stream`` and return the recognizer.

    Builds a ``SpeechRecognizer`` from the module-level ``speech_config`` and
    an ``AudioConfig`` wrapping ``stream``, connects logging callbacks and
    ``rec_cb``, then starts continuous recognition without waiting for the
    start call to complete.

    Args:
        stream: Audio input stream handed to ``AudioConfig(stream=...)``.
        rec_cb: Callable connected to the recognizer's ``recognized`` signal.

    Returns:
        The started ``SpeechRecognizer``.
    """
    audio_config = speechsdk.audio.AudioConfig(stream=stream)
    speech_recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config,
        audio_config=audio_config,
    )

    def log_event(template):
        # Build a callback that prints the event formatted into `template`.
        return lambda evt: print(template.format(evt))

    def stop_cb(evt):
        # Stop continuous recognition once the session ends or is canceled.
        print('CLOSING on {}'.format(evt))
        speech_recognizer.stop_continuous_recognition()

    speech_recognizer.recognizing.connect(log_event('RECOGNIZING: {}'))
    speech_recognizer.recognized.connect(rec_cb)
    speech_recognizer.session_started.connect(log_event('SESSION STARTED: {}'))
    speech_recognizer.session_stopped.connect(log_event('SESSION STOPPED {}'))
    speech_recognizer.canceled.connect(log_event('CANCELED {}'))

    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    speech_recognizer.start_continuous_recognition_async()
    return speech_recognizer
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
|
| 44 |
if __name__ == '__main__':
    # Manual smoke test: push one sample file through the streaming recognizer.

    ## Recognize from file
    # for audio_file in os.listdir("audio_samples"):
    #     start = time.time()
    #     print(recognize_from_file(f"audio_samples/{audio_file}"), " in ", time.time()-start)

    ## real-time recognition
    def rec_cb(evt):
        """Print a separator followed by the recognized text."""
        print("##########################")
        print(evt.result.text)

    stream = PushAudioInputStream()
    # Use a context manager so the sample file is closed after reading.
    # NOTE(review): the whole .wav file, header included, is written to the
    # stream - confirm the recognizer tolerates that.
    with open("audio_samples/audo_0.wav", "rb") as sample:
        stream.write(sample.read())
    recognize_from_stream(stream, rec_cb)
    # Recognition runs asynchronously; keep the process alive for its callbacks.
    time.sleep(10)
|