gent committed on
Commit
f96a97b
·
1 Parent(s): ab9cb62

realtime asr

Browse files
Files changed (10) hide show
  1. .gitignore +2 -1
  2. README.md +1 -1
  3. app.py +1 -1
  4. demo.py +0 -0
  5. demos/character.py +60 -0
  6. demos/chat.py +56 -0
  7. demos/example.py +24 -0
  8. demos/realtime_asr.py +53 -0
  9. demos/scene.py +60 -0
  10. utils/asr.py +40 -12
.gitignore CHANGED
@@ -1,5 +1,6 @@
1
  apps
2
-
 
3
  # Byte-compiled / optimized / DLL files
4
  __pycache__/
5
  *.py[cod]
 
1
  apps
2
+ audio_samples
3
+ flagged
4
  # Byte-compiled / optimized / DLL files
5
  __pycache__/
6
  *.py[cod]
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: green
5
  colorTo: yellow
6
  sdk: gradio
7
  sdk_version: 3.32.0
8
- app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
  ---
 
5
  colorTo: yellow
6
  sdk: gradio
7
  sdk_version: 3.32.0
8
+ app_file: demos/realtime_asr.py
9
  pinned: false
10
  license: apache-2.0
11
  ---
app.py CHANGED
@@ -111,7 +111,7 @@ def main():
111
 
112
  btn.click(export, None, [ chatbot, outputs])
113
 
114
- demo.launch()
115
 
116
  if __name__ == "__main__":
117
  main()
 
111
 
112
  btn.click(export, None, [ chatbot, outputs])
113
 
114
+ demo.launch(show_error=True)
115
 
116
  if __name__ == "__main__":
117
  main()
demo.py ADDED
File without changes
demos/character.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import time
from utils.asr import recognize_from_file
from utils.tts import tts
from utils.llm import generate_response

# Conversation state in OpenAI-style message dicts ({'role', 'content'}).
# NOTE(review): module-level state is shared across all browser sessions —
# acceptable for a single-user demo; confirm before multi-user deployment.
chat_history = []


def convert_chatbox(chat_history):
    """Flatten message dicts into 'role: content' strings for display."""
    return [f"{i['role']}: {i['content']}" for i in chat_history]


with gr.Blocks() as demo:
    character = gr.Textbox(lines=5, label="Character", placeholder="Enter your character name")

    def init(character):
        """Reset the conversation with a system prompt impersonating `character`."""
        global chat_history
        # Fix: original prompt said "causal conversation"; the intent
        # (small talk with a role-played character) is "casual".
        chat_history = [
            {"role": "system", "content": "Pretend to be a character and have a casual conversation with the user. Respond to the user's questions and ask questions back. The responses start with 'character:' if you are the character and 'GPT:' if you are ChatGPT. You are the character and never respond as the user. You are a character: " + character},
        ]

    init_bt = gr.Button("Init", type="button", label="Start")
    init_bt.click(init, [character], )

    chatbot = gr.Chatbot()
    with gr.Row():
        msg = gr.Textbox()
        audio = gr.Audio(source="microphone", type="filepath", streaming=False)

    def respond(message):
        """Send `message` to the LLM and append the exchange to the chat box.

        Returns (None, pairs): None clears the input textbox; `pairs` is the
        (user, assistant) tuple list rendered by the Chatbot component.
        """
        chat_history.append({'role': 'user', 'content': message})
        result = generate_response(chat_history)
        mesg = result['choices'][0]['message']
        print("recv: ", mesg)

        response = mesg['content']
        chat_history.append(mesg)

        # Display history lives on the component itself (see NOTE above).
        chatbot.value.append((message, response))
        print("chat_history: ", chatbot.value)

        return None, chatbot.value

    msg.submit(respond, [msg], [msg, chatbot])

    def transcribe(audio_file):
        """Run ASR on the recorded clip, then route the text through respond()."""
        print("start transcribe, ", audio_file)

        start = time.time()
        text = recognize_from_file(audio_file)
        print("use ", time.time() - start)

        print("transcribe done, ", text)
        return respond(text)

    audio.change(transcribe, [audio], [audio, chatbot])

demo.launch()
demos/chat.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import time
from utils.asr import recognize_from_file
from utils.tts import tts
from utils.llm import generate_response

# Conversation state in OpenAI-style message dicts ({'role', 'content'}).
# NOTE(review): module-level state is shared across all browser sessions —
# acceptable for a single-user demo; confirm before multi-user deployment.
chat_history = []


def convert_chatbox(chat_history):
    """Flatten message dicts into 'role: content' strings for display."""
    return [f"{i['role']}: {i['content']}" for i in chat_history]


with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    with gr.Row():
        msg = gr.Textbox()
        audio = gr.Audio(source="microphone", type="filepath", streaming=False)
        player = gr.Audio( type="filepath", label="Speaker",interactive=False)
    clear = gr.Button("Clear")

    def respond(message):
        """Send `message` to the LLM, synthesize the reply, update the chat box.

        Returns (None, wav_path, pairs): None clears the input textbox,
        wav_path feeds the Speaker player, pairs refresh the Chatbot.
        """
        chat_history.append({'role': 'user', 'content': message})
        result = generate_response(chat_history)
        mesg = result['choices'][0]['message']
        print("recv: ", mesg)

        response = mesg['content']
        chat_history.append(mesg)

        # Synthesize the reply and persist it for the audio player.
        # NOTE(review): a fixed "temp.wav" path is clobbered by concurrent
        # sessions — confirm single-user use or switch to tempfile.
        result = tts(response)
        with open("temp.wav", "wb") as audio_file:
            audio_file.write(result.audio_data)

        print("write to temp.wav")

        chatbot.value.append((message, response))
        print("chat_history: ", chatbot.value)

        return None, "temp.wav", chatbot.value

    msg.submit(respond, [msg], [msg, player, chatbot])

    def transcribe(audio_file):
        """Run ASR on the recorded clip, then route the text through respond()."""
        print("start transcribe, ", audio_file)

        start = time.time()
        text = recognize_from_file(audio_file)
        print("use ", time.time() - start)

        print("transcribe done, ", text)
        return respond(text)

    audio.change(transcribe, [audio], [audio, player, chatbot])

    def clear_all():
        """Reset the whole conversation, not just the on-screen widget.

        Fix: the original cleared only the Chatbot display, leaving
        `chat_history` (LLM context) and `chatbot.value` (display store)
        intact, so the next turn re-sent and re-displayed the supposedly
        cleared conversation.
        """
        chat_history.clear()
        chatbot.value = []
        return None

    clear.click(clear_all, None, chatbot, queue=False)

demo.launch()
demos/example.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from transformers import pipeline
import gradio as gr
import time

# Default Hugging Face ASR pipeline (the library picks the model).
asr = pipeline("automatic-speech-recognition")


def transcribe(audio, state=""):
    """Append the transcription of the latest chunk to the running text.

    `state` is the accumulated transcript fed back in by Gradio on each
    streaming call; it is returned twice — once for the textbox, once for
    the hidden state slot.
    """
    print(time.time(), audio)
    time.sleep(2)  # presumably paces the streaming callbacks — TODO confirm
    updated = state + asr(audio)["text"] + " "
    return updated, updated


gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True),
        "state",
    ],
    outputs=[
        "textbox",
        "state",
    ],
    live=True,
).launch()
demos/realtime_asr.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import numpy as np
from utils import recognize_from_stream, generate_response
from azure.cognitiveservices.speech.audio import PushAudioInputStream, AudioStreamFormat

import gradio as gr
import os

import time

# Push stream the browser microphone chunks are written into.
stream = PushAudioInputStream(AudioStreamFormat(48000,))  # sample rate is important

msg_queue = []

# OpenAI-style message dicts accumulated across the session.
chat_history = []


def rec_cb(evt):
    """Azure 'recognized' callback: feed each final utterance to the LLM."""
    print("##########################")
    print(evt.result.text)
    utterance = evt.result.text
    if utterance:
        chat_history.append({'role': 'user', 'content': utterance})
        reply = generate_response(chat_history)
        chat_history.append(reply['choices'][0]['message'])


# Keep a reference so continuous recognition stays alive for the process.
speech_recognizer = recognize_from_stream(stream, rec_cb)


def transcribe(speech):
    """Push one streamed microphone chunk to Azure; return chat pairs so far."""
    sample_rate, samples = speech
    # print(time.time(), (sample_rate, len(samples)))

    stream.write(samples.tobytes())

    # Pair up user/assistant turns for the Chatbot widget; zip drops a
    # trailing user turn whose reply has not arrived yet.
    users = []
    replies = []
    for entry in chat_history:
        if entry['role'] == 'user':
            users.append(entry['content'])
        elif entry['role'] == 'assistant':
            replies.append(entry['content'])

    return list(zip(users, replies))


demo = gr.Interface(
    title="实时语音识别",
    description="使用Azure的语音识别服务,实时识别麦克风输入的语音。",
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="numpy", streaming=True),
    ],
    outputs=[
        gr.Chatbot(),
    ],
    flagging_callback=gr.SimpleCSVLogger(),
    live=True)

if __name__ == '__main__':
    demo.launch()
demos/scene.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import time
3
+ from utils.asr import recognize_from_file
4
+ from utils.tts import tts
5
+ from utils.llm import generate_response
6
+
7
+ chat_history = [
8
+ ]
9
+ def convert_chatbox(chat_history):
10
+ return [f"{i['role']}: {i['content']}" for i in chat_history]
11
+
12
+ with gr.Blocks() as demo:
13
+ scene = gr.Textbox(lines=5, label="scene", placeholder="Enter your scene name")
14
+
15
+ def init(scene):
16
+ global chat_history
17
+ chat_history=[
18
+ {"role": "system", "content": "Pretend to be a character and have a causal conversation with the user. Respond to the user's questions and ask questions back. The responses start with 'character:' if you are the character and 'GPT:' if you are ChatGPT. You are the character and never respond as the user. The conversation happens in the scene: " + scene},
19
+ ]
20
+
21
+
22
+ init_bt=gr.Button("Init", type="button", label="Start")
23
+ init_bt.click(init, [scene], )
24
+
25
+ chatbot = gr.Chatbot()
26
+ with gr.Row():
27
+ msg = gr.Textbox()
28
+ audio = gr.Audio(source="microphone", type="filepath", streaming=False)
29
+
30
+
31
+ def respond(message):
32
+ # TODO: replace this with real GPT model
33
+ chat_history.append({'role': 'user', 'content': message})
34
+ result = generate_response(chat_history)
35
+ mesg=result['choices'][0]['message']
36
+ print("recv: ", mesg)
37
+
38
+ response = mesg['content']
39
+ chat_history.append(mesg)
40
+
41
+ chatbot.value.append((message,response))
42
+ print("chat_history: ", chatbot.value)
43
+
44
+ return None, chatbot.value
45
+
46
+ msg.submit(respond, [msg], [msg,chatbot])
47
+
48
+ def transcribe(audio_file):
49
+ print("start transcribe, ", audio_file)
50
+
51
+ start = time.time()
52
+ text = recognize_from_file(audio_file)
53
+ print("use ", time.time()-start)
54
+
55
+ print("transcribe done, ", text)
56
+ return respond(text)
57
+
58
+ audio.change(transcribe, [audio], [audio, chatbot])
59
+
60
+ demo.launch()
utils/asr.py CHANGED
@@ -1,19 +1,15 @@
1
  import azure.cognitiveservices.speech as speechsdk
 
2
  import os
3
  import time
4
  # Replace with your own subscription key and service region
5
 
6
- def get_recoginizer(**kwargs):
7
- speech_config = speechsdk.SpeechConfig(subscription=os.environ.get('SPEECH_KEY'), region=os.environ.get('SPEECH_REGION'))
8
- speech_config.speech_recognition_language="en-US"
9
- audio_config = speechsdk.audio.AudioConfig(**kwargs)
10
- return speechsdk.SpeechRecognizer(speech_config=speech_config,audio_config=audio_config)
11
 
12
  def recognize_from_file(file=None):
13
  # This example requires environment variables named "SPEECH_KEY" and "SPEECH_REGION"
14
- speech_config = speechsdk.SpeechConfig(subscription=os.environ.get('SPEECH_KEY',''), region=os.environ.get('SPEECH_REGION',''))
15
- speech_config.speech_recognition_language="en-US"
16
-
17
  audio_config = speechsdk.audio.AudioConfig(filename=file)
18
 
19
  speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
@@ -22,8 +18,40 @@ def recognize_from_file(file=None):
22
  return result.text
23
 
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  if __name__ == '__main__':
26
- for audio_file in os.listdir("audio_samples"):
27
- start = time.time()
28
- print(recognize_from_file(f"audio_samples/{audio_file}"), " in ", time.time()-start)
29
-
 
 
 
 
 
 
 
 
 
 
1
  import azure.cognitiveservices.speech as speechsdk
2
+ from azure.cognitiveservices.speech.audio import PushAudioInputStream
3
  import os
4
  import time
5
  # Replace with your own subscription key and service region
6
 
7
+ speech_config = speechsdk.SpeechConfig(subscription=os.environ.get('SPEECH_KEY',''), region=os.environ.get('SPEECH_REGION',''))
8
+ speech_config.speech_recognition_language="en-US"
9
+
 
 
10
 
11
  def recognize_from_file(file=None):
12
  # This example requires environment variables named "SPEECH_KEY" and "SPEECH_REGION"
 
 
 
13
  audio_config = speechsdk.audio.AudioConfig(filename=file)
14
 
15
  speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
 
18
  return result.text
19
 
20
 
21
def recognize_from_stream(stream, rec_cb):
    """Start continuous Azure recognition on `stream`, invoking `rec_cb`
    for every finalized utterance.

    Returns the live SpeechRecognizer; the caller keeps writing audio into
    `stream` and should hold the returned reference so recognition is not
    garbage-collected.
    """
    audio_config = speechsdk.audio.AudioConfig(stream=stream)
    recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    # Verbose lifecycle logging for debugging, plus the caller's hook on
    # finalized results.
    recognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt)))
    recognizer.recognized.connect(rec_cb)
    recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
    recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
    recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))

    def stop_cb(evt):
        # Tear recognition down once the service stops or cancels the session.
        print('CLOSING on {}'.format(evt))
        recognizer.stop_continuous_recognition()

    recognizer.session_stopped.connect(stop_cb)
    recognizer.canceled.connect(stop_cb)

    # Fire-and-forget: returns immediately, recognition runs in background.
    recognizer.start_continuous_recognition_async()
    return recognizer
41
+
42
+
43
+
44
if __name__ == '__main__':
    ## Recognize from file
    # for audio_file in os.listdir("audio_samples"):
    #     start = time.time()
    #     print(recognize_from_file(f"audio_samples/{audio_file}"), " in ", time.time()-start)

    ## real-time recognition: replay a sample file through the push stream
    def rec_cb(evt):
        """Print each finalized utterance from the continuous recognizer."""
        print("##########################")
        print(evt.result.text)

    stream = PushAudioInputStream()
    # Fix: the original `open(...).read()` leaked the file handle; close it
    # deterministically with a context manager.
    with open("audio_samples/audo_0.wav", "rb") as sample:
        stream.write(sample.read())
    recognize_from_stream(stream, rec_cb)
    # NOTE(review): `stream` is never closed, so the final segment may not be
    # flushed before the sleep expires — consider stream.close() + a wait.
    time.sleep(10)