Spaces:

zeithrold
/

voice-chatbot

Runtime error

App Files Files Community

Yi Jin committed on Mar 7, 2024

Commit

177a062

unverified ·

1 Parent(s): a7c4935

Uploaded beta code

Browse files

Files changed (6) hide show

.gitignore +2 -0
.vscode/settings.json +0 -4
app.py +124 -0
iat.py +24 -4
requirements.txt +2 -1
tts.py +45 -26

.gitignore CHANGED Viewed

	@@ -177,3 +177,5 @@ pyrightconfig.json
177
178	config.yaml
179


177
178	config.yaml
179
180	+
181	+ .vscode/

.vscode/settings.json DELETED Viewed

@@ -1,4 +0,0 @@
-{
-    "python.analysis.autoImportCompletions": true,
-    "python.analysis.typeCheckingMode": "basic"
-}

app.py ADDED Viewed

	@@ -0,0 +1,124 @@

+import gradio as gr
+import os
+from loguru import logger
+from zhipuai import ZhipuAI
+from zhipuai.api_resource.chat.chat import Chat
+import yaml
+import json
+from iat import IATClient
+from tts import TTSClient
+import numpy as np
+from scipy.signal import resample
+logger.debug("Loading config")
+config_env = os.environ.get("CONFIG", "")
+if config_env:
+    logger.debug("Using environment variable for config")
+    config = json.loads(config_env)
+else:
+    logger.debug("Reading config from file")
+    with open("config.yaml", "r") as f:
+        try:
+            config = yaml.safe_load(f)["config"]
+        except yaml.YAMLError as e:
+            logger.error(e)
+            raise e
+zhipuai_config = config["zhipuai"]
+xfyun_config = config["xfyun"]
+zhipuai = ZhipuAI(api_key=zhipuai_config["apikey"])
+iat = IATClient(
+    xfyun_config["iat"]["appid"],
+    xfyun_config["iat"]["apikey"],
+    xfyun_config["iat"]["apisecret"],
+)
+tts = TTSClient(
+    xfyun_config["tts"]["appid"],
+    xfyun_config["tts"]["apikey"],
+    xfyun_config["tts"]["apisecret"],
+)
+def build_zhipuai_history(history: list[list[str]]):
+    result = [{"role": "system", "content": config["zhipuai"]["prompt"]}]
+    for history_element in history:
+        user_message, assistant_message = history_element
+        if user_message != None:
+            result += [{"role": "user", "content": user_message}]
+        if assistant_message != None:
+            result += [{"role": "assistant", "content": assistant_message}]
+    return result
+def add_text(history, text):
+    history = history + [(text, None)]
+    return history, gr.Textbox(value="", interactive=False)
+def bot(history):
+    zhipuai_history = build_zhipuai_history(history)
+    res = zhipuai.chat.completions.create(
+        model="glm-4", messages=zhipuai_history, stream=True
+    )
+    history[-1][1] = ""
+    for chunk in res:
+        history[-1][1] += chunk.choices[0].delta.content
+        yield history
+async def generate_text(audio: tuple[int, np.ndarray]):
+    logger.debug(f"Generating text from audio")
+    logger.debug(f"Sampling rate: {audio[0]}, resampling to 16000")
+    audio = (16000, resample(audio[1], 16000))
+    result_list = []
+    async for result in iat.dictate(audio):
+        logger.debug(f"Result: {result}")
+        result_list.append(result)
+    return "".join(result_list)
+async def generate_audio(history: list[list[str]]):
+    logger.debug(f"Generating audio from text")
+    text = history[-1][-1]
+    result = await tts.generate(text)
+    return result
+# Build the Gradio UI: a chat window, a text input row, and two audio panels
+# (speech recognition for the user, speech synthesis for the bot).
+with gr.Blocks() as demo:
+    title = gr.Markdown("# 老王元宇宙受害者")
+    # Chat transcript; starts empty.
+    chatbot = gr.Chatbot(
+        [],
+        elem_id="chatbot",
+        bubble_full_width=False,
+    )
+    with gr.Row():
+        # Free-text input for the user's message.
+        txt = gr.Textbox(
+            scale=4,
+            show_label=False,
+            placeholder="Enter text and press enter",
+            container=False,
+        )
+        # "Submit" button. NOTE(review): only this button's click is wired
+        # below; no txt.submit handler is registered in this block, although
+        # the placeholder mentions pressing enter.
+        submit_button = gr.Button(value="提交", variant="primary")
+    with gr.Row():
+        with gr.Column():
+            # User speech-recognition panel: microphone recording as a numpy array.
+            user_title = gr.Markdown("## 用户语音识别")
+            user_audio = gr.Audio(type="numpy", sources=['microphone'])
+            user_audio_submit = gr.Button(value="上传用户语音并转换", variant="primary")
+        with gr.Column():
+            # Bot speech-synthesis panel.
+            # NOTE(review): this rebinds user_title, so the first heading is
+            # no longer reachable by name (neither is referenced afterwards).
+            user_title = gr.Markdown("## 机器人语音合成")
+            bot_audio = gr.Audio()
+            bot_audio_submit = gr.Button(value="将机器人最后一个回复转换为语音", variant="primary")
+    # Transcribe the recording and place the text into the input box.
+    user_audio_submit.click(generate_text, [user_audio], outputs=txt)
+    # Synthesize audio from the chat history's last reply.
+    bot_audio_submit.click(generate_audio, [chatbot], outputs=bot_audio)
+    # On submit: append the user turn and lock the textbox, then stream the
+    # bot reply into the chat window.
+    txt_msg = submit_button.click(add_text, [chatbot, txt], [chatbot, txt], queue=False).then(
+        bot, chatbot, chatbot, api_name="bot_response"
+    )
+    # After the reply has finished, make the textbox interactive again.
+    txt_msg.then(lambda: gr.Textbox(interactive=True), None, [txt], queue=False)
+# Start the app at import time (module-level side effect).
+demo.launch()

iat.py CHANGED Viewed

@@ -7,6 +7,7 @@ import base64
 import numpy as np
 import json
 import websockets.client
 from websockets.exceptions import ConnectionClosedError
 import time
@@ -52,29 +53,35 @@ class IATClient:
         path = parse_result.path
         sign_raw_str = f"host: {host}\ndate: {date}\nGET {path} HTTP/1.1"
         sign_sha = hmac.new(
             self.api_secret.encode("utf-8"),
             sign_raw_str.encode("utf-8"),
             digestmod=hashlib.sha256,
         ).digest()
-        auth_origin = 'api_key="%s", algorithm="%s", headers="%s", signature="%s"' % (
             self.api_key,
             "hmac-sha256",
             "host date request-line",
             sign_sha,
         )
-        auth = base64.b64encode(auth_origin.encode("utf-8")).decode("utf-8")
         params = {
             "authorization": auth,
             "date": date,
             "host": host,
         }
         url = f"{self.endpoint}?{urlencode(params)}"
         return url
     def prepare_data(self, audio: bytes, chunk_size=1280, sampling_rate=16000):
         status = STATUS_FIRST_FRAME
         for i in range(0, len(audio), chunk_size):
             chunk = audio[i : i + chunk_size]
             if i + chunk_size >= len(audio):
                 status = STATUS_LAST_FRAME
@@ -91,16 +98,29 @@ class IATClient:
             yield payload
             status = STATUS_CONTINUE_FRAME
-    async def dictate(self, audio: tuple[int, np.ndarray], interval=0.4):
         url = self.create_url()
         sampling_rate, source = audio
         pcm = self.encode_pcm(source)
         async with websockets.client.connect(url) as ws:
             for payload in self.prepare_data(pcm, sampling_rate=sampling_rate):
                 await ws.send(json.dumps(payload))
                 time.sleep(interval)
             try:
                 async for message in ws:
-                    yield message
             except ConnectionClosedError as e:
                 print(f"Connection closed: {e.code} {e.reason}")

 import numpy as np
 import json
 import websockets.client
+from loguru import logger
 from websockets.exceptions import ConnectionClosedError
 import time
         path = parse_result.path
         sign_raw_str = f"host: {host}\ndate: {date}\nGET {path} HTTP/1.1"
+        logger.debug(f"Sign raw string: {sign_raw_str}")
         sign_sha = hmac.new(
             self.api_secret.encode("utf-8"),
             sign_raw_str.encode("utf-8"),
             digestmod=hashlib.sha256,
         ).digest()
+        sign_sha = base64.b64encode(sign_sha).decode("utf-8")
+        auth_raw_str = 'api_key="%s", algorithm="%s", headers="%s", signature="%s"' % (
             self.api_key,
             "hmac-sha256",
             "host date request-line",
             sign_sha,
         )
+        logger.debug(f"Authorization: {auth_raw_str}")
+        auth = base64.b64encode(auth_raw_str.encode("utf-8")).decode("utf-8")
         params = {
             "authorization": auth,
             "date": date,
             "host": host,
         }
         url = f"{self.endpoint}?{urlencode(params)}"
+        logger.debug(f"URL: {url}")
         return url
     def prepare_data(self, audio: bytes, chunk_size=1280, sampling_rate=16000):
         status = STATUS_FIRST_FRAME
+        logger.debug(f"Total audio length: {len(audio)}")
         for i in range(0, len(audio), chunk_size):
+            logger.debug(f"Processing chunk {i} to {i + chunk_size}")
             chunk = audio[i : i + chunk_size]
             if i + chunk_size >= len(audio):
                 status = STATUS_LAST_FRAME
             yield payload
             status = STATUS_CONTINUE_FRAME
+    async def dictate(self, audio: tuple[int, np.ndarray], interval=0.04):
+        logger.debug(f"Generate URL")
         url = self.create_url()
+        logger.debug("Encoding audio to PCM")
         sampling_rate, source = audio
         pcm = self.encode_pcm(source)
         async with websockets.client.connect(url) as ws:
             for payload in self.prepare_data(pcm, sampling_rate=sampling_rate):
+                logger.debug('Sending payload')
                 await ws.send(json.dumps(payload))
                 time.sleep(interval)
             try:
                 async for message in ws:
+                    data: dict = json.loads(message)
+                    logger.debug(f"Received data: {data}")
+                    if not 'data' in data.keys():
+                        yield ''
+                        break
+                    is_end = data["data"]["status"] == STATUS_LAST_FRAME
+                    ws_list = data["data"]["result"]["ws"]
+                    text = ''.join([cw["w"] for cw in sum([ws["cw"] for ws in ws_list], [])])
+                    yield text
+                    if is_end:
+                        break
             except ConnectionClosedError as e:
                 print(f"Connection closed: {e.code} {e.reason}")

requirements.txt CHANGED Viewed

@@ -1,7 +1,8 @@
 gradio
 jupyter
 requests
-websocket-client
 zhipuai
 loguru
 numpy

 gradio
 jupyter
 requests
+websockets
 zhipuai
 loguru
 numpy
+scipy

tts.py CHANGED Viewed

@@ -1,13 +1,14 @@
 from urllib.parse import urlparse, urlencode
 from wsgiref.handlers import format_date_time
 from datetime import datetime
 import hmac
 import hashlib
 import base64
 import numpy as np
 import json
 import websockets.client
-from websockets.exceptions import ConnectionClosedError
 import time
 STATUS_FIRST_FRAME = 0
@@ -16,7 +17,6 @@ STATUS_LAST_FRAME = 2
 class TTSClient:
     def __init__(
         self,
         app_id: str,
@@ -29,22 +29,24 @@ class TTSClient:
         self.api_secret = api_secret
         self.endpoint = endpoint
         self.common_args = {"app_id": self.app_id}
-        self.business_args = {
             "aue": "raw",
-            "auf": "audio/L16;rate=16000",
             "vcn": "xiaoyan",
             "tte": "utf8",
         }
-    def prepare_data(self, text: str):
-        return {
             "common": self.common_args,
-            "business": self.business_args,
             "data": {
                 "status": 2,
                 "text": str(base64.b64encode(text.encode("utf-8")), "UTF8"),
             },
         }
     def create_url(self):
         parse_result = urlparse(self.endpoint)
@@ -54,45 +56,62 @@ class TTSClient:
         path = parse_result.path
         sign_raw_str = f"host: {host}\ndate: {date}\nGET {path} HTTP/1.1"
         sign_sha = hmac.new(
             self.api_secret.encode("utf-8"),
             sign_raw_str.encode("utf-8"),
             digestmod=hashlib.sha256,
         ).digest()
-        auth_origin = 'api_key="%s", algorithm="%s", headers="%s", signature="%s"' % (
             self.api_key,
             "hmac-sha256",
             "host date request-line",
             sign_sha,
         )
-        auth = base64.b64encode(auth_origin.encode("utf-8")).decode("utf-8")
         params = {
             "authorization": auth,
             "date": date,
             "host": host,
         }
         url = f"{self.endpoint}?{urlencode(params)}"
         return url
     def parse_result(self, result: bytes) -> np.ndarray:
         return np.frombuffer(result, dtype=np.int16)
-    async def generate(self, text: str):
         url = self.create_url()
-        data = self.prepare_data(text)
         result = bytearray()
-        async with websockets.client.connect(url) as ws:
-            await ws.send(json.dumps(data))
-            while True:
-                try:
-                    message = await ws.recv()
-                    message = json.loads(message)
-                    audio = message["data"]["audio"]
-                    audio = base64.b64decode(audio)
-                    status = message["data"]["status"]
-                    result += audio
-                    if status == STATUS_LAST_FRAME:
                         break
-                except ConnectionClosedError:
-                    break
-        return self.parse_result(bytes(result))

 from urllib.parse import urlparse, urlencode
 from wsgiref.handlers import format_date_time
 from datetime import datetime
+from loguru import logger
 import hmac
 import hashlib
 import base64
 import numpy as np
 import json
 import websockets.client
+from websockets.exceptions import ConnectionClosedError, InvalidStatusCode
 import time
 STATUS_FIRST_FRAME = 0
 class TTSClient:
     def __init__(
         self,
         app_id: str,
         self.api_secret = api_secret
         self.endpoint = endpoint
         self.common_args = {"app_id": self.app_id}
+    def prepare_data(self, text: str, sampling_rate=16000):
+        business_args = {
             "aue": "raw",
+            "auf": f"audio/L16;rate={sampling_rate}",
             "vcn": "xiaoyan",
             "tte": "utf8",
         }
+        result = {
             "common": self.common_args,
+            "business": business_args,
             "data": {
                 "status": 2,
                 "text": str(base64.b64encode(text.encode("utf-8")), "UTF8"),
             },
         }
+        logger.debug(f"Data: {result}")
+        return result
     def create_url(self):
         parse_result = urlparse(self.endpoint)
         path = parse_result.path
         sign_raw_str = f"host: {host}\ndate: {date}\nGET {path} HTTP/1.1"
+        logger.debug(f"Sign raw string: {sign_raw_str}")
         sign_sha = hmac.new(
             self.api_secret.encode("utf-8"),
             sign_raw_str.encode("utf-8"),
             digestmod=hashlib.sha256,
         ).digest()
+        sign_sha = base64.b64encode(sign_sha).decode("utf-8")
+        auth_raw_str = 'api_key="%s", algorithm="%s", headers="%s", signature="%s"' % (
             self.api_key,
             "hmac-sha256",
             "host date request-line",
             sign_sha,
         )
+        logger.debug(f"Authorization: {auth_raw_str}")
+        auth = base64.b64encode(auth_raw_str.encode("utf-8")).decode("utf-8")
         params = {
             "authorization": auth,
             "date": date,
             "host": host,
         }
         url = f"{self.endpoint}?{urlencode(params)}"
+        logger.debug(f"URL: {url}")
         return url
     def parse_result(self, result: bytes) -> np.ndarray:
         """Interpret raw bytes as a one-dimensional array of int16 samples."""
         samples = np.frombuffer(result, dtype=np.int16)
         return samples
+    async def generate(self, text: str, sampling_rate=16000):
+        """Synthesize *text* over a websocket and return the audio.
+
+        Sends the payload from ``prepare_data`` and accumulates the
+        Base64-decoded ``data.audio`` field of every received message until a
+        message with status ``STATUS_LAST_FRAME`` arrives.
+
+        Returns:
+            ``(sampling_rate, samples)`` where ``samples`` is the result of
+            ``parse_result`` on the accumulated bytes.
+
+        Raises:
+            InvalidStatusCode: logged and re-raised when raised while
+                connecting or exchanging data.
+        """
+        logger.debug("Generate URL")
         url = self.create_url()
+        logger.debug("Preparing Data")
+        data = self.prepare_data(text, sampling_rate)
         result = bytearray()
+        try:
+            async with websockets.client.connect(url) as ws:
+                logger.debug("Sending Data")
+                await ws.send(json.dumps(data))
+                while True:
+                    try:
+                        message = await ws.recv()
+                        message = json.loads(message)
+                        logger.debug(f"Received message: {message}")
+                        # NOTE(review): assumes every message carries data.audio;
+                        # a message without it raises here - confirm error format.
+                        audio = message["data"]["audio"]
+                        logger.debug(f"Received audio length: {len(audio)}")
+                        audio = base64.b64decode(audio)
+                        status = message["data"]["status"]
+                        result += audio
+                        if status == STATUS_LAST_FRAME:
+                            break
+                    except ConnectionClosedError:
+                        # Connection dropped: stop receiving and return the
+                        # audio accumulated so far.
                         break
+        except InvalidStatusCode as e:
+            logger.error(f"Error: {e}")
+            raise e
+        logger.success("Audio generation finished")
+        return sampling_rate, self.parse_result(bytes(result))
+__all__ = ["TTSClient"]