# classification/app.py
import streamlit as st
from streamlit_mic_recorder import mic_recorder
#import whisper
import tempfile
import os
from pydub import AudioSegment
from faster_whisper import WhisperModel
# Load whisper model
@st.cache_resource
def load_model():
#return whisper.load_model("small")
return WhisperModel("large-v3", device="auto", compute_type="int8")
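# st.cache_resource keeps a single WhisperModel instance across Streamlit reruns;
# device="auto" picks a GPU when one is available, and compute_type="int8" trades a
# little transcription accuracy for lower memory use.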
whisper_model = load_model()
st.title("🗣 中文語音識別 (Whisper + Mic Recorder)")
mode = st.radio("選擇輸入方式", ["🎤 使用麥克風錄音", "📁 上傳本地音檔","✍️ 手動文字輸入"], horizontal=True)
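# prompt_switch returns the intent-classification prompt template for a given model name;
# the dictionary keys below must match the options offered in the model selectbox further down.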
def prompt_switch(model_name, input_text):
print("prompt_switch")
prompts = {
"Qwen/Qwen2.5-7B-Instruct-Turbo": """
You are an assistant for intent classification.
Your task is to classify a given user input into one of the following two categories:
1."Reservation": user input is related to or imply a a restaurant reservation.
2."unrelated": user input is anything else.
Your response should be in JSON format either
{{"result": "Reservation"}} or {{"result": "Others"}}.
If the user input is related to restaurant reservation, return {{"result": "Reservation"}};
If the user input is anything else, return {{"result": "Others"}}.
Here is the user input: {input}
""".strip(),
"Qwen/Qwen2.5-Coder-32B-Instruct": """
You are an assistant for intent classification.
Your task is to classify a given user input into one of the following two categories:
"Reservation": user input is related to or imply a a restaurant reservation.
"unrelated": user input is anything else.
Your response should be in JSON format either {{"result": "Reservation"}} or {{"result": "Others"}}.
Here is the user input: {input}
""".strip(),
"google/gemma-2b-it": """
You are an assistant for intent classification.
Your task is to classify a given user input into one of the following two categories:
1."Reservation": user input is related to or imply a a restaurant reservation.
2."unrelated": user input is anything else.
Your response should be in JSON format either
{{"result": "Reservation"}} or {{"result": "Others"}}.
If the user input is related to restaurant reservation, return {{"result": "Reservation"}};
If the user input is anything else, return {{"result": "Others"}}.
Here is the user input: {input}
""".strip(),
"google/gemma-2-9b-it": """
You are an assistant for intent classification.
Your task is to classify a given user input into one of the following two categories:
1."Reservation": user input is related to or imply a a restaurant reservation.
2."unrelated": user input is anything else.
Your response should be in JSON format either
{{"result": "Reservation"}} or {{"result": "Others"}}.
If the user input is related to restaurant reservation, return {{"result": "Reservation"}};
If the user input is anything else, return {{"result": "Others"}}.
Here is the user input: {input}
""".strip(),
"google/gemma-2-27b-it": """
You are an assistant for intent classification.
Your task is to classify a given user input into one of the following two categories:
1."Reservation": user input is related to or imply a a restaurant reservation.
2."unrelated": user input is anything else.
Your response should be in JSON format either
{{"result": "Reservation"}} or {{"result": "Others"}}.
If the user input is related to restaurant reservation, return {{"result": "Reservation"}};
If the user input is anything else, return {{"result": "Others"}}.
Here is the user input: {input}
""".strip()
}
    prompt = prompts[model_name].format(input=input_text)
return prompt
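# Example (hypothetical input): prompt_switch("google/gemma-2b-it", "book a table for two")
# returns that model's template with the user text substituted for {input}.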
model_option = st.selectbox(
"你要選擇哪一個模型?",
("Qwen/Qwen2.5-7B-Instruct-Turbo", "Qwen/Qwen2.5-Coder-32B-Instruct","google/gemma-2b-it", "google/gemma-2-9b-it", "google/gemma-2-27b-it")
)
st.write("你選擇的模型:", model_option)
import outlines
from outlines.models import openai
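# classification() asks the selected Together-hosted model for the intent label.
# outlines.generate.choice constrains decoding to the labels list, so the reply is one of
# "Reservation" or "unrelated" even though the prompts describe a JSON output format.
# TOGETHER_API_KEY must be set in the environment before this function is called.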
def classification(input_text):
st.write("🧠 LLM辨識意圖中..")
labels = ["Reservation", "unrelated"]
model = openai(
#"Qwen/Qwen2.5-Coder-32B-Instruct",
#"Qwen/Qwen2.5-7B-Instruct-Turbo",
#"google/gemma-2b-it",
model_option,
api_key=os.environ["TOGETHER_API_KEY"],
base_url="https://api.together.xyz/v1"
)
generator = outlines.generate.choice(model, labels)
    prompt_message = prompt_switch(model_option, input_text)
st.write(prompt_message)
answer = generator(prompt_message)
return answer
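# convert_audio_to_wav writes the raw bytes to a temp file, then uses pydub to re-encode
# to 16 kHz mono WAV (the rate Whisper models are trained on). pydub relies on ffmpeg
# being installed to decode mp3/m4a/ogg/flac input.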
def convert_audio_to_wav(audio_bytes, target_sample_rate=16000):
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_input:
temp_input.write(audio_bytes)
temp_input_path = temp_input.name
audio = AudioSegment.from_file(temp_input_path)
audio = audio.set_channels(1).set_frame_rate(target_sample_rate)
converted_path = temp_input_path.replace(".wav", "_converted.wav")
audio.export(converted_path, format="wav")
os.remove(temp_input_path)
return converted_path
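# transcribe_audio runs faster-whisper on the converted WAV, shows the joined segment
# text, removes the temp file, and hands the transcript to intent_classification.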
def transcribe_audio(wav_path):
st.write("🧠 Whisper 正在識別語音..")
#result = whisper_model.transcribe(wav_path, language="zh")
#st.text_area("📜 轉寫結果", result["text"], height=200)
segments, info = whisper_model.transcribe(wav_path, language="zh")
result_text = "".join([seg.text for seg in segments])
st.text_area("📜 轉寫結果", result_text, height=200)
os.remove(wav_path)
intent_classification(result_text)
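# intent_classification displays the label returned by classification().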
def intent_classification(input_text):
st.write("🧠 意圖識別 ")
    intent = classification(input_text)
st.write(intent)
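# The three input modes below converge on the same pipeline: audio is converted and
# transcribed first, while manual text goes straight to intent_classification.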
# --- Mode: Microphone ---
if mode == "🎤 使用麥克風錄音":
audio_data = mic_recorder(start_prompt="🎤 點擊開始錄音", stop_prompt="⏹️ 停止錄音", just_once=True, use_container_width=True)
if audio_data:
st.audio(audio_data["bytes"], format="audio/wav")
wav_path = convert_audio_to_wav(audio_data["bytes"])
transcribe_audio(wav_path)
# --- Mode: File Upload ---
elif mode == "📁 上傳本地音檔":
uploaded_file = st.file_uploader("上傳音頻文件 (支持 wav, mp3, m4a 等)", type=["wav", "mp3", "m4a", "ogg", "flac"])
if uploaded_file is not None:
st.audio(uploaded_file, format="audio/wav")
wav_path = convert_audio_to_wav(uploaded_file.read())
transcribe_audio(wav_path)
elif mode == "✍️ 手動文字輸入":
manual_text = st.text_area("請輸入文字", height=200, key="manual_input")
if st.button("確認輸入"):
st.success("✅ 已接收輸入內容!")
st.text_area("📜 輸入內容", manual_text, height=200, key="manual_output")
intent_classification(manual_text)
#==============