Spaces:

Nguyen5
/

chatbot1

Sleeping

App Files Community

Nguyen5 committed on Dec 9, 2025

Commit

80c3670

1 Parent(s): 54f3783

commit

Browse files

Files changed (2) hide show

app.py +11 -42
speech_io.py +26 -51

app.py CHANGED Viewed

@@ -2,11 +2,7 @@
 # Version 26.11 – ohne Modi, stabil für Text + Voice
 import gradio as gr
-try:
-    from gradio_pdf import PDF
-    _HAS_PDF = True
-except ImportError:
-    _HAS_PDF = False
 from huggingface_hub import hf_hub_download
 from load_documents import load_documents, DATASET, PDF_FILE, HTML_FILE
@@ -17,10 +13,6 @@ from llm import load_llm
 from rag_pipeline import answer, PDF_BASE_URL, LAW_URL
 from speech_io import transcribe_audio, synthesize_speech
-from fastapi import FastAPI
-import realtime_server as rt
-import sys
-sys.dont_write_bytecode = True
 # =====================================================
 # INITIALISIERUNG (global)
@@ -105,7 +97,7 @@ def chatbot_text(user_message, history):
 # VOICE CHATBOT
 # =====================================================
-def chatbot_voice(audio_path, history, tts_model_id):
     # 1. Speech → Text
     text = transcribe_audio(audio_path)
     if not text:
@@ -126,7 +118,7 @@ def chatbot_voice(audio_path, history, tts_model_id):
     history = history + [{"role": "assistant", "content": bot_msg}]
     # 3. Text → Speech
-    audio = synthesize_speech(bot_msg, tts_model_id)
     return history, audio, ""
@@ -134,13 +126,13 @@ def chatbot_voice(audio_path, history, tts_model_id):
 # LAST ANSWER → TTS
 # =====================================================
-def read_last_answer(history, tts_model_id=None):
     if not history:
         return None
     for msg in reversed(history):
         if msg["role"] == "assistant":
-            return synthesize_speech(msg["content"], tts_model_id)
     return None
@@ -158,7 +150,7 @@ with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache)") as demo:
     with gr.Row():
         with gr.Column(scale=2):
-            chatbot = gr.Chatbot(label="Chat", height=500)
             msg = gr.Textbox(
                 label="Frage eingeben",
@@ -183,36 +175,21 @@ with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache)") as demo:
             gr.Markdown("### 🎙️ Spracheingabe")
             voice_in = gr.Audio(sources=["microphone"], type="filepath")
             voice_out = gr.Audio(label="Vorgelesene Antwort", type="numpy")
-            tts_lang = gr.Dropdown(
-                label="TTS Sprache",
-                choices=[
-                    "facebook/mms-tts-deu",
-                    "facebook/mms-tts-vie",
-                    "facebook/mms-tts-eng",
-                ],
-                value="facebook/mms-tts-deu",
-            )
             voice_btn = gr.Button("Sprechen & senden")
             voice_btn.click(
                 chatbot_voice,
-                [voice_in, chatbot, tts_lang],
                 [chatbot, voice_out, msg]
             )
             read_btn = gr.Button("🔁 Antwort erneut vorlesen")
             read_btn.click(
                 read_last_answer,
-                [chatbot, tts_lang],
                 [voice_out]
             )
-            gr.Markdown("### ⚡ Voice (Realtime) – thử nghiệm")
-            gr.Markdown("Sử dụng OpenAI Realtime API cho hội thoại nói. Trang test chạy cùng máy chủ này.")
-            gr.HTML("""
-            <iframe src="/realtime/" style="width:100%;height:300px;border:1px solid #ccc"></iframe>
-            """)
             clear_btn = gr.Button("Chat zurücksetzen")
             clear_btn.click(lambda: [], None, chatbot)
@@ -222,21 +199,13 @@ with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache)") as demo:
         with gr.Column(scale=1):
             gr.Markdown("### 📄 Prüfungsordnung (PDF)")
-            if _HAS_PDF:
-                PDF(_pdf_path, height=350)
-            else:
-                gr.HTML(f'<iframe src="{PDF_BASE_URL}" style="width:100%;height:350px;border:none;"></iframe>')
             gr.Markdown("### 📘 Hochschulgesetz NRW (Website)")
             gr.HTML(
                 f'<iframe src="{LAW_URL}" style="width:100%;height:350px;border:none;"></iframe>'
             )
-# FastAPI app: mount Gradio + realtime server cùng một host
-app = FastAPI()
-app = gr.mount_gradio_app(app, demo, path="/")
-app.mount("/realtime", rt.app)
 if __name__ == "__main__":
-    import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=7860)

 # Version 26.11 – ohne Modi, stabil für Text + Voice
 import gradio as gr
+from gradio_pdf import PDF
 from huggingface_hub import hf_hub_download
 from load_documents import load_documents, DATASET, PDF_FILE, HTML_FILE
 from rag_pipeline import answer, PDF_BASE_URL, LAW_URL
 from speech_io import transcribe_audio, synthesize_speech
 # =====================================================
 # INITIALISIERUNG (global)
 # VOICE CHATBOT
 # =====================================================
+def chatbot_voice(audio_path, history):
     # 1. Speech → Text
     text = transcribe_audio(audio_path)
     if not text:
     history = history + [{"role": "assistant", "content": bot_msg}]
     # 3. Text → Speech
+    audio = synthesize_speech(bot_msg)
     return history, audio, ""
 # LAST ANSWER → TTS
 # =====================================================
+def read_last_answer(history):
     if not history:
         return None
     for msg in reversed(history):
         if msg["role"] == "assistant":
+            return synthesize_speech(msg["content"])
     return None
     with gr.Row():
         with gr.Column(scale=2):
+            chatbot = gr.Chatbot(type="messages", label="Chat", height=500)
             msg = gr.Textbox(
                 label="Frage eingeben",
             gr.Markdown("### 🎙️ Spracheingabe")
             voice_in = gr.Audio(sources=["microphone"], type="filepath")
             voice_out = gr.Audio(label="Vorgelesene Antwort", type="numpy")
             voice_btn = gr.Button("Sprechen & senden")
             voice_btn.click(
                 chatbot_voice,
+                [voice_in, chatbot],
                 [chatbot, voice_out, msg]
             )
             read_btn = gr.Button("🔁 Antwort erneut vorlesen")
             read_btn.click(
                 read_last_answer,
+                [chatbot],
                 [voice_out]
             )
             clear_btn = gr.Button("Chat zurücksetzen")
             clear_btn.click(lambda: [], None, chatbot)
         with gr.Column(scale=1):
             gr.Markdown("### 📄 Prüfungsordnung (PDF)")
+            PDF(_pdf_path, height=350)
             gr.Markdown("### 📘 Hochschulgesetz NRW (Website)")
             gr.HTML(
                 f'<iframe src="{LAW_URL}" style="width:100%;height:350px;border:none;"></iframe>'
             )
 if __name__ == "__main__":
+    demo.launch()

speech_io.py CHANGED Viewed

@@ -8,20 +8,18 @@ Sprachbasierte Ein-/Ausgabe:
 Dieses File ist 100% stabil für HuggingFace Spaces.
 """
-from typing import Optional, Tuple, Dict
 import numpy as np
 import soundfile as sf
 from scipy.signal import butter, filtfilt
 from transformers import pipeline
-import librosa
-import webrtcvad
 # Modelle
 ASR_MODEL_ID = "openai/whisper-small"
 TTS_MODEL_ID = "facebook/mms-tts-deu"
 _asr = None
-_tts_cache: Dict[str, any] = {}
 # ========================================================
 # STT PIPELINE
@@ -44,15 +42,15 @@ def get_asr_pipeline():
 # TTS PIPELINE
 # ========================================================
-def get_tts_pipeline(model_id: Optional[str] = None):
-    mid = model_id or TTS_MODEL_ID
-    if mid not in _tts_cache:
-        print(f">>> Lade TTS Modell: {mid}")
-        _tts_cache[mid] = pipeline(
             task="text-to-speech",
-            model=mid,
         )
-    return _tts_cache[mid]
 # ========================================================
 # AUDIO FILTER – Noise Reduction + Highpass
@@ -78,42 +76,6 @@ def apply_fade(audio, sr, duration_ms=10):
     return audio
-def _vad_trim(audio16: np.ndarray, sr: int) -> np.ndarray:
-    vad = webrtcvad.Vad(2)
-    frame_ms = 30
-    frame_len = int(sr * frame_ms / 1000)
-    if frame_len <= 0:
-        return audio16
-    start = 0
-    end = len(audio16)
-    voiced = []
-    i = 0
-    while i + frame_len <= len(audio16):
-        frame = audio16[i:i+frame_len]
-        is_voiced = vad.is_speech(frame.tobytes(), sample_rate=sr)
-        voiced.append(is_voiced)
-        i += frame_len
-    first = next((idx for idx, v in enumerate(voiced) if v), None)
-    last = next((len(voiced)-1-idx for idx, v in enumerate(reversed(voiced)) if v), None)
-    if first is None or last is None or last < first:
-        return audio16
-    start = first * frame_len
-    end = min((last + 1) * frame_len, len(audio16))
-    return audio16[start:end]
-def preprocess_audio_for_stt(raw: np.ndarray, sr: int) -> Tuple[np.ndarray, int]:
-    if raw.ndim > 1:
-        raw = raw[:, 0]
-    y = librosa.to_mono(raw.astype(np.float32))
-    y = librosa.resample(y, orig_sr=sr, target_sr=16000)
-    y = y / (np.max(np.abs(y)) + 1e-9)
-    y16 = np.clip(y * 32767, -32768, 32767).astype(np.int16)
-    y16 = _vad_trim(y16, 16000)
-    max_samples = 16000 * 30
-    if len(y16) > max_samples:
-        y16 = y16[:max_samples]
-    return y16.astype(np.float32) / 32767.0, 16000
 # ========================================================
 # SPEECH-TO-TEXT (STT)
 # ========================================================
@@ -126,11 +88,24 @@ def transcribe_audio(audio_path: str) -> str:
     if audio_path is None:
         return ""
     data, sr = sf.read(audio_path)
-    data, sr = preprocess_audio_for_stt(data, sr)
     asr = get_asr_pipeline()
     print(">>> Transkribiere Audio...")
-    result = asr({"array": data, "sampling_rate": sr})
     text = result.get("text", "").strip()
     print("ASR:", text)
@@ -140,11 +115,11 @@ def transcribe_audio(audio_path: str) -> str:
 # TEXT-TO-SPEECH (TTS)
 # ========================================================
-def synthesize_speech(text: str, tts_model_id: Optional[str] = None):
     if not text or not text.strip():
         return None
-    tts = get_tts_pipeline(tts_model_id)
     out = tts(text)
     # rohes Audio from MMS (float32 [-1, 1])

 Dieses File ist 100% stabil für HuggingFace Spaces.
 """
+from typing import Optional, Tuple
 import numpy as np
 import soundfile as sf
 from scipy.signal import butter, filtfilt
 from transformers import pipeline
 # Modelle
 ASR_MODEL_ID = "openai/whisper-small"
 TTS_MODEL_ID = "facebook/mms-tts-deu"
 _asr = None
+_tts = None
 # ========================================================
 # STT PIPELINE
 # TTS PIPELINE
 # ========================================================
+def get_tts_pipeline():
+    global _tts
+    if _tts is None:
+        print(f">>> Lade TTS Modell: {TTS_MODEL_ID}")
+        _tts = pipeline(
             task="text-to-speech",
+            model=TTS_MODEL_ID,
         )
+    return _tts
 # ========================================================
 # AUDIO FILTER – Noise Reduction + Highpass
     return audio
 # ========================================================
 # SPEECH-TO-TEXT (STT)
 # ========================================================
     if audio_path is None:
         return ""
+    # WAV einlesen (soundfile garantiert PCM korrekt)
     data, sr = sf.read(audio_path)
+    # immer Mono
+    if len(data.shape) > 1:
+        data = data[:, 0]
+    # Whisper >30s vermeiden
+    MAX_SAMPLES = sr * 30
+    if len(data) > MAX_SAMPLES:
+        data = data[:MAX_SAMPLES]
     asr = get_asr_pipeline()
     print(">>> Transkribiere Audio...")
+    result = asr(
+        {"array": data, "sampling_rate": sr},
+    )
     text = result.get("text", "").strip()
     print("ASR:", text)
 # TEXT-TO-SPEECH (TTS)
 # ========================================================
+def synthesize_speech(text: str):
     if not text or not text.strip():
         return None
+    tts = get_tts_pipeline()
     out = tts(text)
     # rohes Audio from MMS (float32 [-1, 1])