Spaces:

Nguyen5
/

chatbot1

Sleeping

App Files Files Community

Nguyen5 committed on Dec 8, 2025

Commit

726d12e

1 Parent(s): 1917bac

commit

Browse files

Files changed (4) hide show

.gitignore +14 -0
app.py +23 -7
realtime_server.py +123 -32
speech_io.py +51 -27

.gitignore ADDED Viewed

	@@ -0,0 +1,14 @@

+__pycache__/
+*.py[cod]
+*$py.class
+.DS_Store
+*.ipynb_checkpoints/
+env/
+.venv/
+venv/
+node_modules/
+.trae/
+*.wav
+*.mp3
+*.flac
+*.ogg

app.py CHANGED Viewed

@@ -13,6 +13,8 @@ from llm import load_llm
 from rag_pipeline import answer, PDF_BASE_URL, LAW_URL
 from speech_io import transcribe_audio, synthesize_speech
 # =====================================================
 # INITIALISIERUNG (global)
@@ -97,7 +99,7 @@ def chatbot_text(user_message, history):
 # VOICE CHATBOT
 # =====================================================
-def chatbot_voice(audio_path, history):
     # 1. Speech → Text
     text = transcribe_audio(audio_path)
     if not text:
@@ -118,7 +120,7 @@ def chatbot_voice(audio_path, history):
     history = history + [{"role": "assistant", "content": bot_msg}]
     # 3. Text → Speech
-    audio = synthesize_speech(bot_msg)
     return history, audio, ""
@@ -126,13 +128,13 @@ def chatbot_voice(audio_path, history):
 # LAST ANSWER → TTS
 # =====================================================
-def read_last_answer(history):
     if not history:
         return None
     for msg in reversed(history):
         if msg["role"] == "assistant":
-            return synthesize_speech(msg["content"])
     return None
@@ -175,21 +177,36 @@ with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache)") as demo:
             gr.Markdown("### 🎙️ Spracheingabe")
             voice_in = gr.Audio(sources=["microphone"], type="filepath")
             voice_out = gr.Audio(label="Vorgelesene Antwort", type="numpy")
             voice_btn = gr.Button("Sprechen & senden")
             voice_btn.click(
                 chatbot_voice,
-                [voice_in, chatbot],
                 [chatbot, voice_out, msg]
             )
             read_btn = gr.Button("🔁 Antwort erneut vorlesen")
             read_btn.click(
                 read_last_answer,
-                [chatbot],
                 [voice_out]
             )
             clear_btn = gr.Button("Chat zurücksetzen")
             clear_btn.click(lambda: [], None, chatbot)
@@ -208,4 +225,3 @@ with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache)") as demo:
 if __name__ == "__main__":
     demo.launch()

 from rag_pipeline import answer, PDF_BASE_URL, LAW_URL
 from speech_io import transcribe_audio, synthesize_speech
+import sys
+sys.dont_write_bytecode = True
 # =====================================================
 # INITIALISIERUNG (global)
 # VOICE CHATBOT
 # =====================================================
+def chatbot_voice(audio_path, history, tts_model_id):
     # 1. Speech → Text
     text = transcribe_audio(audio_path)
     if not text:
     history = history + [{"role": "assistant", "content": bot_msg}]
     # 3. Text → Speech
+    audio = synthesize_speech(bot_msg, tts_model_id)
     return history, audio, ""
 # LAST ANSWER → TTS
 # =====================================================
+def read_last_answer(history, tts_model_id=None):
+    """Synthesize speech for the most recent assistant message in the chat.
+
+    Args:
+        history: Chat history as a sequence of message dicts that are indexed
+            with the "role" and "content" keys; may be empty or None.
+        tts_model_id: Optional TTS model id, forwarded to synthesize_speech.
+
+    Returns:
+        The result of synthesize_speech for the latest assistant message, or
+        None when the history is empty or holds no assistant message.
+    """
     if not history:
         return None
+    # Walk backwards so the newest assistant reply is found first.
     for msg in reversed(history):
         if msg["role"] == "assistant":
+            return synthesize_speech(msg["content"], tts_model_id)
     return None
             gr.Markdown("### 🎙️ Spracheingabe")
             voice_in = gr.Audio(sources=["microphone"], type="filepath")
             voice_out = gr.Audio(label="Vorgelesene Antwort", type="numpy")
+            tts_lang = gr.Dropdown(
+                label="TTS Sprache",
+                choices=[
+                    "facebook/mms-tts-deu",
+                    "facebook/mms-tts-vie",
+                    "facebook/mms-tts-eng",
+                ],
+                value="facebook/mms-tts-deu",
+            )
             voice_btn = gr.Button("Sprechen & senden")
             voice_btn.click(
                 chatbot_voice,
+                [voice_in, chatbot, tts_lang],
                 [chatbot, voice_out, msg]
             )
             read_btn = gr.Button("🔁 Antwort erneut vorlesen")
             read_btn.click(
                 read_last_answer,
+                [chatbot, tts_lang],
                 [voice_out]
             )
+            gr.Markdown("### ⚡ Voice (Realtime) – thử nghiệm")
+            gr.Markdown("Sử dụng OpenAI Realtime API cho hội thoại nói. Mở trang test bên dưới.")
+            gr.HTML("""
+            <iframe src="http://localhost:8000/" style="width:100%;height:300px;border:1px solid #ccc"></iframe>
+            """)
             clear_btn = gr.Button("Chat zurücksetzen")
             clear_btn.click(lambda: [], None, chatbot)
 if __name__ == "__main__":
     demo.launch()

realtime_server.py CHANGED Viewed

@@ -11,6 +11,8 @@ from typing import Optional
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect
 from fastapi.responses import HTMLResponse
 import websockets
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
 OPENAI_REALTIME_MODEL = os.getenv("OPENAI_REALTIME_MODEL", "gpt-4o-realtime-preview")
@@ -29,16 +31,64 @@ html = """
     <button id="startBtn">Start Recording</button>
     <button id="stopBtn" disabled>Stop Recording</button>
     <div id="status">Status: Ready</div>
     <div id="transcript"></div>
     <script>
         let mediaRecorder;
         let audioChunks = [];
         document.getElementById('startBtn').onclick = async () => {
             const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
             mediaRecorder = new MediaRecorder(stream);
             mediaRecorder.ondataavailable = (event) => {
                 audioChunks.push(event.data);
             };
@@ -46,23 +96,13 @@ html = """
             mediaRecorder.onstop = async () => {
                 const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
                 audioChunks = [];
-                // Convert to base64
                 const reader = new FileReader();
                 reader.readAsDataURL(audioBlob);
                 reader.onloadend = () => {
                     const base64data = reader.result.split(',')[1];
-                    // Send to server
-                    fetch('/process-audio', {
-                        method: 'POST',
-                        headers: { 'Content-Type': 'application/json' },
-                        body: JSON.stringify({ audio: base64data })
-                    })
-                    .then(response => response.json())
-                    .then(data => {
-                        document.getElementById('transcript').innerHTML =
-                            `<strong>Transkription:</strong> ${data.transcript}`;
-                    });
                 };
             };
@@ -76,7 +116,6 @@ html = """
             mediaRecorder.stop();
             document.getElementById('startBtn').disabled = false;
             document.getElementById('stopBtn').disabled = true;
-            document.getElementById('status').textContent = 'Status: Processing...';
         };
     </script>
 </body>
@@ -136,26 +175,78 @@ async def websocket_endpoint(websocket: WebSocket):
             extra_headers=headers
         ) as openai_ws:
-            # Forward messages in both directions
-            async def forward_to_openai():
                 try:
-                    while True:
-                        data = await websocket.receive_text()
-                        await openai_ws.send(data)
                 except WebSocketDisconnect:
-                    pass
-            async def forward_to_client():
                 try:
-                    async for message in openai_ws:
-                        await websocket.send_text(message)
                 except:
-                    pass
-            await asyncio.gather(
-                forward_to_openai(),
-                forward_to_client()
-            )
     except Exception as e:
         print(f"WebSocket error: {e}")
@@ -164,4 +255,4 @@ async def websocket_endpoint(websocket: WebSocket):
 if __name__ == "__main__":
     import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8000)

 from fastapi import FastAPI, WebSocket, WebSocketDisconnect
 from fastapi.responses import HTMLResponse
 import websockets
+import sys
+sys.dont_write_bytecode = True
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
 OPENAI_REALTIME_MODEL = os.getenv("OPENAI_REALTIME_MODEL", "gpt-4o-realtime-preview")
     <button id="startBtn">Start Recording</button>
     <button id="stopBtn" disabled>Stop Recording</button>
     <div id="status">Status: Ready</div>
+    <div><label>Instructions: <input id="instructions" placeholder="Optional prompt" /></label></div>
     <div id="transcript"></div>
+    <audio id="player" controls></audio>
     <script>
         let mediaRecorder;
         let audioChunks = [];
+        let ws;
+        function ensureWS() {
+            if (ws && ws.readyState === WebSocket.OPEN) return ws;
+            ws = new WebSocket((location.protocol === 'https:' ? 'wss://' : 'ws://') + location.host + '/ws');
+            ws.onopen = () => {
+                document.getElementById('status').textContent = 'Status: WS connected';
+            };
+            ws.onmessage = (event) => {
+                try {
+                    const msg = JSON.parse(event.data);
+                    if (msg.type === 'transcript_delta') {
+                        const el = document.getElementById('transcript');
+                        el.innerHTML = `<strong>Transcript:</strong> ${el.textContent}${msg.text}`;
+                    } else if (msg.type === 'response_completed') {
+                        if (msg.audio) {
+                            const b64 = msg.audio;
+                            const audioBlob = base64ToWavBlob(b64);
+                            const url = URL.createObjectURL(audioBlob);
+                            const player = document.getElementById('player');
+                            player.src = url;
+                            player.play();
+                        }
+                        document.getElementById('status').textContent = 'Status: Completed';
+                    }
+                } catch {}
+            };
+            ws.onclose = () => {
+                document.getElementById('status').textContent = 'Status: WS closed';
+            };
+            return ws;
+        }
+        function base64ToWavBlob(base64) {
+            const byteCharacters = atob(base64);
+            const byteNumbers = new Array(byteCharacters.length);
+            for (let i = 0; i < byteCharacters.length; i++) {
+                byteNumbers[i] = byteCharacters.charCodeAt(i);
+            }
+            const byteArray = new Uint8Array(byteNumbers);
+            return new Blob([byteArray], { type: 'audio/wav' });
+        }
         document.getElementById('startBtn').onclick = async () => {
+            ensureWS();
             const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
             mediaRecorder = new MediaRecorder(stream);
+            audioChunks = [];
+            document.getElementById('transcript').textContent = '';
             mediaRecorder.ondataavailable = (event) => {
                 audioChunks.push(event.data);
             };
             mediaRecorder.onstop = async () => {
                 const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
                 audioChunks = [];
                 const reader = new FileReader();
                 reader.readAsDataURL(audioBlob);
                 reader.onloadend = () => {
                     const base64data = reader.result.split(',')[1];
+                    const instructions = document.getElementById('instructions').value || '';
+                    ws.send(JSON.stringify({ type: 'utterance', audio: base64data, instructions }));
+                    document.getElementById('status').textContent = 'Status: Sending to OpenAI...';
                 };
             };
             mediaRecorder.stop();
             document.getElementById('startBtn').disabled = false;
             document.getElementById('stopBtn').disabled = true;
         };
     </script>
 </body>
             extra_headers=headers
         ) as openai_ws:
+            async def process_utterance(b64_wav: str, instructions: Optional[str] = None):
+                # Append audio buffer
+                await openai_ws.send(json.dumps({
+                    "type": "input_audio_buffer.append",
+                    "audio": {"data": b64_wav, "format": "wav"}
+                }))
+                # Commit audio
+                await openai_ws.send(json.dumps({
+                    "type": "input_audio_buffer.commit"
+                }))
+                # Request response with audio + text
+                await openai_ws.send(json.dumps({
+                    "type": "response.create",
+                    "response": {
+                        "modalities": ["audio", "text"],
+                        "instructions": instructions or ""
+                    }
+                }))
+                audio_chunks = []
+                transcript = ""
+                # Read stream until completed
+                while True:
+                    msg = await openai_ws.recv()
+                    try:
+                        event = json.loads(msg)
+                    except:
+                        continue
+                    etype = event.get("type")
+                    if etype == "response.audio.delta":
+                        data = event.get("delta") or event.get("data")
+                        if data:
+                            audio_chunks.append(data)
+                            await websocket.send_text(json.dumps({
+                                "type": "audio_delta",
+                                "data": data
+                            }))
+                    elif etype == "response.transcript.delta":
+                        delta = event.get("delta", "")
+                        transcript += delta
+                        await websocket.send_text(json.dumps({
+                            "type": "transcript_delta",
+                            "text": delta
+                        }))
+                    elif etype == "response.completed":
+                        await websocket.send_text(json.dumps({
+                            "type": "response_completed",
+                            "transcript": transcript,
+                            "audio": "".join(audio_chunks)
+                        }))
+                        break
+            # Main loop: receive client messages
+            while True:
                 try:
+                    text = await websocket.receive_text()
                 except WebSocketDisconnect:
+                    break
                 try:
+                    msg = json.loads(text)
                 except:
+                    continue
+                mtype = msg.get("type")
+                if mtype == "utterance":
+                    b64_wav = msg.get("audio", "")
+                    instructions = msg.get("instructions", "")
+                    if b64_wav:
+                        await process_utterance(b64_wav, instructions)
+                elif mtype == "ping":
+                    await websocket.send_text(json.dumps({"type": "pong"}))
     except Exception as e:
         print(f"WebSocket error: {e}")
 if __name__ == "__main__":
     import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000)

speech_io.py CHANGED Viewed

@@ -8,18 +8,20 @@ Sprachbasierte Ein-/Ausgabe:
 Dieses File ist 100% stabil für HuggingFace Spaces.
 """
-from typing import Optional, Tuple
 import numpy as np
 import soundfile as sf
 from scipy.signal import butter, filtfilt
 from transformers import pipeline
 # Modelle
 ASR_MODEL_ID = "openai/whisper-small"
 TTS_MODEL_ID = "facebook/mms-tts-deu"
 _asr = None
-_tts = None
 # ========================================================
 # STT PIPELINE
@@ -42,15 +44,15 @@ def get_asr_pipeline():
 # TTS PIPELINE
 # ========================================================
-def get_tts_pipeline():
-    global _tts
-    if _tts is None:
-        print(f">>> Lade TTS Modell: {TTS_MODEL_ID}")
-        _tts = pipeline(
             task="text-to-speech",
-            model=TTS_MODEL_ID,
         )
-    return _tts
 # ========================================================
 # AUDIO FILTER – Noise Reduction + Highpass
@@ -76,6 +78,42 @@ def apply_fade(audio, sr, duration_ms=10):
     return audio
 # ========================================================
 # SPEECH-TO-TEXT (STT)
 # ========================================================
@@ -88,24 +126,11 @@ def transcribe_audio(audio_path: str) -> str:
     if audio_path is None:
         return ""
-    # WAV einlesen (soundfile garantiert PCM korrekt)
     data, sr = sf.read(audio_path)
-    # immer Mono
-    if len(data.shape) > 1:
-        data = data[:, 0]
-    # Whisper >30s vermeiden
-    MAX_SAMPLES = sr * 30
-    if len(data) > MAX_SAMPLES:
-        data = data[:MAX_SAMPLES]
     asr = get_asr_pipeline()
     print(">>> Transkribiere Audio...")
-    result = asr(
-        {"array": data, "sampling_rate": sr},
-    )
     text = result.get("text", "").strip()
     print("ASR:", text)
@@ -115,11 +140,11 @@ def transcribe_audio(audio_path: str) -> str:
 # TEXT-TO-SPEECH (TTS)
 # ========================================================
-def synthesize_speech(text: str):
     if not text or not text.strip():
         return None
-    tts = get_tts_pipeline()
     out = tts(text)
     # rohes Audio from MMS (float32 [-1, 1])
@@ -155,4 +180,3 @@ def synthesize_speech(text: str):
     # Rückgabe: (sr, np.int16 array)
     return (sr, audio_int16)

 Dieses File ist 100% stabil für HuggingFace Spaces.
 """
+from typing import Optional, Tuple, Dict
 import numpy as np
 import soundfile as sf
 from scipy.signal import butter, filtfilt
 from transformers import pipeline
+import librosa
+import webrtcvad
 # Modelle
 ASR_MODEL_ID = "openai/whisper-small"
 TTS_MODEL_ID = "facebook/mms-tts-deu"
 _asr = None
+# Loaded TTS pipelines keyed by model id (filled lazily by get_tts_pipeline).
+_tts_cache: Dict[str, object] = {}
 # ========================================================
 # STT PIPELINE
 # TTS PIPELINE
 # ========================================================
+def get_tts_pipeline(model_id: Optional[str] = None):
+    mid = model_id or TTS_MODEL_ID
+    if mid not in _tts_cache:
+        print(f">>> Lade TTS Modell: {mid}")
+        _tts_cache[mid] = pipeline(
             task="text-to-speech",
+            model=mid,
         )
+    return _tts_cache[mid]
 # ========================================================
 # AUDIO FILTER – Noise Reduction + Highpass
     return audio
+def _vad_trim(audio16: np.ndarray, sr: int) -> np.ndarray:
+    vad = webrtcvad.Vad(2)
+    frame_ms = 30
+    frame_len = int(sr * frame_ms / 1000)
+    if frame_len <= 0:
+        return audio16
+    start = 0
+    end = len(audio16)
+    voiced = []
+    i = 0
+    while i + frame_len <= len(audio16):
+        frame = audio16[i:i+frame_len]
+        is_voiced = vad.is_speech(frame.tobytes(), sample_rate=sr)
+        voiced.append(is_voiced)
+        i += frame_len
+    first = next((idx for idx, v in enumerate(voiced) if v), None)
+    last = next((len(voiced)-1-idx for idx, v in enumerate(reversed(voiced)) if v), None)
+    if first is None or last is None or last < first:
+        return audio16
+    start = first * frame_len
+    end = min((last + 1) * frame_len, len(audio16))
+    return audio16[start:end]
+def preprocess_audio_for_stt(raw: np.ndarray, sr: int) -> Tuple[np.ndarray, int]:
+    if raw.ndim > 1:
+        raw = raw[:, 0]
+    y = librosa.to_mono(raw.astype(np.float32))
+    y = librosa.resample(y, orig_sr=sr, target_sr=16000)
+    y = y / (np.max(np.abs(y)) + 1e-9)
+    y16 = np.clip(y * 32767, -32768, 32767).astype(np.int16)
+    y16 = _vad_trim(y16, 16000)
+    max_samples = 16000 * 30
+    if len(y16) > max_samples:
+        y16 = y16[:max_samples]
+    return y16.astype(np.float32) / 32767.0, 16000
 # ========================================================
 # SPEECH-TO-TEXT (STT)
 # ========================================================
     if audio_path is None:
         return ""
     data, sr = sf.read(audio_path)
+    data, sr = preprocess_audio_for_stt(data, sr)
     asr = get_asr_pipeline()
     print(">>> Transkribiere Audio...")
+    result = asr({"array": data, "sampling_rate": sr})
     text = result.get("text", "").strip()
     print("ASR:", text)
 # TEXT-TO-SPEECH (TTS)
 # ========================================================
+def synthesize_speech(text: str, tts_model_id: Optional[str] = None):
     if not text or not text.strip():
         return None
+    tts = get_tts_pipeline(tts_model_id)
     out = tts(text)
     # rohes Audio from MMS (float32 [-1, 1])
     # Rückgabe: (sr, np.int16 array)
     return (sr, audio_int16)