Nguyen5 committed on
Commit
726d12e
·
1 Parent(s): 1917bac
Files changed (4) hide show
  1. .gitignore +14 -0
  2. app.py +23 -7
  3. realtime_server.py +123 -32
  4. speech_io.py +51 -27
.gitignore ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.py[cod]
3
+ *$py.class
4
+ .DS_Store
5
+ *.ipynb_checkpoints/
6
+ env/
7
+ .venv/
8
+ venv/
9
+ node_modules/
10
+ .trae/
11
+ *.wav
12
+ *.mp3
13
+ *.flac
14
+ *.ogg
app.py CHANGED
@@ -13,6 +13,8 @@ from llm import load_llm
13
  from rag_pipeline import answer, PDF_BASE_URL, LAW_URL
14
 
15
  from speech_io import transcribe_audio, synthesize_speech
 
 
16
 
17
  # =====================================================
18
  # INITIALISIERUNG (global)
@@ -97,7 +99,7 @@ def chatbot_text(user_message, history):
97
  # VOICE CHATBOT
98
  # =====================================================
99
 
100
- def chatbot_voice(audio_path, history):
101
  # 1. Speech → Text
102
  text = transcribe_audio(audio_path)
103
  if not text:
@@ -118,7 +120,7 @@ def chatbot_voice(audio_path, history):
118
  history = history + [{"role": "assistant", "content": bot_msg}]
119
 
120
  # 3. Text → Speech
121
- audio = synthesize_speech(bot_msg)
122
 
123
  return history, audio, ""
124
 
@@ -126,13 +128,13 @@ def chatbot_voice(audio_path, history):
126
  # LAST ANSWER → TTS
127
  # =====================================================
128
 
129
- def read_last_answer(history):
130
  if not history:
131
  return None
132
 
133
  for msg in reversed(history):
134
  if msg["role"] == "assistant":
135
- return synthesize_speech(msg["content"])
136
 
137
  return None
138
 
@@ -175,21 +177,36 @@ with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache)") as demo:
175
  gr.Markdown("### 🎙️ Spracheingabe")
176
  voice_in = gr.Audio(sources=["microphone"], type="filepath")
177
  voice_out = gr.Audio(label="Vorgelesene Antwort", type="numpy")
 
 
 
 
 
 
 
 
 
178
 
179
  voice_btn = gr.Button("Sprechen & senden")
180
  voice_btn.click(
181
  chatbot_voice,
182
- [voice_in, chatbot],
183
  [chatbot, voice_out, msg]
184
  )
185
 
186
  read_btn = gr.Button("🔁 Antwort erneut vorlesen")
187
  read_btn.click(
188
  read_last_answer,
189
- [chatbot],
190
  [voice_out]
191
  )
192
 
 
 
 
 
 
 
193
  clear_btn = gr.Button("Chat zurücksetzen")
194
  clear_btn.click(lambda: [], None, chatbot)
195
 
@@ -208,4 +225,3 @@ with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache)") as demo:
208
 
209
  if __name__ == "__main__":
210
  demo.launch()
211
-
 
13
  from rag_pipeline import answer, PDF_BASE_URL, LAW_URL
14
 
15
  from speech_io import transcribe_audio, synthesize_speech
16
+ import sys
17
+ sys.dont_write_bytecode = True
18
 
19
  # =====================================================
20
  # INITIALISIERUNG (global)
 
99
  # VOICE CHATBOT
100
  # =====================================================
101
 
102
+ def chatbot_voice(audio_path, history, tts_model_id):
103
  # 1. Speech → Text
104
  text = transcribe_audio(audio_path)
105
  if not text:
 
120
  history = history + [{"role": "assistant", "content": bot_msg}]
121
 
122
  # 3. Text → Speech
123
+ audio = synthesize_speech(bot_msg, tts_model_id)
124
 
125
  return history, audio, ""
126
 
 
128
  # LAST ANSWER → TTS
129
  # =====================================================
130
 
131
def read_last_answer(history, tts_model_id=None):
    """Synthesize speech for the most recent assistant message.

    Args:
        history: Chat history as a list of message dicts with "role" and
            "content" keys; may be empty or None.
        tts_model_id: Optional TTS model id passed on to synthesize_speech.

    Returns:
        Whatever synthesize_speech returns for the newest assistant message,
        or None when the history is empty or holds no assistant message.
    """
    newest_reply = next(
        (entry for entry in reversed(history or []) if entry["role"] == "assistant"),
        None,
    )
    if newest_reply is None:
        return None
    return synthesize_speech(newest_reply["content"], tts_model_id)
140
 
 
177
  gr.Markdown("### 🎙️ Spracheingabe")
178
  voice_in = gr.Audio(sources=["microphone"], type="filepath")
179
  voice_out = gr.Audio(label="Vorgelesene Antwort", type="numpy")
180
+ tts_lang = gr.Dropdown(
181
+ label="TTS Sprache",
182
+ choices=[
183
+ "facebook/mms-tts-deu",
184
+ "facebook/mms-tts-vie",
185
+ "facebook/mms-tts-eng",
186
+ ],
187
+ value="facebook/mms-tts-deu",
188
+ )
189
 
190
  voice_btn = gr.Button("Sprechen & senden")
191
  voice_btn.click(
192
  chatbot_voice,
193
+ [voice_in, chatbot, tts_lang],
194
  [chatbot, voice_out, msg]
195
  )
196
 
197
  read_btn = gr.Button("🔁 Antwort erneut vorlesen")
198
  read_btn.click(
199
  read_last_answer,
200
+ [chatbot, tts_lang],
201
  [voice_out]
202
  )
203
 
204
+ gr.Markdown("### ⚡ Voice (Realtime) – thử nghiệm")
205
+ gr.Markdown("Sử dụng OpenAI Realtime API cho hội thoại nói. Mở trang test bên dưới.")
206
+ gr.HTML("""
207
+ <iframe src="http://localhost:8000/" style="width:100%;height:300px;border:1px solid #ccc"></iframe>
208
+ """)
209
+
210
  clear_btn = gr.Button("Chat zurücksetzen")
211
  clear_btn.click(lambda: [], None, chatbot)
212
 
 
225
 
226
  if __name__ == "__main__":
227
  demo.launch()
 
realtime_server.py CHANGED
@@ -11,6 +11,8 @@ from typing import Optional
11
  from fastapi import FastAPI, WebSocket, WebSocketDisconnect
12
  from fastapi.responses import HTMLResponse
13
  import websockets
 
 
14
 
15
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
16
  OPENAI_REALTIME_MODEL = os.getenv("OPENAI_REALTIME_MODEL", "gpt-4o-realtime-preview")
@@ -29,16 +31,64 @@ html = """
29
  <button id="startBtn">Start Recording</button>
30
  <button id="stopBtn" disabled>Stop Recording</button>
31
  <div id="status">Status: Ready</div>
 
32
  <div id="transcript"></div>
 
33
 
34
  <script>
35
  let mediaRecorder;
36
  let audioChunks = [];
37
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  document.getElementById('startBtn').onclick = async () => {
 
39
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
40
  mediaRecorder = new MediaRecorder(stream);
41
 
 
 
 
42
  mediaRecorder.ondataavailable = (event) => {
43
  audioChunks.push(event.data);
44
  };
@@ -46,23 +96,13 @@ html = """
46
  mediaRecorder.onstop = async () => {
47
  const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
48
  audioChunks = [];
49
-
50
- // Convert to base64
51
  const reader = new FileReader();
52
  reader.readAsDataURL(audioBlob);
53
  reader.onloadend = () => {
54
  const base64data = reader.result.split(',')[1];
55
- // Send to server
56
- fetch('/process-audio', {
57
- method: 'POST',
58
- headers: { 'Content-Type': 'application/json' },
59
- body: JSON.stringify({ audio: base64data })
60
- })
61
- .then(response => response.json())
62
- .then(data => {
63
- document.getElementById('transcript').innerHTML =
64
- `<strong>Transkription:</strong> ${data.transcript}`;
65
- });
66
  };
67
  };
68
 
@@ -76,7 +116,6 @@ html = """
76
  mediaRecorder.stop();
77
  document.getElementById('startBtn').disabled = false;
78
  document.getElementById('stopBtn').disabled = true;
79
- document.getElementById('status').textContent = 'Status: Processing...';
80
  };
81
  </script>
82
  </body>
@@ -136,26 +175,78 @@ async def websocket_endpoint(websocket: WebSocket):
136
  extra_headers=headers
137
  ) as openai_ws:
138
 
139
- # Forward messages in both directions
140
- async def forward_to_openai():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  try:
142
- while True:
143
- data = await websocket.receive_text()
144
- await openai_ws.send(data)
145
  except WebSocketDisconnect:
146
- pass
147
-
148
- async def forward_to_client():
149
  try:
150
- async for message in openai_ws:
151
- await websocket.send_text(message)
152
  except:
153
- pass
154
-
155
- await asyncio.gather(
156
- forward_to_openai(),
157
- forward_to_client()
158
- )
 
 
 
 
159
 
160
  except Exception as e:
161
  print(f"WebSocket error: {e}")
@@ -164,4 +255,4 @@ async def websocket_endpoint(websocket: WebSocket):
164
 
165
  if __name__ == "__main__":
166
  import uvicorn
167
- uvicorn.run(app, host="0.0.0.0", port=8000)
 
11
  from fastapi import FastAPI, WebSocket, WebSocketDisconnect
12
  from fastapi.responses import HTMLResponse
13
  import websockets
14
+ import sys
15
+ sys.dont_write_bytecode = True
16
 
17
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
18
  OPENAI_REALTIME_MODEL = os.getenv("OPENAI_REALTIME_MODEL", "gpt-4o-realtime-preview")
 
31
  <button id="startBtn">Start Recording</button>
32
  <button id="stopBtn" disabled>Stop Recording</button>
33
  <div id="status">Status: Ready</div>
34
+ <div><label>Instructions: <input id="instructions" placeholder="Optional prompt" /></label></div>
35
  <div id="transcript"></div>
36
+ <audio id="player" controls></audio>
37
 
38
  <script>
39
  let mediaRecorder;
40
  let audioChunks = [];
41
+ let ws;
42
+
43
// Return the shared WebSocket to /ws, creating it (and wiring its handlers)
// when there is none yet or the previous one has closed.
function ensureWS() {
  // Reuse a socket that is open OR still connecting; opening a second one
  // while the first handshake is in flight would leak a connection.
  if (ws && (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING)) {
    return ws;
  }

  const statusEl = document.getElementById('status');
  const scheme = location.protocol === 'https:' ? 'wss://' : 'ws://';
  ws = new WebSocket(scheme + location.host + '/ws');

  ws.onopen = () => {
    statusEl.textContent = 'Status: WS connected';
  };

  ws.onmessage = (event) => {
    let msg;
    try {
      msg = JSON.parse(event.data);
    } catch (err) {
      console.warn('Ignoring non-JSON WebSocket message', err);
      return;
    }
    if (!msg || typeof msg !== 'object') return;

    if (msg.type === 'transcript_delta') {
      const el = document.getElementById('transcript');
      // Keep the label and the running text in separate nodes. Rebuilding
      // innerHTML from el.textContent re-read the "Transcript:" label on every
      // delta (duplicating it) and interpreted server text as HTML.
      let body = el.querySelector('span');
      if (!body) {
        el.textContent = '';
        const label = document.createElement('strong');
        label.textContent = 'Transcript:';
        body = document.createElement('span');
        el.append(label, ' ', body);
      }
      body.textContent += msg.text || '';
    } else if (msg.type === 'response_completed') {
      if (msg.audio) {
        try {
          const url = URL.createObjectURL(base64ToWavBlob(msg.audio));
          const player = document.getElementById('player');
          player.src = url;
          // play() returns a promise that rejects when autoplay is blocked.
          const started = player.play();
          if (started && started.catch) {
            started.catch((err) => console.warn('Audio playback failed', err));
          }
        } catch (err) {
          console.warn('Could not decode response audio', err);
        }
      }
      statusEl.textContent = 'Status: Completed';
    }
  };

  ws.onclose = () => {
    statusEl.textContent = 'Status: WS closed';
  };

  return ws;
}
73
+
74
// Decode a base64 string into raw bytes and wrap them in a Blob labelled
// as WAV audio.
function base64ToWavBlob(base64) {
  const bytes = Uint8Array.from(atob(base64), (ch) => ch.charCodeAt(0));
  return new Blob([bytes], { type: 'audio/wav' });
}
83
+
84
  document.getElementById('startBtn').onclick = async () => {
85
+ ensureWS();
86
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
87
  mediaRecorder = new MediaRecorder(stream);
88
 
89
+ audioChunks = [];
90
+ document.getElementById('transcript').textContent = '';
91
+
92
  mediaRecorder.ondataavailable = (event) => {
93
  audioChunks.push(event.data);
94
  };
 
96
  mediaRecorder.onstop = async () => {
97
  const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
98
  audioChunks = [];
 
 
99
  const reader = new FileReader();
100
  reader.readAsDataURL(audioBlob);
101
  reader.onloadend = () => {
102
  const base64data = reader.result.split(',')[1];
103
+ const instructions = document.getElementById('instructions').value || '';
104
+ ws.send(JSON.stringify({ type: 'utterance', audio: base64data, instructions }));
105
+ document.getElementById('status').textContent = 'Status: Sending to OpenAI...';
 
 
 
 
 
 
 
 
106
  };
107
  };
108
 
 
116
  mediaRecorder.stop();
117
  document.getElementById('startBtn').disabled = false;
118
  document.getElementById('stopBtn').disabled = true;
 
119
  };
120
  </script>
121
  </body>
 
175
  extra_headers=headers
176
  ) as openai_ws:
177
 
178
async def process_utterance(b64_wav: str, instructions: Optional[str] = None):
    """Send one recorded utterance to the Realtime API and relay the reply.

    Appends and commits the audio, requests an audio+text response, then
    forwards streamed events to the browser socket until the response ends.

    Args:
        b64_wav: Base64-encoded audio received from the browser.
        instructions: Optional per-response prompt; omitted when empty.

    Messages sent to the client:
        {"type": "audio_delta", "data": <base64 chunk>}
        {"type": "transcript_delta", "text": <str>}
        {"type": "response_completed", "transcript": <str>, "audio": <base64 WAV>}
        {"type": "error", "error": <API error object>}
    """
    import base64
    import io
    import wave

    # The API expects the base64 payload directly in "audio", not a nested
    # {"data", "format"} object.
    # NOTE(review): the API's input format is raw PCM16 by default; the
    # browser recording is forwarded unconverted here - confirm/convert.
    await openai_ws.send(json.dumps({
        "type": "input_audio_buffer.append",
        "audio": b64_wav,
    }))
    await openai_ws.send(json.dumps({"type": "input_audio_buffer.commit"}))

    response_cfg = {"modalities": ["audio", "text"]}
    if instructions:
        # Only override when a prompt was given; presumably an empty string
        # would replace the session instructions - TODO confirm.
        response_cfg["instructions"] = instructions
    await openai_ws.send(json.dumps({
        "type": "response.create",
        "response": response_cfg,
    }))

    pcm = bytearray()
    transcript_parts = []

    while True:
        raw_event = await openai_ws.recv()
        try:
            event = json.loads(raw_event)
        except (TypeError, ValueError):
            continue
        if not isinstance(event, dict):
            continue

        etype = event.get("type")
        if etype == "response.audio.delta":
            data = event.get("delta") or event.get("data")
            if data:
                # Decode each chunk on its own: base64 strings cannot be
                # concatenated safely because of per-chunk padding.
                pcm.extend(base64.b64decode(data))
                await websocket.send_text(json.dumps({
                    "type": "audio_delta",
                    "data": data,
                }))
        elif etype in ("response.audio_transcript.delta", "response.transcript.delta"):
            delta = event.get("delta", "")
            transcript_parts.append(delta)
            await websocket.send_text(json.dumps({
                "type": "transcript_delta",
                "text": delta,
            }))
        elif etype in ("response.done", "response.completed"):
            audio_b64 = ""
            if pcm:
                # Wrap the raw PCM in a WAV container so the browser's
                # audio/wav blob is playable.
                # NOTE(review): assumes the session output is 24 kHz mono
                # PCM16 (the API default) - confirm against session config.
                buf = io.BytesIO()
                with wave.open(buf, "wb") as wav_out:
                    wav_out.setnchannels(1)
                    wav_out.setsampwidth(2)
                    wav_out.setframerate(24000)
                    wav_out.writeframes(bytes(pcm))
                audio_b64 = base64.b64encode(buf.getvalue()).decode("ascii")
            await websocket.send_text(json.dumps({
                "type": "response_completed",
                "transcript": "".join(transcript_parts),
                "audio": audio_b64,
            }))
            break
        elif etype == "error":
            # Stop waiting on an API error so the client loop is not
            # blocked forever on a response that will never complete.
            await websocket.send_text(json.dumps({
                "type": "error",
                "error": event.get("error"),
            }))
            break
230
+
231
+ # Main loop: receive client messages
232
+ while True:
233
  try:
234
+ text = await websocket.receive_text()
 
 
235
  except WebSocketDisconnect:
236
+ break
 
 
237
  try:
238
+ msg = json.loads(text)
 
239
  except:
240
+ continue
241
+
242
+ mtype = msg.get("type")
243
+ if mtype == "utterance":
244
+ b64_wav = msg.get("audio", "")
245
+ instructions = msg.get("instructions", "")
246
+ if b64_wav:
247
+ await process_utterance(b64_wav, instructions)
248
+ elif mtype == "ping":
249
+ await websocket.send_text(json.dumps({"type": "pong"}))
250
 
251
  except Exception as e:
252
  print(f"WebSocket error: {e}")
 
255
 
256
  if __name__ == "__main__":
257
  import uvicorn
258
+ uvicorn.run(app, host="0.0.0.0", port=8000)
speech_io.py CHANGED
@@ -8,18 +8,20 @@ Sprachbasierte Ein-/Ausgabe:
8
  Dieses File ist 100% stabil für HuggingFace Spaces.
9
  """
10
 
11
- from typing import Optional, Tuple
12
  import numpy as np
13
  import soundfile as sf
14
  from scipy.signal import butter, filtfilt
15
  from transformers import pipeline
 
 
16
 
17
  # Modelle
18
  ASR_MODEL_ID = "openai/whisper-small"
19
  TTS_MODEL_ID = "facebook/mms-tts-deu"
20
 
21
  _asr = None
22
- _tts = None
23
 
24
  # ========================================================
25
  # STT PIPELINE
@@ -42,15 +44,15 @@ def get_asr_pipeline():
42
  # TTS PIPELINE
43
  # ========================================================
44
 
45
- def get_tts_pipeline():
46
- global _tts
47
- if _tts is None:
48
- print(f">>> Lade TTS Modell: {TTS_MODEL_ID}")
49
- _tts = pipeline(
50
  task="text-to-speech",
51
- model=TTS_MODEL_ID,
52
  )
53
- return _tts
54
 
55
  # ========================================================
56
  # AUDIO FILTER – Noise Reduction + Highpass
@@ -76,6 +78,42 @@ def apply_fade(audio, sr, duration_ms=10):
76
 
77
  return audio
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  # ========================================================
80
  # SPEECH-TO-TEXT (STT)
81
  # ========================================================
@@ -88,24 +126,11 @@ def transcribe_audio(audio_path: str) -> str:
88
  if audio_path is None:
89
  return ""
90
 
91
- # WAV einlesen (soundfile garantiert PCM korrekt)
92
  data, sr = sf.read(audio_path)
93
-
94
- # immer Mono
95
- if len(data.shape) > 1:
96
- data = data[:, 0]
97
-
98
- # Whisper >30s vermeiden
99
- MAX_SAMPLES = sr * 30
100
- if len(data) > MAX_SAMPLES:
101
- data = data[:MAX_SAMPLES]
102
-
103
  asr = get_asr_pipeline()
104
-
105
  print(">>> Transkribiere Audio...")
106
- result = asr(
107
- {"array": data, "sampling_rate": sr},
108
- )
109
 
110
  text = result.get("text", "").strip()
111
  print("ASR:", text)
@@ -115,11 +140,11 @@ def transcribe_audio(audio_path: str) -> str:
115
  # TEXT-TO-SPEECH (TTS)
116
  # ========================================================
117
 
118
- def synthesize_speech(text: str):
119
  if not text or not text.strip():
120
  return None
121
 
122
- tts = get_tts_pipeline()
123
  out = tts(text)
124
 
125
  # rohes Audio from MMS (float32 [-1, 1])
@@ -155,4 +180,3 @@ def synthesize_speech(text: str):
155
 
156
  # Rückgabe: (sr, np.int16 array)
157
  return (sr, audio_int16)
158
-
 
8
  Dieses File ist 100% stabil für HuggingFace Spaces.
9
  """
10
 
11
+ from typing import Optional, Tuple, Dict
12
  import numpy as np
13
  import soundfile as sf
14
  from scipy.signal import butter, filtfilt
15
  from transformers import pipeline
16
+ import librosa
17
+ import webrtcvad
18
 
19
  # Modelle
20
  ASR_MODEL_ID = "openai/whisper-small"
21
  TTS_MODEL_ID = "facebook/mms-tts-deu"
22
 
23
  _asr = None
24
# Loaded TTS pipelines keyed by model id. The value type is `object` because
# the builtin function `any` is not a type.
_tts_cache: Dict[str, object] = {}
25
 
26
  # ========================================================
27
  # STT PIPELINE
 
44
  # TTS PIPELINE
45
  # ========================================================
46
 
47
def get_tts_pipeline(model_id: Optional[str] = None):
    """Return the text-to-speech pipeline for a model, loading it on first use.

    Args:
        model_id: Model id to load; falls back to TTS_MODEL_ID when
            None or empty.

    Returns:
        The pipeline stored in the module-level cache for that model id.
    """
    mid = model_id if model_id else TTS_MODEL_ID
    if mid in _tts_cache:
        return _tts_cache[mid]

    print(f">>> Lade TTS Modell: {mid}")
    loaded = pipeline(task="text-to-speech", model=mid)
    _tts_cache[mid] = loaded
    return loaded
56
 
57
  # ========================================================
58
  # AUDIO FILTER – Noise Reduction + Highpass
 
78
 
79
  return audio
80
 
81
def _vad_trim(audio16: np.ndarray, sr: int) -> np.ndarray:
    """Strip leading and trailing non-speech from PCM audio.

    The signal is cut into consecutive 30 ms frames, each classified by a
    WebRTC VAD at aggressiveness 2. The result spans the first through the
    last frame classified as speech.

    Args:
        audio16: 1-D sample array; expected to be int16, since frames are
            handed to the VAD as raw bytes (the caller casts to np.int16).
        sr: Sample rate in Hz.

    Returns:
        The trimmed slice of ``audio16``, or ``audio16`` unchanged when the
        sample rate is unsupported, the clip is shorter than one frame, or
        no frame is classified as speech.
    """
    frame_ms = 30
    frame_len = int(sr * frame_ms / 1000)

    # webrtcvad documents support for these sample rates only; skip trimming
    # for any other rate rather than failing inside the VAD.
    if sr not in (8000, 16000, 32000, 48000):
        return audio16
    if frame_len <= 0 or len(audio16) < frame_len:
        return audio16

    vad = webrtcvad.Vad(2)
    voiced = [
        vad.is_speech(audio16[pos:pos + frame_len].tobytes(), sample_rate=sr)
        for pos in range(0, len(audio16) - frame_len + 1, frame_len)
    ]
    if not any(voiced):
        return audio16

    first = voiced.index(True)
    last = len(voiced) - 1 - voiced[::-1].index(True)
    return audio16[first * frame_len:min((last + 1) * frame_len, len(audio16))]
103
+
104
def preprocess_audio_for_stt(raw: np.ndarray, sr: int) -> Tuple[np.ndarray, int]:
    """Prepare recorded audio for the speech-to-text pipeline.

    Steps: keep the first channel, resample to 16 kHz, peak-normalize,
    trim non-speech at both ends with the VAD, and cap the length at 30 s.

    Args:
        raw: Samples as a 1-D array, or a 2-D array whose column 0 is used
            (presumably (frames, channels) as read by soundfile - confirm).
        sr: Sample rate of ``raw`` in Hz.

    Returns:
        A tuple ``(samples, 16000)`` where ``samples`` is a float32 array.
        An empty input yields an empty float32 array instead of failing in
        the peak normalization.
    """
    target_sr = 16000

    if raw.ndim > 1:
        raw = raw[:, 0]

    # np.max on a zero-size array raises ValueError, so bail out before
    # normalization when there is nothing to process.
    if raw.size == 0:
        return np.zeros(0, dtype=np.float32), target_sr

    y = librosa.to_mono(raw.astype(np.float32))
    y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)

    # The epsilon keeps an all-zero signal from dividing by zero.
    y = y / (np.max(np.abs(y)) + 1e-9)

    # The VAD operates on 16-bit PCM, hence the int16 round trip.
    y16 = np.clip(y * 32767, -32768, 32767).astype(np.int16)
    y16 = _vad_trim(y16, target_sr)

    max_samples = target_sr * 30
    if len(y16) > max_samples:
        y16 = y16[:max_samples]

    return y16.astype(np.float32) / 32767.0, target_sr
116
+
117
  # ========================================================
118
  # SPEECH-TO-TEXT (STT)
119
  # ========================================================
 
126
  if audio_path is None:
127
  return ""
128
 
 
129
  data, sr = sf.read(audio_path)
130
+ data, sr = preprocess_audio_for_stt(data, sr)
 
 
 
 
 
 
 
 
 
131
  asr = get_asr_pipeline()
 
132
  print(">>> Transkribiere Audio...")
133
+ result = asr({"array": data, "sampling_rate": sr})
 
 
134
 
135
  text = result.get("text", "").strip()
136
  print("ASR:", text)
 
140
  # TEXT-TO-SPEECH (TTS)
141
  # ========================================================
142
 
143
+ def synthesize_speech(text: str, tts_model_id: Optional[str] = None):
144
  if not text or not text.strip():
145
  return None
146
 
147
+ tts = get_tts_pipeline(tts_model_id)
148
  out = tts(text)
149
 
150
  # rohes Audio from MMS (float32 [-1, 1])
 
180
 
181
  # Rückgabe: (sr, np.int16 array)
182
  return (sr, audio_int16)