Nguyen5 committed on
Commit
80c3670
·
1 Parent(s): 54f3783
Files changed (2) hide show
  1. app.py +11 -42
  2. speech_io.py +26 -51
app.py CHANGED
@@ -2,11 +2,7 @@
2
  # Version 26.11 – ohne Modi, stabil für Text + Voice
3
 
4
  import gradio as gr
5
- try:
6
- from gradio_pdf import PDF
7
- _HAS_PDF = True
8
- except ImportError:
9
- _HAS_PDF = False
10
  from huggingface_hub import hf_hub_download
11
 
12
  from load_documents import load_documents, DATASET, PDF_FILE, HTML_FILE
@@ -17,10 +13,6 @@ from llm import load_llm
17
  from rag_pipeline import answer, PDF_BASE_URL, LAW_URL
18
 
19
  from speech_io import transcribe_audio, synthesize_speech
20
- from fastapi import FastAPI
21
- import realtime_server as rt
22
- import sys
23
- sys.dont_write_bytecode = True
24
 
25
  # =====================================================
26
  # INITIALISIERUNG (global)
@@ -105,7 +97,7 @@ def chatbot_text(user_message, history):
105
  # VOICE CHATBOT
106
  # =====================================================
107
 
108
- def chatbot_voice(audio_path, history, tts_model_id):
109
  # 1. Speech → Text
110
  text = transcribe_audio(audio_path)
111
  if not text:
@@ -126,7 +118,7 @@ def chatbot_voice(audio_path, history, tts_model_id):
126
  history = history + [{"role": "assistant", "content": bot_msg}]
127
 
128
  # 3. Text → Speech
129
- audio = synthesize_speech(bot_msg, tts_model_id)
130
 
131
  return history, audio, ""
132
 
@@ -134,13 +126,13 @@ def chatbot_voice(audio_path, history, tts_model_id):
134
  # LAST ANSWER → TTS
135
  # =====================================================
136
 
137
- def read_last_answer(history, tts_model_id=None):
138
  if not history:
139
  return None
140
 
141
  for msg in reversed(history):
142
  if msg["role"] == "assistant":
143
- return synthesize_speech(msg["content"], tts_model_id)
144
 
145
  return None
146
 
@@ -158,7 +150,7 @@ with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache)") as demo:
158
 
159
  with gr.Row():
160
  with gr.Column(scale=2):
161
- chatbot = gr.Chatbot(label="Chat", height=500)
162
 
163
  msg = gr.Textbox(
164
  label="Frage eingeben",
@@ -183,36 +175,21 @@ with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache)") as demo:
183
  gr.Markdown("### 🎙️ Spracheingabe")
184
  voice_in = gr.Audio(sources=["microphone"], type="filepath")
185
  voice_out = gr.Audio(label="Vorgelesene Antwort", type="numpy")
186
- tts_lang = gr.Dropdown(
187
- label="TTS Sprache",
188
- choices=[
189
- "facebook/mms-tts-deu",
190
- "facebook/mms-tts-vie",
191
- "facebook/mms-tts-eng",
192
- ],
193
- value="facebook/mms-tts-deu",
194
- )
195
 
196
  voice_btn = gr.Button("Sprechen & senden")
197
  voice_btn.click(
198
  chatbot_voice,
199
- [voice_in, chatbot, tts_lang],
200
  [chatbot, voice_out, msg]
201
  )
202
 
203
  read_btn = gr.Button("🔁 Antwort erneut vorlesen")
204
  read_btn.click(
205
  read_last_answer,
206
- [chatbot, tts_lang],
207
  [voice_out]
208
  )
209
 
210
- gr.Markdown("### ⚡ Voice (Realtime) – thử nghiệm")
211
- gr.Markdown("Sử dụng OpenAI Realtime API cho hội thoại nói. Trang test chạy cùng máy chủ này.")
212
- gr.HTML("""
213
- <iframe src="/realtime/" style="width:100%;height:300px;border:1px solid #ccc"></iframe>
214
- """)
215
-
216
  clear_btn = gr.Button("Chat zurücksetzen")
217
  clear_btn.click(lambda: [], None, chatbot)
218
 
@@ -222,21 +199,13 @@ with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache)") as demo:
222
 
223
  with gr.Column(scale=1):
224
  gr.Markdown("### 📄 Prüfungsordnung (PDF)")
225
- if _HAS_PDF:
226
- PDF(_pdf_path, height=350)
227
- else:
228
- gr.HTML(f'<iframe src="{PDF_BASE_URL}" style="width:100%;height:350px;border:none;"></iframe>')
229
 
230
  gr.Markdown("### 📘 Hochschulgesetz NRW (Website)")
231
  gr.HTML(
232
  f'<iframe src="{LAW_URL}" style="width:100%;height:350px;border:none;"></iframe>'
233
  )
234
 
235
- # FastAPI app: mount Gradio + realtime server cùng một host
236
- app = FastAPI()
237
- app = gr.mount_gradio_app(app, demo, path="/")
238
- app.mount("/realtime", rt.app)
239
-
240
  if __name__ == "__main__":
241
- import uvicorn
242
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
2
  # Version 26.11 – ohne Modi, stabil für Text + Voice
3
 
4
  import gradio as gr
5
+ from gradio_pdf import PDF
 
 
 
 
6
  from huggingface_hub import hf_hub_download
7
 
8
  from load_documents import load_documents, DATASET, PDF_FILE, HTML_FILE
 
13
  from rag_pipeline import answer, PDF_BASE_URL, LAW_URL
14
 
15
  from speech_io import transcribe_audio, synthesize_speech
 
 
 
 
16
 
17
  # =====================================================
18
  # INITIALISIERUNG (global)
 
97
  # VOICE CHATBOT
98
  # =====================================================
99
 
100
+ def chatbot_voice(audio_path, history):
101
  # 1. Speech → Text
102
  text = transcribe_audio(audio_path)
103
  if not text:
 
118
  history = history + [{"role": "assistant", "content": bot_msg}]
119
 
120
  # 3. Text → Speech
121
+ audio = synthesize_speech(bot_msg)
122
 
123
  return history, audio, ""
124
 
 
126
  # LAST ANSWER → TTS
127
  # =====================================================
128
 
129
def read_last_answer(history):
    """Synthesize speech for the most recent assistant message.

    Scans the chat history from newest to oldest and returns the TTS
    audio for the first assistant turn found, or ``None`` when the
    history is empty or contains no assistant message.
    """
    if not history:
        return None

    # Walk backwards and grab the newest assistant turn, if any.
    last_assistant = next(
        (msg for msg in reversed(history) if msg["role"] == "assistant"),
        None,
    )
    if last_assistant is None:
        return None

    return synthesize_speech(last_assistant["content"])
138
 
 
150
 
151
  with gr.Row():
152
  with gr.Column(scale=2):
153
+ chatbot = gr.Chatbot(type="messages", label="Chat", height=500)
154
 
155
  msg = gr.Textbox(
156
  label="Frage eingeben",
 
175
  gr.Markdown("### 🎙️ Spracheingabe")
176
  voice_in = gr.Audio(sources=["microphone"], type="filepath")
177
  voice_out = gr.Audio(label="Vorgelesene Antwort", type="numpy")
 
 
 
 
 
 
 
 
 
178
 
179
  voice_btn = gr.Button("Sprechen & senden")
180
  voice_btn.click(
181
  chatbot_voice,
182
+ [voice_in, chatbot],
183
  [chatbot, voice_out, msg]
184
  )
185
 
186
  read_btn = gr.Button("🔁 Antwort erneut vorlesen")
187
  read_btn.click(
188
  read_last_answer,
189
+ [chatbot],
190
  [voice_out]
191
  )
192
 
 
 
 
 
 
 
193
  clear_btn = gr.Button("Chat zurücksetzen")
194
  clear_btn.click(lambda: [], None, chatbot)
195
 
 
199
 
200
  with gr.Column(scale=1):
201
  gr.Markdown("### 📄 Prüfungsordnung (PDF)")
202
+ PDF(_pdf_path, height=350)
 
 
 
203
 
204
  gr.Markdown("### 📘 Hochschulgesetz NRW (Website)")
205
  gr.HTML(
206
  f'<iframe src="{LAW_URL}" style="width:100%;height:350px;border:none;"></iframe>'
207
  )
208
 
 
 
 
 
 
209
  if __name__ == "__main__":
210
+ demo.launch()
211
+
speech_io.py CHANGED
@@ -8,20 +8,18 @@ Sprachbasierte Ein-/Ausgabe:
8
  Dieses File ist 100% stabil für HuggingFace Spaces.
9
  """
10
 
11
- from typing import Optional, Tuple, Dict
12
  import numpy as np
13
  import soundfile as sf
14
  from scipy.signal import butter, filtfilt
15
  from transformers import pipeline
16
- import librosa
17
- import webrtcvad
18
 
19
  # Modelle
20
  ASR_MODEL_ID = "openai/whisper-small"
21
  TTS_MODEL_ID = "facebook/mms-tts-deu"
22
 
23
  _asr = None
24
- _tts_cache: Dict[str, any] = {}
25
 
26
  # ========================================================
27
  # STT PIPELINE
@@ -44,15 +42,15 @@ def get_asr_pipeline():
44
  # TTS PIPELINE
45
  # ========================================================
46
 
47
- def get_tts_pipeline(model_id: Optional[str] = None):
48
- mid = model_id or TTS_MODEL_ID
49
- if mid not in _tts_cache:
50
- print(f">>> Lade TTS Modell: {mid}")
51
- _tts_cache[mid] = pipeline(
52
  task="text-to-speech",
53
- model=mid,
54
  )
55
- return _tts_cache[mid]
56
 
57
  # ========================================================
58
  # AUDIO FILTER – Noise Reduction + Highpass
@@ -78,42 +76,6 @@ def apply_fade(audio, sr, duration_ms=10):
78
 
79
  return audio
80
 
81
- def _vad_trim(audio16: np.ndarray, sr: int) -> np.ndarray:
82
- vad = webrtcvad.Vad(2)
83
- frame_ms = 30
84
- frame_len = int(sr * frame_ms / 1000)
85
- if frame_len <= 0:
86
- return audio16
87
- start = 0
88
- end = len(audio16)
89
- voiced = []
90
- i = 0
91
- while i + frame_len <= len(audio16):
92
- frame = audio16[i:i+frame_len]
93
- is_voiced = vad.is_speech(frame.tobytes(), sample_rate=sr)
94
- voiced.append(is_voiced)
95
- i += frame_len
96
- first = next((idx for idx, v in enumerate(voiced) if v), None)
97
- last = next((len(voiced)-1-idx for idx, v in enumerate(reversed(voiced)) if v), None)
98
- if first is None or last is None or last < first:
99
- return audio16
100
- start = first * frame_len
101
- end = min((last + 1) * frame_len, len(audio16))
102
- return audio16[start:end]
103
-
104
- def preprocess_audio_for_stt(raw: np.ndarray, sr: int) -> Tuple[np.ndarray, int]:
105
- if raw.ndim > 1:
106
- raw = raw[:, 0]
107
- y = librosa.to_mono(raw.astype(np.float32))
108
- y = librosa.resample(y, orig_sr=sr, target_sr=16000)
109
- y = y / (np.max(np.abs(y)) + 1e-9)
110
- y16 = np.clip(y * 32767, -32768, 32767).astype(np.int16)
111
- y16 = _vad_trim(y16, 16000)
112
- max_samples = 16000 * 30
113
- if len(y16) > max_samples:
114
- y16 = y16[:max_samples]
115
- return y16.astype(np.float32) / 32767.0, 16000
116
-
117
  # ========================================================
118
  # SPEECH-TO-TEXT (STT)
119
  # ========================================================
@@ -126,11 +88,24 @@ def transcribe_audio(audio_path: str) -> str:
126
  if audio_path is None:
127
  return ""
128
 
 
129
  data, sr = sf.read(audio_path)
130
- data, sr = preprocess_audio_for_stt(data, sr)
 
 
 
 
 
 
 
 
 
131
  asr = get_asr_pipeline()
 
132
  print(">>> Transkribiere Audio...")
133
- result = asr({"array": data, "sampling_rate": sr})
 
 
134
 
135
  text = result.get("text", "").strip()
136
  print("ASR:", text)
@@ -140,11 +115,11 @@ def transcribe_audio(audio_path: str) -> str:
140
  # TEXT-TO-SPEECH (TTS)
141
  # ========================================================
142
 
143
- def synthesize_speech(text: str, tts_model_id: Optional[str] = None):
144
  if not text or not text.strip():
145
  return None
146
 
147
- tts = get_tts_pipeline(tts_model_id)
148
  out = tts(text)
149
 
150
  # rohes Audio from MMS (float32 [-1, 1])
 
8
  Dieses File ist 100% stabil für HuggingFace Spaces.
9
  """
10
 
11
+ from typing import Optional, Tuple
12
  import numpy as np
13
  import soundfile as sf
14
  from scipy.signal import butter, filtfilt
15
  from transformers import pipeline
 
 
16
 
17
  # Modelle
18
  ASR_MODEL_ID = "openai/whisper-small"
19
  TTS_MODEL_ID = "facebook/mms-tts-deu"
20
 
21
  _asr = None
22
+ _tts = None
23
 
24
  # ========================================================
25
  # STT PIPELINE
 
42
  # TTS PIPELINE
43
  # ========================================================
44
 
45
def get_tts_pipeline():
    """Return the lazily-initialized text-to-speech pipeline.

    Loads the model named by the module constant ``TTS_MODEL_ID`` on
    first use and caches it in the module global ``_tts`` so later
    calls reuse the same pipeline instance.
    """
    global _tts

    # Fast path: pipeline already constructed on a previous call.
    if _tts is not None:
        return _tts

    print(f">>> Lade TTS Modell: {TTS_MODEL_ID}")
    _tts = pipeline(task="text-to-speech", model=TTS_MODEL_ID)
    return _tts
54
 
55
  # ========================================================
56
  # AUDIO FILTER – Noise Reduction + Highpass
 
76
 
77
  return audio
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  # ========================================================
80
  # SPEECH-TO-TEXT (STT)
81
  # ========================================================
 
88
  if audio_path is None:
89
  return ""
90
 
91
+ # WAV einlesen (soundfile garantiert PCM korrekt)
92
  data, sr = sf.read(audio_path)
93
+
94
+ # immer Mono
95
+ if len(data.shape) > 1:
96
+ data = data[:, 0]
97
+
98
+ # Whisper >30s vermeiden
99
+ MAX_SAMPLES = sr * 30
100
+ if len(data) > MAX_SAMPLES:
101
+ data = data[:MAX_SAMPLES]
102
+
103
  asr = get_asr_pipeline()
104
+
105
  print(">>> Transkribiere Audio...")
106
+ result = asr(
107
+ {"array": data, "sampling_rate": sr},
108
+ )
109
 
110
  text = result.get("text", "").strip()
111
  print("ASR:", text)
 
115
  # TEXT-TO-SPEECH (TTS)
116
  # ========================================================
117
 
118
+ def synthesize_speech(text: str):
119
  if not text or not text.strip():
120
  return None
121
 
122
+ tts = get_tts_pipeline()
123
  out = tts(text)
124
 
125
  # rohes Audio from MMS (float32 [-1, 1])