rakib72642 committed on
Commit
13425c7
·
1 Parent(s): 6a7bafa

fixed stt and added whisper and elevenlabs stt

Browse files
Files changed (4) hide show
  1. .env +2 -2
  2. core/backend.py +3 -0
  3. services/stt.py +75 -14
  4. services/tts.py +1 -1
.env CHANGED
@@ -9,8 +9,8 @@ GOOGLE_API_KEY="AIzaSyA9sqz4YKQHKXR9TU1imw0DPOghzHOMiBo"
9
 
10
 
11
  ELEVENLABS_API_KEY="b3af3a938c8e15d5eae700ea47eea7d88dfe397f34fbd4b0c75c24f143b032b8"
12
- ELEVENLABS_VOICE_ID="iuABfyf7pRoBzuPqzUCt"
13
- ELEVENLABS_MODEL_ID="eleven_multilingual_v2"
14
 
15
  # TWILIO_ACCOUNT_SID="ACfafc0d2d007bdf14b21bb3e14a7a7b31"
16
  # TWILIO_AUTH_TOKEN="ed15fa98748c8c3d3d02cb54e431a187"
 
9
 
10
 
11
  ELEVENLABS_API_KEY="b3af3a938c8e15d5eae700ea47eea7d88dfe397f34fbd4b0c75c24f143b032b8"
12
+ ELEVENLABS_VOICE_ID="4O1sYUnmtThcBoSBrri7"
13
+ ELEVENLABS_MODEL_ID="eleven_v3"
14
 
15
  # TWILIO_ACCOUNT_SID="ACfafc0d2d007bdf14b21bb3e14a7a7b31"
16
  # TWILIO_AUTH_TOKEN="ed15fa98748c8c3d3d02cb54e431a187"
core/backend.py CHANGED
@@ -403,6 +403,9 @@ LANGUAGE RULE:
403
  - If Bangla → reply Bangla (বাংলা).
404
  - If Banglish → reply Bangla (বাংলা).
405
  - Never mix languages unless user mixes first.
 
 
 
406
  TOOLS:
407
  - Use backend tools if needed
408
  - Always confirm before final action
 
403
  - If Bangla → reply Bangla (বাংলা).
404
  - If Banglish → reply Bangla (বাংলা).
405
  - Never mix languages unless user mixes first.
406
+ DOCTOR ID RULE:
407
+ - Never generate or guess doctor_id.
408
+ - doctor_id must only come from search_doctor tool output.
409
  TOOLS:
410
  - Use backend tools if needed
411
  - Always confirm before final action
services/stt.py CHANGED
@@ -43,6 +43,7 @@ from concurrent.futures import ThreadPoolExecutor
43
  from dataclasses import dataclass, field
44
  from typing import Optional
45
 
 
46
  from faster_whisper import WhisperModel
47
 
48
  # ── Bangla script patterns ─────────────────────────────────────────────────────
@@ -51,10 +52,9 @@ _WRONG_SCRIPT_RE = re.compile(
51
  r"[\u0600-\u06FF\u0750-\u077F\uFB50-\uFDFF\uFE70-\uFEFF]"
52
  )
53
 
54
- # Bangla decoder seed — keeps Whisper in বাংলা Unicode block
55
- _BANGLA_SEED = "আমি আপনার সাথে বাংলায় কথা বলছি।"
56
-
57
  # ── Configuration ──────────────────────────────────────────────────────────────
 
 
58
  _STT_MODEL = os.getenv("STT_MODEL", "large-v3")
59
  _COMPUTE_TYPE = os.getenv("STT_COMPUTE_TYPE", "int8_float32")
60
  _BATCH_WINDOW = float(os.getenv("STT_BATCH_WINDOW_MS", "30")) / 1000 # 30ms (was 50ms)
@@ -62,6 +62,10 @@ _MAX_BATCH = int(os.getenv("STT_MAX_BATCH", "8"))
62
  _MODEL_LOAD_TIMEOUT = int(os.getenv("STT_MODEL_LOAD_TIMEOUT_S", "120")) # seconds
63
  MAX_INPUT_BYTES = 5_242_880 # 5 MB
64
 
 
 
 
 
65
  # ── Singleton model state ──────────────────────────────────────────────────────
66
  _model: Optional[WhisperModel] = None
67
  _model_lock = threading.Lock()
@@ -109,8 +113,11 @@ def _make_silence_wav(duration_s: float = 0.5, sr: int = 16_000) -> io.BytesIO:
109
  return buf
110
 
111
 
112
- # Start background model load immediately at import
113
- threading.Thread(target=_load_and_warm, daemon=True, name="whisper-loader").start()
 
 
 
114
 
115
 
116
  # ── ffmpeg conversion (sync, runs in _ffmpeg_pool) ────────────────────────────
@@ -178,7 +185,6 @@ def _transcribe_batch_sync(wav_paths: list[str]) -> list[Optional[str]]:
178
  condition_on_previous_text=False,
179
  temperature=0,
180
  suppress_tokens=[-1],
181
- initial_prompt=_BANGLA_SEED,
182
  no_speech_threshold=0.6,
183
  log_prob_threshold=-0.5,
184
  compression_ratio_threshold=2.4,
@@ -198,6 +204,43 @@ def _transcribe_batch_sync(wav_paths: list[str]) -> list[Optional[str]]:
198
  return results
199
 
200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  # ── Hallucination / script validation ─────────────────────────────────────────
202
  def _validate(text: str) -> Optional[str]:
203
  if not text or not text.strip():
@@ -209,6 +252,13 @@ def _validate(text: str) -> Optional[str]:
209
  return None
210
  if len(words) == 2 and words[0] == words[1]:
211
  return None
 
 
 
 
 
 
 
212
  # Soft script check — log but keep
213
  wrong = len(_WRONG_SCRIPT_RE.findall(text))
214
  alpha = sum(1 for c in text if c.isalpha())
@@ -318,7 +368,7 @@ class STTProcessor:
318
  """
319
 
320
  async def transcribe(self, audio_bytes: bytes) -> Optional[str]:
321
- """Full pipeline: validate → ffmpeg (parallel) → batch GPU inference."""
322
 
323
  if not audio_bytes or len(audio_bytes) < 300:
324
  print(f"[STT] Ignored tiny packet ({len(audio_bytes)} B)")
@@ -327,7 +377,24 @@ class STTProcessor:
327
  if len(audio_bytes) > MAX_INPUT_BYTES:
328
  audio_bytes = audio_bytes[:MAX_INPUT_BYTES]
329
 
330
- # FIX-BUG6: wait for model with timeout not forever
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
331
  if not _model_ready.is_set():
332
  print("[STT] Waiting for model to load…")
333
  ready = await asyncio.to_thread(_model_ready.wait, _MODEL_LOAD_TIMEOUT)
@@ -338,12 +405,6 @@ class STTProcessor:
338
  if _model_error:
339
  raise RuntimeError(f"[STT] Whisper model failed to load: {_model_error}")
340
 
341
- # ffmpeg: runs in parallel I/O pool (not serialised)
342
- loop = asyncio.get_running_loop()
343
- wav_path = await loop.run_in_executor(_ffmpeg_pool, _to_wav_sync, audio_bytes)
344
- if not wav_path:
345
- return None
346
-
347
  # Batch GPU inference
348
  text = await _batch_worker.enqueue(wav_path)
349
  return _validate(text) if text else None
 
43
  from dataclasses import dataclass, field
44
  from typing import Optional
45
 
46
+ import requests
47
  from faster_whisper import WhisperModel
48
 
49
  # ── Bangla script patterns ─────────────────────────────────────────────────────
 
52
  r"[\u0600-\u06FF\u0750-\u077F\uFB50-\uFDFF\uFE70-\uFEFF]"
53
  )
54
 
 
 
 
55
  # ── Configuration ──────────────────────────────────────────────────────────────
56
+ USE_ELEVENLABS_STT = True # True = ElevenLabs Scribe, False = Whisper
57
+
58
  _STT_MODEL = os.getenv("STT_MODEL", "large-v3")
59
  _COMPUTE_TYPE = os.getenv("STT_COMPUTE_TYPE", "int8_float32")
60
  _BATCH_WINDOW = float(os.getenv("STT_BATCH_WINDOW_MS", "30")) / 1000 # 30ms (was 50ms)
 
62
  _MODEL_LOAD_TIMEOUT = int(os.getenv("STT_MODEL_LOAD_TIMEOUT_S", "120")) # seconds
63
  MAX_INPUT_BYTES = 5_242_880 # 5 MB
64
 
65
+ ELEVENLABS_STT_MODEL_ID = os.getenv("ELEVENLABS_STT_MODEL_ID", "scribe_v2")
66
+ ELEVENLABS_STT_LANGUAGE = os.getenv("ELEVENLABS_STT_LANGUAGE", "bn")
67
+ ELEVENLABS_STT_TIMEOUT = float(os.getenv("ELEVENLABS_STT_TIMEOUT", "60"))
68
+
69
  # ── Singleton model state ──────────────────────────────────────────────────────
70
  _model: Optional[WhisperModel] = None
71
  _model_lock = threading.Lock()
 
113
  return buf
114
 
115
 
116
+ if not USE_ELEVENLABS_STT:
117
+ # Start background model load immediately at import
118
+ threading.Thread(target=_load_and_warm, daemon=True, name="whisper-loader").start()
119
+ else:
120
+ print("[STT] ElevenLabs STT enabled; Whisper model load skipped")
121
 
122
 
123
  # ── ffmpeg conversion (sync, runs in _ffmpeg_pool) ────────────────────────────
 
185
  condition_on_previous_text=False,
186
  temperature=0,
187
  suppress_tokens=[-1],
 
188
  no_speech_threshold=0.6,
189
  log_prob_threshold=-0.5,
190
  compression_ratio_threshold=2.4,
 
204
  return results
205
 
206
 
207
+ def _transcribe_elevenlabs_sync(wav_path: str) -> Optional[str]:
208
+ """
209
+ Transcribe the WAV file at wav_path with ElevenLabs Scribe: POST it to the REST speech-to-text endpoint and return the stripped "text" field (None if empty).
210
+ Blocking call (requests.post with a timeout), so callers run it in a worker thread; raises RuntimeError if the API key is missing or the HTTP response is not OK.
211
+ """
212
+ api_key = os.getenv("ELEVENLABS_API_KEY", "").strip()
213
+ if not api_key:
214
+ raise RuntimeError("[STT][ElevenLabs] ELEVENLABS_API_KEY missing")
215
+
216
+ url = "https://api.elevenlabs.io/v1/speech-to-text"
217
+ headers = {"xi-api-key": api_key}
218
+ data = {
219
+ "model_id": ELEVENLABS_STT_MODEL_ID,
220
+ "language_code": ELEVENLABS_STT_LANGUAGE,
221
+ }
222
+
223
+ with open(wav_path, "rb") as f:
224
+ files = {"file": f}
225
+ resp = requests.post(
226
+ url,
227
+ headers=headers,
228
+ data=data,
229
+ files=files,
230
+ timeout=ELEVENLABS_STT_TIMEOUT,
231
+ )
232
+
233
+ if not resp.ok:
234
+ raise RuntimeError(f"[STT][ElevenLabs] HTTP {resp.status_code}: {resp.text[:200]}")
235
+
236
+ payload = resp.json()
237
+ text = (payload.get("text") or "").strip()
238
+ lang = payload.get("language_code", "?")
239
+ prob = payload.get("language_probability", 0)
240
+ print(f"[STT][ElevenLabs] lang={lang} p={prob} → {text[:60]}")
241
+ return text or None
242
+
243
+
244
  # ── Hallucination / script validation ─────────────────────────────────────────
245
  def _validate(text: str) -> Optional[str]:
246
  if not text or not text.strip():
 
252
  return None
253
  if len(words) == 2 and words[0] == words[1]:
254
  return None
255
+ # Catch repeated-loop hallucinations like "আপনার সাথে ..." repeated many times.
256
+ for phrase_len in (2, 3, 4):
257
+ if len(words) >= phrase_len * 3:
258
+ phrase = words[:phrase_len]
259
+ if all(words[i:i + phrase_len] == phrase for i in range(0, phrase_len * 3, phrase_len)):
260
+ print(f"[STT] rejected looped phrase: {text[:60]}")
261
+ return None
262
  # Soft script check — log but keep
263
  wrong = len(_WRONG_SCRIPT_RE.findall(text))
264
  alpha = sum(1 for c in text if c.isalpha())
 
368
  """
369
 
370
  async def transcribe(self, audio_bytes: bytes) -> Optional[str]:
371
+ """Full pipeline: validate → ffmpeg (parallel) → Whisper or ElevenLabs STT."""
372
 
373
  if not audio_bytes or len(audio_bytes) < 300:
374
  print(f"[STT] Ignored tiny packet ({len(audio_bytes)} B)")
 
377
  if len(audio_bytes) > MAX_INPUT_BYTES:
378
  audio_bytes = audio_bytes[:MAX_INPUT_BYTES]
379
 
380
+ # ffmpeg: runs in parallel I/O pool (not serialised)
381
+ loop = asyncio.get_running_loop()
382
+ wav_path = await loop.run_in_executor(_ffmpeg_pool, _to_wav_sync, audio_bytes)
383
+ if not wav_path:
384
+ return None
385
+
386
+ if USE_ELEVENLABS_STT:
387
+ try:
388
+ text = await loop.run_in_executor(_ffmpeg_pool, _transcribe_elevenlabs_sync, wav_path)
389
+ return _validate(text) if text else None
390
+ finally:
391
+ if os.path.exists(wav_path):
392
+ try:
393
+ os.remove(wav_path)
394
+ except OSError:
395
+ pass
396
+
397
+ # Whisper path: wait for model with timeout — not forever
398
  if not _model_ready.is_set():
399
  print("[STT] Waiting for model to load…")
400
  ready = await asyncio.to_thread(_model_ready.wait, _MODEL_LOAD_TIMEOUT)
 
405
  if _model_error:
406
  raise RuntimeError(f"[STT] Whisper model failed to load: {_model_error}")
407
 
 
 
 
 
 
 
408
  # Batch GPU inference
409
  text = await _batch_worker.enqueue(wav_path)
410
  return _validate(text) if text else None
services/tts.py CHANGED
@@ -16,7 +16,7 @@ import os, re, asyncio
16
 
17
  load_dotenv()
18
 
19
- USE_ELEVENLABS = False
20
  EDGE_VOICE = "bn-BD-NabanitaNeural"
21
  ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY", "")
22
  ELEVENLABS_VOICE_ID = os.getenv("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")
 
16
 
17
  load_dotenv()
18
 
19
+ USE_ELEVENLABS = True
20
  EDGE_VOICE = "bn-BD-NabanitaNeural"
21
  ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY", "")
22
  ELEVENLABS_VOICE_ID = os.getenv("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")