Fixed STT and added Whisper and ElevenLabs STT

Browse files

Files changed (4) hide show

.env +2 -2
core/backend.py +3 -0
services/stt.py +75 -14
services/tts.py +1 -1

.env CHANGED Viewed

@@ -9,8 +9,8 @@ GOOGLE_API_KEY="AIzaSyA9sqz4YKQHKXR9TU1imw0DPOghzHOMiBo"
 ELEVENLABS_API_KEY="b3af3a938c8e15d5eae700ea47eea7d88dfe397f34fbd4b0c75c24f143b032b8"
-ELEVENLABS_VOICE_ID="iuABfyf7pRoBzuPqzUCt"
-ELEVENLABS_MODEL_ID="eleven_multilingual_v2"
 # TWILIO_ACCOUNT_SID="ACfafc0d2d007bdf14b21bb3e14a7a7b31"
 # TWILIO_AUTH_TOKEN="ed15fa98748c8c3d3d02cb54e431a187"

 ELEVENLABS_API_KEY="b3af3a938c8e15d5eae700ea47eea7d88dfe397f34fbd4b0c75c24f143b032b8"
+ELEVENLABS_VOICE_ID="4O1sYUnmtThcBoSBrri7"
+ELEVENLABS_MODEL_ID="eleven_v3"
 # TWILIO_ACCOUNT_SID="ACfafc0d2d007bdf14b21bb3e14a7a7b31"
 # TWILIO_AUTH_TOKEN="ed15fa98748c8c3d3d02cb54e431a187"

core/backend.py CHANGED Viewed

@@ -403,6 +403,9 @@ LANGUAGE RULE:
 - If Bangla → reply Bangla (বাংলা).
 - If Banglish → reply Bangla (বাংলা).
 - Never mix languages unless user mixes first.
 TOOLS:
 - Use backend tools if needed
 - Always confirm before final action

 - If Bangla → reply Bangla (বাংলা).
 - If Banglish → reply Bangla (বাংলা).
 - Never mix languages unless user mixes first.
+DOCTOR ID RULE:
+- Never generate or guess doctor_id.
+- doctor_id must only come from search_doctor tool output.
 TOOLS:
 - Use backend tools if needed
 - Always confirm before final action

services/stt.py CHANGED Viewed

@@ -43,6 +43,7 @@ from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass, field
 from typing import Optional
 from faster_whisper import WhisperModel
 # ── Bangla script patterns ─────────────────────────────────────────────────────
@@ -51,10 +52,9 @@ _WRONG_SCRIPT_RE = re.compile(
     r"[\u0600-\u06FF\u0750-\u077F\uFB50-\uFDFF\uFE70-\uFEFF]"
 )
-# Bangla decoder seed — keeps Whisper in বাংলা Unicode block
-_BANGLA_SEED = "আমি আপনার সাথে বাংলায় কথা বলছি।"
 # ── Configuration ──────────────────────────────────────────────────────────────
 _STT_MODEL      = os.getenv("STT_MODEL",          "large-v3")
 _COMPUTE_TYPE   = os.getenv("STT_COMPUTE_TYPE",   "int8_float32")
 _BATCH_WINDOW   = float(os.getenv("STT_BATCH_WINDOW_MS", "30")) / 1000  # 30ms (was 50ms)
@@ -62,6 +62,10 @@ _MAX_BATCH      = int(os.getenv("STT_MAX_BATCH",  "8"))
 _MODEL_LOAD_TIMEOUT = int(os.getenv("STT_MODEL_LOAD_TIMEOUT_S", "120"))  # seconds
 MAX_INPUT_BYTES = 5_242_880  # 5 MB
 # ── Singleton model state ──────────────────────────────────────────────────────
 _model: Optional[WhisperModel] = None
 _model_lock    = threading.Lock()
@@ -109,8 +113,11 @@ def _make_silence_wav(duration_s: float = 0.5, sr: int = 16_000) -> io.BytesIO:
     return buf
-# Start background model load immediately at import
-threading.Thread(target=_load_and_warm, daemon=True, name="whisper-loader").start()
 # ── ffmpeg conversion (sync, runs in _ffmpeg_pool) ────────────────────────────
@@ -178,7 +185,6 @@ def _transcribe_batch_sync(wav_paths: list[str]) -> list[Optional[str]]:
                 condition_on_previous_text=False,
                 temperature=0,
                 suppress_tokens=[-1],
-                initial_prompt=_BANGLA_SEED,
                 no_speech_threshold=0.6,
                 log_prob_threshold=-0.5,
                 compression_ratio_threshold=2.4,
@@ -198,6 +204,43 @@ def _transcribe_batch_sync(wav_paths: list[str]) -> list[Optional[str]]:
     return results
 # ── Hallucination / script validation ─────────────────────────────────────────
 def _validate(text: str) -> Optional[str]:
     if not text or not text.strip():
@@ -209,6 +252,13 @@ def _validate(text: str) -> Optional[str]:
         return None
     if len(words) == 2 and words[0] == words[1]:
         return None
     # Soft script check — log but keep
     wrong = len(_WRONG_SCRIPT_RE.findall(text))
     alpha = sum(1 for c in text if c.isalpha())
@@ -318,7 +368,7 @@ class STTProcessor:
     """
     async def transcribe(self, audio_bytes: bytes) -> Optional[str]:
-        """Full pipeline: validate → ffmpeg (parallel) → batch GPU inference."""
         if not audio_bytes or len(audio_bytes) < 300:
             print(f"[STT] Ignored tiny packet ({len(audio_bytes)} B)")
@@ -327,7 +377,24 @@ class STTProcessor:
         if len(audio_bytes) > MAX_INPUT_BYTES:
             audio_bytes = audio_bytes[:MAX_INPUT_BYTES]
-        # FIX-BUG6: wait for model with timeout — not forever
         if not _model_ready.is_set():
             print("[STT] Waiting for model to load…")
             ready = await asyncio.to_thread(_model_ready.wait, _MODEL_LOAD_TIMEOUT)
@@ -338,12 +405,6 @@ class STTProcessor:
             if _model_error:
                 raise RuntimeError(f"[STT] Whisper model failed to load: {_model_error}")
-        # ffmpeg: runs in parallel I/O pool (not serialised)
-        loop     = asyncio.get_running_loop()
-        wav_path = await loop.run_in_executor(_ffmpeg_pool, _to_wav_sync, audio_bytes)
-        if not wav_path:
-            return None
         # Batch GPU inference
         text = await _batch_worker.enqueue(wav_path)
         return _validate(text) if text else None

 from dataclasses import dataclass, field
 from typing import Optional
+import requests
 from faster_whisper import WhisperModel
 # ── Bangla script patterns ─────────────────────────────────────────────────────
     r"[\u0600-\u06FF\u0750-\u077F\uFB50-\uFDFF\uFE70-\uFEFF]"
 )
 # ── Configuration ──────────────────────────────────────────────────────────────
+USE_ELEVENLABS_STT = True  # True = ElevenLabs Scribe, False = Whisper
 _STT_MODEL      = os.getenv("STT_MODEL",          "large-v3")
 _COMPUTE_TYPE   = os.getenv("STT_COMPUTE_TYPE",   "int8_float32")
 _BATCH_WINDOW   = float(os.getenv("STT_BATCH_WINDOW_MS", "30")) / 1000  # 30ms (was 50ms)
 _MODEL_LOAD_TIMEOUT = int(os.getenv("STT_MODEL_LOAD_TIMEOUT_S", "120"))  # seconds
 MAX_INPUT_BYTES = 5_242_880  # 5 MB
+ELEVENLABS_STT_MODEL_ID = os.getenv("ELEVENLABS_STT_MODEL_ID", "scribe_v2")
+ELEVENLABS_STT_LANGUAGE = os.getenv("ELEVENLABS_STT_LANGUAGE", "bn")
+ELEVENLABS_STT_TIMEOUT = float(os.getenv("ELEVENLABS_STT_TIMEOUT", "60"))
 # ── Singleton model state ──────────────────────────────────────────────────────
 _model: Optional[WhisperModel] = None
 _model_lock    = threading.Lock()
     return buf
+if not USE_ELEVENLABS_STT:
+    # Start background model load immediately at import
+    threading.Thread(target=_load_and_warm, daemon=True, name="whisper-loader").start()
+else:
+    print("[STT] ElevenLabs STT enabled; Whisper model load skipped")
 # ── ffmpeg conversion (sync, runs in _ffmpeg_pool) ────────────────────────────
                 condition_on_previous_text=False,
                 temperature=0,
                 suppress_tokens=[-1],
                 no_speech_threshold=0.6,
                 log_prob_threshold=-0.5,
                 compression_ratio_threshold=2.4,
     return results
+def _transcribe_elevenlabs_sync(wav_path: str) -> Optional[str]:
+    """
+    Transcribe an audio file with the ElevenLabs speech-to-text REST API.
+
+    Blocking call (file read + HTTP POST); run it in a worker thread so the
+    async pipeline stays non-blocking.
+
+    Args:
+        wav_path: Path of the audio file to upload.
+
+    Returns:
+        The stripped transcript, or None when the response carries no text.
+
+    Raises:
+        RuntimeError: ELEVENLABS_API_KEY is unset, the API answers with an
+            error status, or the response body is not a JSON object.
+        OSError: wav_path cannot be opened.
+        requests.RequestException: network failure or timeout.
+    """
+    api_key = os.getenv("ELEVENLABS_API_KEY", "").strip()
+    if not api_key:
+        raise RuntimeError("[STT][ElevenLabs] ELEVENLABS_API_KEY missing")
+    url = "https://api.elevenlabs.io/v1/speech-to-text"
+    headers = {"xi-api-key": api_key}
+    data = {
+        "model_id": ELEVENLABS_STT_MODEL_ID,
+        "language_code": ELEVENLABS_STT_LANGUAGE,
+    }
+    # Keep the file handle open only for the duration of the upload.
+    with open(wav_path, "rb") as f:
+        resp = requests.post(
+            url,
+            headers=headers,
+            data=data,
+            files={"file": f},
+            timeout=ELEVENLABS_STT_TIMEOUT,
+        )
+    if not resp.ok:
+        raise RuntimeError(f"[STT][ElevenLabs] HTTP {resp.status_code}: {resp.text[:200]}")
+    # A 2xx reply with a non-JSON body (e.g. a proxy error page) would make
+    # resp.json() raise ValueError; surface it as the same RuntimeError family
+    # as every other failure of this function.
+    try:
+        payload = resp.json()
+    except ValueError as exc:
+        raise RuntimeError(
+            f"[STT][ElevenLabs] invalid JSON response: {resp.text[:200]}"
+        ) from exc
+    if not isinstance(payload, dict):
+        raise RuntimeError(
+            f"[STT][ElevenLabs] unexpected response type: {type(payload).__name__}"
+        )
+    text = (payload.get("text") or "").strip()
+    lang = payload.get("language_code", "?")
+    prob = payload.get("language_probability", 0)
+    print(f"[STT][ElevenLabs] lang={lang} p={prob} → {text[:60]}")
+    return text or None
 # ── Hallucination / script validation ─────────────────────────────────────────
 def _validate(text: str) -> Optional[str]:
     if not text or not text.strip():
         return None
     if len(words) == 2 and words[0] == words[1]:
         return None
+    # Catch repeated-loop hallucinations like "আপনার সাথে ..." repeated many times.
+    for phrase_len in (2, 3, 4):
+        if len(words) >= phrase_len * 3:
+            phrase = words[:phrase_len]
+            if all(words[i:i + phrase_len] == phrase for i in range(0, phrase_len * 3, phrase_len)):
+                print(f"[STT] rejected looped phrase: {text[:60]}")
+                return None
     # Soft script check — log but keep
     wrong = len(_WRONG_SCRIPT_RE.findall(text))
     alpha = sum(1 for c in text if c.isalpha())
     """
     async def transcribe(self, audio_bytes: bytes) -> Optional[str]:
+        """Full pipeline: validate → ffmpeg (parallel) → Whisper or ElevenLabs STT."""
         if not audio_bytes or len(audio_bytes) < 300:
             print(f"[STT] Ignored tiny packet ({len(audio_bytes)} B)")
         if len(audio_bytes) > MAX_INPUT_BYTES:
             audio_bytes = audio_bytes[:MAX_INPUT_BYTES]
+        # ffmpeg: runs in parallel I/O pool (not serialised)
+        loop     = asyncio.get_running_loop()
+        wav_path = await loop.run_in_executor(_ffmpeg_pool, _to_wav_sync, audio_bytes)
+        if not wav_path:
+            return None
+        if USE_ELEVENLABS_STT:
+            try:
+                text = await loop.run_in_executor(_ffmpeg_pool, _transcribe_elevenlabs_sync, wav_path)
+                return _validate(text) if text else None
+            finally:
+                if os.path.exists(wav_path):
+                    try:
+                        os.remove(wav_path)
+                    except OSError:
+                        pass
+        # Whisper path: wait for model with timeout — not forever
         if not _model_ready.is_set():
             print("[STT] Waiting for model to load…")
             ready = await asyncio.to_thread(_model_ready.wait, _MODEL_LOAD_TIMEOUT)
             if _model_error:
                 raise RuntimeError(f"[STT] Whisper model failed to load: {_model_error}")
         # Batch GPU inference
         text = await _batch_worker.enqueue(wav_path)
         return _validate(text) if text else None

services/tts.py CHANGED Viewed

@@ -16,7 +16,7 @@ import os, re, asyncio
 load_dotenv()
-USE_ELEVENLABS       = False
 EDGE_VOICE           = "bn-BD-NabanitaNeural"
 ELEVENLABS_API_KEY   = os.getenv("ELEVENLABS_API_KEY", "")
 ELEVENLABS_VOICE_ID  = os.getenv("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")

 load_dotenv()
+USE_ELEVENLABS       = True
 EDGE_VOICE           = "bn-BD-NabanitaNeural"
 ELEVENLABS_API_KEY   = os.getenv("ELEVENLABS_API_KEY", "")
 ELEVENLABS_VOICE_ID  = os.getenv("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")