HawkEyesAI
/

Voice-AI-Agent

Model card Files Files and versions

xet

Community

rakib72642 committed about 15 hours ago

Commit

089db7b

1 Parent(s): df44f62

adjusted silence and db settings

Browse files

Files changed (4) hide show

.env +4 -0
app.py +1 -0
frontend/script.js +2 -2
services/stt.py +59 -0

.env CHANGED Viewed

@@ -14,6 +14,10 @@ ELEVENLABS_MODEL_ID="eleven_v3"
 SMTP_PASSWORD="kjch nsve khty nrsc"
 # TWILIO_ACCOUNT_SID="ACfafc0d2d007bdf14b21bb3e14a7a7b31" # "4sMbMU3eBnL80hE0H20S"
 # TWILIO_AUTH_TOKEN="ed15fa98748c8c3d3d02cb54e431a187"
 # TWILIO_PHONE_NUMBER="+14343375085"

 SMTP_PASSWORD="kjch nsve khty nrsc"
+# STT filtering: drop transcripts like "[silence]" / "[noise]" before they hit chat/LLM.
+# Set to 0 (or false / False) to disable; any other value keeps the filter on.
+STT_DROP_NOISE_TRANSCRIPTS="1"
 # TWILIO_ACCOUNT_SID="ACfafc0d2d007bdf14b21bb3e14a7a7b31" # "4sMbMU3eBnL80hE0H20S"
 # TWILIO_AUTH_TOKEN="ed15fa98748c8c3d3d02cb54e431a187"
 # TWILIO_PHONE_NUMBER="+14343375085"

app.py CHANGED Viewed

@@ -355,6 +355,7 @@ async def ws_voice(ws: WebSocket):
         # ── STT ───────────────────────────────────────────────────────────────
         transcript = await stt.transcribe(audio_bytes)
         if not transcript:
             await _safe_text(ws, {"type": "error", "text": "কথা বুঝতে পারিনি, আবার বলুন।"})
             await _safe_text(ws, {"type": "end"})
             return

         # ── STT ───────────────────────────────────────────────────────────────
         transcript = await stt.transcribe(audio_bytes)
         if not transcript:
+            # NOTE(review): silence / background-noise turns are meant to be ignored silently, but this branch still sends the error reply below.
             await _safe_text(ws, {"type": "error", "text": "কথা বুঝতে পারিনি, আবার বলুন।"})
             await _safe_text(ws, {"type": "end"})
             return

frontend/script.js CHANGED Viewed

@@ -82,8 +82,8 @@ let _chatRetryTimer = null;
 let _voiceRetryTimer = null;
 // ─── VAD / recording settings ─────────────────────────────────────────────────
-let SILENCE_MS = 900; // default; user-adjustable in UI
-let SILENCE_DB = -38;
 const VAD_MS = 60;
 const MIN_SPEECH_MS = 320; // discard noise bursts shorter than this

 let _voiceRetryTimer = null;
 // ─── VAD / recording settings ─────────────────────────────────────────────────
+let SILENCE_MS = 800; // default; user-adjustable in UI
+let SILENCE_DB = -30;
 const VAD_MS = 60;
 const MIN_SPEECH_MS = 320; // discard noise bursts shorter than this

services/stt.py CHANGED Viewed

@@ -54,6 +54,7 @@ _WRONG_SCRIPT_RE = re.compile(
 # ── Configuration ──────────────────────────────────────────────────────────────
 USE_ELEVENLABS_STT = True  # True = ElevenLabs Scribe, False = Whisper
 _STT_MODEL      = os.getenv("STT_MODEL",          "large-v3")
 _COMPUTE_TYPE   = os.getenv("STT_COMPUTE_TYPE",   "int8_float32")
@@ -242,10 +243,68 @@ def _transcribe_elevenlabs_sync(wav_path: str) -> Optional[str]:
 # ── Hallucination / script validation ─────────────────────────────────────────
 def _validate(text: str) -> Optional[str]:
     if not text or not text.strip():
         return None
     text  = text.strip()
     words = text.split()
     if len(words) >= 6 and len(set(words)) / len(words) < 0.25:
         print(f"[STT] rejected repetition: {text[:60]}")

 # ── Configuration ──────────────────────────────────────────────────────────────
 USE_ELEVENLABS_STT = True  # True = ElevenLabs Scribe, False = Whisper
+DROP_NOISE_TRANSCRIPTS = os.getenv("STT_DROP_NOISE_TRANSCRIPTS", "1").strip() not in ("0", "false", "False")
 _STT_MODEL      = os.getenv("STT_MODEL",          "large-v3")
 _COMPUTE_TYPE   = os.getenv("STT_COMPUTE_TYPE",   "int8_float32")
 # ── Hallucination / script validation ─────────────────────────────────────────
# Labels that mark non-speech audio. Compared case-insensitively against the
# whole stripped transcript and against the inside of a "[...]" / "(...)" tag.
_NOISE_TOKENS = {
    "silence",
    "[silence]",
    "noise",
    "[noise]",
    "[background noise]",
    "[background]",
    "[music]",
    "music",
    "[ringing]",
    "ringing",
    "[phone ringing]",
    "phone ringing",
    "[clicking]",
    "clicking",
    "[breathing]",
    "breathing",
    "[inaudible]",
    "inaudible",
    "[crosstalk]",
    "crosstalk",
}
# Substrings that mark a tag as non-speech even when its exact wording is not
# listed above (e.g. "[loud music playing]").
_NOISE_KEYWORDS = ("silence", "noise", "music", "ring", "click", "inaudible", "breath")
# A transcript that consists of exactly one "[...]" tag.
_NOISE_BRACKET_RE = re.compile(r"^\[([^\]]+)\]$")
# Any "[...]" or "(...)" tag embedded in a transcript. Parentheses are covered
# because some STT engines label audio events that way instead of brackets.
_NOISE_TAG_RE = re.compile(r"\[([^\[\]]+)\]|\(([^()]+)\)")
# ASCII letters/digits plus the Bengali Unicode block (U+0980-U+09FF).
_HAS_LETTER_OR_DIGIT_RE = re.compile(r"[0-9A-Za-z\u0980-\u09FF]")


def _is_noise_label(label: str) -> bool:
    """Return True when the text inside a tag names silence/background noise."""
    inner = label.strip().lower()
    return inner in _NOISE_TOKENS or any(k in inner for k in _NOISE_KEYWORDS)


def _blank_noise_tag(match: "re.Match[str]") -> str:
    """Replace a noise tag with a space; leave any other tag untouched."""
    label = match.group(1) or match.group(2)
    return " " if _is_noise_label(label) else match.group(0)


def _is_noise_transcript(text: str) -> bool:
    """
    Return True when the transcript appears to be a non-user utterance such as
    silence/background-noise labels.

    A transcript counts as noise when, after stripping whitespace, it is:
      * empty (``None`` is treated as empty);
      * exactly one of ``_NOISE_TOKENS`` (case-insensitive);
      * a single ``[...]`` tag whose content is a noise label; or
      * left without any ASCII/Bengali letter or digit once every noise tag
        (``[...]`` or ``(...)``) has been removed. This covers pure
        punctuation, several tags in a row ("[silence] [noise]") and a tag
        followed by punctuation ("[silence].").

    Real speech that merely contains a noise tag ("[music] hello") is kept.
    """
    raw = (text or "").strip()
    if not raw:
        return True
    if raw.lower() in _NOISE_TOKENS:
        return True
    whole = _NOISE_BRACKET_RE.match(raw)
    if whole and _is_noise_label(whole.group(1)):
        return True
    # Drop every noise tag; whatever remains must contain a real letter or
    # digit for the turn to be worth sending to the LLM.
    remainder = _NOISE_TAG_RE.sub(_blank_noise_tag, raw)
    return _HAS_LETTER_OR_DIGIT_RE.search(remainder) is None
 def _validate(text: str) -> Optional[str]:
     if not text or not text.strip():
         return None
     text  = text.strip()
+    if DROP_NOISE_TRANSCRIPTS and _is_noise_transcript(text):
+        print(f"[STT] dropped noise/silence: {text[:60]}")
+        return None
     words = text.split()
     if len(words) >= 6 and len(set(words)) / len(words) < 0.25:
         print(f"[STT] rejected repetition: {text[:60]}")