Commit ·
089db7b
1
Parent(s): df44f62
adjusted silence and db settings
Browse files- .env +4 -0
- app.py +1 -0
- frontend/script.js +2 -2
- services/stt.py +59 -0
.env
CHANGED
|
@@ -14,6 +14,10 @@ ELEVENLABS_MODEL_ID="eleven_v3"
|
|
| 14 |
|
| 15 |
SMTP_PASSWORD="kjch nsve khty nrsc"
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
# TWILIO_ACCOUNT_SID="ACfafc0d2d007bdf14b21bb3e14a7a7b31" # "4sMbMU3eBnL80hE0H20S"
|
| 18 |
# TWILIO_AUTH_TOKEN="ed15fa98748c8c3d3d02cb54e431a187"
|
| 19 |
# TWILIO_PHONE_NUMBER="+14343375085"
|
|
|
|
| 14 |
|
| 15 |
SMTP_PASSWORD="kjch nsve khty nrsc"
|
| 16 |
|
| 17 |
+
# STT filtering: drop transcripts like "[silence]" / "[noise]" before they hit chat/LLM.
|
| 18 |
+
# Set to 0 to disable.
|
| 19 |
+
STT_DROP_NOISE_TRANSCRIPTS="1"
|
| 20 |
+
|
| 21 |
# TWILIO_ACCOUNT_SID="ACfafc0d2d007bdf14b21bb3e14a7a7b31" # "4sMbMU3eBnL80hE0H20S"
|
| 22 |
# TWILIO_AUTH_TOKEN="ed15fa98748c8c3d3d02cb54e431a187"
|
| 23 |
# TWILIO_PHONE_NUMBER="+14343375085"
|
app.py
CHANGED
|
@@ -355,6 +355,7 @@ async def ws_voice(ws: WebSocket):
|
|
| 355 |
# ── STT ───────────────────────────────────────────────────────────────
|
| 356 |
transcript = await stt.transcribe(audio_bytes)
|
| 357 |
if not transcript:
|
|
|
|
| 358 |
await _safe_text(ws, {"type": "error", "text": "কথা বুঝতে পারিনি, আবার বলুন।"})
|
| 359 |
await _safe_text(ws, {"type": "end"})
|
| 360 |
return
|
|
|
|
| 355 |
# ── STT ───────────────────────────────────────────────────────────────
|
| 356 |
transcript = await stt.transcribe(audio_bytes)
|
| 357 |
if not transcript:
|
| 358 |
+
# TODO: silence / background-noise turns (empty transcript) should be ignored silently, but the code below still sends the error reply.
|
| 359 |
await _safe_text(ws, {"type": "error", "text": "কথা বুঝতে পারিনি, আবার বলুন।"})
|
| 360 |
await _safe_text(ws, {"type": "end"})
|
| 361 |
return
|
frontend/script.js
CHANGED
|
@@ -82,8 +82,8 @@ let _chatRetryTimer = null;
|
|
| 82 |
let _voiceRetryTimer = null;
|
| 83 |
|
| 84 |
// ─── VAD / recording settings ─────────────────────────────────────────────────
|
| 85 |
-
let SILENCE_MS =
|
| 86 |
-
let SILENCE_DB = -
|
| 87 |
const VAD_MS = 60;
|
| 88 |
const MIN_SPEECH_MS = 320; // discard noise bursts shorter than this
|
| 89 |
|
|
|
|
| 82 |
let _voiceRetryTimer = null;
|
| 83 |
|
| 84 |
// ─── VAD / recording settings ─────────────────────────────────────────────────
|
| 85 |
+
let SILENCE_MS = 800; // default; user-adjustable in UI
|
| 86 |
+
let SILENCE_DB = -30;
|
| 87 |
const VAD_MS = 60;
|
| 88 |
const MIN_SPEECH_MS = 320; // discard noise bursts shorter than this
|
| 89 |
|
services/stt.py
CHANGED
|
@@ -54,6 +54,7 @@ _WRONG_SCRIPT_RE = re.compile(
|
|
| 54 |
|
| 55 |
# ── Configuration ──────────────────────────────────────────────────────────────
|
| 56 |
USE_ELEVENLABS_STT = True # True = ElevenLabs Scribe, False = Whisper
|
|
|
|
| 57 |
|
| 58 |
_STT_MODEL = os.getenv("STT_MODEL", "large-v3")
|
| 59 |
_COMPUTE_TYPE = os.getenv("STT_COMPUTE_TYPE", "int8_float32")
|
|
@@ -242,10 +243,68 @@ def _transcribe_elevenlabs_sync(wav_path: str) -> Optional[str]:
|
|
| 242 |
|
| 243 |
|
| 244 |
# ── Hallucination / script validation ─────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
def _validate(text: str) -> Optional[str]:
|
| 246 |
if not text or not text.strip():
|
| 247 |
return None
|
| 248 |
text = text.strip()
|
|
|
|
|
|
|
|
|
|
| 249 |
words = text.split()
|
| 250 |
if len(words) >= 6 and len(set(words)) / len(words) < 0.25:
|
| 251 |
print(f"[STT] rejected repetition: {text[:60]}")
|
|
|
|
| 54 |
|
| 55 |
# ── Configuration ──────────────────────────────────────────────────────────────
|
| 56 |
USE_ELEVENLABS_STT = True # True = ElevenLabs Scribe, False = Whisper
|
| 57 |
+
# Noise/silence transcript filtering is enabled unless STT_DROP_NOISE_TRANSCRIPTS is set to a falsy value; the value is lower-cased first so "FALSE", "No" or "Off" disable it too.
DROP_NOISE_TRANSCRIPTS = os.getenv("STT_DROP_NOISE_TRANSCRIPTS", "1").strip().lower() not in ("0", "false", "no", "off")
|
| 58 |
|
| 59 |
_STT_MODEL = os.getenv("STT_MODEL", "large-v3")
|
| 60 |
_COMPUTE_TYPE = os.getenv("STT_COMPUTE_TYPE", "int8_float32")
|
|
|
|
| 243 |
|
| 244 |
|
| 245 |
# ── Hallucination / script validation ─────────────────────────────────────────
|
| 246 |
+
_NOISE_TOKENS = {
|
| 247 |
+
"silence",
|
| 248 |
+
"[silence]",
|
| 249 |
+
"noise",
|
| 250 |
+
"[noise]",
|
| 251 |
+
"[background noise]",
|
| 252 |
+
"[background]",
|
| 253 |
+
"[music]",
|
| 254 |
+
"music",
|
| 255 |
+
"[ringing]",
|
| 256 |
+
"ringing",
|
| 257 |
+
"[phone ringing]",
|
| 258 |
+
"phone ringing",
|
| 259 |
+
"[clicking]",
|
| 260 |
+
"clicking",
|
| 261 |
+
"[breathing]",
|
| 262 |
+
"breathing",
|
| 263 |
+
"[inaudible]",
|
| 264 |
+
"inaudible",
|
| 265 |
+
"[crosstalk]",
|
| 266 |
+
"crosstalk",
|
| 267 |
+
}
|
| 268 |
+
|
| 269 |
+
_NOISE_BRACKET_RE = re.compile(r"^\[([^\]]+)\]$")
|
| 270 |
+
_HAS_LETTER_OR_DIGIT_RE = re.compile(r"[0-9A-Za-z\u0980-\u09FF]")
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
def _is_noise_transcript(text: str) -> bool:
|
| 274 |
+
"""
|
| 275 |
+
Returns True when the transcript appears to be a non-user utterance such as
|
| 276 |
+
silence/background noise labels.
|
| 277 |
+
"""
|
| 278 |
+
raw = (text or "").strip()
|
| 279 |
+
if not raw:
|
| 280 |
+
return True
|
| 281 |
+
|
| 282 |
+
low = raw.lower()
|
| 283 |
+
if low in _NOISE_TOKENS:
|
| 284 |
+
return True
|
| 285 |
+
|
| 286 |
+
m = _NOISE_BRACKET_RE.match(raw)
|
| 287 |
+
if m:
|
| 288 |
+
inner = m.group(1).strip().lower()
|
| 289 |
+
if inner in _NOISE_TOKENS:
|
| 290 |
+
return True
|
| 291 |
+
if any(k in inner for k in ("silence", "noise", "music", "ring", "click", "inaudible", "breath")):
|
| 292 |
+
return True
|
| 293 |
+
|
| 294 |
+
# Pure punctuation / symbols ("...", "β", etc.) should not trigger LLM turns.
|
| 295 |
+
if not _HAS_LETTER_OR_DIGIT_RE.search(raw):
|
| 296 |
+
return True
|
| 297 |
+
|
| 298 |
+
return False
|
| 299 |
+
|
| 300 |
+
|
| 301 |
def _validate(text: str) -> Optional[str]:
|
| 302 |
if not text or not text.strip():
|
| 303 |
return None
|
| 304 |
text = text.strip()
|
| 305 |
+
if DROP_NOISE_TRANSCRIPTS and _is_noise_transcript(text):
|
| 306 |
+
print(f"[STT] dropped noise/silence: {text[:60]}")
|
| 307 |
+
return None
|
| 308 |
words = text.split()
|
| 309 |
if len(words) >= 6 and len(set(words)) / len(words) < 0.25:
|
| 310 |
print(f"[STT] rejected repetition: {text[:60]}")
|