rakib72642 committed on
Commit
089db7b
·
1 Parent(s): df44f62

adjusted silence and db settings

Browse files
Files changed (4) hide show
  1. .env +4 -0
  2. app.py +1 -0
  3. frontend/script.js +2 -2
  4. services/stt.py +59 -0
.env CHANGED
@@ -14,6 +14,10 @@ ELEVENLABS_MODEL_ID="eleven_v3"
14
 
15
  SMTP_PASSWORD="kjch nsve khty nrsc"
16
 
 
 
 
 
17
  # TWILIO_ACCOUNT_SID="ACfafc0d2d007bdf14b21bb3e14a7a7b31" # "4sMbMU3eBnL80hE0H20S"
18
  # TWILIO_AUTH_TOKEN="ed15fa98748c8c3d3d02cb54e431a187"
19
  # TWILIO_PHONE_NUMBER="+14343375085"
 
14
 
15
  SMTP_PASSWORD="kjch nsve khty nrsc"
16
 
17
+ # STT filtering: drop transcripts like "[silence]" / "[noise]" before they hit chat/LLM.
18
+ # Set to 0 to disable.
19
+ STT_DROP_NOISE_TRANSCRIPTS="1"
20
+
21
  # TWILIO_ACCOUNT_SID="ACfafc0d2d007bdf14b21bb3e14a7a7b31" # "4sMbMU3eBnL80hE0H20S"
22
  # TWILIO_AUTH_TOKEN="ed15fa98748c8c3d3d02cb54e431a187"
23
  # TWILIO_PHONE_NUMBER="+14343375085"
app.py CHANGED
@@ -355,6 +355,7 @@ async def ws_voice(ws: WebSocket):
355
  # ── STT ───────────────────────────────────────────────────────────────
356
  transcript = await stt.transcribe(audio_bytes)
357
  if not transcript:
 
358
  await _safe_text(ws, {"type": "error", "text": "কথা বুঝতে পারিনি, আবার বলুন।"})
359
  await _safe_text(ws, {"type": "end"})
360
  return
 
355
  # ── STT ───────────────────────────────────────────────────────────────
356
  transcript = await stt.transcribe(audio_bytes)
357
  if not transcript:
358
+ # Silence / background-noise turns should be ignored silently.
359
  await _safe_text(ws, {"type": "error", "text": "কথা বুঝতে পারিনি, আবার বলুন।"})
360
  await _safe_text(ws, {"type": "end"})
361
  return
frontend/script.js CHANGED
@@ -82,8 +82,8 @@ let _chatRetryTimer = null;
82
  let _voiceRetryTimer = null;
83
 
84
  // ─── VAD / recording settings ─────────────────────────────────────────────────
85
- let SILENCE_MS = 900; // default; user-adjustable in UI
86
- let SILENCE_DB = -38;
87
  const VAD_MS = 60;
88
  const MIN_SPEECH_MS = 320; // discard noise bursts shorter than this
89
 
 
82
  let _voiceRetryTimer = null;
83
 
84
  // ─── VAD / recording settings ─────────────────────────────────────────────────
85
+ let SILENCE_MS = 800; // default; user-adjustable in UI
86
+ let SILENCE_DB = -30;
87
  const VAD_MS = 60;
88
  const MIN_SPEECH_MS = 320; // discard noise bursts shorter than this
89
 
services/stt.py CHANGED
@@ -54,6 +54,7 @@ _WRONG_SCRIPT_RE = re.compile(
54
 
55
  # ── Configuration ──────────────────────────────────────────────────────────────
56
  USE_ELEVENLABS_STT = True # True = ElevenLabs Scribe, False = Whisper
 
57
 
58
  _STT_MODEL = os.getenv("STT_MODEL", "large-v3")
59
  _COMPUTE_TYPE = os.getenv("STT_COMPUTE_TYPE", "int8_float32")
@@ -242,10 +243,68 @@ def _transcribe_elevenlabs_sync(wav_path: str) -> Optional[str]:
242
 
243
 
244
  # ── Hallucination / script validation ─────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
  def _validate(text: str) -> Optional[str]:
246
  if not text or not text.strip():
247
  return None
248
  text = text.strip()
 
 
 
249
  words = text.split()
250
  if len(words) >= 6 and len(set(words)) / len(words) < 0.25:
251
  print(f"[STT] rejected repetition: {text[:60]}")
 
54
 
55
  # ── Configuration ──────────────────────────────────────────────────────────────
56
  USE_ELEVENLABS_STT = True # True = ElevenLabs Scribe, False = Whisper
57
# Master switch for dropping "[silence]" / "[noise]"-style transcripts.
# Read from STT_DROP_NOISE_TRANSCRIPTS; the comparison is case-insensitive so
# that "0", "false", "FALSE", "no" and "off" all disable filtering. Any other
# value (including an unset or empty variable) leaves filtering enabled.
DROP_NOISE_TRANSCRIPTS = os.getenv("STT_DROP_NOISE_TRANSCRIPTS", "1").strip().lower() not in (
    "0",
    "false",
    "no",
    "off",
)
58
 
59
  _STT_MODEL = os.getenv("STT_MODEL", "large-v3")
60
  _COMPUTE_TYPE = os.getenv("STT_COMPUTE_TYPE", "int8_float32")
 
243
 
244
 
245
  # ── Hallucination / script validation ─────────────────────────────────────────
246
+ _NOISE_TOKENS = {
247
+ "silence",
248
+ "[silence]",
249
+ "noise",
250
+ "[noise]",
251
+ "[background noise]",
252
+ "[background]",
253
+ "[music]",
254
+ "music",
255
+ "[ringing]",
256
+ "ringing",
257
+ "[phone ringing]",
258
+ "phone ringing",
259
+ "[clicking]",
260
+ "clicking",
261
+ "[breathing]",
262
+ "breathing",
263
+ "[inaudible]",
264
+ "inaudible",
265
+ "[crosstalk]",
266
+ "crosstalk",
267
+ }
268
+
269
+ _NOISE_BRACKET_RE = re.compile(r"^\[([^\]]+)\]$")
270
+ _HAS_LETTER_OR_DIGIT_RE = re.compile(r"[0-9A-Za-z\u0980-\u09FF]")
271
+
272
+
273
+ def _is_noise_transcript(text: str) -> bool:
274
+ """
275
+ Returns True when the transcript appears to be a non-user utterance such as
276
+ silence/background noise labels.
277
+ """
278
+ raw = (text or "").strip()
279
+ if not raw:
280
+ return True
281
+
282
+ low = raw.lower()
283
+ if low in _NOISE_TOKENS:
284
+ return True
285
+
286
+ m = _NOISE_BRACKET_RE.match(raw)
287
+ if m:
288
+ inner = m.group(1).strip().lower()
289
+ if inner in _NOISE_TOKENS:
290
+ return True
291
+ if any(k in inner for k in ("silence", "noise", "music", "ring", "click", "inaudible", "breath")):
292
+ return True
293
+
294
+ # Pure punctuation / symbols ("...", "β€”", etc.) should not trigger LLM turns.
295
+ if not _HAS_LETTER_OR_DIGIT_RE.search(raw):
296
+ return True
297
+
298
+ return False
299
+
300
+
301
  def _validate(text: str) -> Optional[str]:
302
  if not text or not text.strip():
303
  return None
304
  text = text.strip()
305
+ if DROP_NOISE_TRANSCRIPTS and _is_noise_transcript(text):
306
+ print(f"[STT] dropped noise/silence: {text[:60]}")
307
+ return None
308
  words = text.split()
309
  if len(words) >= 6 and len(set(words)) / len(words) < 0.25:
310
  print(f"[STT] rejected repetition: {text[:60]}")