Refactor voice handling for barge-in support and update UI text labels

Browse files

Files changed (4) hide show

app.py +14 -2
frontend/index.html +1 -1
frontend/script.js +158 -30
services/tts.py +2 -2

app.py CHANGED Viewed

@@ -392,18 +392,25 @@ async def ws_voice(ws: WebSocket):
         await _safe_text(ws, {"type": "end"})
     async def _utterance_worker():
         while True:
             audio_bytes = await _utterance_q.get()
             if audio_bytes is None:
                 break
             try:
-                await _handle_utterance(audio_bytes)
             except asyncio.CancelledError:
-                raise
             except Exception as exc:
                 print(f"[VOICE] Utterance worker error: {exc}")
                 await _safe_text(ws, {"type": "error", "text": str(exc)})
                 await _safe_text(ws, {"type": "end"})
     try:
         _worker_task = asyncio.create_task(_utterance_worker())
@@ -424,6 +431,11 @@ async def ws_voice(ws: WebSocket):
             if "bytes" in data and data["bytes"]:
                 audio_bytes = data["bytes"]
                 print(f"[VOICE] [{user_id}] Utterance: {len(audio_bytes):,} bytes")
                 await _utterance_q.put(audio_bytes)
             elif "text" in data and data["text"]:

         await _safe_text(ws, {"type": "end"})
     async def _utterance_worker():
+        nonlocal _active_task
         while True:
             audio_bytes = await _utterance_q.get()
             if audio_bytes is None:
                 break
             try:
+                # Run each utterance as a cancellable task so barge-in can
+                # immediately interrupt LLM+TTS mid-turn.
+                _active_task = asyncio.create_task(_handle_utterance(audio_bytes))
+                await _active_task
             except asyncio.CancelledError:
+                # Interruption is normal (client barge-in / cancel).
+                pass
             except Exception as exc:
                 print(f"[VOICE] Utterance worker error: {exc}")
                 await _safe_text(ws, {"type": "error", "text": str(exc)})
                 await _safe_text(ws, {"type": "end"})
+            finally:
+                _active_task = None
     try:
         _worker_task = asyncio.create_task(_utterance_worker())
             if "bytes" in data and data["bytes"]:
                 audio_bytes = data["bytes"]
                 print(f"[VOICE] [{user_id}] Utterance: {len(audio_bytes):,} bytes")
+                # If a turn is currently speaking, treat a new utterance as
+                # barge-in: cancel current output and drop any queued audio.
+                if _active_task is not None and not _active_task.done():
+                    await _cancel_active()
+                    await _drain_utterance_queue()
                 await _utterance_q.put(audio_bytes)
             elif "text" in data and data["text"]:

frontend/index.html CHANGED Viewed

@@ -247,7 +247,7 @@
       <div class="voice-row">
         <button id="mic-btn" class="mic-btn mic-off">
           <span class="mic-icon">🎤</span>
-          <span class="mic-label">Voice শুরু করুন</span>
         </button>
         <button id="stop-btn" class="stop-btn" title="Stop AI speech">
           <svg width="18" height="18" viewBox="0 0 24 24" fill="currentColor">

       <div class="voice-row">
         <button id="mic-btn" class="mic-btn mic-off">
           <span class="mic-icon">🎤</span>
+          <span class="mic-label">Start</span>
         </button>
         <button id="stop-btn" class="stop-btn" title="Stop AI speech">
           <svg width="18" height="18" viewBox="0 0 24 24" fill="currentColor">

frontend/script.js CHANGED Viewed

@@ -1,5 +1,3 @@
 'use strict';
 // ─── DOM refs ─────────────────────────────────────────────────────────────────
@@ -58,9 +56,10 @@ const USER_ID = (() => {
 const WS_BASES = (() => {
   const scheme = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
   const bases = [];
-  const host = window.location.host && window.location.host !== 'null'
-    ? `${scheme}//${window.location.host}`
-    : '';
   const push = (base) => {
     if (base && !bases.includes(base)) bases.push(base);
   };
@@ -83,10 +82,10 @@ let _chatRetryTimer = null;
 let _voiceRetryTimer = null;
 // ─── VAD / recording settings ─────────────────────────────────────────────────
-let SILENCE_MS = 1200; // BUG-FIX-B: was 450 ms
 let SILENCE_DB = -38;
-const VAD_MS = 80;
-const MIN_SPEECH_MS = 400; // discard noise bursts shorter than this
 // ─── Playback state ───────────────────────────────────────────────────────────
 let _ctx = null;
@@ -95,6 +94,12 @@ let _endTimer = null;
 let _cancelled = false;
 let _inFlight = 0;
 let _ttsPlaying = false;
 let brainMode = false;
 let brainVoiceActive = false;
 let brainRestartTimer = null;
@@ -293,8 +298,18 @@ function onChatMsg(ev) {
 // ── Voice WS handler ──────────────────────────────────────────────────────────
 function onVoiceMsg(ev) {
   if (ev.data instanceof ArrayBuffer) {
     _ttsPlaying = true;
-    enqueueAudio(ev.data);
     return;
   }
@@ -439,6 +454,19 @@ function _ctxEnsure() {
   return _ctx;
 }
 async function enqueueAudio(buf) {
   if (_cancelled) return;
   _inFlight++;
@@ -470,13 +498,23 @@ async function enqueueAudio(buf) {
   src.buffer = decoded;
   src.connect(ctx.destination);
   const now = ctx.currentTime;
-  const start = Math.max(now + 0.01, _schedEnd);
   src.start(start);
   _schedEnd = start + decoded.duration;
   src.onended = () => {
     _inFlight = Math.max(0, _inFlight - 1);
     _vizQ();
   };
   setState('speaking');
@@ -522,7 +560,13 @@ function _done() {
   if (brainMode && brainVoiceActive) {
     clearTimeout(brainAutoRestartTimer);
     brainAutoRestartTimer = setTimeout(() => {
-      if (!brainMode || !brainVoiceActive || isListening || isProcessing || isRecordingLocked) {
         return;
       }
       _brainResumeListening();
@@ -534,17 +578,55 @@ function _done() {
 function stopAllAudio() {
   _cancelled = true;
   _ttsPlaying = false;
   clearTimeout(_endTimer);
   _endTimer = null;
   _schedEnd = 0;
   _inFlight = 0;
   _vizQ();
-  if (_ctx && _ctx.state === 'running') _ctx.suspend().catch(() => {});
   if (voiceWS && voiceWS.readyState === WebSocket.OPEN) {
     voiceWS.send(JSON.stringify({ type: 'cancel' }));
   }
 }
 // ═══════════════════════════════════════════════════════════════════════════════
 //  TEXT CHAT
 // ═══════════════════════════════════════════════════════════════════════════════
@@ -707,8 +789,9 @@ function _resetVoiceState() {
 // ── VAD tick ──────────────────────────────────────────────────────────────────
 function vadTick() {
   if (!analyser) return;
-  if (_ttsPlaying) return; // mute during TTS playback
-  if (isProcessing || isRecordingLocked) return; // hard lock
   const buf = new Float32Array(analyser.frequencyBinCount);
   analyser.getFloatTimeDomainData(buf);
@@ -718,6 +801,27 @@ function vadTick() {
   const speech = db > SILENCE_DB;
   if (speech) {
     clearTimeout(silenceTimer);
     silenceTimer = null;
@@ -733,6 +837,7 @@ function vadTick() {
       console.log('[VAD] Speech detected — recording');
     }
   } else {
     if (isSpeaking && !silenceTimer) {
       silenceTimer = setTimeout(_onSilenceTimeout, SILENCE_MS);
     }
@@ -764,14 +869,19 @@ function _onSilenceTimeout() {
     `[VAD] Silence after ${speechDuration} ms — finalising utterance`,
   );
-  // Stop VAD before stopRecorder so no new speech detection during processing
-  clearInterval(vadInt);
-  clearInterval(vizInt);
-  vadInt = vizInt = null;
   // Lock state BEFORE stopRecorder (onstop may fire almost immediately)
   isSpeaking = false;
-  isListening = false;
   isProcessing = true;
   isRecordingLocked = true;
   _cancelled = false;
@@ -780,9 +890,9 @@ function _onSilenceTimeout() {
   tLlm = 0;
   tTts = 0;
-  micBtn.disabled = true;
-  setMic('processing');
-  setState('processing');
   stopRecorder(); // → triggers onstop asynchronously
 }
@@ -958,14 +1068,14 @@ function setState(s) {
 }
 const MIC_MAP = {
-  off: { cls: 'mic-off', label: 'Voice শুরু করুন', icon: '🎤' },
   listening: {
     cls: 'mic-listening',
-    label: 'শুনছি… (বাতিল করতে ক্লিক)',
     icon: '🟢',
   },
-  recording: { cls: 'mic-recording', label: 'বলছেন…', icon: '🔴' },
-  processing: { cls: 'mic-processing', label: 'প্রক্রিয়া করছে…', icon: '⏳' },
 };
 function setMic(s) {
@@ -1022,10 +1132,18 @@ function setBrainMode(on) {
     sidebarToggle.textContent = '›';
     chatBox.scrollTop = chatBox.scrollHeight;
     textInput.blur();
-    _brainModeSetSearch(isProcessing || isListening || isSpeaking || _ttsPlaying);
     if (!isListening && !isProcessing && !isRecordingLocked) {
       setTimeout(() => {
-        if (brainMode && brainVoiceActive && !isListening && !isProcessing && !isRecordingLocked) {
           _brainResumeListening();
         }
       }, 180);
@@ -1064,7 +1182,13 @@ function _brainSetTtsBubble(text, active = true) {
 }
 function _brainResumeListening() {
-  if (!brainMode || !brainVoiceActive || isListening || isProcessing || isRecordingLocked) {
     return;
   }
   if (micStream && analyserCtx && analyser) {
@@ -1093,7 +1217,11 @@ function _queueBrainReconnect() {
 }
 function _flushVoicePendingPackets() {
-  if (!voiceWS || voiceWS.readyState !== WebSocket.OPEN || !voicePendingPackets.length) {
     return;
   }
   const packets = voicePendingPackets.splice(0);

 'use strict';
 // ─── DOM refs ─────────────────────────────────────────────────────────────────
 const WS_BASES = (() => {
   const scheme = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
   const bases = [];
+  const host =
+    window.location.host && window.location.host !== 'null'
+      ? `${scheme}//${window.location.host}`
+      : '';
   const push = (base) => {
     if (base && !bases.includes(base)) bases.push(base);
   };
 let _voiceRetryTimer = null;
 // ─── VAD / recording settings ─────────────────────────────────────────────────
+let SILENCE_MS = 900; // default; user-adjustable in UI
 let SILENCE_DB = -38;
+const VAD_MS = 60;
+const MIN_SPEECH_MS = 320; // discard noise bursts shorter than this
 // ─── Playback state ───────────────────────────────────────────────────────────
 let _ctx = null;
 let _cancelled = false;
 let _inFlight = 0;
 let _ttsPlaying = false;
+let _activeSources = [];
+let _bargeInArmedAt = 0;
+let _bargeInFiredAt = 0;
+let _dropAudioUntil = 0;
+let _audioChain = Promise.resolve();
+let _playbackGen = 0;
 let brainMode = false;
 let brainVoiceActive = false;
 let brainRestartTimer = null;
 // ── Voice WS handler ──────────────────────────────────────────────────────────
 function onVoiceMsg(ev) {
   if (ev.data instanceof ArrayBuffer) {
+    if (Date.now() < _dropAudioUntil) return; // drop late packets after cancel
     _ttsPlaying = true;
+    // Ensure decode+schedule happens strictly in arrival order.
+    // decodeAudioData is async and can complete out-of-order otherwise.
+    const gen = _playbackGen;
+    _audioChain = _audioChain
+      .catch(() => {})
+      .then(() => {
+        if (gen !== _playbackGen) return;
+        if (_cancelled) return;
+        return enqueueAudio(ev.data);
+      });
     return;
   }
   return _ctx;
 }
+function _stopAllSources() {
+  const sources = _activeSources.splice(0);
+  for (const src of sources) {
+    try {
+      src.onended = null;
+      src.stop(0);
+    } catch {}
+    try {
+      src.disconnect();
+    } catch {}
+  }
+}
 async function enqueueAudio(buf) {
   if (_cancelled) return;
   _inFlight++;
   src.buffer = decoded;
   src.connect(ctx.destination);
   const now = ctx.currentTime;
+  // Tiny gap between chunks improves perceived naturalness (less "machine-gun").
+  const GAP_S = 0.015;
+  const start = Math.max(now + 0.01, _schedEnd + GAP_S);
+  if (_cancelled) {
+    _inFlight = Math.max(0, _inFlight - 1);
+    _vizQ();
+    return;
+  }
+  _activeSources.push(src);
   src.start(start);
   _schedEnd = start + decoded.duration;
   src.onended = () => {
     _inFlight = Math.max(0, _inFlight - 1);
     _vizQ();
+    const idx = _activeSources.indexOf(src);
+    if (idx >= 0) _activeSources.splice(idx, 1);
   };
   setState('speaking');
   if (brainMode && brainVoiceActive) {
     clearTimeout(brainAutoRestartTimer);
     brainAutoRestartTimer = setTimeout(() => {
+      if (
+        !brainMode ||
+        !brainVoiceActive ||
+        isListening ||
+        isProcessing ||
+        isRecordingLocked
+      ) {
         return;
       }
       _brainResumeListening();
 function stopAllAudio() {
   // Cancels playback: marks the session cancelled, stops every tracked
   // source, resets the scheduling counters, closes the AudioContext and,
   // when the voice socket is open, sends a { type: 'cancel' } message on it.
   _cancelled = true;
   _ttsPlaying = false;
   // onVoiceMsg drops binary frames that arrive before this timestamp.
+  _dropAudioUntil = Date.now() + 700;
   // Chained decode callbacks compare their captured generation with this
   // counter and bail out when it has moved on.
+  _playbackGen++;
+  _audioChain = Promise.resolve();
+  _stopAllSources();
   clearTimeout(_endTimer);
   _endTimer = null;
   _schedEnd = 0;
   _inFlight = 0;
   _vizQ();
+  if (_ctx && _ctx.state !== 'closed') {
+    // Close releases scheduled audio immediately; a new ctx is created on demand.
+    _ctx.close().catch(() => {});
+  }
+  _ctx = null;
   if (voiceWS && voiceWS.readyState === WebSocket.OPEN) {
     voiceWS.send(JSON.stringify({ type: 'cancel' }));
   }
 }
+function _bargeInNow(reason = 'speech') {
   // Interrupts the current turn: stops all audio, clears the processing and
   // recording locks plus the pending AI text, then resumes or starts
   // listening. `reason` is only used in the console log line. A call made
   // within 500 ms of the previously accepted one is ignored.
+  const now = Date.now();
+  if (now - _bargeInFiredAt < 500) return; // debounce
+  _bargeInFiredAt = now;
+  console.log('[BargeIn] interrupt:', reason);
+  stopAllAudio();
+  // Unlock immediately so the user can speak right away.
+  isProcessing = false;
+  isRecordingLocked = false;
   // stopAllAudio() set _cancelled; clear it again because enqueueAudio()
   // returns early while it is true.
+  _cancelled = false;
+  aiEl = null;
+  aiTxt = '';
+  _setCaption('');
+  _removeThinking();
+  micBtn.disabled = false;
+  // If mic is already warm (brain continuous mode), just re-arm VAD.
+  if (brainMode && brainVoiceActive) {
+    _brainModeSetSearch(false);
+    // If analyser/mic are already active, VAD tick will immediately
+    // transition into recording on the next speech sample.
     // NOTE(review): _brainResumeListening() returns early while isListening
     // is true, and _onSilenceTimeout keeps isListening true in brain mode,
     // so this call looks like a no-op on that path — confirm intended.
+    _brainResumeListening();
+    return;
+  }
+  // Otherwise, start listening fresh (user initiated by speaking).
+  startListening().catch(() => {});
+}
 // ═══════════════════════════════════════════════════════════════════════════════
 //  TEXT CHAT
 // ═══════════════════════════════════════════════════════════════════════════════
 // ── VAD tick ──────────────────────────────────────────────────────────────────
 function vadTick() {
   if (!analyser) return;
+  // In brain mode we allow "barge-in": user speech interrupts TTS playback.
+  // In non-brain mode we still keep the hard lock to prevent overlapping turns.
+  if (!brainMode && (isProcessing || isRecordingLocked)) return;
   const buf = new Float32Array(analyser.frequencyBinCount);
   analyser.getFloatTimeDomainData(buf);
   const speech = db > SILENCE_DB;
   if (speech) {
+    // ── Barge-in detector ────────────────────────────────────────────────
+    if (brainMode && brainVoiceActive && (_ttsPlaying || isProcessing || isRecordingLocked)) {
+      // Stricter threshold reduces false triggers from echo + noise.
+      const loud = db > SILENCE_DB + 4;
+      if (loud) {
+        if (!_bargeInArmedAt) _bargeInArmedAt = Date.now();
+        if (Date.now() - _bargeInArmedAt >= 90) {
+          _bargeInArmedAt = 0;
+          _bargeInNow(_ttsPlaying ? 'vad_tts' : 'vad_thinking');
+          // After barge-in unlock, continue into the normal recording start
+          // path in this same tick.
+        } else {
+          // Don't start recording until we confirm it’s real barge-in speech.
+          return;
+        }
+      } else {
+        _bargeInArmedAt = 0;
+        return;
+      }
+    }
     clearTimeout(silenceTimer);
     silenceTimer = null;
       console.log('[VAD] Speech detected — recording');
     }
   } else {
+    _bargeInArmedAt = 0;
     if (isSpeaking && !silenceTimer) {
       silenceTimer = setTimeout(_onSilenceTimeout, SILENCE_MS);
     }
     `[VAD] Silence after ${speechDuration} ms — finalising utterance`,
   );
+  const keepBrainMicWarm = brainMode && brainVoiceActive;
+  // In brain mode we keep VAD running so we can detect barge-in while the AI is
+  // thinking/speaking. Outside brain mode we stop VAD during processing.
+  if (!keepBrainMicWarm) {
+    clearInterval(vadInt);
+    clearInterval(vizInt);
+    vadInt = vizInt = null;
+  }
   // Lock state BEFORE stopRecorder (onstop may fire almost immediately)
   isSpeaking = false;
+  isListening = keepBrainMicWarm; // mic stays "hot" in brain mode
   isProcessing = true;
   isRecordingLocked = true;
   _cancelled = false;
   tLlm = 0;
   tTts = 0;
+  micBtn.disabled = !keepBrainMicWarm;
+  setMic(keepBrainMicWarm ? 'listening' : 'processing');
+  setState(keepBrainMicWarm ? 'listening' : 'processing');
   stopRecorder(); // → triggers onstop asynchronously
 }
 }
 const MIC_MAP = {
   // Maps each mic state name to a CSS class, a label string and an icon
   // (presumably applied to the mic button by setMic — confirm).
   // NOTE(review): 'listening' and 'recording' carry nearly identical labels
   // ('Listening...' vs 'Listening..') — confirm this is intended.
+  off: { cls: 'mic-off', label: 'Press to Start talking', icon: '🎤' },
   listening: {
     cls: 'mic-listening',
+    label: 'Listening...',
     icon: '🟢',
   },
+  recording: { cls: 'mic-recording', label: 'Listening..', icon: '🔴' },
+  processing: { cls: 'mic-processing', label: 'Please wait !!!', icon: '⏳' },
 };
 function setMic(s) {
     sidebarToggle.textContent = '›';
     chatBox.scrollTop = chatBox.scrollHeight;
     textInput.blur();
+    _brainModeSetSearch(
+      isProcessing || isListening || isSpeaking || _ttsPlaying,
+    );
     if (!isListening && !isProcessing && !isRecordingLocked) {
       setTimeout(() => {
+        if (
+          brainMode &&
+          brainVoiceActive &&
+          !isListening &&
+          !isProcessing &&
+          !isRecordingLocked
+        ) {
           _brainResumeListening();
         }
       }, 180);
 }
 function _brainResumeListening() {
+  if (
+    !brainMode ||
+    !brainVoiceActive ||
+    isListening ||
+    isProcessing ||
+    isRecordingLocked
+  ) {
     return;
   }
   if (micStream && analyserCtx && analyser) {
 }
 function _flushVoicePendingPackets() {
+  if (
+    !voiceWS ||
+    voiceWS.readyState !== WebSocket.OPEN ||
+    !voicePendingPackets.length
+  ) {
     return;
   }
   const packets = voicePendingPackets.splice(0);

services/tts.py CHANGED Viewed

@@ -21,7 +21,7 @@ EDGE_VOICE           = "bn-BD-NabanitaNeural"
 ELEVENLABS_API_KEY   = os.getenv("ELEVENLABS_API_KEY", "")
 ELEVENLABS_VOICE_ID  = os.getenv("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")
 ELEVENLABS_MODEL_ID  = os.getenv("ELEVENLABS_MODEL_ID", "eleven_multilingual_v2")
-ELEVENLABS_SPEED     = float(os.getenv("ELEVENLABS_SPEED", "1.08"))
 ELEVENLABS_OUTPUT_FORMAT = "mp3_22050_32"
 ELEVENLABS_STABILITY  = 0.45
 ELEVENLABS_SIMILARITY = 0.80
@@ -139,7 +139,7 @@ async def _elevenlabs_stream(
 async def text_to_speech_stream(
     text: str,
     voice: str | None = None,
-    rate: str = "+8%",
 ):
     """
     Stream TTS audio for `text`.

 ELEVENLABS_API_KEY   = os.getenv("ELEVENLABS_API_KEY", "")
 ELEVENLABS_VOICE_ID  = os.getenv("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")
 ELEVENLABS_MODEL_ID  = os.getenv("ELEVENLABS_MODEL_ID", "eleven_multilingual_v2")
+ELEVENLABS_SPEED     = float(os.getenv("ELEVENLABS_SPEED", "1.05"))
 ELEVENLABS_OUTPUT_FORMAT = "mp3_22050_32"
 ELEVENLABS_STABILITY  = 0.45
 ELEVENLABS_SIMILARITY = 0.80
 async def text_to_speech_stream(
     text: str,
     voice: str | None = None,
+    rate: str = "+4%",
 ):
     """
     Stream TTS audio for `text`.