updated js+stream+stt+app

Browse files

Files changed (4) hide show

app.py +62 -34
frontend/script.js +89 -19
services/streaming.py +116 -206
services/stt.py +162 -93

app.py CHANGED Viewed

@@ -1,3 +1,18 @@
 import asyncio
 import json
@@ -40,9 +55,8 @@ async def root():
     return HTMLResponse("<h2>index.html not found</h2>", status_code=404)
-# ── Helpers ───────────────────────────────────────────────────────────────────
 def _ws_open(ws: WebSocket) -> bool:
-    """Return True if the WebSocket connection is still alive."""
     return ws.client_state == WebSocketState.CONNECTED
@@ -66,7 +80,7 @@ async def _safe_bytes(ws: WebSocket, data: bytes) -> bool:
         return False
-# ── Text chat WebSocket ───────────────────────────────────────────────────────
 @app.websocket("/ws/chat")
 async def ws_chat(ws: WebSocket):
     await ws.accept()
@@ -91,34 +105,31 @@ async def ws_chat(ws: WebSocket):
                 async for token in stream:
                     full_response += token
                 await _safe_text(ws, {"type": "chat", "text": full_response})
-            except Exception as e:
-                print(f"[CHAT] AI error: {e}")
-                await _safe_text(ws, {"type": "error", "text": str(e)})
             await _safe_text(ws, {"type": "end"})
     except WebSocketDisconnect:
         print("[CHAT] Client disconnected")
-    except Exception as e:
-        if "disconnect" not in str(e).lower():
-            print(f"[CHAT] WS error: {e}")
-# ── Voice WebSocket ───────────────────────────────────────────────────────────
 @app.websocket("/ws/voice")
 async def ws_voice(ws: WebSocket):
     await ws.accept()
     print("[VOICE] Client connected")
-    stt     = STTProcessor()
-    user_id = "voice_user"
     try:
         while True:
-            # ── FIX: Check connection state before every receive ──────────────
-            # The previous crash "Cannot call receive once a disconnect message
-            # has been received" happened because we called ws.receive() after
-            # the client had already disconnected. Now we check first.
             if not _ws_open(ws):
                 print("[VOICE] Connection dropped, exiting handler.")
                 break
@@ -128,28 +139,32 @@ async def ws_voice(ws: WebSocket):
             except WebSocketDisconnect:
                 print("[VOICE] Client disconnected.")
                 break
-            except Exception as e:
-                # Catches starlette's internal disconnect errors gracefully
-                if "disconnect" in str(e).lower():
                     print("[VOICE] Client disconnected (recv error).")
                 else:
-                    print(f"[VOICE] Receive error: {e}")
                 break
-            # ── Audio blob from VAD ───────────────────────────────────────────
             if "bytes" in data and data["bytes"]:
                 audio_bytes = data["bytes"]
                 print(f"[VOICE] Received utterance: {len(audio_bytes):,} bytes")
-                # 1. STT — in thread so event loop isn't blocked
-                transcript = await asyncio.to_thread(stt.transcribe, audio_bytes)
                 if not transcript:
                     await _safe_text(ws, {
                         "type": "error",
                         "text": "কথা বুঝতে পারিনি, আবার বলুন।"
                     })
-                    # Send 'end' so client's isProcessing resets and VAD resumes
                     await _safe_text(ws, {"type": "end"})
                     continue
@@ -158,9 +173,10 @@ async def ws_voice(ws: WebSocket):
                     break
                 # 2. AI + TTS pipeline
-                tts_streamer = ParallelTTSStreamer()
-                async def run_ai_and_tts():
                     try:
                         stream = await ai.main(user_id, transcript)
                         async for token in stream:
@@ -169,34 +185,46 @@ async def ws_voice(ws: WebSocket):
                             if not await _safe_text(ws, {"type": "llm_token", "token": token}):
                                 break
                             await tts_streamer.add_token(token)
-                    except Exception as e:
-                        print(f"[VOICE] AI error: {e}")
                     finally:
                         await tts_streamer.flush()
-                async def stream_tts_audio():
                     async for chunk in tts_streamer.stream_audio():
                         if not await _safe_bytes(ws, chunk):
                             break
                 await asyncio.gather(run_ai_and_tts(), stream_tts_audio())
-                # Signal end of turn → client resumes VAD
                 await _safe_text(ws, {"type": "end"})
-            # ── Control messages ──────────────────────────────────────────────
             elif "text" in data and data["text"]:
                 try:
                     msg = json.loads(data["text"])
                     if msg.get("type") == "ping":
                         await _safe_text(ws, {"type": "pong"})
                 except json.JSONDecodeError:
                     pass
     except WebSocketDisconnect:
         print("[VOICE] Client disconnected (outer)")
-    except Exception as e:
-        if "disconnect" not in str(e).lower():
-            print(f"[VOICE] WS error: {e}")
     finally:
         print("[VOICE] Handler exiting cleanly.")

+"""
+app.py — FastAPI entry point
+Fixes applied
+─────────────
+1. STT is now fully async (stt.transcribe is a coroutine) — no more
+   asyncio.to_thread wrapper needed in the WS handler.
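+2. BARGE-IN: when the client sends a new audio blob while TTS is still
+   playing, the running tts_streamer is cancelled before starting a new
+   turn.  The client enforces isProcessing so this should be rare, but
+   the server now handles it gracefully.
+   NOTE(review): ws_voice awaits asyncio.gather(...) for the whole turn
+   and resets `_active_streamer` to None before it reads the next
+   message, so the barge-in / "cancel" branches appear unreachable while
+   a turn is still streaming — confirm, and consider running the turn as
+   a background task so the receive loop keeps reading during TTS.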
+3. Per-session cancel token stored in `_active_streamer` so any new
+   utterance from the same WS cleanly aborts the previous one.
+4. All other logic (ping/pong, safe send helpers, chat WS) is unchanged.
+"""
 import asyncio
 import json
     return HTMLResponse("<h2>index.html not found</h2>", status_code=404)
+# ── Helpers ────────────────────────────────────────────────────────────────────
 def _ws_open(ws: WebSocket) -> bool:
     return ws.client_state == WebSocketState.CONNECTED
         return False
+# ── Text chat WebSocket ────────────────────────────────────────────────────────
 @app.websocket("/ws/chat")
 async def ws_chat(ws: WebSocket):
     await ws.accept()
                 async for token in stream:
                     full_response += token
                 await _safe_text(ws, {"type": "chat", "text": full_response})
+            except Exception as exc:
+                print(f"[CHAT] AI error: {exc}")
+                await _safe_text(ws, {"type": "error", "text": str(exc)})
             await _safe_text(ws, {"type": "end"})
     except WebSocketDisconnect:
         print("[CHAT] Client disconnected")
+    except Exception as exc:
+        if "disconnect" not in str(exc).lower():
+            print(f"[CHAT] WS error: {exc}")
+# ── Voice WebSocket ────────────────────────────────────────────────────────────
 @app.websocket("/ws/voice")
 async def ws_voice(ws: WebSocket):
     await ws.accept()
     print("[VOICE] Client connected")
+    stt             = STTProcessor()
+    user_id         = "voice_user"
+    _active_streamer: ParallelTTSStreamer | None = None   # barge-in handle
     try:
         while True:
             if not _ws_open(ws):
                 print("[VOICE] Connection dropped, exiting handler.")
                 break
             except WebSocketDisconnect:
                 print("[VOICE] Client disconnected.")
                 break
+            except Exception as exc:
+                if "disconnect" in str(exc).lower():
                     print("[VOICE] Client disconnected (recv error).")
                 else:
+                    print(f"[VOICE] Receive error: {exc}")
                 break
+            # ── Audio blob from client VAD ──────────────────────────────────
             if "bytes" in data and data["bytes"]:
                 audio_bytes = data["bytes"]
                 print(f"[VOICE] Received utterance: {len(audio_bytes):,} bytes")
+                # ── Barge-in: cancel any running TTS turn ───────────────────
+                if _active_streamer is not None:
+                    print("[VOICE] Barge-in — cancelling previous TTS.")
+                    await _active_streamer.cancel()
+                    _active_streamer = None
+                # 1. STT — now a native coroutine (GPU semaphore inside)
+                transcript = await stt.transcribe(audio_bytes)
                 if not transcript:
                     await _safe_text(ws, {
                         "type": "error",
                         "text": "কথা বুঝতে পারিনি, আবার বলুন।"
                     })
                     await _safe_text(ws, {"type": "end"})
                     continue
                     break
                 # 2. AI + TTS pipeline
+                tts_streamer    = ParallelTTSStreamer()
+                _active_streamer = tts_streamer
+                async def run_ai_and_tts() -> None:
                     try:
                         stream = await ai.main(user_id, transcript)
                         async for token in stream:
                             if not await _safe_text(ws, {"type": "llm_token", "token": token}):
                                 break
                             await tts_streamer.add_token(token)
+                    except Exception as exc:
+                        print(f"[VOICE] AI error: {exc}")
                     finally:
                         await tts_streamer.flush()
+                async def stream_tts_audio() -> None:
                     async for chunk in tts_streamer.stream_audio():
                         if not await _safe_bytes(ws, chunk):
                             break
                 await asyncio.gather(run_ai_and_tts(), stream_tts_audio())
+                _active_streamer = None
+                # Signal end-of-turn → client resumes VAD
                 await _safe_text(ws, {"type": "end"})
+            # ── Control messages ────────────────────────────────────────────
             elif "text" in data and data["text"]:
                 try:
                     msg = json.loads(data["text"])
                     if msg.get("type") == "ping":
                         await _safe_text(ws, {"type": "pong"})
+                    # Client can send {"type":"cancel"} to abort TTS mid-turn
+                    elif msg.get("type") == "cancel":
+                        if _active_streamer is not None:
+                            print("[VOICE] Client cancel signal received.")
+                            await _active_streamer.cancel()
+                            _active_streamer = None
+                        await _safe_text(ws, {"type": "end"})
                 except json.JSONDecodeError:
                     pass
     except WebSocketDisconnect:
         print("[VOICE] Client disconnected (outer)")
+    except Exception as exc:
+        if "disconnect" not in str(exc).lower():
+            print(f"[VOICE] WS error: {exc}")
     finally:
+        if _active_streamer is not None:
+            await _active_streamer.cancel()
         print("[VOICE] Handler exiting cleanly.")

frontend/script.js CHANGED Viewed

@@ -1,3 +1,22 @@
 const chatBox = document.getElementById('chat-box');
 const sendBtn = document.getElementById('send-btn');
 const textInput = document.getElementById('text-input');
@@ -20,14 +39,15 @@ let isListening = false;
 let isSpeaking = false;
 let silenceTimer = null;
 let vadInterval = null;
-let isProcessing = false; // true while server is processing an utterance
 let currentAIMessage = null;
 let playbackChain = Promise.resolve();
 // ── VAD config ────────────────────────────────────────────────────────────────
-const SILENCE_THRESHOLD_DB = -45; // dBFS; lower = more sensitive
-const SILENCE_TIMEOUT_MS = 3000; // 3 s silence → send utterance
 const VAD_POLL_MS = 100;
 // ── Text chat ─────────────────────────────────────────────────────────────────
@@ -44,7 +64,6 @@ function sendTextMessage() {
   textInput.value = '';
 }
-// Chat WS now sends JSON: {"type":"chat","text":"..."} or {"type":"end"}
 chatSocket.onmessage = (e) => {
   let msg;
   try {
@@ -54,7 +73,6 @@ chatSocket.onmessage = (e) => {
   }
   if (msg.type === 'chat' && msg.text) appendMessage(msg.text, 'ai');
   if (msg.type === 'error') appendMessage('⚠️ ' + msg.text, 'system');
-  // 'end' — nothing to do for text chat
 };
 chatSocket.onerror = (e) => console.error('Chat WS error:', e);
 chatSocket.onclose = () => console.log('Chat WS closed');
@@ -83,20 +101,27 @@ voiceSocket.onmessage = (event) => {
   switch (msg.type) {
     case 'stt':
-      // Show transcribed Bangla text as user bubble
       appendMessage('🎤 ' + msg.text, 'user');
       currentAIMessage = null;
       break;
     case 'llm_token':
-      // Stream AI tokens into growing bubble
-      if (!currentAIMessage) currentAIMessage = appendMessage('', 'ai');
-      currentAIMessage.textContent += msg.token;
       chatBox.scrollTop = chatBox.scrollHeight;
       break;
     case 'end':
-      // Server finished this turn → resume VAD listening
       currentAIMessage = null;
       isProcessing = false;
       if (isListening) setMicStatus('listening');
@@ -104,7 +129,6 @@ voiceSocket.onmessage = (event) => {
     case 'error':
       appendMessage('⚠️ ' + msg.text, 'system');
-      // Still need to reset so VAD resumes
       isProcessing = false;
       if (isListening) setMicStatus('listening');
       break;
@@ -117,18 +141,26 @@ voiceSocket.onmessage = (event) => {
   }
 };
-// ── Audio playback: sequential, no overlap ────────────────────────────────────
 function enqueueAudio(buffer) {
   playbackChain = playbackChain.then(() => playBuffer(buffer));
 }
 function playBuffer(buffer) {
   return new Promise((resolve) => {
     const blob = new Blob([buffer], { type: 'audio/mpeg' });
     const url = URL.createObjectURL(blob);
     const audio = new Audio(url);
     const done = () => {
       URL.revokeObjectURL(url);
       resolve();
     };
     audio.onended = done;
@@ -140,6 +172,27 @@ function playBuffer(buffer) {
   });
 }
 // ── Mic button ────────────────────────────────────────────────────────────────
 micBtn.onclick = async () => {
   if (!isListening) await startListening();
@@ -172,7 +225,6 @@ async function startListening() {
   isListening = true;
   setMicStatus('listening');
   vadInterval = setInterval(vadTick, VAD_POLL_MS);
 }
@@ -184,6 +236,8 @@ function stopListening() {
   if (isSpeaking) stopRecorder(true); // discard in-progress utterance
   micStream?.getTracks().forEach((t) => t.stop());
   audioContext?.close();
   micStream = audioContext = analyser = null;
@@ -194,17 +248,24 @@ function stopListening() {
 // ── VAD polling ───────────────────────────────────────────────────────────────
 function vadTick() {
-  if (!analyser || isProcessing) return;
   const data = new Float32Array(analyser.frequencyBinCount);
   analyser.getFloatTimeDomainData(data);
-  // RMS → dBFS
   const rms = Math.sqrt(data.reduce((s, v) => s + v * v, 0) / data.length);
   const db = rms > 0 ? 20 * Math.log10(rms) : -Infinity;
   const speaking = db > SILENCE_THRESHOLD_DB;
   if (speaking) {
     clearTimeout(silenceTimer);
     silenceTimer = null;
@@ -218,8 +279,11 @@ function vadTick() {
       silenceTimer = setTimeout(() => {
         silenceTimer = null;
         isSpeaking = false;
         isProcessing = true;
-        stopRecorder(false); // send the utterance
         setMicStatus('processing');
       }, SILENCE_TIMEOUT_MS);
     }
@@ -236,7 +300,6 @@ function startRecorder() {
     : 'audio/webm';
   mediaRecorder = new MediaRecorder(micStream, { mimeType });
   mediaRecorder.ondataavailable = (e) => {
     if (e.data.size > 0) audioChunks.push(e.data);
   };
@@ -272,7 +335,7 @@ function stopRecorder(discard = false) {
   mediaRecorder = null;
 }
-// ── UI ────────────────────────────────────────────────────────────────────────
 function setMicStatus(state) {
   const labels = {
     off: '🎤 Start Voice',
@@ -287,7 +350,14 @@ function setMicStatus(state) {
 function appendMessage(text, sender) {
   const div = document.createElement('div');
   div.className = `message ${sender}`;
-  div.textContent = text;
   chatBox.appendChild(div);
   chatBox.scrollTop = chatBox.scrollHeight;
   return div;

+/* ─────────────────────────────────────────────────────────────────────────────
+   script.js — Voice + text chat client
+   Fixes applied
+   ─────────────
+   1. DOUBLE-SEND BUG: silenceTimer is now explicitly cleared whenever
+      isProcessing is set to true, so a timer that was already ticking
+      can't fire a second stopRecorder() call.
+   2. TTS INTERRUPT / BARGE-IN: stopAllAudio() cancels the current
+      HTMLAudioElement and sends {"type":"cancel"} to the server so the
+      TTS pipeline also aborts server-side.
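+   3. MARKDOWN RENDERING: AI bubble uses marked.parse() instead of
+      textContent so Bangla markdown (bold, lists, headings) renders
+      correctly in the chat.
+      NOTE(review): the parsed HTML is assigned to innerHTML; marked is
+      not known to sanitize its output, so model text should presumably
+      be passed through a sanitizer (e.g. DOMPurify) first — confirm.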
+   4. VAD barge-in path: if the user starts speaking while TTS is playing
+      the audio stops immediately, isProcessing resets, and the new
+      utterance is captured normally.
+───────────────────────────────────────────────────────────────────────────── */
 const chatBox = document.getElementById('chat-box');
 const sendBtn = document.getElementById('send-btn');
 const textInput = document.getElementById('text-input');
 let isSpeaking = false;
 let silenceTimer = null;
 let vadInterval = null;
+let isProcessing = false; // true while server is processing / TTS playing
 let currentAIMessage = null;
+let currentAudio = null; // the HTMLAudioElement currently playing
 let playbackChain = Promise.resolve();
 // ── VAD config ────────────────────────────────────────────────────────────────
+const SILENCE_THRESHOLD_DB = -45; // dBFS
+const SILENCE_TIMEOUT_MS = 3000; // ms of silence before sending utterance
 const VAD_POLL_MS = 100;
 // ── Text chat ─────────────────────────────────────────────────────────────────
   textInput.value = '';
 }
 chatSocket.onmessage = (e) => {
   let msg;
   try {
   }
   if (msg.type === 'chat' && msg.text) appendMessage(msg.text, 'ai');
   if (msg.type === 'error') appendMessage('⚠️ ' + msg.text, 'system');
 };
 chatSocket.onerror = (e) => console.error('Chat WS error:', e);
 chatSocket.onclose = () => console.log('Chat WS closed');
   switch (msg.type) {
     case 'stt':
       appendMessage('🎤 ' + msg.text, 'user');
       currentAIMessage = null;
       break;
     case 'llm_token':
+      // FIX: accumulate raw tokens in _raw and re-render the markdown on
+      // every token; 'end' performs one last render of the full text.
+      if (!currentAIMessage) {
+        currentAIMessage = appendMessage('', 'ai');
+        currentAIMessage._raw = '';
+      }
+      currentAIMessage._raw += msg.token;
+      // Live preview: render markdown progressively
+      currentAIMessage.innerHTML = marked.parse(currentAIMessage._raw);
       chatBox.scrollTop = chatBox.scrollHeight;
       break;
     case 'end':
+      // Ensure final markdown render
+      if (currentAIMessage && currentAIMessage._raw) {
+        currentAIMessage.innerHTML = marked.parse(currentAIMessage._raw);
+      }
       currentAIMessage = null;
       isProcessing = false;
       if (isListening) setMicStatus('listening');
     case 'error':
       appendMessage('⚠️ ' + msg.text, 'system');
       isProcessing = false;
       if (isListening) setMicStatus('listening');
       break;
   }
 };
+// ── Audio playback ─────────────────────────────────────────────────────────────
 function enqueueAudio(buffer) {
   playbackChain = playbackChain.then(() => playBuffer(buffer));
 }
 function playBuffer(buffer) {
   return new Promise((resolve) => {
+    if (isProcessing === false) {
+      resolve();
+      return;
+    } // cancelled mid-chain
     const blob = new Blob([buffer], { type: 'audio/mpeg' });
     const url = URL.createObjectURL(blob);
     const audio = new Audio(url);
+    currentAudio = audio;
     const done = () => {
       URL.revokeObjectURL(url);
+      currentAudio = null;
       resolve();
     };
     audio.onended = done;
   });
 }
+/**
+ * Stop all queued and current audio immediately.
+ * Also sends a cancel signal to the server so TTS generation stops.
+ */
+function stopAllAudio() {
+  // Replace the chain with an already-resolved promise so queued buffers
+  // that haven't started yet are silently dropped.
+  playbackChain = Promise.resolve();
+  if (currentAudio) {
+    currentAudio.pause();
+    currentAudio.src = '';
+    currentAudio = null;
+  }
+  // Tell server to abort TTS pipeline
+  if (voiceSocket.readyState === WebSocket.OPEN) {
+    voiceSocket.send(JSON.stringify({ type: 'cancel' }));
+  }
+}
 // ── Mic button ────────────────────────────────────────────────────────────────
 micBtn.onclick = async () => {
   if (!isListening) await startListening();
   isListening = true;
   setMicStatus('listening');
   vadInterval = setInterval(vadTick, VAD_POLL_MS);
 }
   if (isSpeaking) stopRecorder(true); // discard in-progress utterance
+  stopAllAudio();
   micStream?.getTracks().forEach((t) => t.stop());
   audioContext?.close();
   micStream = audioContext = analyser = null;
 // ── VAD polling ───────────────────────────────────────────────────────────────
 function vadTick() {
+  if (!analyser) return;
   const data = new Float32Array(analyser.frequencyBinCount);
   analyser.getFloatTimeDomainData(data);
   const rms = Math.sqrt(data.reduce((s, v) => s + v * v, 0) / data.length);
   const db = rms > 0 ? 20 * Math.log10(rms) : -Infinity;
   const speaking = db > SILENCE_THRESHOLD_DB;
   if (speaking) {
+    // FIX: barge-in — user started talking while TTS is playing
+    if (isProcessing) {
+      console.log('[VAD] Barge-in detected — stopping TTS.');
+      stopAllAudio();
+      isProcessing = false;
+    }
+    // FIX: clear any pending silence timer so it can't double-fire
     clearTimeout(silenceTimer);
     silenceTimer = null;
       silenceTimer = setTimeout(() => {
         silenceTimer = null;
         isSpeaking = false;
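+        // FIX: set isProcessing *before* stopping the recorder so the flag
+        // is already set when the recorder's stop handling runs.
+        // NOTE: vadTick no longer returns early on isProcessing; speech
+        // detected while the flag is set is treated as barge-in instead.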
         isProcessing = true;
+        stopRecorder(false);
         setMicStatus('processing');
       }, SILENCE_TIMEOUT_MS);
     }
     : 'audio/webm';
   mediaRecorder = new MediaRecorder(micStream, { mimeType });
   mediaRecorder.ondataavailable = (e) => {
     if (e.data.size > 0) audioChunks.push(e.data);
   };
   mediaRecorder = null;
 }
+// ── UI helpers ────────────────────────────────────────────────────────────────
 function setMicStatus(state) {
   const labels = {
     off: '🎤 Start Voice',
 function appendMessage(text, sender) {
   const div = document.createElement('div');
   div.className = `message ${sender}`;
+  if (sender === 'ai' && typeof marked !== 'undefined') {
+    // FIX: render Bangla markdown (bold, lists, headings) properly
+    div.innerHTML = marked.parse(text);
+  } else {
+    div.textContent = text;
+  }
   chatBox.appendChild(div);
   chatBox.scrollTop = chatBox.scrollHeight;
   return div;

services/streaming.py CHANGED Viewed

@@ -1,217 +1,98 @@
-# import asyncio
-# import edge_tts
-# # ── Voice ─────────────────────────────────────────────────────────────────────
-# VOICE = "bn-BD-NabanitaNeural"
-# # Flush when buffer reaches this many characters (even without punctuation)
-# FLUSH_LEN = 50
-# # Don't send a TTS request for fewer than this many characters
-# MIN_CHARS = 3
-# # Punctuation marks that trigger an immediate flush
-# FLUSH_TRIGGERS = frozenset(".!?।,;:\n—–")
-# class ParallelTTSStreamer:
-#     """
-#     Collects LLM tokens, splits them into prosodic chunks, and converts each
-#     chunk to audio via edge-tts.
-#     FIX: Audio chunks are now guaranteed to arrive IN ORDER by chaining each
-#     TTS task so it only writes to the queue after the previous task finishes.
-#     This prevents audio chunks from chunk-2 overtaking chunk-1 during playback.
-#     """
-#     def __init__(self, voice: str = VOICE):
-#         self.voice   = voice
-#         self.buffer  = ""
-#         self.queue: asyncio.Queue[bytes | None] = asyncio.Queue()
-#         # Tracks the last scheduled task so each new task waits for it first
-#         self._prev_task: asyncio.Task | None = None
-#         self._flush_lock = asyncio.Lock()
-#     async def add_token(self, token: str) -> None:
-#         """Feed a single LLM output token into the streamer."""
-#         if not token:
-#             return
-#         self.buffer += token
-#         should_flush = (
-#             any(ch in FLUSH_TRIGGERS for ch in token)
-#             or len(self.buffer) >= FLUSH_LEN
-#         )
-#         if should_flush:
-#             await self._schedule_flush()
-#     async def _schedule_flush(self) -> None:
-#         """Snapshot the buffer and schedule an ordered TTS task."""
-#         async with self._flush_lock:
-#             text = self.buffer.strip()
-#             self.buffer = ""
-#         if len(text) < MIN_CHARS:
-#             return
-#         # Each task waits for the previous one before pushing to queue,
-#         # guaranteeing in-order audio delivery.
-#         prev = self._prev_task
-#         task = asyncio.create_task(self._tts_ordered(text, prev))
-#         self._prev_task = task
-#     async def _tts_ordered(self, text: str, wait_for: asyncio.Task | None) -> None:
-#         """
-#         1. First synthesise audio bytes (can run in parallel with other chunks).
-#         2. Then wait for the previous chunk to finish writing to queue.
-#         3. Then push our bytes to the queue in order.
-#         """
-#         # Step 1: synthesise concurrently (no queue writes yet)
-#         audio_chunks: list[bytes] = []
-#         try:
-#             communicate = edge_tts.Communicate(text, self.voice)
-#             async for chunk in communicate.stream():
-#                 if chunk["type"] == "audio":
-#                     audio_chunks.append(chunk["data"])
-#         except Exception as e:
-#             print(f"[TTS] edge-tts error for '{text[:30]}…': {e}")
-#             # Still need to chain — wait for prev even on error
-#             if wait_for and not wait_for.done():
-#                 await wait_for
-#             return
-#         # Step 2: wait for the previous chunk to have finished queuing
-#         if wait_for and not wait_for.done():
-#             try:
-#                 await wait_for
-#             except Exception:
-#                 pass
-#         # Step 3: push our audio bytes in order
-#         for data in audio_chunks:
-#             await self.queue.put(data)
-#     async def flush(self) -> None:
-#         """
-#         Flush remaining buffer, wait for all in-flight TTS tasks, then
-#         signal end-of-stream with sentinel None.
-#         """
-#         await self._schedule_flush()
-#         # Wait for the last chained task (which transitively waits for all)
-#         if self._prev_task:
-#             try:
-#                 await self._prev_task
-#             except Exception:
-#                 pass
-#         await self.queue.put(None)
-#     async def stream_audio(self):
-#         """
-#         Async generator that yields audio bytes in order.
-#         Stops when the sentinel None is received.
-#         """
-#         while True:
-#             chunk = await self.queue.get()
-#             if chunk is None:
-#                 break
-#             yield chunk
-import re
 import asyncio
 import edge_tts
-VOICE = "bn-BD-NabanitaNeural"
-FLUSH_LEN     = 80        # chars before forced flush (longer = more natural speech)
-MIN_CHARS     = 5         # don't TTS tiny fragments
 FLUSH_TRIGGERS = frozenset(".!?।,;:\n—–")
 def _clean_for_tts(text: str) -> str:
-    """
-    Strip markdown and other non-speech symbols before sending to edge-tts.
-    The LLM outputs markdown (**, *, #, -, numbers+dot lists) which edge-tts
-    either reads aloud awkwardly ("asterisk asterisk") or returns 'No audio
-    was received' on punctuation-only chunks like '**' or '-)'.
-    """
-    # Remove bold/italic markers
-    text = re.sub(r'\*{1,3}', '', text)
-    # Remove heading markers
-    text = re.sub(r'#+\s*', '', text)
-    # Remove list markers like "১.", "1.", "-", "•"
-    text = re.sub(r'^\s*[-•]\s*', '', text, flags=re.MULTILINE)
-    text = re.sub(r'^\s*[\d০-৯]+[.)]\s*', '', text, flags=re.MULTILINE)
-    # Remove leftover backticks
-    text = re.sub(r'`+', '', text)
-    # Collapse extra whitespace / blank lines
-    text = re.sub(r'\n{2,}', '\n', text)
-    text = text.strip()
-    return text
 class ParallelTTSStreamer:
     """
-    Collects LLM tokens → splits into prosodic chunks → converts to audio
-    via edge-tts in parallel → streams audio bytes IN ORDER.
-    Audio ordering is guaranteed by a task chain: each chunk task synthesises
-    audio freely (parallel) but only writes to the queue after the previous
-    chunk finishes, so the client always hears chunk-1 before chunk-2.
     """
-    def __init__(self, voice: str = VOICE):
-        self.voice      = voice
-        self.buffer     = ""
         self.queue: asyncio.Queue[bytes | None] = asyncio.Queue()
-        self._prev_task: asyncio.Task | None = None
         self._flush_lock = asyncio.Lock()
     async def add_token(self, token: str) -> None:
-        if not token:
             return
-        self.buffer += token
-        if any(ch in FLUSH_TRIGGERS for ch in token) or len(self.buffer) >= FLUSH_LEN:
             await self._schedule_flush()
     async def _schedule_flush(self) -> None:
         async with self._flush_lock:
-            raw  = self.buffer.strip()
-            self.buffer = ""
         text = _clean_for_tts(raw)
         if len(text) < MIN_CHARS:
@@ -220,41 +101,70 @@ class ParallelTTSStreamer:
         prev = self._prev_task
         task = asyncio.create_task(self._tts_ordered(text, prev))
         self._prev_task = task
     async def _tts_ordered(self, text: str, wait_for: asyncio.Task | None) -> None:
-        """Synthesise audio (parallel), then write to queue in order."""
-        # Step 1 — synthesise concurrently
         audio_chunks: list[bytes] = []
-        try:
-            communicate = edge_tts.Communicate(text, self.voice)
-            async for chunk in communicate.stream():
-                if chunk["type"] == "audio":
-                    audio_chunks.append(chunk["data"])
-        except Exception as e:
-            print(f"[TTS] edge-tts error for '{text[:40]}': {e}")
-        # Step 2 — wait for previous chunk to finish queuing
         if wait_for and not wait_for.done():
             try:
                 await wait_for
             except Exception:
                 pass
-        # Step 3 — write to queue in order
-        for data in audio_chunks:
-            await self.queue.put(data)
     async def flush(self) -> None:
-        """Flush remaining buffer, await all tasks, send end sentinel."""
         await self._schedule_flush()
         if self._prev_task:
             try:
                 await self._prev_task
             except Exception:
                 pass
-        await self.queue.put(None)
     async def stream_audio(self):
         while True:
             chunk = await self.queue.get()
             if chunk is None:

+"""
+services/streaming.py — Parallel + ordered TTS streamer
+Fixes applied
+─────────────
+1. BUFFER RACE — self.buffer is now only mutated while holding
+   self._flush_lock, so add_token() and _schedule_flush() can never
+   interleave partial writes.
+2. CANCELLATION — ParallelTTSStreamer.cancel() drops all pending tasks
+   and poisons the queue with a sentinel so stream_audio() exits
+   immediately.  app.py calls cancel() when the user starts speaking
+   mid-playback, giving true barge-in / interrupt behaviour.
+3. Markdown stripping (_clean_for_tts) is unchanged.
+4. Audio ordering guarantee is unchanged (task-chain pattern).
+"""
+from __future__ import annotations
 import asyncio
+import re
 import edge_tts
+VOICE          = "bn-BD-NabanitaNeural"
+FLUSH_LEN      = 80          # chars before forced flush
+MIN_CHARS      = 5           # skip tiny fragments
 FLUSH_TRIGGERS = frozenset(".!?।,;:\n—–")
+# ── Markdown → plain text ──────────────────────────────────────────────────────
 def _clean_for_tts(text: str) -> str:
+    text = re.sub(r"\*{1,3}", "", text)
+    text = re.sub(r"#+\s*", "", text)
+    text = re.sub(r"^\s*[-•]\s*", "", text, flags=re.MULTILINE)
+    text = re.sub(r"^\s*[\d০-৯]+[.)]\s*", "", text, flags=re.MULTILINE)
+    text = re.sub(r"`+", "", text)
+    text = re.sub(r"\n{2,}", "\n", text)
+    return text.strip()
+# ── Streamer ───────────────────────────────────────────────────────────────────
 class ParallelTTSStreamer:
     """
+    Collects LLM tokens → prosodic chunks → parallel edge-tts calls →
+    serialised audio queue.
+    Usage
+    ─────
+        streamer = ParallelTTSStreamer()
+        # producer
+        await streamer.add_token(token)
+        await streamer.flush()          # call once when LLM finishes
+        # consumer (run concurrently with producer)
+        async for chunk in streamer.stream_audio():
+            await ws.send_bytes(chunk)
+        # interrupt (call from any coroutine)
+        await streamer.cancel()
     """
+    def __init__(self, voice: str = VOICE) -> None:
+        """Create an idle streamer that synthesises speech with ``voice``."""
+        self.voice       = voice
+        # Text received through add_token() that has not been flushed yet.
+        self.buffer      = ""
+        # Audio bytes for the consumer; ``None`` is the end-of-stream sentinel.
         self.queue: asyncio.Queue[bytes | None] = asyncio.Queue()
+        # Most recently scheduled TTS task; the next task waits on it before queuing.
+        self._prev_task: asyncio.Task | None    = None
+        # Held while the text buffer is read or modified.
         self._flush_lock = asyncio.Lock()
+        # Set by cancel(); checked by the other methods to stop doing work.
+        self._cancelled  = False
+        self._tasks: list[asyncio.Task] = []    # track all live tasks
+    # ── Token intake ───────────────────────────────────────────────────────────
     async def add_token(self, token: str) -> None:
         """
         Append one LLM token to the pending text and flush when a chunk is ready.

         Empty tokens, and tokens arriving after cancellation, are ignored.
         A flush is scheduled when the token contains a FLUSH_TRIGGERS
         character or the buffer has reached FLUSH_LEN characters.
         """
         if self._cancelled or not token:
             return

         # The buffer is only modified while the flush lock is held.
         async with self._flush_lock:
             self.buffer = self.buffer + token
             hit_trigger = not FLUSH_TRIGGERS.isdisjoint(token)
             chunk_ready = hit_trigger or len(self.buffer) >= FLUSH_LEN

         if not chunk_ready:
             return
         await self._schedule_flush()
+    # ── Flush scheduling ───────────────────────────────────────────────────────
     async def _schedule_flush(self) -> None:
+        if self._cancelled:
+            return
         async with self._flush_lock:
+            raw          = self.buffer.strip()
+            self.buffer  = ""
         text = _clean_for_tts(raw)
         if len(text) < MIN_CHARS:
         prev = self._prev_task
         task = asyncio.create_task(self._tts_ordered(text, prev))
         self._prev_task = task
+        self._tasks.append(task)
+        task.add_done_callback(lambda t: self._tasks.remove(t) if t in self._tasks else None)
+    # ── Ordered TTS task ───────────────────────────────────────────────────────
     async def _tts_ordered(self, text: str, wait_for: asyncio.Task | None) -> None:
+        """
+        Synthesise ``text`` with edge-tts, then enqueue its audio once ``wait_for`` is done.
+        Synthesis starts immediately; only the writes to ``self.queue`` are
+        held back until the predecessor task has finished, which keeps the
+        audio in chunk order.  Synthesis errors are printed and swallowed, so
+        a failed chunk contributes whatever audio was collected before the
+        error (possibly none).  Nothing is enqueued once the streamer has
+        been cancelled.
+        """
+        # Step 1 — synthesise (may run in parallel with other chunks)
         audio_chunks: list[bytes] = []
+        if not self._cancelled:
+            try:
+                communicate = edge_tts.Communicate(text, self.voice)
+                async for chunk in communicate.stream():
+                    if self._cancelled:
+                        break
+                    if chunk["type"] == "audio":
+                        audio_chunks.append(chunk["data"])
+            except Exception as exc:
+                print(f"[TTS] edge-tts error for '{text[:40]}': {exc}")
+        # Step 2 — wait for predecessor to finish queuing (preserves order)
         if wait_for and not wait_for.done():
             try:
                 await wait_for
             except Exception:
+                # A failed predecessor must not stop this chunk from being queued.
                 pass
+        # Step 3 — write to queue (skipped if cancelled)
+        if not self._cancelled:
+            for data in audio_chunks:
+                await self.queue.put(data)
+    # ── Flush remaining buffer ─────────────────────────────────────────────────
     async def flush(self) -> None:
         """
         Flush the remaining buffered text and close the audio stream.

         Call once after the LLM stream ends.  Schedules a final chunk from
         whatever is left in the buffer, waits for the last TTS task in the
         chain, then enqueues the ``None`` end-of-stream sentinel.

         If the streamer is cancelled before or while this runs, the
         cancelled TTS task does not raise ``CancelledError`` into the caller
         and no second sentinel is queued (``cancel()`` has already queued
         one).  Cancelling the coroutine that awaits ``flush()`` itself still
         propagates normally.
         """
         await self._schedule_flush()
         last = self._prev_task
         if last is not None:
             # CancelledError is a BaseException, so ``except Exception`` would
             # not catch it.  gather(..., return_exceptions=True) returns the
             # task's failure or cancellation as a value instead of raising it.
             await asyncio.gather(last, return_exceptions=True)
         if not self._cancelled:
             await self.queue.put(None)          # end-of-stream sentinel
+    # ── Interrupt / barge-in ───────────────────────────────────────────────────
+    async def cancel(self) -> None:
+        """
+        Immediately abort all in-flight TTS tasks and unblock stream_audio().
+        Safe to call from any coroutine while stream_audio() is running.
+        Sets the cancelled flag, requests cancellation of every tracked task
+        (without awaiting them), discards audio that is already queued, and
+        enqueues a single ``None`` sentinel.
+        """
+        self._cancelled = True
+        # Cancel all pending asyncio tasks
+        for task in list(self._tasks):
+            task.cancel()
+        # Drain and poison the queue so stream_audio() exits
+        while not self.queue.empty():
+            try:
+                self.queue.get_nowait()
+            except asyncio.QueueEmpty:
+                break
+        await self.queue.put(None)          # sentinel → stream_audio exits
+    # ── Audio consumer ─────────────────────────────────────────────────────────
     async def stream_audio(self):
+        """Async generator — yields ordered audio bytes until cancelled/done."""
         while True:
             chunk = await self.queue.get()
             if chunk is None:

services/stt.py CHANGED Viewed

@@ -1,58 +1,104 @@
 import os
 import re
 import subprocess
 import tempfile
 from faster_whisper import WhisperModel
-model = WhisperModel("large-v3", device="cuda", compute_type="int8_float32")
-BANGLA_PATTERN = re.compile(r'[\u0980-\u09FF]')
-# Scripts we consider "wrong" — Arabic, Urdu, Devanagari (when expecting Bangla)
 WRONG_SCRIPT_PATTERN = re.compile(
-    r'[\u0600-\u06FF'   # Arabic / Urdu
-    r'\u0750-\u077F'   # Arabic Supplement
-    r'\uFB50-\uFDFF'   # Arabic Presentation Forms
-    r'\uFE70-\uFEFF]'  # Arabic Presentation Forms-B
 )
-def _is_valid_bangla(text: str) -> bool:
-    """
-    Return True if the transcript looks like real Bangla.
-    A valid transcript must:
-    1. Contain at least one Bangla Unicode character, OR be very short
-       (some valid responses are single digits/punctuation).
-    2. NOT be dominated by Arabic/Urdu script (Whisper wrong-script error).
     """
-    bangla_chars  = len(BANGLA_PATTERN.findall(text))
-    wrong_chars   = len(WRONG_SCRIPT_PATTERN.findall(text))
-    total_alpha   = sum(1 for c in text if c.isalpha())
     if total_alpha == 0:
-        return True   # digits/punctuation only — let it through
-    # If more than 30% of alphabetic chars are Arabic/Urdu script, reject
-    if total_alpha > 0 and (wrong_chars / total_alpha) > 0.30:
         return False
-    # Must have at least some Bangla characters for long responses
-    if total_alpha > 5 and bangla_chars == 0:
         return False
     return True
 class STTProcessor:
     MIN_INPUT_BYTES = 3_000
-    def _to_wav(self, audio_bytes: bytes) -> str | None:
-        """Convert browser WebM/opus to 16 kHz mono WAV with loudness normalization."""
         in_path = out_path = None
         try:
             with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as f:
@@ -61,13 +107,17 @@ class STTProcessor:
             with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                 out_path = f.name
-            result = subprocess.run([
-                "ffmpeg", "-y", "-loglevel", "warning",
-                "-i", in_path,
-                "-ar", "16000", "-ac", "1",
-                "-af", "loudnorm",
-                "-f", "wav", out_path,
-            ], stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
             if result.returncode != 0:
                 print("[STT] ffmpeg error:", result.stderr.decode(errors="replace").strip())
@@ -78,71 +128,90 @@ class STTProcessor:
             print(f"[STT] WAV ready: {os.path.getsize(out_path):,} bytes")
             return out_path
-        except Exception as e:
-            print(f"[STT] _to_wav: {e}")
             return None
         finally:
             if in_path and os.path.exists(in_path):
-                try: os.remove(in_path)
-                except OSError: pass
-    def transcribe(self, audio_bytes: bytes) -> str | None:
         if len(audio_bytes) < self.MIN_INPUT_BYTES:
             print(f"[STT] Too short ({len(audio_bytes)} B), skipping.")
             return None
-        wav_path = None
-        try:
-            wav_path = self._to_wav(audio_bytes)
-            if not wav_path:
-                return None
-            # segments, info = model.transcribe(wav_path, language="bn", task="translate", beam_size=5)
-            segments, info = model.transcribe(
-                wav_path,
-                language="bn",
-                beam_size=5,
-                vad_filter=False,               # loudnorm handles quiet audio
-                condition_on_previous_text=False,
-                temperature=0,
-                suppress_tokens=[-1],
-                no_speech_threshold=0.5,
-                log_prob_threshold=-1.0,
-                # task="translate"
-                # NO initial_prompt — causes hallucination loops on base model
-            )
-            text = " ".join(seg.text.strip() for seg in segments).strip()
-            print(f"[STT] Lang={info.language} prob={info.language_probability:.2f}")
-            if not text:
-                print("[STT] Empty transcript.")
-                return None
-            # ── Hallucination guard: repeated words ───────────────────────────
-            words = text.split()
-            if len(words) > 5 and (len(set(words)) / len(words)) < 0.25:
-                print(f"[STT] Hallucination (repetition) discarded: {text[:60]}")
-                return None
-            # ── Script validation: must be Bangla Unicode ─────────────────────
-            if not _is_valid_bangla(text):
-                print(f"[STT] Wrong script (Arabic/Urdu output from base model) "
-                      f"discarded: {text[:60]}")
-                print("[STT] ⚠ If this keeps happening, ensure you're using "
-                      "model='small' not 'base'.")
-                return None
-            print(f"[STT] Transcript: {text}")
-            return text
-        except Exception as e:
-            print(f"[STT] transcribe: {e}")
             import traceback; traceback.print_exc()
             return None
         finally:
-            if wav_path and os.path.exists(wav_path):
-                try: os.remove(wav_path)
-                except OSError: pass

+"""
+services/stt.py — GPU-safe Faster-Whisper STT processor
+Fixes applied
+─────────────
+1. LAZY model initialisation — WhisperModel is loaded once on first use,
+   not at import time, so FastAPI starts instantly.
+2. CUDA semaphore (max 1) — only one transcription runs on the GPU at a
+   time.  Concurrent requests queue here instead of racing on the CUDA
+   context, which caused OOM and silent hangs on RTX 3060 (12 GB).
+3. ffmpeg and the model call each run in a worker thread (two separate
+   asyncio.to_thread calls), keeping the async event loop free.
+4. Hallucination guards and Bangla script validation are unchanged.
+"""
+from __future__ import annotations
+import asyncio
 import os
 import re
 import subprocess
 import tempfile
+from threading import Lock
 from faster_whisper import WhisperModel
+# ── Bangla / wrong-script patterns ────────────────────────────────────────────
+BANGLA_PATTERN = re.compile(r"[\u0980-\u09FF]")
 WRONG_SCRIPT_PATTERN = re.compile(
+    r"[\u0600-\u06FF"   # Arabic / Urdu
+    r"\u0750-\u077F"    # Arabic Supplement
+    r"\uFB50-\uFDFF"    # Arabic Presentation Forms
+    r"\uFE70-\uFEFF]"   # Arabic Presentation Forms-B
 )
+# ── Lazy singleton ─────────────────────────────────────────────────────────────
+_model: WhisperModel | None = None
+_model_lock = Lock()          # protects the one-time initialisation
+# Semaphore lives in the event-loop thread; created on first async use.
+_gpu_semaphore: asyncio.Semaphore | None = None
+def _get_model() -> WhisperModel:
+    """
+    Load WhisperModel on first call, return the cached instance thereafter.
+    Thread-safe via a threading.Lock (called from worker threads).
+    The first check of ``_model`` is made without the lock so that calls
+    after initialisation return immediately; the second check, made under
+    the lock, stops two threads that both saw ``None`` from each loading
+    the model.
+    """
+    global _model
+    if _model is None:
+        with _model_lock:
+            if _model is None:          # double-checked locking
+                print("[STT] Loading Faster-Whisper large-v3 on CUDA …")
+                _model = WhisperModel(
+                    "large-v3",
+                    device="cuda",
+                    compute_type="int8_float32",
+                )
+                print("[STT] Model ready.")
+    return _model
+def _get_semaphore() -> asyncio.Semaphore:
+    """
+    Return the shared GPU semaphore, creating it with a limit of 1 on first call.
+    The semaphore is a single module-level instance reused by every caller;
+    it is not created per event loop.
+    NOTE(review): creation is not guarded by a lock — presumably this is only
+    ever called from the event-loop thread; confirm against callers.
     """
+    global _gpu_semaphore
+    if _gpu_semaphore is None:
+        _gpu_semaphore = asyncio.Semaphore(1)
+    return _gpu_semaphore
+# ── Script validation ──────────────────────────────────────────────────────────
+def _is_valid_bangla(text: str) -> bool:
+    """
+    Return True if ``text`` passes the script checks for a Bangla transcript.
+    Text with no alphabetic characters is accepted.  Text is rejected when
+    more than 30 % of its alphabetic characters match WRONG_SCRIPT_PATTERN,
+    or when it has more than 5 alphabetic characters and none of them match
+    BANGLA_PATTERN.
+    """
+    bangla_chars = len(BANGLA_PATTERN.findall(text))
+    wrong_chars  = len(WRONG_SCRIPT_PATTERN.findall(text))
+    total_alpha  = sum(1 for c in text if c.isalpha())
     if total_alpha == 0:
+        return True                         # digits / punctuation — allow
+    if (wrong_chars / total_alpha) > 0.30:  # >30 % Arabic/Urdu → reject
         return False
+    if total_alpha > 5 and bangla_chars == 0:   # long but zero Bangla → reject
         return False
     return True
+# ── Core processor ─────────────────────────────────────────────────────────────
 class STTProcessor:
     MIN_INPUT_BYTES = 3_000
+    # ── ffmpeg helper ──────────────────────────────────────────────────────────
+    @staticmethod
+    def _to_wav(audio_bytes: bytes) -> str | None:
+        """
+        Convert browser WebM/Opus blob → 16 kHz mono WAV with loudnorm.
+        Runs in a worker thread (called via asyncio.to_thread).
+        """
         in_path = out_path = None
         try:
             with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as f:
             with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                 out_path = f.name
+            result = subprocess.run(
+                [
+                    "ffmpeg", "-y", "-loglevel", "warning",
+                    "-i", in_path,
+                    "-ar", "16000", "-ac", "1",
+                    "-af", "loudnorm",
+                    "-f", "wav", out_path,
+                ],
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.PIPE,
+            )
             if result.returncode != 0:
                 print("[STT] ffmpeg error:", result.stderr.decode(errors="replace").strip())
             print(f"[STT] WAV ready: {os.path.getsize(out_path):,} bytes")
             return out_path
+        except Exception as exc:
+            print(f"[STT] _to_wav: {exc}")
             return None
         finally:
             if in_path and os.path.exists(in_path):
+                try:
+                    os.remove(in_path)
+                except OSError:
+                    pass
+    # ── Synchronous transcription (runs in worker thread) ─────────────────────
+    @staticmethod
+    def _transcribe_sync(wav_path: str) -> str | None:
+        """
+        Whisper inference.  Called inside asyncio.to_thread so it never
+        blocks the event loop.  The GPU semaphore is acquired *before*
+        this function is dispatched, so only one call executes at a time.
+        Returns the stripped text of all segments joined by single spaces;
+        this is an empty string when no segment text is produced.
+        """
+        model = _get_model()
+        segments, info = model.transcribe(
+            wav_path,
+            language="bn",
+            beam_size=5,
+            vad_filter=False,
+            condition_on_previous_text=False,
+            temperature=0,
+            suppress_tokens=[-1],
+            no_speech_threshold=0.5,
+            log_prob_threshold=-1.0,
+        )
+        text = " ".join(seg.text.strip() for seg in segments).strip()
+        print(f"[STT] Lang={info.language} prob={info.language_probability:.2f}")
+        return text
+    # ── Public async entry-point ───────────────────────────────────────────────
+    async def transcribe(self, audio_bytes: bytes) -> str | None:
+        """
+        Full pipeline: validate → ffmpeg → GPU inference.
+        Awaitable from the async WS handler.  GPU access is serialised
+        via an asyncio.Semaphore so concurrent sessions queue here
+        instead of crashing the CUDA context.
+        Returns the transcript, or None when the input is shorter than
+        MIN_INPUT_BYTES, conversion or inference fails, the transcript is
+        empty, or it is rejected by the repetition or script checks.
+        The temporary WAV file is removed whether or not inference succeeds.
+        """
         if len(audio_bytes) < self.MIN_INPUT_BYTES:
             print(f"[STT] Too short ({len(audio_bytes)} B), skipping.")
             return None
+        # ffmpeg conversion (CPU-bound, off event loop)
+        wav_path = await asyncio.to_thread(self._to_wav, audio_bytes)
+        if not wav_path:
+            return None
+        sem = _get_semaphore()
+        try:
+            async with sem:                         # serialise GPU access
+                text = await asyncio.to_thread(self._transcribe_sync, wav_path)
+        except Exception as exc:
+            print(f"[STT] transcribe error: {exc}")
             import traceback; traceback.print_exc()
             return None
         finally:
+            if os.path.exists(wav_path):
+                try:
+                    os.remove(wav_path)
+                except OSError:
+                    pass
+        if not text:
+            print("[STT] Empty transcript.")
+            return None
+        # ── Hallucination guard ────────────────────────────────────────────────
+        # Reject long transcripts in which fewer than 25 % of the words are distinct.
+        words = text.split()
+        if len(words) > 5 and (len(set(words)) / len(words)) < 0.25:
+            print(f"[STT] Hallucination (repetition) discarded: {text[:60]}")
+            return None
+        # ── Script validation ──────────────────────────────────────────────────
+        if not _is_valid_bangla(text):
+            print(f"[STT] Wrong script discarded: {text[:60]}")
+            return None
+        print(f"[STT] Transcript: {text}")
+        return text