Enhance text processing for TTS: strip emotion tags and improve whitespace handling; add full text recovery for LLM responses.

Browse files

Files changed (4) hide show

app.py +6 -0
frontend/script.js +19 -0
services/streaming.py +59 -4
services/tts.py +13 -5

app.py CHANGED Viewed

@@ -369,11 +369,13 @@ async def ws_voice(ws: WebSocket):
         audio_seq        = 0
         async def run_llm():
             try:
                 stream = await ai.main(user_id, transcript)
                 async for token in stream:
                     if not token:
                         continue
                     if not await _safe_text(ws, {"type": "llm_token", "token": token}):
                         break
                     await tts_streamer.add_token(token)
@@ -382,6 +384,10 @@ async def ws_voice(ws: WebSocket):
             except Exception as exc:
                 print(f"[VOICE] LLM error: {exc}")
             finally:
                 await tts_streamer.flush()
         async def run_tts_framed():

         audio_seq        = 0
         async def run_llm():
+            full_text = ""
             try:
                 stream = await ai.main(user_id, transcript)
                 async for token in stream:
                     if not token:
                         continue
+                    full_text += token
                     if not await _safe_text(ws, {"type": "llm_token", "token": token}):
                         break
                     await tts_streamer.add_token(token)
             except Exception as exc:
                 print(f"[VOICE] LLM error: {exc}")
             finally:
+                # Best-effort: send the full text once at the end so the UI can
+                # recover if it missed any streamed tokens.
+                if full_text:
+                    await _safe_text(ws, {"type": "llm_full", "text": full_text})
                 await tts_streamer.flush()
         async def run_tts_framed():

frontend/script.js CHANGED Viewed

@@ -387,6 +387,25 @@ function onVoiceMsg(ev) {
       }
       break;
     case 'end':
       _renderAiText(true);
       _removeThinking();

       }
       break;
+    case 'llm_full':
+      if (!msg.text) break;
+      // Best-effort recovery path: if any streamed tokens were dropped, the
+      // server sends the final full text once at turn end.
+      brainLastResponse = msg.text;
+      _brainSetTtsBubble(brainLastResponse);
+      if (!brainMode) {
+        if (!aiEl) {
+          aiEl = document.createElement('div');
+          aiEl.className = 'message ai';
+          chatBox.appendChild(aiEl);
+        }
+        aiTxt = msg.text;
+        _renderAiText();
+      } else {
+        aiTxt = msg.text;
+      }
+      break;
     case 'end':
       _renderAiText(true);
       _removeThinking();

services/streaming.py CHANGED Viewed

@@ -63,13 +63,53 @@ _SENTINEL           = object()
 def _clean_for_tts(text: str) -> str:
     text = re.sub(r"\*{1,3}", "", text)
     text = re.sub(r"#+\s*", "", text)
     text = re.sub(r"^\s*[-•]\s*", "", text, flags=re.MULTILINE)
     text = re.sub(r"^\s*[\d০-৯]+[.)]\s*", "", text, flags=re.MULTILINE)
     text = re.sub(r"`+", "", text)
     text = re.sub(r"\n{2,}", "\n", text)
-    return text.strip()
 def _should_flush(buffer: str, first_chunk: bool) -> bool:
@@ -117,6 +157,7 @@ class ParallelTTSStreamer:
         self.buffer       = ""
         self._cancelled   = False
         self._first_chunk = True
         self._slot_index  = 0
         self._slots: list[_AudioSlot] = []
         self._slots_lock  = asyncio.Lock()
@@ -132,9 +173,16 @@ class ParallelTTSStreamer:
         loop = asyncio.get_running_loop()
         now  = loop.time()
         self._last_token_t = now
         self.buffer += token
-        if _should_flush(self.buffer, self._first_chunk):
             self._first_chunk = False
             await self._schedule_chunk()
             self._last_flush_t = now
             return
@@ -145,6 +193,8 @@ class ParallelTTSStreamer:
         flush_min = FIRST_FLUSH_MIN if self._first_chunk else SUBSEQUENT_FLUSH_MIN
         if len(self.buffer) >= flush_min and (now - self._last_flush_t) >= 0.8:
             self._first_chunk = False
             await self._schedule_chunk()
             self._last_flush_t = now
@@ -152,8 +202,12 @@ class ParallelTTSStreamer:
         if self._cancelled:
             self.buffer = ""
             return
-        text        = _clean_for_tts(self.buffer.strip())
-        self.buffer = ""
         if len(text) < MIN_CHARS:
             return
         async with self._slots_lock:
@@ -254,6 +308,7 @@ class ParallelTTSStreamer:
     def reset(self) -> None:
         self._cancelled   = False
         self._first_chunk = True
         self.buffer       = ""
         self._slot_index  = 0
         self._slots.clear()

 def _clean_for_tts(text: str) -> str:  # returns `text` with emotion tags and formatting markers (asterisks, "#", bullets, list numbers, backticks) removed
+    # Strip emotion/tone tags like "[calm]" "[neutral]" "[happy]" etc.
+    # These are useful for UI but often degrade or break TTS synthesis.
+    # Only tags bounded by whitespace or the text start/end are removed; whitespace is normalized further down.
+    text = re.sub(r"(?:(?<=^)|(?<=\s))\[[^\[\]\n]{1,24}\](?=\s|$)", "", text)
+    # Also strip orphaned tag fragments that can occur if the streamer flushes
+    # mid-tag during token streaming (e.g. "[neutral" or "neutral]").
+    text = re.sub(r"(?:(?<=^)|(?<=\s))\[[A-Za-z]{2,16}(?=\s|$)", "", text)
+    text = re.sub(r"(?:(?<=^)|(?<=\s))[A-Za-z]{2,16}\](?=\s|$)", "", text)
     text = re.sub(r"\*{1,3}", "", text)  # remove asterisks (runs of 1-3 at a time)
     text = re.sub(r"#+\s*", "", text)  # remove runs of "#" and the whitespace after them
     text = re.sub(r"^\s*[-•]\s*", "", text, flags=re.MULTILINE)  # remove a leading "-" or "•" bullet on each line
     text = re.sub(r"^\s*[\d০-৯]+[.)]\s*", "", text, flags=re.MULTILINE)  # remove leading list numbers ("1.", "2)", Bengali digits too) on each line
     text = re.sub(r"`+", "", text)  # remove backticks
     text = re.sub(r"\n{2,}", "\n", text)  # collapse consecutive newlines into one
+    # Collapse runs of spaces introduced by tag removal.
+    text = re.sub(r"[ \t]{2,}", " ", text)
+    # Keep normal spaces so chunk boundaries don't glue words together.
+    return text.strip("\n\r\t")
+def _flush_reason(buffer: str, first_chunk: bool) -> str | None:  # returns "hard", "sentence", "clause" or "space"; None means "do not flush yet"
+    """
+    Like _should_flush, but returns the reason so we can preserve spacing
+    when flushing at a word boundary.
+    """
+    n = len(buffer)
+    if n == 0:
+        return None  # empty buffer: nothing to flush
+    flush_min  = FIRST_FLUSH_MIN  if first_chunk else SUBSEQUENT_FLUSH_MIN  # minimum length before a boundary flush
+    hard_limit = FIRST_FLUSH_HARD if first_chunk else SUBSEQUENT_FLUSH_HARD  # length at which a flush happens regardless of the last character
+    if n >= hard_limit:
+        return "hard"
+    last_char = buffer[-1]
+    if last_char in SENTENCE_BOUNDARIES and n >= flush_min:  # ends on a sentence boundary and is long enough
+        return "sentence"
+    if last_char in CLAUSE_BOUNDARIES and n >= hard_limit * 0.70:  # clause boundaries need at least 70% of the hard limit
+        return "clause"
+    if last_char == " " and n >= flush_min:  # ends on a space (word boundary) and is long enough
+        return "space"
+    return None  # no flush condition met; keep buffering
 def _should_flush(buffer: str, first_chunk: bool) -> bool:
         self.buffer       = ""
         self._cancelled   = False
         self._first_chunk = True
+        self._carry_space = False
         self._slot_index  = 0
         self._slots: list[_AudioSlot] = []
         self._slots_lock  = asyncio.Lock()
         loop = asyncio.get_running_loop()
         now  = loop.time()
         self._last_token_t = now
+        # If the previous flush carried over a single inter-word space (so
+        # Bengali/English words don't get glued together), drop the new
+        # token's leading whitespace to avoid doubling that space.
+        if self.buffer == " " and token[:1].isspace():
+            token = token.lstrip()
         self.buffer += token
+        reason = _flush_reason(self.buffer, self._first_chunk)
+        if reason is not None:
             self._first_chunk = False
+            self._carry_space = (reason == "space")
             await self._schedule_chunk()
             self._last_flush_t = now
             return
         flush_min = FIRST_FLUSH_MIN if self._first_chunk else SUBSEQUENT_FLUSH_MIN
         if len(self.buffer) >= flush_min and (now - self._last_flush_t) >= 0.8:
             self._first_chunk = False
+            # Time-based flush: don't force a carry space.
+            self._carry_space = False
             await self._schedule_chunk()
             self._last_flush_t = now
         if self._cancelled:
             self.buffer = ""
             return
+        raw = self.buffer
+        self.buffer = " " if self._carry_space else ""
+        self._carry_space = False
+        # IMPORTANT: don't lose an inter-word space when the flush happened
+        # exactly at a word boundary (buffer ended with " ").
+        text = _clean_for_tts(raw)
         if len(text) < MIN_CHARS:
             return
         async with self._slots_lock:
     def reset(self) -> None:
         self._cancelled   = False
         self._first_chunk = True
+        self._carry_space = False
         self.buffer       = ""
         self._slot_index  = 0
         self._slots.clear()

services/tts.py CHANGED Viewed

@@ -83,7 +83,12 @@ def split_sentences(text: str) -> list[str]:
     TTS task is small (a phrase, not a full sentence). This allows synthesis
     to start sooner for later parts of a long response.
     """
-    text = text.strip()
     if not text:
         return []
     # Split on sentence-ending punctuation AND clause delimiters
@@ -100,7 +105,7 @@ async def _edge_tts_stream(text: str, voice: str = EDGE_VOICE, rate: str = "+22%
     """
     if edge_tts is None:
         raise RuntimeError("edge_tts is not installed")
-    text = text.strip()
     if not text:
         return
     try:
@@ -125,13 +130,13 @@ async def _elevenlabs_stream(
     speaker_boost: bool = ELEVENLABS_SPEAKER_BOOST,
 ):
     import httpx
-    text = text.strip()
     if not text:
         return
     # Reduce unnatural pauses for short streamed chunks.
     # ElevenLabs adds strong pauses on sentence-ending punctuation; for
     # low-latency streaming we prefer faster turn-taking.
-    text = re.sub(r"[।.!?,;:—–]+$", "", text).strip()
     url     = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream"
     headers = {
         "xi-api-key":   ELEVENLABS_API_KEY,
@@ -189,7 +194,10 @@ async def text_to_speech_stream(
       synthesised. The phrases are kept intentionally small by
       services/streaming.py, so latency remains low.
     """
-    text = text.strip()
     if not text:
         return

     TTS task is small (a phrase, not a full sentence). This allows synthesis
     to start sooner for later parts of a long response.
     """
+    # Strip whitespace-delimited emotion/tone tags like "[calm]" "[neutral]"
+    # etc. These are intended for UI display and can degrade/break some TTS
+    # backends.
+    text = re.sub(r"(?:(?<=^)|(?<=\s))\[[^\[\]\n]{1,24}\](?=\s|$)", "", text)
+    text = re.sub(r"(?:(?<=^)|(?<=\s))\[[A-Za-z]{2,16}(?=\s|$)", "", text)
+    text = re.sub(r"(?:(?<=^)|(?<=\s))[A-Za-z]{2,16}\](?=\s|$)", "", text)
+    text = re.sub(r"[ \t]{2,}", " ", text).strip("\n\r\t")
     if not text:
         return []
     # Split on sentence-ending punctuation AND clause delimiters
     """
     if edge_tts is None:
         raise RuntimeError("edge_tts is not installed")
+    text = text.strip("\n\r\t")
     if not text:
         return
     try:
     speaker_boost: bool = ELEVENLABS_SPEAKER_BOOST,
 ):
     import httpx
+    text = text.strip("\n\r\t")
     if not text:
         return
     # Reduce unnatural pauses for short streamed chunks.
     # ElevenLabs adds strong pauses on sentence-ending punctuation; for
     # low-latency streaming we prefer faster turn-taking.
+    text = re.sub(r"[।.!?,;:—–]+$", "", text).strip("\n\r\t")
     url     = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream"
     headers = {
         "xi-api-key":   ELEVENLABS_API_KEY,
       synthesised. The phrases are kept intentionally small by
       services/streaming.py, so latency remains low.
     """
+    # Preserve normal spaces inside/around streamed phrase chunks; don't
+    # aggressively trim because it can glue words across chunk boundaries
+    # (e.g. "দিয়ে" + "আপনার" → "দিয়েআপনার").
+    text = text.strip("\n\r\t")
     if not text:
         return