rakib72642 commited on
Commit
2d19124
·
1 Parent(s): 357f10b

Enhance text processing for TTS: strip emotion tags and improve whitespace handling; add full text recovery for LLM responses.

Browse files
Files changed (4) hide show
  1. app.py +6 -0
  2. frontend/script.js +19 -0
  3. services/streaming.py +59 -4
  4. services/tts.py +13 -5
app.py CHANGED
@@ -369,11 +369,13 @@ async def ws_voice(ws: WebSocket):
369
  audio_seq = 0
370
 
371
  async def run_llm():
 
372
  try:
373
  stream = await ai.main(user_id, transcript)
374
  async for token in stream:
375
  if not token:
376
  continue
 
377
  if not await _safe_text(ws, {"type": "llm_token", "token": token}):
378
  break
379
  await tts_streamer.add_token(token)
@@ -382,6 +384,10 @@ async def ws_voice(ws: WebSocket):
382
  except Exception as exc:
383
  print(f"[VOICE] LLM error: {exc}")
384
  finally:
 
 
 
 
385
  await tts_streamer.flush()
386
 
387
  async def run_tts_framed():
 
369
  audio_seq = 0
370
 
371
  async def run_llm():
372
+ full_text = ""
373
  try:
374
  stream = await ai.main(user_id, transcript)
375
  async for token in stream:
376
  if not token:
377
  continue
378
+ full_text += token
379
  if not await _safe_text(ws, {"type": "llm_token", "token": token}):
380
  break
381
  await tts_streamer.add_token(token)
 
384
  except Exception as exc:
385
  print(f"[VOICE] LLM error: {exc}")
386
  finally:
387
+ # Best-effort: send the full text once at the end so the UI can
388
+ # recover if it missed any streamed tokens.
389
+ if full_text:
390
+ await _safe_text(ws, {"type": "llm_full", "text": full_text})
391
  await tts_streamer.flush()
392
 
393
  async def run_tts_framed():
frontend/script.js CHANGED
@@ -387,6 +387,25 @@ function onVoiceMsg(ev) {
387
  }
388
  break;
389
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
390
  case 'end':
391
  _renderAiText(true);
392
  _removeThinking();
 
387
  }
388
  break;
389
 
390
+ case 'llm_full':
391
+ if (!msg.text) break;
392
+ // Best-effort recovery path: if any streamed tokens were dropped, the
393
+ // server sends the final full text once at turn end.
394
+ brainLastResponse = msg.text;
395
+ _brainSetTtsBubble(brainLastResponse);
396
+ if (!brainMode) {
397
+ if (!aiEl) {
398
+ aiEl = document.createElement('div');
399
+ aiEl.className = 'message ai';
400
+ chatBox.appendChild(aiEl);
401
+ }
402
+ aiTxt = msg.text;
403
+ _renderAiText();
404
+ } else {
405
+ aiTxt = msg.text;
406
+ }
407
+ break;
408
+
409
  case 'end':
410
  _renderAiText(true);
411
  _removeThinking();
services/streaming.py CHANGED
@@ -63,13 +63,53 @@ _SENTINEL = object()
63
 
64
 
65
  def _clean_for_tts(text: str) -> str:
 
 
 
 
 
 
 
 
66
  text = re.sub(r"\*{1,3}", "", text)
67
  text = re.sub(r"#+\s*", "", text)
68
  text = re.sub(r"^\s*[-•]\s*", "", text, flags=re.MULTILINE)
69
  text = re.sub(r"^\s*[\d০-৯]+[.)]\s*", "", text, flags=re.MULTILINE)
70
  text = re.sub(r"`+", "", text)
71
  text = re.sub(r"\n{2,}", "\n", text)
72
- return text.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
 
75
  def _should_flush(buffer: str, first_chunk: bool) -> bool:
@@ -117,6 +157,7 @@ class ParallelTTSStreamer:
117
  self.buffer = ""
118
  self._cancelled = False
119
  self._first_chunk = True
 
120
  self._slot_index = 0
121
  self._slots: list[_AudioSlot] = []
122
  self._slots_lock = asyncio.Lock()
@@ -132,9 +173,16 @@ class ParallelTTSStreamer:
132
  loop = asyncio.get_running_loop()
133
  now = loop.time()
134
  self._last_token_t = now
 
 
 
 
135
  self.buffer += token
136
- if _should_flush(self.buffer, self._first_chunk):
 
 
137
  self._first_chunk = False
 
138
  await self._schedule_chunk()
139
  self._last_flush_t = now
140
  return
@@ -145,6 +193,8 @@ class ParallelTTSStreamer:
145
  flush_min = FIRST_FLUSH_MIN if self._first_chunk else SUBSEQUENT_FLUSH_MIN
146
  if len(self.buffer) >= flush_min and (now - self._last_flush_t) >= 0.8:
147
  self._first_chunk = False
 
 
148
  await self._schedule_chunk()
149
  self._last_flush_t = now
150
 
@@ -152,8 +202,12 @@ class ParallelTTSStreamer:
152
  if self._cancelled:
153
  self.buffer = ""
154
  return
155
- text = _clean_for_tts(self.buffer.strip())
156
- self.buffer = ""
 
 
 
 
157
  if len(text) < MIN_CHARS:
158
  return
159
  async with self._slots_lock:
@@ -254,6 +308,7 @@ class ParallelTTSStreamer:
254
  def reset(self) -> None:
255
  self._cancelled = False
256
  self._first_chunk = True
 
257
  self.buffer = ""
258
  self._slot_index = 0
259
  self._slots.clear()
 
63
 
64
 
65
  def _clean_for_tts(text: str) -> str:
66
+ # Strip emotion/tone tags like "[calm]" "[neutral]" "[happy]" etc.
67
+ # These are useful for UI but often degrade or break TTS synthesis.
68
+ # Remove them wherever they appear, then normalize whitespace.
69
+ text = re.sub(r"(?:(?<=^)|(?<=\s))\[[^\[\]\n]{1,24}\](?=\s|$)", "", text)
70
+ # Also strip orphaned tag fragments that can occur if the streamer flushes
71
+ # mid-tag during token streaming (e.g. "[neutral" or "neutral]").
72
+ text = re.sub(r"(?:(?<=^)|(?<=\s))\[[A-Za-z]{2,16}(?=\s|$)", "", text)
73
+ text = re.sub(r"(?:(?<=^)|(?<=\s))[A-Za-z]{2,16}\](?=\s|$)", "", text)
74
  text = re.sub(r"\*{1,3}", "", text)
75
  text = re.sub(r"#+\s*", "", text)
76
  text = re.sub(r"^\s*[-•]\s*", "", text, flags=re.MULTILINE)
77
  text = re.sub(r"^\s*[\d০-৯]+[.)]\s*", "", text, flags=re.MULTILINE)
78
  text = re.sub(r"`+", "", text)
79
  text = re.sub(r"\n{2,}", "\n", text)
80
+ # Collapse runs of spaces introduced by tag removal.
81
+ text = re.sub(r"[ \t]{2,}", " ", text)
82
+ # Keep normal spaces so chunk boundaries don't glue words together.
83
+ return text.strip("\n\r\t")
84
+
85
+
86
+ def _flush_reason(buffer: str, first_chunk: bool) -> str | None:
87
+ """
88
+ Like _should_flush, but returns the reason so we can preserve spacing
89
+ when flushing at a word boundary.
90
+ """
91
+ n = len(buffer)
92
+ if n == 0:
93
+ return None
94
+
95
+ flush_min = FIRST_FLUSH_MIN if first_chunk else SUBSEQUENT_FLUSH_MIN
96
+ hard_limit = FIRST_FLUSH_HARD if first_chunk else SUBSEQUENT_FLUSH_HARD
97
+
98
+ if n >= hard_limit:
99
+ return "hard"
100
+
101
+ last_char = buffer[-1]
102
+
103
+ if last_char in SENTENCE_BOUNDARIES and n >= flush_min:
104
+ return "sentence"
105
+
106
+ if last_char in CLAUSE_BOUNDARIES and n >= hard_limit * 0.70:
107
+ return "clause"
108
+
109
+ if last_char == " " and n >= flush_min:
110
+ return "space"
111
+
112
+ return None
113
 
114
 
115
  def _should_flush(buffer: str, first_chunk: bool) -> bool:
 
157
  self.buffer = ""
158
  self._cancelled = False
159
  self._first_chunk = True
160
+ self._carry_space = False
161
  self._slot_index = 0
162
  self._slots: list[_AudioSlot] = []
163
  self._slots_lock = asyncio.Lock()
 
173
  loop = asyncio.get_running_loop()
174
  now = loop.time()
175
  self._last_token_t = now
176
+ # If we flushed at a word boundary previously, preserve a single
177
+ # inter-word space so Bengali/English words don't get glued together.
178
+ if self.buffer == " " and token[:1].isspace():
179
+ token = token.lstrip()
180
  self.buffer += token
181
+
182
+ reason = _flush_reason(self.buffer, self._first_chunk)
183
+ if reason is not None:
184
  self._first_chunk = False
185
+ self._carry_space = (reason == "space")
186
  await self._schedule_chunk()
187
  self._last_flush_t = now
188
  return
 
193
  flush_min = FIRST_FLUSH_MIN if self._first_chunk else SUBSEQUENT_FLUSH_MIN
194
  if len(self.buffer) >= flush_min and (now - self._last_flush_t) >= 0.8:
195
  self._first_chunk = False
196
+ # Time-based flush: don't force a carry space.
197
+ self._carry_space = False
198
  await self._schedule_chunk()
199
  self._last_flush_t = now
200
 
 
202
  if self._cancelled:
203
  self.buffer = ""
204
  return
205
+ raw = self.buffer
206
+ self.buffer = " " if self._carry_space else ""
207
+ self._carry_space = False
208
+ # IMPORTANT: don't lose an inter-word space when the flush happened
209
+ # exactly at a word boundary (buffer ended with " ").
210
+ text = _clean_for_tts(raw)
211
  if len(text) < MIN_CHARS:
212
  return
213
  async with self._slots_lock:
 
308
  def reset(self) -> None:
309
  self._cancelled = False
310
  self._first_chunk = True
311
+ self._carry_space = False
312
  self.buffer = ""
313
  self._slot_index = 0
314
  self._slots.clear()
services/tts.py CHANGED
@@ -83,7 +83,12 @@ def split_sentences(text: str) -> list[str]:
83
  TTS task is small (a phrase, not a full sentence). This allows synthesis
84
  to start sooner for later parts of a long response.
85
  """
86
- text = text.strip()
 
 
 
 
 
87
  if not text:
88
  return []
89
  # Split on sentence-ending punctuation AND clause delimiters
@@ -100,7 +105,7 @@ async def _edge_tts_stream(text: str, voice: str = EDGE_VOICE, rate: str = "+22%
100
  """
101
  if edge_tts is None:
102
  raise RuntimeError("edge_tts is not installed")
103
- text = text.strip()
104
  if not text:
105
  return
106
  try:
@@ -125,13 +130,13 @@ async def _elevenlabs_stream(
125
  speaker_boost: bool = ELEVENLABS_SPEAKER_BOOST,
126
  ):
127
  import httpx
128
- text = text.strip()
129
  if not text:
130
  return
131
  # Reduce unnatural pauses for short streamed chunks.
132
  # ElevenLabs adds strong pauses on sentence-ending punctuation; for
133
  # low-latency streaming we prefer faster turn-taking.
134
- text = re.sub(r"[।.!?,;:—–]+$", "", text).strip()
135
  url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream"
136
  headers = {
137
  "xi-api-key": ELEVENLABS_API_KEY,
@@ -189,7 +194,10 @@ async def text_to_speech_stream(
189
  synthesised. The phrases are kept intentionally small by
190
  services/streaming.py, so latency remains low.
191
  """
192
- text = text.strip()
 
 
 
193
  if not text:
194
  return
195
 
 
83
  TTS task is small (a phrase, not a full sentence). This allows synthesis
84
  to start sooner for later parts of a long response.
85
  """
86
+ # Strip any emotion/tone tags like "[calm]" "[neutral]" etc. These are
87
+ # intended for UI display and can degrade/break some TTS backends.
88
+ text = re.sub(r"(?:(?<=^)|(?<=\s))\[[^\[\]\n]{1,24}\](?=\s|$)", "", text)
89
+ text = re.sub(r"(?:(?<=^)|(?<=\s))\[[A-Za-z]{2,16}(?=\s|$)", "", text)
90
+ text = re.sub(r"(?:(?<=^)|(?<=\s))[A-Za-z]{2,16}\](?=\s|$)", "", text)
91
+ text = re.sub(r"[ \t]{2,}", " ", text).strip("\n\r\t")
92
  if not text:
93
  return []
94
  # Split on sentence-ending punctuation AND clause delimiters
 
105
  """
106
  if edge_tts is None:
107
  raise RuntimeError("edge_tts is not installed")
108
+ text = text.strip("\n\r\t")
109
  if not text:
110
  return
111
  try:
 
130
  speaker_boost: bool = ELEVENLABS_SPEAKER_BOOST,
131
  ):
132
  import httpx
133
+ text = text.strip("\n\r\t")
134
  if not text:
135
  return
136
  # Reduce unnatural pauses for short streamed chunks.
137
  # ElevenLabs adds strong pauses on sentence-ending punctuation; for
138
  # low-latency streaming we prefer faster turn-taking.
139
+ text = re.sub(r"[।.!?,;:—–]+$", "", text).strip("\n\r\t")
140
  url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream"
141
  headers = {
142
  "xi-api-key": ELEVENLABS_API_KEY,
 
194
  synthesised. The phrases are kept intentionally small by
195
  services/streaming.py, so latency remains low.
196
  """
197
+ # Preserve normal spaces inside/around streamed phrase chunks; don't
198
+ # aggressively trim because it can glue words across chunk boundaries
199
+ # (e.g. "দিয়ে" + "আপনার" → "দিয়েআপনার").
200
+ text = text.strip("\n\r\t")
201
  if not text:
202
  return
203