Commit ·
2d19124
1
Parent(s): 357f10b
Enhance text processing for TTS: strip emotion tags and improve whitespace handling; add full text recovery for LLM responses.
Browse files- app.py +6 -0
- frontend/script.js +19 -0
- services/streaming.py +59 -4
- services/tts.py +13 -5
app.py
CHANGED
|
@@ -369,11 +369,13 @@ async def ws_voice(ws: WebSocket):
|
|
| 369 |
audio_seq = 0
|
| 370 |
|
| 371 |
async def run_llm():
|
|
|
|
| 372 |
try:
|
| 373 |
stream = await ai.main(user_id, transcript)
|
| 374 |
async for token in stream:
|
| 375 |
if not token:
|
| 376 |
continue
|
|
|
|
| 377 |
if not await _safe_text(ws, {"type": "llm_token", "token": token}):
|
| 378 |
break
|
| 379 |
await tts_streamer.add_token(token)
|
|
@@ -382,6 +384,10 @@ async def ws_voice(ws: WebSocket):
|
|
| 382 |
except Exception as exc:
|
| 383 |
print(f"[VOICE] LLM error: {exc}")
|
| 384 |
finally:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 385 |
await tts_streamer.flush()
|
| 386 |
|
| 387 |
async def run_tts_framed():
|
|
|
|
| 369 |
audio_seq = 0
|
| 370 |
|
| 371 |
async def run_llm():
|
| 372 |
+
full_text = ""
|
| 373 |
try:
|
| 374 |
stream = await ai.main(user_id, transcript)
|
| 375 |
async for token in stream:
|
| 376 |
if not token:
|
| 377 |
continue
|
| 378 |
+
full_text += token
|
| 379 |
if not await _safe_text(ws, {"type": "llm_token", "token": token}):
|
| 380 |
break
|
| 381 |
await tts_streamer.add_token(token)
|
|
|
|
| 384 |
except Exception as exc:
|
| 385 |
print(f"[VOICE] LLM error: {exc}")
|
| 386 |
finally:
|
| 387 |
+
# Best-effort: send the full text once at the end so the UI can
|
| 388 |
+
# recover if it missed any streamed tokens.
|
| 389 |
+
if full_text:
|
| 390 |
+
await _safe_text(ws, {"type": "llm_full", "text": full_text})
|
| 391 |
await tts_streamer.flush()
|
| 392 |
|
| 393 |
async def run_tts_framed():
|
frontend/script.js
CHANGED
|
@@ -387,6 +387,25 @@ function onVoiceMsg(ev) {
|
|
| 387 |
}
|
| 388 |
break;
|
| 389 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 390 |
case 'end':
|
| 391 |
_renderAiText(true);
|
| 392 |
_removeThinking();
|
|
|
|
| 387 |
}
|
| 388 |
break;
|
| 389 |
|
| 390 |
+
case 'llm_full':
|
| 391 |
+
if (!msg.text) break;
|
| 392 |
+
// Best-effort recovery path: if any streamed tokens were dropped, the
|
| 393 |
+
// server sends the final full text once at turn end.
|
| 394 |
+
brainLastResponse = msg.text;
|
| 395 |
+
_brainSetTtsBubble(brainLastResponse);
|
| 396 |
+
if (!brainMode) {
|
| 397 |
+
if (!aiEl) {
|
| 398 |
+
aiEl = document.createElement('div');
|
| 399 |
+
aiEl.className = 'message ai';
|
| 400 |
+
chatBox.appendChild(aiEl);
|
| 401 |
+
}
|
| 402 |
+
aiTxt = msg.text;
|
| 403 |
+
_renderAiText();
|
| 404 |
+
} else {
|
| 405 |
+
aiTxt = msg.text;
|
| 406 |
+
}
|
| 407 |
+
break;
|
| 408 |
+
|
| 409 |
case 'end':
|
| 410 |
_renderAiText(true);
|
| 411 |
_removeThinking();
|
services/streaming.py
CHANGED
|
@@ -63,13 +63,53 @@ _SENTINEL = object()
|
|
| 63 |
|
| 64 |
|
| 65 |
def _clean_for_tts(text: str) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
text = re.sub(r"\*{1,3}", "", text)
|
| 67 |
text = re.sub(r"#+\s*", "", text)
|
| 68 |
text = re.sub(r"^\s*[-•]\s*", "", text, flags=re.MULTILINE)
|
| 69 |
text = re.sub(r"^\s*[\d০-৯]+[.)]\s*", "", text, flags=re.MULTILINE)
|
| 70 |
text = re.sub(r"`+", "", text)
|
| 71 |
text = re.sub(r"\n{2,}", "\n", text)
|
| 72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
|
| 74 |
|
| 75 |
def _should_flush(buffer: str, first_chunk: bool) -> bool:
|
|
@@ -117,6 +157,7 @@ class ParallelTTSStreamer:
|
|
| 117 |
self.buffer = ""
|
| 118 |
self._cancelled = False
|
| 119 |
self._first_chunk = True
|
|
|
|
| 120 |
self._slot_index = 0
|
| 121 |
self._slots: list[_AudioSlot] = []
|
| 122 |
self._slots_lock = asyncio.Lock()
|
|
@@ -132,9 +173,16 @@ class ParallelTTSStreamer:
|
|
| 132 |
loop = asyncio.get_running_loop()
|
| 133 |
now = loop.time()
|
| 134 |
self._last_token_t = now
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
self.buffer += token
|
| 136 |
-
|
|
|
|
|
|
|
| 137 |
self._first_chunk = False
|
|
|
|
| 138 |
await self._schedule_chunk()
|
| 139 |
self._last_flush_t = now
|
| 140 |
return
|
|
@@ -145,6 +193,8 @@ class ParallelTTSStreamer:
|
|
| 145 |
flush_min = FIRST_FLUSH_MIN if self._first_chunk else SUBSEQUENT_FLUSH_MIN
|
| 146 |
if len(self.buffer) >= flush_min and (now - self._last_flush_t) >= 0.8:
|
| 147 |
self._first_chunk = False
|
|
|
|
|
|
|
| 148 |
await self._schedule_chunk()
|
| 149 |
self._last_flush_t = now
|
| 150 |
|
|
@@ -152,8 +202,12 @@ class ParallelTTSStreamer:
|
|
| 152 |
if self._cancelled:
|
| 153 |
self.buffer = ""
|
| 154 |
return
|
| 155 |
-
|
| 156 |
-
self.buffer = ""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
if len(text) < MIN_CHARS:
|
| 158 |
return
|
| 159 |
async with self._slots_lock:
|
|
@@ -254,6 +308,7 @@ class ParallelTTSStreamer:
|
|
| 254 |
def reset(self) -> None:
|
| 255 |
self._cancelled = False
|
| 256 |
self._first_chunk = True
|
|
|
|
| 257 |
self.buffer = ""
|
| 258 |
self._slot_index = 0
|
| 259 |
self._slots.clear()
|
|
|
|
| 63 |
|
| 64 |
|
| 65 |
def _clean_for_tts(text: str) -> str:
|
| 66 |
+
# Strip emotion/tone tags like "[calm]" "[neutral]" "[happy]" etc.
|
| 67 |
+
# These are useful for UI but often degrade or break TTS synthesis.
|
| 68 |
+
# Remove them wherever they appear, then normalize whitespace.
|
| 69 |
+
text = re.sub(r"(?:(?<=^)|(?<=\s))\[[^\[\]\n]{1,24}\](?=\s|$)", "", text)
|
| 70 |
+
# Also strip orphaned tag fragments that can occur if the streamer flushes
|
| 71 |
+
# mid-tag during token streaming (e.g. "[neutral" or "neutral]").
|
| 72 |
+
text = re.sub(r"(?:(?<=^)|(?<=\s))\[[A-Za-z]{2,16}(?=\s|$)", "", text)
|
| 73 |
+
text = re.sub(r"(?:(?<=^)|(?<=\s))[A-Za-z]{2,16}\](?=\s|$)", "", text)
|
| 74 |
text = re.sub(r"\*{1,3}", "", text)
|
| 75 |
text = re.sub(r"#+\s*", "", text)
|
| 76 |
text = re.sub(r"^\s*[-•]\s*", "", text, flags=re.MULTILINE)
|
| 77 |
text = re.sub(r"^\s*[\d০-৯]+[.)]\s*", "", text, flags=re.MULTILINE)
|
| 78 |
text = re.sub(r"`+", "", text)
|
| 79 |
text = re.sub(r"\n{2,}", "\n", text)
|
| 80 |
+
# Collapse runs of spaces introduced by tag removal.
|
| 81 |
+
text = re.sub(r"[ \t]{2,}", " ", text)
|
| 82 |
+
# Keep normal spaces so chunk boundaries don't glue words together.
|
| 83 |
+
return text.strip("\n\r\t")
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def _flush_reason(buffer: str, first_chunk: bool) -> str | None:
|
| 87 |
+
"""
|
| 88 |
+
Like _should_flush, but returns the reason so we can preserve spacing
|
| 89 |
+
when flushing at a word boundary.
|
| 90 |
+
"""
|
| 91 |
+
n = len(buffer)
|
| 92 |
+
if n == 0:
|
| 93 |
+
return None
|
| 94 |
+
|
| 95 |
+
flush_min = FIRST_FLUSH_MIN if first_chunk else SUBSEQUENT_FLUSH_MIN
|
| 96 |
+
hard_limit = FIRST_FLUSH_HARD if first_chunk else SUBSEQUENT_FLUSH_HARD
|
| 97 |
+
|
| 98 |
+
if n >= hard_limit:
|
| 99 |
+
return "hard"
|
| 100 |
+
|
| 101 |
+
last_char = buffer[-1]
|
| 102 |
+
|
| 103 |
+
if last_char in SENTENCE_BOUNDARIES and n >= flush_min:
|
| 104 |
+
return "sentence"
|
| 105 |
+
|
| 106 |
+
if last_char in CLAUSE_BOUNDARIES and n >= hard_limit * 0.70:
|
| 107 |
+
return "clause"
|
| 108 |
+
|
| 109 |
+
if last_char == " " and n >= flush_min:
|
| 110 |
+
return "space"
|
| 111 |
+
|
| 112 |
+
return None
|
| 113 |
|
| 114 |
|
| 115 |
def _should_flush(buffer: str, first_chunk: bool) -> bool:
|
|
|
|
| 157 |
self.buffer = ""
|
| 158 |
self._cancelled = False
|
| 159 |
self._first_chunk = True
|
| 160 |
+
self._carry_space = False
|
| 161 |
self._slot_index = 0
|
| 162 |
self._slots: list[_AudioSlot] = []
|
| 163 |
self._slots_lock = asyncio.Lock()
|
|
|
|
| 173 |
loop = asyncio.get_running_loop()
|
| 174 |
now = loop.time()
|
| 175 |
self._last_token_t = now
|
| 176 |
+
# If we flushed at a word boundary previously, preserve a single
|
| 177 |
+
# inter-word space so Bengali/English words don't get glued together.
|
| 178 |
+
if self.buffer == " " and token[:1].isspace():
|
| 179 |
+
token = token.lstrip()
|
| 180 |
self.buffer += token
|
| 181 |
+
|
| 182 |
+
reason = _flush_reason(self.buffer, self._first_chunk)
|
| 183 |
+
if reason is not None:
|
| 184 |
self._first_chunk = False
|
| 185 |
+
self._carry_space = (reason == "space")
|
| 186 |
await self._schedule_chunk()
|
| 187 |
self._last_flush_t = now
|
| 188 |
return
|
|
|
|
| 193 |
flush_min = FIRST_FLUSH_MIN if self._first_chunk else SUBSEQUENT_FLUSH_MIN
|
| 194 |
if len(self.buffer) >= flush_min and (now - self._last_flush_t) >= 0.8:
|
| 195 |
self._first_chunk = False
|
| 196 |
+
# Time-based flush: don't force a carry space.
|
| 197 |
+
self._carry_space = False
|
| 198 |
await self._schedule_chunk()
|
| 199 |
self._last_flush_t = now
|
| 200 |
|
|
|
|
| 202 |
if self._cancelled:
|
| 203 |
self.buffer = ""
|
| 204 |
return
|
| 205 |
+
raw = self.buffer
|
| 206 |
+
self.buffer = " " if self._carry_space else ""
|
| 207 |
+
self._carry_space = False
|
| 208 |
+
# IMPORTANT: don't lose an inter-word space when the flush happened
|
| 209 |
+
# exactly at a word boundary (buffer ended with " ").
|
| 210 |
+
text = _clean_for_tts(raw)
|
| 211 |
if len(text) < MIN_CHARS:
|
| 212 |
return
|
| 213 |
async with self._slots_lock:
|
|
|
|
| 308 |
def reset(self) -> None:
|
| 309 |
self._cancelled = False
|
| 310 |
self._first_chunk = True
|
| 311 |
+
self._carry_space = False
|
| 312 |
self.buffer = ""
|
| 313 |
self._slot_index = 0
|
| 314 |
self._slots.clear()
|
services/tts.py
CHANGED
|
@@ -83,7 +83,12 @@ def split_sentences(text: str) -> list[str]:
|
|
| 83 |
TTS task is small (a phrase, not a full sentence). This allows synthesis
|
| 84 |
to start sooner for later parts of a long response.
|
| 85 |
"""
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
if not text:
|
| 88 |
return []
|
| 89 |
# Split on sentence-ending punctuation AND clause delimiters
|
|
@@ -100,7 +105,7 @@ async def _edge_tts_stream(text: str, voice: str = EDGE_VOICE, rate: str = "+22%
|
|
| 100 |
"""
|
| 101 |
if edge_tts is None:
|
| 102 |
raise RuntimeError("edge_tts is not installed")
|
| 103 |
-
text = text.strip()
|
| 104 |
if not text:
|
| 105 |
return
|
| 106 |
try:
|
|
@@ -125,13 +130,13 @@ async def _elevenlabs_stream(
|
|
| 125 |
speaker_boost: bool = ELEVENLABS_SPEAKER_BOOST,
|
| 126 |
):
|
| 127 |
import httpx
|
| 128 |
-
text = text.strip()
|
| 129 |
if not text:
|
| 130 |
return
|
| 131 |
# Reduce unnatural pauses for short streamed chunks.
|
| 132 |
# ElevenLabs adds strong pauses on sentence-ending punctuation; for
|
| 133 |
# low-latency streaming we prefer faster turn-taking.
|
| 134 |
-
text = re.sub(r"[।.!?,;:—–]+$", "", text).strip()
|
| 135 |
url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream"
|
| 136 |
headers = {
|
| 137 |
"xi-api-key": ELEVENLABS_API_KEY,
|
|
@@ -189,7 +194,10 @@ async def text_to_speech_stream(
|
|
| 189 |
synthesised. The phrases are kept intentionally small by
|
| 190 |
services/streaming.py, so latency remains low.
|
| 191 |
"""
|
| 192 |
-
|
|
|
|
|
|
|
|
|
|
| 193 |
if not text:
|
| 194 |
return
|
| 195 |
|
|
|
|
| 83 |
TTS task is small (a phrase, not a full sentence). This allows synthesis
|
| 84 |
to start sooner for later parts of a long response.
|
| 85 |
"""
|
| 86 |
+
# Strip any emotion/tone tags like "[calm]" "[neutral]" etc. These are
|
| 87 |
+
# intended for UI display and can degrade/break some TTS backends.
|
| 88 |
+
text = re.sub(r"(?:(?<=^)|(?<=\s))\[[^\[\]\n]{1,24}\](?=\s|$)", "", text)
|
| 89 |
+
text = re.sub(r"(?:(?<=^)|(?<=\s))\[[A-Za-z]{2,16}(?=\s|$)", "", text)
|
| 90 |
+
text = re.sub(r"(?:(?<=^)|(?<=\s))[A-Za-z]{2,16}\](?=\s|$)", "", text)
|
| 91 |
+
text = re.sub(r"[ \t]{2,}", " ", text).strip("\n\r\t")
|
| 92 |
if not text:
|
| 93 |
return []
|
| 94 |
# Split on sentence-ending punctuation AND clause delimiters
|
|
|
|
| 105 |
"""
|
| 106 |
if edge_tts is None:
|
| 107 |
raise RuntimeError("edge_tts is not installed")
|
| 108 |
+
text = text.strip("\n\r\t")
|
| 109 |
if not text:
|
| 110 |
return
|
| 111 |
try:
|
|
|
|
| 130 |
speaker_boost: bool = ELEVENLABS_SPEAKER_BOOST,
|
| 131 |
):
|
| 132 |
import httpx
|
| 133 |
+
text = text.strip("\n\r\t")
|
| 134 |
if not text:
|
| 135 |
return
|
| 136 |
# Reduce unnatural pauses for short streamed chunks.
|
| 137 |
# ElevenLabs adds strong pauses on sentence-ending punctuation; for
|
| 138 |
# low-latency streaming we prefer faster turn-taking.
|
| 139 |
+
text = re.sub(r"[।.!?,;:—–]+$", "", text).strip("\n\r\t")
|
| 140 |
url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream"
|
| 141 |
headers = {
|
| 142 |
"xi-api-key": ELEVENLABS_API_KEY,
|
|
|
|
| 194 |
synthesised. The phrases are kept intentionally small by
|
| 195 |
services/streaming.py, so latency remains low.
|
| 196 |
"""
|
| 197 |
+
# Preserve normal spaces inside/around streamed phrase chunks; don't
|
| 198 |
+
# aggressively trim because it can glue words across chunk boundaries
|
| 199 |
+
# (e.g. "দিয়ে" + "আপনার" → "দিয়েআপনার").
|
| 200 |
+
text = text.strip("\n\r\t")
|
| 201 |
if not text:
|
| 202 |
return
|
| 203 |
|