adjusted mobile number problem + numbers problem fixed

Browse files

Files changed (4) hide show

app.py +83 -6
core/backend.py +65 -6
frontend/script.js +87 -16
services/streaming.py +56 -0

app.py CHANGED Viewed

@@ -26,6 +26,7 @@ LLM+TTS) preserved.
 import asyncio
 import json
 import os
 import struct
 import uuid
 from contextlib import asynccontextmanager
@@ -188,6 +189,70 @@ async def rtc_close(session_id: str):
 #  WEBSOCKET HELPERS
 # ══════════════════════════════════════════════════════════════════════════════
 def _normalize_ai_text(text: str) -> str:
     """
     Apply small UX wording normalizations to assistant-visible text.
@@ -198,6 +263,7 @@ def _normalize_ai_text(text: str) -> str:
     out = text
     out = out.replace("উপলব্ধ", "এভেলেবেল")
     out = out.replace("জ্বি", "আচ্ছা")
     return out
@@ -283,6 +349,7 @@ async def ws_chat(ws: WebSocket):
                 full_text = ""
                 async for token in stream:
                     if token:
                         full_text += token
                         await _safe_text(ws, {"type": "llm_token", "token": token})
                 # Ensure the final rendered message uses normalized wording.
@@ -348,6 +415,7 @@ async def ws_voice(ws: WebSocket):
     # by the client UI (e.g., brain-mode welcome).
     _utterance_q: asyncio.Queue[object | None] = asyncio.Queue()
     _worker_task: asyncio.Task | None = None
     async def _cancel_active():
         nonlocal _active_streamer, _active_task
@@ -382,13 +450,17 @@ async def ws_voice(ws: WebSocket):
             await _safe_text(ws, {"type": "end"})
             return
         tts_streamer = ParallelTTSStreamer()
         _active_streamer = tts_streamer
         audio_seq = 0
         async def run_text():
             try:
-                await _safe_text(ws, {"type": "llm_full", "text": speak_text})
                 await tts_streamer.add_token(speak_text)
             except asyncio.CancelledError:
                 raise
@@ -398,7 +470,7 @@ async def ws_voice(ws: WebSocket):
         async def run_tts_framed():
             nonlocal audio_seq
             async for chunk in tts_streamer.stream_audio():
-                framed = struct.pack(">I", audio_seq) + chunk
                 if not await _safe_bytes(ws, framed):
                     break
                 audio_seq += 1
@@ -409,6 +481,7 @@ async def ws_voice(ws: WebSocket):
     async def _handle_utterance(audio_bytes: bytes):
         nonlocal _active_streamer
         # ── STT ───────────────────────────────────────────────────────────────
         transcript = await stt.transcribe(audio_bytes)
@@ -419,7 +492,10 @@ async def ws_voice(ws: WebSocket):
             return
         print(f"[VOICE] [{user_id}] STT: {transcript}")
-        if not await _safe_text(ws, {"type": "stt", "text": transcript}):
             return
         # ── LLM + TTS (concurrent) ─────────────────────────────────────────────
@@ -434,8 +510,9 @@ async def ws_voice(ws: WebSocket):
                 async for token in stream:
                     if not token:
                         continue
                     full_text += token
-                    if not await _safe_text(ws, {"type": "llm_token", "token": token}):
                         break
                     await tts_streamer.add_token(token)
             except asyncio.CancelledError:
@@ -446,7 +523,7 @@ async def ws_voice(ws: WebSocket):
                 # Best-effort: send the full text once at the end so the UI can
                 # recover if it missed any streamed tokens.
                 if full_text:
-                    await _safe_text(ws, {"type": "llm_full", "text": _normalize_ai_text(full_text)})
                 await tts_streamer.flush()
         async def run_tts_framed():
@@ -457,7 +534,7 @@ async def ws_voice(ws: WebSocket):
             """
             nonlocal audio_seq
             async for chunk in tts_streamer.stream_audio():
-                framed = struct.pack(">I", audio_seq) + chunk
                 if not await _safe_bytes(ws, framed):
                     break
                 audio_seq += 1

 import asyncio
 import json
 import os
+import re
 import struct
 import uuid
 from contextlib import asynccontextmanager
 #  WEBSOCKET HELPERS
 # ══════════════════════════════════════════════════════════════════════════════
+_DIGIT_WORDS = {
+    "0": "শূন্য",
+    "1": "এক",
+    "2": "দুই",
+    "3": "তিন",
+    "4": "চার",
+    "5": "পাঁচ",
+    "6": "ছয়",
+    "7": "সাত",
+    "8": "আট",
+    "9": "নয়",
+    "০": "শূন্য",
+    "১": "এক",
+    "২": "দুই",
+    "৩": "তিন",
+    "৪": "চার",
+    "৫": "পাঁচ",
+    "৬": "ছয়",
+    "৭": "সাত",
+    "৮": "আট",
+    "৯": "নয়",
+    "٠": "শূন্য",
+    "١": "এক",
+    "٢": "দুই",
+    "٣": "তিন",
+    "٤": "চার",
+    "٥": "পাঁচ",
+    "٦": "ছয়",
+    "٧": "সাত",
+    "٨": "আট",
+    "٩": "নয়",
+}
+def _spoken_digits(chunk: str) -> str:
+    digits = [ch for ch in chunk if ch in _DIGIT_WORDS]
+    if len(digits) < 10:
+        return chunk
+    spoken = " ".join(_DIGIT_WORDS[ch] for ch in digits)
+    return spoken
+def _expand_phone_like_numbers(text: str) -> str:
+    if not text:
+        return ""
+    def repl(match: re.Match[str]) -> str:
+        chunk = match.group(0)
+        spoken = _spoken_digits(chunk)
+        if spoken == chunk:
+            return chunk
+        prev_char = text[match.start() - 1] if match.start() > 0 else ""
+        next_char = text[match.end()] if match.end() < len(text) else ""
+        if prev_char and not prev_char.isspace() and prev_char not in "([<{\"'":
+            spoken = " " + spoken
+        if next_char and not next_char.isspace() and next_char not in ")]>.,!?;:}\"'":
+            spoken = spoken + " "
+        return spoken
+    return re.sub(r"[+\d০-৯٠-٩][\d০-৯٠-٩\s().\-]{8,}[\d০-৯٠-٩]", repl, text)
 def _normalize_ai_text(text: str) -> str:
     """
     Apply small UX wording normalizations to assistant-visible text.
     out = text
     out = out.replace("উপলব্ধ", "এভেলেবেল")
     out = out.replace("জ্বি", "আচ্ছা")
+    out = _expand_phone_like_numbers(out)
     return out
                 full_text = ""
                 async for token in stream:
                     if token:
+                        token = _normalize_ai_text(token)
                         full_text += token
                         await _safe_text(ws, {"type": "llm_token", "token": token})
                 # Ensure the final rendered message uses normalized wording.
     # by the client UI (e.g., brain-mode welcome).
     _utterance_q: asyncio.Queue[object | None] = asyncio.Queue()
     _worker_task: asyncio.Task | None = None
+    _turn_id: int = 0
     async def _cancel_active():
         nonlocal _active_streamer, _active_task
             await _safe_text(ws, {"type": "end"})
             return
+        nonlocal _turn_id
+        _turn_id += 1
+        my_turn = _turn_id
         tts_streamer = ParallelTTSStreamer()
         _active_streamer = tts_streamer
         audio_seq = 0
         async def run_text():
             try:
+                await _safe_text(ws, {"type": "llm_full", "text": speak_text, "turn": my_turn})
                 await tts_streamer.add_token(speak_text)
             except asyncio.CancelledError:
                 raise
         async def run_tts_framed():
             nonlocal audio_seq
             async for chunk in tts_streamer.stream_audio():
+                framed = struct.pack(">II", my_turn, audio_seq) + chunk
                 if not await _safe_bytes(ws, framed):
                     break
                 audio_seq += 1
     async def _handle_utterance(audio_bytes: bytes):
         nonlocal _active_streamer
+        nonlocal _turn_id
         # ── STT ───────────────────────────────────────────────────────────────
         transcript = await stt.transcribe(audio_bytes)
             return
         print(f"[VOICE] [{user_id}] STT: {transcript}")
+        _turn_id += 1
+        my_turn = _turn_id
+        if not await _safe_text(ws, {"type": "stt", "text": transcript, "turn": my_turn}):
             return
         # ── LLM + TTS (concurrent) ─────────────────────────────────────────────
                 async for token in stream:
                     if not token:
                         continue
+                    token = _normalize_ai_text(token)
                     full_text += token
+                    if not await _safe_text(ws, {"type": "llm_token", "token": token, "turn": my_turn}):
                         break
                     await tts_streamer.add_token(token)
             except asyncio.CancelledError:
                 # Best-effort: send the full text once at the end so the UI can
                 # recover if it missed any streamed tokens.
                 if full_text:
+                    await _safe_text(ws, {"type": "llm_full", "text": _normalize_ai_text(full_text), "turn": my_turn})
                 await tts_streamer.flush()
         async def run_tts_framed():
             """
             nonlocal audio_seq
             async for chunk in tts_streamer.stream_audio():
+                framed = struct.pack(">II", my_turn, audio_seq) + chunk
                 if not await _safe_bytes(ws, framed):
                     break
                 audio_seq += 1

core/backend.py CHANGED Viewed

@@ -30,6 +30,8 @@ from langchain_ollama import ChatOllama
 load_dotenv()
 # ═══════════════════════════════════════════════════════════════════════════════
 #  STATE
@@ -122,6 +124,62 @@ def _normalize_digits(text: str) -> str:
     return _clean_text(text).translate(_DIGIT_TRANSLATION)
 DAY_ALIASES = {
     "sunday": "Sunday",
     "monday": "Monday",
@@ -427,7 +485,7 @@ def _format_email_html(subject: str, body_text: str) -> str:
       <div style="background:#ffffff;border-radius:14px;border:1px solid #e6e8f0;overflow:hidden;">
         <div style="padding:18px 20px;background:linear-gradient(135deg,#0ea5e9,#8b5cf6);color:#fff;">
           <div style="font-size:16px;font-weight:700;">{subject}</div>
-          <div style="font-size:12px;opacity:.9;margin-top:4px;">Aasha • Hospital Assistant</div>
         </div>
         <div style="padding:18px 20px;color:#0f172a;font-size:14px;line-height:1.55;">
           {safe}
@@ -470,7 +528,7 @@ def _format_appt_email_text(
     ]
     if extra:
         lines.extend(["", extra.strip()])
-    lines.extend(["", "Thank you.", "Aasha • Hospital Assistant"])
     return "\n".join(lines)
 # ═══════════════════════════════════════════════════════════════════════════════
@@ -1227,7 +1285,7 @@ async def delete_appointment(
 #  SYSTEM PROMPT
 # ═══════════════════════════════════════════════════════════════════════════════
 BASE_SYSTEM = """
-You are Aasha, a warm, Bangla-first hospital phone-call assistant and medical appointment concierge.
 Your job is to help people find doctors, check availability, and manage appointments.
 PERSONA (Voice, Vibe & Emotion Layer)
@@ -1316,8 +1374,9 @@ LANGUAGE RULE
     - "দুই হাজার বিশ সাল"
 - Mobile Number Format (spoken Bangla style):
-    - When you SAY or READ a phone number aloud in Bangla, spell it digit-by-digit using Bangla digit words, separated by spaces.
-      Do NOT read it as a single large number.
     - Example spoken formats:
         - "শূন্য এক ছয় তিন আট আট তিন শূন্য এক ছয় পাঁচ"
         - "শূন্য এক তিন দুই শূন্য শূন্য শূন্য নয় দুই তিন শূন্য"
@@ -1570,7 +1629,7 @@ class AIBackend:
             if _has_tool_calls(retry_response):
                 response = retry_response
-        print(f"[AI]: {str(response.content)[:200]}")
         print(">>>>>>>>>> CHAT NODE END <<<<<<<<<<")
         return {"messages": [response]}

 load_dotenv()
+PROJECT_NAME = "Hospital Assistant"
+AI_NAME = "আয়েশা"
 # ═══════════════════════════════════════════════════════════════════════════════
 #  STATE
     return _clean_text(text).translate(_DIGIT_TRANSLATION)
+_SPOKEN_DIGIT_WORDS = {
+    "0": "শূন্য",
+    "1": "এক",
+    "2": "দুই",
+    "3": "তিন",
+    "4": "চার",
+    "5": "পাঁচ",
+    "6": "ছয়",
+    "7": "সাত",
+    "8": "আট",
+    "9": "নয়",
+    "০": "শূন্য",
+    "১": "এক",
+    "২": "দুই",
+    "৩": "তিন",
+    "৪": "চার",
+    "৫": "পাঁচ",
+    "৬": "ছয়",
+    "৭": "সাত",
+    "৮": "আট",
+    "৯": "নয়",
+    "٠": "শূন্য",
+    "١": "এক",
+    "٢": "দুই",
+    "٣": "তিন",
+    "٤": "চার",
+    "٥": "পাঁচ",
+    "٦": "ছয়",
+    "٧": "সাত",
+    "٨": "আট",
+    "٩": "নয়",
+}
+def _spoken_phone_text(text: str) -> str:
+    if not text:
+        return ""
+    def repl(match: re.Match[str]) -> str:
+        chunk = match.group(0)
+        digits = [ch for ch in chunk if ch in _SPOKEN_DIGIT_WORDS]
+        if len(digits) < 10:
+            return chunk
+        spoken = " ".join(_SPOKEN_DIGIT_WORDS[ch] for ch in digits)
+        prev_char = text[match.start() - 1] if match.start() > 0 else ""
+        next_char = text[match.end()] if match.end() < len(text) else ""
+        if prev_char and not prev_char.isspace() and prev_char not in "([<{\"'":
+            spoken = " " + spoken
+        if next_char and not next_char.isspace() and next_char not in ")]>.,!?;:}\"'":
+            spoken = spoken + " "
+        return spoken
+    out = re.sub(r"[+\d০-৯٠-٩][\d০-৯٠-٩\s().\-]{8,}[\d০-৯٠-٩]", repl, text)
+    return re.sub(r"[ \t]{2,}", " ", out)
 DAY_ALIASES = {
     "sunday": "Sunday",
     "monday": "Monday",
       <div style="background:#ffffff;border-radius:14px;border:1px solid #e6e8f0;overflow:hidden;">
         <div style="padding:18px 20px;background:linear-gradient(135deg,#0ea5e9,#8b5cf6);color:#fff;">
           <div style="font-size:16px;font-weight:700;">{subject}</div>
+          <div style="font-size:12px;opacity:.9;margin-top:4px;">{AI_NAME} • {PROJECT_NAME}</div>
         </div>
         <div style="padding:18px 20px;color:#0f172a;font-size:14px;line-height:1.55;">
           {safe}
     ]
     if extra:
         lines.extend(["", extra.strip()])
+    lines.extend(["", "Thank you.", f"{AI_NAME} • {PROJECT_NAME}"])
     return "\n".join(lines)
 # ═══════════════════════════════════════════════════════════════════════════════
 #  SYSTEM PROMPT
 # ═══════════════════════════════════════════════════════════════════════════════
 BASE_SYSTEM = """
+You are আয়েশা, a warm, Bangla-first hospital phone-call assistant and medical appointment concierge.
 Your job is to help people find doctors, check availability, and manage appointments.
 PERSONA (Voice, Vibe & Emotion Layer)
     - "দুই হাজার বিশ সাল"
 - Mobile Number Format (spoken Bangla style):
+    - When you SAY or READ a phone number aloud in Bangla, ALWAYS spell it digit-by-digit using Bangla digit words, separated by spaces.
+      Never output the raw digit string.
+    - If the number is attached to other words, insert spaces around it so it is easy to hear.
     - Example spoken formats:
         - "শূন্য এক ছয় তিন আট আট তিন শূন্য এক ছয় পাঁচ"
         - "শূন্য এক তিন দুই শূন্য শূন্য শূন্য নয় দুই তিন শূন্য"
             if _has_tool_calls(retry_response):
                 response = retry_response
+        print(f"[AI]: {_spoken_phone_text(str(response.content))[:200]}")
         print(">>>>>>>>>> CHAT NODE END <<<<<<<<<<")
         return {"messages": [response]}

frontend/script.js CHANGED Viewed

@@ -102,6 +102,7 @@ let _audioChain = Promise.resolve();
 let _playbackGen = 0;
 let _expectedSeq = 0;
 let _pendingAudio = new Map();
 // Client-side playback speed multiplier.
 // This makes speech faster immediately even if the TTS provider speed setting
@@ -116,8 +117,68 @@ let voicePendingPackets = [];
 let brainLastResponse = '';
 let _brainWelcomed = false;
 const BRAIN_WELCOME_TEXT =
-  '[calm] হ্যালো! আমি আপনার ভয়েস সহকারী। আপনি কীভাবে সাহায্য চান?';
 // ─── Recording state ──────────────────────────────────────────────────────────
 let micStream = null;
@@ -311,12 +372,17 @@ function onVoiceMsg(ev) {
   if (ev.data instanceof ArrayBuffer) {
     if (Date.now() < _dropAudioUntil) return; // drop late packets after cancel
     _ttsPlaying = true;
-    // Framed audio: 4-byte big-endian seq id + raw audio bytes.
-    // We buffer/reorder by seq so playback always matches text order.
     const u8 = new Uint8Array(ev.data);
-    if (u8.length <= 4) return;
-    const seq = (u8[0] << 24) | (u8[1] << 16) | (u8[2] << 8) | (u8[3] << 0);
-    const payload = ev.data.slice(4);
     _pendingAudio.set(seq >>> 0, payload);
     const gen = _playbackGen;
@@ -351,6 +417,7 @@ function onVoiceMsg(ev) {
     case 'stt':
       // New turn: reset audio ordering/buffers.
       _expectedSeq = 0;
       _pendingAudio.clear();
       tStt = Date.now();
@@ -369,13 +436,15 @@ function onVoiceMsg(ev) {
     case 'llm_token':
       if (!msg.token) break;
       if (tLlm === 0) {
         tLlm = Date.now();
         if (tStt > 0) mLlm.textContent = tLlm - tStt + ' ms';
       }
       _removeThinking();
-      _setCaption(aiTxt + msg.token);
-      brainLastResponse = aiTxt + msg.token;
       _brainSetTtsBubble(brainLastResponse);
       _brainModeSetSearch(true);
       if (!brainMode) {
@@ -384,10 +453,7 @@ function onVoiceMsg(ev) {
           aiEl.className = 'message ai';
           chatBox.appendChild(aiEl);
         }
-        aiTxt += msg.token;
         _renderAiText();
-      } else {
-        aiTxt += msg.token;
       }
       break;
@@ -395,7 +461,13 @@ function onVoiceMsg(ev) {
       if (!msg.text) break;
       // Best-effort recovery path: if any streamed tokens were dropped, the
       // server sends the final full text once at turn end.
-      brainLastResponse = msg.text;
       _brainSetTtsBubble(brainLastResponse);
       if (!brainMode) {
         if (!aiEl) {
@@ -403,10 +475,7 @@ function onVoiceMsg(ev) {
           aiEl.className = 'message ai';
           chatBox.appendChild(aiEl);
         }
-        aiTxt = msg.text;
         _renderAiText();
-      } else {
-        aiTxt = msg.text;
       }
       break;
@@ -639,7 +708,9 @@ function _done() {
 function stopAllAudio() {
   _cancelled = true;
   _ttsPlaying = false;
-  _dropAudioUntil = Date.now() + 700;
   _playbackGen++;
   _audioChain = Promise.resolve();
   _expectedSeq = 0;

 let _playbackGen = 0;
 let _expectedSeq = 0;
 let _pendingAudio = new Map();
+let _currentTurn = 0;
 // Client-side playback speed multiplier.
 // This makes speech faster immediately even if the TTS provider speed setting
 let brainLastResponse = '';
 let _brainWelcomed = false;
// Spoken Bangla word for every digit glyph recognised when showing a phone
// number: ASCII, Bengali and Arabic-Indic digits.
const SPOKEN_DIGIT_WORDS = {
  '0': 'শূন্য',
  '1': 'এক',
  '2': 'দুই',
  '3': 'তিন',
  '4': 'চার',
  '5': 'পাঁচ',
  '6': 'ছয়',
  '7': 'সাত',
  '8': 'আট',
  '9': 'নয়',
  '০': 'শূন্য',
  '১': 'এক',
  '২': 'দুই',
  '৩': 'তিন',
  '৪': 'চার',
  '৫': 'পাঁচ',
  '৬': 'ছয়',
  '৭': 'সাত',
  '৮': 'আট',
  '৯': 'নয়',
  '٠': 'শূন্য',
  '١': 'এক',
  '٢': 'দুই',
  '٣': 'তিন',
  '٤': 'চার',
  '٥': 'পাঁচ',
  '٦': 'ছয়',
  '٧': 'সাত',
  '٨': 'আট',
  '٩': 'নয়',
};

// Candidate phone-like run: starts with "+" or a digit, ends with a digit.
// Separators are limited to space, tab, no-break space, parentheses, dot and
// hyphen. Line breaks are excluded on purpose so a number at the end of one
// line never absorbs digits (e.g. a list index) from the next line.
const _PHONE_LIKE_RE =
  /[+0-9০-৯٠-٩][0-9০-৯٠-٩ \t\u00a0().\-]{8,}[0-9০-৯٠-٩]/g;

/**
 * Spell the digits of `chunk` as space-separated Bangla digit words.
 * Non-digit characters are discarded. Returns `chunk` unchanged when it
 * contains fewer than 10 recognised digits.
 */
function _spokenDigitWords(chunk) {
  const digits = Array.from(chunk).filter((ch) => ch in SPOKEN_DIGIT_WORDS);
  if (digits.length < 10) return chunk;
  return digits.map((ch) => SPOKEN_DIGIT_WORDS[ch]).join(' ');
}

/**
 * Normalize assistant text before it is shown: apply the two fixed wording
 * replacements, spell phone-like digit runs digit-by-digit, pad the spoken
 * words with a space when they would touch a neighbouring word, and collapse
 * runs of spaces/tabs. Returns '' for empty input.
 */
function _normalizeVisibleAiText(text) {
  if (!text) return '';
  const reworded = String(text)
    .replaceAll('উপলব্ধ', 'এভেলেবেল')
    .replaceAll('জ্বি', 'আচ্ছা');
  const expanded = reworded.replace(_PHONE_LIKE_RE, (match, offset, whole) => {
    const spoken = _spokenDigitWords(match);
    // Too few digits to be a phone number: keep the original text.
    if (spoken === match) return match;
    const end = offset + match.length;
    const prev = offset > 0 ? whole[offset - 1] : '';
    const next = end < whole.length ? whole[end] : '';
    let value = spoken;
    if (prev && !/\s/.test(prev) && !/[([<{\"']/.test(prev)) value = ' ' + value;
    if (next && !/\s/.test(next) && !/[\])>.,!?;:}\"']/.test(next)) value = value + ' ';
    return value;
  });
  return expanded.replace(/[ \t]{2,}/g, ' ');
}
 const BRAIN_WELCOME_TEXT =
+  '[calm] হ্যালো, আমি আয়েশা! হাসপাতাল রিসেপশন থেকে বলছি। আপনি কি কোনো অ্যাপয়েন্টমেন্ট বুক করতে চান?';
 // ─── Recording state ──────────────────────────────────────────────────────────
 let micStream = null;
   if (ev.data instanceof ArrayBuffer) {
     if (Date.now() < _dropAudioUntil) return; // drop late packets after cancel
     _ttsPlaying = true;
+    // Framed audio: 4-byte big-endian turn id + 4-byte big-endian seq id + raw audio bytes.
+    // We buffer/reorder by seq inside a turn, and ignore late packets from older turns.
     const u8 = new Uint8Array(ev.data);
+    if (u8.length <= 8) return;
+    const turn =
+      (u8[0] << 24) | (u8[1] << 16) | (u8[2] << 8) | (u8[3] << 0);
+    const seq =
+      (u8[4] << 24) | (u8[5] << 16) | (u8[6] << 8) | (u8[7] << 0);
+    const turnU = turn >>> 0;
+    if (turnU !== (_currentTurn >>> 0)) return;
+    const payload = ev.data.slice(8);
     _pendingAudio.set(seq >>> 0, payload);
     const gen = _playbackGen;
     case 'stt':
       // New turn: reset audio ordering/buffers.
+      if (typeof msg.turn === 'number') _currentTurn = msg.turn >>> 0;
       _expectedSeq = 0;
       _pendingAudio.clear();
       tStt = Date.now();
     case 'llm_token':
       if (!msg.token) break;
+      const tokenText = _normalizeVisibleAiText(msg.token);
       if (tLlm === 0) {
         tLlm = Date.now();
         if (tStt > 0) mLlm.textContent = tLlm - tStt + ' ms';
       }
       _removeThinking();
+      aiTxt = _normalizeVisibleAiText(aiTxt + tokenText);
+      _setCaption(aiTxt);
+      brainLastResponse = aiTxt;
       _brainSetTtsBubble(brainLastResponse);
       _brainModeSetSearch(true);
       if (!brainMode) {
           aiEl.className = 'message ai';
           chatBox.appendChild(aiEl);
         }
         _renderAiText();
       }
       break;
       if (!msg.text) break;
       // Best-effort recovery path: if any streamed tokens were dropped, the
       // server sends the final full text once at turn end.
+      if (typeof msg.turn === 'number') {
+        _currentTurn = msg.turn >>> 0;
+        _expectedSeq = 0;
+        _pendingAudio.clear();
+      }
+      brainLastResponse = _normalizeVisibleAiText(msg.text);
+      aiTxt = brainLastResponse;
       _brainSetTtsBubble(brainLastResponse);
       if (!brainMode) {
         if (!aiEl) {
           aiEl.className = 'message ai';
           chatBox.appendChild(aiEl);
         }
         _renderAiText();
       }
       break;
 function stopAllAudio() {
   _cancelled = true;
   _ttsPlaying = false;
+  // With turn-id framed audio, we can shorten the drop window; late packets
+  // are ignored by turn mismatch.
+  _dropAudioUntil = Date.now() + 120;
   _playbackGen++;
   _audioChain = Promise.resolve();
   _expectedSeq = 0;

services/streaming.py CHANGED Viewed

@@ -61,6 +61,61 @@ SENTENCE_BOUNDARIES = frozenset(".!?।॥\n")
 CLAUSE_BOUNDARIES   = frozenset(",;:—–")
 _SENTINEL           = object()
 def _clean_for_tts(text: str) -> str:
     # Strip emotion/tone tags like "[calm]" "[neutral]" "[happy]" etc.
@@ -79,6 +134,7 @@ def _clean_for_tts(text: str) -> str:
     text = re.sub(r"\n{2,}", "\n", text)
     # Collapse runs of spaces introduced by tag removal.
     text = re.sub(r"[ \t]{2,}", " ", text)
     # Keep normal spaces so chunk boundaries don't glue words together.
     return text.strip("\n\r\t")

 CLAUSE_BOUNDARIES   = frozenset(",;:—–")
 _SENTINEL           = object()
+_DIGIT_WORDS = {
+    "0": "শূন্য",
+    "1": "এক",
+    "2": "দুই",
+    "3": "তিন",
+    "4": "চার",
+    "5": "পাঁচ",
+    "6": "ছয়",
+    "7": "সাত",
+    "8": "আট",
+    "9": "নয়",
+    "০": "শূন্য",
+    "১": "এক",
+    "২": "দুই",
+    "৩": "তিন",
+    "৪": "চার",
+    "৫": "পাঁচ",
+    "৬": "ছয়",
+    "৭": "সাত",
+    "৮": "আট",
+    "৯": "নয়",
+    "٠": "শূন্য",
+    "١": "এক",
+    "٢": "দুই",
+    "٣": "তিন",
+    "٤": "চার",
+    "٥": "পাঁচ",
+    "٦": "ছয়",
+    "٧": "সাত",
+    "٨": "আট",
+    "٩": "নয়",
+}
+def _spoken_phone_text(text: str) -> str:
+    if not text:
+        return ""
+    def repl(match: re.Match[str]) -> str:
+        chunk = match.group(0)
+        digits = [ch for ch in chunk if ch in _DIGIT_WORDS]
+        if len(digits) < 10:
+            return chunk
+        spoken = " ".join(_DIGIT_WORDS[ch] for ch in digits)
+        prev_char = text[match.start() - 1] if match.start() > 0 else ""
+        next_char = text[match.end()] if match.end() < len(text) else ""
+        if prev_char and not prev_char.isspace() and prev_char not in "([<{\"'":
+            spoken = " " + spoken
+        if next_char and not next_char.isspace() and next_char not in ")]>.,!?;:}\"'":
+            spoken = spoken + " "
+        return spoken
+    out = re.sub(r"[+\d০-৯٠-٩][\d০-৯٠-٩\s().\-]{8,}[\d০-৯٠-٩]", repl, text)
+    return re.sub(r"[ \t]{2,}", " ", out)
 def _clean_for_tts(text: str) -> str:
     # Strip emotion/tone tags like "[calm]" "[neutral]" "[happy]" etc.
     text = re.sub(r"\n{2,}", "\n", text)
     # Collapse runs of spaces introduced by tag removal.
     text = re.sub(r"[ \t]{2,}", " ", text)
+    text = _spoken_phone_text(text)
     # Keep normal spaces so chunk boundaries don't glue words together.
     return text.strip("\n\r\t")