rakib72642 committed on
Commit
16676c4
·
1 Parent(s): 440d1f1

adjusted mobile number problem + numbers problem fixed

Browse files
Files changed (4) hide show
  1. app.py +83 -6
  2. core/backend.py +65 -6
  3. frontend/script.js +87 -16
  4. services/streaming.py +56 -0
app.py CHANGED
@@ -26,6 +26,7 @@ LLM+TTS) preserved.
26
  import asyncio
27
  import json
28
  import os
 
29
  import struct
30
  import uuid
31
  from contextlib import asynccontextmanager
@@ -188,6 +189,70 @@ async def rtc_close(session_id: str):
188
  # WEBSOCKET HELPERS
189
  # ══════════════════════════════════════════════════════════════════════════════
190
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
  def _normalize_ai_text(text: str) -> str:
192
  """
193
  Apply small UX wording normalizations to assistant-visible text.
@@ -198,6 +263,7 @@ def _normalize_ai_text(text: str) -> str:
198
  out = text
199
  out = out.replace("উপলব্ধ", "এভেলেবেল")
200
  out = out.replace("জ্বি", "আচ্ছা")
 
201
  return out
202
 
203
 
@@ -283,6 +349,7 @@ async def ws_chat(ws: WebSocket):
283
  full_text = ""
284
  async for token in stream:
285
  if token:
 
286
  full_text += token
287
  await _safe_text(ws, {"type": "llm_token", "token": token})
288
  # Ensure the final rendered message uses normalized wording.
@@ -348,6 +415,7 @@ async def ws_voice(ws: WebSocket):
348
  # by the client UI (e.g., brain-mode welcome).
349
  _utterance_q: asyncio.Queue[object | None] = asyncio.Queue()
350
  _worker_task: asyncio.Task | None = None
 
351
 
352
  async def _cancel_active():
353
  nonlocal _active_streamer, _active_task
@@ -382,13 +450,17 @@ async def ws_voice(ws: WebSocket):
382
  await _safe_text(ws, {"type": "end"})
383
  return
384
 
 
 
 
 
385
  tts_streamer = ParallelTTSStreamer()
386
  _active_streamer = tts_streamer
387
  audio_seq = 0
388
 
389
  async def run_text():
390
  try:
391
- await _safe_text(ws, {"type": "llm_full", "text": speak_text})
392
  await tts_streamer.add_token(speak_text)
393
  except asyncio.CancelledError:
394
  raise
@@ -398,7 +470,7 @@ async def ws_voice(ws: WebSocket):
398
  async def run_tts_framed():
399
  nonlocal audio_seq
400
  async for chunk in tts_streamer.stream_audio():
401
- framed = struct.pack(">I", audio_seq) + chunk
402
  if not await _safe_bytes(ws, framed):
403
  break
404
  audio_seq += 1
@@ -409,6 +481,7 @@ async def ws_voice(ws: WebSocket):
409
 
410
  async def _handle_utterance(audio_bytes: bytes):
411
  nonlocal _active_streamer
 
412
 
413
  # ── STT ───────────────────────────────────────────────────────────────
414
  transcript = await stt.transcribe(audio_bytes)
@@ -419,7 +492,10 @@ async def ws_voice(ws: WebSocket):
419
  return
420
 
421
  print(f"[VOICE] [{user_id}] STT: {transcript}")
422
- if not await _safe_text(ws, {"type": "stt", "text": transcript}):
 
 
 
423
  return
424
 
425
  # ── LLM + TTS (concurrent) ─────────────────────────────────────────────
@@ -434,8 +510,9 @@ async def ws_voice(ws: WebSocket):
434
  async for token in stream:
435
  if not token:
436
  continue
 
437
  full_text += token
438
- if not await _safe_text(ws, {"type": "llm_token", "token": token}):
439
  break
440
  await tts_streamer.add_token(token)
441
  except asyncio.CancelledError:
@@ -446,7 +523,7 @@ async def ws_voice(ws: WebSocket):
446
  # Best-effort: send the full text once at the end so the UI can
447
  # recover if it missed any streamed tokens.
448
  if full_text:
449
- await _safe_text(ws, {"type": "llm_full", "text": _normalize_ai_text(full_text)})
450
  await tts_streamer.flush()
451
 
452
  async def run_tts_framed():
@@ -457,7 +534,7 @@ async def ws_voice(ws: WebSocket):
457
  """
458
  nonlocal audio_seq
459
  async for chunk in tts_streamer.stream_audio():
460
- framed = struct.pack(">I", audio_seq) + chunk
461
  if not await _safe_bytes(ws, framed):
462
  break
463
  audio_seq += 1
 
26
  import asyncio
27
  import json
28
  import os
29
+ import re
30
  import struct
31
  import uuid
32
  from contextlib import asynccontextmanager
 
189
  # WEBSOCKET HELPERS
190
  # ══════════════════════════════════════════════════════════════════════════════
191
 
192
+ _DIGIT_WORDS = {
193
+ "0": "শূন্য",
194
+ "1": "এক",
195
+ "2": "দুই",
196
+ "3": "তিন",
197
+ "4": "চার",
198
+ "5": "পাঁচ",
199
+ "6": "ছয়",
200
+ "7": "সাত",
201
+ "8": "আট",
202
+ "9": "নয়",
203
+ "০": "শূন্য",
204
+ "১": "এক",
205
+ "২": "দুই",
206
+ "৩": "তিন",
207
+ "৪": "চার",
208
+ "৫": "পাঁচ",
209
+ "৬": "ছয়",
210
+ "৭": "সাত",
211
+ "৮": "আট",
212
+ "৯": "নয়",
213
+ "٠": "শূন্য",
214
+ "١": "এক",
215
+ "٢": "দুই",
216
+ "٣": "তিন",
217
+ "٤": "চার",
218
+ "٥": "পাঁচ",
219
+ "٦": "ছয়",
220
+ "٧": "সাত",
221
+ "٨": "আট",
222
+ "٩": "নয়",
223
+ }
224
+
225
+
226
+ def _spoken_digits(chunk: str) -> str:
227
+ digits = [ch for ch in chunk if ch in _DIGIT_WORDS]
228
+ if len(digits) < 10:
229
+ return chunk
230
+ spoken = " ".join(_DIGIT_WORDS[ch] for ch in digits)
231
+ return spoken
232
+
233
+
234
+ def _expand_phone_like_numbers(text: str) -> str:
235
+ if not text:
236
+ return ""
237
+
238
+ def repl(match: re.Match[str]) -> str:
239
+ chunk = match.group(0)
240
+ spoken = _spoken_digits(chunk)
241
+ if spoken == chunk:
242
+ return chunk
243
+
244
+ prev_char = text[match.start() - 1] if match.start() > 0 else ""
245
+ next_char = text[match.end()] if match.end() < len(text) else ""
246
+
247
+ if prev_char and not prev_char.isspace() and prev_char not in "([<{\"'":
248
+ spoken = " " + spoken
249
+ if next_char and not next_char.isspace() and next_char not in ")]>.,!?;:}\"'":
250
+ spoken = spoken + " "
251
+ return spoken
252
+
253
+ return re.sub(r"[+\d০-৯٠-٩][\d০-৯٠-٩\s().\-]{8,}[\d০-৯٠-٩]", repl, text)
254
+
255
+
256
  def _normalize_ai_text(text: str) -> str:
257
  """
258
  Apply small UX wording normalizations to assistant-visible text.
 
263
  out = text
264
  out = out.replace("উপলব্ধ", "এভেলেবেল")
265
  out = out.replace("জ্বি", "আচ্ছা")
266
+ out = _expand_phone_like_numbers(out)
267
  return out
268
 
269
 
 
349
  full_text = ""
350
  async for token in stream:
351
  if token:
352
+ token = _normalize_ai_text(token)
353
  full_text += token
354
  await _safe_text(ws, {"type": "llm_token", "token": token})
355
  # Ensure the final rendered message uses normalized wording.
 
415
  # by the client UI (e.g., brain-mode welcome).
416
  _utterance_q: asyncio.Queue[object | None] = asyncio.Queue()
417
  _worker_task: asyncio.Task | None = None
418
+ _turn_id: int = 0
419
 
420
  async def _cancel_active():
421
  nonlocal _active_streamer, _active_task
 
450
  await _safe_text(ws, {"type": "end"})
451
  return
452
 
453
+ nonlocal _turn_id
454
+ _turn_id += 1
455
+ my_turn = _turn_id
456
+
457
  tts_streamer = ParallelTTSStreamer()
458
  _active_streamer = tts_streamer
459
  audio_seq = 0
460
 
461
  async def run_text():
462
  try:
463
+ await _safe_text(ws, {"type": "llm_full", "text": speak_text, "turn": my_turn})
464
  await tts_streamer.add_token(speak_text)
465
  except asyncio.CancelledError:
466
  raise
 
470
  async def run_tts_framed():
471
  nonlocal audio_seq
472
  async for chunk in tts_streamer.stream_audio():
473
+ framed = struct.pack(">II", my_turn, audio_seq) + chunk
474
  if not await _safe_bytes(ws, framed):
475
  break
476
  audio_seq += 1
 
481
 
482
  async def _handle_utterance(audio_bytes: bytes):
483
  nonlocal _active_streamer
484
+ nonlocal _turn_id
485
 
486
  # ── STT ───────────────────────────────────────────────────────────────
487
  transcript = await stt.transcribe(audio_bytes)
 
492
  return
493
 
494
  print(f"[VOICE] [{user_id}] STT: {transcript}")
495
+ _turn_id += 1
496
+ my_turn = _turn_id
497
+
498
+ if not await _safe_text(ws, {"type": "stt", "text": transcript, "turn": my_turn}):
499
  return
500
 
501
  # ── LLM + TTS (concurrent) ─────────────────────────────────────────────
 
510
  async for token in stream:
511
  if not token:
512
  continue
513
+ token = _normalize_ai_text(token)
514
  full_text += token
515
+ if not await _safe_text(ws, {"type": "llm_token", "token": token, "turn": my_turn}):
516
  break
517
  await tts_streamer.add_token(token)
518
  except asyncio.CancelledError:
 
523
  # Best-effort: send the full text once at the end so the UI can
524
  # recover if it missed any streamed tokens.
525
  if full_text:
526
+ await _safe_text(ws, {"type": "llm_full", "text": _normalize_ai_text(full_text), "turn": my_turn})
527
  await tts_streamer.flush()
528
 
529
  async def run_tts_framed():
 
534
  """
535
  nonlocal audio_seq
536
  async for chunk in tts_streamer.stream_audio():
537
+ framed = struct.pack(">II", my_turn, audio_seq) + chunk
538
  if not await _safe_bytes(ws, framed):
539
  break
540
  audio_seq += 1
core/backend.py CHANGED
@@ -30,6 +30,8 @@ from langchain_ollama import ChatOllama
30
 
31
  load_dotenv()
32
 
 
 
33
 
34
  # ═══════════════════════════════════════════════════════════════════════════════
35
  # STATE
@@ -122,6 +124,62 @@ def _normalize_digits(text: str) -> str:
122
  return _clean_text(text).translate(_DIGIT_TRANSLATION)
123
 
124
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  DAY_ALIASES = {
126
  "sunday": "Sunday",
127
  "monday": "Monday",
@@ -427,7 +485,7 @@ def _format_email_html(subject: str, body_text: str) -> str:
427
  <div style="background:#ffffff;border-radius:14px;border:1px solid #e6e8f0;overflow:hidden;">
428
  <div style="padding:18px 20px;background:linear-gradient(135deg,#0ea5e9,#8b5cf6);color:#fff;">
429
  <div style="font-size:16px;font-weight:700;">{subject}</div>
430
- <div style="font-size:12px;opacity:.9;margin-top:4px;">AashaHospital Assistant</div>
431
  </div>
432
  <div style="padding:18px 20px;color:#0f172a;font-size:14px;line-height:1.55;">
433
  {safe}
@@ -470,7 +528,7 @@ def _format_appt_email_text(
470
  ]
471
  if extra:
472
  lines.extend(["", extra.strip()])
473
- lines.extend(["", "Thank you.", "AashaHospital Assistant"])
474
  return "\n".join(lines)
475
 
476
  # ═══════════════════════════════════════════════════════════════════════════════
@@ -1227,7 +1285,7 @@ async def delete_appointment(
1227
  # SYSTEM PROMPT
1228
  # ═══════════════════════════════════════════════════════════════════════════════
1229
  BASE_SYSTEM = """
1230
- You are Aasha, a warm, Bangla-first hospital phone-call assistant and medical appointment concierge.
1231
  Your job is to help people find doctors, check availability, and manage appointments.
1232
 
1233
  PERSONA (Voice, Vibe & Emotion Layer)
@@ -1316,8 +1374,9 @@ LANGUAGE RULE
1316
  - "দুই হাজার বিশ সাল"
1317
 
1318
  - Mobile Number Format (spoken Bangla style):
1319
- - When you SAY or READ a phone number aloud in Bangla, spell it digit-by-digit using Bangla digit words, separated by spaces.
1320
- Do NOT read it as a single large number.
 
1321
  - Example spoken formats:
1322
  - "শূন্য এক ছয় তিন আট আট তিন শূন্য এক ছয় পাঁচ"
1323
  - "শূন্য এক তিন দুই শূন্য শূন্য শূন্য নয় দুই তিন শূন্য"
@@ -1570,7 +1629,7 @@ class AIBackend:
1570
  if _has_tool_calls(retry_response):
1571
  response = retry_response
1572
 
1573
- print(f"[AI]: {str(response.content)[:200]}")
1574
  print(">>>>>>>>>> CHAT NODE END <<<<<<<<<<")
1575
  return {"messages": [response]}
1576
 
 
30
 
31
  load_dotenv()
32
 
33
+ PROJECT_NAME = "Hospital Assistant"
34
+ AI_NAME = "আয়েশা"
35
 
36
  # ═══════════════════════════════════════════════════════════════════════════════
37
  # STATE
 
124
  return _clean_text(text).translate(_DIGIT_TRANSLATION)
125
 
126
 
127
+ _SPOKEN_DIGIT_WORDS = {
128
+ "0": "শূন্য",
129
+ "1": "এক",
130
+ "2": "দুই",
131
+ "3": "তিন",
132
+ "4": "চার",
133
+ "5": "পাঁচ",
134
+ "6": "ছয়",
135
+ "7": "সাত",
136
+ "8": "আট",
137
+ "9": "নয়",
138
+ "০": "শূন্য",
139
+ "১": "এক",
140
+ "২": "দুই",
141
+ "৩": "তিন",
142
+ "৪": "চার",
143
+ "৫": "পাঁচ",
144
+ "৬": "ছয়",
145
+ "৭": "সাত",
146
+ "৮": "আট",
147
+ "৯": "নয়",
148
+ "٠": "শূন্য",
149
+ "١": "এক",
150
+ "٢": "দুই",
151
+ "٣": "তিন",
152
+ "٤": "চার",
153
+ "٥": "পাঁচ",
154
+ "٦": "ছয়",
155
+ "٧": "সাত",
156
+ "٨": "আট",
157
+ "٩": "নয়",
158
+ }
159
+
160
+
161
+ def _spoken_phone_text(text: str) -> str:
162
+ if not text:
163
+ return ""
164
+
165
+ def repl(match: re.Match[str]) -> str:
166
+ chunk = match.group(0)
167
+ digits = [ch for ch in chunk if ch in _SPOKEN_DIGIT_WORDS]
168
+ if len(digits) < 10:
169
+ return chunk
170
+ spoken = " ".join(_SPOKEN_DIGIT_WORDS[ch] for ch in digits)
171
+ prev_char = text[match.start() - 1] if match.start() > 0 else ""
172
+ next_char = text[match.end()] if match.end() < len(text) else ""
173
+ if prev_char and not prev_char.isspace() and prev_char not in "([<{\"'":
174
+ spoken = " " + spoken
175
+ if next_char and not next_char.isspace() and next_char not in ")]>.,!?;:}\"'":
176
+ spoken = spoken + " "
177
+ return spoken
178
+
179
+ out = re.sub(r"[+\d০-৯٠-٩][\d০-৯٠-٩\s().\-]{8,}[\d০-৯٠-٩]", repl, text)
180
+ return re.sub(r"[ \t]{2,}", " ", out)
181
+
182
+
183
  DAY_ALIASES = {
184
  "sunday": "Sunday",
185
  "monday": "Monday",
 
485
  <div style="background:#ffffff;border-radius:14px;border:1px solid #e6e8f0;overflow:hidden;">
486
  <div style="padding:18px 20px;background:linear-gradient(135deg,#0ea5e9,#8b5cf6);color:#fff;">
487
  <div style="font-size:16px;font-weight:700;">{subject}</div>
488
+ <div style="font-size:12px;opacity:.9;margin-top:4px;">{AI_NAME}{PROJECT_NAME}</div>
489
  </div>
490
  <div style="padding:18px 20px;color:#0f172a;font-size:14px;line-height:1.55;">
491
  {safe}
 
528
  ]
529
  if extra:
530
  lines.extend(["", extra.strip()])
531
+ lines.extend(["", "Thank you.", f"{AI_NAME}{PROJECT_NAME}"])
532
  return "\n".join(lines)
533
 
534
  # ═══════════════════════════════════════════════════════════════════════════════
 
1285
  # SYSTEM PROMPT
1286
  # ═══════════════════════════════════════════════════════════════════════════════
1287
  BASE_SYSTEM = """
1288
+ You are আয়েশা, a warm, Bangla-first hospital phone-call assistant and medical appointment concierge.
1289
  Your job is to help people find doctors, check availability, and manage appointments.
1290
 
1291
  PERSONA (Voice, Vibe & Emotion Layer)
 
1374
  - "দুই হাজার বিশ সাল"
1375
 
1376
  - Mobile Number Format (spoken Bangla style):
1377
+ - When you SAY or READ a phone number aloud in Bangla, ALWAYS spell it digit-by-digit using Bangla digit words, separated by spaces.
1378
+ Never output the raw digit string.
1379
+ - If the number is attached to other words, insert spaces around it so it is easy to hear.
1380
  - Example spoken formats:
1381
  - "শূন্য এক ছয় তিন আট আট তিন শূন্য এক ছয় পাঁচ"
1382
  - "শূন্য এক তিন দুই শূন্য শূন্য শূন্য নয় দুই তিন শূন্য"
 
1629
  if _has_tool_calls(retry_response):
1630
  response = retry_response
1631
 
1632
+ print(f"[AI]: {_spoken_phone_text(str(response.content))[:200]}")
1633
  print(">>>>>>>>>> CHAT NODE END <<<<<<<<<<")
1634
  return {"messages": [response]}
1635
 
frontend/script.js CHANGED
@@ -102,6 +102,7 @@ let _audioChain = Promise.resolve();
102
  let _playbackGen = 0;
103
  let _expectedSeq = 0;
104
  let _pendingAudio = new Map();
 
105
 
106
  // Client-side playback speed multiplier.
107
  // This makes speech faster immediately even if the TTS provider speed setting
@@ -116,8 +117,68 @@ let voicePendingPackets = [];
116
  let brainLastResponse = '';
117
  let _brainWelcomed = false;
118
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  const BRAIN_WELCOME_TEXT =
120
- '[calm] হ্যালো! আমি আপন ভয়েস সারী। আপনি কীভাবে হায্ চান?';
121
 
122
  // ─── Recording state ──────────────────────────────────────────────────────────
123
  let micStream = null;
@@ -311,12 +372,17 @@ function onVoiceMsg(ev) {
311
  if (ev.data instanceof ArrayBuffer) {
312
  if (Date.now() < _dropAudioUntil) return; // drop late packets after cancel
313
  _ttsPlaying = true;
314
- // Framed audio: 4-byte big-endian seq id + raw audio bytes.
315
- // We buffer/reorder by seq so playback always matches text order.
316
  const u8 = new Uint8Array(ev.data);
317
- if (u8.length <= 4) return;
318
- const seq = (u8[0] << 24) | (u8[1] << 16) | (u8[2] << 8) | (u8[3] << 0);
319
- const payload = ev.data.slice(4);
 
 
 
 
 
320
  _pendingAudio.set(seq >>> 0, payload);
321
 
322
  const gen = _playbackGen;
@@ -351,6 +417,7 @@ function onVoiceMsg(ev) {
351
 
352
  case 'stt':
353
  // New turn: reset audio ordering/buffers.
 
354
  _expectedSeq = 0;
355
  _pendingAudio.clear();
356
  tStt = Date.now();
@@ -369,13 +436,15 @@ function onVoiceMsg(ev) {
369
 
370
  case 'llm_token':
371
  if (!msg.token) break;
 
372
  if (tLlm === 0) {
373
  tLlm = Date.now();
374
  if (tStt > 0) mLlm.textContent = tLlm - tStt + ' ms';
375
  }
376
  _removeThinking();
377
- _setCaption(aiTxt + msg.token);
378
- brainLastResponse = aiTxt + msg.token;
 
379
  _brainSetTtsBubble(brainLastResponse);
380
  _brainModeSetSearch(true);
381
  if (!brainMode) {
@@ -384,10 +453,7 @@ function onVoiceMsg(ev) {
384
  aiEl.className = 'message ai';
385
  chatBox.appendChild(aiEl);
386
  }
387
- aiTxt += msg.token;
388
  _renderAiText();
389
- } else {
390
- aiTxt += msg.token;
391
  }
392
  break;
393
 
@@ -395,7 +461,13 @@ function onVoiceMsg(ev) {
395
  if (!msg.text) break;
396
  // Best-effort recovery path: if any streamed tokens were dropped, the
397
  // server sends the final full text once at turn end.
398
- brainLastResponse = msg.text;
 
 
 
 
 
 
399
  _brainSetTtsBubble(brainLastResponse);
400
  if (!brainMode) {
401
  if (!aiEl) {
@@ -403,10 +475,7 @@ function onVoiceMsg(ev) {
403
  aiEl.className = 'message ai';
404
  chatBox.appendChild(aiEl);
405
  }
406
- aiTxt = msg.text;
407
  _renderAiText();
408
- } else {
409
- aiTxt = msg.text;
410
  }
411
  break;
412
 
@@ -639,7 +708,9 @@ function _done() {
639
  function stopAllAudio() {
640
  _cancelled = true;
641
  _ttsPlaying = false;
642
- _dropAudioUntil = Date.now() + 700;
 
 
643
  _playbackGen++;
644
  _audioChain = Promise.resolve();
645
  _expectedSeq = 0;
 
102
  let _playbackGen = 0;
103
  let _expectedSeq = 0;
104
  let _pendingAudio = new Map();
105
+ let _currentTurn = 0;
106
 
107
  // Client-side playback speed multiplier.
108
  // This makes speech faster immediately even if the TTS provider speed setting
 
117
  let brainLastResponse = '';
118
  let _brainWelcomed = false;
119
 
120
// Bangla names of the decimal digits, indexed by numeric value (0-9).
const BANGLA_DIGIT_NAMES = [
  'শূন্য',
  'এক',
  'দুই',
  'তিন',
  'চার',
  'পাঁচ',
  'ছয়',
  'সাত',
  'আট',
  'নয়',
];

// Digit character -> spoken Bangla word for ASCII (U+0030), Bangla (U+09E6)
// and Arabic-Indic (U+0660) digits, generated from each script's zero.
const SPOKEN_DIGIT_WORDS = Object.fromEntries(
  [0x0030, 0x09e6, 0x0660].flatMap((zero) =>
    BANGLA_DIGIT_NAMES.map((name, value) => [String.fromCharCode(zero + value), name]),
  ),
);

// Phone-like run: leading "+" or digit, at least 8 digits / separators, then a
// closing digit. `[^\S\r\n]` is "whitespace except CR/LF": the text passed in
// is the caption accumulated so far, so a match must not cross a line break
// or two numbers on adjacent lines would be fused into one.
// NOTE(review): "2024-01-15 10:30" still has 10 digits before the colon and is
// expanded like a phone number - confirm whether dates need a guard.
const PHONE_LIKE_RE =
  /[+\d\u09E6-\u09EF\u0660-\u0669](?:[\d\u09E6-\u09EF\u0660-\u0669().\-]|[^\S\r\n]){8,}[\d\u09E6-\u09EF\u0660-\u0669]/g;

/**
 * Spell the digits of `chunk` as space-separated Bangla digit words.
 * Returns `chunk` unchanged when it holds fewer than 10 known digits;
 * otherwise every non-digit character is dropped.
 */
function _spokenDigitWords(chunk) {
  const digits = Array.from(chunk).filter((ch) => ch in SPOKEN_DIGIT_WORDS);
  if (digits.length < 10) return chunk;
  return digits.map((ch) => SPOKEN_DIGIT_WORDS[ch]).join(' ');
}

/**
 * Normalize assistant text before it is shown: apply the two wording
 * substitutions, expand phone-like numbers into spoken Bangla digit words
 * (padding with a space where the number touches ordinary text), and collapse
 * runs of spaces/tabs. Returns '' for falsy input.
 */
function _normalizeVisibleAiText(text) {
  if (!text) return '';
  const reworded = String(text)
    .replaceAll('উপলব্ধ', 'এভেলেবেল')
    .replaceAll('জ্বি', 'আচ্ছা');

  const expanded = reworded.replace(PHONE_LIKE_RE, (match, offset, whole) => {
    const spoken = _spokenDigitWords(match);
    if (spoken === match) return match; // too few digits: not a phone number

    const end = offset + match.length;
    const prev = offset > 0 ? whole[offset - 1] : '';
    const next = end < whole.length ? whole[end] : '';

    let value = spoken;
    if (prev && !/\s/.test(prev) && !'([<{"\''.includes(prev)) value = ' ' + value;
    if (next && !/\s/.test(next) && !')]>.,!?;:}"\''.includes(next)) value = value + ' ';
    return value;
  });

  return expanded.replace(/[ \t]{2,}/g, ' ');
}
179
+
180
  const BRAIN_WELCOME_TEXT =
181
+ '[calm] হ্যালো, আমি আয়েশ! হাপাতাল রিেপশন থেে বলছি। আপনি কি কোনো অ্য়েনটমেন্ট বুক করতে চান?';
182
 
183
  // ─── Recording state ──────────────────────────────────────────────────────────
184
  let micStream = null;
 
372
  if (ev.data instanceof ArrayBuffer) {
373
  if (Date.now() < _dropAudioUntil) return; // drop late packets after cancel
374
  _ttsPlaying = true;
375
+ // Framed audio: 4-byte big-endian turn id + 4-byte big-endian seq id + raw audio bytes.
376
+ // We buffer/reorder by seq inside a turn, and ignore late packets from older turns.
377
  const u8 = new Uint8Array(ev.data);
378
+ if (u8.length <= 8) return;
379
+ const turn =
380
+ (u8[0] << 24) | (u8[1] << 16) | (u8[2] << 8) | (u8[3] << 0);
381
+ const seq =
382
+ (u8[4] << 24) | (u8[5] << 16) | (u8[6] << 8) | (u8[7] << 0);
383
+ const turnU = turn >>> 0;
384
+ if (turnU !== (_currentTurn >>> 0)) return;
385
+ const payload = ev.data.slice(8);
386
  _pendingAudio.set(seq >>> 0, payload);
387
 
388
  const gen = _playbackGen;
 
417
 
418
  case 'stt':
419
  // New turn: reset audio ordering/buffers.
420
+ if (typeof msg.turn === 'number') _currentTurn = msg.turn >>> 0;
421
  _expectedSeq = 0;
422
  _pendingAudio.clear();
423
  tStt = Date.now();
 
436
 
437
  case 'llm_token':
438
  if (!msg.token) break;
439
+ const tokenText = _normalizeVisibleAiText(msg.token);
440
  if (tLlm === 0) {
441
  tLlm = Date.now();
442
  if (tStt > 0) mLlm.textContent = tLlm - tStt + ' ms';
443
  }
444
  _removeThinking();
445
+ aiTxt = _normalizeVisibleAiText(aiTxt + tokenText);
446
+ _setCaption(aiTxt);
447
+ brainLastResponse = aiTxt;
448
  _brainSetTtsBubble(brainLastResponse);
449
  _brainModeSetSearch(true);
450
  if (!brainMode) {
 
453
  aiEl.className = 'message ai';
454
  chatBox.appendChild(aiEl);
455
  }
 
456
  _renderAiText();
 
 
457
  }
458
  break;
459
 
 
461
  if (!msg.text) break;
462
  // Best-effort recovery path: if any streamed tokens were dropped, the
463
  // server sends the final full text once at turn end.
464
+ if (typeof msg.turn === 'number') {
465
+ _currentTurn = msg.turn >>> 0;
466
+ _expectedSeq = 0;
467
+ _pendingAudio.clear();
468
+ }
469
+ brainLastResponse = _normalizeVisibleAiText(msg.text);
470
+ aiTxt = brainLastResponse;
471
  _brainSetTtsBubble(brainLastResponse);
472
  if (!brainMode) {
473
  if (!aiEl) {
 
475
  aiEl.className = 'message ai';
476
  chatBox.appendChild(aiEl);
477
  }
 
478
  _renderAiText();
 
 
479
  }
480
  break;
481
 
 
708
  function stopAllAudio() {
709
  _cancelled = true;
710
  _ttsPlaying = false;
711
+ // With turn-id framed audio, we can shorten the drop window; late packets
712
+ // are ignored by turn mismatch.
713
+ _dropAudioUntil = Date.now() + 120;
714
  _playbackGen++;
715
  _audioChain = Promise.resolve();
716
  _expectedSeq = 0;
services/streaming.py CHANGED
@@ -61,6 +61,61 @@ SENTENCE_BOUNDARIES = frozenset(".!?।॥\n")
61
  CLAUSE_BOUNDARIES = frozenset(",;:—–")
62
  _SENTINEL = object()
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
  def _clean_for_tts(text: str) -> str:
66
  # Strip emotion/tone tags like "[calm]" "[neutral]" "[happy]" etc.
@@ -79,6 +134,7 @@ def _clean_for_tts(text: str) -> str:
79
  text = re.sub(r"\n{2,}", "\n", text)
80
  # Collapse runs of spaces introduced by tag removal.
81
  text = re.sub(r"[ \t]{2,}", " ", text)
 
82
  # Keep normal spaces so chunk boundaries don't glue words together.
83
  return text.strip("\n\r\t")
84
 
 
61
  CLAUSE_BOUNDARIES = frozenset(",;:—–")
62
  _SENTINEL = object()
63
 
64
+ _DIGIT_WORDS = {
65
+ "0": "শূন্য",
66
+ "1": "এক",
67
+ "2": "দুই",
68
+ "3": "তিন",
69
+ "4": "চার",
70
+ "5": "পাঁচ",
71
+ "6": "ছয়",
72
+ "7": "সাত",
73
+ "8": "আট",
74
+ "9": "নয়",
75
+ "০": "শূন্য",
76
+ "১": "এক",
77
+ "২": "দুই",
78
+ "৩": "তিন",
79
+ "৪": "চার",
80
+ "৫": "পাঁচ",
81
+ "৬": "ছয়",
82
+ "৭": "সাত",
83
+ "৮": "আট",
84
+ "৯": "নয়",
85
+ "٠": "শূন্য",
86
+ "١": "এক",
87
+ "٢": "দুই",
88
+ "٣": "তিন",
89
+ "٤": "চার",
90
+ "٥": "পাঁচ",
91
+ "٦": "ছয়",
92
+ "٧": "সাত",
93
+ "٨": "আট",
94
+ "٩": "নয়",
95
+ }
96
+
97
+
98
+ def _spoken_phone_text(text: str) -> str:
99
+ if not text:
100
+ return ""
101
+
102
+ def repl(match: re.Match[str]) -> str:
103
+ chunk = match.group(0)
104
+ digits = [ch for ch in chunk if ch in _DIGIT_WORDS]
105
+ if len(digits) < 10:
106
+ return chunk
107
+ spoken = " ".join(_DIGIT_WORDS[ch] for ch in digits)
108
+ prev_char = text[match.start() - 1] if match.start() > 0 else ""
109
+ next_char = text[match.end()] if match.end() < len(text) else ""
110
+ if prev_char and not prev_char.isspace() and prev_char not in "([<{\"'":
111
+ spoken = " " + spoken
112
+ if next_char and not next_char.isspace() and next_char not in ")]>.,!?;:}\"'":
113
+ spoken = spoken + " "
114
+ return spoken
115
+
116
+ out = re.sub(r"[+\d০-৯٠-٩][\d০-৯٠-٩\s().\-]{8,}[\d০-৯٠-٩]", repl, text)
117
+ return re.sub(r"[ \t]{2,}", " ", out)
118
+
119
 
120
  def _clean_for_tts(text: str) -> str:
121
  # Strip emotion/tone tags like "[calm]" "[neutral]" "[happy]" etc.
 
134
  text = re.sub(r"\n{2,}", "\n", text)
135
  # Collapse runs of spaces introduced by tag removal.
136
  text = re.sub(r"[ \t]{2,}", " ", text)
137
+ text = _spoken_phone_text(text)
138
  # Keep normal spaces so chunk boundaries don't glue words together.
139
  return text.strip("\n\r\t")
140