rakib72642 committed on
Commit
440d1f1
·
1 Parent(s): 089db7b

adjusted mobile number problem

Browse files
Files changed (5) hide show
  1. app.py +77 -5
  2. core/backend.py +61 -0
  3. frontend/script.js +38 -1
  4. requirements.txt +5 -5
  5. tmp.ipynb +1 -1
app.py CHANGED
@@ -188,6 +188,19 @@ async def rtc_close(session_id: str):
188
  # WEBSOCKET HELPERS
189
  # ══════════════════════════════════════════════════════════════════════════════
190
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
  def _ws_open(ws: WebSocket) -> bool:
192
  return ws.client_state == WebSocketState.CONNECTED
193
 
@@ -267,9 +280,14 @@ async def ws_chat(ws: WebSocket):
267
 
268
  try:
269
  stream = await ai.main(user_id, user_query)
 
270
  async for token in stream:
271
  if token:
 
272
  await _safe_text(ws, {"type": "llm_token", "token": token})
 
 
 
273
  except Exception as exc:
274
  import traceback; traceback.print_exc()
275
  await _safe_text(ws, {"type": "error", "text": str(exc)})
@@ -326,7 +344,9 @@ async def ws_voice(ws: WebSocket):
326
  stt = STTProcessor()
327
  _active_streamer: ParallelTTSStreamer | None = None
328
  _active_task: asyncio.Task | None = None
329
- _utterance_q: asyncio.Queue[bytes | None] = asyncio.Queue()
 
 
330
  _worker_task: asyncio.Task | None = None
331
 
332
  async def _cancel_active():
@@ -349,6 +369,44 @@ async def ws_voice(ws: WebSocket):
349
  except asyncio.QueueEmpty:
350
  break
351
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
  async def _handle_utterance(audio_bytes: bytes):
353
  nonlocal _active_streamer
354
 
@@ -388,7 +446,7 @@ async def ws_voice(ws: WebSocket):
388
  # Best-effort: send the full text once at the end so the UI can
389
  # recover if it missed any streamed tokens.
390
  if full_text:
391
- await _safe_text(ws, {"type": "llm_full", "text": full_text})
392
  await tts_streamer.flush()
393
 
394
  async def run_tts_framed():
@@ -411,13 +469,18 @@ async def ws_voice(ws: WebSocket):
411
  async def _utterance_worker():
412
  nonlocal _active_task
413
  while True:
414
- audio_bytes = await _utterance_q.get()
415
- if audio_bytes is None:
416
  break
417
  try:
418
  # Run each utterance as a cancellable task so barge-in can
419
  # immediately interrupt LLM+TTS mid-turn.
420
- _active_task = asyncio.create_task(_handle_utterance(audio_bytes))
 
 
 
 
 
421
  await _active_task
422
  except asyncio.CancelledError:
423
  # Interruption is normal (client barge-in / cancel).
@@ -472,6 +535,15 @@ async def ws_voice(ws: WebSocket):
472
  await _cancel_active()
473
  await _drain_utterance_queue()
474
  await _safe_text(ws, {"type": "end"})
 
 
 
 
 
 
 
 
 
475
  except json.JSONDecodeError:
476
  pass
477
 
 
188
  # WEBSOCKET HELPERS
189
  # ══════════════════════════════════════════════════════════════════════════════
190
 
191
+ def _normalize_ai_text(text: str) -> str:
192
+ """
193
+ Apply small UX wording normalizations to assistant-visible text.
194
+ (The system prompt also asks the model for this wording; this replacement enforces it on the text we send.)
195
+ """
196
+ if not text:
197
+ return ""
198
+ out = text
199
+ out = out.replace("উপলব্ধ", "এভেলেবেল")
200
+ out = out.replace("জ্বি", "আচ্ছা")
201
+ return out
202
+
203
+
204
  def _ws_open(ws: WebSocket) -> bool:
205
  return ws.client_state == WebSocketState.CONNECTED
206
 
 
280
 
281
  try:
282
  stream = await ai.main(user_id, user_query)
283
+ full_text = ""
284
  async for token in stream:
285
  if token:
286
+ full_text += token
287
  await _safe_text(ws, {"type": "llm_token", "token": token})
288
+ # Ensure the final rendered message uses normalized wording.
289
+ if full_text:
290
+ await _safe_text(ws, {"type": "chat", "text": _normalize_ai_text(full_text)})
291
  except Exception as exc:
292
  import traceback; traceback.print_exc()
293
  await _safe_text(ws, {"type": "error", "text": str(exc)})
 
344
  stt = STTProcessor()
345
  _active_streamer: ParallelTTSStreamer | None = None
346
  _active_task: asyncio.Task | None = None
347
+ # Queue supports both audio turns and server-side "speak" turns initiated
348
+ # by the client UI (e.g., brain-mode welcome).
349
+ _utterance_q: asyncio.Queue[object | None] = asyncio.Queue()
350
  _worker_task: asyncio.Task | None = None
351
 
352
  async def _cancel_active():
 
369
  except asyncio.QueueEmpty:
370
  break
371
 
372
+ async def _handle_speak(text: str):
373
+ """
374
+ Generate TTS for a given text without running STT.
375
+ Uses the same framed-audio protocol as normal turns and emits `llm_full`
376
+ so the UI can display the spoken text.
377
+ """
378
+ nonlocal _active_streamer
379
+
380
+ speak_text = _normalize_ai_text((text or "").strip())
381
+ if not speak_text:
382
+ await _safe_text(ws, {"type": "end"})
383
+ return
384
+
385
+ tts_streamer = ParallelTTSStreamer()
386
+ _active_streamer = tts_streamer
387
+ audio_seq = 0
388
+
389
+ async def run_text():
390
+ try:
391
+ await _safe_text(ws, {"type": "llm_full", "text": speak_text})
392
+ await tts_streamer.add_token(speak_text)
393
+ except asyncio.CancelledError:
394
+ raise
395
+ finally:
396
+ await tts_streamer.flush()
397
+
398
+ async def run_tts_framed():
399
+ nonlocal audio_seq
400
+ async for chunk in tts_streamer.stream_audio():
401
+ framed = struct.pack(">I", audio_seq) + chunk
402
+ if not await _safe_bytes(ws, framed):
403
+ break
404
+ audio_seq += 1
405
+
406
+ await asyncio.gather(run_text(), run_tts_framed(), return_exceptions=True)
407
+ _active_streamer = None
408
+ await _safe_text(ws, {"type": "end"})
409
+
410
  async def _handle_utterance(audio_bytes: bytes):
411
  nonlocal _active_streamer
412
 
 
446
  # Best-effort: send the full text once at the end so the UI can
447
  # recover if it missed any streamed tokens.
448
  if full_text:
449
+ await _safe_text(ws, {"type": "llm_full", "text": _normalize_ai_text(full_text)})
450
  await tts_streamer.flush()
451
 
452
  async def run_tts_framed():
 
469
  async def _utterance_worker():
470
  nonlocal _active_task
471
  while True:
472
+ item = await _utterance_q.get()
473
+ if item is None:
474
  break
475
  try:
476
  # Run each utterance as a cancellable task so barge-in can
477
  # immediately interrupt LLM+TTS mid-turn.
478
+ if isinstance(item, (bytes, bytearray)):
479
+ _active_task = asyncio.create_task(_handle_utterance(bytes(item)))
480
+ elif isinstance(item, dict) and item.get("type") == "speak":
481
+ _active_task = asyncio.create_task(_handle_speak(str(item.get("text", ""))))
482
+ else:
483
+ continue
484
  await _active_task
485
  except asyncio.CancelledError:
486
  # Interruption is normal (client barge-in / cancel).
 
535
  await _cancel_active()
536
  await _drain_utterance_queue()
537
  await _safe_text(ws, {"type": "end"})
538
+ elif t == "speak":
539
+ # UI-initiated TTS turn (e.g. brain-mode welcome).
540
+ # Do not block the receive loop; enqueue for worker.
541
+ speak_text = str(msg.get("text", "")).strip()
542
+ if speak_text:
543
+ if _active_task is not None and not _active_task.done():
544
+ await _cancel_active()
545
+ await _drain_utterance_queue()
546
+ await _utterance_q.put({"type": "speak", "text": speak_text})
547
  except json.JSONDecodeError:
548
  pass
549
 
core/backend.py CHANGED
@@ -158,6 +158,7 @@ SPECIALTY_ALIASES = {
158
  "মেডিসিন": ["medicine", "internal medicine", "physician", "general medicine"],
159
  "নিউরো": ["neurologist", "neurology", "brain"],
160
  "স্নায়ু": ["neurologist", "neurology", "brain"],
 
161
  "নাক": ["ent", "otolaryngologist", "ear nose throat"],
162
  "কান": ["ent", "otolaryngologist", "ear nose throat"],
163
  "গলা": ["ent", "otolaryngologist", "ear nose throat"],
@@ -176,6 +177,14 @@ SPECIALTY_ALIASES = {
176
  "কিডনি": ["nephrologist", "kidney", "renal"],
177
  "গ্যাস্ট্রো": ["gastroenterologist", "stomach", "digestive"],
178
  "পেট": ["gastroenterologist", "stomach", "digestive"],
 
 
 
 
 
 
 
 
179
  }
180
 
181
 
@@ -211,6 +220,32 @@ def _expand_search_terms(text: str) -> list[str]:
211
  if token:
212
  terms.add(token)
213
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
  return sorted(terms)
215
 
216
 
@@ -279,6 +314,17 @@ def _message_text(content) -> str:
279
  else:
280
  parts.append(str(item))
281
  return _clean_text(" ".join(parts))
 
 
 
 
 
 
 
 
 
 
 
282
  return _clean_text(str(content))
283
 
284
 
@@ -1269,6 +1315,13 @@ LANGUAGE RULE
1269
  - "দুই হাজার ছাব্বিশ সাল"
1270
  - "দুই হাজার বিশ সাল"
1271
 
 
 
 
 
 
 
 
1272
  BEHAVIOR PRIORITY
1273
  - Professional customer-support clarity first
1274
  - Emotional tone tagging second
@@ -1278,12 +1331,20 @@ BEHAVIOR PRIORITY
1278
  DATA RULE:
1279
  - Doctor names, categories, and days in the database are English.
1280
  - Bangla terms such as চক্ষু/কার্ডিও/শিশু/চর্ম must be translated to English search terms before tool calls.
 
 
 
 
1281
 
1282
  RESPONSE STYLE:
1283
  - Be concise.
1284
  - Be reassuring.
1285
  - Be jolly and encouraging, but not over-the-top.
1286
  - Ask one clear question when more information is needed.
 
 
 
 
1287
  """
1288
 
1289
  SUMMARY_SYSTEM = (
 
158
  "মেডিসিন": ["medicine", "internal medicine", "physician", "general medicine"],
159
  "নিউরো": ["neurologist", "neurology", "brain"],
160
  "স্নায়ু": ["neurologist", "neurology", "brain"],
161
+ "নিউরোলজি": ["neurologist", "neurology", "neorology", "neuro"],
162
  "নাক": ["ent", "otolaryngologist", "ear nose throat"],
163
  "কান": ["ent", "otolaryngologist", "ear nose throat"],
164
  "গলা": ["ent", "otolaryngologist", "ear nose throat"],
 
177
  "কিডনি": ["nephrologist", "kidney", "renal"],
178
  "গ্যাস্ট্রো": ["gastroenterologist", "stomach", "digestive"],
179
  "পেট": ["gastroenterologist", "stomach", "digestive"],
180
+ # DB category uses "Gastrologist" in some datasets; include common spellings.
181
+ "গ্যাস্ট্রোএন্টারোলজি": [
182
+ "gastrologist",
183
+ "gastroenterologist",
184
+ "gastroenterology",
185
+ "gastrology",
186
+ "gastro",
187
+ ],
188
  }
189
 
190
 
 
220
  if token:
221
  terms.add(token)
222
 
223
+ # ── English specialty normalization (handles user saying "neurology" etc.) ──
224
+ def _ology_to_ologist(tok: str) -> str:
225
+ # neurology -> neurologist, cardiology -> cardiologist
226
+ if tok.endswith("ology") and len(tok) > 4:
227
+ return tok[:-1] + "ist" # drop trailing 'y', add 'ist'
228
+ return ""
229
+
230
+ extra: set[str] = set()
231
+ for tok in list(terms):
232
+ if not tok:
233
+ continue
234
+ # Common misspelling: neorology -> neurology
235
+ if tok == "neorology":
236
+ extra.update({"neurology", "neurologist"})
237
+ if tok in ("neurology", "neurologic", "neurological"):
238
+ extra.add("neurologist")
239
+ if tok in ("dentistry", "dental"):
240
+ extra.add("dentist")
241
+ if tok in ("gastroenterology", "gastroenterologist", "gastrology"):
242
+ extra.update({"gastrologist", "gastroenterologist"})
243
+
244
+ mapped = _ology_to_ologist(tok)
245
+ if mapped:
246
+ extra.add(mapped)
247
+
248
+ terms.update(extra)
249
  return sorted(terms)
250
 
251
 
 
314
  else:
315
  parts.append(str(item))
316
  return _clean_text(" ".join(parts))
317
+ if isinstance(content, dict):
318
+ # Some providers wrap message content as an object.
319
+ if content.get("type") == "text":
320
+ return _clean_text(str(content.get("text", "")))
321
+ if "text" in content:
322
+ return _clean_text(str(content.get("text", "")))
323
+ # Fallback: serialize the whole object as JSON, keeping non-ASCII text unescaped.
324
+ try:
325
+ return _clean_text(json.dumps(content, ensure_ascii=False))
326
+ except Exception:
327
+ return _clean_text(str(content))
328
  return _clean_text(str(content))
329
 
330
 
 
1315
  - "দুই হাজার ছাব্বিশ সাল"
1316
  - "দুই হাজার বিশ সাল"
1317
 
1318
+ - Mobile Number Format (spoken Bangla style):
1319
+ - When you SAY or READ a phone number aloud in Bangla, spell it digit-by-digit using Bangla digit words, separated by spaces.
1320
+ Do NOT read it as a single large number.
1321
+ - Example spoken formats:
1322
+ - "শূন্য এক ছয় তিন আট আট তিন শূন্য এক ছয় পাঁচ"
1323
+ - "শূন্য এক তিন দুই শূন্য শূন্য শূন্য নয় দুই তিন শূন্য"
1324
+
1325
  BEHAVIOR PRIORITY
1326
  - Professional customer-support clarity first
1327
  - Emotional tone tagging second
 
1331
  DATA RULE:
1332
  - Doctor names, categories, and days in the database are English.
1333
  - Bangla terms such as চক্ষু/কার্ডিও/শিশু/চর্ম must be translated to English search terms before tool calls.
1334
+ - IMPORTANT: Some users may say specialties as the field name (e.g. "neurology", "cardiology", "dentistry").
1335
+ The database categories may be stored as doctor types (e.g. "Neurologist", "Cardiologist", "Dentist").
1336
+ When searching doctors, include both forms (e.g. neurology → neurologist) and handle common misspellings
1337
+ like "neorology".
1338
 
1339
  RESPONSE STYLE:
1340
  - Be concise.
1341
  - Be reassuring.
1342
  - Be jolly and encouraging, but not over-the-top.
1343
  - Ask one clear question when more information is needed.
1344
+
1345
+ WORDING (Bangla UX consistency):
1346
+ - Avoid using the Bangla word “উপলব্ধ” in user-facing replies. Instead say “এভেলেবেল” when you mean “available”.
1347
+ - Avoid “জ্বি”. Use natural acknowledgements like “আচ্ছা”, “ঠিক আছে”, or “ওকে”.
1348
  """
1349
 
1350
  SUMMARY_SYSTEM = (
frontend/script.js CHANGED
@@ -114,6 +114,10 @@ let brainAutoRestartTimer = null;
114
  let brainPendingAudio = null;
115
  let voicePendingPackets = [];
116
  let brainLastResponse = '';
 
 
 
 
117
 
118
  // ─── Recording state ──────────────────────────────────────────────────────────
119
  let micStream = null;
@@ -407,6 +411,9 @@ function onVoiceMsg(ev) {
407
  break;
408
 
409
  case 'end':
 
 
 
410
  _renderAiText(true);
411
  _removeThinking();
412
  if (brainMode) brainLastResponse = aiTxt || brainLastResponse;
@@ -1146,7 +1153,9 @@ function setMic(s) {
1146
  }
1147
 
1148
  function appendMsg(text, who) {
1149
- if (brainMode && who !== 'system') return null;
 
 
1150
  const d = document.createElement('div');
1151
  d.className = 'message ' + who;
1152
  if (who === 'ai' && typeof marked !== 'undefined') {
@@ -1195,6 +1204,15 @@ function setBrainMode(on) {
1195
  _brainModeSetSearch(
1196
  isProcessing || isListening || isSpeaking || _ttsPlaying,
1197
  );
 
 
 
 
 
 
 
 
 
1198
  if (!isListening && !isProcessing && !isRecordingLocked) {
1199
  setTimeout(() => {
1200
  if (
@@ -1221,6 +1239,25 @@ function setBrainMode(on) {
1221
  }
1222
  }
1223
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1224
  function _brainModeSetSearch(active) {
1225
  if (!brainStage) return;
1226
  brainStage.classList.toggle('searching', !!active);
 
114
  let brainPendingAudio = null;
115
  let voicePendingPackets = [];
116
  let brainLastResponse = '';
117
+ let _brainWelcomed = false;
118
+
119
+ const BRAIN_WELCOME_TEXT =
120
+ '[calm] হ্যালো! আমি আপনার ভয়েস সহকারী। আপনি কীভাবে সাহায্য চান?';
121
 
122
  // ─── Recording state ──────────────────────────────────────────────────────────
123
  let micStream = null;
 
411
  break;
412
 
413
  case 'end':
414
+ // In brain mode we don't stream tokens into chat UI, so append a final
415
+ // transcript line at turn end.
416
+ if (brainMode && aiTxt) appendMsg(aiTxt, 'ai');
417
  _renderAiText(true);
418
  _removeThinking();
419
  if (brainMode) brainLastResponse = aiTxt || brainLastResponse;
 
1153
  }
1154
 
1155
  function appendMsg(text, who) {
1156
+ // In brain mode, keep user messages hidden (brain panel acts as UI),
1157
+ // but still show AI messages as a readable transcript.
1158
+ if (brainMode && who === 'user') return null;
1159
  const d = document.createElement('div');
1160
  d.className = 'message ' + who;
1161
  if (who === 'ai' && typeof marked !== 'undefined') {
 
1204
  _brainModeSetSearch(
1205
  isProcessing || isListening || isSpeaking || _ttsPlaying,
1206
  );
1207
+ // One-time welcome when entering brain mode (per page load).
1208
+ if (!_brainWelcomed) {
1209
+ _brainWelcomed = true;
1210
+ setTimeout(() => {
1211
+ if (!brainMode || !brainVoiceActive) return;
1212
+ if (isProcessing || isSpeaking || _ttsPlaying) return;
1213
+ _brainSendWelcome();
1214
+ }, 220);
1215
+ }
1216
  if (!isListening && !isProcessing && !isRecordingLocked) {
1217
  setTimeout(() => {
1218
  if (
 
1239
  }
1240
  }
1241
 
1242
+ function _brainSendWelcome() {
1243
+ const payload = JSON.stringify({ type: 'speak', text: BRAIN_WELCOME_TEXT });
1244
+ if (!voiceWS || voiceWS.readyState !== WebSocket.OPEN) {
1245
+ // Socket is missing or not open: queue the welcome and (re)connect.
1246
+ voicePendingPackets.push(payload);
1247
+ _connectVoice();
1248
+ return;
1249
+ }
1250
+ try {
1251
+ appendThinking();
1252
+ voiceWS.send(payload);
1253
+ console.log('[Brain] welcome sent');
1254
+ } catch (err) {
1255
+ console.error('[Brain] welcome send failed:', err);
1256
+ voicePendingPackets.push(payload);
1257
+ _connectVoice();
1258
+ }
1259
+ }
1260
+
1261
  function _brainModeSetSearch(active) {
1262
  if (!brainStage) return;
1263
  brainStage.classList.toggle('searching', !!active);
requirements.txt CHANGED
@@ -11,11 +11,11 @@ fastapi
11
  uvicorn
12
  websockets
13
 
14
- # ===== Async / DB =====
15
- aiosqlite
16
- aiosmtplib
17
- dateparser
18
- twilio
19
 
20
  # ===== LangChain Ecosystem =====
21
  langchain
 
11
  uvicorn
12
  websockets
13
 
14
+ # ===== Async / DB =====
15
+ aiosqlite
16
+ aiosmtplib
17
+ dateparser
18
+ twilio
19
 
20
  # ===== LangChain Ecosystem =====
21
  langchain
tmp.ipynb CHANGED
@@ -2,7 +2,7 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 1,
6
  "id": "5cbff6ce",
7
  "metadata": {},
8
  "outputs": [],
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 4,
6
  "id": "5cbff6ce",
7
  "metadata": {},
8
  "outputs": [],