adjusted mobile number problem

Browse files

Files changed (5) hide show

app.py +77 -5
core/backend.py +61 -0
frontend/script.js +38 -1
requirements.txt +5 -5
tmp.ipynb +1 -1

app.py CHANGED Viewed

@@ -188,6 +188,19 @@ async def rtc_close(session_id: str):
 #  WEBSOCKET HELPERS
 # ══════════════════════════════════════════════════════════════════════════════
 def _ws_open(ws: WebSocket) -> bool:
     return ws.client_state == WebSocketState.CONNECTED
@@ -267,9 +280,14 @@ async def ws_chat(ws: WebSocket):
             try:
                 stream = await ai.main(user_id, user_query)
                 async for token in stream:
                     if token:
                         await _safe_text(ws, {"type": "llm_token", "token": token})
             except Exception as exc:
                 import traceback; traceback.print_exc()
                 await _safe_text(ws, {"type": "error", "text": str(exc)})
@@ -326,7 +344,9 @@ async def ws_voice(ws: WebSocket):
     stt = STTProcessor()
     _active_streamer: ParallelTTSStreamer | None = None
     _active_task:     asyncio.Task | None        = None
-    _utterance_q: asyncio.Queue[bytes | None] = asyncio.Queue()
     _worker_task: asyncio.Task | None = None
     async def _cancel_active():
@@ -349,6 +369,44 @@ async def ws_voice(ws: WebSocket):
             except asyncio.QueueEmpty:
                 break
     async def _handle_utterance(audio_bytes: bytes):
         nonlocal _active_streamer
@@ -388,7 +446,7 @@ async def ws_voice(ws: WebSocket):
                 # Best-effort: send the full text once at the end so the UI can
                 # recover if it missed any streamed tokens.
                 if full_text:
-                    await _safe_text(ws, {"type": "llm_full", "text": full_text})
                 await tts_streamer.flush()
         async def run_tts_framed():
@@ -411,13 +469,18 @@ async def ws_voice(ws: WebSocket):
     async def _utterance_worker():
         nonlocal _active_task
         while True:
-            audio_bytes = await _utterance_q.get()
-            if audio_bytes is None:
                 break
             try:
                 # Run each utterance as a cancellable task so barge-in can
                 # immediately interrupt LLM+TTS mid-turn.
-                _active_task = asyncio.create_task(_handle_utterance(audio_bytes))
                 await _active_task
             except asyncio.CancelledError:
                 # Interruption is normal (client barge-in / cancel).
@@ -472,6 +535,15 @@ async def ws_voice(ws: WebSocket):
                         await _cancel_active()
                         await _drain_utterance_queue()
                         await _safe_text(ws, {"type": "end"})
                 except json.JSONDecodeError:
                     pass

 #  WEBSOCKET HELPERS
 # ══════════════════════════════════════════════════════════════════════════════
+def _normalize_ai_text(text: str) -> str:
+    """Rewrite discouraged Bangla wording in assistant-visible text.
+
+    The system prompt already asks the model for this wording; applying the
+    substitutions here guarantees it in the final output.
+
+    Args:
+        text: Assistant text to normalize. Falsy input yields "".
+
+    Returns:
+        The text with every discouraged word replaced by its preferred form.
+    """
+    # (discouraged, preferred) pairs, applied in this order.
+    substitutions = (
+        ("উপলব্ধ", "এভেলেবেল"),
+        ("জ্বি", "আচ্ছা"),
+    )
+    normalized = text or ""
+    for discouraged, preferred in substitutions:
+        normalized = normalized.replace(discouraged, preferred)
+    return normalized
 def _ws_open(ws: WebSocket) -> bool:
     return ws.client_state == WebSocketState.CONNECTED
             try:
                 stream = await ai.main(user_id, user_query)
+                full_text = ""
                 async for token in stream:
                     if token:
+                        full_text += token
                         await _safe_text(ws, {"type": "llm_token", "token": token})
+                # Ensure the final rendered message uses normalized wording.
+                if full_text:
+                    await _safe_text(ws, {"type": "chat", "text": _normalize_ai_text(full_text)})
             except Exception as exc:
                 import traceback; traceback.print_exc()
                 await _safe_text(ws, {"type": "error", "text": str(exc)})
     stt = STTProcessor()
     _active_streamer: ParallelTTSStreamer | None = None
     _active_task:     asyncio.Task | None        = None
+    # Queue supports both audio turns and server-side "speak" turns initiated
+    # by the client UI (e.g., brain-mode welcome).
+    _utterance_q: asyncio.Queue[object | None] = asyncio.Queue()
     _worker_task: asyncio.Task | None = None
     async def _cancel_active():
             except asyncio.QueueEmpty:
                 break
+    async def _handle_speak(text: str):
+        """
+        Generate TTS for a given text without running STT.
+        Uses the same framed-audio protocol as normal turns and emits `llm_full`
+        so the UI can display the spoken text.
+
+        Args:
+            text: Text to speak. It is stripped and passed through
+                `_normalize_ai_text` before use; blank input only sends
+                `{"type": "end"}` and returns.
+        """
+        nonlocal _active_streamer
+        speak_text = _normalize_ai_text((text or "").strip())
+        if not speak_text:
+            # Nothing to say: close the turn immediately.
+            await _safe_text(ws, {"type": "end"})
+            return
+        tts_streamer = ParallelTTSStreamer()
+        # Publish the streamer in the enclosing scope (presumably so
+        # `_cancel_active` can reach it; verify against that helper).
+        _active_streamer = tts_streamer
+        audio_seq = 0
+        async def run_text():
+            # Send the text to the UI first, then hand the whole string to
+            # the TTS streamer in a single call.
+            try:
+                await _safe_text(ws, {"type": "llm_full", "text": speak_text})
+                await tts_streamer.add_token(speak_text)
+            except asyncio.CancelledError:
+                raise
+            finally:
+                # Flush on every exit path (success, error, or cancellation).
+                await tts_streamer.flush()
+        async def run_tts_framed():
+            nonlocal audio_seq
+            async for chunk in tts_streamer.stream_audio():
+                # Frame = 4-byte big-endian unsigned sequence number + chunk.
+                framed = struct.pack(">I", audio_seq) + chunk
+                if not await _safe_bytes(ws, framed):
+                    # Send reported failure (presumably the socket is gone;
+                    # TODO confirm against `_safe_bytes`), so stop streaming.
+                    break
+                audio_seq += 1
+        # Run the text producer and the audio sender concurrently.
+        # return_exceptions=True collects their errors instead of raising.
+        await asyncio.gather(run_text(), run_tts_framed(), return_exceptions=True)
+        # NOTE(review): if this task is cancelled while awaiting gather, the
+        # two lines below are skipped; confirm the cancel path covers them.
+        _active_streamer = None
+        await _safe_text(ws, {"type": "end"})
     async def _handle_utterance(audio_bytes: bytes):
         nonlocal _active_streamer
                 # Best-effort: send the full text once at the end so the UI can
                 # recover if it missed any streamed tokens.
                 if full_text:
+                    await _safe_text(ws, {"type": "llm_full", "text": _normalize_ai_text(full_text)})
                 await tts_streamer.flush()
         async def run_tts_framed():
     async def _utterance_worker():
         nonlocal _active_task
         while True:
+            item = await _utterance_q.get()
+            if item is None:
                 break
             try:
                 # Run each utterance as a cancellable task so barge-in can
                 # immediately interrupt LLM+TTS mid-turn.
+                if isinstance(item, (bytes, bytearray)):
+                    _active_task = asyncio.create_task(_handle_utterance(bytes(item)))
+                elif isinstance(item, dict) and item.get("type") == "speak":
+                    _active_task = asyncio.create_task(_handle_speak(str(item.get("text", ""))))
+                else:
+                    continue
                 await _active_task
             except asyncio.CancelledError:
                 # Interruption is normal (client barge-in / cancel).
                         await _cancel_active()
                         await _drain_utterance_queue()
                         await _safe_text(ws, {"type": "end"})
+                    elif t == "speak":
+                        # UI-initiated TTS turn (e.g. brain-mode welcome).
+                        # Do not block the receive loop; enqueue for worker.
+                        speak_text = str(msg.get("text", "")).strip()
+                        if speak_text:
+                            if _active_task is not None and not _active_task.done():
+                                await _cancel_active()
+                                await _drain_utterance_queue()
+                            await _utterance_q.put({"type": "speak", "text": speak_text})
                 except json.JSONDecodeError:
                     pass

core/backend.py CHANGED Viewed

@@ -158,6 +158,7 @@ SPECIALTY_ALIASES = {
     "মেডিসিন": ["medicine", "internal medicine", "physician", "general medicine"],
     "নিউরো": ["neurologist", "neurology", "brain"],
     "স্নায়ু": ["neurologist", "neurology", "brain"],
     "নাক": ["ent", "otolaryngologist", "ear nose throat"],
     "কান": ["ent", "otolaryngologist", "ear nose throat"],
     "গলা": ["ent", "otolaryngologist", "ear nose throat"],
@@ -176,6 +177,14 @@ SPECIALTY_ALIASES = {
     "কিডনি": ["nephrologist", "kidney", "renal"],
     "গ্যাস্ট্রো": ["gastroenterologist", "stomach", "digestive"],
     "পেট": ["gastroenterologist", "stomach", "digestive"],
 }
@@ -211,6 +220,32 @@ def _expand_search_terms(text: str) -> list[str]:
         if token:
             terms.add(token)
     return sorted(terms)
@@ -279,6 +314,17 @@ def _message_text(content) -> str:
             else:
                 parts.append(str(item))
         return _clean_text(" ".join(parts))
     return _clean_text(str(content))
@@ -1269,6 +1315,13 @@ LANGUAGE RULE
     - "দুই হাজার ছাব্বিশ সাল"
     - "দুই হাজার বিশ সাল"
 BEHAVIOR PRIORITY
 - Professional customer-support clarity first
 - Emotional tone tagging second
@@ -1278,12 +1331,20 @@ BEHAVIOR PRIORITY
 DATA RULE:
 - Doctor names, categories, and days in the database are English.
 - Bangla terms such as চক্ষু/কার্ডিও/শিশু/চর্ম must be translated to English search terms before tool calls.
 RESPONSE STYLE:
 - Be concise.
 - Be reassuring.
 - Be jolly and encouraging, but not over-the-top.
 - Ask one clear question when more information is needed.
 """
 SUMMARY_SYSTEM = (

     "মেডিসিন": ["medicine", "internal medicine", "physician", "general medicine"],
     "নিউরো": ["neurologist", "neurology", "brain"],
     "স্নায়ু": ["neurologist", "neurology", "brain"],
+    "নিউরোলজি": ["neurologist", "neurology", "neorology", "neuro"],
     "নাক": ["ent", "otolaryngologist", "ear nose throat"],
     "কান": ["ent", "otolaryngologist", "ear nose throat"],
     "গলা": ["ent", "otolaryngologist", "ear nose throat"],
     "কিডনি": ["nephrologist", "kidney", "renal"],
     "গ্যাস্ট্রো": ["gastroenterologist", "stomach", "digestive"],
     "পেট": ["gastroenterologist", "stomach", "digestive"],
+    # DB category uses "Gastrologist" in some datasets; include common spellings.
+    "গ্যাস্ট্রোএন্টারোলজি": [
+        "gastrologist",
+        "gastroenterologist",
+        "gastroenterology",
+        "gastrology",
+        "gastro",
+    ],
 }
         if token:
             terms.add(token)
+    # ── English specialty normalization (handles user saying "neurology" etc.) ──
+    def _ology_to_ologist(tok: str) -> str:
+        """Map a field name ending in "ology" to its practitioner form.
+
+        Examples: neurology -> neurologist, cardiology -> cardiologist.
+
+        Args:
+            tok: A single search token.
+
+        Returns:
+            The "-ologist" form, or "" when ``tok`` does not end in "ology"
+            or has no stem in front of that suffix.
+        """
+        suffix = "ology"
+        # Require at least one character before the suffix so the bare word
+        # "ology" is not turned into the meaningless search term "ologist".
+        if tok.endswith(suffix) and len(tok) > len(suffix):
+            return tok[:-1] + "ist"  # drop trailing 'y', add 'ist'
+        return ""
+    extra: set[str] = set()
+    for tok in list(terms):
+        if not tok:
+            continue
+        # Common misspelling: neorology -> neurology
+        if tok == "neorology":
+            extra.update({"neurology", "neurologist"})
+        if tok in ("neurology", "neurologic", "neurological"):
+            extra.add("neurologist")
+        if tok in ("dentistry", "dental"):
+            extra.add("dentist")
+        if tok in ("gastroenterology", "gastroenterologist", "gastrology"):
+            extra.update({"gastrologist", "gastroenterologist"})
+        mapped = _ology_to_ologist(tok)
+        if mapped:
+            extra.add(mapped)
+    terms.update(extra)
     return sorted(terms)
             else:
                 parts.append(str(item))
         return _clean_text(" ".join(parts))
+    if isinstance(content, dict):
+        # Some providers wrap message content as an object.
+        if content.get("type") == "text":
+            return _clean_text(str(content.get("text", "")))
+        if "text" in content:
+            return _clean_text(str(content.get("text", "")))
+        # Fallback: stringify deterministically-ish.
+        try:
+            return _clean_text(json.dumps(content, ensure_ascii=False))
+        except Exception:
+            return _clean_text(str(content))
     return _clean_text(str(content))
     - "দুই হাজার ছাব্বিশ সাল"
     - "দুই হাজার বিশ সাল"
+- Mobile Number Format (spoken Bangla style):
+    - When you SAY or READ a phone number aloud in Bangla, spell it digit-by-digit using Bangla digit words, separated by spaces.
+      Do NOT read it as a single large number.
+    - Example spoken formats:
+        - "শূন্য এক ছয় তিন আট আট তিন শূন্য এক ছয় পাঁচ"
+        - "শূন্য এক তিন দুই শূন্য শূন্য শূন্য নয় দুই তিন শূন্য"
 BEHAVIOR PRIORITY
 - Professional customer-support clarity first
 - Emotional tone tagging second
 DATA RULE:
 - Doctor names, categories, and days in the database are English.
 - Bangla terms such as চক্ষু/কার্ডিও/শিশু/চর্ম must be translated to English search terms before tool calls.
+- IMPORTANT: Some users may say specialties as the field name (e.g. "neurology", "cardiology", "dentistry").
+  The database categories may be stored as doctor types (e.g. "Neurologist", "Cardiologist", "Dentist").
+  When searching doctors, include both forms (e.g. neurology → neurologist) and handle common misspellings
+  like "neorology".
 RESPONSE STYLE:
 - Be concise.
 - Be reassuring.
 - Be jolly and encouraging, but not over-the-top.
 - Ask one clear question when more information is needed.
+WORDING (Bangla UX consistency):
+- Avoid using the Bangla word “উপলব্ধ” in user-facing replies. Instead say “এভেলেবেল” when you mean “available”.
+- Avoid “জ্বি”. Use natural acknowledgements like “আচ্ছা”, “ঠিক আছে”, or “ওকে”.
 """
 SUMMARY_SYSTEM = (

frontend/script.js CHANGED Viewed

@@ -114,6 +114,10 @@ let brainAutoRestartTimer = null;
 let brainPendingAudio = null;
 let voicePendingPackets = [];
 let brainLastResponse = '';
 // ─── Recording state ──────────────────────────────────────────────────────────
 let micStream = null;
@@ -407,6 +411,9 @@ function onVoiceMsg(ev) {
       break;
     case 'end':
       _renderAiText(true);
       _removeThinking();
       if (brainMode) brainLastResponse = aiTxt || brainLastResponse;
@@ -1146,7 +1153,9 @@ function setMic(s) {
 }
 function appendMsg(text, who) {
-  if (brainMode && who !== 'system') return null;
   const d = document.createElement('div');
   d.className = 'message ' + who;
   if (who === 'ai' && typeof marked !== 'undefined') {
@@ -1195,6 +1204,15 @@ function setBrainMode(on) {
     _brainModeSetSearch(
       isProcessing || isListening || isSpeaking || _ttsPlaying,
     );
     if (!isListening && !isProcessing && !isRecordingLocked) {
       setTimeout(() => {
         if (
@@ -1221,6 +1239,25 @@ function setBrainMode(on) {
   }
 }
 function _brainModeSetSearch(active) {
   if (!brainStage) return;
   brainStage.classList.toggle('searching', !!active);

 let brainPendingAudio = null;
 let voicePendingPackets = [];
 let brainLastResponse = '';
+let _brainWelcomed = false;
+const BRAIN_WELCOME_TEXT =
+  '[calm] হ্যালো! আমি আপনার ভয়েস সহকারী। আপনি কীভাবে সাহায্য চান?';
 // ─── Recording state ──────────────────────────────────────────────────────────
 let micStream = null;
       break;
     case 'end':
+      // In brain mode we don't stream tokens into chat UI, so append a final
+      // transcript line at turn end.
+      if (brainMode && aiTxt) appendMsg(aiTxt, 'ai');
       _renderAiText(true);
       _removeThinking();
       if (brainMode) brainLastResponse = aiTxt || brainLastResponse;
 }
 function appendMsg(text, who) {
+  // In brain mode, keep user messages hidden (brain panel acts as UI),
+  // but still show AI messages as a readable transcript.
+  if (brainMode && who === 'user') return null;
   const d = document.createElement('div');
   d.className = 'message ' + who;
   if (who === 'ai' && typeof marked !== 'undefined') {
     _brainModeSetSearch(
       isProcessing || isListening || isSpeaking || _ttsPlaying,
     );
+    // One-time welcome when entering brain mode (per page load).
+    if (!_brainWelcomed) {
+      _brainWelcomed = true;
+      setTimeout(() => {
+        if (!brainMode || !brainVoiceActive) return;
+        if (isProcessing || isSpeaking || _ttsPlaying) return;
+        _brainSendWelcome();
+      }, 220);
+    }
     if (!isListening && !isProcessing && !isRecordingLocked) {
       setTimeout(() => {
         if (
   }
 }
+/**
+ * Ask the voice socket to speak the brain-mode welcome line.
+ *
+ * Sends a `speak` message carrying BRAIN_WELCOME_TEXT. When the socket is
+ * not open, or the send throws, the message is pushed onto
+ * `voicePendingPackets` and `_connectVoice()` is called instead.
+ */
+function _brainSendWelcome() {
+  const packet = JSON.stringify({ type: 'speak', text: BRAIN_WELCOME_TEXT });
+  // Shared fallback: park the packet and kick off a (re)connect.
+  const queueForLater = () => {
+    voicePendingPackets.push(packet);
+    _connectVoice();
+  };
+  const socketReady = voiceWS && voiceWS.readyState === WebSocket.OPEN;
+  if (!socketReady) {
+    // If the socket is reconnecting, queue for later.
+    queueForLater();
+    return;
+  }
+  try {
+    appendThinking();
+    voiceWS.send(packet);
+    console.log('[Brain] welcome sent');
+  } catch (sendError) {
+    console.error('[Brain] welcome send failed:', sendError);
+    queueForLater();
+  }
+}
 function _brainModeSetSearch(active) {
   if (!brainStage) return;
   brainStage.classList.toggle('searching', !!active);

requirements.txt CHANGED Viewed

@@ -11,11 +11,11 @@ fastapi
 uvicorn
 websockets
-# ===== Async / DB =====
-aiosqlite
-aiosmtplib
-dateparser
-twilio
 # ===== LangChain Ecosystem =====
 langchain

 uvicorn
 websockets
+# ===== Async / DB =====
+aiosqlite
+aiosmtplib
+dateparser
+twilio
 # ===== LangChain Ecosystem =====
 langchain

tmp.ipynb CHANGED Viewed

@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
    "id": "5cbff6ce",
    "metadata": {},
    "outputs": [],

  "cells": [
   {
    "cell_type": "code",
+   "execution_count": 4,
    "id": "5cbff6ce",
    "metadata": {},
    "outputs": [],