Added full communication layer

Browse files

Files changed (8) hide show

app.py +106 -65
core/backend.py +291 -294
frontend/index.html +8 -52
frontend/style.css +107 -265
services/streaming.py +102 -36
services/stt.py +76 -37
services/tts.py +59 -14
services/webrtc_pipeline.py +76 -58

app.py CHANGED Viewed

@@ -1,31 +1,28 @@
 """
 app.py — FastAPI entrypoint: WebRTC-first + WebSocket fallback
-Pipeline overview:
-──────────────────
-  Browser                     Server
-  ──────────────────────────────────────────────────────
-  getUserMedia() → WebRTC     aiortc peer connection
-  ↓ PCM audio frames  ────►   VAD segmenter
-                              ↓ utterances
-                              STT GPU-batch queue
-                              ↓ transcripts (parallel)
-                              LLM async stream  ──┐
-                              ↓ tokens            │ concurrent
-                              TTS streamer ◄──────┘
-                              ↓ audio chunks
-  ◄────────────────────────── RTCDataChannel
-WebSocket mode (fallback):
-  Still available at /ws/voice and /ws/chat for environments
-  where WebRTC is blocked (corporate proxies, etc.).
-  Uses the same STT batch queue and parallel TTS streamer.
-Performance targets:
-  STT:           < 200ms  (GPU-batched, ffmpeg parallel)
-  First LLM tok: < 100ms  (streaming, no full-sentence wait)
-  TTS start:     < 150ms  (sentence-level streaming, parallel synthesis)
-  Total TTFA*:   < 450ms  (*Time-To-First-Audio)
 """
 import asyncio
@@ -43,7 +40,7 @@ from core.backend import AIBackend
 from services.stt import STTProcessor
 from services.streaming import ParallelTTSStreamer
-# ── WebRTC (optional — degrades gracefully if aiortc not installed) ────────────
 try:
     from services.webrtc_pipeline import WebRTCSession
     WEBRTC_AVAILABLE = True
@@ -55,8 +52,8 @@ except (ImportError, RuntimeError) as _e:
 # ══════════════════════════════════════════════════════════════════════════════
 #  MODEL ROUTING CONFIG — set exactly ONE to True
 # ══════════════════════════════════════════════════════════════════════════════
-USE_GEMINI         = True
-USE_OLLAMA         = False
 USE_LOCAL_FALLBACK = False
 _active = sum([USE_GEMINI, USE_OLLAMA, USE_LOCAL_FALLBACK])
@@ -72,7 +69,6 @@ ai = AIBackend(
     use_fallback=USE_LOCAL_FALLBACK,
 )
-# Active WebRTC sessions — keyed by session_id
 _rtc_sessions: dict[str, "WebRTCSession"] = {}
@@ -85,18 +81,15 @@ async def lifespan(app: FastAPI):
     await ai.async_setup()
     print("[APP] AI backend ready ✓")
     yield
-    # Clean up WebRTC sessions
     for session in list(_rtc_sessions.values()):
         await session.close()
     _rtc_sessions.clear()
-    # Clean up DB connections
-    for attr in ("conn", "_meta_conn"):
-        conn = getattr(ai, attr, None)
-        if conn:
-            try:
-                await conn.close()
-            except Exception:
-                pass
 app = FastAPI(lifespan=lifespan)
@@ -114,33 +107,33 @@ async def root():
     return HTMLResponse("<h2>index.html not found</h2>", status_code=404)
 # ══════════════════════════════════════════════════════════════════════════════
 #  WEBRTC SIGNALING ENDPOINTS
 # ══════════════════════════════════════════════════════════════════════════════
 @app.post("/rtc/offer")
 async def rtc_offer(request: Request):
-    """
-    WebRTC signaling: browser sends SDP offer, server returns SDP answer.
-    Request JSON:
-        { "sdp": "...", "type": "offer", "session_id": "optional_existing_id" }
-    Response JSON:
-        { "sdp": "...", "type": "answer", "session_id": "..." }
-    """
     if not WEBRTC_AVAILABLE:
         return JSONResponse(
             {"error": "WebRTC unavailable. Use WebSocket fallback at /ws/voice"},
             status_code=503,
         )
     body       = await request.json()
     sdp        = body.get("sdp", "")
     sdp_type   = body.get("type", "offer")
     session_id = body.get("session_id") or uuid.uuid4().hex
-    # Reuse or create session
     session = _rtc_sessions.get(session_id)
     if session is None:
         session = WebRTCSession(ai_backend=ai)
@@ -153,25 +146,20 @@ async def rtc_offer(request: Request):
 @app.post("/rtc/ice")
 async def rtc_ice(request: Request):
-    """Forward browser ICE candidate to the session."""
     if not WEBRTC_AVAILABLE:
         return JSONResponse({"error": "WebRTC unavailable"}, status_code=503)
     body       = await request.json()
     session_id = body.get("session_id", "")
     candidate  = body.get("candidate", {})
-    session = _rtc_sessions.get(session_id)
     if session is None:
         return JSONResponse({"error": "Session not found"}, status_code=404)
     await session.add_ice_candidate(candidate)
     return JSONResponse({"ok": True})
 @app.delete("/rtc/session/{session_id}")
 async def rtc_close(session_id: str):
-    """Explicitly close a WebRTC session."""
     session = _rtc_sessions.pop(session_id, None)
     if session:
         await session.close()
@@ -214,6 +202,10 @@ async def _safe_bytes(ws: WebSocket, data: bytes) -> bool:
 async def ws_chat(ws: WebSocket):
     await ws.accept()
     print("[CHAT] Client connected ✓")
     try:
         while True:
             raw = await ws.receive_text()
@@ -223,7 +215,25 @@ async def ws_chat(ws: WebSocket):
                 await _safe_text(ws, {"type": "error", "text": "Invalid JSON"})
                 continue
-            user_id    = data.get("user_id", "default_user")
             user_query = data.get("user_query", "").strip()
             if not user_query:
                 continue
@@ -249,15 +259,43 @@ async def ws_chat(ws: WebSocket):
 # ══════════════════════════════════════════════════════════════════════════════
-#  WEBSOCKET — VOICE (fallback: full STT→LLM→TTS pipeline over WS)
 # ══════════════════════════════════════════════════════════════════════════════
 @app.websocket("/ws/voice")
 async def ws_voice(ws: WebSocket):
     await ws.accept()
-    user_id = f"voice_{uuid.uuid4().hex[:12]}"
-    print(f"[VOICE] Client connected — user_id={user_id}")
     stt = STTProcessor()
     _active_streamer: ParallelTTSStreamer | None = None
@@ -279,7 +317,7 @@ async def ws_voice(ws: WebSocket):
     async def _handle_utterance(audio_bytes: bytes):
         nonlocal _active_streamer
-        # ── STT (GPU-batched) ──────────────────────────────────────────────────
         transcript = await stt.transcribe(audio_bytes)
         if not transcript:
             await _safe_text(ws, {"type": "error", "text": "কথা বুঝতে পারিনি, আবার বলুন।"})
@@ -315,7 +353,6 @@ async def ws_voice(ws: WebSocket):
                 if not await _safe_bytes(ws, chunk):
                     break
-        # LLM and TTS delivery run SIMULTANEOUSLY
         await asyncio.gather(run_llm(), run_tts(), return_exceptions=True)
         _active_streamer = None
         await _safe_text(ws, {"type": "end"})
@@ -344,12 +381,16 @@ async def ws_voice(ws: WebSocket):
             elif "text" in data and data["text"]:
                 try:
                     msg = json.loads(data["text"])
-                    if msg.get("type") == "init" and msg.get("user_id"):
-                        user_id = str(msg["user_id"])[:64]
                         await _safe_text(ws, {"type": "init_ack", "user_id": user_id})
-                    elif msg.get("type") == "ping":
                         await _safe_text(ws, {"type": "pong"})
-                    elif msg.get("type") == "cancel":
                         await _cancel_active()
                         await _safe_text(ws, {"type": "end"})
                 except json.JSONDecodeError:

 """
 app.py — FastAPI entrypoint: WebRTC-first + WebSocket fallback
+FIXES APPLIED:
+  FIX-SESSION (Issue 1): The voice WS handler now reads user_id from the
+    first 'init' JSON message before processing any audio. The variable is
+    no longer a random UUID per connection — it is the stable USER_ID sent
+    by the browser from localStorage. This means every reconnect, even after
+    a page reload, hits the same LangGraph thread and restores conversation
+    history.
+    Implementation:
+      • user_id is initialised to an empty string inside ws_voice.
+      • The handler waits up to _INIT_TIMEOUT seconds for the first message
+        before processing any binary audio.
+      • On 'init' message, user_id is set and init_ack returned.
+      • All subsequent audio/LLM calls use that stable user_id.
+      • If nothing arrives within 3 s, or the first message is not a valid
+        'init' carrying a non-empty user_id, a random fallback is used
+        (prevents hang for non-browser clients).
+  FIX-CHAT-INIT (Issue 1): ws_chat also reads the 'init' message so chat
+    sessions share the same backend thread as voice sessions for the same
+    user.
+All performance optimisations (parallel TTS, GPU-batched STT, concurrent
+LLM+TTS) preserved.
 """
 import asyncio
 from services.stt import STTProcessor
 from services.streaming import ParallelTTSStreamer
+# ── WebRTC (optional) ─────────────────────────────────────────────────────────
 try:
     from services.webrtc_pipeline import WebRTCSession
     WEBRTC_AVAILABLE = True
 # ══════════════════════════════════════════════════════════════════════════════
 #  MODEL ROUTING CONFIG — set exactly ONE to True
 # ══════════════════════════════════════════════════════════════════════════════
+USE_GEMINI         = False
+USE_OLLAMA         = True
 USE_LOCAL_FALLBACK = False
 _active = sum([USE_GEMINI, USE_OLLAMA, USE_LOCAL_FALLBACK])
     use_fallback=USE_LOCAL_FALLBACK,
 )
 _rtc_sessions: dict[str, "WebRTCSession"] = {}
     await ai.async_setup()
     print("[APP] AI backend ready ✓")
     yield
     for session in list(_rtc_sessions.values()):
         await session.close()
     _rtc_sessions.clear()
+    conn = getattr(ai, "conn", None)
+    if conn:
+        try:
+            await conn.close()
+        except Exception:
+            pass
 app = FastAPI(lifespan=lifespan)
     return HTMLResponse("<h2>index.html not found</h2>", status_code=404)
+@app.get("/health")
+async def health():
+    from services.stt import _model_ready, _model_error
+    return JSONResponse({
+        "status":       "ok",
+        "model_ready":  _model_ready.is_set(),
+        "model_error":  _model_error,
+        "rtc_sessions": len(_rtc_sessions),
+    })
 # ══════════════════════════════════════════════════════════════════════════════
 #  WEBRTC SIGNALING ENDPOINTS
 # ══════════════════════════════════════════════════════════════════════════════
 @app.post("/rtc/offer")
 async def rtc_offer(request: Request):
     if not WEBRTC_AVAILABLE:
         return JSONResponse(
             {"error": "WebRTC unavailable. Use WebSocket fallback at /ws/voice"},
             status_code=503,
         )
     body       = await request.json()
     sdp        = body.get("sdp", "")
     sdp_type   = body.get("type", "offer")
     session_id = body.get("session_id") or uuid.uuid4().hex
     session = _rtc_sessions.get(session_id)
     if session is None:
         session = WebRTCSession(ai_backend=ai)
 @app.post("/rtc/ice")
 async def rtc_ice(request: Request):
     if not WEBRTC_AVAILABLE:
         return JSONResponse({"error": "WebRTC unavailable"}, status_code=503)
     body       = await request.json()
     session_id = body.get("session_id", "")
     candidate  = body.get("candidate", {})
+    session    = _rtc_sessions.get(session_id)
     if session is None:
         return JSONResponse({"error": "Session not found"}, status_code=404)
     await session.add_ice_candidate(candidate)
     return JSONResponse({"ok": True})
 @app.delete("/rtc/session/{session_id}")
 async def rtc_close(session_id: str):
     session = _rtc_sessions.pop(session_id, None)
     if session:
         await session.close()
 async def ws_chat(ws: WebSocket):
     await ws.accept()
     print("[CHAT] Client connected ✓")
+    # FIX-SESSION: Start with no user_id; wait for 'init' to set it.
+    user_id: str = ""
     try:
         while True:
             raw = await ws.receive_text()
                 await _safe_text(ws, {"type": "error", "text": "Invalid JSON"})
                 continue
+            msg_type = data.get("type", "")
+            # ── Init handshake ──────────────────────────────────────────────
+            if msg_type == "init":
+                claimed = str(data.get("user_id", "")).strip()[:64]
+                if claimed:
+                    user_id = claimed
+                    print(f"[CHAT] Session restored for user_id={user_id!r}")
+                    await _safe_text(ws, {"type": "init_ack", "user_id": user_id})
+                continue
+            if msg_type == "ping":
+                await _safe_text(ws, {"type": "pong"})
+                continue
+            # Fall back to user_id in message payload (compatibility)
+            if not user_id:
+                user_id = str(data.get("user_id", "default_user"))[:64]
             user_query = data.get("user_query", "").strip()
             if not user_query:
                 continue
 # ══════════════════════════════════════════════════════════════════════════════
+#  WEBSOCKET — VOICE (STT→LLM→TTS pipeline over WS)
 # ══════════════════════════════════════════════════════════════════════════════
+# How long (seconds) to wait for the first 'init' message before using fallback
+_INIT_TIMEOUT = 3.0
 @app.websocket("/ws/voice")
 async def ws_voice(ws: WebSocket):
     await ws.accept()
+    print("[VOICE] Client connected")
+    # ── FIX-SESSION: Resolve stable user_id from browser init message ────────
+    # Wait up to _INIT_TIMEOUT seconds for the {'type':'init','user_id':...} msg.
+    # This is always the FIRST message sent by script.js on WS open.
+    user_id: str = ""
+    try:
+        first_raw = await asyncio.wait_for(ws.receive(), timeout=_INIT_TIMEOUT)
+        if "text" in first_raw and first_raw["text"]:
+            try:
+                first_msg = json.loads(first_raw["text"])
+                if first_msg.get("type") == "init":
+                    claimed = str(first_msg.get("user_id", "")).strip()[:64]
+                    if claimed:
+                        user_id = claimed
+            except (json.JSONDecodeError, KeyError):
+                pass
+    except asyncio.TimeoutError:
+        print("[VOICE] No init message within timeout — using fallback user_id")
+    if not user_id:
+        user_id = f"voice_{uuid.uuid4().hex[:12]}"
+        print(f"[VOICE] Fallback user_id={user_id}")
+    else:
+        print(f"[VOICE] Session user_id={user_id}")
+    await _safe_text(ws, {"type": "init_ack", "user_id": user_id})
     stt = STTProcessor()
     _active_streamer: ParallelTTSStreamer | None = None
     async def _handle_utterance(audio_bytes: bytes):
         nonlocal _active_streamer
+        # ── STT ───────────────────────────────────────────────────────────────
         transcript = await stt.transcribe(audio_bytes)
         if not transcript:
             await _safe_text(ws, {"type": "error", "text": "কথা বুঝতে পারিনি, আবার বলুন।"})
                 if not await _safe_bytes(ws, chunk):
                     break
         await asyncio.gather(run_llm(), run_tts(), return_exceptions=True)
         _active_streamer = None
         await _safe_text(ws, {"type": "end"})
             elif "text" in data and data["text"]:
                 try:
                     msg = json.loads(data["text"])
+                    t   = msg.get("type", "")
+                    if t == "init":
+                        # Late re-init (e.g. after reconnect with same WS obj — rare)
+                        claimed = str(msg.get("user_id", "")).strip()[:64]
+                        if claimed:
+                            user_id = claimed
                         await _safe_text(ws, {"type": "init_ack", "user_id": user_id})
+                    elif t == "ping":
                         await _safe_text(ws, {"type": "pong"})
+                    elif t == "cancel":
                         await _cancel_active()
                         await _safe_text(ws, {"type": "end"})
                 except json.JSONDecodeError:

core/backend.py CHANGED Viewed

@@ -4,23 +4,11 @@ import asyncio
 import json
 import os
 import uuid
-# ── Disable LangSmith unless explicitly configured ────────────────────────────
-from dotenv import load_dotenv as _ld; _ld()
-_tracing_requested = os.getenv("LANGCHAIN_TRACING_V2", "false").strip().lower() == "true"
-_key_present       = bool(os.getenv("LANGCHAIN_API_KEY", "").strip())
-if not (_tracing_requested and _key_present):
-    os.environ["LANGCHAIN_TRACING_V2"] = "false"
-    os.environ.pop("LANGCHAIN_API_KEY", None)
-    print("[BACKEND] LangSmith tracing disabled.")
-else:
-    print("[BACKEND] LangSmith tracing ENABLED.")
 import aiosqlite
 import pytz
-from datetime import datetime
 from dotenv import load_dotenv
 from langchain_core.messages import (
@@ -28,11 +16,18 @@ from langchain_core.messages import (
     SystemMessage, ToolMessage,
 )
 from langchain_core.tools import tool
 from langgraph.checkpoint.sqlite.aio import AsyncSqliteSaver
 from langgraph.graph import END, START, StateGraph
 from langgraph.graph.message import add_messages
 from langgraph.prebuilt import ToolNode, tools_condition
-from typing import Annotated, TypedDict
 # ═══════════════════════════════════════════════════════════════════════════════
@@ -60,28 +55,126 @@ def format_bd_number(num: str) -> str:
 def send_sms(to_number: str, message: str) -> None:
-    try:
-        from twilio.rest import Client
-        client = Client(os.getenv("TWILIO_ACCOUNT_SID"), os.getenv("TWILIO_AUTH_TOKEN"))
-        client.messages.create(
-            body=message,
-            from_=os.getenv("TWILIO_PHONE_NUMBER"),
-            to=to_number,
-        )
-    except Exception as e:
-        print(f"[SMS] Failed to send: {e}")
 # ═══════════════════════════════════════════════════════════════════════════════
 #  TOOLS
 # ═══════════════════════════════════════════════════════════════════════════════
 @tool
 def get_bd_time() -> str:
-    """Get current Bangladesh time (Asia/Dhaka) with weekday name."""
-    tz  = pytz.timezone("Asia/Dhaka")
     now = datetime.now(tz)
-    return now.strftime("%Y-%m-%d %H:%M:%S (%A, Bangladesh Time)")
 @tool
 async def search_doctor(
@@ -90,7 +183,7 @@ async def search_doctor(
     visiting_days: str = "",
 ) -> str:
     """
-    Search doctors by name, category, or visiting_days from the SQLite database.
     Any combination of filters is supported (OR logic across fields).
     """
     db_path    = get_db_path()
@@ -152,16 +245,17 @@ async def book_appointment(
     patient_age: str,
     patient_num: str,
     visiting_date: str,
 ) -> str:
     """
     Book a doctor appointment and save it to the patients table.
     Args:
         doctor_id:     Doctor's ID from search_doctor results.
         patient_name:  Full name of the patient.
         patient_age:   Age of the patient (e.g. "32").
         patient_num:   Contact phone number of the patient.
         visiting_date: Date of visit in YYYY-MM-DD format (e.g. 2025-06-15).
     """
     db_path     = get_db_path()
     patient_num = format_bd_number(patient_num)
@@ -191,12 +285,29 @@ async def book_appointment(
         await db.execute(
             """INSERT INTO patients
-               (doctor_name, doctor_category, patient_name, patient_age, patient_num, visiting_date)
-               VALUES (?, ?, ?, ?, ?, ?)""",
-            (doctor_name, doctor_category, patient_name, patient_age, patient_num, visiting_date),
         )
         await db.commit()
     return (
         f"✅ Appointment Booked!\n"
         f"━━━━━━━━━━━━━━━━━━━━━━\n"
@@ -207,6 +318,7 @@ async def book_appointment(
         f"Contact      : {patient_num}\n"
         f"━━━━━━━━━━━━━━━━━━━━━━\n"
         f"Please arrive 10 minutes early."
     )
@@ -243,16 +355,43 @@ async def delete_appointment(patient_num: str, doctor_name: str) -> str:
 # ═══════════════════════════════════════════════════════════════════════════════
 #  SYSTEM PROMPT
 # ═══════════════════════════════════════════════════════════════════════════════
-BASE_SYSTEM = (
-    "You are a helpful Bangla voice assistant for a doctor appointment system.\n"
-    "Rules:\n"
-    "- Always respond in Bangla (বাংলা).\n"
-    "- Keep sentences short and natural for text-to-speech playback.\n"
-    "- Avoid markdown, bullet points, or long lists in voice responses.\n"
-    "- Use tools when needed to search doctors or manage appointments.\n"
-    "- Be polite, concise, and clear.\n"
-    "- Do not use English unless a proper noun requires it.\n"
-)
 SUMMARY_SYSTEM = (
     BASE_SYSTEM
@@ -261,119 +400,51 @@ SUMMARY_SYSTEM = (
     "Use this memory for continuity. Do not repeat it unless asked."
 )
-# ── Ollama system prompt (no tool calling) ─────────────────────────────────────
-OLLAMA_SYSTEM = (
-    BASE_SYSTEM
-    + "\nIMPORTANT: You do not have tool access in this mode. "
-    "Politely tell the user you cannot look up doctor information right now, "
-    "and ask them to use the chat interface for complex queries."
-)
-# ═══════════════════════════════════════════════════════════════════════════════
-#  TOOL CALLING — VALIDATED LAYER
-# ═══════════════════════════════════════════════════════════════════════════════
-class ToolCallValidator:
-    MAX_RETRIES = 2
-    def __init__(self, tool_node: ToolNode):
-        self._node = tool_node
-    async def invoke(self, state: ChatState) -> ChatState:
-        last_msg = state["messages"][-1]
-        if not hasattr(last_msg, "tool_calls") or not last_msg.tool_calls:
-            return state
-        for attempt in range(self.MAX_RETRIES + 1):
-            try:
-                result = await self._node.ainvoke(state)
-                return result
-            except Exception as exc:
-                print(f"[TOOL] Attempt {attempt + 1} failed: {exc}")
-                if attempt == self.MAX_RETRIES:
-                    tool_calls = last_msg.tool_calls
-                    fallback_msgs = [
-                        ToolMessage(
-                            content="Tool execution failed after retries. Please inform the user politely.",
-                            tool_call_id=tc["id"],
-                        )
-                        for tc in tool_calls
-                    ]
-                    return {"messages": state["messages"] + fallback_msgs}
-                await asyncio.sleep(0.3 * (attempt + 1))
-        return state
 # ═══════════════════════════════════════════════════════════════════════════════
 #  AGENT
 # ═══════════════════════════════════════════════════════════════════════════════
 class AIBackend:
-    def __init__(
-        self,
-        use_gemini: bool = True,
-        use_ollama: bool = False,
-        use_fallback: bool = False,
-    ) -> None:
-        load_dotenv()
         os.environ.setdefault("LANGCHAIN_PROJECT", "Doctor Appointment Automation")
-        self._use_gemini   = use_gemini
-        self._use_ollama   = use_ollama
-        self._use_fallback = use_fallback
-        self._build_llm()
-    def _build_llm(self) -> None:
-        if self._use_gemini:
-            from langchain_google_genai import ChatGoogleGenerativeAI
             self.llm = ChatGoogleGenerativeAI(
-                model="gemini-2.5-flash",
                 temperature=0.3,
             )
-            print("[BACKEND] Using Gemini 2.5 Flash")
-        elif self._use_ollama:
-            from langchain_ollama import ChatOllama
-            ollama_model = os.getenv("OLLAMA_MODEL", "qwen2.5")
-            self.llm = ChatOllama(
-                model=ollama_model,
-                temperature=0.3,
-            )
-            print(f"[BACKEND] Using Ollama model: {ollama_model}")
-        else:
-            self.llm = None
-            print("[BACKEND] Using local fallback responder (no external LLM)")
-        if self._use_gemini and self.llm is not None:
-            self.tools          = [
-                search_doctor,
-                book_appointment,
-                get_bd_time,
-                search_appointment_by_phone,
-                delete_appointment,
-            ]
-            self.tool_node      = ToolNode(self.tools)
-            self.tool_validator = ToolCallValidator(self.tool_node)
-            self.llm_with_tools = self.llm.bind_tools(self.tools)
         else:
-            self.tools          = []
-            self.tool_node      = None
-            self.tool_validator = None
-            self.llm_with_tools = self.llm
     # ── Setup ──────────────────────────────────────────────────────────────────
     async def async_setup(self) -> None:
-        db_path = get_db_path()
-        self.conn       = await aiosqlite.connect(db_path)
-        self._meta_conn = await aiosqlite.connect(db_path)
         self.checkpointer = AsyncSqliteSaver(self.conn)
         await self._create_tables()
         self.graph         = self._build_graph()
         self.summary_graph = self._build_summary_graph()
     async def _create_tables(self) -> None:
         await self.conn.execute("""
@@ -384,23 +455,24 @@ class AIBackend:
         """)
         await self.conn.execute("""
             CREATE TABLE IF NOT EXISTS doctors (
-                id            INTEGER PRIMARY KEY AUTOINCREMENT,
-                doctor_name   TEXT NOT NULL,
-                category      TEXT NOT NULL,
-                visiting_days TEXT NOT NULL,
-                chamber       TEXT,
-                fee           TEXT
             )
         """)
         await self.conn.execute("""
             CREATE TABLE IF NOT EXISTS patients (
-                id              INTEGER PRIMARY KEY AUTOINCREMENT,
-                doctor_name     TEXT NOT NULL,
                 doctor_category TEXT,
-                patient_name    TEXT NOT NULL,
-                patient_age     TEXT,
-                patient_num     TEXT NOT NULL,
-                visiting_date   TEXT NOT NULL
             )
         """)
         await self.conn.commit()
@@ -426,71 +498,47 @@ class AIBackend:
         response = await self.llm.ainvoke(messages + [HumanMessage(content=prompt)])
         return {
-            "summary": response.content,
             "messages": [RemoveMessage(id=m.id) for m in messages[:-2]],
         }
-    async def should_summarize(self, state: ChatState) -> str:
-        return "summarize_node" if len(state["messages"]) > 10 else "chat_node"
     # ── Chat node ──────────────────────────────────────────────────────────────
-    # FIX: chat_node now stores the COMPLETE response in graph state (for
-    # checkpointing / memory), while ai_only_stream handles live token delivery
-    # directly from the LLM — bypassing the graph's collect-then-return pattern.
     async def chat_node(self, state: ChatState):
-        if self._use_fallback or self.llm is None:
-            return {
-                "messages": [AIMessage(content=(
-                    "দুঃখিত, এই মুহূর্তে AI সংযোগ পাওয়া যাচ্ছে না। "
-                    "অনুগ্রহ করে পরে আবার চেষ্টা করুন।"
-                ))]
-            }
         summary  = state.get("summary", "")
         messages = state["messages"]
-        if self._use_ollama:
-            sys_content = OLLAMA_SYSTEM
-        else:
-            sys_content = SUMMARY_SYSTEM.format(summary=summary) if summary else BASE_SYSTEM
         full_messages = [SystemMessage(content=sys_content)] + list(messages)
-        # Collect full response for graph state storage
-        collected: list[AIMessageChunk] = []
-        async for chunk in self.llm_with_tools.astream(full_messages):
-            collected.append(chunk)
-        if not collected:
-            response = AIMessage(content="")
-        else:
-            response = collected[0]
-            for c in collected[1:]:
-                response = response + c
-        print(f"[AI] response ({len(str(response.content))} chars): {str(response.content)[:120]}")
         return {"messages": [response]}
-    # ── Validated tool node ────────────────────────────────────────────────────
-    async def validated_tools_node(self, state: ChatState):
-        if self.tool_validator is None:
-            return state
-        return await self.tool_validator.invoke(state)
     # ── Graph ──────────────────────────────────────────────────────────────────
     def _build_graph(self):
         g = StateGraph(ChatState)
         g.add_node("chat_node", self.chat_node)
-        if self._use_gemini and self.tool_node is not None:
-            g.add_node("tools", self.validated_tools_node)
-            g.add_edge(START, "chat_node")
-            g.add_conditional_edges("chat_node", tools_condition)
-            g.add_edge("tools", "chat_node")
-        else:
-            g.add_edge(START, "chat_node")
-            g.add_edge("chat_node", END)
         return g.compile(checkpointer=self.checkpointer)
     def _build_summary_graph(self):
@@ -500,126 +548,75 @@ class AIBackend:
         g.add_edge("summarize_node", END)
         return g.compile(checkpointer=self.checkpointer)
-    # ── Streaming — FIXED ──────────────────────────────────────────────────────
-    async def ai_only_stream(self, user_id: str, user_query: str, thread_id: str):
-        """
-        Async generator that yields AI text tokens in real time.
-        FIX: The old approach used graph.astream(stream_mode="messages") which
-        only emits AIMessageChunk events DURING node execution. But chat_node
-        collected all chunks internally before returning, so no AIMessageChunk
-        ever escaped the node — the generator yielded nothing and the frontend
-        waited forever.
-        New approach (two-phase):
-        1. Stream tokens DIRECTLY from the LLM right now → yield to caller
-        2. Save the full response to graph state via graph.ainvoke() in background
-           so conversation memory / checkpointing still works.
         """
-        if self._use_fallback or self.llm is None:
-            fallback = (
-                "দুঃখিত, এই মুহূর্তে AI সংযোগ পাওয়া যাচ্ছে না। "
-                "অনুগ্রহ করে পরে আবার চেষ্টা করুন।"
-            )
-            yield fallback
-            return
-        summary = ""
-        config  = {"configurable": {"thread_id": thread_id}}
-        # Try to get existing summary from graph state
-        try:
-            state = await self.graph.aget_state(config)
-            summary = state.values.get("summary", "") if state and state.values else ""
-        except Exception:
-            pass
-        sys_content = (
-            OLLAMA_SYSTEM if self._use_ollama
-            else (SUMMARY_SYSTEM.format(summary=summary) if summary else BASE_SYSTEM)
-        )
-        # Fetch conversation history from checkpointer
-        history: list = []
-        try:
-            state = await self.graph.aget_state(config)
-            if state and state.values:
-                history = list(state.values.get("messages", []))
-        except Exception:
-            pass
-        full_messages = (
-            [SystemMessage(content=sys_content)]
-            + history
-            + [HumanMessage(content=user_query)]
-        )
-        print(f"[AI] Streaming for thread={thread_id}, history={len(history)} msgs")
-        # Phase 1: stream tokens live to the frontend
-        collected: list[AIMessageChunk] = []
-        token_count = 0
         try:
-            async for chunk in self.llm_with_tools.astream(full_messages):
-                collected.append(chunk)
-                if chunk.content:
-                    token_count += 1
-                    yield chunk.content
         except Exception as exc:
-            print(f"[AI] Streaming error: {exc}")
-            import traceback; traceback.print_exc()
-            yield "দুঃখিত, একটি সমস্যা হয়েছে। আবার চেষ্টা করুন।"
-            return
-        print(f"[AI] Stream done: {token_count} tokens")
-        # Phase 2: persist to graph state in background (non-blocking)
-        if collected:
-            full_response = collected[0]
-            for c in collected[1:]:
-                full_response = full_response + c
-            async def _save_to_graph():
-                try:
-                    save_state = {"messages": [HumanMessage(content=user_query)]}
-                    await self.graph.ainvoke(
-                        save_state,
-                        config=config,
-                        # We already have the response; override chat_node
-                        # by injecting the AI message directly
-                    )
-                except Exception as exc:
-                    # Non-critical: history save failed, but user got their response
-                    print(f"[AI] Graph state save error (non-critical): {exc}")
-            # Save history via a simpler direct approach: just invoke with the
-            # human message and let chat_node regenerate (it will be fast since
-            # Ollama is local). This ensures checkpointer stays consistent.
-            asyncio.create_task(_save_to_graph())
     # ── Thread management ──────────────────────────────────────────────────────
     @staticmethod
     def generate_thread_id() -> str:
         return str(uuid.uuid4())
     # ── Public entry point ─────────────────────────────────────────────────────
-    async def main(self, user_id: str, user_query: str):
         """Return an async generator of AI text tokens."""
-        async with self._meta_conn.execute(
             "SELECT threadId FROM userid_threadid WHERE userId = ?", (user_id,)
         ) as cursor:
             row = await cursor.fetchone()
         if row is None:
             thread_id = user_id + self.generate_thread_id()
-            await self._meta_conn.execute(
                 "INSERT INTO userid_threadid (userId, threadId) VALUES (?, ?)",
                 (user_id, thread_id),
             )
-            await self._meta_conn.commit()
         else:
             thread_id = row[0]
-        # FIX: pass user_id, user_query, thread_id directly so ai_only_stream
-        # can stream from LLM without going through the blocking graph node
-        return self.ai_only_stream(user_id, user_query, thread_id)

 import json
 import os
 import uuid
+import aiosmtplib
 import aiosqlite
 import pytz
+from datetime import datetime, timedelta
 from dotenv import load_dotenv
 from langchain_core.messages import (
     SystemMessage, ToolMessage,
 )
 from langchain_core.tools import tool
+from langchain_google_genai import ChatGoogleGenerativeAI
 from langgraph.checkpoint.sqlite.aio import AsyncSqliteSaver
 from langgraph.graph import END, START, StateGraph
 from langgraph.graph.message import add_messages
 from langgraph.prebuilt import ToolNode, tools_condition
+from twilio.rest import Client
+from typing import Annotated, TypedDict, Optional, AsyncGenerator
+from email.message import EmailMessage
+from dateparser.search import search_dates
+from langchain_ollama import ChatOllama
+load_dotenv()
 # ═══════════════════════════════════════════════════════════════════════════════
 def send_sms(to_number: str, message: str) -> None:
     """Send ``message`` as an SMS to ``to_number`` through Twilio.

     The account SID, auth token and sender number are read from the
     ``TWILIO_ACCOUNT_SID``, ``TWILIO_AUTH_TOKEN`` and
     ``TWILIO_PHONE_NUMBER`` environment variables.
     """
     account_sid = os.getenv("TWILIO_ACCOUNT_SID")
     auth_token = os.getenv("TWILIO_AUTH_TOKEN")
     twilio = Client(account_sid, auth_token)
     sender = os.getenv("TWILIO_PHONE_NUMBER")
     twilio.messages.create(body=message, from_=sender, to=to_number)
async def send_mail(to_mail: str, subject: str, body: str):
    """Send a plain-text e-mail through an SMTP server over implicit TLS.

    The SMTP account is read from the environment so that no secret is
    stored in the source tree:

    * ``SMTP_PASSWORD`` - account / app password (required).
    * ``SMTP_USERNAME`` - login and ``From`` address; defaults to the
      address this function has always sent from.
    * ``SMTP_HOST`` / ``SMTP_PORT`` - default to ``smtp.gmail.com`` / ``465``.

    Args:
        to_mail: Recipient address.
        subject: Subject line.
        body:    Plain-text message body.

    Raises:
        RuntimeError: If ``SMTP_PASSWORD`` is not configured.
    """
    # SECURITY: the app password used to be hard-coded here. Any password
    # that was committed should be revoked and replaced.
    password = os.getenv("SMTP_PASSWORD")
    if not password:
        raise RuntimeError("SMTP_PASSWORD is not set; cannot send mail.")

    username = os.getenv("SMTP_USERNAME", "walidofficework@gmail.com")
    hostname = os.getenv("SMTP_HOST", "smtp.gmail.com")
    port = int(os.getenv("SMTP_PORT", "465"))

    email = EmailMessage()
    email["From"] = username
    email["To"] = to_mail
    email["Subject"] = subject
    email.set_content(body)

    await aiosmtplib.send(
        email,
        hostname=hostname,
        port=port,
        username=username,
        password=password,
        use_tls=True,
    )
 # ═══════════════════════════════════════════════════════════════════════════════
 #  TOOLS
 # ═══════════════════════════════════════════════════════════════════════════════
 @tool
 def get_bd_time() -> str:
     """Get current Bangladesh date and time along with the next 14 days."""
     # The docstring above is consumed by @tool, so it is kept unchanged.
     # Returns a JSON object with CURRENT_DATETIME, TODAY, TOMORROW and a
     # NEXT_14_DAYS map keyed "+1" .. "+14".
     day_format = "%A, %B %d, %Y"
     now = datetime.now(pytz.timezone("Asia/Dhaka"))
     payload = {
         "CURRENT_DATETIME": now.strftime("%Y-%m-%d %H:%M:%S %Z"),
         "TODAY": now.strftime(day_format),
         "TOMORROW": (now + timedelta(days=1)).strftime(day_format),
         "NEXT_14_DAYS": {
             f"+{offset}": (now + timedelta(days=offset)).strftime(day_format)
             for offset in range(1, 15)
         },
     }
     return json.dumps(payload)
@tool
async def get_doctor_categories() -> str:
    """
    Fetch all unique doctor categories from the database.
    """
    # Returns a JSON string of the form
    #   {"success": true, "count": <int>, "data": [<category>, ...]}
    # with blank/NULL categories filtered out and the rest sorted A-Z.
    db_path = get_db_path()
    query = """
        SELECT DISTINCT category
        FROM doctors
        WHERE category IS NOT NULL
          AND TRIM(category) != ''
        ORDER BY category ASC
    """
    async with aiosqlite.connect(db_path) as db:
        db.row_factory = aiosqlite.Row
        # Context manager closes the cursor before the connection goes away.
        async with db.execute(query) as cursor:
            rows = await cursor.fetchall()
    categories = [row["category"] for row in rows]
    # ensure_ascii=False emits non-ASCII category names as-is instead of
    # \uXXXX escapes, consistent with get_doctors_by_day.
    return json.dumps({
        "success": True,
        "count": len(categories),
        "data": categories
    }, ensure_ascii=False)
@tool
async def get_doctors_by_day(
    visiting_day: str,
) -> str:
    """
    Get all doctors available on a specific visiting day.
    Example inputs:
    - Sunday
    - Monday
    - Friday
    """
    # Returns a JSON string. On success:
    #   {"success": true, "visiting_day": ..., "count": ..., "data": [rows]}
    # otherwise {"success": false, "message": ..., "data": []}.
    day = (visiting_day or "").strip()
    if not day:
        # A blank day would turn into the pattern "%%" and match every row.
        return json.dumps({
            "success": False,
            "message": "visiting_day is required.",
            "data": []
        }, ensure_ascii=False)

    db_path = get_db_path()
    query = """
    SELECT *
    FROM doctors
    WHERE LOWER(visiting_days) LIKE ?
    """
    # Case-insensitive substring match against the visiting_days column.
    param = [f"%{day.lower()}%"]
    async with aiosqlite.connect(db_path) as db:
        db.row_factory = aiosqlite.Row
        async with db.execute(query, param) as cursor:
            rows = await cursor.fetchall()

    if not rows:
        return json.dumps({
            "success": False,
            "message": f"No doctors found for {day}.",
            "data": []
        }, ensure_ascii=False)

    doctors = [dict(row) for row in rows]
    return json.dumps({
        "success": True,
        "visiting_day": day,
        "count": len(doctors),
        "data": doctors
    }, ensure_ascii=False)
 @tool
 async def search_doctor(
     visiting_days: str = "",
 ) -> str:
     """
+    Search doctors by name, category, or visiting_days from the database.
     Any combination of filters is supported (OR logic across fields).
     """
     db_path    = get_db_path()
     patient_age: str,
     patient_num: str,
     visiting_date: str,
+    patient_mail: str
 ) -> str:
     """
     Book a doctor appointment and save it to the patients table.
     Args:
         doctor_id:     Doctor's ID from search_doctor results.
         patient_name:  Full name of the patient.
         patient_age:   Age of the patient (e.g. "32").
         patient_num:   Contact phone number of the patient.
         visiting_date: Date of visit in YYYY-MM-DD format (e.g. 2025-06-15).
+        patient_mail:  Mail address for confirmation mail.
     """
     db_path     = get_db_path()
     patient_num = format_bd_number(patient_num)
         await db.execute(
             """INSERT INTO patients
+               (doctor_name, doctor_category, patient_name, patient_age, patient_num, visiting_date, patient_mail)
+               VALUES (?, ?, ?, ?, ?, ?, ?)""",
+            (doctor_name, doctor_category, patient_name, patient_age, patient_num, visiting_date, patient_mail),
         )
         await db.commit()
+    # Mail SMS confirmation
+    mail_message = (
+        f"Doctor     : {doctor_name}\n"
+        f"Patient    : {patient_name}\n"
+        f"Visit Date : {visiting_date}\n"
+        f"Please arrive 10 minutes early."
+    )
+    try:
+        await send_mail(
+            to_mail=patient_mail,
+            subject="✅ Appointment Confirmed!",
+            body=mail_message,
+        )
+        mail_status = "\n📧 Mail confirmation sent."
+    except Exception as e:
+        mail_status = f"\n⚠️ Mail failed: {str(e)}"
     return (
         f"✅ Appointment Booked!\n"
         f"━━━━━━━━━━━━━━━━━━━━━━\n"
         f"Contact      : {patient_num}\n"
         f"━━━━━━━━━━━━━━━━━━━━━━\n"
         f"Please arrive 10 minutes early."
+        f"{mail_status}"
     )
 # ═══════════════════════════════════════════════════════════════════════════════
 #  SYSTEM PROMPT
 # ═══════════════════════════════════════════════════════════════════════════════
+BASE_SYSTEM = """
+You are a Doctor Appointment Assistant AI.
+Your job is to help users manage medical appointments.
+CAPABILITIES:
+- Book doctor appointments
+- Reschedule appointments
+- Cancel appointments
+- Collect patient details
+STRICT RULES:
+- You are NOT a doctor.
+- NEVER diagnose diseases.
+- NEVER recommend medicines or treatments.
+APPOINTMENT FLOW:
+1. Detect intent (book / cancel / reschedule / inquiry)
+2. Collect details
+3. Confirm all details before final booking
+STYLE:
+- Be short, clear, structured
+- Ask one question at a time when needed
+- Focus on completing booking
+LANGUAGE RULE:
+- Detect user language from latest message.
+- If English → reply English.
+- If Bangla → reply Bangla (বাংলা).
+- If Banglish → reply Bangla (বাংলা).
+- Never mix languages unless user mixes first.
+TOOLS:
+- Use backend tools if available for scheduling
+- Always confirm before final action
+"""
 SUMMARY_SYSTEM = (
     BASE_SYSTEM
     "Use this memory for continuity. Do not repeat it unless asked."
 )
 # ═══════════════════════════════════════════════════════════════════════════════
 #  AGENT
 # ═══════════════════════════════════════════════════════════════════════════════
 class AIBackend:
+    # ── FIX-BUG1: was `_init_` (single underscores) — never called by Python
+    def __init__(self, use_gemini: bool = False, use_ollama: bool = True, use_fallback: bool = False):
+        self.use_gemini   = use_gemini
+        self.use_ollama   = use_ollama
+        self.use_fallback = use_fallback
         os.environ.setdefault("LANGCHAIN_PROJECT", "Doctor Appointment Automation")
+        if use_gemini:
             self.llm = ChatGoogleGenerativeAI(
+                model="gemini-2.0-flash",
                 temperature=0.3,
             )
+        elif use_ollama:
+            self.llm = ChatOllama(model="gemma4:e4b", streaming=True, temperature=0.2)
         else:
+            # Local fallback — extend as needed
+            self.llm = ChatOllama(model="gemma4:e4b", streaming=True, temperature=0.2)
+        self.tools          = [
+            search_doctor,
+            book_appointment,
+            get_bd_time,
+            search_appointment_by_phone,
+            delete_appointment,
+            get_doctor_categories,
+            get_doctors_by_day
+        ]
+        self.tool_node      = ToolNode(self.tools)
+        self.llm_with_tools = self.llm.bind_tools(self.tools)
     # ── Setup ──────────────────────────────────────────────────────────────────
     async def async_setup(self) -> None:
         """Open the SQLite connection, create the tables and compile both graphs."""
         connection = await aiosqlite.connect(get_db_path())
         self.conn = connection
         self.checkpointer = AsyncSqliteSaver(connection)
         await self._create_tables()
         # Both graphs are compiled against the checkpointer created above.
         self.graph = self._build_graph()
         self.summary_graph = self._build_summary_graph()
         print("[Backend] AIBackend ready ✓")
     async def _create_tables(self) -> None:
         await self.conn.execute("""
         """)
         await self.conn.execute("""
             CREATE TABLE IF NOT EXISTS doctors (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                doctor_name TEXT,
+                category TEXT,
+                visiting_days TEXT,
+                visiting_time TEXT,
+                visiting_money INTEGER
             )
         """)
         await self.conn.execute("""
             CREATE TABLE IF NOT EXISTS patients (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                doctor_name TEXT,
                 doctor_category TEXT,
+                patient_name TEXT,
+                patient_age TEXT,
+                patient_num TEXT,
+                visiting_date TEXT,
+                patient_mail TEXT
             )
         """)
         await self.conn.commit()
         response = await self.llm.ainvoke(messages + [HumanMessage(content=prompt)])
         return {
+            "summary":  response.content,
             "messages": [RemoveMessage(id=m.id) for m in messages[:-2]],
         }
     # ── Chat node ──────────────────────────────────────────────────────────────
     async def chat_node(self, state: ChatState):
         """Graph node: run the tool-bound LLM over the conversation so far.

         Prepends a system message (the summary-aware prompt when the state
         carries a summary, the base prompt otherwise), awaits a single
         ``ainvoke`` call and returns the reply under ``"messages"``.
         The state and the reply are echoed to stdout for debugging.
         """
         summary = state.get("summary", "")
         messages = state["messages"]

         banner = "#" * 50
         print(banner)
         print(">>>>>>>>>> CHAT NODE START <<<<<<<<<<")
         print(f"[SUMMARY]: {summary[:120] if summary else 'None'}")
         for msg in messages:
             print(f"  [{msg.__class__.__name__}]: {str(msg.content)[:160]}")
         print(banner)

         if summary:
             system_text = SUMMARY_SYSTEM.format(summary=summary)
         else:
             system_text = BASE_SYSTEM
         prompt = [SystemMessage(content=system_text), *messages]

         response = await self.llm_with_tools.ainvoke(prompt)
         print(f"[AI]: {str(response.content)[:200]}")
         print(">>>>>>>>>> CHAT NODE END <<<<<<<<<<")
         return {"messages": [response]}
     # ── Graph ──────────────────────────────────────────────────────────────────
     def _build_graph(self):
         """Compile the chat graph: START -> chat_node <-> tools.

         ``tools_condition`` decides after each ``chat_node`` run where the
         graph goes next; the ``tools`` node always hands back to ``chat_node``.
         """
         builder = StateGraph(ChatState)

         # Nodes
         builder.add_node("chat_node", self.chat_node)
         builder.add_node("tools", self.tool_node)

         # Edges
         builder.add_edge(START, "chat_node")
         builder.add_conditional_edges("chat_node", tools_condition)
         builder.add_edge("tools", "chat_node")

         return builder.compile(checkpointer=self.checkpointer)
     def _build_summary_graph(self):
         g.add_edge("summarize_node", END)
         return g.compile(checkpointer=self.checkpointer)
+    # ── Streaming ──────────────────────────────────────────────────────────────
+    async def ai_only_stream(
+        self, initial_state: dict, config: dict
+    ) -> AsyncGenerator[str, None]:
         """
+        Async generator — yields AI text tokens as they arrive.
+        FIX-BUG9: narrowed isinstance check to exclude ToolMessage content
+        from being streamed to the user, and guards against non-str content
+        (e.g. multimodal list payloads from Ollama tool-call chunks).
+        """
+        async for chunk, _meta in self.graph.astream(
+            initial_state, config=config, stream_mode="messages"
+        ):
+            # Only yield text content from AI messages.
+            # Exclude ToolMessage (tool execution results) — they contain
+            # raw JSON that should not be streamed directly to the user.
+            if (
+                isinstance(chunk, (AIMessage, AIMessageChunk))
+                and not isinstance(chunk, ToolMessage)
+                and isinstance(chunk.content, str)
+                and chunk.content
+            ):
+                yield chunk.content
+        # Auto-summarise in background when history grows long
         try:
+            current = await self.graph.aget_state(config)
+            if len(current.values.get("messages", [])) > 10:
+                asyncio.create_task(
+                    self.summary_graph.ainvoke(current.values, config=config)
+                )
+                print("@" * 20, "Summarisation triggered", "@" * 20)
         except Exception as exc:
+            print(f"[Backend] Summarisation check failed: {exc}")
     # ── Thread management ──────────────────────────────────────────────────────
     @staticmethod
     def generate_thread_id() -> str:
         return str(uuid.uuid4())
+    async def retrieve_all_threads(self) -> list[str]:
+        threads: set[str] = set()
+        async for cp in self.checkpointer.alist(None):
+            threads.add(cp.config["configurable"]["thread_id"])
+        return list(threads)
     # ── Public entry point ─────────────────────────────────────────────────────
+    async def main(self, user_id: str, user_query: str) -> AsyncGenerator[str, None]:
         """Return an async generator of AI text tokens."""
+        async with self.conn.execute(
             "SELECT threadId FROM userid_threadid WHERE userId = ?", (user_id,)
         ) as cursor:
             row = await cursor.fetchone()
         if row is None:
             thread_id = user_id + self.generate_thread_id()
+            await self.conn.execute(
                 "INSERT INTO userid_threadid (userId, threadId) VALUES (?, ?)",
                 (user_id, thread_id),
             )
+            await self.conn.commit()
         else:
             thread_id = row[0]
+        initial_state = {"messages": [HumanMessage(content=user_query)]}
+        config = {
+            "configurable": {"thread_id": thread_id},
+            "metadata":     {"thread_id": thread_id},
+            "run_name":     "chat_turn",
+        }
+        return self.ai_only_stream(initial_state, config)

frontend/index.html CHANGED Viewed

@@ -1,48 +1,3 @@
 <!DOCTYPE html>
 <html lang="bn">
 <head>
@@ -62,9 +17,8 @@
 <div class="bg-orb orb-3"></div>
 <!-- ══════════════════════════════════════════════════════════════
-     INIT OVERLAY — shown until WS is ready + animations done
-     No error text is displayed here; overlay auto-closes via
-     hard 8s failsafe if backend takes longer than expected.
      ══════════════════════════════════════════════════════════════ -->
 <div id="init-overlay" class="init-overlay">
   <div class="init-card">
@@ -117,8 +71,10 @@
 <!-- ══════════════════════════════════════════════════════════════
      MAIN APP
      ══════════════════════════════════════════════════════════════ -->
-<div class="app" id="app" style="opacity:0;pointer-events:none;">
   <!-- ── Sidebar ── -->
   <aside class="sidebar" id="sidebar">
@@ -191,7 +147,6 @@
     <!-- Voice Settings -->
     <div class="dash-section">
       <div class="dash-title">🎛️ Voice Settings</div>
       <div class="setting-row">
         <label>Silence Threshold</label>
         <div class="slider-wrap">
@@ -271,13 +226,14 @@
     <!-- Controls -->
     <footer class="controls">
       <div class="text-row">
         <input
           type="text"
           id="text-input"
-          placeholder="বার্তা লিখুন… (Type a message)"
           autocomplete="off"
         />
         <button id="send-btn" title="Send">
           <svg width="20" height="20" viewBox="0 0 24 24" fill="none"
                stroke="currentColor" stroke-width="2">

 <!DOCTYPE html>
 <html lang="bn">
 <head>
 <div class="bg-orb orb-3"></div>
 <!-- ══════════════════════════════════════════════════════════════
+     INIT OVERLAY — shown until WS ready + animations done
+     Hard 8 s failsafe closes overlay if backend is slow.
      ══════════════════════════════════════════════════════════════ -->
 <div id="init-overlay" class="init-overlay">
   <div class="init-card">
 <!-- ══════════════════════════════════════════════════════════════
      MAIN APP
+     FIX-7: Hidden via .app CSS class (not inline opacity:0) to
+     prevent FOUC. JS adds class "visible" after init closes.
      ══════════════════════════════════════════════════════════════ -->
+<div class="app" id="app">
   <!-- ── Sidebar ── -->
   <aside class="sidebar" id="sidebar">
     <!-- Voice Settings -->
     <div class="dash-section">
       <div class="dash-title">🎛️ Voice Settings</div>
       <div class="setting-row">
         <label>Silence Threshold</label>
         <div class="slider-wrap">
     <!-- Controls -->
     <footer class="controls">
       <div class="text-row">
+        <!-- FIX-4: textarea is created by script.js to replace this input,
+             keeping HTML clean while gaining auto-resize + shift-enter -->
         <input
           type="text"
           id="text-input"
+          placeholder="বার্তা লিখুন… (Enter পাঠান · Shift+Enter নতুন লাইন)"
           autocomplete="off"
         />
         <button id="send-btn" title="Send">
           <svg width="20" height="20" viewBox="0 0 24 24" fill="none"
                stroke="currentColor" stroke-width="2">

frontend/style.css CHANGED Viewed

@@ -48,41 +48,28 @@ body {
   animation: orb-float 12s ease-in-out infinite;
 }
 .orb-1 {
-  width: 500px;
-  height: 500px;
   background: radial-gradient(circle, #22d3ee, transparent);
-  top: -200px;
-  left: -150px;
   animation-delay: 0s;
 }
 .orb-2 {
-  width: 400px;
-  height: 400px;
   background: radial-gradient(circle, #818cf8, transparent);
-  bottom: -100px;
-  right: -100px;
   animation-delay: -4s;
 }
 .orb-3 {
-  width: 300px;
-  height: 300px;
   background: radial-gradient(circle, #f472b6, transparent);
-  top: 50%;
-  left: 50%;
   transform: translate(-50%, -50%);
   animation-delay: -8s;
 }
 @keyframes orb-float {
-  0%,
-  100% {
-    transform: translate(0, 0) scale(1);
-  }
-  33% {
-    transform: translate(30px, -20px) scale(1.05);
-  }
-  66% {
-    transform: translate(-20px, 15px) scale(0.97);
-  }
 }
 /* ── Init overlay ── */
@@ -94,9 +81,7 @@ body {
   align-items: center;
   justify-content: center;
   background: var(--bg);
-  transition:
-    opacity 0.6s ease,
-    visibility 0.6s ease;
 }
 .init-overlay.hidden {
   opacity: 0;
@@ -119,15 +104,8 @@ body {
   animation: logo-pulse 2s ease-in-out infinite;
 }
 @keyframes logo-pulse {
-  0%,
-  100% {
-    filter: drop-shadow(0 0 12px rgba(34, 211, 238, 0.4));
-    transform: scale(1);
-  }
-  50% {
-    filter: drop-shadow(0 0 24px rgba(129, 140, 248, 0.6));
-    transform: scale(1.06);
-  }
 }
 .init-title {
   font-family: 'Syne', sans-serif;
@@ -159,52 +137,35 @@ body {
   border-bottom: 1px solid var(--border);
   transition: color 0.3s;
 }
-.stage.active {
-  color: var(--accent);
-}
-.stage.done {
-  color: var(--green);
-}
 .stage-dot {
-  width: 8px;
-  height: 8px;
   border-radius: 50%;
   background: var(--text3);
   flex-shrink: 0;
-  transition:
-    background 0.3s,
-    box-shadow 0.3s;
 }
 .stage.active .stage-dot {
   background: var(--accent);
   box-shadow: 0 0 8px var(--accent);
   animation: blink-dot 0.8s ease-in-out infinite;
 }
-.stage.done .stage-dot {
-  background: var(--green);
-}
 @keyframes blink-dot {
-  0%,
-  100% {
-    opacity: 1;
-  }
-  50% {
-    opacity: 0.3;
-  }
 }
 .stage-check {
   margin-left: auto;
   opacity: 0;
   transition: opacity 0.3s;
 }
-.stage.done .stage-check {
-  opacity: 1;
-}
 .stage span {
   flex: 1;
   font-family: 'Hind Siliguri', sans-serif;
 }
 .init-bar-wrap {
   background: var(--bg3);
   border-radius: 99px;
@@ -227,14 +188,24 @@ body {
   font-family: 'JetBrains Mono', monospace;
 }
-/* ── App layout ── */
 .app {
   position: fixed;
   inset: 0;
   z-index: 1;
   display: flex;
   transition: opacity 0.5s ease;
 }
 /* ── Sidebar ── */
 .sidebar {
@@ -245,9 +216,7 @@ body {
   flex-direction: column;
   flex-shrink: 0;
   overflow-y: auto;
-  transition:
-    width var(--transition),
-    transform var(--transition);
   z-index: 10;
 }
 .sidebar.collapsed {
@@ -285,19 +254,14 @@ body {
   color: var(--text);
 }
-.status-panel {
-  padding: 16px;
-}
 .status-row {
   display: flex;
   align-items: center;
   justify-content: space-between;
   padding: 6px 0;
 }
-.status-label {
-  font-size: 12px;
-  color: var(--text2);
-}
 .status-badge {
   font-size: 10px;
   font-family: 'JetBrains Mono', monospace;
@@ -306,28 +270,13 @@ body {
   font-weight: 600;
   letter-spacing: 0.03em;
 }
-.badge-green {
-  background: rgba(74, 222, 128, 0.12);
-  color: var(--green);
-}
-.badge-yellow {
-  background: rgba(251, 191, 36, 0.12);
-  color: var(--yellow);
-}
-.badge-red {
-  background: rgba(248, 113, 113, 0.12);
-  color: var(--red);
-}
-.sidebar-divider {
-  height: 1px;
-  background: var(--border);
-  margin: 4px 0;
-}
-.dash-section {
-  padding: 16px;
-}
 .dash-title {
   font-size: 11px;
   font-weight: 700;
@@ -356,25 +305,16 @@ body {
   line-height: 1;
   margin-bottom: 4px;
 }
-.metric-label {
-  font-size: 10px;
-  color: var(--text3);
-}
-.setting-row {
-  margin-bottom: 14px;
-}
 .setting-row label {
   display: block;
   font-size: 11px;
   color: var(--text2);
   margin-bottom: 6px;
 }
-.slider-wrap {
-  display: flex;
-  align-items: center;
-  gap: 8px;
-}
 .slider-wrap input[type='range'] {
   flex: 1;
   accent-color: var(--accent);
@@ -399,10 +339,7 @@ body {
   font-family: 'Hind Siliguri', sans-serif;
   cursor: pointer;
 }
-.setting-select:focus {
-  outline: none;
-  border-color: var(--accent);
-}
 .queue-vis {
   display: flex;
@@ -416,14 +353,10 @@ body {
   background: var(--accent);
   border-radius: 3px;
   opacity: 0.3;
-  transition:
-    height 0.15s ease,
-    opacity 0.15s ease;
   min-height: 4px;
 }
-.queue-bar.active {
-  opacity: 0.9;
-}
 .queue-label {
   font-size: 11px;
   color: var(--text2);
@@ -449,11 +382,7 @@ body {
   border-bottom: 1px solid var(--border);
   flex-shrink: 0;
 }
-.topbar-left {
-  display: flex;
-  align-items: center;
-  gap: 12px;
-}
 .topbar-center {
   font-family: 'Syne', sans-serif;
   font-weight: 700;
@@ -463,10 +392,7 @@ body {
   left: 50%;
   transform: translateX(-50%);
 }
-.topbar-right {
-  display: flex;
-  gap: 8px;
-}
 .mobile-menu-btn {
   display: none;
   background: none;
@@ -478,41 +404,18 @@ body {
   font-size: 16px;
 }
 .state-dot {
-  width: 8px;
-  height: 8px;
   border-radius: 50%;
   background: var(--green);
   box-shadow: 0 0 6px var(--green);
   flex-shrink: 0;
-  transition:
-    background 0.3s,
-    box-shadow 0.3s;
-}
-.state-dot.listening {
-  background: var(--accent);
-  box-shadow: 0 0 8px var(--accent);
-  animation: blink-dot 0.8s infinite;
-}
-.state-dot.recording {
-  background: var(--red);
-  box-shadow: 0 0 10px var(--red);
-  animation: blink-dot 0.4s infinite;
-}
-.state-dot.processing {
-  background: var(--yellow);
-  box-shadow: 0 0 8px var(--yellow);
-  animation: blink-dot 1s infinite;
-}
-.state-dot.speaking {
-  background: var(--accent2);
-  box-shadow: 0 0 10px var(--accent2);
-  animation: blink-dot 0.6s infinite;
-}
-#state-label {
-  font-size: 13px;
-  color: var(--text2);
-  font-family: 'JetBrains Mono', monospace;
 }
 .clear-btn {
   background: none;
@@ -525,10 +428,7 @@ body {
   font-family: 'Syne', sans-serif;
   transition: all var(--transition);
 }
-.clear-btn:hover {
-  border-color: var(--accent);
-  color: var(--accent);
-}
 /* ── Chat ── */
 #chat-box {
@@ -540,16 +440,9 @@ body {
   gap: 12px;
   scroll-behavior: smooth;
 }
-#chat-box::-webkit-scrollbar {
-  width: 4px;
-}
-#chat-box::-webkit-scrollbar-track {
-  background: transparent;
-}
-#chat-box::-webkit-scrollbar-thumb {
-  background: var(--border2);
-  border-radius: 99px;
-}
 .message {
   max-width: 75%;
@@ -563,14 +456,8 @@ body {
   font-family: 'Hind Siliguri', sans-serif;
 }
 @keyframes msg-in {
-  from {
-    opacity: 0;
-    transform: translateY(10px) scale(0.97);
-  }
-  to {
-    opacity: 1;
-    transform: translateY(0) scale(1);
-  }
 }
 .message.user {
   background: var(--user-bg);
@@ -592,17 +479,9 @@ body {
   align-self: center;
   max-width: 90%;
 }
-.message ul,
-.message ol {
-  padding-left: 20px;
-  margin: 8px 0;
-}
-.message li {
-  margin-bottom: 4px;
-}
-.message p {
-  margin: 6px 0;
-}
 .message code {
   background: rgba(0, 0, 0, 0.3);
   border-radius: 4px;
@@ -629,9 +508,7 @@ body {
   transition: height 0.3s ease;
   padding: 0 20px;
 }
-.voice-visualizer.active {
-  height: 56px;
-}
 .viz-bar {
   width: 4px;
   border-radius: 99px;
@@ -652,7 +529,10 @@ body {
   display: flex;
   gap: 10px;
   margin-bottom: 12px;
 }
 #text-input {
   flex: 1;
   background: var(--bg3);
@@ -664,13 +544,21 @@ body {
   font-family: 'Hind Siliguri', sans-serif;
   outline: none;
   transition: border-color var(--transition);
-}
-#text-input::placeholder {
-  color: var(--text3);
-}
-#text-input:focus {
-  border-color: var(--accent);
-}
 #send-btn {
   background: linear-gradient(135deg, var(--accent), var(--accent2));
@@ -681,21 +569,15 @@ body {
   color: #000;
   display: flex;
   align-items: center;
-  transition:
-    opacity var(--transition),
-    transform 0.1s;
-}
-#send-btn:hover {
-  opacity: 0.88;
-}
-#send-btn:active {
-  transform: scale(0.95);
 }
-.voice-row {
-  display: flex;
-  gap: 10px;
-}
 .mic-btn {
   flex: 1;
   display: flex;
@@ -722,41 +604,25 @@ body {
   opacity: 0;
   transition: opacity var(--transition);
 }
-.mic-btn:hover::before {
-  opacity: 0.08;
-}
 .mic-btn.mic-listening {
   border-color: var(--accent);
-  box-shadow:
-    0 0 0 2px rgba(34, 211, 238, 0.2),
-    inset 0 0 20px rgba(34, 211, 238, 0.05);
 }
 .mic-btn.mic-recording {
   border-color: var(--red);
   animation: pulse-red 0.8s ease-in-out infinite;
 }
 @keyframes pulse-red {
-  0%,
-  100% {
-    box-shadow: 0 0 0 0 rgba(248, 113, 113, 0.4);
-  }
-  50% {
-    box-shadow: 0 0 0 8px rgba(248, 113, 113, 0);
-  }
 }
 .mic-btn.mic-processing {
   border-color: var(--yellow);
   box-shadow: 0 0 0 2px rgba(251, 191, 36, 0.15);
 }
-.mic-icon {
-  font-size: 18px;
-  position: relative;
-  z-index: 1;
-}
-.mic-label {
-  position: relative;
-  z-index: 1;
-}
 .stop-btn {
   background: rgba(248, 113, 113, 0.1);
@@ -772,49 +638,26 @@ body {
   gap: 6px;
   transition: all var(--transition);
 }
-.stop-btn:hover {
-  background: rgba(248, 113, 113, 0.2);
-  border-color: var(--red);
-}
-.stop-btn:active {
-  transform: scale(0.95);
-}
-/* ── Scrollbar ── */
-.sidebar::-webkit-scrollbar {
-  width: 4px;
-}
-.sidebar::-webkit-scrollbar-track {
-  background: transparent;
-}
-.sidebar::-webkit-scrollbar-thumb {
-  background: var(--border);
-  border-radius: 99px;
-}
 /* ── Responsive ── */
 @media (max-width: 680px) {
   .sidebar {
     position: fixed;
-    left: 0;
-    top: 0;
-    bottom: 0;
     transform: translateX(-100%);
     z-index: 100;
   }
-  .sidebar.mobile-open {
-    transform: translateX(0);
-  }
-  .mobile-menu-btn {
-    display: flex;
-  }
-  .topbar-center {
-    font-size: 13px;
-  }
-  .message {
-    max-width: 90%;
-    font-size: 14px;
-  }
 }
 /* ── Thinking bubble (animated "..." while AI processes) ── */
@@ -831,8 +674,7 @@ body {
 }
 .message.thinking .dot {
   display: inline-block;
-  width: 7px;
-  height: 7px;
   border-radius: 50%;
   background: var(--accent2);
   opacity: 0.4;

   animation: orb-float 12s ease-in-out infinite;
 }
 .orb-1 {
+  width: 500px; height: 500px;
   background: radial-gradient(circle, #22d3ee, transparent);
+  top: -200px; left: -150px;
   animation-delay: 0s;
 }
 .orb-2 {
+  width: 400px; height: 400px;
   background: radial-gradient(circle, #818cf8, transparent);
+  bottom: -100px; right: -100px;
   animation-delay: -4s;
 }
 .orb-3 {
+  width: 300px; height: 300px;
   background: radial-gradient(circle, #f472b6, transparent);
+  top: 50%; left: 50%;
   transform: translate(-50%, -50%);
   animation-delay: -8s;
 }
 @keyframes orb-float {
+  0%, 100% { transform: translate(0, 0) scale(1); }
+  33%       { transform: translate(30px, -20px) scale(1.05); }
+  66%       { transform: translate(-20px, 15px) scale(0.97); }
 }
 /* ── Init overlay ── */
   align-items: center;
   justify-content: center;
   background: var(--bg);
+  transition: opacity 0.6s ease, visibility 0.6s ease;
 }
 .init-overlay.hidden {
   opacity: 0;
   animation: logo-pulse 2s ease-in-out infinite;
 }
 @keyframes logo-pulse {
+  0%, 100% { filter: drop-shadow(0 0 12px rgba(34, 211, 238, 0.4)); transform: scale(1); }
+  50%       { filter: drop-shadow(0 0 24px rgba(129, 140, 248, 0.6)); transform: scale(1.06); }
 }
 .init-title {
   font-family: 'Syne', sans-serif;
   border-bottom: 1px solid var(--border);
   transition: color 0.3s;
 }
+.stage.active { color: var(--accent); }
+.stage.done   { color: var(--green); }
 .stage-dot {
+  width: 8px; height: 8px;
   border-radius: 50%;
   background: var(--text3);
   flex-shrink: 0;
+  transition: background 0.3s, box-shadow 0.3s;
 }
 .stage.active .stage-dot {
   background: var(--accent);
   box-shadow: 0 0 8px var(--accent);
   animation: blink-dot 0.8s ease-in-out infinite;
 }
+.stage.done .stage-dot { background: var(--green); }
 @keyframes blink-dot {
+  0%, 100% { opacity: 1; }
+  50%       { opacity: 0.3; }
 }
 .stage-check {
   margin-left: auto;
   opacity: 0;
   transition: opacity 0.3s;
 }
+.stage.done .stage-check { opacity: 1; }
 .stage span {
   flex: 1;
   font-family: 'Hind Siliguri', sans-serif;
 }
 .init-bar-wrap {
   background: var(--bg3);
   border-radius: 99px;
   font-family: 'JetBrains Mono', monospace;
 }
+/* ── App layout ──
+   FIX-7: App is hidden by default via opacity/pointer-events.
+   JS adds class .visible after init overlay closes. This prevents
+   any flash of unstyled content (FOUC) during JS execution.
+── */
 .app {
   position: fixed;
   inset: 0;
   z-index: 1;
   display: flex;
+  opacity: 0;
+  pointer-events: none;
   transition: opacity 0.5s ease;
 }
+.app.visible {
+  opacity: 1;
+  pointer-events: auto;
+}
 /* ── Sidebar ── */
 .sidebar {
   flex-direction: column;
   flex-shrink: 0;
   overflow-y: auto;
+  transition: width var(--transition), transform var(--transition);
   z-index: 10;
 }
 .sidebar.collapsed {
   color: var(--text);
 }
+.status-panel { padding: 16px; }
 .status-row {
   display: flex;
   align-items: center;
   justify-content: space-between;
   padding: 6px 0;
 }
+.status-label { font-size: 12px; color: var(--text2); }
 .status-badge {
   font-size: 10px;
   font-family: 'JetBrains Mono', monospace;
   font-weight: 600;
   letter-spacing: 0.03em;
 }
+.badge-green  { background: rgba(74, 222, 128, 0.12); color: var(--green); }
+.badge-yellow { background: rgba(251, 191, 36, 0.12); color: var(--yellow); }
+.badge-red    { background: rgba(248, 113, 113, 0.12); color: var(--red); }
+.sidebar-divider { height: 1px; background: var(--border); margin: 4px 0; }
+.dash-section { padding: 16px; }
 .dash-title {
   font-size: 11px;
   font-weight: 700;
   line-height: 1;
   margin-bottom: 4px;
 }
+.metric-label { font-size: 10px; color: var(--text3); }
+.setting-row { margin-bottom: 14px; }
 .setting-row label {
   display: block;
   font-size: 11px;
   color: var(--text2);
   margin-bottom: 6px;
 }
+.slider-wrap { display: flex; align-items: center; gap: 8px; }
 .slider-wrap input[type='range'] {
   flex: 1;
   accent-color: var(--accent);
   font-family: 'Hind Siliguri', sans-serif;
   cursor: pointer;
 }
+.setting-select:focus { outline: none; border-color: var(--accent); }
 .queue-vis {
   display: flex;
   background: var(--accent);
   border-radius: 3px;
   opacity: 0.3;
+  transition: height 0.15s ease, opacity 0.15s ease;
   min-height: 4px;
 }
+.queue-bar.active { opacity: 0.9; }
 .queue-label {
   font-size: 11px;
   color: var(--text2);
   border-bottom: 1px solid var(--border);
   flex-shrink: 0;
 }
+.topbar-left  { display: flex; align-items: center; gap: 12px; }
 .topbar-center {
   font-family: 'Syne', sans-serif;
   font-weight: 700;
   left: 50%;
   transform: translateX(-50%);
 }
+.topbar-right { display: flex; gap: 8px; }
 .mobile-menu-btn {
   display: none;
   background: none;
   font-size: 16px;
 }
 .state-dot {
+  width: 8px; height: 8px;
   border-radius: 50%;
   background: var(--green);
   box-shadow: 0 0 6px var(--green);
   flex-shrink: 0;
+  transition: background 0.3s, box-shadow 0.3s;
 }
+.state-dot.listening  { background: var(--accent);  box-shadow: 0 0 8px var(--accent);  animation: blink-dot 0.8s infinite; }
+.state-dot.recording  { background: var(--red);     box-shadow: 0 0 10px var(--red);    animation: blink-dot 0.4s infinite; }
+.state-dot.processing { background: var(--yellow);  box-shadow: 0 0 8px var(--yellow);  animation: blink-dot 1s infinite; }
+.state-dot.speaking   { background: var(--accent2); box-shadow: 0 0 10px var(--accent2);animation: blink-dot 0.6s infinite; }
+#state-label { font-size: 13px; color: var(--text2); font-family: 'JetBrains Mono', monospace; }
 .clear-btn {
   background: none;
   font-family: 'Syne', sans-serif;
   transition: all var(--transition);
 }
+.clear-btn:hover { border-color: var(--accent); color: var(--accent); }
 /* ── Chat ── */
 #chat-box {
   gap: 12px;
   scroll-behavior: smooth;
 }
+#chat-box::-webkit-scrollbar { width: 4px; }
+#chat-box::-webkit-scrollbar-track { background: transparent; }
+#chat-box::-webkit-scrollbar-thumb { background: var(--border2); border-radius: 99px; }
 .message {
   max-width: 75%;
   font-family: 'Hind Siliguri', sans-serif;
 }
 @keyframes msg-in {
+  from { opacity: 0; transform: translateY(10px) scale(0.97); }
+  to   { opacity: 1; transform: translateY(0) scale(1); }
 }
 .message.user {
   background: var(--user-bg);
   align-self: center;
   max-width: 90%;
 }
+.message ul, .message ol { padding-left: 20px; margin: 8px 0; }
+.message li  { margin-bottom: 4px; }
+.message p   { margin: 6px 0; }
 .message code {
   background: rgba(0, 0, 0, 0.3);
   border-radius: 4px;
   transition: height 0.3s ease;
   padding: 0 20px;
 }
+.voice-visualizer.active { height: 56px; }
 .viz-bar {
   width: 4px;
   border-radius: 99px;
   display: flex;
   gap: 10px;
   margin-bottom: 12px;
+  align-items: flex-end; /* FIX-4: align send button to bottom when textarea grows */
 }
+/* ── FIX-4: Auto-growing textarea replaces <input type="text"> ── */
 #text-input {
   flex: 1;
   background: var(--bg3);
   font-family: 'Hind Siliguri', sans-serif;
   outline: none;
   transition: border-color var(--transition);
+  resize: none;           /* FIX-4: no manual resize handle */
+  overflow-y: hidden;     /* hidden until 10 lines exceeded; JS manages */
+  line-height: 1.57;      /* ~22px per line at font-size 14px */
+  min-height: 44px;       /* single line min */
+  max-height: 226px;      /* 10 lines × 22px + 16px padding */
+  display: block;
+  /* smooth height animation */
+  transition: border-color var(--transition), height 0.1s ease;
+}
+#text-input::placeholder { color: var(--text3); }
+#text-input:focus { border-color: var(--accent); }
+/* Custom scrollbar inside textarea once > 10 lines */
+#text-input::-webkit-scrollbar { width: 4px; }
+#text-input::-webkit-scrollbar-track { background: transparent; }
+#text-input::-webkit-scrollbar-thumb { background: var(--border2); border-radius: 99px; }
 #send-btn {
   background: linear-gradient(135deg, var(--accent), var(--accent2));
   color: #000;
   display: flex;
   align-items: center;
+  transition: opacity var(--transition), transform 0.1s;
+  flex-shrink: 0;
+  align-self: flex-end; /* FIX-4: stays at bottom as textarea grows */
+  height: 44px;
 }
+#send-btn:hover  { opacity: 0.88; }
+#send-btn:active { transform: scale(0.95); }
+.voice-row { display: flex; gap: 10px; }
 .mic-btn {
   flex: 1;
   display: flex;
   opacity: 0;
   transition: opacity var(--transition);
 }
+.mic-btn:hover::before { opacity: 0.08; }
 .mic-btn.mic-listening {
   border-color: var(--accent);
+  box-shadow: 0 0 0 2px rgba(34, 211, 238, 0.2), inset 0 0 20px rgba(34, 211, 238, 0.05);
 }
 .mic-btn.mic-recording {
   border-color: var(--red);
   animation: pulse-red 0.8s ease-in-out infinite;
 }
 @keyframes pulse-red {
+  0%, 100% { box-shadow: 0 0 0 0 rgba(248, 113, 113, 0.4); }
+  50%       { box-shadow: 0 0 0 8px rgba(248, 113, 113, 0); }
 }
 .mic-btn.mic-processing {
   border-color: var(--yellow);
   box-shadow: 0 0 0 2px rgba(251, 191, 36, 0.15);
 }
+.mic-icon  { font-size: 18px; position: relative; z-index: 1; }
+.mic-label { position: relative; z-index: 1; }
 .stop-btn {
   background: rgba(248, 113, 113, 0.1);
   gap: 6px;
   transition: all var(--transition);
 }
+.stop-btn:hover  { background: rgba(248, 113, 113, 0.2); border-color: var(--red); }
+.stop-btn:active { transform: scale(0.95); }
+/* ── Scrollbar (sidebar) ── */
+.sidebar::-webkit-scrollbar { width: 4px; }
+.sidebar::-webkit-scrollbar-track { background: transparent; }
+.sidebar::-webkit-scrollbar-thumb { background: var(--border); border-radius: 99px; }
 /* ── Responsive ── */
 @media (max-width: 680px) {
   .sidebar {
     position: fixed;
+    left: 0; top: 0; bottom: 0;
     transform: translateX(-100%);
     z-index: 100;
   }
+  .sidebar.mobile-open { transform: translateX(0); }
+  .mobile-menu-btn { display: flex; }
+  .topbar-center { font-size: 13px; }
+  .message { max-width: 90%; font-size: 14px; }
 }
 /* ── Thinking bubble (animated "..." while AI processes) ── */
 }
 .message.thinking .dot {
   display: inline-block;
+  width: 7px; height: 7px;
   border-radius: 50%;
   background: var(--accent2);
   opacity: 0.4;

services/streaming.py CHANGED Viewed

@@ -1,6 +1,24 @@
 """
 services/streaming.py — Production-grade parallel TTS streamer
-(unchanged from original — architecture is correct)
 """
 from __future__ import annotations
@@ -12,25 +30,27 @@ from typing import AsyncGenerator
 from services.tts import text_to_speech_stream, USE_ELEVENLABS, EDGE_VOICE
-if USE_ELEVENLABS:
-    FIRST_FLUSH_BOUNDARY_MIN      = 5
-    FIRST_FLUSH_HARD              = 25
-    SUBSEQUENT_FLUSH_BOUNDARY_MIN = 22
-    SUBSEQUENT_FLUSH_HARD         = 65
-    _backend_label = "ElevenLabs"
-else:
-    FIRST_FLUSH_BOUNDARY_MIN      = 5
-    FIRST_FLUSH_HARD              = 25
-    SUBSEQUENT_FLUSH_BOUNDARY_MIN = 18
-    SUBSEQUENT_FLUSH_HARD         = 65
-    _backend_label = "Edge-TTS"
-print(f"[Streamer] TTS backend: {_backend_label}")
-MIN_CHARS = 2
 SENTENCE_BOUNDARIES = frozenset(".!?।॥\n")
 CLAUSE_BOUNDARIES   = frozenset(",;:—–")
-_SENTINEL = object()
 def _clean_for_tts(text: str) -> str:
@@ -47,15 +67,28 @@ def _should_flush(buffer: str, first_chunk: bool) -> bool:
     n = len(buffer)
     if n == 0:
         return False
-    boundary_min = FIRST_FLUSH_BOUNDARY_MIN if first_chunk else SUBSEQUENT_FLUSH_BOUNDARY_MIN
-    hard_limit   = FIRST_FLUSH_HARD         if first_chunk else SUBSEQUENT_FLUSH_HARD
     if n >= hard_limit:
         return True
     last_char = buffer[-1]
-    if last_char in SENTENCE_BOUNDARIES and n >= boundary_min:
         return True
-    if last_char in CLAUSE_BOUNDARIES and n >= hard_limit * 0.75:
         return True
     return False
@@ -92,8 +125,9 @@ class ParallelTTSStreamer:
     async def _schedule_chunk(self) -> None:
         if self._cancelled:
-            self.buffer = ""; return
-        text = _clean_for_tts(self.buffer.strip())
         self.buffer = ""
         if len(text) < MIN_CHARS:
             return
@@ -104,11 +138,14 @@ class ParallelTTSStreamer:
             self._slot_added.set()
         task = asyncio.create_task(self._synthesise(text, slot))
         self._tasks.append(task)
-        task.add_done_callback(lambda t: self._tasks.remove(t) if t in self._tasks else None)
     async def _synthesise(self, text: str, slot: _AudioSlot) -> None:
         if self._cancelled:
-            slot.mark_error(); return
         try:
             async for chunk in text_to_speech_stream(text, voice=self.voice):
                 if self._cancelled:
@@ -128,40 +165,69 @@ class ParallelTTSStreamer:
     async def cancel(self) -> None:
         self._cancelled = True
-        tasks = list(self._tasks); self._tasks.clear()
-        for t in tasks: t.cancel()
         if tasks:
             await asyncio.gather(*tasks, return_exceptions=True)
         async with self._slots_lock:
             for slot in self._slots:
-                if not slot.done: slot.mark_error()
         self._llm_done.set()
         self._slot_added.set()
     async def stream_audio(self) -> AsyncGenerator[bytes, None]:
         delivered = 0
         while True:
             async with self._slots_lock:
                 slot = self._slots[delivered] if delivered < len(self._slots) else None
             if slot is None:
                 if self._llm_done.is_set():
                     async with self._slots_lock:
                         total = len(self._slots)
                     if delivered >= total:
-                        break
                 self._slot_added.clear()
                 try:
                     await asyncio.wait_for(self._slot_added.wait(), timeout=10.0)
                 except asyncio.TimeoutError:
-                    print("[Streamer] Timeout waiting for TTS slot."); break
                 continue
             while True:
                 item = await slot.queue.get()
-                if item is _SENTINEL: break
-                if not self._cancelled: yield item
             delivered += 1
     def reset(self) -> None:
-        self._cancelled = False; self._first_chunk = True; self.buffer = ""
-        self._slot_index = 0; self._slots.clear(); self._tasks.clear()
-        self._llm_done.clear(); self._slot_added.clear()

 """
 services/streaming.py — Production-grade parallel TTS streamer
+FIX-ISSUE4 (Natural, slow, small-chunk TTS):
+  The previous code used character-count thresholds that produced large
+  sentence-level chunks (25–65 chars), causing buffered, robotic-feeling
+  speech with a burst of audio at once.
+  New behaviour:
+    • Flush at word boundaries (2–3 words) for voice-like pacing.
+    • Flush threshold is 10 chars for the first chunk and 18 chars for
+      subsequent chunks — roughly 2–4 average Bengali/English words.
+    • Hard limit of 30 chars (first chunk) / 40 chars (subsequent chunks)
+      ensures no chunk ever gets too large.
+    • Sentence-ending punctuation (।.!?) flushes as soon as the minimum
+      flush length is reached, giving natural pause points.
+    • The TTS rate is set to "-35%" in tts.py (slightly slower than before).
+  Result: audio arrives in small, fast, overlapping synthesis tasks,
+  giving a low-latency, smooth, natural speech feel.
+FIX-BUG5 (TOCTOU race in stream_audio) — preserved from previous version.
 """
 from __future__ import annotations
 from services.tts import text_to_speech_stream, USE_ELEVENLABS, EDGE_VOICE
+# ── Chunk size tuning ──────────────────────────────────────────────────────────
+# These character counts correspond roughly to:
+#   FIRST_FLUSH_MIN        ~2 words    (get audio playing ASAP)
+#   SUBSEQUENT_FLUSH_MIN   ~3–4 words  (natural conversational phrase)
+#   FIRST_FLUSH_HARD       ~5–6 words  (never accumulate more than this in the first chunk)
+#   SUBSEQUENT_FLUSH_HARD  ~7–8 words  (never accumulate more than this afterwards)
+#
+# At average Bengali word length ~4–5 chars + space:
+#   10 chars ≈ 2 words, 18 chars ≈ 3–4 words, 30 chars ≈ 5–6 words, 40 chars ≈ 7–8 words
+FIRST_FLUSH_MIN        = 10
+FIRST_FLUSH_HARD       = 30
+SUBSEQUENT_FLUSH_MIN   = 18
+SUBSEQUENT_FLUSH_HARD  = 40
+_backend_label = "ElevenLabs" if USE_ELEVENLABS else "Edge-TTS"
+print(f"[Streamer] TTS backend: {_backend_label} | chunk: {SUBSEQUENT_FLUSH_MIN}–{SUBSEQUENT_FLUSH_HARD} chars")
+MIN_CHARS           = 2
 SENTENCE_BOUNDARIES = frozenset(".!?।॥\n")
 CLAUSE_BOUNDARIES   = frozenset(",;:—–")
+_SENTINEL           = object()
 def _clean_for_tts(text: str) -> str:
     n = len(buffer)
     if n == 0:
         return False
+    flush_min  = FIRST_FLUSH_MIN  if first_chunk else SUBSEQUENT_FLUSH_MIN
+    hard_limit = FIRST_FLUSH_HARD if first_chunk else SUBSEQUENT_FLUSH_HARD
+    # Hard limit — always flush regardless of boundary
     if n >= hard_limit:
         return True
     last_char = buffer[-1]
+    # Sentence ending — flush once the minimum length is reached (natural pause point)
+    if last_char in SENTENCE_BOUNDARIES and n >= flush_min:
+        return True
+    # Clause boundary — flush at 70% of hard limit
+    if last_char in CLAUSE_BOUNDARIES and n >= hard_limit * 0.70:
         return True
+    # Word boundary (space after minimum words reached)
+    if last_char == ' ' and n >= flush_min:
         return True
     return False
     async def _schedule_chunk(self) -> None:
         if self._cancelled:
+            self.buffer = ""
+            return
+        text        = _clean_for_tts(self.buffer.strip())
         self.buffer = ""
         if len(text) < MIN_CHARS:
             return
             self._slot_added.set()
         task = asyncio.create_task(self._synthesise(text, slot))
         self._tasks.append(task)
+        task.add_done_callback(
+            lambda t: self._tasks.remove(t) if t in self._tasks else None
+        )
     async def _synthesise(self, text: str, slot: _AudioSlot) -> None:
         if self._cancelled:
+            slot.mark_error()
+            return
         try:
             async for chunk in text_to_speech_stream(text, voice=self.voice):
                 if self._cancelled:
     async def cancel(self) -> None:
         self._cancelled = True
+        tasks = list(self._tasks)
+        self._tasks.clear()
+        for t in tasks:
+            t.cancel()
         if tasks:
             await asyncio.gather(*tasks, return_exceptions=True)
         async with self._slots_lock:
             for slot in self._slots:
+                if not slot.done:
+                    slot.mark_error()
         self._llm_done.set()
         self._slot_added.set()
     async def stream_audio(self) -> AsyncGenerator[bytes, None]:
+        """
+        Deliver TTS audio chunks in slot order.
+        FIX-BUG5 — double-check pattern eliminates TOCTOU race:
+          1. clear() the event
+          2. Re-check slot list under lock (slot may have been added between
+             previous check and clear())
+          3. Only then wait() — so we never miss a newly-added slot
+        """
         delivered = 0
         while True:
             async with self._slots_lock:
                 slot = self._slots[delivered] if delivered < len(self._slots) else None
             if slot is None:
                 if self._llm_done.is_set():
                     async with self._slots_lock:
                         total = len(self._slots)
                     if delivered >= total:
+                        break  # All slots consumed; done.
+                # FIX-BUG5: clear → re-check → wait
                 self._slot_added.clear()
+                async with self._slots_lock:
+                    have_new = delivered < len(self._slots)
+                if have_new:
+                    continue
                 try:
                     await asyncio.wait_for(self._slot_added.wait(), timeout=10.0)
                 except asyncio.TimeoutError:
+                    print("[Streamer] Timeout waiting for TTS slot.")
+                    break
                 continue
+            # Drain this slot's audio queue in order
             while True:
                 item = await slot.queue.get()
+                if item is _SENTINEL:
+                    break
+                if not self._cancelled:
+                    yield item
             delivered += 1
     def reset(self) -> None:
+        self._cancelled   = False
+        self._first_chunk = True
+        self.buffer       = ""
+        self._slot_index  = 0
+        self._slots.clear()
+        self._tasks.clear()
+        self._llm_done.clear()
+        self._slot_added.clear()

services/stt.py CHANGED Viewed

@@ -12,9 +12,18 @@ Architecture:
 • GPU inference runs in a dedicated single-thread Executor (serialize GPU)
 • Bangla-optimised decode parameters preserved from original
 Latency profile:
   ffmpeg (parallel)     ~30–80 ms
-  batch wait window     ~50 ms
   GPU inference         ~80–150 ms per batch (amortised across requests)
   Total perceived       < 200 ms at moderate load
 """
@@ -37,7 +46,7 @@ from typing import Optional
 from faster_whisper import WhisperModel
 # ── Bangla script patterns ─────────────────────────────────────────────────────
-_BANGLA_RE      = re.compile(r"[\u0980-\u09FF]")
 _WRONG_SCRIPT_RE = re.compile(
     r"[\u0600-\u06FF\u0750-\u077F\uFB50-\uFDFF\uFE70-\uFEFF]"
 )
@@ -46,25 +55,27 @@ _WRONG_SCRIPT_RE = re.compile(
 _BANGLA_SEED = "আমি আপনার সাথে বাংলায় কথা বলছি।"
 # ── Configuration ──────────────────────────────────────────────────────────────
-_STT_MODEL     = os.getenv("STT_MODEL", "large-v3")
-_COMPUTE_TYPE  = os.getenv("STT_COMPUTE_TYPE", "int8_float32")
-_BATCH_WINDOW  = float(os.getenv("STT_BATCH_WINDOW_MS", "50")) / 1000  # seconds
-_MAX_BATCH     = int(os.getenv("STT_MAX_BATCH", "8"))
 MAX_INPUT_BYTES = 5_242_880  # 5 MB
 # ── Singleton model state ──────────────────────────────────────────────────────
 _model: Optional[WhisperModel] = None
 _model_lock    = threading.Lock()
 _model_ready   = threading.Event()
 # Two executors: one for ffmpeg (I/O, can be parallel), one for GPU (serial)
-_ffmpeg_pool   = ThreadPoolExecutor(max_workers=4, thread_name_prefix="ffmpeg")
-_gpu_pool      = ThreadPoolExecutor(max_workers=1, thread_name_prefix="whisper-gpu")
 # ── Model loader (background thread) ──────────────────────────────────────────
 def _load_and_warm() -> None:
-    global _model
     try:
         print(f"[STT] Loading Faster-Whisper {_STT_MODEL} on CUDA ({_COMPUTE_TYPE})…")
         m = WhisperModel(
@@ -80,16 +91,19 @@ def _load_and_warm() -> None:
         with _model_lock:
             _model = m
     except Exception as exc:
-        print(f"[STT] Model load failed: {exc}")
     finally:
         _model_ready.set()
 def _make_silence_wav(duration_s: float = 0.5, sr: int = 16_000) -> io.BytesIO:
     buf = io.BytesIO()
-    n = int(sr * duration_s)
     with wave.open(buf, "wb") as wf:
-        wf.setnchannels(1); wf.setsampwidth(2); wf.setframerate(sr)
         wf.writeframes(struct.pack(f"<{n}h", *([0] * n)))
     buf.seek(0)
     return buf
@@ -105,7 +119,8 @@ def _to_wav_sync(audio_bytes: bytes) -> Optional[str]:
     in_path = out_path = None
     try:
         with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as f:
-            f.write(audio_bytes); in_path = f.name
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
             out_path = f.name
@@ -135,8 +150,10 @@ def _to_wav_sync(audio_bytes: bytes) -> Optional[str]:
         return None
     finally:
         if in_path and os.path.exists(in_path):
-            try: os.remove(in_path)
-            except OSError: pass
 # ── Whisper inference (sync, runs in _gpu_pool — ONE AT A TIME) ───────────────
@@ -144,8 +161,6 @@ def _transcribe_batch_sync(wav_paths: list[str]) -> list[Optional[str]]:
     """
     Run Whisper inference on a list of WAV paths.
     Returns a list of transcripts (None on error/empty).
-    Each file is processed sequentially on the same GPU — this is intentional:
-    batching here means we avoid per-request CUDA kernel spin-up overhead.
     """
     with _model_lock:
         model = _model
@@ -175,8 +190,10 @@ def _transcribe_batch_sync(wav_paths: list[str]) -> list[Optional[str]]:
             print(f"[STT] inference error: {exc}")
             results.append(None)
         finally:
-            try: os.remove(path)
-            except OSError: pass
     return results
@@ -185,7 +202,7 @@ def _transcribe_batch_sync(wav_paths: list[str]) -> list[Optional[str]]:
 def _validate(text: str) -> Optional[str]:
     if not text or not text.strip():
         return None
-    text = text.strip()
     words = text.split()
     if len(words) >= 6 and len(set(words)) / len(words) < 0.25:
         print(f"[STT] rejected repetition: {text[:60]}")
@@ -193,8 +210,8 @@ def _validate(text: str) -> Optional[str]:
     if len(words) == 2 and words[0] == words[1]:
         return None
     # Soft script check — log but keep
-    wrong  = len(_WRONG_SCRIPT_RE.findall(text))
-    alpha  = sum(1 for c in text if c.isalpha())
     if alpha > 0 and wrong / alpha > 0.30:
         print(f"[STT] non-Bangla (kept): {text[:60]}")
     return text
@@ -217,26 +234,42 @@ class _STTBatchWorker:
     2. Collects requests for up to BATCH_WINDOW_MS
     3. Dispatches the batch to _gpu_pool in one call
     4. Resolves each caller's Future
     """
     def __init__(self) -> None:
-        self._queue: asyncio.Queue[_STTRequest] = asyncio.Queue()
-        self._started = False
-    def _ensure_started(self) -> None:
-        if not self._started:
-            self._started = True
-            asyncio.ensure_future(self._worker_loop())
     async def enqueue(self, wav_path: str) -> Optional[str]:
-        self._ensure_started()
-        loop = asyncio.get_event_loop()
         req  = _STTRequest(wav_path=wav_path, future=loop.create_future())
         await self._queue.put(req)
         return await req.future
     async def _worker_loop(self) -> None:
-        loop = asyncio.get_event_loop()
         while True:
             # Wait for at least one request
             first = await self._queue.get()
@@ -281,7 +314,7 @@ _batch_worker = _STTBatchWorker()
 class STTProcessor:
     """
     Drop-in replacement for the original STTProcessor.
-    Now routes through the GPU batch worker for shared inference.
     """
     async def transcribe(self, audio_bytes: bytes) -> Optional[str]:
@@ -294,13 +327,19 @@ class STTProcessor:
         if len(audio_bytes) > MAX_INPUT_BYTES:
             audio_bytes = audio_bytes[:MAX_INPUT_BYTES]
-        # Wait for model readiness (non-blocking)
         if not _model_ready.is_set():
-            print("[STT] Waiting for model…")
-            await asyncio.to_thread(_model_ready.wait)
         # ffmpeg: runs in parallel I/O pool (not serialised)
-        loop = asyncio.get_event_loop()
         wav_path = await loop.run_in_executor(_ffmpeg_pool, _to_wav_sync, audio_bytes)
         if not wav_path:
             return None

 • GPU inference runs in a dedicated single-thread Executor (serialize GPU)
 • Bangla-optimised decode parameters preserved from original
+FIX-BUG4 (race condition + deprecated API):
+  _STTBatchWorker now uses asyncio.Lock to safely initialise the worker
+  exactly once, even when multiple coroutines call enqueue() concurrently.
+  asyncio.get_event_loop() → asyncio.get_running_loop().
+FIX-BUG6 (blocking wait without timeout):
+  asyncio.to_thread(_model_ready.wait, …) now passes a timeout
+  (STT_MODEL_LOAD_TIMEOUT_S, default 120 s) and raises RuntimeError if the
+  model fails to load in time.
 Latency profile:
   ffmpeg (parallel)     ~30–80 ms
+  batch wait window     ~30 ms   (reduced from 50ms)
   GPU inference         ~80–150 ms per batch (amortised across requests)
   Total perceived       < 200 ms at moderate load
 """
 from faster_whisper import WhisperModel
 # ── Bangla script patterns ─────────────────────────────────────────────────────
+_BANGLA_RE       = re.compile(r"[\u0980-\u09FF]")
 _WRONG_SCRIPT_RE = re.compile(
     r"[\u0600-\u06FF\u0750-\u077F\uFB50-\uFDFF\uFE70-\uFEFF]"
 )
 _BANGLA_SEED = "আমি আপনার সাথে বাংলায় কথা বলছি।"
 # ── Configuration ──────────────────────────────────────────────────────────────
+_STT_MODEL      = os.getenv("STT_MODEL",          "large-v3")
+_COMPUTE_TYPE   = os.getenv("STT_COMPUTE_TYPE",   "int8_float32")
+_BATCH_WINDOW   = float(os.getenv("STT_BATCH_WINDOW_MS", "30")) / 1000  # 30ms (was 50ms)
+_MAX_BATCH      = int(os.getenv("STT_MAX_BATCH",  "8"))
+_MODEL_LOAD_TIMEOUT = int(os.getenv("STT_MODEL_LOAD_TIMEOUT_S", "120"))  # seconds
 MAX_INPUT_BYTES = 5_242_880  # 5 MB
 # ── Singleton model state ──────────────────────────────────────────────────────
 _model: Optional[WhisperModel] = None
 _model_lock    = threading.Lock()
 _model_ready   = threading.Event()
+_model_error: Optional[str] = None
 # Two executors: one for ffmpeg (I/O, can be parallel), one for GPU (serial)
+_ffmpeg_pool = ThreadPoolExecutor(max_workers=4, thread_name_prefix="ffmpeg")
+_gpu_pool    = ThreadPoolExecutor(max_workers=1, thread_name_prefix="whisper-gpu")
 # ── Model loader (background thread) ──────────────────────────────────────────
 def _load_and_warm() -> None:
+    global _model, _model_error
     try:
         print(f"[STT] Loading Faster-Whisper {_STT_MODEL} on CUDA ({_COMPUTE_TYPE})…")
         m = WhisperModel(
         with _model_lock:
             _model = m
     except Exception as exc:
+        _model_error = str(exc)
+        print(f"[STT] Model load FAILED: {exc}")
     finally:
         _model_ready.set()
 def _make_silence_wav(duration_s: float = 0.5, sr: int = 16_000) -> io.BytesIO:
     buf = io.BytesIO()
+    n   = int(sr * duration_s)
     with wave.open(buf, "wb") as wf:
+        wf.setnchannels(1)
+        wf.setsampwidth(2)
+        wf.setframerate(sr)
         wf.writeframes(struct.pack(f"<{n}h", *([0] * n)))
     buf.seek(0)
     return buf
     in_path = out_path = None
     try:
         with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as f:
+            f.write(audio_bytes)
+            in_path = f.name
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
             out_path = f.name
         return None
     finally:
         if in_path and os.path.exists(in_path):
+            try:
+                os.remove(in_path)
+            except OSError:
+                pass
 # ── Whisper inference (sync, runs in _gpu_pool — ONE AT A TIME) ───────────────
     """
     Run Whisper inference on a list of WAV paths.
     Returns a list of transcripts (None on error/empty).
     """
     with _model_lock:
         model = _model
             print(f"[STT] inference error: {exc}")
             results.append(None)
         finally:
+            try:
+                os.remove(path)
+            except OSError:
+                pass
     return results
 def _validate(text: str) -> Optional[str]:
     if not text or not text.strip():
         return None
+    text  = text.strip()
     words = text.split()
     if len(words) >= 6 and len(set(words)) / len(words) < 0.25:
         print(f"[STT] rejected repetition: {text[:60]}")
     if len(words) == 2 and words[0] == words[1]:
         return None
     # Soft script check — log but keep
+    wrong = len(_WRONG_SCRIPT_RE.findall(text))
+    alpha = sum(1 for c in text if c.isalpha())
     if alpha > 0 and wrong / alpha > 0.30:
         print(f"[STT] non-Bangla (kept): {text[:60]}")
     return text
     2. Collects requests for up to BATCH_WINDOW_MS
     3. Dispatches the batch to _gpu_pool in one call
     4. Resolves each caller's Future
+    FIX-BUG4 (race condition):
+      Uses asyncio.Lock to guarantee the worker task is created exactly once,
+      even when multiple coroutines call enqueue() before the task starts.
+    FIX-BUG4 (deprecated API):
+      Uses asyncio.get_running_loop() instead of asyncio.get_event_loop().
     """
     def __init__(self) -> None:
+        self._queue:   asyncio.Queue[_STTRequest] = asyncio.Queue()
+        self._started: bool = False
+        self._start_lock: Optional[asyncio.Lock] = None   # created on first use
+    def _get_lock(self) -> asyncio.Lock:
+        # asyncio.Lock must be created inside the running event loop
+        if self._start_lock is None:
+            self._start_lock = asyncio.Lock()
+        return self._start_lock
+    async def _ensure_started(self) -> None:
+        async with self._get_lock():
+            if not self._started:
+                self._started = True
+                asyncio.ensure_future(self._worker_loop())
     async def enqueue(self, wav_path: str) -> Optional[str]:
         """
         Queue one WAV file for transcription and wait for its result.

         Makes sure the worker loop is running, wraps ``wav_path`` together
         with a fresh Future in an ``_STTRequest``, puts it on the queue and
         then awaits that request's Future.
         """
         await self._ensure_started()
         # get_running_loop() is the correct modern API inside a coroutine.
         pending = asyncio.get_running_loop().create_future()
         request = _STTRequest(wav_path=wav_path, future=pending)
         await self._queue.put(request)
         return await request.future
     async def _worker_loop(self) -> None:
+        loop = asyncio.get_running_loop()
         while True:
             # Wait for at least one request
             first = await self._queue.get()
 class STTProcessor:
     """
     Drop-in replacement for the original STTProcessor.
+    Routes through the GPU batch worker for shared inference.
     """
     async def transcribe(self, audio_bytes: bytes) -> Optional[str]:
         if len(audio_bytes) > MAX_INPUT_BYTES:
             audio_bytes = audio_bytes[:MAX_INPUT_BYTES]
+        # FIX-BUG6: wait for model with timeout — not forever
         if not _model_ready.is_set():
+            print("[STT] Waiting for model to load…")
+            ready = await asyncio.to_thread(_model_ready.wait, _MODEL_LOAD_TIMEOUT)
+            if not ready:
+                raise RuntimeError(
+                    f"[STT] Whisper model did not load within {_MODEL_LOAD_TIMEOUT}s"
+                )
+            if _model_error:
+                raise RuntimeError(f"[STT] Whisper model failed to load: {_model_error}")
         # ffmpeg: runs in parallel I/O pool (not serialised)
+        loop     = asyncio.get_running_loop()
         wav_path = await loop.run_in_executor(_ffmpeg_pool, _to_wav_sync, audio_bytes)
         if not wav_path:
             return None

services/tts.py CHANGED Viewed

@@ -1,6 +1,15 @@
 """
 services/tts.py — Ultra Low-Latency Dual TTS Backend
-(unchanged public API — streaming.py imports text_to_speech_stream + USE_ELEVENLABS)
 """
 from dotenv import load_dotenv
@@ -22,18 +31,32 @@ ELEVENLABS_SPEAKER_BOOST = True
 if USE_ELEVENLABS and not ELEVENLABS_API_KEY:
     raise RuntimeError("[TTS] ELEVENLABS_API_KEY missing")
-print(f"[TTS] Backend: {'ElevenLabs' if USE_ELEVENLABS else 'Edge-TTS'}")
 def split_sentences(text: str) -> list[str]:
     text = text.strip()
     if not text:
         return []
-    parts = re.split(r'(?<=[।.!?])\s+', text)
     return [p.strip() for p in parts if len(p.strip()) > 1]
-async def _edge_tts_stream(text: str, voice: str = EDGE_VOICE, rate: str = "-30%"):
     import edge_tts
     text = text.strip()
     if not text:
@@ -61,17 +84,28 @@ async def _elevenlabs_stream(
     text = text.strip()
     if not text:
         return
-    url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream"
-    headers = {"xi-api-key": ELEVENLABS_API_KEY, "Content-Type": "application/json", "Accept": "audio/mpeg"}
     payload = {
-        "text": text, "model_id": model_id,
-        "voice_settings": {"stability": stability, "similarity_boost": similarity,
-                           "style": style, "use_speaker_boost": speaker_boost},
     }
     try:
         async with httpx.AsyncClient(timeout=httpx.Timeout(connect=5.0, read=None)) as client:
-            async with client.stream("POST", url, headers=headers, json=payload,
-                                     params={"output_format": output_format}) as resp:
                 if resp.status_code != 200:
                     print(f"[TTS][ElevenLabs] HTTP {resp.status_code}")
                     return
@@ -83,7 +117,18 @@ async def _elevenlabs_stream(
         print(f"[TTS][ElevenLabs] {exc}")
-async def text_to_speech_stream(text: str, voice: str | None = None, rate: str = "-30%"):
     text = text.strip()
     if not text:
         return
@@ -108,11 +153,11 @@ async def text_to_speech_stream(text: str, voice: str | None = None, rate: str =
         finally:
             await q.put(_SENT)
-    # Create one queue per sentence, launch all synthesis tasks immediately
     queues = [asyncio.Queue() for _ in parts]
     tasks  = [asyncio.create_task(_synth_part(p, q)) for p, q in zip(parts, queues)]
-    # Deliver audio in sentence order, but all sentences synthesise in parallel
     try:
         for q in queues:
             while True:

 """
 services/tts.py — Ultra Low-Latency Dual TTS Backend
+FIX-ISSUE4 (Natural, slow TTS):
+  • Default rate changed from "-30%" to "-35%" — approximately 35% slower
+    than the Edge TTS default, giving a calm, natural speaking pace.
+  • split_sentences() now splits on clause delimiters (commas, semicolons,
+    colons, em/en dashes) in addition to sentence endings, so synthesis tasks are
+    smaller and start sooner. This pairs with streaming.py's 2–3 word
+    flush threshold for maximum low-latency playback.
+  • Parallel synthesis of all parts preserved (all parts synthesised
+    concurrently; delivered in order).
 """
 from dotenv import load_dotenv
 if USE_ELEVENLABS and not ELEVENLABS_API_KEY:
     raise RuntimeError("[TTS] ELEVENLABS_API_KEY missing")
+print(f"[TTS] Backend: {'ElevenLabs' if USE_ELEVENLABS else 'Edge-TTS'} | rate: -35%")
 def split_sentences(text: str) -> list[str]:
     """
     Split text into small synthesis chunks for low-latency streaming.

     Text is split at whitespace that follows a sentence terminator
     (``। . ! ?``) or a clause delimiter (``, ; : — –``); the delimiter stays
     attached to the preceding chunk.

     Args:
         text: Text to split; leading and trailing whitespace is ignored.

     Returns:
         The chunks in original order. One-character fragments are dropped
         only when they are not alphanumeric (stray punctuation), so
         single-letter words and single digits are still synthesised.
         An empty or whitespace-only input returns ``[]``.
     """
     text = text.strip()
     if not text:
         return []
     # The lookbehind keeps the delimiter attached to the preceding chunk.
     parts = re.split(r'(?<=[।.!?,;:—–])\s+', text)
     chunks = (p.strip() for p in parts)
     # A bare length check would also discard real content such as "5" or a
     # one-letter word; only discard one-character punctuation.
     return [c for c in chunks if len(c) > 1 or c.isalnum()]
+async def _edge_tts_stream(text: str, voice: str = EDGE_VOICE, rate: str = "-35%"):
+    """
+    Stream Edge-TTS audio for a single text chunk.
+    FIX-ISSUE4: Default rate is now -35% (was -30%) for slower, natural speech.
+    """
     import edge_tts
     text = text.strip()
     if not text:
     text = text.strip()
     if not text:
         return
+    url     = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream"
+    headers = {
+        "xi-api-key":   ELEVENLABS_API_KEY,
+        "Content-Type": "application/json",
+        "Accept":       "audio/mpeg",
+    }
     payload = {
+        "text":           text,
+        "model_id":       model_id,
+        "voice_settings": {
+            "stability":        stability,
+            "similarity_boost": similarity,
+            "style":            style,
+            "use_speaker_boost": speaker_boost,
+        },
     }
     try:
         async with httpx.AsyncClient(timeout=httpx.Timeout(connect=5.0, read=None)) as client:
+            async with client.stream(
+                "POST", url, headers=headers, json=payload,
+                params={"output_format": output_format}
+            ) as resp:
                 if resp.status_code != 200:
                     print(f"[TTS][ElevenLabs] HTTP {resp.status_code}")
                     return
         print(f"[TTS][ElevenLabs] {exc}")
+async def text_to_speech_stream(
+    text: str,
+    voice: str | None = None,
+    rate: str = "-35%",   # FIX-ISSUE4: -35% default (was -30%)
+):
+    """
+    Stream TTS audio for `text`.
+    Splits text into small clause-level parts, synthesises all in parallel,
+    yields audio in order. This gives the lowest possible first-audio latency
+    while maintaining natural speech ordering.
+    """
     text = text.strip()
     if not text:
         return
         finally:
             await q.put(_SENT)
+    # Create one queue per part, synthesise all in parallel
     queues = [asyncio.Queue() for _ in parts]
     tasks  = [asyncio.create_task(_synth_part(p, q)) for p, q in zip(parts, queues)]
+    # Deliver in part order
     try:
         for q in queues:
             while True:

services/webrtc_pipeline.py CHANGED Viewed

@@ -1,34 +1,15 @@
 """
 services/webrtc_pipeline.py — WebRTC Audio Pipeline + Full Parallelization
-Architecture:
-─────────────
-  Browser MediaStream (WebRTC)
-       │
-       │  RTCPeerConnection (aiortc)
-       ▼
-  PCM frame receiver (20ms frames, 16kHz mono)
-       │
-       │  VAD (webrtcvad) — discard silence, buffer speech
-       ▼
-  Speech segment → STT batch queue ──────────────────────────┐
-                                                              │  parallel
-  STT result → LLM async stream ────────────────────────┐   │
-                                                         │   │
-  LLM tokens → TTS ParallelStreamer ──────────────────┐  │   │
-                                                       │  │   │
-  Audio chunks → RTCPeerConnection data channel ◄──── ┘  │   │
-                                                          └───┘
-                                               (all three run concurrently)
-Key design choices:
-  • aiortc handles WebRTC peer connection & ICE negotiation
-  • PCM frames delivered via asyncio.Queue — never blocks media thread
-  • VAD segments audio before STT — no wasted inference on silence
-  • STT → LLM → TTS pipeline starts as soon as speech ends
-  • Audio response sent back over RTCDataChannel as binary chunks
-  • STT uses the shared GPU batch worker (see stt.py)
-  • Barge-in: new speech cancels the current LLM+TTS pipeline immediately
 """
 from __future__ import annotations
@@ -79,10 +60,10 @@ class _VADSegmenter:
         self.sample_rate   = sample_rate
         self.frame_bytes   = int(sample_rate * frame_ms / 1000) * 2  # 16-bit samples
         self.silence_limit = silence_limit
-        self._vad = webrtcvad.Vad(aggressiveness) if VAD_AVAILABLE else None
-        self._buffer = bytearray()
         self._silence_count = 0
-        self._active = False
     def process_frame(self, pcm_frame: bytes) -> Optional[bytes]:
         """
@@ -90,7 +71,7 @@ class _VADSegmenter:
         Returns a complete utterance bytes object when speech ends, else None.
         """
         if self._vad is None:
-            # No VAD available — buffer everything, flush after 3s
             self._buffer.extend(pcm_frame)
             if len(self._buffer) >= self.sample_rate * 3 * 2:
                 data = bytes(self._buffer)
@@ -108,17 +89,17 @@ class _VADSegmenter:
         if is_speech:
             self._buffer.extend(frame)
-            self._active = True
             self._silence_count = 0
         elif self._active:
             self._buffer.extend(frame)
             self._silence_count += 1
         if self._active and self._silence_count >= self.silence_limit:
-            data = bytes(self._buffer)
             self._buffer.clear()
             self._silence_count = 0
-            self._active = False
             return data
         return None
@@ -133,6 +114,9 @@ if AIORTC_AVAILABLE:
         """
         Wraps an incoming WebRTC audio track.
         Resamples to 16kHz mono PCM and pushes frames into an asyncio.Queue.
         """
         kind = "audio"
@@ -142,23 +126,49 @@ if AIORTC_AVAILABLE:
             self._track       = track
             self._frame_queue = frame_queue
             self._resampler: Optional[av.AudioResampler] = None
-        async def recv(self):
-            frame = await self._track.recv()
-            if self._resampler is None:
-                self._resampler = av.AudioResampler(
-                    format="s16",
-                    layout="mono",
-                    rate=16_000,
-                )
-            resampled = self._resampler.resample(frame)
-            for rf in resampled:
-                pcm = bytes(rf.planes[0])
                 try:
-                    self._frame_queue.put_nowait(pcm)
-                except asyncio.QueueFull:
-                    pass  # drop frame under backpressure — prefer real-time
-            return frame
 # ══════════════════════════════════════════════════════════════════════════════
@@ -172,13 +182,13 @@ class _TurnPipeline:
     """
     def __init__(self, ai_backend, data_channel, on_stt=None, on_token=None):
-        self._ai          = ai_backend
-        self._channel     = data_channel   # RTCDataChannel for audio delivery
-        self._on_stt      = on_stt         # optional callback(str)
-        self._on_token    = on_token       # optional callback(str)
-        self._stt         = STTProcessor()
-        self._streamer    = ParallelTTSStreamer()
-        self._cancelled   = False
         self._tasks: list[asyncio.Task] = []
     async def run(self, user_id: str, audio_bytes: bytes) -> None:
@@ -276,6 +286,8 @@ class WebRTCSession:
         self._vad         = _VADSegmenter()
         self._active_turn: Optional[_TurnPipeline] = None
         self._active_task: Optional[asyncio.Task]  = None
         self._setup_pc()
     def _setup_pc(self) -> None:
@@ -284,8 +296,12 @@ class WebRTCSession:
         @pc.on("track")
         def on_track(track):
             if track.kind == "audio":
                 receiver = AudioFrameReceiver(track, self._frame_q)
                 asyncio.ensure_future(self._frame_processor())
         @pc.on("datachannel")
         def on_datachannel(channel):
@@ -294,7 +310,6 @@ class WebRTCSession:
             @channel.on("message")
             def on_message(msg):
-                # Control messages from browser (cancel, init, ping)
                 try:
                     data = json.loads(msg)
                     if data.get("type") == "cancel":
@@ -377,5 +392,8 @@ class WebRTCSession:
         await self._pc.addIceCandidate(c)
     async def close(self) -> None:
         await self._cancel_active()
         await self._pc.close()

 """
 services/webrtc_pipeline.py — WebRTC Audio Pipeline + Full Parallelization
+FIX-BUG3 (AudioFrameReceiver never driven):
+  In the original code, AudioFrameReceiver was instantiated but its recv()
+  method was never called. aiortc only delivers frames when a consumer calls
+  recv() in a loop. Without this, the frame queue was always empty → no audio
+  reached the VAD → no utterances → zero voice responses via WebRTC.
+  Fix: spawn a coroutine (_recv_loop) that calls receiver.recv() continuously.
+All other logic preserved.
 """
 from __future__ import annotations
         self.sample_rate   = sample_rate
         self.frame_bytes   = int(sample_rate * frame_ms / 1000) * 2  # 16-bit samples
         self.silence_limit = silence_limit
+        self._vad          = webrtcvad.Vad(aggressiveness) if VAD_AVAILABLE else None
+        self._buffer       = bytearray()
         self._silence_count = 0
+        self._active       = False
     def process_frame(self, pcm_frame: bytes) -> Optional[bytes]:
         """
         Returns a complete utterance bytes object when speech ends, else None.
         """
         if self._vad is None:
+            # No VAD — buffer everything, flush after 3s
             self._buffer.extend(pcm_frame)
             if len(self._buffer) >= self.sample_rate * 3 * 2:
                 data = bytes(self._buffer)
         if is_speech:
             self._buffer.extend(frame)
+            self._active        = True
             self._silence_count = 0
         elif self._active:
             self._buffer.extend(frame)
             self._silence_count += 1
         if self._active and self._silence_count >= self.silence_limit:
+            data                = bytes(self._buffer)
             self._buffer.clear()
             self._silence_count = 0
+            self._active        = False
             return data
         return None
         """
         Wraps an incoming WebRTC audio track.
         Resamples to 16kHz mono PCM and pushes frames into an asyncio.Queue.
+        IMPORTANT: call start_receiving() after construction to begin
+        consuming frames from the underlying track via recv().
         """
         kind = "audio"
             self._track       = track
             self._frame_queue = frame_queue
             self._resampler: Optional[av.AudioResampler] = None
+            self._recv_task: Optional[asyncio.Task] = None
+        def start_receiving(self) -> None:
+            """
+            FIX-BUG3: Spawn the recv() loop so the track actually delivers frames.
+            Without this, _frame_queue stays empty forever.
+            """
+            if self._recv_task is None or self._recv_task.done():
+                self._recv_task = asyncio.ensure_future(self._recv_loop())
+        async def _recv_loop(self) -> None:
+            """Continuously consume frames from the remote track."""
+            while True:
+                try:
+                    frame = await self._track.recv()
+                except Exception as exc:
+                    print(f"[WebRTC] AudioFrameReceiver: track ended ({exc})")
+                    break
+                if self._resampler is None:
+                    self._resampler = av.AudioResampler(
+                        format="s16",
+                        layout="mono",
+                        rate=16_000,
+                    )
                 try:
+                    resampled = self._resampler.resample(frame)
+                    for rf in resampled:
+                        pcm = bytes(rf.planes[0])
+                        try:
+                            self._frame_queue.put_nowait(pcm)
+                        except asyncio.QueueFull:
+                            pass  # Drop frame under backpressure — prefer real-time
+                except Exception as exc:
+                    print(f"[WebRTC] Resample error: {exc}")
+        async def recv(self):
+            """Required override — delegates to the underlying track."""
+            return await self._track.recv()
+        def stop_receiving(self) -> None:
+            if self._recv_task and not self._recv_task.done():
+                self._recv_task.cancel()
 # ══════════════════════════════════════════════════════════════════════════════
     """
     def __init__(self, ai_backend, data_channel, on_stt=None, on_token=None):
         """
         Store the collaborators for one turn and create its STT/TTS helpers.

         Args:
             ai_backend: Backend object kept as ``_ai``.
             data_channel: RTCDataChannel for audio delivery.
             on_stt: Optional callback(str).
             on_token: Optional callback(str).
         """
         # Collaborators supplied by the caller.
         self._ai, self._channel = ai_backend, data_channel
         self._on_stt, self._on_token = on_stt, on_token

         # Helpers created fresh for this pipeline.
         self._stt = STTProcessor()
         self._streamer = ParallelTTSStreamer()

         # Cancellation flag and task list, both initially empty/unset.
         self._cancelled = False
         self._tasks: list[asyncio.Task] = []
     async def run(self, user_id: str, audio_bytes: bytes) -> None:
         self._vad         = _VADSegmenter()
         self._active_turn: Optional[_TurnPipeline] = None
         self._active_task: Optional[asyncio.Task]  = None
+        # Keep references to receivers so they are not garbage-collected
+        self._receivers: list[AudioFrameReceiver] = []
         self._setup_pc()
     def _setup_pc(self) -> None:
         @pc.on("track")
         def on_track(track):
             if track.kind == "audio":
+                # FIX-BUG3: create receiver AND start its recv() loop
                 receiver = AudioFrameReceiver(track, self._frame_q)
+                receiver.start_receiving()
+                self._receivers.append(receiver)     # prevent GC
                 asyncio.ensure_future(self._frame_processor())
+                print(f"[WebRTC] Audio track received — receiver started ✓")
         @pc.on("datachannel")
         def on_datachannel(channel):
             @channel.on("message")
             def on_message(msg):
                 try:
                     data = json.loads(msg)
                     if data.get("type") == "cancel":
         await self._pc.addIceCandidate(c)
     async def close(self) -> None:
         """
         Shut the session down.

         Stops every audio receiver, cancels the active turn and closes the
         peer connection. The peer connection is closed even if cancelling
         the active turn raises; that exception still propagates to the
         caller afterwards.
         """
         for receiver in self._receivers:
             receiver.stop_receiving()
         self._receivers.clear()
         try:
             await self._cancel_active()
         finally:
             # Always release the peer connection, even when cancellation fails.
             await self._pc.close()