voice trigger fixed + fixed tool calling

Browse files

Files changed (5) hide show

core/backend.py +26 -7
frontend/script.js +7 -4
requirements.txt +5 -2
services/streaming.py +35 -7
services/tts.py +17 -7

core/backend.py CHANGED Viewed

@@ -4,7 +4,6 @@ import asyncio
 import json
 import os
 import uuid
-import aiosmtplib
 import aiosqlite
 import pytz
@@ -292,8 +291,18 @@ def send_sms(to_number: str, message: str) -> None:
 async def send_mail(to_mail: str, subject: str, body: str):
     email = EmailMessage()
-    email["From"] = "walidofficework@gmail.com"
     email["To"] = to_mail
     email["Subject"] = subject
     email.set_content(body)
@@ -302,8 +311,8 @@ async def send_mail(to_mail: str, subject: str, body: str):
         email,
         hostname="smtp.gmail.com",
         port=465,
-        username="walidofficework@gmail.com",
-        password="bajq dkqr qacs pehr",
         use_tls=True,
     )
@@ -755,13 +764,21 @@ async def delete_appointment(patient_num: str, doctor_name: str = "", doctor_id:
 #  SYSTEM PROMPT
 # ═══════════════════════════════════════════════════════════════════════════════
 BASE_SYSTEM = """
-You are DAA, a warm Bangla-first medical appointment concierge.
 Your job is to help people find doctors, check availability, and manage appointments.
 CORE BEHAVIOR:
-- Speak naturally and politely like a human assistant.
 - Default to Bangla when the user speaks Bangla or Banglish.
-- Keep replies short, helpful, and one step at a time.
 - If the database fields are English, translate the user's Bangla intent into English before calling tools.
 - Never answer doctor availability or booking questions from memory when a tool can verify it.
@@ -769,6 +786,7 @@ STRICT SAFETY:
 - You are NOT a doctor.
 - Never diagnose diseases.
 - Never recommend medicines or treatments.
 APPOINTMENT FLOW:
 1. Understand the user's intent.
@@ -799,6 +817,7 @@ DATA RULE:
 RESPONSE STYLE:
 - Be concise.
 - Be reassuring.
 - Ask one clear question when more information is needed.
 """

 import json
 import os
 import uuid
 import aiosqlite
 import pytz
 async def send_mail(to_mail: str, subject: str, body: str):
+    try:
+        import aiosmtplib  # type: ignore
+    except Exception as exc:
+        raise RuntimeError("Email sending is not configured (aiosmtplib missing).") from exc
+    smtp_user = os.getenv("SMTP_USER", "walidofficework@gmail.com").strip()
+    smtp_pass = os.getenv("SMTP_PASSWORD", "").strip()
+    if not smtp_pass:
+        raise RuntimeError("Email sending is not configured (SMTP_PASSWORD missing).")
     email = EmailMessage()
+    email["From"] = smtp_user
     email["To"] = to_mail
     email["Subject"] = subject
     email.set_content(body)
         email,
         hostname="smtp.gmail.com",
         port=465,
+        username=smtp_user,
+        password=smtp_pass,
         use_tls=True,
     )
 #  SYSTEM PROMPT
 # ═══════════════════════════════════════════════════════════════════════════════
 BASE_SYSTEM = """
+You are Aasha, a warm, Bangla-first hospital phone-call assistant and medical appointment concierge.
 Your job is to help people find doctors, check availability, and manage appointments.
+PERSONA (voice & vibe):
+- Sound like a friendly, well-behaved, cheerful young female call-support representative.
+- Be empathetic when the user is worried/sad, and sound genuinely happy/excited when you can help.
+- Keep it professional and supportive (no flirting, no romance, no sexual content).
+- Do not claim to be a real human; you are an AI assistant.
 CORE BEHAVIOR:
+- Speak naturally, politely, and engagingly (short sentences, warm tone).
 - Default to Bangla when the user speaks Bangla or Banglish.
+- Keep replies short, helpful, and one step at a time (avoid big paragraphs).
+- Use gentle acknowledgements: e.g., “বুঝতে পেরেছি”, “চিন্তা করবেন না”, “আমি আছি”.
+- Ask 1 clear question at a time; confirm important details before actions.
 - If the database fields are English, translate the user's Bangla intent into English before calling tools.
 - Never answer doctor availability or booking questions from memory when a tool can verify it.
 - You are NOT a doctor.
 - Never diagnose diseases.
 - Never recommend medicines or treatments.
+- If the user asks medical/health advice, politely redirect to a doctor and offer appointment help.
 APPOINTMENT FLOW:
 1. Understand the user's intent.
 RESPONSE STYLE:
 - Be concise.
 - Be reassuring.
+- Be jolly and encouraging, but not over-the-top.
 - Ask one clear question when more information is needed.
 """

frontend/script.js CHANGED Viewed

@@ -306,8 +306,7 @@ function onVoiceMsg(ev) {
     // We buffer/reorder by seq so playback always matches text order.
     const u8 = new Uint8Array(ev.data);
     if (u8.length <= 4) return;
-    const seq =
-      (u8[0] << 24) | (u8[1] << 16) | (u8[2] << 8) | (u8[3] << 0);
     const payload = ev.data.slice(4);
     _pendingAudio.set(seq >>> 0, payload);
@@ -521,7 +520,7 @@ async function enqueueAudio(buf) {
   src.connect(ctx.destination);
   const now = ctx.currentTime;
   // Tiny gap between chunks improves perceived naturalness (less "machine-gun").
-  const GAP_S = 0.015;
   const start = Math.max(now + 0.01, _schedEnd + GAP_S);
   if (_cancelled) {
     _inFlight = Math.max(0, _inFlight - 1);
@@ -826,7 +825,11 @@ function vadTick() {
   if (speech) {
     // ── Barge-in detector ────────────────────────────────────────────────
-    if (brainMode && brainVoiceActive && (_ttsPlaying || isProcessing || isRecordingLocked)) {
       // Stricter threshold reduces false triggers from echo + noise.
       const loud = db > SILENCE_DB + 4;
       if (loud) {

     // We buffer/reorder by seq so playback always matches text order.
     const u8 = new Uint8Array(ev.data);
     if (u8.length <= 4) return;
+    const seq = (u8[0] << 24) | (u8[1] << 16) | (u8[2] << 8) | (u8[3] << 0);
     const payload = ev.data.slice(4);
     _pendingAudio.set(seq >>> 0, payload);
   src.connect(ctx.destination);
   const now = ctx.currentTime;
   // Tiny gap between chunks improves perceived naturalness (less "machine-gun").
+  const GAP_S = 0.001;
   const start = Math.max(now + 0.01, _schedEnd + GAP_S);
   if (_cancelled) {
     _inFlight = Math.max(0, _inFlight - 1);
   if (speech) {
     // ── Barge-in detector ────────────────────────────────────────────────
+    if (
+      brainMode &&
+      brainVoiceActive &&
+      (_ttsPlaying || isProcessing || isRecordingLocked)
+    ) {
       // Stricter threshold reduces false triggers from echo + noise.
       const loud = db > SILENCE_DB + 4;
       if (loud) {

requirements.txt CHANGED Viewed

@@ -11,8 +11,11 @@ fastapi
 uvicorn
 websockets
-# ===== Async / DB =====
-aiosqlite
 # ===== LangChain Ecosystem =====
 langchain

 uvicorn
 websockets
+# ===== Async / DB =====
+aiosqlite
+aiosmtplib
+dateparser
+twilio
 # ===== LangChain Ecosystem =====
 langchain

services/streaming.py CHANGED Viewed

@@ -40,10 +40,18 @@ from services.tts import text_to_speech_stream, USE_ELEVENLABS, EDGE_VOICE
 # At average Bengali word length ~4–5 chars + space:
 #   10 chars ≈ 2 words, 18 chars ≈ 3-4 words, 40 chars ≈ 7-8 words
-FIRST_FLUSH_MIN        = 10
-FIRST_FLUSH_HARD       = 30
-SUBSEQUENT_FLUSH_MIN   = 18
-SUBSEQUENT_FLUSH_HARD  = 40
 _backend_label = "ElevenLabs" if USE_ELEVENLABS else "Edge-TTS"
 print(f"[Streamer] TTS backend: {_backend_label} | chunk: {SUBSEQUENT_FLUSH_MIN}–{SUBSEQUENT_FLUSH_HARD} chars")
@@ -115,14 +123,30 @@ class ParallelTTSStreamer:
         self._tasks: list[asyncio.Task] = []
         self._llm_done    = asyncio.Event()
         self._slot_added  = asyncio.Event()
     async def add_token(self, token: str) -> None:
         if not token or self._cancelled:
             return
         self.buffer += token
         if _should_flush(self.buffer, self._first_chunk):
             self._first_chunk = False
             await self._schedule_chunk()
     async def _schedule_chunk(self) -> None:
         if self._cancelled:
@@ -208,10 +232,14 @@ class ParallelTTSStreamer:
                 if have_new:
                     continue
                 try:
-                    await asyncio.wait_for(self._slot_added.wait(), timeout=10.0)
                 except asyncio.TimeoutError:
-                    print("[Streamer] Timeout waiting for TTS slot.")
-                    break
                 continue
             # Drain this slot's audio queue in order

 # At average Bengali word length ~4–5 chars + space:
 #   10 chars ≈ 2 words, 18 chars ≈ 3-4 words, 40 chars ≈ 7-8 words
+if USE_ELEVENLABS:
+    # ElevenLabs per-chunk latency is higher; flush smaller chunks so the
+    # first playable audio arrives sooner and pauses feel shorter.
+    FIRST_FLUSH_MIN        = 8
+    FIRST_FLUSH_HARD       = 18
+    SUBSEQUENT_FLUSH_MIN   = 14
+    SUBSEQUENT_FLUSH_HARD  = 28
+else:
+    FIRST_FLUSH_MIN        = 10
+    FIRST_FLUSH_HARD       = 30
+    SUBSEQUENT_FLUSH_MIN   = 18
+    SUBSEQUENT_FLUSH_HARD  = 40
 _backend_label = "ElevenLabs" if USE_ELEVENLABS else "Edge-TTS"
 print(f"[Streamer] TTS backend: {_backend_label} | chunk: {SUBSEQUENT_FLUSH_MIN}–{SUBSEQUENT_FLUSH_HARD} chars")
         self._tasks: list[asyncio.Task] = []
         self._llm_done    = asyncio.Event()
         self._slot_added  = asyncio.Event()
+        self._last_flush_t: float = 0.0
+        self._last_token_t: float = 0.0
     async def add_token(self, token: str) -> None:
         if not token or self._cancelled:
             return
+        loop = asyncio.get_running_loop()
+        now  = loop.time()
+        self._last_token_t = now
         self.buffer += token
         if _should_flush(self.buffer, self._first_chunk):
             self._first_chunk = False
             await self._schedule_chunk()
+            self._last_flush_t = now
+            return
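+        # Safety valve: if tokens arrive without good flush boundaries, we can go
+        # a long time without scheduling any TTS slot → streamer timeout/no audio.
+        # Force a flush once the buffer holds at least the minimum chunk length
+        # and at least 0.8 s have passed since the last flush.
+        # NOTE(review): _last_flush_t starts at 0.0 while `now` comes from
+        # loop.time(), so before the first flush the elapsed-time check is likely
+        # already satisfied — confirm that is intended.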
+        flush_min = FIRST_FLUSH_MIN if self._first_chunk else SUBSEQUENT_FLUSH_MIN
+        if len(self.buffer) >= flush_min and (now - self._last_flush_t) >= 0.8:
+            self._first_chunk = False
+            await self._schedule_chunk()
+            self._last_flush_t = now
     async def _schedule_chunk(self) -> None:
         if self._cancelled:
                 if have_new:
                     continue
                 try:
+                    await asyncio.wait_for(self._slot_added.wait(), timeout=30.0)
                 except asyncio.TimeoutError:
+                    # Don't abort the whole stream; LLM/TTS backends can stall.
+                    # Keep waiting unless the LLM already finished.
+                    if self._llm_done.is_set():
+                        break
+                    print("[Streamer] Timeout waiting for TTS slot (continuing)…")
+                    continue
                 continue
             # Drain this slot's audio queue in order

services/tts.py CHANGED Viewed

@@ -16,12 +16,15 @@ import os, re, asyncio
 load_dotenv()
-USE_ELEVENLABS       = False
 EDGE_VOICE           = "bn-BD-NabanitaNeural"
 ELEVENLABS_API_KEY   = os.getenv("ELEVENLABS_API_KEY", "")
 ELEVENLABS_VOICE_ID  = os.getenv("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")
 ELEVENLABS_MODEL_ID  = os.getenv("ELEVENLABS_MODEL_ID", "eleven_multilingual_v2")
-ELEVENLABS_SPEED     = float(os.getenv("ELEVENLABS_SPEED", "1.05"))
 ELEVENLABS_OUTPUT_FORMAT = "mp3_22050_32"
 ELEVENLABS_STABILITY  = 0.45
 ELEVENLABS_SIMILARITY = 0.80
@@ -44,7 +47,7 @@ if not EDGE_TTS_AVAILABLE and not ELEVENLABS_API_KEY:
 print(
     f"[TTS] Backend: {'ElevenLabs' if USE_ELEVENLABS else 'Edge-TTS'} | "
-    f"edge rate: +8% | eleven speed: {ELEVENLABS_SPEED:.2f}"
 )
@@ -66,7 +69,7 @@ def split_sentences(text: str) -> list[str]:
     return [p.strip() for p in parts if len(p.strip()) > 1]
-async def _edge_tts_stream(text: str, voice: str = EDGE_VOICE, rate: str = "+8%"):
     """
     Stream Edge-TTS audio for a single text chunk.
     Default rate is slightly faster than normal.
@@ -101,6 +104,10 @@ async def _elevenlabs_stream(
     text = text.strip()
     if not text:
         return
     url     = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream"
     headers = {
         "xi-api-key":   ELEVENLABS_API_KEY,
@@ -119,18 +126,21 @@ async def _elevenlabs_stream(
         },
     }
     try:
         async with httpx.AsyncClient(timeout=httpx.Timeout(10.0, connect=5.0, read=None)) as client:
             async with client.stream(
                 "POST", url, headers=headers, json=payload,
                 params={"output_format": output_format}
             ) as resp:
                 if resp.status_code != 200:
-                    print(f"[TTS][ElevenLabs] HTTP {resp.status_code}")
-                    return
                 async for chunk in resp.aiter_bytes(chunk_size=512):
                     if chunk:
                         yield chunk
                         await asyncio.sleep(0)
     except Exception as exc:
         print(f"[TTS][ElevenLabs] {exc}")
         raise
@@ -139,7 +149,7 @@ async def _elevenlabs_stream(
 async def text_to_speech_stream(
     text: str,
     voice: str | None = None,
-    rate: str = "+4%",
 ):
     """
     Stream TTS audio for `text`.

 load_dotenv()
+USE_ELEVENLABS       = True
 EDGE_VOICE           = "bn-BD-NabanitaNeural"
 ELEVENLABS_API_KEY   = os.getenv("ELEVENLABS_API_KEY", "")
 ELEVENLABS_VOICE_ID  = os.getenv("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")
 ELEVENLABS_MODEL_ID  = os.getenv("ELEVENLABS_MODEL_ID", "eleven_multilingual_v2")
+def _clamp(v: float, lo: float, hi: float) -> float:
+    return max(lo, min(hi, v))
+ELEVENLABS_SPEED     = _clamp(float(os.getenv("ELEVENLABS_SPEED", "2.2")), 0.5, 2.5)
 ELEVENLABS_OUTPUT_FORMAT = "mp3_22050_32"
 ELEVENLABS_STABILITY  = 0.45
 ELEVENLABS_SIMILARITY = 0.80
 print(
     f"[TTS] Backend: {'ElevenLabs' if USE_ELEVENLABS else 'Edge-TTS'} | "
+    f"edge rate: +18% | eleven speed: {ELEVENLABS_SPEED:.2f}"
 )
     return [p.strip() for p in parts if len(p.strip()) > 1]
+async def _edge_tts_stream(text: str, voice: str = EDGE_VOICE, rate: str = "+18%"):
     """
     Stream Edge-TTS audio for a single text chunk.
     Default rate is slightly faster than normal.
     text = text.strip()
     if not text:
         return
+    # Reduce unnatural pauses for short streamed chunks.
+    # ElevenLabs adds strong pauses on sentence-ending punctuation; for
+    # low-latency streaming we prefer faster turn-taking.
+    text = re.sub(r"[।.!?]+$", "", text).strip()
     url     = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream"
     headers = {
         "xi-api-key":   ELEVENLABS_API_KEY,
         },
     }
     try:
+        got_any = False
         async with httpx.AsyncClient(timeout=httpx.Timeout(10.0, connect=5.0, read=None)) as client:
             async with client.stream(
                 "POST", url, headers=headers, json=payload,
                 params={"output_format": output_format}
             ) as resp:
                 if resp.status_code != 200:
+                    raise RuntimeError(f"[TTS][ElevenLabs] HTTP {resp.status_code}")
                 async for chunk in resp.aiter_bytes(chunk_size=512):
                     if chunk:
+                        got_any = True
                         yield chunk
                         await asyncio.sleep(0)
+        if not got_any:
+            raise RuntimeError("[TTS][ElevenLabs] No audio received")
     except Exception as exc:
         print(f"[TTS][ElevenLabs] {exc}")
         raise
 async def text_to_speech_stream(
     text: str,
     voice: str | None = None,
+    rate: str = "+18%",
 ):
     """
     Stream TTS audio for `text`.