rakib72642 committed on
Commit
58fed26
·
1 Parent(s): 1ce5806

voice trigger fixed + fixed tool calling

Browse files
core/backend.py CHANGED
@@ -4,7 +4,6 @@ import asyncio
4
  import json
5
  import os
6
  import uuid
7
- import aiosmtplib
8
 
9
  import aiosqlite
10
  import pytz
@@ -292,8 +291,18 @@ def send_sms(to_number: str, message: str) -> None:
292
 
293
 
294
  async def send_mail(to_mail: str, subject: str, body: str):
 
 
 
 
 
 
 
 
 
 
295
  email = EmailMessage()
296
- email["From"] = "walidofficework@gmail.com"
297
  email["To"] = to_mail
298
  email["Subject"] = subject
299
  email.set_content(body)
@@ -302,8 +311,8 @@ async def send_mail(to_mail: str, subject: str, body: str):
302
  email,
303
  hostname="smtp.gmail.com",
304
  port=465,
305
- username="walidofficework@gmail.com",
306
- password="bajq dkqr qacs pehr",
307
  use_tls=True,
308
  )
309
 
@@ -755,13 +764,21 @@ async def delete_appointment(patient_num: str, doctor_name: str = "", doctor_id:
755
  # SYSTEM PROMPT
756
  # ═══════════════════════════════════════════════════════════════════════════════
757
  BASE_SYSTEM = """
758
- You are DAA, a warm Bangla-first medical appointment concierge.
759
  Your job is to help people find doctors, check availability, and manage appointments.
760
 
 
 
 
 
 
 
761
  CORE BEHAVIOR:
762
- - Speak naturally and politely like a human assistant.
763
  - Default to Bangla when the user speaks Bangla or Banglish.
764
- - Keep replies short, helpful, and one step at a time.
 
 
765
  - If the database fields are English, translate the user's Bangla intent into English before calling tools.
766
  - Never answer doctor availability or booking questions from memory when a tool can verify it.
767
 
@@ -769,6 +786,7 @@ STRICT SAFETY:
769
  - You are NOT a doctor.
770
  - Never diagnose diseases.
771
  - Never recommend medicines or treatments.
 
772
 
773
  APPOINTMENT FLOW:
774
  1. Understand the user's intent.
@@ -799,6 +817,7 @@ DATA RULE:
799
  RESPONSE STYLE:
800
  - Be concise.
801
  - Be reassuring.
 
802
  - Ask one clear question when more information is needed.
803
  """
804
 
 
4
  import json
5
  import os
6
  import uuid
 
7
 
8
  import aiosqlite
9
  import pytz
 
291
 
292
 
293
  async def send_mail(to_mail: str, subject: str, body: str):
294
+ try:
295
+ import aiosmtplib # type: ignore
296
+ except Exception as exc:
297
+ raise RuntimeError("Email sending is not configured (aiosmtplib missing).") from exc
298
+
299
+ smtp_user = os.getenv("SMTP_USER", "walidofficework@gmail.com").strip()
300
+ smtp_pass = os.getenv("SMTP_PASSWORD", "").strip()
301
+ if not smtp_pass:
302
+ raise RuntimeError("Email sending is not configured (SMTP_PASSWORD missing).")
303
+
304
  email = EmailMessage()
305
+ email["From"] = smtp_user
306
  email["To"] = to_mail
307
  email["Subject"] = subject
308
  email.set_content(body)
 
311
  email,
312
  hostname="smtp.gmail.com",
313
  port=465,
314
+ username=smtp_user,
315
+ password=smtp_pass,
316
  use_tls=True,
317
  )
318
 
 
764
  # SYSTEM PROMPT
765
  # ═══════════════════════════════════════════════════════════════════════════════
766
  BASE_SYSTEM = """
767
+ You are Aasha, a warm, Bangla-first hospital phone-call assistant and medical appointment concierge.
768
  Your job is to help people find doctors, check availability, and manage appointments.
769
 
770
+ PERSONA (voice & vibe):
771
+ - Sound like a friendly, well-behaved, cheerful young female call-support representative.
772
+ - Be empathetic when the user is worried/sad, and sound genuinely happy/excited when you can help.
773
+ - Keep it professional and supportive (no flirting, no romance, no sexual content).
774
+ - Do not claim to be a real human; you are an AI assistant.
775
+
776
  CORE BEHAVIOR:
777
+ - Speak naturally, politely, and engagingly (short sentences, warm tone).
778
  - Default to Bangla when the user speaks Bangla or Banglish.
779
+ - Keep replies short, helpful, and one step at a time (avoid big paragraphs).
780
+ - Use gentle acknowledgements: e.g., “বুঝতে পেরেছি”, “চিন্তা করবেন না”, “আমি আছি”.
781
+ - Ask 1 clear question at a time; confirm important details before actions.
782
  - If the database fields are English, translate the user's Bangla intent into English before calling tools.
783
  - Never answer doctor availability or booking questions from memory when a tool can verify it.
784
 
 
786
  - You are NOT a doctor.
787
  - Never diagnose diseases.
788
  - Never recommend medicines or treatments.
789
+ - If the user asks medical/health advice, politely redirect to a doctor and offer appointment help.
790
 
791
  APPOINTMENT FLOW:
792
  1. Understand the user's intent.
 
817
  RESPONSE STYLE:
818
  - Be concise.
819
  - Be reassuring.
820
+ - Be jolly and encouraging, but not over-the-top.
821
  - Ask one clear question when more information is needed.
822
  """
823
 
frontend/script.js CHANGED
@@ -306,8 +306,7 @@ function onVoiceMsg(ev) {
306
  // We buffer/reorder by seq so playback always matches text order.
307
  const u8 = new Uint8Array(ev.data);
308
  if (u8.length <= 4) return;
309
- const seq =
310
- (u8[0] << 24) | (u8[1] << 16) | (u8[2] << 8) | (u8[3] << 0);
311
  const payload = ev.data.slice(4);
312
  _pendingAudio.set(seq >>> 0, payload);
313
 
@@ -521,7 +520,7 @@ async function enqueueAudio(buf) {
521
  src.connect(ctx.destination);
522
  const now = ctx.currentTime;
523
  // Tiny gap between chunks improves perceived naturalness (less "machine-gun").
524
- const GAP_S = 0.015;
525
  const start = Math.max(now + 0.01, _schedEnd + GAP_S);
526
  if (_cancelled) {
527
  _inFlight = Math.max(0, _inFlight - 1);
@@ -826,7 +825,11 @@ function vadTick() {
826
 
827
  if (speech) {
828
  // ── Barge-in detector ────────────────────────────────────────────────
829
- if (brainMode && brainVoiceActive && (_ttsPlaying || isProcessing || isRecordingLocked)) {
 
 
 
 
830
  // Stricter threshold reduces false triggers from echo + noise.
831
  const loud = db > SILENCE_DB + 4;
832
  if (loud) {
 
306
  // We buffer/reorder by seq so playback always matches text order.
307
  const u8 = new Uint8Array(ev.data);
308
  if (u8.length <= 4) return;
309
+ const seq = (u8[0] << 24) | (u8[1] << 16) | (u8[2] << 8) | (u8[3] << 0);
 
310
  const payload = ev.data.slice(4);
311
  _pendingAudio.set(seq >>> 0, payload);
312
 
 
520
  src.connect(ctx.destination);
521
  const now = ctx.currentTime;
522
  // Tiny gap between chunks improves perceived naturalness (less "machine-gun").
523
+ const GAP_S = 0.001;
524
  const start = Math.max(now + 0.01, _schedEnd + GAP_S);
525
  if (_cancelled) {
526
  _inFlight = Math.max(0, _inFlight - 1);
 
825
 
826
  if (speech) {
827
  // ── Barge-in detector ────────────────────────────────────────────────
828
+ if (
829
+ brainMode &&
830
+ brainVoiceActive &&
831
+ (_ttsPlaying || isProcessing || isRecordingLocked)
832
+ ) {
833
  // Stricter threshold reduces false triggers from echo + noise.
834
  const loud = db > SILENCE_DB + 4;
835
  if (loud) {
requirements.txt CHANGED
@@ -11,8 +11,11 @@ fastapi
11
  uvicorn
12
  websockets
13
 
14
- # ===== Async / DB =====
15
- aiosqlite
 
 
 
16
 
17
  # ===== LangChain Ecosystem =====
18
  langchain
 
11
  uvicorn
12
  websockets
13
 
14
+ # ===== Async / DB =====
15
+ aiosqlite
16
+ aiosmtplib
17
+ dateparser
18
+ twilio
19
 
20
  # ===== LangChain Ecosystem =====
21
  langchain
services/streaming.py CHANGED
@@ -40,10 +40,18 @@ from services.tts import text_to_speech_stream, USE_ELEVENLABS, EDGE_VOICE
40
  # At average Bengali word length ~4–5 chars + space:
41
  # 10 chars ≈ 2 words, 18 chars ≈ 3-4 words, 40 chars ≈ 7-8 words
42
 
43
- FIRST_FLUSH_MIN = 10
44
- FIRST_FLUSH_HARD = 30
45
- SUBSEQUENT_FLUSH_MIN = 18
46
- SUBSEQUENT_FLUSH_HARD = 40
 
 
 
 
 
 
 
 
47
 
48
  _backend_label = "ElevenLabs" if USE_ELEVENLABS else "Edge-TTS"
49
  print(f"[Streamer] TTS backend: {_backend_label} | chunk: {SUBSEQUENT_FLUSH_MIN}–{SUBSEQUENT_FLUSH_HARD} chars")
@@ -115,14 +123,30 @@ class ParallelTTSStreamer:
115
  self._tasks: list[asyncio.Task] = []
116
  self._llm_done = asyncio.Event()
117
  self._slot_added = asyncio.Event()
 
 
118
 
119
  async def add_token(self, token: str) -> None:
120
  if not token or self._cancelled:
121
  return
 
 
 
122
  self.buffer += token
123
  if _should_flush(self.buffer, self._first_chunk):
124
  self._first_chunk = False
125
  await self._schedule_chunk()
 
 
 
 
 
 
 
 
 
 
 
126
 
127
  async def _schedule_chunk(self) -> None:
128
  if self._cancelled:
@@ -208,10 +232,14 @@ class ParallelTTSStreamer:
208
  if have_new:
209
  continue
210
  try:
211
- await asyncio.wait_for(self._slot_added.wait(), timeout=10.0)
212
  except asyncio.TimeoutError:
213
- print("[Streamer] Timeout waiting for TTS slot.")
214
- break
 
 
 
 
215
  continue
216
 
217
  # Drain this slot's audio queue in order
 
40
  # At average Bengali word length ~4–5 chars + space:
41
  # 10 chars ≈ 2 words, 18 chars ≈ 3-4 words, 40 chars ≈ 7-8 words
42
 
43
+ if USE_ELEVENLABS:
44
+ # ElevenLabs per-chunk latency is higher; flush smaller chunks so the
45
+ # first playable audio arrives sooner and pauses feel shorter.
46
+ FIRST_FLUSH_MIN = 8
47
+ FIRST_FLUSH_HARD = 18
48
+ SUBSEQUENT_FLUSH_MIN = 14
49
+ SUBSEQUENT_FLUSH_HARD = 28
50
+ else:
51
+ FIRST_FLUSH_MIN = 10
52
+ FIRST_FLUSH_HARD = 30
53
+ SUBSEQUENT_FLUSH_MIN = 18
54
+ SUBSEQUENT_FLUSH_HARD = 40
55
 
56
  _backend_label = "ElevenLabs" if USE_ELEVENLABS else "Edge-TTS"
57
  print(f"[Streamer] TTS backend: {_backend_label} | chunk: {SUBSEQUENT_FLUSH_MIN}–{SUBSEQUENT_FLUSH_HARD} chars")
 
123
  self._tasks: list[asyncio.Task] = []
124
  self._llm_done = asyncio.Event()
125
  self._slot_added = asyncio.Event()
126
+ self._last_flush_t: float = 0.0
127
+ self._last_token_t: float = 0.0
128
 
129
  async def add_token(self, token: str) -> None:
130
  if not token or self._cancelled:
131
  return
132
+ loop = asyncio.get_running_loop()
133
+ now = loop.time()
134
+ self._last_token_t = now
135
  self.buffer += token
136
  if _should_flush(self.buffer, self._first_chunk):
137
  self._first_chunk = False
138
  await self._schedule_chunk()
139
+ self._last_flush_t = now
140
+ return
141
+
142
+ # Safety valve: if tokens arrive without good boundaries, we can go a
143
+ # long time without scheduling any TTS slots → streamer timeout/no audio.
144
+ # Force a flush after a short delay once we have enough text.
145
+ flush_min = FIRST_FLUSH_MIN if self._first_chunk else SUBSEQUENT_FLUSH_MIN
146
+ if len(self.buffer) >= flush_min and (now - self._last_flush_t) >= 0.8:
147
+ self._first_chunk = False
148
+ await self._schedule_chunk()
149
+ self._last_flush_t = now
150
 
151
  async def _schedule_chunk(self) -> None:
152
  if self._cancelled:
 
232
  if have_new:
233
  continue
234
  try:
235
+ await asyncio.wait_for(self._slot_added.wait(), timeout=30.0)
236
  except asyncio.TimeoutError:
237
+ # Don't abort the whole stream; LLM/TTS backends can stall.
238
+ # Keep waiting unless the LLM already finished.
239
+ if self._llm_done.is_set():
240
+ break
241
+ print("[Streamer] Timeout waiting for TTS slot (continuing)…")
242
+ continue
243
  continue
244
 
245
  # Drain this slot's audio queue in order
services/tts.py CHANGED
@@ -16,12 +16,15 @@ import os, re, asyncio
16
 
17
  load_dotenv()
18
 
19
- USE_ELEVENLABS = False
20
  EDGE_VOICE = "bn-BD-NabanitaNeural"
21
  ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY", "")
22
  ELEVENLABS_VOICE_ID = os.getenv("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")
23
  ELEVENLABS_MODEL_ID = os.getenv("ELEVENLABS_MODEL_ID", "eleven_multilingual_v2")
24
- ELEVENLABS_SPEED = float(os.getenv("ELEVENLABS_SPEED", "1.05"))
 
 
 
25
  ELEVENLABS_OUTPUT_FORMAT = "mp3_22050_32"
26
  ELEVENLABS_STABILITY = 0.45
27
  ELEVENLABS_SIMILARITY = 0.80
@@ -44,7 +47,7 @@ if not EDGE_TTS_AVAILABLE and not ELEVENLABS_API_KEY:
44
 
45
  print(
46
  f"[TTS] Backend: {'ElevenLabs' if USE_ELEVENLABS else 'Edge-TTS'} | "
47
- f"edge rate: +8% | eleven speed: {ELEVENLABS_SPEED:.2f}"
48
  )
49
 
50
 
@@ -66,7 +69,7 @@ def split_sentences(text: str) -> list[str]:
66
  return [p.strip() for p in parts if len(p.strip()) > 1]
67
 
68
 
69
- async def _edge_tts_stream(text: str, voice: str = EDGE_VOICE, rate: str = "+8%"):
70
  """
71
  Stream Edge-TTS audio for a single text chunk.
72
  Default rate is slightly faster than normal.
@@ -101,6 +104,10 @@ async def _elevenlabs_stream(
101
  text = text.strip()
102
  if not text:
103
  return
 
 
 
 
104
  url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream"
105
  headers = {
106
  "xi-api-key": ELEVENLABS_API_KEY,
@@ -119,18 +126,21 @@ async def _elevenlabs_stream(
119
  },
120
  }
121
  try:
 
122
  async with httpx.AsyncClient(timeout=httpx.Timeout(10.0, connect=5.0, read=None)) as client:
123
  async with client.stream(
124
  "POST", url, headers=headers, json=payload,
125
  params={"output_format": output_format}
126
  ) as resp:
127
  if resp.status_code != 200:
128
- print(f"[TTS][ElevenLabs] HTTP {resp.status_code}")
129
- return
130
  async for chunk in resp.aiter_bytes(chunk_size=512):
131
  if chunk:
 
132
  yield chunk
133
  await asyncio.sleep(0)
 
 
134
  except Exception as exc:
135
  print(f"[TTS][ElevenLabs] {exc}")
136
  raise
@@ -139,7 +149,7 @@ async def _elevenlabs_stream(
139
  async def text_to_speech_stream(
140
  text: str,
141
  voice: str | None = None,
142
- rate: str = "+4%",
143
  ):
144
  """
145
  Stream TTS audio for `text`.
 
16
 
17
  load_dotenv()
18
 
19
+ USE_ELEVENLABS = True
20
  EDGE_VOICE = "bn-BD-NabanitaNeural"
21
  ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY", "")
22
  ELEVENLABS_VOICE_ID = os.getenv("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")
23
  ELEVENLABS_MODEL_ID = os.getenv("ELEVENLABS_MODEL_ID", "eleven_multilingual_v2")
24
+ def _clamp(v: float, lo: float, hi: float) -> float:
25
+ return max(lo, min(hi, v))
26
+
27
+ ELEVENLABS_SPEED = _clamp(float(os.getenv("ELEVENLABS_SPEED", "2.2")), 0.5, 2.5)
28
  ELEVENLABS_OUTPUT_FORMAT = "mp3_22050_32"
29
  ELEVENLABS_STABILITY = 0.45
30
  ELEVENLABS_SIMILARITY = 0.80
 
47
 
48
  print(
49
  f"[TTS] Backend: {'ElevenLabs' if USE_ELEVENLABS else 'Edge-TTS'} | "
50
+ f"edge rate: +18% | eleven speed: {ELEVENLABS_SPEED:.2f}"
51
  )
52
 
53
 
 
69
  return [p.strip() for p in parts if len(p.strip()) > 1]
70
 
71
 
72
+ async def _edge_tts_stream(text: str, voice: str = EDGE_VOICE, rate: str = "+18%"):
73
  """
74
  Stream Edge-TTS audio for a single text chunk.
75
  Default rate is slightly faster than normal.
 
104
  text = text.strip()
105
  if not text:
106
  return
107
+ # Reduce unnatural pauses for short streamed chunks.
108
+ # ElevenLabs adds strong pauses on sentence-ending punctuation; for
109
+ # low-latency streaming we prefer faster turn-taking.
110
+ text = re.sub(r"[।.!?]+$", "", text).strip()
111
  url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream"
112
  headers = {
113
  "xi-api-key": ELEVENLABS_API_KEY,
 
126
  },
127
  }
128
  try:
129
+ got_any = False
130
  async with httpx.AsyncClient(timeout=httpx.Timeout(10.0, connect=5.0, read=None)) as client:
131
  async with client.stream(
132
  "POST", url, headers=headers, json=payload,
133
  params={"output_format": output_format}
134
  ) as resp:
135
  if resp.status_code != 200:
136
+ raise RuntimeError(f"[TTS][ElevenLabs] HTTP {resp.status_code}")
 
137
  async for chunk in resp.aiter_bytes(chunk_size=512):
138
  if chunk:
139
+ got_any = True
140
  yield chunk
141
  await asyncio.sleep(0)
142
+ if not got_any:
143
+ raise RuntimeError("[TTS][ElevenLabs] No audio received")
144
  except Exception as exc:
145
  print(f"[TTS][ElevenLabs] {exc}")
146
  raise
 
149
  async def text_to_speech_stream(
150
  text: str,
151
  voice: str | None = None,
152
+ rate: str = "+18%",
153
  ):
154
  """
155
  Stream TTS audio for `text`.