rakib72642 commited on
Commit
ed5b8b8
·
1 Parent(s): f2ea5fc

structure ready :: rakib

Browse files
.env CHANGED
@@ -5,13 +5,13 @@ LANGCHAIN_ENDPOINT='https://api.smith.langchain.com'
5
  LANGCHAIN_API_KEY='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'
6
  LANGCHAIN_PROJECT='Default'
7
 
8
- TWILIO_ACCOUNT_SID="ACfafc0d2d007bdf14b21bb3e14a7a7b31"
9
- TWILIO_AUTH_TOKEN="ed15fa98748c8c3d3d02cb54e431a187"
10
- TWILIO_PHONE_NUMBER="+14343375085"
11
 
12
- LIVEKIT_URL=wss://demo-wqwzjgsv.livekit.cloud
13
- LIVEKIT_API_KEY=APIesfzMFdhmrb6
14
- LIVEKIT_API_SECRET=kb7jLghH6Q3qLXxUHoYwREpYJdgX8qgAOHBDOG7q40G
15
 
16
- GROQ_API_KEY=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
17
- CARTESIA_API_KEY=sk_car_h3oyy6jPSJzx8KnEGJ1m5f
 
5
  LANGCHAIN_API_KEY='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'
6
  LANGCHAIN_PROJECT='Default'
7
 
8
+ # TWILIO_ACCOUNT_SID="ACfafc0d2d007bdf14b21bb3e14a7a7b31"
9
+ # TWILIO_AUTH_TOKEN="ed15fa98748c8c3d3d02cb54e431a187"
10
+ # TWILIO_PHONE_NUMBER="+14343375085"
11
 
12
+ # LIVEKIT_URL=wss://demo-wqwzjgsv.livekit.cloud
13
+ # LIVEKIT_API_KEY=APIesfzMFdhmrb6
14
+ # LIVEKIT_API_SECRET=kb7jLghH6Q3qLXxUHoYwREpYJdgX8qgAOHBDOG7q40G
15
 
16
+ # GROQ_API_KEY=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
17
+ # CARTESIA_API_KEY=sk_car_h3oyy6jPSJzx8KnEGJ1m5f
.vscode/settings.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "python-envs.defaultEnvManager": "ms-python.python:conda",
3
+ "python-envs.defaultPackageManager": "ms-python.python:conda"
4
+ }
app.py CHANGED
@@ -1,95 +1,202 @@
1
- from fastapi import FastAPI
2
- from fastapi.responses import StreamingResponse
 
 
3
  from contextlib import asynccontextmanager
4
- from pydantic import BaseModel
5
- from core.backend import AIBackend
6
- import uvicorn, json, os
7
- from fastapi.middleware.cors import CORSMiddleware
8
  from fastapi import FastAPI, WebSocket, WebSocketDisconnect
9
- from services.stt import StreamingSTT
10
- from services.tts import text_to_speech_stream
11
  from fastapi.staticfiles import StaticFiles
 
 
 
 
 
12
 
 
13
 
14
- chatbot_obj = AIBackend()
15
 
16
  @asynccontextmanager
17
  async def lifespan(app: FastAPI):
18
- await chatbot_obj.async_setup()
 
19
  yield
20
- if chatbot_obj.conn:
21
- await chatbot_obj.conn.close()
 
22
 
23
  app = FastAPI(lifespan=lifespan)
24
 
25
- class UserRequest(BaseModel):
26
- user_id: str
27
- user_query: str
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
- @app.post("/chat")
30
- async def chat(request: UserRequest):
31
- stream = await chatbot_obj.main(
32
- user_id=request.user_id,
33
- user_query=request.user_query,
34
- )
35
- return StreamingResponse(stream, media_type="text/event-stream")
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  @app.websocket("/ws/chat")
38
- async def websocket_chat(websocket: WebSocket):
39
- await websocket.accept()
 
40
  try:
41
  while True:
42
- # receive frontend message
43
- data = await websocket.receive_text()
44
- payload = json.loads(data)
45
-
46
- user_id = payload["user_id"]
47
- user_query = payload["user_query"]
48
-
49
- # stream AI response
50
- stream = await chatbot_obj.main(
51
- user_id=user_id,
52
- user_query=user_query
53
- )
54
-
55
- async for chunk in stream:
56
- await websocket.send_text(chunk)
57
-
58
- # notify frontend response finished
59
- await websocket.send_text("[[END]]")
 
 
 
 
 
 
60
  except WebSocketDisconnect:
61
- print("Client disconnected")
 
 
 
 
62
 
 
63
  @app.websocket("/ws/voice")
64
- async def voice_ws(websocket: WebSocket):
65
- await websocket.accept()
66
- stt = StreamingSTT()
 
 
 
 
67
  try:
68
  while True:
69
- message = await websocket.receive()
70
- # 🎤 AUDIO INPUT
71
- if "bytes" in message:
72
- audio_chunk = message["bytes"]
73
- stt.add_audio(audio_chunk)
74
- text = stt.transcribe_if_ready()
75
- if not text:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  continue
77
- await websocket.send_text(f"[STT]: {text}")
78
- # 🤖 LLM STREAM
79
- stream = chatbot_obj.main(
80
- user_id="voice_user",
81
- user_query=text
82
- )
83
- full_response = ""
84
- async for token in stream:
85
- full_response += token
86
- await websocket.send_text(f"[LLM]: {token}")
87
- # 🔊 TTS STREAM
88
- async for audio_chunk in text_to_speech_stream(full_response):
89
- await websocket.send_bytes(audio_chunk)
90
- await websocket.send_text("[END]")
91
- except WebSocketDisconnect:
92
- print("Voice client disconnected")
93
 
94
- if __name__ == "__main__":
95
- uvicorn.run("app:app", host="127.0.0.1", port=8679, reload=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import asyncio
3
+ import json
4
+ import os
5
  from contextlib import asynccontextmanager
6
+
 
 
 
7
  from fastapi import FastAPI, WebSocket, WebSocketDisconnect
8
+ from fastapi.responses import FileResponse, HTMLResponse
 
9
  from fastapi.staticfiles import StaticFiles
10
+ from starlette.websockets import WebSocketState
11
+
12
+ from core.backend import AIBackend
13
+ from services.stt import STTProcessor
14
+ from services.streaming import ParallelTTSStreamer
15
 
16
+ ai = AIBackend()
17
 
 
18
 
19
  @asynccontextmanager
20
  async def lifespan(app: FastAPI):
21
+ await ai.async_setup()
22
+ print("[APP] AI backend ready.")
23
  yield
24
+ if hasattr(ai, "conn") and ai.conn:
25
+ await ai.conn.close()
26
+
27
 
28
  app = FastAPI(lifespan=lifespan)
29
 
30
+ try:
31
+ app.mount("/static", StaticFiles(directory="."), name="static")
32
+ except Exception:
33
+ pass
34
+
35
+
36
+ @app.get("/")
37
+ async def root():
38
+ if os.path.exists("index.html"):
39
+ return FileResponse("index.html")
40
+ return HTMLResponse("<h2>index.html not found</h2>", status_code=404)
41
+
42
+
43
+ # ── Helpers ───────────────────────────────────────────────────────────────────
44
+ def _ws_open(ws: WebSocket) -> bool:
45
+ """Return True if the WebSocket connection is still alive."""
46
+ return ws.client_state == WebSocketState.CONNECTED
47
 
 
 
 
 
 
 
 
48
 
49
+ async def _safe_text(ws: WebSocket, payload: dict) -> bool:
50
+ if not _ws_open(ws):
51
+ return False
52
+ try:
53
+ await ws.send_text(json.dumps(payload))
54
+ return True
55
+ except Exception:
56
+ return False
57
+
58
+
59
+ async def _safe_bytes(ws: WebSocket, data: bytes) -> bool:
60
+ if not _ws_open(ws):
61
+ return False
62
+ try:
63
+ await ws.send_bytes(data)
64
+ return True
65
+ except Exception:
66
+ return False
67
+
68
+
69
+ # ── Text chat WebSocket ───────────────────────────────────────────────────────
70
  @app.websocket("/ws/chat")
71
+ async def ws_chat(ws: WebSocket):
72
+ await ws.accept()
73
+ print("[CHAT] Client connected")
74
  try:
75
  while True:
76
+ raw = await ws.receive_text()
77
+ try:
78
+ data = json.loads(raw)
79
+ except json.JSONDecodeError:
80
+ await _safe_text(ws, {"type": "error", "text": "Invalid JSON"})
81
+ continue
82
+
83
+ user_id = data.get("user_id", "default_user")
84
+ user_query = data.get("user_query", "").strip()
85
+ if not user_query:
86
+ continue
87
+
88
+ full_response = ""
89
+ try:
90
+ stream = await ai.main(user_id, user_query)
91
+ async for token in stream:
92
+ full_response += token
93
+ await _safe_text(ws, {"type": "chat", "text": full_response})
94
+ except Exception as e:
95
+ print(f"[CHAT] AI error: {e}")
96
+ await _safe_text(ws, {"type": "error", "text": str(e)})
97
+
98
+ await _safe_text(ws, {"type": "end"})
99
+
100
  except WebSocketDisconnect:
101
+ print("[CHAT] Client disconnected")
102
+ except Exception as e:
103
+ if "disconnect" not in str(e).lower():
104
+ print(f"[CHAT] WS error: {e}")
105
+
106
 
107
+ # ── Voice WebSocket ───────────────────────────────────────────────────────────
108
  @app.websocket("/ws/voice")
109
+ async def ws_voice(ws: WebSocket):
110
+ await ws.accept()
111
+ print("[VOICE] Client connected")
112
+
113
+ stt = STTProcessor()
114
+ user_id = "voice_user"
115
+
116
  try:
117
  while True:
118
+ # ── FIX: Check connection state before every receive ──────────────
119
+ # The previous crash "Cannot call receive once a disconnect message
120
+ # has been received" happened because we called ws.receive() after
121
+ # the client had already disconnected. Now we check first.
122
+ if not _ws_open(ws):
123
+ print("[VOICE] Connection dropped, exiting handler.")
124
+ break
125
+
126
+ try:
127
+ data = await ws.receive()
128
+ except WebSocketDisconnect:
129
+ print("[VOICE] Client disconnected.")
130
+ break
131
+ except Exception as e:
132
+ # Catches starlette's internal disconnect errors gracefully
133
+ if "disconnect" in str(e).lower():
134
+ print("[VOICE] Client disconnected (recv error).")
135
+ else:
136
+ print(f"[VOICE] Receive error: {e}")
137
+ break
138
+
139
+ # ── Audio blob from VAD ───────────────────────────────────────────
140
+ if "bytes" in data and data["bytes"]:
141
+ audio_bytes = data["bytes"]
142
+ print(f"[VOICE] Received utterance: {len(audio_bytes):,} bytes")
143
+
144
+ # 1. STT — in thread so event loop isn't blocked
145
+ transcript = await asyncio.to_thread(stt.transcribe, audio_bytes)
146
+
147
+ if not transcript:
148
+ await _safe_text(ws, {
149
+ "type": "error",
150
+ "text": "কথা বুঝতে পারিনি, আবার বলুন।"
151
+ })
152
+ # Send 'end' so client's isProcessing resets and VAD resumes
153
+ await _safe_text(ws, {"type": "end"})
154
  continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
 
156
+ print(f"[VOICE] STT: {transcript}")
157
+ if not await _safe_text(ws, {"type": "stt", "text": transcript}):
158
+ break
159
+
160
+ # 2. AI + TTS pipeline
161
+ tts_streamer = ParallelTTSStreamer()
162
+
163
+ async def run_ai_and_tts():
164
+ try:
165
+ stream = await ai.main(user_id, transcript)
166
+ async for token in stream:
167
+ if not token:
168
+ continue
169
+ if not await _safe_text(ws, {"type": "llm_token", "token": token}):
170
+ break
171
+ await tts_streamer.add_token(token)
172
+ except Exception as e:
173
+ print(f"[VOICE] AI error: {e}")
174
+ finally:
175
+ await tts_streamer.flush()
176
+
177
+ async def stream_tts_audio():
178
+ async for chunk in tts_streamer.stream_audio():
179
+ if not await _safe_bytes(ws, chunk):
180
+ break
181
+
182
+ await asyncio.gather(run_ai_and_tts(), stream_tts_audio())
183
+
184
+ # Signal end of turn → client resumes VAD
185
+ await _safe_text(ws, {"type": "end"})
186
+
187
+ # ── Control messages ──────────────────────────────────────────────
188
+ elif "text" in data and data["text"]:
189
+ try:
190
+ msg = json.loads(data["text"])
191
+ if msg.get("type") == "ping":
192
+ await _safe_text(ws, {"type": "pong"})
193
+ except json.JSONDecodeError:
194
+ pass
195
+
196
+ except WebSocketDisconnect:
197
+ print("[VOICE] Client disconnected (outer)")
198
+ except Exception as e:
199
+ if "disconnect" not in str(e).lower():
200
+ print(f"[VOICE] WS error: {e}")
201
+ finally:
202
+ print("[VOICE] Handler exiting cleanly.")
core/backend.py CHANGED
@@ -20,6 +20,7 @@ class ChatState(TypedDict):
20
  summary: str
21
 
22
  ######################### TOOLS #########################
 
23
  def get_db_path():
24
  return os.path.join(os.path.dirname(__file__), "daa.db")
25
 
@@ -58,34 +59,34 @@ async def search_doctor(name: str = "", category: str = "", visiting_days: str =
58
  query = "SELECT * FROM doctors WHERE 1=1"
59
  params = []
60
  conditions = []
61
-
62
  if name:
63
  conditions.append("LOWER(doctor_name) LIKE ?")
64
  params.append(f"%{name.lower()}%")
65
-
66
  if category:
67
  conditions.append("LOWER(category) LIKE ?")
68
  params.append(f"%{category.lower()}%")
69
-
70
  if visiting_days:
71
  conditions.append("LOWER(visiting_days) LIKE ?")
72
  params.append(f"%{visiting_days.lower()}%")
73
-
74
  if conditions:
75
  query += " AND (" + " OR ".join(conditions) + ")"
76
-
77
  async with aiosqlite.connect(db_path) as db:
78
  db.row_factory = aiosqlite.Row
79
  cursor = await db.execute(query, params)
80
  rows = await cursor.fetchall()
81
-
82
  if not rows:
83
  return json.dumps({
84
  "success": False,
85
  "message": "No doctors found matching your search.",
86
  "data": []
87
  })
88
-
89
  return json.dumps({
90
  "success": True,
91
  "count": len(rows),
@@ -99,25 +100,25 @@ async def search_appointment_by_phone(patient_num: str) -> str:
99
  """
100
  db_path = get_db_path()
101
  patient_num = format_bd_number(patient_num)
102
-
103
  async with aiosqlite.connect(db_path) as db:
104
  db.row_factory = aiosqlite.Row
105
-
106
  cursor = await db.execute("""
107
  SELECT * FROM patients
108
  WHERE patient_num = ?
109
  ORDER BY visiting_date ASC
110
  """, (patient_num,))
111
-
112
  rows = await cursor.fetchall()
113
-
114
  if not rows:
115
  return json.dumps({
116
  "success": False,
117
  "message": "No appointments found for this phone number.",
118
  "data": []
119
  })
120
-
121
  return json.dumps({
122
  "success": True,
123
  "count": len(rows),
@@ -128,32 +129,33 @@ async def search_appointment_by_phone(patient_num: str) -> str:
128
  async def book_appointment(doctor_id: int, patient_name: str, patient_age: str, patient_num: str, visiting_date: str) -> str:
129
  """
130
  Book a doctor appointment and save it to the patients table.
131
-
132
  Args:
133
  doctor_id: Doctor's ID from search_doctor results.
134
  patient_name: Full name of the patient.
135
  patient_age: Age of the patient (e.g. "32").
136
  patient_num: Contact phone number of the patient.
137
  visiting_date: Date of visit in YYYY-MM-DD format (e.g. 2025-06-15).
138
-
139
  Returns a booking confirmation with the new record ID.
140
  """
141
  db_path = get_db_path()
142
-
143
  async with aiosqlite.connect(db_path) as db:
144
  db.row_factory = aiosqlite.Row
 
145
  patient_num = format_bd_number(patient_num)
146
-
147
  # Verify doctor exists
148
  cursor = await db.execute("SELECT * FROM doctors WHERE id = ?", (doctor_id,))
149
  doctor = await cursor.fetchone()
150
  if not doctor:
151
  return f"No doctor found with ID {doctor_id}. Please search for a doctor first."
152
-
153
  doctor_data = dict(doctor)
154
  doctor_name = doctor_data.get("doctor_name", "Unknown")
155
  doctor_category = doctor_data.get("doctor_category", "Unknown")
156
-
157
  # Check for conflicting booking (same doctor + same date)
158
  cursor = await db.execute(
159
  """SELECT id FROM patients
@@ -166,7 +168,7 @@ async def book_appointment(doctor_id: int, patient_name: str, patient_age: str,
166
  f"A booking for {patient_name} with Dr. {doctor_name} "
167
  f"on {visiting_date} already exists."
168
  )
169
-
170
  # Insert into patients table
171
  cursor = await db.execute(
172
  """INSERT INTO patients (doctor_name, doctor_category, patient_name, patient_age, patient_num, visiting_date)
@@ -174,7 +176,7 @@ async def book_appointment(doctor_id: int, patient_name: str, patient_age: str,
174
  (doctor_name, doctor_category, patient_name, patient_age, patient_num, visiting_date),
175
  )
176
  await db.commit()
177
-
178
  # Send SMS confirmation
179
  sms_message = (
180
  f"✅ Appointment Confirmed!\n"
@@ -188,7 +190,7 @@ async def book_appointment(doctor_id: int, patient_name: str, patient_age: str,
188
  # sms_status = "📱 SMS confirmation sent."
189
  # except Exception as e:
190
  # sms_status = f"⚠️ SMS failed: {str(e)}"
191
-
192
  return (
193
  f"✅ Appointment Booked!\n"
194
  f"━━━━━━━━━━━━━━━━━━━━━━\n"
@@ -209,33 +211,33 @@ async def delete_appointment(patient_num: str, doctor_name: str) -> str:
209
  db_path = get_db_path()
210
  # normalize phone number
211
  patient_num = format_bd_number(patient_num)
212
-
213
  async with aiosqlite.connect(db_path) as db:
214
  db.row_factory = aiosqlite.Row
215
-
216
  # check if appointment exists first
217
  cursor = await db.execute("""
218
  SELECT * FROM patients
219
  WHERE patient_num = ?
220
  AND LOWER(doctor_name) = LOWER(?)
221
  """, (patient_num, doctor_name))
222
-
223
  row = await cursor.fetchone()
224
  if not row:
225
  return json.dumps({
226
  "success": False,
227
  "message": "No matching appointment found to delete."
228
  })
229
-
230
  # delete appointment
231
  await db.execute("""
232
  DELETE FROM patients
233
  WHERE patient_num = ?
234
  AND LOWER(doctor_name) = LOWER(?)
235
  """, (patient_num, doctor_name))
236
-
237
  await db.commit()
238
-
239
  return json.dumps({
240
  "success": True,
241
  "message": f"Appointment with Dr. {doctor_name} deleted successfully."
@@ -250,7 +252,7 @@ class AIBackend:
250
  self.tools = [search_doctor, book_appointment, get_bd_time, search_appointment_by_phone, delete_appointment]
251
  self.tool_node = ToolNode(self.tools)
252
  self.llm_with_tools = self.llm.bind_tools(self.tools)
253
-
254
  async def async_setup(self):
255
  db_path = os.path.join(os.path.dirname(__file__), "daa.db")
256
  self.conn = await aiosqlite.connect(db_path)
@@ -258,7 +260,7 @@ class AIBackend:
258
  await self._create_user_table()
259
  self.graph = self._build_graph()
260
  self.summary_graph = self._build_summary_graph()
261
-
262
  async def _create_user_table(self):
263
  await self.conn.execute("""
264
  CREATE TABLE IF NOT EXISTS userid_threadid (
@@ -267,7 +269,7 @@ class AIBackend:
267
  )
268
  """)
269
  await self.conn.commit()
270
-
271
  ######################### SUMMARIZE NODE #########################
272
  async def summarize_conversation(self, state: ChatState):
273
  existing_summary = state.get("summary", "")
@@ -275,12 +277,12 @@ class AIBackend:
275
  prompt = (
276
  f"""
277
  You are maintaining a long-term conversation memory for a chatbot.
278
-
279
  Existing summary:
280
  {existing_summary}
281
-
282
  Update and extend the summary using ONLY the new conversation messages above.
283
-
284
  Instructions:
285
  - Preserve important existing context.
286
  - Add new facts, decisions, preferences, goals, issues, and ongoing tasks.
@@ -297,9 +299,9 @@ class AIBackend:
297
  else
298
  """
299
  You are creating a long-term conversation memory summary for a chatbot.
300
-
301
  Summarize the conversation above.
302
-
303
  Instructions:
304
  - Capture important user information, goals, preferences, projects, and decisions.
305
  - Include technical issues, debugging progress, and solutions discussed.
@@ -315,37 +317,40 @@ class AIBackend:
315
  "summary": response.content,
316
  "messages": [RemoveMessage(id=m.id) for m in messages[:-2]],
317
  }
318
-
319
  async def should_summarize(self, state: ChatState):
320
  if len(state["messages"]) > 10:
321
  return "summarize_node"
322
  return "chat_node"
323
-
324
  ######################### CHAT NODE #########################
325
  async def chat_node(self, state: ChatState):
326
  summary = state.get("summary", "")
327
  messages = state["messages"]
328
-
329
  print('#'*50)
330
  print(">>>>>>>>>> CHAT NODE START <<<<<<<<<<")
331
  if summary:
332
  print(f"[SUMMARY]:\n{summary}\n")
333
  else:
334
  print("[NO SUMMARY YET]\n")
335
-
336
  print('$'*50)
337
  print("[MESSAGES]:")
338
  for m in messages:
339
  role = m.__class__.__name__
340
  print(f" [{role}]: {m.content[:200]}")
341
  print('$'*50,'\n')
342
-
343
  if summary:
344
  summary_message = SystemMessage(
345
  content=(
346
- "You are provided with a condensed memory of previous conversations.\n\n"
347
  f"Conversation Memory:\n{summary}\n\n"
348
  "Instructions:\n"
 
 
 
349
  "- Use this memory as long-term conversational context.\n"
350
  "- Maintain continuity with the user's previous discussions, projects, goals, and preferences.\n"
351
  "- Prioritize recent and relevant information when generating responses.\n"
@@ -361,32 +366,32 @@ class AIBackend:
361
  print(">>>>>>>>>> CHAT NODE END <<<<<<<<<<")
362
  print('#'*50)
363
  return {"messages": [response]}
364
-
365
  ######################### GRAPH #########################
366
  def _build_graph(self):
367
  g = StateGraph(ChatState)
368
  g.add_node("chat_node", self.chat_node)
369
  g.add_node("tools", self.tool_node)
370
-
371
  g.add_edge(START, "chat_node")
372
  g.add_conditional_edges("chat_node", tools_condition)
373
  g.add_edge("tools", "chat_node")
374
-
375
  return g.compile(checkpointer=self.checkpointer)
376
-
377
  def _build_summary_graph(self):
378
  g = StateGraph(ChatState)
379
  g.add_node("summarize_node", self.summarize_conversation)
380
  g.add_edge(START, "summarize_node")
381
  g.add_edge("summarize_node", END)
382
  return g.compile(checkpointer=self.checkpointer)
383
-
384
  ######################### STREAMING #########################
385
  async def ai_only_stream(self, initial_state: dict, config: dict):
386
  async for message_chunk, metadata in self.graph.astream(initial_state, config=config, stream_mode="messages"):
387
  if isinstance(message_chunk, AIMessage) and message_chunk.content:
388
  yield message_chunk.content
389
-
390
  # Auto Summarization Execute
391
  current_state = await self.graph.aget_state(config)
392
  if len(current_state.values.get("messages", [])) > 10:
@@ -394,26 +399,26 @@ class AIBackend:
394
  self.summary_graph.ainvoke(current_state.values, config=config)
395
  )
396
  print('@'*20,'Summarization Execute','@'*20)
397
-
398
  ######################### THREAD ID #########################
399
  @staticmethod
400
  def generate_thread_id() -> str:
401
  return str(uuid.uuid4())
402
-
403
  ######################### RETRIEVE ALL THREADS #########################
404
  async def retrieve_all_threads(self):
405
  all_threads = set()
406
  async for checkpoint in self.checkpointer.alist(None):
407
  all_threads.add(checkpoint.config["configurable"]["thread_id"])
408
  return list(all_threads)
409
-
410
  ######################### MAIN ENTRY POINT #########################
411
  async def main(self, user_id: str, user_query: str):
412
  async with self.conn.execute(
413
  "SELECT userId, threadId FROM userid_threadid WHERE userId = ?", (user_id,)
414
  ) as cursor:
415
  result = await cursor.fetchone()
416
-
417
  if result is None:
418
  thread_id = user_id + self.generate_thread_id()
419
  await self.conn.execute(
@@ -423,11 +428,11 @@ class AIBackend:
423
  await self.conn.commit()
424
  else:
425
  thread_id = result[1]
426
-
427
  initial_state = {"messages": [HumanMessage(content=user_query)]}
428
  config = {
429
  "configurable": {"thread_id": thread_id},
430
  "metadata": {"thread_id": thread_id},
431
  "run_name": "chat_turn",
432
  }
433
- return self.ai_only_stream(initial_state, config)
 
20
  summary: str
21
 
22
  ######################### TOOLS #########################
23
+ # After imports, before STATE class
24
  def get_db_path():
25
  return os.path.join(os.path.dirname(__file__), "daa.db")
26
 
 
59
  query = "SELECT * FROM doctors WHERE 1=1"
60
  params = []
61
  conditions = []
62
+
63
  if name:
64
  conditions.append("LOWER(doctor_name) LIKE ?")
65
  params.append(f"%{name.lower()}%")
66
+
67
  if category:
68
  conditions.append("LOWER(category) LIKE ?")
69
  params.append(f"%{category.lower()}%")
70
+
71
  if visiting_days:
72
  conditions.append("LOWER(visiting_days) LIKE ?")
73
  params.append(f"%{visiting_days.lower()}%")
74
+
75
  if conditions:
76
  query += " AND (" + " OR ".join(conditions) + ")"
77
+
78
  async with aiosqlite.connect(db_path) as db:
79
  db.row_factory = aiosqlite.Row
80
  cursor = await db.execute(query, params)
81
  rows = await cursor.fetchall()
82
+
83
  if not rows:
84
  return json.dumps({
85
  "success": False,
86
  "message": "No doctors found matching your search.",
87
  "data": []
88
  })
89
+
90
  return json.dumps({
91
  "success": True,
92
  "count": len(rows),
 
100
  """
101
  db_path = get_db_path()
102
  patient_num = format_bd_number(patient_num)
103
+
104
  async with aiosqlite.connect(db_path) as db:
105
  db.row_factory = aiosqlite.Row
106
+
107
  cursor = await db.execute("""
108
  SELECT * FROM patients
109
  WHERE patient_num = ?
110
  ORDER BY visiting_date ASC
111
  """, (patient_num,))
112
+
113
  rows = await cursor.fetchall()
114
+
115
  if not rows:
116
  return json.dumps({
117
  "success": False,
118
  "message": "No appointments found for this phone number.",
119
  "data": []
120
  })
121
+
122
  return json.dumps({
123
  "success": True,
124
  "count": len(rows),
 
129
  async def book_appointment(doctor_id: int, patient_name: str, patient_age: str, patient_num: str, visiting_date: str) -> str:
130
  """
131
  Book a doctor appointment and save it to the patients table.
132
+
133
  Args:
134
  doctor_id: Doctor's ID from search_doctor results.
135
  patient_name: Full name of the patient.
136
  patient_age: Age of the patient (e.g. "32").
137
  patient_num: Contact phone number of the patient.
138
  visiting_date: Date of visit in YYYY-MM-DD format (e.g. 2025-06-15).
139
+
140
  Returns a booking confirmation with the new record ID.
141
  """
142
  db_path = get_db_path()
143
+
144
  async with aiosqlite.connect(db_path) as db:
145
  db.row_factory = aiosqlite.Row
146
+
147
  patient_num = format_bd_number(patient_num)
148
+
149
  # Verify doctor exists
150
  cursor = await db.execute("SELECT * FROM doctors WHERE id = ?", (doctor_id,))
151
  doctor = await cursor.fetchone()
152
  if not doctor:
153
  return f"No doctor found with ID {doctor_id}. Please search for a doctor first."
154
+
155
  doctor_data = dict(doctor)
156
  doctor_name = doctor_data.get("doctor_name", "Unknown")
157
  doctor_category = doctor_data.get("doctor_category", "Unknown")
158
+
159
  # Check for conflicting booking (same doctor + same date)
160
  cursor = await db.execute(
161
  """SELECT id FROM patients
 
168
  f"A booking for {patient_name} with Dr. {doctor_name} "
169
  f"on {visiting_date} already exists."
170
  )
171
+
172
  # Insert into patients table
173
  cursor = await db.execute(
174
  """INSERT INTO patients (doctor_name, doctor_category, patient_name, patient_age, patient_num, visiting_date)
 
176
  (doctor_name, doctor_category, patient_name, patient_age, patient_num, visiting_date),
177
  )
178
  await db.commit()
179
+
180
  # Send SMS confirmation
181
  sms_message = (
182
  f"✅ Appointment Confirmed!\n"
 
190
  # sms_status = "📱 SMS confirmation sent."
191
  # except Exception as e:
192
  # sms_status = f"⚠️ SMS failed: {str(e)}"
193
+
194
  return (
195
  f"✅ Appointment Booked!\n"
196
  f"━━━━━━━━━━━━━━━━━━━━━━\n"
 
211
  db_path = get_db_path()
212
  # normalize phone number
213
  patient_num = format_bd_number(patient_num)
214
+
215
  async with aiosqlite.connect(db_path) as db:
216
  db.row_factory = aiosqlite.Row
217
+
218
  # check if appointment exists first
219
  cursor = await db.execute("""
220
  SELECT * FROM patients
221
  WHERE patient_num = ?
222
  AND LOWER(doctor_name) = LOWER(?)
223
  """, (patient_num, doctor_name))
224
+
225
  row = await cursor.fetchone()
226
  if not row:
227
  return json.dumps({
228
  "success": False,
229
  "message": "No matching appointment found to delete."
230
  })
231
+
232
  # delete appointment
233
  await db.execute("""
234
  DELETE FROM patients
235
  WHERE patient_num = ?
236
  AND LOWER(doctor_name) = LOWER(?)
237
  """, (patient_num, doctor_name))
238
+
239
  await db.commit()
240
+
241
  return json.dumps({
242
  "success": True,
243
  "message": f"Appointment with Dr. {doctor_name} deleted successfully."
 
252
  self.tools = [search_doctor, book_appointment, get_bd_time, search_appointment_by_phone, delete_appointment]
253
  self.tool_node = ToolNode(self.tools)
254
  self.llm_with_tools = self.llm.bind_tools(self.tools)
255
+
256
  async def async_setup(self):
257
  db_path = os.path.join(os.path.dirname(__file__), "daa.db")
258
  self.conn = await aiosqlite.connect(db_path)
 
260
  await self._create_user_table()
261
  self.graph = self._build_graph()
262
  self.summary_graph = self._build_summary_graph()
263
+
264
  async def _create_user_table(self):
265
  await self.conn.execute("""
266
  CREATE TABLE IF NOT EXISTS userid_threadid (
 
269
  )
270
  """)
271
  await self.conn.commit()
272
+
273
  ######################### SUMMARIZE NODE #########################
274
  async def summarize_conversation(self, state: ChatState):
275
  existing_summary = state.get("summary", "")
 
277
  prompt = (
278
  f"""
279
  You are maintaining a long-term conversation memory for a chatbot.
280
+
281
  Existing summary:
282
  {existing_summary}
283
+
284
  Update and extend the summary using ONLY the new conversation messages above.
285
+
286
  Instructions:
287
  - Preserve important existing context.
288
  - Add new facts, decisions, preferences, goals, issues, and ongoing tasks.
 
299
  else
300
  """
301
  You are creating a long-term conversation memory summary for a chatbot.
302
+
303
  Summarize the conversation above.
304
+
305
  Instructions:
306
  - Capture important user information, goals, preferences, projects, and decisions.
307
  - Include technical issues, debugging progress, and solutions discussed.
 
317
  "summary": response.content,
318
  "messages": [RemoveMessage(id=m.id) for m in messages[:-2]],
319
  }
320
+
321
  async def should_summarize(self, state: ChatState):
322
  if len(state["messages"]) > 10:
323
  return "summarize_node"
324
  return "chat_node"
325
+
326
  ######################### CHAT NODE #########################
327
  async def chat_node(self, state: ChatState):
328
  summary = state.get("summary", "")
329
  messages = state["messages"]
330
+
331
  print('#'*50)
332
  print(">>>>>>>>>> CHAT NODE START <<<<<<<<<<")
333
  if summary:
334
  print(f"[SUMMARY]:\n{summary}\n")
335
  else:
336
  print("[NO SUMMARY YET]\n")
337
+
338
  print('$'*50)
339
  print("[MESSAGES]:")
340
  for m in messages:
341
  role = m.__class__.__name__
342
  print(f" [{role}]: {m.content[:200]}")
343
  print('$'*50,'\n')
344
+
345
  if summary:
346
  summary_message = SystemMessage(
347
  content=(
348
+ "You are a Bangla voice assistant. You are provided with a condensed memory of previous conversations.\n\n"
349
  f"Conversation Memory:\n{summary}\n\n"
350
  "Instructions:\n"
351
+ "- Always respond in Bangla (বাংলা)"
352
+ "- Keep sentences short for speech"
353
+ "- No English unless necessary"
354
  "- Use this memory as long-term conversational context.\n"
355
  "- Maintain continuity with the user's previous discussions, projects, goals, and preferences.\n"
356
  "- Prioritize recent and relevant information when generating responses.\n"
 
366
  print(">>>>>>>>>> CHAT NODE END <<<<<<<<<<")
367
  print('#'*50)
368
  return {"messages": [response]}
369
+
370
  ######################### GRAPH #########################
371
  def _build_graph(self):
372
  g = StateGraph(ChatState)
373
  g.add_node("chat_node", self.chat_node)
374
  g.add_node("tools", self.tool_node)
375
+
376
  g.add_edge(START, "chat_node")
377
  g.add_conditional_edges("chat_node", tools_condition)
378
  g.add_edge("tools", "chat_node")
379
+
380
  return g.compile(checkpointer=self.checkpointer)
381
+
382
  def _build_summary_graph(self):
383
  g = StateGraph(ChatState)
384
  g.add_node("summarize_node", self.summarize_conversation)
385
  g.add_edge(START, "summarize_node")
386
  g.add_edge("summarize_node", END)
387
  return g.compile(checkpointer=self.checkpointer)
388
+
389
  ######################### STREAMING #########################
390
  async def ai_only_stream(self, initial_state: dict, config: dict):
391
  async for message_chunk, metadata in self.graph.astream(initial_state, config=config, stream_mode="messages"):
392
  if isinstance(message_chunk, AIMessage) and message_chunk.content:
393
  yield message_chunk.content
394
+
395
  # Auto Summarization Execute
396
  current_state = await self.graph.aget_state(config)
397
  if len(current_state.values.get("messages", [])) > 10:
 
399
  self.summary_graph.ainvoke(current_state.values, config=config)
400
  )
401
  print('@'*20,'Summarization Execute','@'*20)
402
+
403
  ######################### THREAD ID #########################
404
  @staticmethod
405
  def generate_thread_id() -> str:
406
  return str(uuid.uuid4())
407
+
408
  ######################### RETRIEVE ALL THREADS #########################
409
  async def retrieve_all_threads(self):
410
  all_threads = set()
411
  async for checkpoint in self.checkpointer.alist(None):
412
  all_threads.add(checkpoint.config["configurable"]["thread_id"])
413
  return list(all_threads)
414
+
415
  ######################### MAIN ENTRY POINT #########################
416
  async def main(self, user_id: str, user_query: str):
417
  async with self.conn.execute(
418
  "SELECT userId, threadId FROM userid_threadid WHERE userId = ?", (user_id,)
419
  ) as cursor:
420
  result = await cursor.fetchone()
421
+
422
  if result is None:
423
  thread_id = user_id + self.generate_thread_id()
424
  await self.conn.execute(
 
428
  await self.conn.commit()
429
  else:
430
  thread_id = result[1]
431
+
432
  initial_state = {"messages": [HumanMessage(content=user_query)]}
433
  config = {
434
  "configurable": {"thread_id": thread_id},
435
  "metadata": {"thread_id": thread_id},
436
  "run_name": "chat_turn",
437
  }
438
+ return self.ai_only_stream(initial_state, config)
frontend/index.html CHANGED
@@ -44,4 +44,4 @@
44
  <script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
45
  <script src="script.js"></script>
46
  </body>
47
- </html>
 
44
  <script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
45
  <script src="script.js"></script>
46
  </body>
47
+ </html>
frontend/script.js CHANGED
@@ -1,166 +1,294 @@
1
- const chatBox = document.getElementById("chat-box");
2
- const sendBtn = document.getElementById("send-btn");
3
- const textInput = document.getElementById("text-input");
4
- const micBtn = document.getElementById("mic-btn");
5
-
6
- const userId = "walid";
7
-
8
-
9
- // =======================
10
- // CHAT WEBSOCKET
11
- // =======================
12
-
13
- const chatSocket = new WebSocket("ws://127.0.0.1:8679/ws/chat");
14
-
15
- chatSocket.onmessage = (event) => {
16
-
17
- const data = event.data;
18
-
19
- if (data === "[[END]]") {
20
- return;
21
- }
22
-
23
- appendMessage(data, "ai");
24
- };
25
-
26
-
27
- sendBtn.onclick = () => {
28
- sendTextMessage();
29
- };
30
-
31
- textInput.addEventListener("keydown", (e) => {
32
- if (e.key === "Enter") {
33
- sendTextMessage();
34
- }
 
 
35
  });
36
 
37
-
38
  function sendTextMessage() {
 
 
 
 
 
 
39
 
40
- const message = textInput.value.trim();
41
-
42
- if (!message) return;
43
-
44
- appendMessage(message, "user");
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
- chatSocket.send(JSON.stringify({
47
- user_id: userId,
48
- user_query: message
49
- }));
50
 
51
- textInput.value = "";
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  }
53
 
 
 
 
 
 
54
 
55
- // =======================
56
- // VOICE WEBSOCKET
57
- // =======================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
- const voiceSocket = new WebSocket("ws://127.0.0.1:8679/ws/voice");
 
 
 
 
60
 
61
- voiceSocket.binaryType = "arraybuffer";
62
 
63
- let mediaRecorder;
64
- let audioChunks = [];
65
- let isRecording = false;
66
 
67
- voiceSocket.onmessage = async (event) => {
 
 
68
 
69
- // TEXT MESSAGE
70
- if (typeof event.data === "string") {
 
71
 
72
- const text = event.data;
 
73
 
74
- if (text.startsWith("[STT]:")) {
75
- appendMessage("🎤 " + text.replace("[STT]:", ""), "user");
76
- }
 
77
 
78
- else if (text.startsWith("[LLM]:")) {
79
- appendMessage(
80
- text.replace("[LLM]:", ""),
81
- "ai"
82
- );
83
- }
84
 
85
- return;
 
 
 
86
  }
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
- // AUDIO MESSAGE
89
- const audioBlob = new Blob([event.data], { type: "audio/mp3" });
90
-
91
- const audioUrl = URL.createObjectURL(audioBlob);
92
 
93
- const audio = new Audio(audioUrl);
 
 
94
 
95
- audio.play();
96
- };
97
 
 
 
 
98
 
99
- micBtn.onclick = async () => {
 
 
 
 
100
 
101
- if (!isRecording) {
102
- startRecording();
 
103
  } else {
104
- stopRecording();
 
 
105
  }
106
- };
107
-
108
-
109
- async function startRecording() {
110
-
111
- const stream = await navigator.mediaDevices.getUserMedia({
112
- audio: true
113
- });
114
-
115
- mediaRecorder = new MediaRecorder(stream, {
116
- mimeType: "audio/webm"
117
- });
118
-
119
- mediaRecorder.start(250);
120
-
121
- mediaRecorder.ondataavailable = async (event) => {
122
-
123
- if (event.data.size > 0 &&
124
- voiceSocket.readyState === WebSocket.OPEN) {
125
 
126
- const arrayBuffer = await event.data.arrayBuffer();
 
127
 
128
- voiceSocket.send(arrayBuffer);
129
- }
 
 
 
 
130
  };
131
-
132
- isRecording = true;
133
-
134
- micBtn.innerText = "⏹ Stop Voice";
135
- micBtn.classList.add("recording");
136
  }
137
 
138
-
139
- function stopRecording() {
140
-
141
- mediaRecorder.stop();
142
-
143
- isRecording = false;
144
-
145
- micBtn.innerText = "🎤 Start Voice";
146
- micBtn.classList.remove("recording");
 
147
  }
148
 
149
-
150
- // =======================
151
- // UI
152
- // =======================
153
-
154
  function appendMessage(text, sender) {
155
-
156
- const div = document.createElement("div");
157
-
158
- div.classList.add("message");
159
- div.classList.add(sender);
160
-
161
- div.innerHTML = marked.parse(text);
162
-
163
- chatBox.appendChild(div);
164
-
165
- chatBox.scrollTop = chatBox.scrollHeight;
166
- }
 
1
+ const chatBox = document.getElementById('chat-box');
2
+ const sendBtn = document.getElementById('send-btn');
3
+ const textInput = document.getElementById('text-input');
4
+ const micBtn = document.getElementById('mic-btn');
5
+
6
+ const userId = 'walid';
7
+
8
+ // ── WebSockets ────────────────────────────────────────────────────────────────
9
+ const chatSocket = new WebSocket('ws://127.0.0.1:8679/ws/chat');
10
+ const voiceSocket = new WebSocket('ws://127.0.0.1:8679/ws/voice');
11
+ voiceSocket.binaryType = 'arraybuffer';
12
+
13
+ // ── State ─────────────────────────────────────────────────────────────────────
14
+ let micStream = null;
15
+ let audioContext = null;
16
+ let analyser = null;
17
+ let mediaRecorder = null;
18
+ let audioChunks = [];
19
+ let isListening = false;
20
+ let isSpeaking = false;
21
+ let silenceTimer = null;
22
+ let vadInterval = null;
23
+ let isProcessing = false; // true while server is processing an utterance
24
+
25
+ let currentAIMessage = null;
26
+ let playbackChain = Promise.resolve();
27
+
28
+ // ── VAD config ────────────────────────────────────────────────────────────────
29
+ const SILENCE_THRESHOLD_DB = -45; // dBFS; lower = more sensitive
30
+ const SILENCE_TIMEOUT_MS = 3000; // 3 s silence → send utterance
31
+ const VAD_POLL_MS = 100;
32
+
33
+ // ── Text chat ─────────────────────────────────────────────────────────────────
34
+ sendBtn.onclick = sendTextMessage;
35
+ textInput.addEventListener('keydown', (e) => {
36
+ if (e.key === 'Enter') sendTextMessage();
37
  });
38
 
 
39
  function sendTextMessage() {
40
+ const msg = textInput.value.trim();
41
+ if (!msg) return;
42
+ appendMessage(msg, 'user');
43
+ chatSocket.send(JSON.stringify({ user_id: userId, user_query: msg }));
44
+ textInput.value = '';
45
+ }
46
 
47
+ // Chat WS now sends JSON: {"type":"chat","text":"..."} or {"type":"end"}
48
+ chatSocket.onmessage = (e) => {
49
+ let msg;
50
+ try {
51
+ msg = JSON.parse(e.data);
52
+ } catch {
53
+ return;
54
+ }
55
+ if (msg.type === 'chat' && msg.text) appendMessage(msg.text, 'ai');
56
+ if (msg.type === 'error') appendMessage('⚠️ ' + msg.text, 'system');
57
+ // 'end' — nothing to do for text chat
58
+ };
59
+ chatSocket.onerror = (e) => console.error('Chat WS error:', e);
60
+ chatSocket.onclose = () => console.log('Chat WS closed');
61
+
62
+ // ── Voice WebSocket events ────────────────────────────────────────────────────
63
+ voiceSocket.onopen = () => console.log('[WS] Voice connected');
64
+ voiceSocket.onclose = () => {
65
+ console.log('[WS] Voice closed');
66
+ stopListening();
67
+ };
68
+ voiceSocket.onerror = (e) => console.error('[WS] Voice error:', e);
69
+
70
+ voiceSocket.onmessage = (event) => {
71
+ // Binary → audio playback
72
+ if (event.data instanceof ArrayBuffer) {
73
+ enqueueAudio(event.data);
74
+ return;
75
+ }
76
+
77
+ let msg;
78
+ try {
79
+ msg = JSON.parse(event.data);
80
+ } catch {
81
+ return;
82
+ }
83
+
84
+ switch (msg.type) {
85
+ case 'stt':
86
+ // Show transcribed Bangla text as user bubble
87
+ appendMessage('🎤 ' + msg.text, 'user');
88
+ currentAIMessage = null;
89
+ break;
90
+
91
+ case 'llm_token':
92
+ // Stream AI tokens into growing bubble
93
+ if (!currentAIMessage) currentAIMessage = appendMessage('', 'ai');
94
+ currentAIMessage.textContent += msg.token;
95
+ chatBox.scrollTop = chatBox.scrollHeight;
96
+ break;
97
+
98
+ case 'end':
99
+ // Server finished this turn → resume VAD listening
100
+ currentAIMessage = null;
101
+ isProcessing = false;
102
+ if (isListening) setMicStatus('listening');
103
+ break;
104
+
105
+ case 'error':
106
+ appendMessage('⚠️ ' + msg.text, 'system');
107
+ // Still need to reset so VAD resumes
108
+ isProcessing = false;
109
+ if (isListening) setMicStatus('listening');
110
+ break;
111
+
112
+ case 'pong':
113
+ break;
114
+
115
+ default:
116
+ console.log('[WS] Unknown msg:', msg.type);
117
+ }
118
+ };
119
 
120
+ // ── Audio playback: sequential, no overlap ────────────────────────────────────
121
+ function enqueueAudio(buffer) {
122
+ playbackChain = playbackChain.then(() => playBuffer(buffer));
123
+ }
124
 
125
+ function playBuffer(buffer) {
126
+ return new Promise((resolve) => {
127
+ const blob = new Blob([buffer], { type: 'audio/mpeg' });
128
+ const url = URL.createObjectURL(blob);
129
+ const audio = new Audio(url);
130
+ const done = () => {
131
+ URL.revokeObjectURL(url);
132
+ resolve();
133
+ };
134
+ audio.onended = done;
135
+ audio.onerror = () => {
136
+ console.warn('[AUDIO] playback error');
137
+ done();
138
+ };
139
+ audio.play().catch(() => done());
140
+ });
141
  }
142
 
143
+ // ── Mic button ────────────────────────────────────────────────────────────────
144
+ micBtn.onclick = async () => {
145
+ if (!isListening) await startListening();
146
+ else stopListening();
147
+ };
148
 
149
+ // ── Start continuous listening with VAD ───────────────────────────────────────
150
+ async function startListening() {
151
+ try {
152
+ micStream = await navigator.mediaDevices.getUserMedia({
153
+ audio: {
154
+ echoCancellation: true,
155
+ noiseSuppression: true,
156
+ autoGainControl: true,
157
+ channelCount: 1,
158
+ sampleRate: 16000,
159
+ },
160
+ });
161
+ } catch (e) {
162
+ console.error('Mic error:', e);
163
+ appendMessage('⚠️ Microphone access denied.', 'system');
164
+ return;
165
+ }
166
+
167
+ audioContext = new AudioContext();
168
+ const source = audioContext.createMediaStreamSource(micStream);
169
+ analyser = audioContext.createAnalyser();
170
+ analyser.fftSize = 512;
171
+ source.connect(analyser);
172
+
173
+ isListening = true;
174
+ setMicStatus('listening');
175
+
176
+ vadInterval = setInterval(vadTick, VAD_POLL_MS);
177
+ }
178
 
179
+ // ── Stop everything ───────────────────────────────────────────────────────────
180
+ function stopListening() {
181
+ clearInterval(vadInterval);
182
+ clearTimeout(silenceTimer);
183
+ vadInterval = silenceTimer = null;
184
 
185
+ if (isSpeaking) stopRecorder(true); // discard in-progress utterance
186
 
187
+ micStream?.getTracks().forEach((t) => t.stop());
188
+ audioContext?.close();
189
+ micStream = audioContext = analyser = null;
190
 
191
+ isSpeaking = isListening = isProcessing = false;
192
+ setMicStatus('off');
193
+ }
194
 
195
+ // ── VAD polling ───────────────────────────────────────────────────────────────
196
+ function vadTick() {
197
+ if (!analyser || isProcessing) return;
198
 
199
+ const data = new Float32Array(analyser.frequencyBinCount);
200
+ analyser.getFloatTimeDomainData(data);
201
 
202
+ // RMS → dBFS
203
+ const rms = Math.sqrt(data.reduce((s, v) => s + v * v, 0) / data.length);
204
+ const db = rms > 0 ? 20 * Math.log10(rms) : -Infinity;
205
+ const speaking = db > SILENCE_THRESHOLD_DB;
206
 
207
+ if (speaking) {
208
+ clearTimeout(silenceTimer);
209
+ silenceTimer = null;
 
 
 
210
 
211
+ if (!isSpeaking) {
212
+ isSpeaking = true;
213
+ startRecorder();
214
+ setMicStatus('recording');
215
  }
216
+ } else {
217
+ if (isSpeaking && !silenceTimer) {
218
+ silenceTimer = setTimeout(() => {
219
+ silenceTimer = null;
220
+ isSpeaking = false;
221
+ isProcessing = true;
222
+ stopRecorder(false); // send the utterance
223
+ setMicStatus('processing');
224
+ }, SILENCE_TIMEOUT_MS);
225
+ }
226
+ }
227
+ }
228
 
229
+ // ── Recorder ──────────────────────────────────────────────────────────────────
230
+ function startRecorder() {
231
+ if (!micStream) return;
232
+ audioChunks = [];
233
 
234
+ const mimeType = MediaRecorder.isTypeSupported('audio/webm;codecs=opus')
235
+ ? 'audio/webm;codecs=opus'
236
+ : 'audio/webm';
237
 
238
+ mediaRecorder = new MediaRecorder(micStream, { mimeType });
 
239
 
240
+ mediaRecorder.ondataavailable = (e) => {
241
+ if (e.data.size > 0) audioChunks.push(e.data);
242
+ };
243
 
244
+ mediaRecorder.onstop = async () => {
245
+ if (!audioChunks.length) return;
246
+ const blob = new Blob(audioChunks, { type: mimeType });
247
+ const buffer = await blob.arrayBuffer();
248
+ audioChunks = [];
249
 
250
+ if (voiceSocket.readyState === WebSocket.OPEN) {
251
+ console.log(`[VAD] Sending utterance: ${buffer.byteLength} bytes`);
252
+ voiceSocket.send(buffer);
253
  } else {
254
+ console.warn('[VAD] WS not open, utterance discarded');
255
+ isProcessing = false;
256
+ if (isListening) setMicStatus('listening');
257
  }
258
+ };
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
 
260
+ mediaRecorder.start();
261
+ }
262
 
263
+ function stopRecorder(discard = false) {
264
+ if (!mediaRecorder || mediaRecorder.state === 'inactive') return;
265
+ if (discard) {
266
+ mediaRecorder.ondataavailable = () => {};
267
+ mediaRecorder.onstop = () => {
268
+ audioChunks = [];
269
  };
270
+ }
271
+ mediaRecorder.stop();
272
+ mediaRecorder = null;
 
 
273
  }
274
 
275
+ // ── UI ────────────────────────────────────────────────────────────────────────
276
+ function setMicStatus(state) {
277
+ const labels = {
278
+ off: '🎤 Start Voice',
279
+ listening: '🟢 Listening… (click to stop)',
280
+ recording: '🔴 Speaking…',
281
+ processing: '⏳ Processing…',
282
+ };
283
+ micBtn.innerText = labels[state] ?? '🎤 Start Voice';
284
+ micBtn.className = state === 'off' ? '' : `mic-${state}`;
285
  }
286
 
 
 
 
 
 
287
  function appendMessage(text, sender) {
288
+ const div = document.createElement('div');
289
+ div.className = `message ${sender}`;
290
+ div.textContent = text;
291
+ chatBox.appendChild(div);
292
+ chatBox.scrollTop = chatBox.scrollHeight;
293
+ return div;
294
+ }
 
 
 
 
 
frontend/style.css CHANGED
@@ -149,4 +149,4 @@ button:hover {
149
  flex-direction: column;
150
  padding: 10px;
151
  gap: 6px;
152
- }
 
149
  flex-direction: column;
150
  padding: 10px;
151
  gap: 6px;
152
+ }
requirements.txt CHANGED
@@ -1,35 +1,60 @@
 
1
  python-dotenv
 
 
 
 
 
 
 
2
  fastapi
3
  uvicorn
4
- requests
 
 
 
 
 
5
  langchain
6
- langchain-chroma
7
- langchain-classic
8
- langchain-community
9
  langchain-core
 
10
  langchain-experimental
11
- langchain-google-genai
 
 
 
 
 
 
12
  langchain-huggingface
 
13
  langchain-mcp-adapters
14
- langchain-ollama
15
- langchain-openai
16
- langchain-protocol
17
- langchain-text-splitters
18
  langgraph
 
 
19
  langgraph-checkpoint
20
  langgraph-checkpoint-sqlite
21
- langgraph-prebuilt
22
- langgraph-sdk
23
  langsmith
24
- aiosqlite
25
- colorama
 
26
  faster-whisper
 
 
 
 
 
 
 
 
 
 
27
  mcp
28
- numpy
29
- ollama
30
- pydantic
31
- twilio
32
- uuid_utils
33
- uv
34
- uvicorn
35
 
 
 
 
 
1
+ # ===== Core =====
2
  python-dotenv
3
+ requests
4
+ numpy
5
+ pydantic
6
+ colorama
7
+ uuid_utils
8
+
9
+ # ===== Web Framework =====
10
  fastapi
11
  uvicorn
12
+ websockets
13
+
14
+ # ===== Async / DB =====
15
+ aiosqlite
16
+
17
+ # ===== LangChain Ecosystem =====
18
  langchain
 
 
 
19
  langchain-core
20
+ langchain-community
21
  langchain-experimental
22
+ langchain-text-splitters
23
+ langchain-chroma
24
+ langchain-classic
25
+ langchain-protocol
26
+
27
+ langchain-openai
28
+ langchain-ollama
29
  langchain-huggingface
30
+ langchain-google-genai
31
  langchain-mcp-adapters
32
+
33
+ # ===== LangGraph Ecosystem =====
 
 
34
  langgraph
35
+ langgraph-sdk
36
+ langgraph-prebuilt
37
  langgraph-checkpoint
38
  langgraph-checkpoint-sqlite
39
+
40
+ # ===== Observability / Tracking =====
41
  langsmith
42
+
43
+ # ===== AI / LLM / STT / TTS =====
44
+ ollama
45
  faster-whisper
46
+ edge-tts
47
+ google-generativeai
48
+
49
+ # ===== Audio / Media =====
50
+ pydub
51
+ Pillow
52
+
53
+
54
+
55
+ # ===== MCP =====
56
  mcp
 
 
 
 
 
 
 
57
 
58
+ # ===== Utility =====
59
+ uv
60
+ pytz
services/streaming.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import asyncio
2
+ # import edge_tts
3
+
4
+ # # ── Voice ─────────────────────────────────────────────────────────────────────
5
+ # VOICE = "bn-BD-NabanitaNeural"
6
+
7
+ # # Flush when buffer reaches this many characters (even without punctuation)
8
+ # FLUSH_LEN = 50
9
+
10
+ # # Don't send a TTS request for fewer than this many characters
11
+ # MIN_CHARS = 3
12
+
13
+ # # Punctuation marks that trigger an immediate flush
14
+ # FLUSH_TRIGGERS = frozenset(".!?।,;:\n—–")
15
+
16
+
17
+ # class ParallelTTSStreamer:
18
+ # """
19
+ # Collects LLM tokens, splits them into prosodic chunks, and converts each
20
+ # chunk to audio via edge-tts.
21
+
22
+ # FIX: Audio chunks are now guaranteed to arrive IN ORDER by chaining each
23
+ # TTS task so it only writes to the queue after the previous task finishes.
24
+ # This prevents audio chunks from chunk-2 overtaking chunk-1 during playback.
25
+ # """
26
+
27
+ # def __init__(self, voice: str = VOICE):
28
+ # self.voice = voice
29
+ # self.buffer = ""
30
+ # self.queue: asyncio.Queue[bytes | None] = asyncio.Queue()
31
+ # # Tracks the last scheduled task so each new task waits for it first
32
+ # self._prev_task: asyncio.Task | None = None
33
+ # self._flush_lock = asyncio.Lock()
34
+
35
+ # async def add_token(self, token: str) -> None:
36
+ # """Feed a single LLM output token into the streamer."""
37
+ # if not token:
38
+ # return
39
+
40
+ # self.buffer += token
41
+
42
+ # should_flush = (
43
+ # any(ch in FLUSH_TRIGGERS for ch in token)
44
+ # or len(self.buffer) >= FLUSH_LEN
45
+ # )
46
+
47
+ # if should_flush:
48
+ # await self._schedule_flush()
49
+
50
+ # async def _schedule_flush(self) -> None:
51
+ # """Snapshot the buffer and schedule an ordered TTS task."""
52
+ # async with self._flush_lock:
53
+ # text = self.buffer.strip()
54
+ # self.buffer = ""
55
+
56
+ # if len(text) < MIN_CHARS:
57
+ # return
58
+
59
+ # # Each task waits for the previous one before pushing to queue,
60
+ # # guaranteeing in-order audio delivery.
61
+ # prev = self._prev_task
62
+ # task = asyncio.create_task(self._tts_ordered(text, prev))
63
+ # self._prev_task = task
64
+
65
+ # async def _tts_ordered(self, text: str, wait_for: asyncio.Task | None) -> None:
66
+ # """
67
+ # 1. First synthesise audio bytes (can run in parallel with other chunks).
68
+ # 2. Then wait for the previous chunk to finish writing to queue.
69
+ # 3. Then push our bytes to the queue in order.
70
+ # """
71
+ # # Step 1: synthesise concurrently (no queue writes yet)
72
+ # audio_chunks: list[bytes] = []
73
+ # try:
74
+ # communicate = edge_tts.Communicate(text, self.voice)
75
+ # async for chunk in communicate.stream():
76
+ # if chunk["type"] == "audio":
77
+ # audio_chunks.append(chunk["data"])
78
+ # except Exception as e:
79
+ # print(f"[TTS] edge-tts error for '{text[:30]}…': {e}")
80
+ # # Still need to chain — wait for prev even on error
81
+ # if wait_for and not wait_for.done():
82
+ # await wait_for
83
+ # return
84
+
85
+ # # Step 2: wait for the previous chunk to have finished queuing
86
+ # if wait_for and not wait_for.done():
87
+ # try:
88
+ # await wait_for
89
+ # except Exception:
90
+ # pass
91
+
92
+ # # Step 3: push our audio bytes in order
93
+ # for data in audio_chunks:
94
+ # await self.queue.put(data)
95
+
96
+ # async def flush(self) -> None:
97
+ # """
98
+ # Flush remaining buffer, wait for all in-flight TTS tasks, then
99
+ # signal end-of-stream with sentinel None.
100
+ # """
101
+ # await self._schedule_flush()
102
+
103
+ # # Wait for the last chained task (which transitively waits for all)
104
+ # if self._prev_task:
105
+ # try:
106
+ # await self._prev_task
107
+ # except Exception:
108
+ # pass
109
+
110
+ # await self.queue.put(None)
111
+
112
+ # async def stream_audio(self):
113
+ # """
114
+ # Async generator that yields audio bytes in order.
115
+ # Stops when the sentinel None is received.
116
+ # """
117
+ # while True:
118
+ # chunk = await self.queue.get()
119
+ # if chunk is None:
120
+ # break
121
+ # yield chunk
122
+
123
+
124
+
125
+
126
+
127
+
128
+
129
+
130
+
131
+
132
+
133
+
134
+
135
+
136
+
137
+
138
+
139
+
140
+
141
+
142
+
143
+
144
+
145
+
146
+
147
+
148
+
149
+
150
+
151
+
152
+
153
+
154
+ import re
155
+ import asyncio
156
+ import edge_tts
157
+
158
+ VOICE = "bn-BD-NabanitaNeural"
159
+ FLUSH_LEN = 80 # chars before forced flush (longer = more natural speech)
160
+ MIN_CHARS = 5 # don't TTS tiny fragments
161
+ FLUSH_TRIGGERS = frozenset(".!?।,;:\n—–")
162
+
163
+
164
+ def _clean_for_tts(text: str) -> str:
165
+ """
166
+ Strip markdown and other non-speech symbols before sending to edge-tts.
167
+
168
+ The LLM outputs markdown (**, *, #, -, numbers+dot lists) which edge-tts
169
+ either reads aloud awkwardly ("asterisk asterisk") or returns 'No audio
170
+ was received' on punctuation-only chunks like '**' or '-)'.
171
+ """
172
+ # Remove bold/italic markers
173
+ text = re.sub(r'\*{1,3}', '', text)
174
+ # Remove heading markers
175
+ text = re.sub(r'#+\s*', '', text)
176
+ # Remove list markers like "১.", "1.", "-", "•"
177
+ text = re.sub(r'^\s*[-•]\s*', '', text, flags=re.MULTILINE)
178
+ text = re.sub(r'^\s*[\d০-৯]+[.)]\s*', '', text, flags=re.MULTILINE)
179
+ # Remove leftover backticks
180
+ text = re.sub(r'`+', '', text)
181
+ # Collapse extra whitespace / blank lines
182
+ text = re.sub(r'\n{2,}', '\n', text)
183
+ text = text.strip()
184
+ return text
185
+
186
+
187
+ class ParallelTTSStreamer:
188
+ """
189
+ Collects LLM tokens → splits into prosodic chunks → converts to audio
190
+ via edge-tts in parallel → streams audio bytes IN ORDER.
191
+
192
+ Audio ordering is guaranteed by a task chain: each chunk task synthesises
193
+ audio freely (parallel) but only writes to the queue after the previous
194
+ chunk finishes, so the client always hears chunk-1 before chunk-2.
195
+ """
196
+
197
+ def __init__(self, voice: str = VOICE):
198
+ self.voice = voice
199
+ self.buffer = ""
200
+ self.queue: asyncio.Queue[bytes | None] = asyncio.Queue()
201
+ self._prev_task: asyncio.Task | None = None
202
+ self._flush_lock = asyncio.Lock()
203
+
204
+ async def add_token(self, token: str) -> None:
205
+ if not token:
206
+ return
207
+ self.buffer += token
208
+ if any(ch in FLUSH_TRIGGERS for ch in token) or len(self.buffer) >= FLUSH_LEN:
209
+ await self._schedule_flush()
210
+
211
+ async def _schedule_flush(self) -> None:
212
+ async with self._flush_lock:
213
+ raw = self.buffer.strip()
214
+ self.buffer = ""
215
+
216
+ text = _clean_for_tts(raw)
217
+ if len(text) < MIN_CHARS:
218
+ return
219
+
220
+ prev = self._prev_task
221
+ task = asyncio.create_task(self._tts_ordered(text, prev))
222
+ self._prev_task = task
223
+
224
+ async def _tts_ordered(self, text: str, wait_for: asyncio.Task | None) -> None:
225
+ """Synthesise audio (parallel), then write to queue in order."""
226
+ # Step 1 — synthesise concurrently
227
+ audio_chunks: list[bytes] = []
228
+ try:
229
+ communicate = edge_tts.Communicate(text, self.voice)
230
+ async for chunk in communicate.stream():
231
+ if chunk["type"] == "audio":
232
+ audio_chunks.append(chunk["data"])
233
+ except Exception as e:
234
+ print(f"[TTS] edge-tts error for '{text[:40]}': {e}")
235
+
236
+ # Step 2 — wait for previous chunk to finish queuing
237
+ if wait_for and not wait_for.done():
238
+ try:
239
+ await wait_for
240
+ except Exception:
241
+ pass
242
+
243
+ # Step 3 — write to queue in order
244
+ for data in audio_chunks:
245
+ await self.queue.put(data)
246
+
247
+ async def flush(self) -> None:
248
+ """Flush remaining buffer, await all tasks, send end sentinel."""
249
+ await self._schedule_flush()
250
+ if self._prev_task:
251
+ try:
252
+ await self._prev_task
253
+ except Exception:
254
+ pass
255
+ await self.queue.put(None)
256
+
257
+ async def stream_audio(self):
258
+ while True:
259
+ chunk = await self.queue.get()
260
+ if chunk is None:
261
+ break
262
+ yield chunk
services/stt.py CHANGED
@@ -1,73 +1,148 @@
1
- # from faster_whisper import WhisperModel
2
- # import tempfile
3
-
4
- # model = WhisperModel("small", device="cpu", compute_type="int8")
5
- # class StreamingSTT:
6
- # def __init__(self):
7
- # self.audio_buffer = bytearray()
8
-
9
- # def add_audio(self, chunk: bytes):
10
- # self.audio_buffer.extend(chunk)
11
-
12
- # def transcribe_if_ready(self):
13
- # # simple chunk trigger (1.5–3 sec buffer recommended)
14
- # if len(self.audio_buffer) < 48000 * 2 * 2:
15
- # return None
16
-
17
- # with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as f:
18
- # f.write(self.audio_buffer)
19
- # f.flush()
20
- # segments, _ = model.transcribe(f.name, language="bn", task="translate", beam_size=1)
21
- # text = " ".join([s.text for s in segments])
22
-
23
- # self.audio_buffer.clear()
24
- # return text
25
 
 
 
 
 
26
 
 
27
 
28
 
29
- from faster_whisper import WhisperModel
30
- import tempfile
31
 
32
- model = WhisperModel(
33
- "small",
34
- device="cpu",
35
- compute_type="int8"
36
- )
37
 
38
- class StreamingSTT:
39
 
40
- def __init__(self):
41
- self.audio_buffer = bytearray()
 
 
 
 
 
42
 
43
- def add_audio(self, chunk: bytes):
44
- self.audio_buffer.extend(chunk)
45
 
46
- def transcribe_if_ready(self):
47
- # wait enough audio
48
- if len(self.audio_buffer) < 50000:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  return None
50
 
51
- # SAVE AS WEBM
52
- with tempfile.NamedTemporaryFile(
53
- suffix=".webm",
54
- delete=True
55
- ) as f:
56
 
57
- f.write(self.audio_buffer)
58
- f.flush()
59
 
60
- segments, _ = model.transcribe(
61
- f.name,
62
- language="bn",
63
- beam_size=1
64
- )
65
 
66
- text = " ".join(
67
- [segment.text for segment in segments]
 
 
 
 
 
 
 
 
 
 
68
  )
69
-
70
-
71
- self.audio_buffer.clear()
72
 
73
- return text.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
 
2
+ import os
3
+ import re
4
+ import subprocess
5
+ import tempfile
6
 
7
+ from faster_whisper import WhisperModel
8
 
9
 
10
+ model = WhisperModel("large-v3", device="cuda", compute_type="int8_float32")
 
11
 
 
 
 
 
 
12
 
13
+ BANGLA_PATTERN = re.compile(r'[\u0980-\u09FF]')
14
 
15
+ # Scripts we consider "wrong" — Arabic, Urdu, Devanagari (when expecting Bangla)
16
+ WRONG_SCRIPT_PATTERN = re.compile(
17
+ r'[\u0600-\u06FF' # Arabic / Urdu
18
+ r'\u0750-\u077F' # Arabic Supplement
19
+ r'\uFB50-\uFDFF' # Arabic Presentation Forms
20
+ r'\uFE70-\uFEFF]' # Arabic Presentation Forms-B
21
+ )
22
 
 
 
23
 
24
+ def _is_valid_bangla(text: str) -> bool:
25
+ """
26
+ Return True if the transcript looks like real Bangla.
27
+
28
+ A valid transcript must:
29
+ 1. Contain at least one Bangla Unicode character, OR be very short
30
+ (some valid responses are single digits/punctuation).
31
+ 2. NOT be dominated by Arabic/Urdu script (Whisper wrong-script error).
32
+ """
33
+ bangla_chars = len(BANGLA_PATTERN.findall(text))
34
+ wrong_chars = len(WRONG_SCRIPT_PATTERN.findall(text))
35
+ total_alpha = sum(1 for c in text if c.isalpha())
36
+
37
+ if total_alpha == 0:
38
+ return True # digits/punctuation only — let it through
39
+
40
+ # If more than 30% of alphabetic chars are Arabic/Urdu script, reject
41
+ if total_alpha > 0 and (wrong_chars / total_alpha) > 0.30:
42
+ return False
43
+
44
+ # Must have at least some Bangla characters for long responses
45
+ if total_alpha > 5 and bangla_chars == 0:
46
+ return False
47
+
48
+ return True
49
+
50
+
51
+ class STTProcessor:
52
+ MIN_INPUT_BYTES = 3_000
53
+
54
+ def _to_wav(self, audio_bytes: bytes) -> str | None:
55
+ """Convert browser WebM/opus to 16 kHz mono WAV with loudness normalization."""
56
+ in_path = out_path = None
57
+ try:
58
+ with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as f:
59
+ f.write(audio_bytes)
60
+ in_path = f.name
61
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
62
+ out_path = f.name
63
+
64
+ result = subprocess.run([
65
+ "ffmpeg", "-y", "-loglevel", "warning",
66
+ "-i", in_path,
67
+ "-ar", "16000", "-ac", "1",
68
+ "-af", "loudnorm",
69
+ "-f", "wav", out_path,
70
+ ], stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
71
+
72
+ if result.returncode != 0:
73
+ print("[STT] ffmpeg error:", result.stderr.decode(errors="replace").strip())
74
+ return None
75
+ if not os.path.exists(out_path) or os.path.getsize(out_path) < 500:
76
+ print("[STT] ffmpeg produced empty WAV.")
77
+ return None
78
+
79
+ print(f"[STT] WAV ready: {os.path.getsize(out_path):,} bytes")
80
+ return out_path
81
+ except Exception as e:
82
+ print(f"[STT] _to_wav: {e}")
83
+ return None
84
+ finally:
85
+ if in_path and os.path.exists(in_path):
86
+ try: os.remove(in_path)
87
+ except OSError: pass
88
+
89
+ def transcribe(self, audio_bytes: bytes) -> str | None:
90
+ if len(audio_bytes) < self.MIN_INPUT_BYTES:
91
+ print(f"[STT] Too short ({len(audio_bytes)} B), skipping.")
92
  return None
93
 
94
+ wav_path = None
95
+ try:
96
+ wav_path = self._to_wav(audio_bytes)
97
+ if not wav_path:
98
+ return None
99
 
 
 
100
 
101
+ # segments, info = model.transcribe(wav_path, language="bn", task="translate", beam_size=5)
 
 
 
 
102
 
103
+ segments, info = model.transcribe(
104
+ wav_path,
105
+ language="bn",
106
+ beam_size=5,
107
+ vad_filter=False, # loudnorm handles quiet audio
108
+ condition_on_previous_text=False,
109
+ temperature=0,
110
+ suppress_tokens=[-1],
111
+ no_speech_threshold=0.5,
112
+ log_prob_threshold=-1.0,
113
+ # task="translate"
114
+ # NO initial_prompt — causes hallucination loops on base model
115
  )
 
 
 
116
 
117
+ text = " ".join(seg.text.strip() for seg in segments).strip()
118
+ print(f"[STT] Lang={info.language} prob={info.language_probability:.2f}")
119
+
120
+ if not text:
121
+ print("[STT] Empty transcript.")
122
+ return None
123
+
124
+ # ── Hallucination guard: repeated words ───────────────────────────
125
+ words = text.split()
126
+ if len(words) > 5 and (len(set(words)) / len(words)) < 0.25:
127
+ print(f"[STT] Hallucination (repetition) discarded: {text[:60]}")
128
+ return None
129
+
130
+ # ── Script validation: must be Bangla Unicode ─────────────────────
131
+ if not _is_valid_bangla(text):
132
+ print(f"[STT] Wrong script (Arabic/Urdu output from base model) "
133
+ f"discarded: {text[:60]}")
134
+ print("[STT] ⚠ If this keeps happening, ensure you're using "
135
+ "model='small' not 'base'.")
136
+ return None
137
+
138
+ print(f"[STT] Transcript: {text}")
139
+ return text
140
+
141
+ except Exception as e:
142
+ print(f"[STT] transcribe: {e}")
143
+ import traceback; traceback.print_exc()
144
+ return None
145
+ finally:
146
+ if wav_path and os.path.exists(wav_path):
147
+ try: os.remove(wav_path)
148
+ except OSError: pass
services/tts.py CHANGED
@@ -1,12 +1,29 @@
 
1
  import edge_tts
2
- import asyncio
3
- import tempfile
4
 
5
- VOICE = "en-US-AriaNeural"
 
 
 
 
 
 
 
 
 
 
6
 
7
- async def text_to_speech_stream(text: str):
8
- communicate = edge_tts.Communicate(text, VOICE)
 
 
 
 
9
 
10
- async for chunk in communicate.stream():
11
- if chunk["type"] == "audio":
12
- yield chunk["data"]
 
 
 
 
 
1
+
2
  import edge_tts
 
 
3
 
4
+ VOICE = "bn-BD-NabanitaNeural"
5
+
6
+
7
+ async def text_to_speech_stream(text: str, voice: str = VOICE):
8
+ """
9
+ Async generator that converts *text* to Bangla audio and yields
10
+ raw MP3 bytes chunk-by-chunk as they arrive from edge-tts.
11
+
12
+ Args:
13
+ text: The Bangla (or mixed) text to synthesise.
14
+ voice: edge-tts voice name. Defaults to bn-BD-NabanitaNeural.
15
 
16
+ Yields:
17
+ bytes raw MP3 audio data ready to send over WebSocket.
18
+ """
19
+ text = text.strip()
20
+ if not text:
21
+ return
22
 
23
+ try:
24
+ communicate = edge_tts.Communicate(text, voice)
25
+ async for chunk in communicate.stream():
26
+ if chunk["type"] == "audio":
27
+ yield chunk["data"]
28
+ except Exception as e:
29
+ print(f"[TTS] text_to_speech_stream error: {e}")
services/vad.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import webrtcvad
3
+
4
+ class VADDetector:
5
+ def __init__(self, sample_rate=16000, frame_ms=30, aggressiveness=2):
6
+ self.vad = webrtcvad.Vad(aggressiveness)
7
+ self.sample_rate = sample_rate
8
+ self.frame_ms = frame_ms
9
+ self.frame_size = int(sample_rate * frame_ms / 1000) * 2
10
+
11
+ def is_valid(self, frame: bytes):
12
+ return len(frame) == self.frame_size
13
+
14
+ def is_speech(self, frame: bytes) -> bool:
15
+ if not self.is_valid(frame):
16
+ return False
17
+ try:
18
+ return self.vad.is_speech(frame, self.sample_rate)
19
+ except:
20
+ return False
21
+
22
+
23
+ class VADSegmenter:
24
+ def __init__(self, vad: VADDetector, silence_limit=8):
25
+ self.vad = vad
26
+ self.silence_limit = silence_limit
27
+
28
+ self.buffer = bytearray()
29
+ self.silence = 0
30
+ self.active = False
31
+
32
+ def add_frame(self, frame: bytes):
33
+ speech = self.vad.is_speech(frame)
34
+
35
+ if speech:
36
+ self.buffer.extend(frame)
37
+ self.active = True
38
+ self.silence = 0
39
+ else:
40
+ if self.active:
41
+ self.silence += 1
42
+
43
+ if self.active and self.silence > self.silence_limit:
44
+ audio = bytes(self.buffer)
45
+ self.buffer.clear()
46
+ self.silence = 0
47
+ self.active = False
48
+ return audio
49
+
50
+ return None