Commit ·
f84481c
1
Parent(s): 75ee53d
updated voice module
Browse files- app.py +152 -70
- services/streaming.py +35 -181
- services/stt.py +241 -325
- services/tts.py +58 -139
- services/vad.py +18 -14
- services/webrtc_pipeline.py +381 -0
app.py
CHANGED
|
@@ -1,22 +1,31 @@
|
|
| 1 |
"""
|
| 2 |
-
app.py — FastAPI entrypoint
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
─────────────
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
"""
|
| 21 |
|
| 22 |
import asyncio
|
|
@@ -25,8 +34,8 @@ import os
|
|
| 25 |
import uuid
|
| 26 |
from contextlib import asynccontextmanager
|
| 27 |
|
| 28 |
-
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
|
| 29 |
-
from fastapi.responses import FileResponse, HTMLResponse
|
| 30 |
from fastapi.staticfiles import StaticFiles
|
| 31 |
from starlette.websockets import WebSocketState
|
| 32 |
|
|
@@ -34,8 +43,17 @@ from core.backend import AIBackend
|
|
| 34 |
from services.stt import STTProcessor
|
| 35 |
from services.streaming import ParallelTTSStreamer
|
| 36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
# ══════════════════════════════════════════════════════════════════════════════
|
| 38 |
-
# MODEL ROUTING CONFIG
|
| 39 |
# ══════════════════════════════════════════════════════════════════════════════
|
| 40 |
USE_GEMINI = True
|
| 41 |
USE_OLLAMA = False
|
|
@@ -45,28 +63,40 @@ _active = sum([USE_GEMINI, USE_OLLAMA, USE_LOCAL_FALLBACK])
|
|
| 45 |
if _active != 1:
|
| 46 |
raise RuntimeError(
|
| 47 |
f"[CONFIG] Exactly one of USE_GEMINI / USE_OLLAMA / USE_LOCAL_FALLBACK "
|
| 48 |
-
f"must be True. Got {_active}
|
| 49 |
)
|
| 50 |
|
| 51 |
-
# ══════════════════════════════════════════════════════════════════════════════
|
| 52 |
-
# AI BACKEND
|
| 53 |
-
# ══════════════════════════════════════════════════════════════════════════════
|
| 54 |
ai = AIBackend(
|
| 55 |
use_gemini=USE_GEMINI,
|
| 56 |
use_ollama=USE_OLLAMA,
|
| 57 |
use_fallback=USE_LOCAL_FALLBACK,
|
| 58 |
)
|
| 59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
@asynccontextmanager
|
| 62 |
async def lifespan(app: FastAPI):
|
| 63 |
await ai.async_setup()
|
| 64 |
-
print("[APP] AI backend ready
|
| 65 |
yield
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
|
| 72 |
app = FastAPI(lifespan=lifespan)
|
|
@@ -84,7 +114,73 @@ async def root():
|
|
| 84 |
return HTMLResponse("<h2>index.html not found</h2>", status_code=404)
|
| 85 |
|
| 86 |
|
| 87 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
|
| 89 |
def _ws_open(ws: WebSocket) -> bool:
|
| 90 |
return ws.client_state == WebSocketState.CONNECTED
|
|
@@ -110,12 +206,14 @@ async def _safe_bytes(ws: WebSocket, data: bytes) -> bool:
|
|
| 110 |
return False
|
| 111 |
|
| 112 |
|
| 113 |
-
#
|
|
|
|
|
|
|
| 114 |
|
| 115 |
@app.websocket("/ws/chat")
|
| 116 |
async def ws_chat(ws: WebSocket):
|
| 117 |
await ws.accept()
|
| 118 |
-
print("[CHAT]
|
| 119 |
try:
|
| 120 |
while True:
|
| 121 |
raw = await ws.receive_text()
|
|
@@ -127,21 +225,18 @@ async def ws_chat(ws: WebSocket):
|
|
| 127 |
|
| 128 |
user_id = data.get("user_id", "default_user")
|
| 129 |
user_query = data.get("user_query", "").strip()
|
| 130 |
-
|
| 131 |
-
print(f"[CHAT] user_id={user_id!r} query={user_query!r}")
|
| 132 |
-
|
| 133 |
if not user_query:
|
| 134 |
continue
|
| 135 |
|
|
|
|
|
|
|
| 136 |
try:
|
| 137 |
stream = await ai.main(user_id, user_query)
|
| 138 |
async for token in stream:
|
| 139 |
-
if
|
| 140 |
-
|
| 141 |
-
await _safe_text(ws, {"type": "llm_token", "token": token})
|
| 142 |
except Exception as exc:
|
| 143 |
import traceback; traceback.print_exc()
|
| 144 |
-
print(f"[CHAT] AI error: {exc}")
|
| 145 |
await _safe_text(ws, {"type": "error", "text": str(exc)})
|
| 146 |
|
| 147 |
await _safe_text(ws, {"type": "end"})
|
|
@@ -150,10 +245,12 @@ async def ws_chat(ws: WebSocket):
|
|
| 150 |
print("[CHAT] Client disconnected")
|
| 151 |
except Exception as exc:
|
| 152 |
if "disconnect" not in str(exc).lower():
|
| 153 |
-
print(f"[CHAT]
|
| 154 |
|
| 155 |
|
| 156 |
-
#
|
|
|
|
|
|
|
| 157 |
|
| 158 |
@app.websocket("/ws/voice")
|
| 159 |
async def ws_voice(ws: WebSocket):
|
|
@@ -162,7 +259,7 @@ async def ws_voice(ws: WebSocket):
|
|
| 162 |
user_id = f"voice_{uuid.uuid4().hex[:12]}"
|
| 163 |
print(f"[VOICE] Client connected — user_id={user_id}")
|
| 164 |
|
| 165 |
-
stt
|
| 166 |
_active_streamer: ParallelTTSStreamer | None = None
|
| 167 |
_active_task: asyncio.Task | None = None
|
| 168 |
|
|
@@ -182,12 +279,10 @@ async def ws_voice(ws: WebSocket):
|
|
| 182 |
async def _handle_utterance(audio_bytes: bytes):
|
| 183 |
nonlocal _active_streamer
|
| 184 |
|
|
|
|
| 185 |
transcript = await stt.transcribe(audio_bytes)
|
| 186 |
if not transcript:
|
| 187 |
-
await _safe_text(ws, {
|
| 188 |
-
"type": "error",
|
| 189 |
-
"text": "কথা বুঝতে পারিনি, আবার বলুন।"
|
| 190 |
-
})
|
| 191 |
await _safe_text(ws, {"type": "end"})
|
| 192 |
return
|
| 193 |
|
|
@@ -195,10 +290,11 @@ async def ws_voice(ws: WebSocket):
|
|
| 195 |
if not await _safe_text(ws, {"type": "stt", "text": transcript}):
|
| 196 |
return
|
| 197 |
|
|
|
|
| 198 |
tts_streamer = ParallelTTSStreamer()
|
| 199 |
_active_streamer = tts_streamer
|
| 200 |
|
| 201 |
-
async def
|
| 202 |
try:
|
| 203 |
stream = await ai.main(user_id, transcript)
|
| 204 |
async for token in stream:
|
|
@@ -210,7 +306,7 @@ async def ws_voice(ws: WebSocket):
|
|
| 210 |
except asyncio.CancelledError:
|
| 211 |
raise
|
| 212 |
except Exception as exc:
|
| 213 |
-
print(f"[VOICE]
|
| 214 |
finally:
|
| 215 |
await tts_streamer.flush()
|
| 216 |
|
|
@@ -219,7 +315,8 @@ async def ws_voice(ws: WebSocket):
|
|
| 219 |
if not await _safe_bytes(ws, chunk):
|
| 220 |
break
|
| 221 |
|
| 222 |
-
|
|
|
|
| 223 |
_active_streamer = None
|
| 224 |
await _safe_text(ws, {"type": "end"})
|
| 225 |
|
|
@@ -231,53 +328,38 @@ async def ws_voice(ws: WebSocket):
|
|
| 231 |
try:
|
| 232 |
data = await ws.receive()
|
| 233 |
except WebSocketDisconnect:
|
| 234 |
-
print("[VOICE] Client disconnected.")
|
| 235 |
break
|
| 236 |
except Exception as exc:
|
| 237 |
if "disconnect" in str(exc).lower():
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
print(f"[VOICE] Receive error: {exc}")
|
| 241 |
break
|
| 242 |
|
| 243 |
-
# ── Audio utterance ────────────────────────────────────────────────
|
| 244 |
if "bytes" in data and data["bytes"]:
|
| 245 |
audio_bytes = data["bytes"]
|
| 246 |
print(f"[VOICE] [{user_id}] Utterance: {len(audio_bytes):,} bytes")
|
| 247 |
-
|
| 248 |
-
# Barge-in: cancel immediately before starting new turn
|
| 249 |
await _cancel_active()
|
|
|
|
| 250 |
|
| 251 |
-
_active_task = asyncio.create_task(
|
| 252 |
-
_handle_utterance(audio_bytes)
|
| 253 |
-
)
|
| 254 |
-
|
| 255 |
-
# ── Control messages ───────────────────────────────────────────────
|
| 256 |
elif "text" in data and data["text"]:
|
| 257 |
try:
|
| 258 |
msg = json.loads(data["text"])
|
| 259 |
-
|
| 260 |
if msg.get("type") == "init" and msg.get("user_id"):
|
| 261 |
user_id = str(msg["user_id"])[:64]
|
| 262 |
-
print(f"[VOICE] user_id updated: {user_id}")
|
| 263 |
await _safe_text(ws, {"type": "init_ack", "user_id": user_id})
|
| 264 |
-
|
| 265 |
elif msg.get("type") == "ping":
|
| 266 |
await _safe_text(ws, {"type": "pong"})
|
| 267 |
-
|
| 268 |
elif msg.get("type") == "cancel":
|
| 269 |
-
print("[VOICE] Client cancel signal.")
|
| 270 |
await _cancel_active()
|
| 271 |
await _safe_text(ws, {"type": "end"})
|
| 272 |
-
|
| 273 |
except json.JSONDecodeError:
|
| 274 |
pass
|
| 275 |
|
| 276 |
except WebSocketDisconnect:
|
| 277 |
-
|
| 278 |
except Exception as exc:
|
| 279 |
if "disconnect" not in str(exc).lower():
|
| 280 |
-
print(f"[VOICE]
|
| 281 |
finally:
|
| 282 |
await _cancel_active()
|
| 283 |
print(f"[VOICE] [{user_id}] Handler exiting cleanly.")
|
|
|
|
| 1 |
"""
|
| 2 |
+
app.py — FastAPI entrypoint: WebRTC-first + WebSocket fallback
|
| 3 |
+
|
| 4 |
+
Pipeline overview:
|
| 5 |
+
──────────────────
|
| 6 |
+
Browser Server
|
| 7 |
+
──────────────────────────────────────────────────────
|
| 8 |
+
getUserMedia() → WebRTC aiortc peer connection
|
| 9 |
+
↓ PCM audio frames ────► VAD segmenter
|
| 10 |
+
↓ utterances
|
| 11 |
+
STT GPU-batch queue
|
| 12 |
+
↓ transcripts (parallel)
|
| 13 |
+
LLM async stream ──┐
|
| 14 |
+
↓ tokens │ concurrent
|
| 15 |
+
TTS streamer ◄──────┘
|
| 16 |
+
↓ audio chunks
|
| 17 |
+
◄────────────────────────── RTCDataChannel
|
| 18 |
+
|
| 19 |
+
WebSocket mode (fallback):
|
| 20 |
+
Still available at /ws/voice and /ws/chat for environments
|
| 21 |
+
where WebRTC is blocked (corporate proxies, etc.).
|
| 22 |
+
Uses the same STT batch queue and parallel TTS streamer.
|
| 23 |
+
|
| 24 |
+
Performance targets:
|
| 25 |
+
STT: < 200ms (GPU-batched, ffmpeg parallel)
|
| 26 |
+
First LLM tok: < 100ms (streaming, no full-sentence wait)
|
| 27 |
+
TTS start: < 150ms (sentence-level streaming, parallel synthesis)
|
| 28 |
+
Total TTFA*: < 450ms (*Time-To-First-Audio)
|
| 29 |
"""
|
| 30 |
|
| 31 |
import asyncio
|
|
|
|
| 34 |
import uuid
|
| 35 |
from contextlib import asynccontextmanager
|
| 36 |
|
| 37 |
+
from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Request
|
| 38 |
+
from fastapi.responses import FileResponse, HTMLResponse, JSONResponse
|
| 39 |
from fastapi.staticfiles import StaticFiles
|
| 40 |
from starlette.websockets import WebSocketState
|
| 41 |
|
|
|
|
| 43 |
from services.stt import STTProcessor
|
| 44 |
from services.streaming import ParallelTTSStreamer
|
| 45 |
|
| 46 |
+
# ── WebRTC (optional — degrades gracefully if aiortc not installed) ────────────
|
| 47 |
+
try:
|
| 48 |
+
from services.webrtc_pipeline import WebRTCSession
|
| 49 |
+
WEBRTC_AVAILABLE = True
|
| 50 |
+
print("[APP] WebRTC pipeline available ✓")
|
| 51 |
+
except (ImportError, RuntimeError) as _e:
|
| 52 |
+
WEBRTC_AVAILABLE = False
|
| 53 |
+
print(f"[APP] WebRTC pipeline unavailable ({_e}). WebSocket fallback only.")
|
| 54 |
+
|
| 55 |
# ══════════════════════════════════════════════════════════════════════════════
|
| 56 |
+
# MODEL ROUTING CONFIG — set exactly ONE to True
|
| 57 |
# ══════════════════════════════════════════════════════════════════════════════
|
| 58 |
USE_GEMINI = True
|
| 59 |
USE_OLLAMA = False
|
|
|
|
| 63 |
if _active != 1:
|
| 64 |
raise RuntimeError(
|
| 65 |
f"[CONFIG] Exactly one of USE_GEMINI / USE_OLLAMA / USE_LOCAL_FALLBACK "
|
| 66 |
+
f"must be True. Got {_active}."
|
| 67 |
)
|
| 68 |
|
|
|
|
|
|
|
|
|
|
| 69 |
ai = AIBackend(
|
| 70 |
use_gemini=USE_GEMINI,
|
| 71 |
use_ollama=USE_OLLAMA,
|
| 72 |
use_fallback=USE_LOCAL_FALLBACK,
|
| 73 |
)
|
| 74 |
|
| 75 |
+
# Active WebRTC sessions — keyed by session_id
|
| 76 |
+
_rtc_sessions: dict[str, "WebRTCSession"] = {}
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
# ═══════════════════════════════════════════════════════════════��══════════════
|
| 80 |
+
# LIFESPAN
|
| 81 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
| 82 |
|
| 83 |
@asynccontextmanager
|
| 84 |
async def lifespan(app: FastAPI):
|
| 85 |
await ai.async_setup()
|
| 86 |
+
print("[APP] AI backend ready ✓")
|
| 87 |
yield
|
| 88 |
+
# Clean up WebRTC sessions
|
| 89 |
+
for session in list(_rtc_sessions.values()):
|
| 90 |
+
await session.close()
|
| 91 |
+
_rtc_sessions.clear()
|
| 92 |
+
# Clean up DB connections
|
| 93 |
+
for attr in ("conn", "_meta_conn"):
|
| 94 |
+
conn = getattr(ai, attr, None)
|
| 95 |
+
if conn:
|
| 96 |
+
try:
|
| 97 |
+
await conn.close()
|
| 98 |
+
except Exception:
|
| 99 |
+
pass
|
| 100 |
|
| 101 |
|
| 102 |
app = FastAPI(lifespan=lifespan)
|
|
|
|
| 114 |
return HTMLResponse("<h2>index.html not found</h2>", status_code=404)
|
| 115 |
|
| 116 |
|
| 117 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
| 118 |
+
# WEBRTC SIGNALING ENDPOINTS
|
| 119 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
| 120 |
+
|
| 121 |
+
@app.post("/rtc/offer")
|
| 122 |
+
async def rtc_offer(request: Request):
|
| 123 |
+
"""
|
| 124 |
+
WebRTC signaling: browser sends SDP offer, server returns SDP answer.
|
| 125 |
+
|
| 126 |
+
Request JSON:
|
| 127 |
+
{ "sdp": "...", "type": "offer", "session_id": "optional_existing_id" }
|
| 128 |
+
|
| 129 |
+
Response JSON:
|
| 130 |
+
{ "sdp": "...", "type": "answer", "session_id": "..." }
|
| 131 |
+
"""
|
| 132 |
+
if not WEBRTC_AVAILABLE:
|
| 133 |
+
return JSONResponse(
|
| 134 |
+
{"error": "WebRTC unavailable. Use WebSocket fallback at /ws/voice"},
|
| 135 |
+
status_code=503,
|
| 136 |
+
)
|
| 137 |
+
|
| 138 |
+
body = await request.json()
|
| 139 |
+
sdp = body.get("sdp", "")
|
| 140 |
+
sdp_type = body.get("type", "offer")
|
| 141 |
+
session_id = body.get("session_id") or uuid.uuid4().hex
|
| 142 |
+
|
| 143 |
+
# Reuse or create session
|
| 144 |
+
session = _rtc_sessions.get(session_id)
|
| 145 |
+
if session is None:
|
| 146 |
+
session = WebRTCSession(ai_backend=ai)
|
| 147 |
+
_rtc_sessions[session_id] = session
|
| 148 |
+
print(f"[RTC] New session: {session_id} user_id={session.user_id}")
|
| 149 |
+
|
| 150 |
+
answer = await session.handle_offer(sdp, sdp_type)
|
| 151 |
+
return JSONResponse({**answer, "session_id": session_id})
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
@app.post("/rtc/ice")
|
| 155 |
+
async def rtc_ice(request: Request):
|
| 156 |
+
"""Forward browser ICE candidate to the session."""
|
| 157 |
+
if not WEBRTC_AVAILABLE:
|
| 158 |
+
return JSONResponse({"error": "WebRTC unavailable"}, status_code=503)
|
| 159 |
+
|
| 160 |
+
body = await request.json()
|
| 161 |
+
session_id = body.get("session_id", "")
|
| 162 |
+
candidate = body.get("candidate", {})
|
| 163 |
+
|
| 164 |
+
session = _rtc_sessions.get(session_id)
|
| 165 |
+
if session is None:
|
| 166 |
+
return JSONResponse({"error": "Session not found"}, status_code=404)
|
| 167 |
+
|
| 168 |
+
await session.add_ice_candidate(candidate)
|
| 169 |
+
return JSONResponse({"ok": True})
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
@app.delete("/rtc/session/{session_id}")
|
| 173 |
+
async def rtc_close(session_id: str):
|
| 174 |
+
"""Explicitly close a WebRTC session."""
|
| 175 |
+
session = _rtc_sessions.pop(session_id, None)
|
| 176 |
+
if session:
|
| 177 |
+
await session.close()
|
| 178 |
+
return JSONResponse({"ok": True})
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
| 182 |
+
# WEBSOCKET HELPERS
|
| 183 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
| 184 |
|
| 185 |
def _ws_open(ws: WebSocket) -> bool:
|
| 186 |
return ws.client_state == WebSocketState.CONNECTED
|
|
|
|
| 206 |
return False
|
| 207 |
|
| 208 |
|
| 209 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
| 210 |
+
# WEBSOCKET — CHAT (text only, streaming tokens)
|
| 211 |
+
# ════════════════════��═════════════════════════════════════════════════════════
|
| 212 |
|
| 213 |
@app.websocket("/ws/chat")
|
| 214 |
async def ws_chat(ws: WebSocket):
|
| 215 |
await ws.accept()
|
| 216 |
+
print("[CHAT] Client connected ✓")
|
| 217 |
try:
|
| 218 |
while True:
|
| 219 |
raw = await ws.receive_text()
|
|
|
|
| 225 |
|
| 226 |
user_id = data.get("user_id", "default_user")
|
| 227 |
user_query = data.get("user_query", "").strip()
|
|
|
|
|
|
|
|
|
|
| 228 |
if not user_query:
|
| 229 |
continue
|
| 230 |
|
| 231 |
+
print(f"[CHAT] user_id={user_id!r} query={user_query!r}")
|
| 232 |
+
|
| 233 |
try:
|
| 234 |
stream = await ai.main(user_id, user_query)
|
| 235 |
async for token in stream:
|
| 236 |
+
if token:
|
| 237 |
+
await _safe_text(ws, {"type": "llm_token", "token": token})
|
|
|
|
| 238 |
except Exception as exc:
|
| 239 |
import traceback; traceback.print_exc()
|
|
|
|
| 240 |
await _safe_text(ws, {"type": "error", "text": str(exc)})
|
| 241 |
|
| 242 |
await _safe_text(ws, {"type": "end"})
|
|
|
|
| 245 |
print("[CHAT] Client disconnected")
|
| 246 |
except Exception as exc:
|
| 247 |
if "disconnect" not in str(exc).lower():
|
| 248 |
+
print(f"[CHAT] Error: {exc}")
|
| 249 |
|
| 250 |
|
| 251 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
| 252 |
+
# WEBSOCKET — VOICE (fallback: full STT→LLM→TTS pipeline over WS)
|
| 253 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
| 254 |
|
| 255 |
@app.websocket("/ws/voice")
|
| 256 |
async def ws_voice(ws: WebSocket):
|
|
|
|
| 259 |
user_id = f"voice_{uuid.uuid4().hex[:12]}"
|
| 260 |
print(f"[VOICE] Client connected — user_id={user_id}")
|
| 261 |
|
| 262 |
+
stt = STTProcessor()
|
| 263 |
_active_streamer: ParallelTTSStreamer | None = None
|
| 264 |
_active_task: asyncio.Task | None = None
|
| 265 |
|
|
|
|
| 279 |
async def _handle_utterance(audio_bytes: bytes):
|
| 280 |
nonlocal _active_streamer
|
| 281 |
|
| 282 |
+
# ── STT (GPU-batched) ──────────────────────────────────────────────────
|
| 283 |
transcript = await stt.transcribe(audio_bytes)
|
| 284 |
if not transcript:
|
| 285 |
+
await _safe_text(ws, {"type": "error", "text": "কথা বুঝতে পারিনি, আবার বলুন।"})
|
|
|
|
|
|
|
|
|
|
| 286 |
await _safe_text(ws, {"type": "end"})
|
| 287 |
return
|
| 288 |
|
|
|
|
| 290 |
if not await _safe_text(ws, {"type": "stt", "text": transcript}):
|
| 291 |
return
|
| 292 |
|
| 293 |
+
# ── LLM + TTS (concurrent) ─────────────────────────────────────────────
|
| 294 |
tts_streamer = ParallelTTSStreamer()
|
| 295 |
_active_streamer = tts_streamer
|
| 296 |
|
| 297 |
+
async def run_llm():
|
| 298 |
try:
|
| 299 |
stream = await ai.main(user_id, transcript)
|
| 300 |
async for token in stream:
|
|
|
|
| 306 |
except asyncio.CancelledError:
|
| 307 |
raise
|
| 308 |
except Exception as exc:
|
| 309 |
+
print(f"[VOICE] LLM error: {exc}")
|
| 310 |
finally:
|
| 311 |
await tts_streamer.flush()
|
| 312 |
|
|
|
|
| 315 |
if not await _safe_bytes(ws, chunk):
|
| 316 |
break
|
| 317 |
|
| 318 |
+
# LLM and TTS delivery run SIMULTANEOUSLY
|
| 319 |
+
await asyncio.gather(run_llm(), run_tts(), return_exceptions=True)
|
| 320 |
_active_streamer = None
|
| 321 |
await _safe_text(ws, {"type": "end"})
|
| 322 |
|
|
|
|
| 328 |
try:
|
| 329 |
data = await ws.receive()
|
| 330 |
except WebSocketDisconnect:
|
|
|
|
| 331 |
break
|
| 332 |
except Exception as exc:
|
| 333 |
if "disconnect" in str(exc).lower():
|
| 334 |
+
break
|
| 335 |
+
print(f"[VOICE] Receive error: {exc}")
|
|
|
|
| 336 |
break
|
| 337 |
|
|
|
|
| 338 |
if "bytes" in data and data["bytes"]:
|
| 339 |
audio_bytes = data["bytes"]
|
| 340 |
print(f"[VOICE] [{user_id}] Utterance: {len(audio_bytes):,} bytes")
|
|
|
|
|
|
|
| 341 |
await _cancel_active()
|
| 342 |
+
_active_task = asyncio.create_task(_handle_utterance(audio_bytes))
|
| 343 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 344 |
elif "text" in data and data["text"]:
|
| 345 |
try:
|
| 346 |
msg = json.loads(data["text"])
|
|
|
|
| 347 |
if msg.get("type") == "init" and msg.get("user_id"):
|
| 348 |
user_id = str(msg["user_id"])[:64]
|
|
|
|
| 349 |
await _safe_text(ws, {"type": "init_ack", "user_id": user_id})
|
|
|
|
| 350 |
elif msg.get("type") == "ping":
|
| 351 |
await _safe_text(ws, {"type": "pong"})
|
|
|
|
| 352 |
elif msg.get("type") == "cancel":
|
|
|
|
| 353 |
await _cancel_active()
|
| 354 |
await _safe_text(ws, {"type": "end"})
|
|
|
|
| 355 |
except json.JSONDecodeError:
|
| 356 |
pass
|
| 357 |
|
| 358 |
except WebSocketDisconnect:
|
| 359 |
+
pass
|
| 360 |
except Exception as exc:
|
| 361 |
if "disconnect" not in str(exc).lower():
|
| 362 |
+
print(f"[VOICE] Error: {exc}")
|
| 363 |
finally:
|
| 364 |
await _cancel_active()
|
| 365 |
print(f"[VOICE] [{user_id}] Handler exiting cleanly.")
|
services/streaming.py
CHANGED
|
@@ -1,39 +1,6 @@
|
|
| 1 |
"""
|
| 2 |
services/streaming.py — Production-grade parallel TTS streamer
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 6 |
-
ROUTING CONFIG — mirrors tts.py; must stay in sync
|
| 7 |
-
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 8 |
-
USE_ELEVENLABS = True → ElevenLabs streaming TTS
|
| 9 |
-
USE_ELEVENLABS = False → Edge-TTS (free, no API key needed)
|
| 10 |
-
|
| 11 |
-
Note: This flag is read from tts.py at import time so you only need to
|
| 12 |
-
change it in ONE place (tts.py). streaming.py re-exports it for clarity.
|
| 13 |
-
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 14 |
-
|
| 15 |
-
Changelog (vs previous streaming.py):
|
| 16 |
-
──────────────────────────────────────
|
| 17 |
-
1. DUAL BACKEND ROUTING — _synthesise() dispatches to either
|
| 18 |
-
_edge_tts_stream() or _elevenlabs_stream() via the shared
|
| 19 |
-
text_to_speech_stream() unified API in tts.py.
|
| 20 |
-
|
| 21 |
-
2. VOICE OVERRIDE PER INSTANCE — ParallelTTSStreamer.__init__ accepts
|
| 22 |
-
an optional `voice` param. For Edge-TTS pass a voice name string;
|
| 23 |
-
for ElevenLabs pass a voice ID. None uses the tts.py defaults.
|
| 24 |
-
|
| 25 |
-
3. ELEVENLABS LATENCY TUNING — When ElevenLabs is active, flush
|
| 26 |
-
thresholds are slightly tighter (FIRST_FLUSH_BOUNDARY_MIN = 8 chars,
|
| 27 |
-
FIRST_FLUSH_HARD = 35 chars) because ElevenLabs has higher per-request
|
| 28 |
-
latency than Edge-TTS and benefits from being called with slightly
|
| 29 |
-
larger chunks rather than many tiny requests.
|
| 30 |
-
|
| 31 |
-
4. ALL PREVIOUS FIXES RETAINED:
|
| 32 |
-
• FIRST_FLUSH_BOUNDARY_MIN 15→10 (Edge-TTS) / 10→8 (ElevenLabs)
|
| 33 |
-
• '॥' (double danda) in SENTENCE_BOUNDARIES
|
| 34 |
-
• cancel() sets _cancelled BEFORE task.cancel() (race fix)
|
| 35 |
-
• asyncio.Event-based slot wake (no spin polling)
|
| 36 |
-
• MIN_CHARS = 3 (was 4)
|
| 37 |
"""
|
| 38 |
|
| 39 |
from __future__ import annotations
|
|
@@ -43,56 +10,39 @@ import re
|
|
| 43 |
from dataclasses import dataclass, field
|
| 44 |
from typing import AsyncGenerator
|
| 45 |
|
| 46 |
-
# Import the unified TTS API and the routing flag from tts.py
|
| 47 |
from services.tts import text_to_speech_stream, USE_ELEVENLABS, EDGE_VOICE
|
| 48 |
|
| 49 |
-
# ── Flush thresholds ───────────────────────────────────────────────────────────
|
| 50 |
-
# ElevenLabs has higher per-request overhead so we use slightly larger chunks
|
| 51 |
-
# to avoid many tiny API calls, while still starting audio quickly.
|
| 52 |
if USE_ELEVENLABS:
|
| 53 |
-
FIRST_FLUSH_BOUNDARY_MIN =
|
| 54 |
-
FIRST_FLUSH_HARD =
|
| 55 |
-
SUBSEQUENT_FLUSH_BOUNDARY_MIN =
|
| 56 |
-
SUBSEQUENT_FLUSH_HARD =
|
| 57 |
_backend_label = "ElevenLabs"
|
| 58 |
else:
|
| 59 |
-
FIRST_FLUSH_BOUNDARY_MIN =
|
| 60 |
-
FIRST_FLUSH_HARD =
|
| 61 |
-
SUBSEQUENT_FLUSH_BOUNDARY_MIN =
|
| 62 |
-
SUBSEQUENT_FLUSH_HARD =
|
| 63 |
_backend_label = "Edge-TTS"
|
| 64 |
|
| 65 |
print(f"[Streamer] TTS backend: {_backend_label}")
|
| 66 |
|
| 67 |
-
MIN_CHARS =
|
| 68 |
-
|
| 69 |
SENTENCE_BOUNDARIES = frozenset(".!?।॥\n")
|
| 70 |
CLAUSE_BOUNDARIES = frozenset(",;:—–")
|
| 71 |
-
|
| 72 |
_SENTINEL = object()
|
| 73 |
|
| 74 |
|
| 75 |
-
# ══════════════════════════════════════════════════════════════════════════
|
| 76 |
-
# TEXT CLEANING
|
| 77 |
-
# ══════════════════════════════════════════════════════════════════════════
|
| 78 |
-
|
| 79 |
def _clean_for_tts(text: str) -> str:
|
| 80 |
-
|
| 81 |
-
text = re.sub(r"\*
|
| 82 |
-
text = re.sub(r"
|
| 83 |
-
text = re.sub(r"^\s*[-
|
| 84 |
-
text = re.sub(r"
|
| 85 |
-
text = re.sub(r"
|
| 86 |
-
text = re.sub(r"\n{2,}", "\n", text)
|
| 87 |
return text.strip()
|
| 88 |
|
| 89 |
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
# ══════════════════════════════════════════════════════════════════════════
|
| 93 |
-
# FLUSH LOGIC
|
| 94 |
-
# ══════════════════════════════════════════════════════════════════════════
|
| 95 |
-
|
| 96 |
def _should_flush(buffer: str, first_chunk: bool) -> bool:
|
| 97 |
n = len(buffer)
|
| 98 |
if n == 0:
|
|
@@ -109,48 +59,18 @@ def _should_flush(buffer: str, first_chunk: bool) -> bool:
|
|
| 109 |
return False
|
| 110 |
|
| 111 |
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
# ══════════════════════════════════════════════════════════════════════════
|
| 115 |
-
# AUDIO SLOT
|
| 116 |
-
# ══════════════════════════════════════════════════════════════════════════
|
| 117 |
-
|
| 118 |
@dataclass
|
| 119 |
class _AudioSlot:
|
| 120 |
index: int
|
| 121 |
queue: asyncio.Queue = field(default_factory=lambda: asyncio.Queue())
|
| 122 |
done: bool = False
|
| 123 |
|
| 124 |
-
def mark_done(self)
|
| 125 |
-
|
| 126 |
-
self.queue.put_nowait(_SENTINEL)
|
| 127 |
-
|
| 128 |
-
def mark_error(self) -> None:
|
| 129 |
-
self.done = True
|
| 130 |
-
self.queue.put_nowait(_SENTINEL)
|
| 131 |
|
| 132 |
|
| 133 |
-
# ══════════════════════════════════════════════════════════════════════════
|
| 134 |
-
# PARALLEL TTS STREAMER
|
| 135 |
-
# ══════════════════════════════════════════════════════════════════════════
|
| 136 |
-
|
| 137 |
class ParallelTTSStreamer:
|
| 138 |
-
"""
|
| 139 |
-
LLM tokens → sentence chunks → parallel TTS (Edge-TTS or ElevenLabs)
|
| 140 |
-
→ ordered audio delivery over WebSocket.
|
| 141 |
-
|
| 142 |
-
Usage:
|
| 143 |
-
streamer = ParallelTTSStreamer() # uses tts.py defaults
|
| 144 |
-
streamer = ParallelTTSStreamer(voice=...) # override voice/voice-ID
|
| 145 |
-
|
| 146 |
-
The `voice` parameter meaning depends on USE_ELEVENLABS:
|
| 147 |
-
• Edge-TTS → pass an Edge-TTS voice name string
|
| 148 |
-
• ElevenLabs → pass an ElevenLabs voice ID string
|
| 149 |
-
If None, the tts.py module defaults are used.
|
| 150 |
-
"""
|
| 151 |
-
|
| 152 |
def __init__(self, voice: str | None = None) -> None:
|
| 153 |
-
# None signals tts.py to use its own defaults
|
| 154 |
self.voice = voice
|
| 155 |
self.buffer = ""
|
| 156 |
self._cancelled = False
|
|
@@ -160,9 +80,7 @@ class ParallelTTSStreamer:
|
|
| 160 |
self._slots_lock = asyncio.Lock()
|
| 161 |
self._tasks: list[asyncio.Task] = []
|
| 162 |
self._llm_done = asyncio.Event()
|
| 163 |
-
self._slot_added = asyncio.Event()
|
| 164 |
-
|
| 165 |
-
# ── Token ingestion ────────────────────────────────────────────────────────
|
| 166 |
|
| 167 |
async def add_token(self, token: str) -> None:
|
| 168 |
if not token or self._cancelled:
|
|
@@ -172,45 +90,25 @@ class ParallelTTSStreamer:
|
|
| 172 |
self._first_chunk = False
|
| 173 |
await self._schedule_chunk()
|
| 174 |
|
| 175 |
-
# ── Chunk scheduling ───────────────────────────────────────────────────────
|
| 176 |
-
|
| 177 |
async def _schedule_chunk(self) -> None:
|
| 178 |
if self._cancelled:
|
| 179 |
-
self.buffer = ""
|
| 180 |
-
return
|
| 181 |
-
|
| 182 |
text = _clean_for_tts(self.buffer.strip())
|
| 183 |
self.buffer = ""
|
| 184 |
if len(text) < MIN_CHARS:
|
| 185 |
return
|
| 186 |
-
|
| 187 |
async with self._slots_lock:
|
| 188 |
slot = _AudioSlot(index=self._slot_index)
|
| 189 |
self._slot_index += 1
|
| 190 |
self._slots.append(slot)
|
| 191 |
-
self._slot_added.set()
|
| 192 |
-
|
| 193 |
task = asyncio.create_task(self._synthesise(text, slot))
|
| 194 |
self._tasks.append(task)
|
| 195 |
-
task.add_done_callback(
|
| 196 |
-
lambda t: self._tasks.remove(t) if t in self._tasks else None
|
| 197 |
-
)
|
| 198 |
-
|
| 199 |
-
# ── TTS synthesis — routes to active backend ──────────────���────────────────
|
| 200 |
|
| 201 |
async def _synthesise(self, text: str, slot: _AudioSlot) -> None:
|
| 202 |
-
"""
|
| 203 |
-
Calls the unified text_to_speech_stream() from tts.py which internally
|
| 204 |
-
dispatches to Edge-TTS or ElevenLabs based on USE_ELEVENLABS.
|
| 205 |
-
|
| 206 |
-
The optional self.voice parameter is forwarded as-is:
|
| 207 |
-
• Edge-TTS → voice name string (e.g. "bn-BD-PradeepNeural")
|
| 208 |
-
• ElevenLabs → voice ID string (e.g. "pNInz6obpgDQGcFmaJgB")
|
| 209 |
-
"""
|
| 210 |
if self._cancelled:
|
| 211 |
-
slot.mark_error()
|
| 212 |
-
return
|
| 213 |
-
|
| 214 |
try:
|
| 215 |
async for chunk in text_to_speech_stream(text, voice=self.voice):
|
| 216 |
if self._cancelled:
|
|
@@ -223,91 +121,47 @@ class ParallelTTSStreamer:
|
|
| 223 |
finally:
|
| 224 |
slot.mark_done()
|
| 225 |
|
| 226 |
-
# ── Flush ──────────────────────────────────────────────────────────────────
|
| 227 |
-
|
| 228 |
async def flush(self) -> None:
|
| 229 |
-
"""Call after the LLM stream ends to synthesise any buffered remainder."""
|
| 230 |
if self.buffer.strip():
|
| 231 |
await self._schedule_chunk()
|
| 232 |
self._llm_done.set()
|
| 233 |
|
| 234 |
-
# ── Cancel ────────────────────────────────────────────────────────────────
|
| 235 |
-
|
| 236 |
async def cancel(self) -> None:
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
Race fix: _cancelled is set to True BEFORE cancelling tasks so that
|
| 241 |
-
any still-running task that checks the flag won't enqueue more chunks.
|
| 242 |
-
"""
|
| 243 |
-
self._cancelled = True # set first — closes the race window
|
| 244 |
-
|
| 245 |
-
tasks = list(self._tasks)
|
| 246 |
-
self._tasks.clear()
|
| 247 |
-
for t in tasks:
|
| 248 |
-
t.cancel()
|
| 249 |
if tasks:
|
| 250 |
await asyncio.gather(*tasks, return_exceptions=True)
|
| 251 |
-
|
| 252 |
async with self._slots_lock:
|
| 253 |
for slot in self._slots:
|
| 254 |
-
if not slot.done:
|
| 255 |
-
slot.mark_error()
|
| 256 |
-
|
| 257 |
self._llm_done.set()
|
| 258 |
-
self._slot_added.set()
|
| 259 |
-
|
| 260 |
-
# ── Audio delivery ─────────────────────────────────────────────────────────
|
| 261 |
|
| 262 |
async def stream_audio(self) -> AsyncGenerator[bytes, None]:
|
| 263 |
-
"""
|
| 264 |
-
Async generator — yields audio bytes in the exact order the TTS chunks
|
| 265 |
-
were scheduled (preserves sentence order even with parallel synthesis).
|
| 266 |
-
"""
|
| 267 |
delivered = 0
|
| 268 |
-
|
| 269 |
while True:
|
| 270 |
async with self._slots_lock:
|
| 271 |
slot = self._slots[delivered] if delivered < len(self._slots) else None
|
| 272 |
-
|
| 273 |
if slot is None:
|
| 274 |
if self._llm_done.is_set():
|
| 275 |
async with self._slots_lock:
|
| 276 |
total = len(self._slots)
|
| 277 |
if delivered >= total:
|
| 278 |
break
|
| 279 |
-
|
| 280 |
-
# Wait on event (no spin polling)
|
| 281 |
self._slot_added.clear()
|
| 282 |
try:
|
| 283 |
-
await asyncio.wait_for(
|
| 284 |
-
self._slot_added.wait(),
|
| 285 |
-
timeout=10.0 # ElevenLabs can be slower; 10 s guard
|
| 286 |
-
)
|
| 287 |
except asyncio.TimeoutError:
|
| 288 |
-
print("[Streamer]
|
| 289 |
-
break
|
| 290 |
continue
|
| 291 |
-
|
| 292 |
-
# Drain this slot's audio queue in order
|
| 293 |
while True:
|
| 294 |
item = await slot.queue.get()
|
| 295 |
-
if item is _SENTINEL:
|
| 296 |
-
|
| 297 |
-
if not self._cancelled:
|
| 298 |
-
yield item
|
| 299 |
-
|
| 300 |
delivered += 1
|
| 301 |
|
| 302 |
-
# ── Reset ──────────────────────────────────────────────────────────────────
|
| 303 |
-
|
| 304 |
def reset(self) -> None:
|
| 305 |
-
|
| 306 |
-
self.
|
| 307 |
-
self.
|
| 308 |
-
self.buffer = ""
|
| 309 |
-
self._slot_index = 0
|
| 310 |
-
self._slots.clear()
|
| 311 |
-
self._tasks.clear()
|
| 312 |
-
self._llm_done.clear()
|
| 313 |
-
self._slot_added.clear()
|
|
|
|
| 1 |
"""
|
| 2 |
services/streaming.py — Production-grade parallel TTS streamer
|
| 3 |
+
(unchanged from original — architecture is correct)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
"""
|
| 5 |
|
| 6 |
from __future__ import annotations
|
|
|
|
| 10 |
from dataclasses import dataclass, field
|
| 11 |
from typing import AsyncGenerator
|
| 12 |
|
|
|
|
| 13 |
from services.tts import text_to_speech_stream, USE_ELEVENLABS, EDGE_VOICE
|
| 14 |
|
|
|
|
|
|
|
|
|
|
| 15 |
if USE_ELEVENLABS:
|
| 16 |
+
FIRST_FLUSH_BOUNDARY_MIN = 5
|
| 17 |
+
FIRST_FLUSH_HARD = 25
|
| 18 |
+
SUBSEQUENT_FLUSH_BOUNDARY_MIN = 22
|
| 19 |
+
SUBSEQUENT_FLUSH_HARD = 65
|
| 20 |
_backend_label = "ElevenLabs"
|
| 21 |
else:
|
| 22 |
+
FIRST_FLUSH_BOUNDARY_MIN = 5
|
| 23 |
+
FIRST_FLUSH_HARD = 25
|
| 24 |
+
SUBSEQUENT_FLUSH_BOUNDARY_MIN = 18
|
| 25 |
+
SUBSEQUENT_FLUSH_HARD = 65
|
| 26 |
_backend_label = "Edge-TTS"
|
| 27 |
|
| 28 |
print(f"[Streamer] TTS backend: {_backend_label}")
|
| 29 |
|
| 30 |
+
MIN_CHARS = 2
|
|
|
|
| 31 |
SENTENCE_BOUNDARIES = frozenset(".!?।॥\n")
|
| 32 |
CLAUSE_BOUNDARIES = frozenset(",;:—–")
|
|
|
|
| 33 |
_SENTINEL = object()
|
| 34 |
|
| 35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
def _clean_for_tts(text: str) -> str:
|
| 37 |
+
text = re.sub(r"\*{1,3}", "", text)
|
| 38 |
+
text = re.sub(r"#+\s*", "", text)
|
| 39 |
+
text = re.sub(r"^\s*[-•]\s*", "", text, flags=re.MULTILINE)
|
| 40 |
+
text = re.sub(r"^\s*[\d০-৯]+[.)]\s*", "", text, flags=re.MULTILINE)
|
| 41 |
+
text = re.sub(r"`+", "", text)
|
| 42 |
+
text = re.sub(r"\n{2,}", "\n", text)
|
|
|
|
| 43 |
return text.strip()
|
| 44 |
|
| 45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
def _should_flush(buffer: str, first_chunk: bool) -> bool:
|
| 47 |
n = len(buffer)
|
| 48 |
if n == 0:
|
|
|
|
| 59 |
return False
|
| 60 |
|
| 61 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
@dataclass
|
| 63 |
class _AudioSlot:
|
| 64 |
index: int
|
| 65 |
queue: asyncio.Queue = field(default_factory=lambda: asyncio.Queue())
|
| 66 |
done: bool = False
|
| 67 |
|
| 68 |
+
def mark_done(self) -> None: self.done = True; self.queue.put_nowait(_SENTINEL)
|
| 69 |
+
def mark_error(self) -> None: self.done = True; self.queue.put_nowait(_SENTINEL)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
class ParallelTTSStreamer:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
def __init__(self, voice: str | None = None) -> None:
|
|
|
|
| 74 |
self.voice = voice
|
| 75 |
self.buffer = ""
|
| 76 |
self._cancelled = False
|
|
|
|
| 80 |
self._slots_lock = asyncio.Lock()
|
| 81 |
self._tasks: list[asyncio.Task] = []
|
| 82 |
self._llm_done = asyncio.Event()
|
| 83 |
+
self._slot_added = asyncio.Event()
|
|
|
|
|
|
|
| 84 |
|
| 85 |
async def add_token(self, token: str) -> None:
|
| 86 |
if not token or self._cancelled:
|
|
|
|
| 90 |
self._first_chunk = False
|
| 91 |
await self._schedule_chunk()
|
| 92 |
|
|
|
|
|
|
|
| 93 |
async def _schedule_chunk(self) -> None:
|
| 94 |
if self._cancelled:
|
| 95 |
+
self.buffer = ""; return
|
|
|
|
|
|
|
| 96 |
text = _clean_for_tts(self.buffer.strip())
|
| 97 |
self.buffer = ""
|
| 98 |
if len(text) < MIN_CHARS:
|
| 99 |
return
|
|
|
|
| 100 |
async with self._slots_lock:
|
| 101 |
slot = _AudioSlot(index=self._slot_index)
|
| 102 |
self._slot_index += 1
|
| 103 |
self._slots.append(slot)
|
| 104 |
+
self._slot_added.set()
|
|
|
|
| 105 |
task = asyncio.create_task(self._synthesise(text, slot))
|
| 106 |
self._tasks.append(task)
|
| 107 |
+
task.add_done_callback(lambda t: self._tasks.remove(t) if t in self._tasks else None)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
|
| 109 |
async def _synthesise(self, text: str, slot: _AudioSlot) -> None:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
if self._cancelled:
|
| 111 |
+
slot.mark_error(); return
|
|
|
|
|
|
|
| 112 |
try:
|
| 113 |
async for chunk in text_to_speech_stream(text, voice=self.voice):
|
| 114 |
if self._cancelled:
|
|
|
|
| 121 |
finally:
|
| 122 |
slot.mark_done()
|
| 123 |
|
|
|
|
|
|
|
| 124 |
async def flush(self) -> None:
|
|
|
|
| 125 |
if self.buffer.strip():
|
| 126 |
await self._schedule_chunk()
|
| 127 |
self._llm_done.set()
|
| 128 |
|
|
|
|
|
|
|
| 129 |
async def cancel(self) -> None:
|
| 130 |
+
self._cancelled = True
|
| 131 |
+
tasks = list(self._tasks); self._tasks.clear()
|
| 132 |
+
for t in tasks: t.cancel()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
if tasks:
|
| 134 |
await asyncio.gather(*tasks, return_exceptions=True)
|
|
|
|
| 135 |
async with self._slots_lock:
|
| 136 |
for slot in self._slots:
|
| 137 |
+
if not slot.done: slot.mark_error()
|
|
|
|
|
|
|
| 138 |
self._llm_done.set()
|
| 139 |
+
self._slot_added.set()
|
|
|
|
|
|
|
| 140 |
|
| 141 |
async def stream_audio(self) -> AsyncGenerator[bytes, None]:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
delivered = 0
|
|
|
|
| 143 |
while True:
|
| 144 |
async with self._slots_lock:
|
| 145 |
slot = self._slots[delivered] if delivered < len(self._slots) else None
|
|
|
|
| 146 |
if slot is None:
|
| 147 |
if self._llm_done.is_set():
|
| 148 |
async with self._slots_lock:
|
| 149 |
total = len(self._slots)
|
| 150 |
if delivered >= total:
|
| 151 |
break
|
|
|
|
|
|
|
| 152 |
self._slot_added.clear()
|
| 153 |
try:
|
| 154 |
+
await asyncio.wait_for(self._slot_added.wait(), timeout=10.0)
|
|
|
|
|
|
|
|
|
|
| 155 |
except asyncio.TimeoutError:
|
| 156 |
+
print("[Streamer] Timeout waiting for TTS slot."); break
|
|
|
|
| 157 |
continue
|
|
|
|
|
|
|
| 158 |
while True:
|
| 159 |
item = await slot.queue.get()
|
| 160 |
+
if item is _SENTINEL: break
|
| 161 |
+
if not self._cancelled: yield item
|
|
|
|
|
|
|
|
|
|
| 162 |
delivered += 1
|
| 163 |
|
|
|
|
|
|
|
| 164 |
def reset(self) -> None:
|
| 165 |
+
self._cancelled = False; self._first_chunk = True; self.buffer = ""
|
| 166 |
+
self._slot_index = 0; self._slots.clear(); self._tasks.clear()
|
| 167 |
+
self._llm_done.clear(); self._slot_added.clear()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
services/stt.py
CHANGED
|
@@ -1,38 +1,22 @@
|
|
| 1 |
"""
|
| 2 |
-
services/stt.py —
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
─────────────
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
3. BETTER FFMPEG PIPELINE — Replaced `loudnorm` (EBU R128, designed for
|
| 21 |
-
broadcast audio) with a lightweight chain:
|
| 22 |
-
highpass f=80 → afftdn nf=-25 → aresample=resampler=swr
|
| 23 |
-
This removes low-frequency rumble, light background noise, and resamples
|
| 24 |
-
cleanly to 16 kHz without the over-compression artefacts loudnorm
|
| 25 |
-
introduces on short (1–5 s) speech clips.
|
| 26 |
-
|
| 27 |
-
4. AUDIO SIZE CAP — Added MAX_INPUT_BYTES (5 MB). Prevents runaway memory
|
| 28 |
-
usage if a browser bug sends a huge blob.
|
| 29 |
-
|
| 30 |
-
5. MODEL SELECTION VIA ENV — STT_MODEL env var allows switching to
|
| 31 |
-
large-v3-turbo (4× faster, similar Bangla accuracy) without code changes.
|
| 32 |
-
Defaults to large-v3 for maximum quality.
|
| 33 |
-
|
| 34 |
-
6. All other logic (background preload, singleton, semaphore, hallucination
|
| 35 |
-
guard, script validation) is preserved unchanged.
|
| 36 |
"""
|
| 37 |
|
| 38 |
from __future__ import annotations
|
|
@@ -41,354 +25,286 @@ import asyncio
|
|
| 41 |
import io
|
| 42 |
import os
|
| 43 |
import re
|
|
|
|
| 44 |
import subprocess
|
| 45 |
import tempfile
|
| 46 |
import threading
|
|
|
|
| 47 |
from concurrent.futures import ThreadPoolExecutor
|
|
|
|
|
|
|
| 48 |
|
| 49 |
from faster_whisper import WhisperModel
|
| 50 |
|
| 51 |
-
# ── Bangla
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
r"[\u0600-\u06FF"
|
| 55 |
-
r"\u0750-\u077F"
|
| 56 |
-
r"\uFB50-\uFDFF"
|
| 57 |
-
r"\uFE70-\uFEFF]"
|
| 58 |
)
|
| 59 |
|
| 60 |
-
#
|
| 61 |
-
# A short natural Bangla sentence primes the Whisper decoder to prefer the
|
| 62 |
-
# Bengali Unicode block. Keep it short (< 20 words) so it doesn't dominate
|
| 63 |
-
# the context window for short utterances.
|
| 64 |
_BANGLA_SEED = "আমি আপনার সাথে বাংলায় কথা বলছি।"
|
| 65 |
|
| 66 |
-
# ──
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
|
|
|
|
|
|
| 70 |
|
| 71 |
-
# ── Singleton state ──────────────────────────────────────────────────────
|
| 72 |
-
_model: WhisperModel
|
| 73 |
-
_model_lock
|
| 74 |
-
_model_ready
|
| 75 |
-
_gpu_semaphore: asyncio.Semaphore | None = None
|
| 76 |
|
| 77 |
-
|
|
|
|
|
|
|
| 78 |
|
| 79 |
|
| 80 |
-
# ── Model loader ──────────────────────────────────────────
|
| 81 |
def _load_and_warm() -> None:
|
| 82 |
global _model
|
| 83 |
try:
|
| 84 |
-
print(f"[STT] Loading Faster-Whisper {_STT_MODEL} on CUDA ({_COMPUTE_TYPE})
|
| 85 |
m = WhisperModel(
|
| 86 |
_STT_MODEL,
|
| 87 |
device="cuda",
|
| 88 |
compute_type=_COMPUTE_TYPE,
|
| 89 |
num_workers=1,
|
| 90 |
)
|
| 91 |
-
|
| 92 |
-
silence = _make_silence_wav(
|
| 93 |
list(m.transcribe(silence, language="bn", beam_size=1)[0])
|
| 94 |
-
print("[STT] GPU warmup complete. STT ready
|
| 95 |
with _model_lock:
|
| 96 |
_model = m
|
| 97 |
except Exception as exc:
|
| 98 |
-
print(f"[STT] Model load
|
| 99 |
finally:
|
| 100 |
_model_ready.set()
|
| 101 |
|
| 102 |
|
| 103 |
-
def _make_silence_wav(duration_s: float = 0.5,
|
| 104 |
-
import struct, wave
|
| 105 |
buf = io.BytesIO()
|
| 106 |
-
|
| 107 |
with wave.open(buf, "wb") as wf:
|
| 108 |
-
wf.setnchannels(1)
|
| 109 |
-
wf.
|
| 110 |
-
wf.setframerate(sample_rate)
|
| 111 |
-
wf.writeframes(struct.pack(f"<{n_samples}h", *([0] * n_samples)))
|
| 112 |
buf.seek(0)
|
| 113 |
return buf
|
| 114 |
|
| 115 |
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
return _model
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
def _get_semaphore() -> asyncio.Semaphore:
|
| 122 |
-
"""Return (or lazily create) the GPU semaphore on the current event loop."""
|
| 123 |
-
global _gpu_semaphore
|
| 124 |
-
if _gpu_semaphore is None:
|
| 125 |
-
# FIX: Always create on the running loop to avoid cross-loop binding.
|
| 126 |
-
try:
|
| 127 |
-
loop = asyncio.get_running_loop()
|
| 128 |
-
except RuntimeError:
|
| 129 |
-
loop = None
|
| 130 |
-
_gpu_semaphore = asyncio.Semaphore(1)
|
| 131 |
-
return _gpu_semaphore
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
# ── Background load at import ──────────────────────────────────────────────────
|
| 135 |
-
_bg_thread = threading.Thread(target=_load_and_warm, daemon=True, name="whisper-loader")
|
| 136 |
-
_bg_thread.start()
|
| 137 |
|
| 138 |
|
| 139 |
-
# ──
|
| 140 |
-
def
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
MIN_INPUT_BYTES = 3_000
|
| 156 |
-
MAX_INPUT_BYTES = 5_242_880 # 5 MB cap — prevents runaway blobs
|
| 157 |
-
|
| 158 |
-
@staticmethod
|
| 159 |
-
def _to_wav(audio_bytes: bytes) -> str | None:
|
| 160 |
-
"""
|
| 161 |
-
Convert browser WebM/Opus blob → 16 kHz mono WAV.
|
| 162 |
-
|
| 163 |
-
FIX: Replaced `loudnorm` with a lighter chain:
|
| 164 |
-
highpass f=80 — removes low-frequency rumble / HVAC noise
|
| 165 |
-
afftdn nf=-25 — light spectral noise reduction (−25 dB floor)
|
| 166 |
-
aresample — ensures clean 16 kHz output
|
| 167 |
-
|
| 168 |
-
This avoids the two-pass EBU R128 behaviour that loudnorm exhibits in
|
| 169 |
-
single-pass mode and that over-compresses short speech clips.
|
| 170 |
-
"""
|
| 171 |
-
in_path = out_path = None
|
| 172 |
try:
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
[
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
"-f", "wav", out_path,
|
| 186 |
-
],
|
| 187 |
-
stdout=subprocess.DEVNULL,
|
| 188 |
-
stderr=subprocess.PIPE,
|
| 189 |
-
timeout=30, # failsafe: kill runaway ffmpeg
|
| 190 |
)
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
if not os.path.exists(out_path) or os.path.getsize(out_path) < 500:
|
| 195 |
-
print("[STT] ffmpeg produced empty WAV.")
|
| 196 |
-
return None
|
| 197 |
-
print(f"[STT] WAV ready: {os.path.getsize(out_path):,} bytes")
|
| 198 |
-
return out_path
|
| 199 |
-
except subprocess.TimeoutExpired:
|
| 200 |
-
print("[STT] ffmpeg timed out.")
|
| 201 |
-
return None
|
| 202 |
except Exception as exc:
|
| 203 |
-
print(f"[STT]
|
| 204 |
-
|
| 205 |
finally:
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
|
| 227 |
-
|
| 228 |
-
wav_path
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
no_speech_threshold=0.6, # was 0.5; avoids breath-noise transcription
|
| 238 |
-
log_prob_threshold=-0.5, # was -1.0; rejects low-confidence segments
|
| 239 |
-
compression_ratio_threshold=2.4, # filter repetitive/garbage output
|
| 240 |
-
)
|
| 241 |
-
text = " ".join(seg.text.strip() for seg in segments).strip()
|
| 242 |
-
print(f"[STT] Lang={info.language} prob={info.language_probability:.2f}")
|
| 243 |
-
return text
|
| 244 |
-
|
| 245 |
-
# async def transcribe(self, audio_bytes: bytes) -> str | None:
|
| 246 |
-
# """Full pipeline: validate → wait for model → ffmpeg → GPU inference."""
|
| 247 |
-
# if len(audio_bytes) < self.MIN_INPUT_BYTES:
|
| 248 |
-
# print(f"[STT] Too short ({len(audio_bytes)} B), skipping.")
|
| 249 |
-
# return None
|
| 250 |
-
|
| 251 |
-
# # FIX: Cap oversized blobs early
|
| 252 |
-
# if len(audio_bytes) > self.MAX_INPUT_BYTES:
|
| 253 |
-
# print(f"[STT] Input too large ({len(audio_bytes):,} B), capping.")
|
| 254 |
-
# audio_bytes = audio_bytes[: self.MAX_INPUT_BYTES]
|
| 255 |
-
|
| 256 |
-
# if not _model_ready.is_set():
|
| 257 |
-
# print("[STT] Model loading, waiting …")
|
| 258 |
-
# await asyncio.to_thread(_model_ready.wait)
|
| 259 |
-
|
| 260 |
-
# wav_path = await asyncio.to_thread(self._to_wav, audio_bytes)
|
| 261 |
-
# if not wav_path:
|
| 262 |
-
# return None
|
| 263 |
-
|
| 264 |
-
# sem = _get_semaphore()
|
| 265 |
-
# try:
|
| 266 |
-
# async with sem:
|
| 267 |
-
# loop = asyncio.get_running_loop()
|
| 268 |
-
# text = await loop.run_in_executor(
|
| 269 |
-
# _inference_pool, self._transcribe_sync, wav_path
|
| 270 |
-
# )
|
| 271 |
-
# except Exception as exc:
|
| 272 |
-
# print(f"[STT] transcribe error: {exc}")
|
| 273 |
-
# import traceback; traceback.print_exc()
|
| 274 |
-
# return None
|
| 275 |
-
# finally:
|
| 276 |
-
# if os.path.exists(wav_path):
|
| 277 |
-
# try: os.remove(wav_path)
|
| 278 |
-
# except OSError: pass
|
| 279 |
-
|
| 280 |
-
# if not text:
|
| 281 |
-
# print("[STT] Empty transcript.")
|
| 282 |
-
# return None
|
| 283 |
-
|
| 284 |
-
# # Hallucination guard
|
| 285 |
-
# words = text.split()
|
| 286 |
-
# unique_ratio = len(set(words)) / len(words) if words else 1.0
|
| 287 |
-
# if len(words) >= 3 and unique_ratio < 0.40:
|
| 288 |
-
# print(f"[STT] Hallucination discarded (repetition): {text[:60]}")
|
| 289 |
-
# return None
|
| 290 |
-
# if len(words) == 2 and words[0] == words[1]:
|
| 291 |
-
# print(f"[STT] Hallucination discarded (2-word repeat): {text[:60]}")
|
| 292 |
-
# return None
|
| 293 |
-
|
| 294 |
-
# if not _is_valid_bangla(text):
|
| 295 |
-
# print(f"[STT] Wrong script discarded: {text[:60]}")
|
| 296 |
-
# return None
|
| 297 |
-
|
| 298 |
-
# print(f"[STT] Transcript: {text}")
|
| 299 |
-
# return text
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
async def transcribe(self, audio_bytes: bytes) -> str | None:
|
| 303 |
-
"""Robust STT pipeline optimized for streaming voice."""
|
| 304 |
-
|
| 305 |
-
# ─────────────────────────────
|
| 306 |
-
# 1. VERY LIGHT sanity check (DO NOT OVER FILTER)
|
| 307 |
-
# ─────────────────────────────
|
| 308 |
-
if not audio_bytes or len(audio_bytes) < 300:
|
| 309 |
-
print(f"[STT] Ignored tiny packet ({len(audio_bytes)} B)")
|
| 310 |
-
return None
|
| 311 |
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
|
| 317 |
-
# ─────────────────────────────
|
| 318 |
-
# 2. Wait for model readiness (unchanged)
|
| 319 |
-
# ─────────────────────────────
|
| 320 |
-
if not _model_ready.is_set():
|
| 321 |
-
print("[STT] Model loading, waiting …")
|
| 322 |
-
await asyncio.to_thread(_model_ready.wait)
|
| 323 |
|
| 324 |
-
|
| 325 |
-
# 3. Convert audio
|
| 326 |
-
# ─────────────────────────────
|
| 327 |
-
wav_path = await asyncio.to_thread(self._to_wav, audio_bytes)
|
| 328 |
-
if not wav_path:
|
| 329 |
-
return None
|
| 330 |
|
| 331 |
-
sem = _get_semaphore()
|
| 332 |
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
text = await loop.run_in_executor(
|
| 337 |
-
_inference_pool,
|
| 338 |
-
self._transcribe_sync,
|
| 339 |
-
wav_path
|
| 340 |
-
)
|
| 341 |
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
|
|
|
|
|
|
| 345 |
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
if wav_path and os.path.exists(wav_path):
|
| 349 |
-
os.remove(wav_path)
|
| 350 |
-
except OSError:
|
| 351 |
-
pass
|
| 352 |
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
# ─────────────────────────────
|
| 356 |
-
if not text or not text.strip():
|
| 357 |
-
print("[STT] Empty transcript.")
|
| 358 |
return None
|
| 359 |
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
# ─────────────────────────────
|
| 363 |
-
# 5. SAFE hallucination filter (RELAXED)
|
| 364 |
-
# ─────────────────────────────
|
| 365 |
-
words = text.split()
|
| 366 |
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
if unique_ratio < 0.25:
|
| 372 |
-
print(f"[STT] Rejected heavy repetition: {text[:60]}")
|
| 373 |
-
return None
|
| 374 |
|
| 375 |
-
#
|
| 376 |
-
|
| 377 |
-
|
|
|
|
| 378 |
return None
|
| 379 |
|
| 380 |
-
#
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
try:
|
| 384 |
-
if not _is_valid_bangla(text):
|
| 385 |
-
# do NOT drop aggressively — log only
|
| 386 |
-
print(f"[STT] Non-Bangla detected (kept anyway): {text[:60]}")
|
| 387 |
-
except Exception:
|
| 388 |
-
pass
|
| 389 |
-
|
| 390 |
-
# ─────────────────────────────
|
| 391 |
-
# 7. SUCCESS
|
| 392 |
-
# ─────────────────────────────
|
| 393 |
-
print(f"[STT] Transcript: {text}")
|
| 394 |
-
return text
|
|
|
|
| 1 |
"""
|
| 2 |
+
services/stt.py — GPU-Batched Faster-Whisper STT Pipeline
|
| 3 |
+
|
| 4 |
+
Architecture:
|
| 5 |
+
─────────────
|
| 6 |
+
• Single shared WhisperModel instance (loaded once, never reloaded)
|
| 7 |
+
• asyncio.Queue-based request intake — fully non-blocking
|
| 8 |
+
• Micro-batching worker: accumulates requests over BATCH_WINDOW_MS,
|
| 9 |
+
then runs a single GPU forward pass for the entire batch
|
| 10 |
+
• Each caller awaits its own asyncio.Future — zero polling overhead
|
| 11 |
+
• ffmpeg audio conversion runs in a ThreadPoolExecutor (I/O bound)
|
| 12 |
+
• GPU inference runs in a dedicated single-thread Executor (serialize GPU)
|
| 13 |
+
• Bangla-optimised decode parameters preserved from original
|
| 14 |
+
|
| 15 |
+
Latency profile:
|
| 16 |
+
ffmpeg (parallel) ~30–80 ms
|
| 17 |
+
batch wait window ~50 ms
|
| 18 |
+
GPU inference ~80–150 ms per batch (amortised across requests)
|
| 19 |
+
Total perceived < 200 ms at moderate load
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
"""
|
| 21 |
|
| 22 |
from __future__ import annotations
|
|
|
|
| 25 |
import io
|
| 26 |
import os
|
| 27 |
import re
|
| 28 |
+
import struct
|
| 29 |
import subprocess
|
| 30 |
import tempfile
|
| 31 |
import threading
|
| 32 |
+
import wave
|
| 33 |
from concurrent.futures import ThreadPoolExecutor
|
| 34 |
+
from dataclasses import dataclass, field
|
| 35 |
+
from typing import Optional
|
| 36 |
|
| 37 |
from faster_whisper import WhisperModel
|
| 38 |
|
| 39 |
+
# ── Bangla script patterns ─────────────────────────────────────────────────────
|
| 40 |
+
_BANGLA_RE = re.compile(r"[\u0980-\u09FF]")
|
| 41 |
+
_WRONG_SCRIPT_RE = re.compile(
|
| 42 |
+
r"[\u0600-\u06FF\u0750-\u077F\uFB50-\uFDFF\uFE70-\uFEFF]"
|
|
|
|
|
|
|
|
|
|
| 43 |
)
|
| 44 |
|
| 45 |
+
# Bangla decoder seed — keeps Whisper in বাংলা Unicode block
|
|
|
|
|
|
|
|
|
|
| 46 |
_BANGLA_SEED = "আমি আপনার সাথে বাংলায় কথা বলছি।"
|
| 47 |
|
| 48 |
+
# ── Configuration ──────────────────────────────────────────────────────────────
|
| 49 |
+
_STT_MODEL = os.getenv("STT_MODEL", "large-v3")
|
| 50 |
+
_COMPUTE_TYPE = os.getenv("STT_COMPUTE_TYPE", "int8_float32")
|
| 51 |
+
_BATCH_WINDOW = float(os.getenv("STT_BATCH_WINDOW_MS", "50")) / 1000 # seconds
|
| 52 |
+
_MAX_BATCH = int(os.getenv("STT_MAX_BATCH", "8"))
|
| 53 |
+
MAX_INPUT_BYTES = 5_242_880 # 5 MB
|
| 54 |
|
| 55 |
+
# ── Singleton model state ──────────────────────────────────────────────────────
|
| 56 |
+
_model: Optional[WhisperModel] = None
|
| 57 |
+
_model_lock = threading.Lock()
|
| 58 |
+
_model_ready = threading.Event()
|
|
|
|
| 59 |
|
| 60 |
+
# Two executors: one for ffmpeg (I/O, can be parallel), one for GPU (serial)
|
| 61 |
+
_ffmpeg_pool = ThreadPoolExecutor(max_workers=4, thread_name_prefix="ffmpeg")
|
| 62 |
+
_gpu_pool = ThreadPoolExecutor(max_workers=1, thread_name_prefix="whisper-gpu")
|
| 63 |
|
| 64 |
|
| 65 |
+
# ── Model loader (background thread) ──────────────────────────────────────────
|
| 66 |
def _load_and_warm() -> None:
|
| 67 |
global _model
|
| 68 |
try:
|
| 69 |
+
print(f"[STT] Loading Faster-Whisper {_STT_MODEL} on CUDA ({_COMPUTE_TYPE})…")
|
| 70 |
m = WhisperModel(
|
| 71 |
_STT_MODEL,
|
| 72 |
device="cuda",
|
| 73 |
compute_type=_COMPUTE_TYPE,
|
| 74 |
num_workers=1,
|
| 75 |
)
|
| 76 |
+
# GPU warmup — forces CUDA kernel compilation
|
| 77 |
+
silence = _make_silence_wav(0.5)
|
| 78 |
list(m.transcribe(silence, language="bn", beam_size=1)[0])
|
| 79 |
+
print("[STT] GPU warmup complete. STT ready ✓")
|
| 80 |
with _model_lock:
|
| 81 |
_model = m
|
| 82 |
except Exception as exc:
|
| 83 |
+
print(f"[STT] Model load failed: {exc}")
|
| 84 |
finally:
|
| 85 |
_model_ready.set()
|
| 86 |
|
| 87 |
|
| 88 |
+
def _make_silence_wav(duration_s: float = 0.5, sr: int = 16_000) -> io.BytesIO:
|
|
|
|
| 89 |
buf = io.BytesIO()
|
| 90 |
+
n = int(sr * duration_s)
|
| 91 |
with wave.open(buf, "wb") as wf:
|
| 92 |
+
wf.setnchannels(1); wf.setsampwidth(2); wf.setframerate(sr)
|
| 93 |
+
wf.writeframes(struct.pack(f"<{n}h", *([0] * n)))
|
|
|
|
|
|
|
| 94 |
buf.seek(0)
|
| 95 |
return buf
|
| 96 |
|
| 97 |
|
| 98 |
+
# Start background model load immediately at import
|
| 99 |
+
threading.Thread(target=_load_and_warm, daemon=True, name="whisper-loader").start()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
|
| 101 |
|
| 102 |
+
# ── ffmpeg conversion (sync, runs in _ffmpeg_pool) ────────────────────────────
|
| 103 |
+
def _to_wav_sync(audio_bytes: bytes) -> Optional[str]:
|
| 104 |
+
"""Convert WebM/Opus → 16 kHz mono WAV. Returns temp file path or None."""
|
| 105 |
+
in_path = out_path = None
|
| 106 |
+
try:
|
| 107 |
+
with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as f:
|
| 108 |
+
f.write(audio_bytes); in_path = f.name
|
| 109 |
+
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
|
| 110 |
+
out_path = f.name
|
| 111 |
+
|
| 112 |
+
result = subprocess.run(
|
| 113 |
+
[
|
| 114 |
+
"ffmpeg", "-y", "-loglevel", "warning",
|
| 115 |
+
"-i", in_path,
|
| 116 |
+
"-ar", "16000", "-ac", "1",
|
| 117 |
+
"-af", "highpass=f=80,afftdn=nf=-25,aresample=resampler=swr",
|
| 118 |
+
"-f", "wav", out_path,
|
| 119 |
+
],
|
| 120 |
+
stdout=subprocess.DEVNULL,
|
| 121 |
+
stderr=subprocess.PIPE,
|
| 122 |
+
timeout=30,
|
| 123 |
+
)
|
| 124 |
+
if result.returncode != 0:
|
| 125 |
+
print("[STT][ffmpeg]", result.stderr.decode(errors="replace")[:200])
|
| 126 |
+
return None
|
| 127 |
+
if not os.path.exists(out_path) or os.path.getsize(out_path) < 500:
|
| 128 |
+
return None
|
| 129 |
+
return out_path
|
| 130 |
+
except subprocess.TimeoutExpired:
|
| 131 |
+
print("[STT][ffmpeg] timed out")
|
| 132 |
+
return None
|
| 133 |
+
except Exception as exc:
|
| 134 |
+
print(f"[STT][ffmpeg] {exc}")
|
| 135 |
+
return None
|
| 136 |
+
finally:
|
| 137 |
+
if in_path and os.path.exists(in_path):
|
| 138 |
+
try: os.remove(in_path)
|
| 139 |
+
except OSError: pass
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
# ── Whisper inference (sync, runs in _gpu_pool — ONE AT A TIME) ───────────────
|
| 143 |
+
def _transcribe_batch_sync(wav_paths: list[str]) -> list[Optional[str]]:
|
| 144 |
+
"""
|
| 145 |
+
Run Whisper inference on a list of WAV paths.
|
| 146 |
+
Returns a list of transcripts (None on error/empty).
|
| 147 |
+
Each file is processed sequentially on the same GPU — this is intentional:
|
| 148 |
+
batching here means we avoid per-request CUDA kernel spin-up overhead.
|
| 149 |
+
"""
|
| 150 |
+
with _model_lock:
|
| 151 |
+
model = _model
|
| 152 |
+
if model is None:
|
| 153 |
+
return [None] * len(wav_paths)
|
| 154 |
|
| 155 |
+
results: list[Optional[str]] = []
|
| 156 |
+
for path in wav_paths:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
try:
|
| 158 |
+
segments, info = model.transcribe(
|
| 159 |
+
path,
|
| 160 |
+
language="bn",
|
| 161 |
+
beam_size=5,
|
| 162 |
+
vad_filter=False,
|
| 163 |
+
condition_on_previous_text=False,
|
| 164 |
+
temperature=0,
|
| 165 |
+
suppress_tokens=[-1],
|
| 166 |
+
initial_prompt=_BANGLA_SEED,
|
| 167 |
+
no_speech_threshold=0.6,
|
| 168 |
+
log_prob_threshold=-0.5,
|
| 169 |
+
compression_ratio_threshold=2.4,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
)
|
| 171 |
+
text = " ".join(seg.text.strip() for seg in segments).strip()
|
| 172 |
+
print(f"[STT] lang={info.language} p={info.language_probability:.2f} → {text[:60]}")
|
| 173 |
+
results.append(text or None)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
except Exception as exc:
|
| 175 |
+
print(f"[STT] inference error: {exc}")
|
| 176 |
+
results.append(None)
|
| 177 |
finally:
|
| 178 |
+
try: os.remove(path)
|
| 179 |
+
except OSError: pass
|
| 180 |
+
|
| 181 |
+
return results
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
# ── Hallucination / script validation ─────────────────────────────────────────
|
| 185 |
+
def _validate(text: str) -> Optional[str]:
|
| 186 |
+
if not text or not text.strip():
|
| 187 |
+
return None
|
| 188 |
+
text = text.strip()
|
| 189 |
+
words = text.split()
|
| 190 |
+
if len(words) >= 6 and len(set(words)) / len(words) < 0.25:
|
| 191 |
+
print(f"[STT] rejected repetition: {text[:60]}")
|
| 192 |
+
return None
|
| 193 |
+
if len(words) == 2 and words[0] == words[1]:
|
| 194 |
+
return None
|
| 195 |
+
# Soft script check — log but keep
|
| 196 |
+
wrong = len(_WRONG_SCRIPT_RE.findall(text))
|
| 197 |
+
alpha = sum(1 for c in text if c.isalpha())
|
| 198 |
+
if alpha > 0 and wrong / alpha > 0.30:
|
| 199 |
+
print(f"[STT] non-Bangla (kept): {text[:60]}")
|
| 200 |
+
return text
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
| 204 |
+
# BATCH QUEUE + WORKER
|
| 205 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
| 206 |
+
|
| 207 |
+
@dataclass
|
| 208 |
+
class _STTRequest:
|
| 209 |
+
wav_path: str
|
| 210 |
+
future: asyncio.Future = field(default_factory=asyncio.Future)
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
class _STTBatchWorker:
|
| 214 |
+
"""
|
| 215 |
+
Singleton async worker that:
|
| 216 |
+
1. Accepts STT requests from any coroutine via enqueue()
|
| 217 |
+
2. Collects requests for up to BATCH_WINDOW_MS
|
| 218 |
+
3. Dispatches the batch to _gpu_pool in one call
|
| 219 |
+
4. Resolves each caller's Future
|
| 220 |
+
"""
|
| 221 |
+
|
| 222 |
+
def __init__(self) -> None:
|
| 223 |
+
self._queue: asyncio.Queue[_STTRequest] = asyncio.Queue()
|
| 224 |
+
self._started = False
|
| 225 |
+
|
| 226 |
+
def _ensure_started(self) -> None:
|
| 227 |
+
if not self._started:
|
| 228 |
+
self._started = True
|
| 229 |
+
asyncio.ensure_future(self._worker_loop())
|
| 230 |
+
|
| 231 |
+
async def enqueue(self, wav_path: str) -> Optional[str]:
|
| 232 |
+
self._ensure_started()
|
| 233 |
+
loop = asyncio.get_event_loop()
|
| 234 |
+
req = _STTRequest(wav_path=wav_path, future=loop.create_future())
|
| 235 |
+
await self._queue.put(req)
|
| 236 |
+
return await req.future
|
| 237 |
+
|
| 238 |
+
async def _worker_loop(self) -> None:
|
| 239 |
+
loop = asyncio.get_event_loop()
|
| 240 |
+
while True:
|
| 241 |
+
# Wait for at least one request
|
| 242 |
+
first = await self._queue.get()
|
| 243 |
+
batch = [first]
|
| 244 |
+
|
| 245 |
+
# Micro-batch window: collect more requests arriving within BATCH_WINDOW
|
| 246 |
+
try:
|
| 247 |
+
deadline = loop.time() + _BATCH_WINDOW
|
| 248 |
+
while len(batch) < _MAX_BATCH:
|
| 249 |
+
remaining = deadline - loop.time()
|
| 250 |
+
if remaining <= 0:
|
| 251 |
+
break
|
| 252 |
+
req = await asyncio.wait_for(self._queue.get(), timeout=remaining)
|
| 253 |
+
batch.append(req)
|
| 254 |
+
except asyncio.TimeoutError:
|
| 255 |
+
pass
|
| 256 |
|
| 257 |
+
# Dispatch batch to GPU executor
|
| 258 |
+
wav_paths = [r.wav_path for r in batch]
|
| 259 |
+
print(f"[STT] Dispatching batch of {len(batch)} to GPU…")
|
| 260 |
+
try:
|
| 261 |
+
results = await loop.run_in_executor(
|
| 262 |
+
_gpu_pool, _transcribe_batch_sync, wav_paths
|
| 263 |
+
)
|
| 264 |
+
except Exception as exc:
|
| 265 |
+
results = [None] * len(batch)
|
| 266 |
+
print(f"[STT] Batch GPU error: {exc}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
|
| 268 |
+
# Resolve futures
|
| 269 |
+
for req, text in zip(batch, results):
|
| 270 |
+
if not req.future.done():
|
| 271 |
+
req.future.set_result(text)
|
| 272 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 273 |
|
| 274 |
+
_batch_worker = _STTBatchWorker()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 275 |
|
|
|
|
| 276 |
|
| 277 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
| 278 |
+
# PUBLIC API
|
| 279 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 280 |
|
| 281 |
+
class STTProcessor:
|
| 282 |
+
"""
|
| 283 |
+
Drop-in replacement for the original STTProcessor.
|
| 284 |
+
Now routes through the GPU batch worker for shared inference.
|
| 285 |
+
"""
|
| 286 |
|
| 287 |
+
async def transcribe(self, audio_bytes: bytes) -> Optional[str]:
|
| 288 |
+
"""Full pipeline: validate → ffmpeg (parallel) → batch GPU inference."""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
|
| 290 |
+
if not audio_bytes or len(audio_bytes) < 300:
|
| 291 |
+
print(f"[STT] Ignored tiny packet ({len(audio_bytes)} B)")
|
|
|
|
|
|
|
|
|
|
| 292 |
return None
|
| 293 |
|
| 294 |
+
if len(audio_bytes) > MAX_INPUT_BYTES:
|
| 295 |
+
audio_bytes = audio_bytes[:MAX_INPUT_BYTES]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 296 |
|
| 297 |
+
# Wait for model readiness (non-blocking)
|
| 298 |
+
if not _model_ready.is_set():
|
| 299 |
+
print("[STT] Waiting for model…")
|
| 300 |
+
await asyncio.to_thread(_model_ready.wait)
|
|
|
|
|
|
|
|
|
|
| 301 |
|
| 302 |
+
# ffmpeg: runs in parallel I/O pool (not serialised)
|
| 303 |
+
loop = asyncio.get_event_loop()
|
| 304 |
+
wav_path = await loop.run_in_executor(_ffmpeg_pool, _to_wav_sync, audio_bytes)
|
| 305 |
+
if not wav_path:
|
| 306 |
return None
|
| 307 |
|
| 308 |
+
# Batch GPU inference
|
| 309 |
+
text = await _batch_worker.enqueue(wav_path)
|
| 310 |
+
return _validate(text) if text else None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
services/tts.py
CHANGED
|
@@ -1,46 +1,22 @@
|
|
| 1 |
"""
|
| 2 |
services/tts.py — Ultra Low-Latency Dual TTS Backend
|
| 3 |
-
|
| 4 |
-
Fixes applied:
|
| 5 |
-
- sentence-level streaming
|
| 6 |
-
- reduced chunk buffering (ElevenLabs)
|
| 7 |
-
- WebSocket-safe streaming design
|
| 8 |
-
- optional PCM mode (recommended for real-time apps)
|
| 9 |
-
- first-audio priority behavior
|
| 10 |
-
- no internal accumulation
|
| 11 |
-
- improved async flow stability
|
| 12 |
"""
|
| 13 |
|
| 14 |
from dotenv import load_dotenv
|
| 15 |
-
import os
|
| 16 |
-
import re
|
| 17 |
-
import asyncio
|
| 18 |
|
| 19 |
load_dotenv()
|
| 20 |
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
# ─────────────────────────────────────────────
|
| 29 |
-
EDGE_VOICE = "bn-BD-NabanitaNeural"
|
| 30 |
-
|
| 31 |
-
# ─────────────────────────────────────────────
|
| 32 |
-
# ELEVENLABS CONFIG
|
| 33 |
-
# ─────────────────────────────────────────────
|
| 34 |
-
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY", "")
|
| 35 |
-
ELEVENLABS_VOICE_ID = os.getenv("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")
|
| 36 |
-
ELEVENLABS_MODEL_ID = os.getenv("ELEVENLABS_MODEL_ID", "eleven_multilingual_v2")
|
| 37 |
-
|
| 38 |
-
# 🔥 LOW LATENCY FORMAT (IMPORTANT FIX)
|
| 39 |
-
ELEVENLABS_OUTPUT_FORMAT = "pcm_16000" # BEST for real-time (no MP3 decode delay)
|
| 40 |
-
|
| 41 |
-
ELEVENLABS_STABILITY = 0.45
|
| 42 |
ELEVENLABS_SIMILARITY = 0.80
|
| 43 |
-
ELEVENLABS_STYLE
|
| 44 |
ELEVENLABS_SPEAKER_BOOST = True
|
| 45 |
|
| 46 |
if USE_ELEVENLABS and not ELEVENLABS_API_KEY:
|
|
@@ -49,49 +25,28 @@ if USE_ELEVENLABS and not ELEVENLABS_API_KEY:
|
|
| 49 |
print(f"[TTS] Backend: {'ElevenLabs' if USE_ELEVENLABS else 'Edge-TTS'}")
|
| 50 |
|
| 51 |
|
| 52 |
-
|
| 53 |
-
# TEXT SPLITTER (REAL LATENCY FIX)
|
| 54 |
-
# ─────────────────────────────────────────────
|
| 55 |
-
def split_sentences(text: str):
|
| 56 |
text = text.strip()
|
| 57 |
if not text:
|
| 58 |
return []
|
| 59 |
-
|
| 60 |
-
# Bangla + English sentence splitting
|
| 61 |
parts = re.split(r'(?<=[।.!?])\s+', text)
|
| 62 |
-
|
| 63 |
-
# prevent empty + reduce micro-chunks
|
| 64 |
return [p.strip() for p in parts if len(p.strip()) > 1]
|
| 65 |
|
| 66 |
|
| 67 |
-
|
| 68 |
-
# EDGE-TTS STREAM (FIXED + NON-BLOCKING)
|
| 69 |
-
# ─────────────────────────────────────────────
|
| 70 |
-
async def _edge_tts_stream(text: str, voice: str = EDGE_VOICE):
|
| 71 |
import edge_tts
|
| 72 |
-
|
| 73 |
text = text.strip()
|
| 74 |
if not text:
|
| 75 |
return
|
| 76 |
-
|
| 77 |
try:
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
async for chunk in communicate.stream():
|
| 81 |
if chunk["type"] == "audio":
|
| 82 |
-
# 🔥 immediate yield (no buffering)
|
| 83 |
yield chunk["data"]
|
| 84 |
-
|
| 85 |
-
# allow event loop fairness (prevents WebSocket lag spikes)
|
| 86 |
await asyncio.sleep(0)
|
| 87 |
-
|
| 88 |
except Exception as exc:
|
| 89 |
-
print(f"[TTS][Edge]
|
| 90 |
|
| 91 |
|
| 92 |
-
# ─────────────────────────────────────────────
|
| 93 |
-
# ELEVENLABS STREAM (LOW LATENCY FIXED)
|
| 94 |
-
# ─────────────────────────────────────────────
|
| 95 |
async def _elevenlabs_stream(
|
| 96 |
text: str,
|
| 97 |
voice_id: str = ELEVENLABS_VOICE_ID,
|
|
@@ -103,105 +58,69 @@ async def _elevenlabs_stream(
|
|
| 103 |
speaker_boost: bool = ELEVENLABS_SPEAKER_BOOST,
|
| 104 |
):
|
| 105 |
import httpx
|
| 106 |
-
|
| 107 |
text = text.strip()
|
| 108 |
if not text:
|
| 109 |
return
|
| 110 |
-
|
| 111 |
url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream"
|
| 112 |
-
|
| 113 |
-
headers = {
|
| 114 |
-
"xi-api-key": ELEVENLABS_API_KEY,
|
| 115 |
-
"Content-Type": "application/json",
|
| 116 |
-
"Accept": "audio/mpeg",
|
| 117 |
-
}
|
| 118 |
-
|
| 119 |
payload = {
|
| 120 |
-
"text": text,
|
| 121 |
-
"
|
| 122 |
-
|
| 123 |
-
"stability": stability,
|
| 124 |
-
"similarity_boost": similarity,
|
| 125 |
-
"style": style,
|
| 126 |
-
"use_speaker_boost": speaker_boost,
|
| 127 |
-
},
|
| 128 |
}
|
| 129 |
-
|
| 130 |
-
params = {"output_format": output_format}
|
| 131 |
-
|
| 132 |
try:
|
| 133 |
-
async with httpx.AsyncClient(
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
async with client.stream(
|
| 138 |
-
"POST",
|
| 139 |
-
url,
|
| 140 |
-
headers=headers,
|
| 141 |
-
json=payload,
|
| 142 |
-
params=params,
|
| 143 |
-
) as resp:
|
| 144 |
-
|
| 145 |
if resp.status_code != 200:
|
| 146 |
-
|
| 147 |
-
print(f"[TTS][ElevenLabs] HTTP {resp.status_code}: {err[:200]}")
|
| 148 |
return
|
| 149 |
-
|
| 150 |
-
# 🔥 smaller chunk size = lower latency
|
| 151 |
async for chunk in resp.aiter_bytes(chunk_size=512):
|
| 152 |
if chunk:
|
| 153 |
yield chunk
|
| 154 |
await asyncio.sleep(0)
|
| 155 |
-
|
| 156 |
except Exception as exc:
|
| 157 |
-
print(f"[TTS][ElevenLabs]
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
# ─────────────────────────────────────────────
|
| 161 |
-
# PUBLIC API (ZERO BUFFER STREAM DESIGN)
|
| 162 |
-
# ─────────────────────────────────────────────
|
| 163 |
-
async def text_to_speech_stream(text: str, voice: str | None = None):
|
| 164 |
-
"""
|
| 165 |
-
Ultra-low latency streaming TTS generator.
|
| 166 |
|
| 167 |
-
Designed for:
|
| 168 |
-
- FastAPI WebSocket
|
| 169 |
-
- real-time AI agents
|
| 170 |
-
- Bangla-first voice systems
|
| 171 |
-
"""
|
| 172 |
|
|
|
|
| 173 |
text = text.strip()
|
| 174 |
if not text:
|
| 175 |
return
|
| 176 |
|
| 177 |
-
voice_to_use = voice
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
# ─────────────────────────────
|
| 182 |
-
if USE_ELEVENLABS:
|
| 183 |
-
for part in split_sentences(text):
|
| 184 |
-
|
| 185 |
-
# 🔥 stream immediately per sentence
|
| 186 |
-
async for chunk in _elevenlabs_stream(
|
| 187 |
-
part,
|
| 188 |
-
voice_id=voice_to_use or ELEVENLABS_VOICE_ID,
|
| 189 |
-
):
|
| 190 |
-
yield chunk
|
| 191 |
-
|
| 192 |
-
# yield control (prevents backend lag spikes)
|
| 193 |
-
await asyncio.sleep(0)
|
| 194 |
-
|
| 195 |
-
# ─────────────────────────────
|
| 196 |
-
# EDGE MODE
|
| 197 |
-
# ─────────────────────────────
|
| 198 |
-
else:
|
| 199 |
-
for part in split_sentences(text):
|
| 200 |
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
yield chunk
|
| 206 |
-
|
| 207 |
-
|
|
|
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
services/tts.py — Ultra Low-Latency Dual TTS Backend
|
| 3 |
+
(unchanged public API — streaming.py imports text_to_speech_stream + USE_ELEVENLABS)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
"""
|
| 5 |
|
| 6 |
from dotenv import load_dotenv
|
| 7 |
+
import os, re, asyncio
|
|
|
|
|
|
|
| 8 |
|
| 9 |
load_dotenv()
|
| 10 |
|
| 11 |
+
USE_ELEVENLABS = False
|
| 12 |
+
EDGE_VOICE = "bn-BD-NabanitaNeural"
|
| 13 |
+
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY", "")
|
| 14 |
+
ELEVENLABS_VOICE_ID = os.getenv("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")
|
| 15 |
+
ELEVENLABS_MODEL_ID = os.getenv("ELEVENLABS_MODEL_ID", "eleven_multilingual_v2")
|
| 16 |
+
ELEVENLABS_OUTPUT_FORMAT = "pcm_16000"
|
| 17 |
+
ELEVENLABS_STABILITY = 0.45
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
ELEVENLABS_SIMILARITY = 0.80
|
| 19 |
+
ELEVENLABS_STYLE = 0.35
|
| 20 |
ELEVENLABS_SPEAKER_BOOST = True
|
| 21 |
|
| 22 |
if USE_ELEVENLABS and not ELEVENLABS_API_KEY:
|
|
|
|
| 25 |
print(f"[TTS] Backend: {'ElevenLabs' if USE_ELEVENLABS else 'Edge-TTS'}")
|
| 26 |
|
| 27 |
|
| 28 |
+
def split_sentences(text: str) -> list[str]:
|
|
|
|
|
|
|
|
|
|
| 29 |
text = text.strip()
|
| 30 |
if not text:
|
| 31 |
return []
|
|
|
|
|
|
|
| 32 |
parts = re.split(r'(?<=[।.!?])\s+', text)
|
|
|
|
|
|
|
| 33 |
return [p.strip() for p in parts if len(p.strip()) > 1]
|
| 34 |
|
| 35 |
|
| 36 |
+
async def _edge_tts_stream(text: str, voice: str = EDGE_VOICE, rate: str = "-30%"):
|
|
|
|
|
|
|
|
|
|
| 37 |
import edge_tts
|
|
|
|
| 38 |
text = text.strip()
|
| 39 |
if not text:
|
| 40 |
return
|
|
|
|
| 41 |
try:
|
| 42 |
+
async for chunk in edge_tts.Communicate(text, voice, rate=rate).stream():
|
|
|
|
|
|
|
| 43 |
if chunk["type"] == "audio":
|
|
|
|
| 44 |
yield chunk["data"]
|
|
|
|
|
|
|
| 45 |
await asyncio.sleep(0)
|
|
|
|
| 46 |
except Exception as exc:
|
| 47 |
+
print(f"[TTS][Edge] {exc}")
|
| 48 |
|
| 49 |
|
|
|
|
|
|
|
|
|
|
| 50 |
async def _elevenlabs_stream(
|
| 51 |
text: str,
|
| 52 |
voice_id: str = ELEVENLABS_VOICE_ID,
|
|
|
|
| 58 |
speaker_boost: bool = ELEVENLABS_SPEAKER_BOOST,
|
| 59 |
):
|
| 60 |
import httpx
|
|
|
|
| 61 |
text = text.strip()
|
| 62 |
if not text:
|
| 63 |
return
|
|
|
|
| 64 |
url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream"
|
| 65 |
+
headers = {"xi-api-key": ELEVENLABS_API_KEY, "Content-Type": "application/json", "Accept": "audio/mpeg"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
payload = {
|
| 67 |
+
"text": text, "model_id": model_id,
|
| 68 |
+
"voice_settings": {"stability": stability, "similarity_boost": similarity,
|
| 69 |
+
"style": style, "use_speaker_boost": speaker_boost},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
}
|
|
|
|
|
|
|
|
|
|
| 71 |
try:
|
| 72 |
+
async with httpx.AsyncClient(timeout=httpx.Timeout(connect=5.0, read=None)) as client:
|
| 73 |
+
async with client.stream("POST", url, headers=headers, json=payload,
|
| 74 |
+
params={"output_format": output_format}) as resp:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
if resp.status_code != 200:
|
| 76 |
+
print(f"[TTS][ElevenLabs] HTTP {resp.status_code}")
|
|
|
|
| 77 |
return
|
|
|
|
|
|
|
| 78 |
async for chunk in resp.aiter_bytes(chunk_size=512):
|
| 79 |
if chunk:
|
| 80 |
yield chunk
|
| 81 |
await asyncio.sleep(0)
|
|
|
|
| 82 |
except Exception as exc:
|
| 83 |
+
print(f"[TTS][ElevenLabs] {exc}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
|
| 86 |
+
async def text_to_speech_stream(text: str, voice: str | None = None, rate: str = "-30%"):
|
| 87 |
text = text.strip()
|
| 88 |
if not text:
|
| 89 |
return
|
| 90 |
|
| 91 |
+
voice_to_use = voice or (ELEVENLABS_VOICE_ID if USE_ELEVENLABS else EDGE_VOICE)
|
| 92 |
+
parts = split_sentences(text)
|
| 93 |
+
if not parts:
|
| 94 |
+
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
|
| 96 |
+
_SENT = object() # sentinel
|
| 97 |
+
|
| 98 |
+
async def _synth_part(part: str, q: asyncio.Queue):
|
| 99 |
+
try:
|
| 100 |
+
if USE_ELEVENLABS:
|
| 101 |
+
async for chunk in _elevenlabs_stream(part, voice_id=voice_to_use):
|
| 102 |
+
await q.put(chunk)
|
| 103 |
+
else:
|
| 104 |
+
async for chunk in _edge_tts_stream(part, voice=voice_to_use, rate=rate):
|
| 105 |
+
await q.put(chunk)
|
| 106 |
+
except Exception as exc:
|
| 107 |
+
print(f"[TTS] synth error: {exc}")
|
| 108 |
+
finally:
|
| 109 |
+
await q.put(_SENT)
|
| 110 |
+
|
| 111 |
+
# Create one queue per sentence, launch all synthesis tasks immediately
|
| 112 |
+
queues = [asyncio.Queue() for _ in parts]
|
| 113 |
+
tasks = [asyncio.create_task(_synth_part(p, q)) for p, q in zip(parts, queues)]
|
| 114 |
+
|
| 115 |
+
# Deliver audio in sentence order, but all sentences synthesise in parallel
|
| 116 |
+
try:
|
| 117 |
+
for q in queues:
|
| 118 |
+
while True:
|
| 119 |
+
chunk = await q.get()
|
| 120 |
+
if chunk is _SENT:
|
| 121 |
+
break
|
| 122 |
yield chunk
|
| 123 |
+
finally:
|
| 124 |
+
for t in tasks:
|
| 125 |
+
t.cancel()
|
| 126 |
+
await asyncio.gather(*tasks, return_exceptions=True)
|
services/vad.py
CHANGED
|
@@ -1,13 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import webrtcvad
|
| 2 |
|
|
|
|
| 3 |
class VADDetector:
|
| 4 |
def __init__(self, sample_rate=16000, frame_ms=30, aggressiveness=2):
|
| 5 |
-
self.vad
|
| 6 |
self.sample_rate = sample_rate
|
| 7 |
-
self.frame_ms
|
| 8 |
self.frame_size = int(sample_rate * frame_ms / 1000) * 2
|
| 9 |
|
| 10 |
-
def is_valid(self, frame: bytes):
|
| 11 |
return len(frame) == self.frame_size
|
| 12 |
|
| 13 |
def is_speech(self, frame: bytes) -> bool:
|
|
@@ -15,35 +21,33 @@ class VADDetector:
|
|
| 15 |
return False
|
| 16 |
try:
|
| 17 |
return self.vad.is_speech(frame, self.sample_rate)
|
| 18 |
-
except:
|
| 19 |
return False
|
| 20 |
|
| 21 |
|
| 22 |
class VADSegmenter:
|
| 23 |
def __init__(self, vad: VADDetector, silence_limit=8):
|
| 24 |
-
self.vad
|
| 25 |
self.silence_limit = silence_limit
|
| 26 |
-
|
| 27 |
-
self.
|
| 28 |
-
self.
|
| 29 |
-
self.active = False
|
| 30 |
|
| 31 |
def add_frame(self, frame: bytes):
|
| 32 |
speech = self.vad.is_speech(frame)
|
| 33 |
|
| 34 |
if speech:
|
| 35 |
self.buffer.extend(frame)
|
| 36 |
-
self.active
|
| 37 |
self.silence = 0
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
self.silence += 1
|
| 41 |
|
| 42 |
if self.active and self.silence > self.silence_limit:
|
| 43 |
audio = bytes(self.buffer)
|
| 44 |
self.buffer.clear()
|
| 45 |
self.silence = 0
|
| 46 |
-
self.active
|
| 47 |
return audio
|
| 48 |
|
| 49 |
return None
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
services/vad.py — WebRTC VAD wrapper (unchanged — already correct)
|
| 3 |
+
Now also used by webrtc_pipeline.py's _VADSegmenter for PCM frame processing.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
import webrtcvad
|
| 7 |
|
| 8 |
+
|
| 9 |
class VADDetector:
|
| 10 |
def __init__(self, sample_rate=16000, frame_ms=30, aggressiveness=2):
|
| 11 |
+
self.vad = webrtcvad.Vad(aggressiveness)
|
| 12 |
self.sample_rate = sample_rate
|
| 13 |
+
self.frame_ms = frame_ms
|
| 14 |
self.frame_size = int(sample_rate * frame_ms / 1000) * 2
|
| 15 |
|
| 16 |
+
def is_valid(self, frame: bytes) -> bool:
|
| 17 |
return len(frame) == self.frame_size
|
| 18 |
|
| 19 |
def is_speech(self, frame: bytes) -> bool:
|
|
|
|
| 21 |
return False
|
| 22 |
try:
|
| 23 |
return self.vad.is_speech(frame, self.sample_rate)
|
| 24 |
+
except Exception:
|
| 25 |
return False
|
| 26 |
|
| 27 |
|
| 28 |
class VADSegmenter:
|
| 29 |
def __init__(self, vad: VADDetector, silence_limit=8):
|
| 30 |
+
self.vad = vad
|
| 31 |
self.silence_limit = silence_limit
|
| 32 |
+
self.buffer = bytearray()
|
| 33 |
+
self.silence = 0
|
| 34 |
+
self.active = False
|
|
|
|
| 35 |
|
| 36 |
def add_frame(self, frame: bytes):
|
| 37 |
speech = self.vad.is_speech(frame)
|
| 38 |
|
| 39 |
if speech:
|
| 40 |
self.buffer.extend(frame)
|
| 41 |
+
self.active = True
|
| 42 |
self.silence = 0
|
| 43 |
+
elif self.active:
|
| 44 |
+
self.silence += 1
|
|
|
|
| 45 |
|
| 46 |
if self.active and self.silence > self.silence_limit:
|
| 47 |
audio = bytes(self.buffer)
|
| 48 |
self.buffer.clear()
|
| 49 |
self.silence = 0
|
| 50 |
+
self.active = False
|
| 51 |
return audio
|
| 52 |
|
| 53 |
return None
|
services/webrtc_pipeline.py
ADDED
|
@@ -0,0 +1,381 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
services/webrtc_pipeline.py — WebRTC Audio Pipeline + Full Parallelization
|
| 3 |
+
|
| 4 |
+
Architecture:
|
| 5 |
+
─────────────
|
| 6 |
+
Browser MediaStream (WebRTC)
|
| 7 |
+
│
|
| 8 |
+
│ RTCPeerConnection (aiortc)
|
| 9 |
+
▼
|
| 10 |
+
PCM frame receiver (20ms frames, 16kHz mono)
|
| 11 |
+
│
|
| 12 |
+
│ VAD (webrtcvad) — discard silence, buffer speech
|
| 13 |
+
▼
|
| 14 |
+
Speech segment → STT batch queue ──────────────────────────┐
|
| 15 |
+
│ parallel
|
| 16 |
+
STT result → LLM async stream ────────────────────────┐ │
|
| 17 |
+
│ │
|
| 18 |
+
LLM tokens → TTS ParallelStreamer ──────────────────┐ │ │
|
| 19 |
+
│ │ │
|
| 20 |
+
Audio chunks → RTCPeerConnection data channel ◄──── ┘ │ │
|
| 21 |
+
└───┘
|
| 22 |
+
(all three run concurrently)
|
| 23 |
+
|
| 24 |
+
Key design choices:
|
| 25 |
+
• aiortc handles WebRTC peer connection & ICE negotiation
|
| 26 |
+
• PCM frames delivered via asyncio.Queue — never blocks media thread
|
| 27 |
+
• VAD segments audio before STT — no wasted inference on silence
|
| 28 |
+
• STT → LLM → TTS pipeline starts as soon as speech ends
|
| 29 |
+
• Audio response sent back over RTCDataChannel as binary chunks
|
| 30 |
+
• STT uses the shared GPU batch worker (see stt.py)
|
| 31 |
+
• Barge-in: new speech cancels the current LLM+TTS pipeline immediately
|
| 32 |
+
"""
|
| 33 |
+
|
| 34 |
+
from __future__ import annotations
|
| 35 |
+
|
| 36 |
+
import asyncio
|
| 37 |
+
import json
|
| 38 |
+
import uuid
|
| 39 |
+
from typing import Optional
|
| 40 |
+
|
| 41 |
+
try:
|
| 42 |
+
from aiortc import RTCPeerConnection, RTCSessionDescription, MediaStreamTrack
|
| 43 |
+
from aiortc.contrib.media import MediaBlackhole
|
| 44 |
+
import av
|
| 45 |
+
AIORTC_AVAILABLE = True
|
| 46 |
+
except ImportError:
|
| 47 |
+
AIORTC_AVAILABLE = False
|
| 48 |
+
print("[WebRTC] aiortc not installed — WebRTC pipeline unavailable. "
|
| 49 |
+
"Install: pip install aiortc")
|
| 50 |
+
|
| 51 |
+
try:
|
| 52 |
+
import webrtcvad
|
| 53 |
+
VAD_AVAILABLE = True
|
| 54 |
+
except ImportError:
|
| 55 |
+
VAD_AVAILABLE = False
|
| 56 |
+
print("[WebRTC] webrtcvad not installed — VAD unavailable.")
|
| 57 |
+
|
| 58 |
+
from services.stt import STTProcessor
|
| 59 |
+
from services.streaming import ParallelTTSStreamer
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
| 63 |
+
# VAD SEGMENTER (PCM frames → speech utterances)
|
| 64 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
| 65 |
+
|
| 66 |
+
class _VADSegmenter:
|
| 67 |
+
"""
|
| 68 |
+
Accumulates raw 16-bit mono PCM frames.
|
| 69 |
+
Yields complete utterances when silence follows speech.
|
| 70 |
+
"""
|
| 71 |
+
|
| 72 |
+
def __init__(
|
| 73 |
+
self,
|
| 74 |
+
sample_rate: int = 16_000,
|
| 75 |
+
frame_ms: int = 20, # 20ms frames — aiortc default
|
| 76 |
+
aggressiveness: int = 2,
|
| 77 |
+
silence_limit: int = 12, # ~240ms silence → end of utterance
|
| 78 |
+
) -> None:
|
| 79 |
+
self.sample_rate = sample_rate
|
| 80 |
+
self.frame_bytes = int(sample_rate * frame_ms / 1000) * 2 # 16-bit samples
|
| 81 |
+
self.silence_limit = silence_limit
|
| 82 |
+
self._vad = webrtcvad.Vad(aggressiveness) if VAD_AVAILABLE else None
|
| 83 |
+
self._buffer = bytearray()
|
| 84 |
+
self._silence_count = 0
|
| 85 |
+
self._active = False
|
| 86 |
+
|
| 87 |
+
def process_frame(self, pcm_frame: bytes) -> Optional[bytes]:
|
| 88 |
+
"""
|
| 89 |
+
Feed one 20ms PCM frame.
|
| 90 |
+
Returns a complete utterance bytes object when speech ends, else None.
|
| 91 |
+
"""
|
| 92 |
+
if self._vad is None:
|
| 93 |
+
# No VAD available — buffer everything, flush after 3s
|
| 94 |
+
self._buffer.extend(pcm_frame)
|
| 95 |
+
if len(self._buffer) >= self.sample_rate * 3 * 2:
|
| 96 |
+
data = bytes(self._buffer)
|
| 97 |
+
self._buffer.clear()
|
| 98 |
+
return data
|
| 99 |
+
return None
|
| 100 |
+
|
| 101 |
+
# Pad or trim to exact frame size
|
| 102 |
+
frame = pcm_frame[:self.frame_bytes].ljust(self.frame_bytes, b'\x00')
|
| 103 |
+
|
| 104 |
+
try:
|
| 105 |
+
is_speech = self._vad.is_speech(frame, self.sample_rate)
|
| 106 |
+
except Exception:
|
| 107 |
+
is_speech = False
|
| 108 |
+
|
| 109 |
+
if is_speech:
|
| 110 |
+
self._buffer.extend(frame)
|
| 111 |
+
self._active = True
|
| 112 |
+
self._silence_count = 0
|
| 113 |
+
elif self._active:
|
| 114 |
+
self._buffer.extend(frame)
|
| 115 |
+
self._silence_count += 1
|
| 116 |
+
|
| 117 |
+
if self._active and self._silence_count >= self.silence_limit:
|
| 118 |
+
data = bytes(self._buffer)
|
| 119 |
+
self._buffer.clear()
|
| 120 |
+
self._silence_count = 0
|
| 121 |
+
self._active = False
|
| 122 |
+
return data
|
| 123 |
+
|
| 124 |
+
return None
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
# ═════════════════��════════════════════════════════════════════════════════════
|
| 128 |
+
# AUDIO TRACK RECEIVER
|
| 129 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
| 130 |
+
|
| 131 |
+
if AIORTC_AVAILABLE:
|
| 132 |
+
class AudioFrameReceiver(MediaStreamTrack):
|
| 133 |
+
"""
|
| 134 |
+
Wraps an incoming WebRTC audio track.
|
| 135 |
+
Resamples to 16kHz mono PCM and pushes frames into an asyncio.Queue.
|
| 136 |
+
"""
|
| 137 |
+
|
| 138 |
+
kind = "audio"
|
| 139 |
+
|
| 140 |
+
def __init__(self, track: MediaStreamTrack, frame_queue: asyncio.Queue) -> None:
|
| 141 |
+
super().__init__()
|
| 142 |
+
self._track = track
|
| 143 |
+
self._frame_queue = frame_queue
|
| 144 |
+
self._resampler: Optional[av.AudioResampler] = None
|
| 145 |
+
|
| 146 |
+
async def recv(self):
|
| 147 |
+
frame = await self._track.recv()
|
| 148 |
+
if self._resampler is None:
|
| 149 |
+
self._resampler = av.AudioResampler(
|
| 150 |
+
format="s16",
|
| 151 |
+
layout="mono",
|
| 152 |
+
rate=16_000,
|
| 153 |
+
)
|
| 154 |
+
resampled = self._resampler.resample(frame)
|
| 155 |
+
for rf in resampled:
|
| 156 |
+
pcm = bytes(rf.planes[0])
|
| 157 |
+
try:
|
| 158 |
+
self._frame_queue.put_nowait(pcm)
|
| 159 |
+
except asyncio.QueueFull:
|
| 160 |
+
pass # drop frame under backpressure — prefer real-time
|
| 161 |
+
return frame
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
| 165 |
+
# TURN PIPELINE (STT → LLM → TTS, all parallel)
|
| 166 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
| 167 |
+
|
| 168 |
+
class _TurnPipeline:
|
| 169 |
+
"""
|
| 170 |
+
Runs one conversation turn: speech bytes → transcript → LLM stream → audio.
|
| 171 |
+
Designed to be created fresh per turn (or cancelled on barge-in).
|
| 172 |
+
"""
|
| 173 |
+
|
| 174 |
+
def __init__(self, ai_backend, data_channel, on_stt=None, on_token=None):
|
| 175 |
+
self._ai = ai_backend
|
| 176 |
+
self._channel = data_channel # RTCDataChannel for audio delivery
|
| 177 |
+
self._on_stt = on_stt # optional callback(str)
|
| 178 |
+
self._on_token = on_token # optional callback(str)
|
| 179 |
+
self._stt = STTProcessor()
|
| 180 |
+
self._streamer = ParallelTTSStreamer()
|
| 181 |
+
self._cancelled = False
|
| 182 |
+
self._tasks: list[asyncio.Task] = []
|
| 183 |
+
|
| 184 |
+
async def run(self, user_id: str, audio_bytes: bytes) -> None:
|
| 185 |
+
"""Full pipeline: audio → STT → LLM+TTS (parallel)."""
|
| 186 |
+
|
| 187 |
+
# ── Phase 1: STT (GPU-batched) ────────────────────────────────────────
|
| 188 |
+
transcript = await self._stt.transcribe(audio_bytes)
|
| 189 |
+
if not transcript or self._cancelled:
|
| 190 |
+
self._send_ctrl({"type": "end"})
|
| 191 |
+
return
|
| 192 |
+
|
| 193 |
+
if self._on_stt:
|
| 194 |
+
self._on_stt(transcript)
|
| 195 |
+
self._send_ctrl({"type": "stt", "text": transcript})
|
| 196 |
+
|
| 197 |
+
# ── Phase 2: LLM + TTS in parallel ───────────────────────────────────
|
| 198 |
+
await asyncio.gather(
|
| 199 |
+
self._run_llm(user_id, transcript),
|
| 200 |
+
self._run_tts_delivery(),
|
| 201 |
+
return_exceptions=True,
|
| 202 |
+
)
|
| 203 |
+
|
| 204 |
+
if not self._cancelled:
|
| 205 |
+
self._send_ctrl({"type": "end"})
|
| 206 |
+
|
| 207 |
+
async def _run_llm(self, user_id: str, transcript: str) -> None:
|
| 208 |
+
"""Stream LLM tokens → TTS streamer (concurrent with audio delivery)."""
|
| 209 |
+
try:
|
| 210 |
+
stream = await self._ai.main(user_id, transcript)
|
| 211 |
+
async for token in stream:
|
| 212 |
+
if self._cancelled or not token:
|
| 213 |
+
break
|
| 214 |
+
if self._on_token:
|
| 215 |
+
self._on_token(token)
|
| 216 |
+
self._send_ctrl({"type": "llm_token", "token": token})
|
| 217 |
+
await self._streamer.add_token(token)
|
| 218 |
+
except asyncio.CancelledError:
|
| 219 |
+
raise
|
| 220 |
+
except Exception as exc:
|
| 221 |
+
print(f"[Pipeline] LLM error: {exc}")
|
| 222 |
+
finally:
|
| 223 |
+
await self._streamer.flush()
|
| 224 |
+
|
| 225 |
+
async def _run_tts_delivery(self) -> None:
|
| 226 |
+
"""Forward audio chunks from TTS streamer to WebRTC data channel."""
|
| 227 |
+
async for chunk in self._streamer.stream_audio():
|
| 228 |
+
if self._cancelled:
|
| 229 |
+
break
|
| 230 |
+
self._send_audio(chunk)
|
| 231 |
+
|
| 232 |
+
def _send_ctrl(self, payload: dict) -> None:
|
| 233 |
+
if self._channel and self._channel.readyState == "open":
|
| 234 |
+
try:
|
| 235 |
+
self._channel.send(json.dumps(payload))
|
| 236 |
+
except Exception:
|
| 237 |
+
pass
|
| 238 |
+
|
| 239 |
+
def _send_audio(self, data: bytes) -> None:
|
| 240 |
+
if self._channel and self._channel.readyState == "open":
|
| 241 |
+
try:
|
| 242 |
+
self._channel.send(data)
|
| 243 |
+
except Exception:
|
| 244 |
+
pass
|
| 245 |
+
|
| 246 |
+
async def cancel(self) -> None:
|
| 247 |
+
self._cancelled = True
|
| 248 |
+
await self._streamer.cancel()
|
| 249 |
+
for t in self._tasks:
|
| 250 |
+
t.cancel()
|
| 251 |
+
if self._tasks:
|
| 252 |
+
await asyncio.gather(*self._tasks, return_exceptions=True)
|
| 253 |
+
|
| 254 |
+
|
| 255 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
| 256 |
+
# WEBRTC SESSION HANDLER
|
| 257 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
| 258 |
+
|
| 259 |
+
class WebRTCSession:
|
| 260 |
+
"""
|
| 261 |
+
Manages one WebRTC peer connection:
|
| 262 |
+
• Handles ICE/SDP negotiation
|
| 263 |
+
• Receives audio track → VAD → STT queue
|
| 264 |
+
• Sends responses back via RTCDataChannel
|
| 265 |
+
• Supports barge-in (cancel active turn on new speech)
|
| 266 |
+
"""
|
| 267 |
+
|
| 268 |
+
def __init__(self, ai_backend) -> None:
|
| 269 |
+
if not AIORTC_AVAILABLE:
|
| 270 |
+
raise RuntimeError("aiortc is required for WebRTC mode")
|
| 271 |
+
self._ai = ai_backend
|
| 272 |
+
self.user_id = f"rtc_{uuid.uuid4().hex[:12]}"
|
| 273 |
+
self._pc = RTCPeerConnection()
|
| 274 |
+
self._channel = None
|
| 275 |
+
self._frame_q: asyncio.Queue = asyncio.Queue(maxsize=500)
|
| 276 |
+
self._vad = _VADSegmenter()
|
| 277 |
+
self._active_turn: Optional[_TurnPipeline] = None
|
| 278 |
+
self._active_task: Optional[asyncio.Task] = None
|
| 279 |
+
self._setup_pc()
|
| 280 |
+
|
| 281 |
+
def _setup_pc(self) -> None:
|
| 282 |
+
pc = self._pc
|
| 283 |
+
|
| 284 |
+
@pc.on("track")
|
| 285 |
+
def on_track(track):
|
| 286 |
+
if track.kind == "audio":
|
| 287 |
+
receiver = AudioFrameReceiver(track, self._frame_q)
|
| 288 |
+
asyncio.ensure_future(self._frame_processor())
|
| 289 |
+
|
| 290 |
+
@pc.on("datachannel")
|
| 291 |
+
def on_datachannel(channel):
|
| 292 |
+
self._channel = channel
|
| 293 |
+
print(f"[WebRTC] DataChannel open: {channel.label}")
|
| 294 |
+
|
| 295 |
+
@channel.on("message")
|
| 296 |
+
def on_message(msg):
|
| 297 |
+
# Control messages from browser (cancel, init, ping)
|
| 298 |
+
try:
|
| 299 |
+
data = json.loads(msg)
|
| 300 |
+
if data.get("type") == "cancel":
|
| 301 |
+
asyncio.ensure_future(self._cancel_active())
|
| 302 |
+
elif data.get("type") == "init" and data.get("user_id"):
|
| 303 |
+
self.user_id = str(data["user_id"])[:64]
|
| 304 |
+
except Exception:
|
| 305 |
+
pass
|
| 306 |
+
|
| 307 |
+
@pc.on("connectionstatechange")
|
| 308 |
+
async def on_state():
|
| 309 |
+
print(f"[WebRTC] Connection state: {pc.connectionState}")
|
| 310 |
+
if pc.connectionState in ("failed", "closed"):
|
| 311 |
+
await self._cancel_active()
|
| 312 |
+
|
| 313 |
+
async def _frame_processor(self) -> None:
|
| 314 |
+
"""Consume PCM frames from queue → VAD → dispatch turns."""
|
| 315 |
+
while True:
|
| 316 |
+
try:
|
| 317 |
+
frame = await asyncio.wait_for(self._frame_q.get(), timeout=5.0)
|
| 318 |
+
except asyncio.TimeoutError:
|
| 319 |
+
continue
|
| 320 |
+
except Exception:
|
| 321 |
+
break
|
| 322 |
+
|
| 323 |
+
utterance = self._vad.process_frame(frame)
|
| 324 |
+
if utterance:
|
| 325 |
+
await self._dispatch_turn(utterance)
|
| 326 |
+
|
| 327 |
+
async def _dispatch_turn(self, audio_bytes: bytes) -> None:
|
| 328 |
+
"""Barge-in aware: cancel current turn, start new one."""
|
| 329 |
+
await self._cancel_active()
|
| 330 |
+
|
| 331 |
+
pipeline = _TurnPipeline(
|
| 332 |
+
ai_backend=self._ai,
|
| 333 |
+
data_channel=self._channel,
|
| 334 |
+
)
|
| 335 |
+
self._active_turn = pipeline
|
| 336 |
+
self._active_task = asyncio.create_task(
|
| 337 |
+
pipeline.run(self.user_id, audio_bytes)
|
| 338 |
+
)
|
| 339 |
+
|
| 340 |
+
async def _cancel_active(self) -> None:
|
| 341 |
+
if self._active_turn:
|
| 342 |
+
await self._active_turn.cancel()
|
| 343 |
+
self._active_turn = None
|
| 344 |
+
if self._active_task and not self._active_task.done():
|
| 345 |
+
self._active_task.cancel()
|
| 346 |
+
try:
|
| 347 |
+
await self._active_task
|
| 348 |
+
except (asyncio.CancelledError, Exception):
|
| 349 |
+
pass
|
| 350 |
+
self._active_task = None
|
| 351 |
+
|
| 352 |
+
async def handle_offer(self, sdp: str, sdp_type: str) -> dict:
|
| 353 |
+
"""Process SDP offer from browser. Returns SDP answer."""
|
| 354 |
+
offer = RTCSessionDescription(sdp=sdp, type=sdp_type)
|
| 355 |
+
await self._pc.setRemoteDescription(offer)
|
| 356 |
+
answer = await self._pc.createAnswer()
|
| 357 |
+
await self._pc.setLocalDescription(answer)
|
| 358 |
+
return {
|
| 359 |
+
"sdp": self._pc.localDescription.sdp,
|
| 360 |
+
"type": self._pc.localDescription.type,
|
| 361 |
+
}
|
| 362 |
+
|
| 363 |
+
async def add_ice_candidate(self, candidate: dict) -> None:
|
| 364 |
+
"""Forward browser ICE candidate to aiortc."""
|
| 365 |
+
from aiortc import RTCIceCandidate
|
| 366 |
+
c = RTCIceCandidate(
|
| 367 |
+
component=candidate.get("component", 1),
|
| 368 |
+
foundation=candidate.get("foundation", ""),
|
| 369 |
+
ip=candidate.get("ip", ""),
|
| 370 |
+
port=candidate.get("port", 0),
|
| 371 |
+
priority=candidate.get("priority", 0),
|
| 372 |
+
protocol=candidate.get("protocol", "udp"),
|
| 373 |
+
type=candidate.get("type", "host"),
|
| 374 |
+
sdpMid=candidate.get("sdpMid"),
|
| 375 |
+
sdpMLineIndex=candidate.get("sdpMLineIndex"),
|
| 376 |
+
)
|
| 377 |
+
await self._pc.addIceCandidate(c)
|
| 378 |
+
|
| 379 |
+
async def close(self) -> None:
|
| 380 |
+
await self._cancel_active()
|
| 381 |
+
await self._pc.close()
|