File size: 10,215 Bytes
75ee53d 5dabf9d 6a7bafa fc967af 5dabf9d e33d11d 75ee53d f2ea5fc 75ee53d f84481c ed5b8b8 75ee53d ed5b8b8 58fed26 f84481c 58fed26 42c497a d514191 91c3bff 42c497a 91c3bff d514191 6a7bafa f84481c 75ee53d f84481c 75ee53d 6a7bafa 75ee53d 6a7bafa fc967af 42c497a fc967af 75ee53d f84481c 5dabf9d 2d19124 75ee53d 5dabf9d 75ee53d 91c3bff 5dabf9d fc967af 5dabf9d 6a7bafa 2d19124 ed5b8b8 f84481c ed5b8b8 75ee53d f84481c 6a7bafa 75ee53d fc967af 75ee53d 2d19124 75ee53d 58fed26 2d19124 5dabf9d 75ee53d 5dabf9d fc967af 5dabf9d 75ee53d 58fed26 6a7bafa 5dabf9d 75ee53d 58fed26 75ee53d 58fed26 75ee53d 58fed26 75ee53d f84481c 6a7bafa 75ee53d 5dabf9d 91c3bff 5dabf9d e33d11d 6a7bafa 5dabf9d 2d19124 75ee53d f84481c 75ee53d f84481c 6a7bafa f84481c 6a7bafa f84481c 6a7bafa f84481c 6a7bafa f84481c 6a7bafa f84481c e33d11d | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 | """
services/tts.py — Ultra Low-Latency Dual TTS Backend
FIX-ISSUE4 (Normal-speed TTS):
• Default rate is now slightly faster than normal for a more natural
conversational pace.
• split_sentences() now splits after clause delimiters (commas, semicolons,
colons, em/en dashes) as well as sentence endings, whenever whitespace
follows the delimiter, so synthesis tasks are smaller and start sooner.
This pairs with streaming.py's 2–3 word flush threshold for maximum
low-latency playback.
• Parts are synthesised sequentially to guarantee word order in playback.
"""
from dotenv import load_dotenv
import os, re, asyncio
load_dotenv()
# Backend selection: when True, ElevenLabs is the primary backend and Edge-TTS
# is only tried as a fallback; when False the roles are swapped.
USE_ELEVENLABS = True
# Voice name passed to Edge-TTS whenever the Edge backend synthesises a chunk.
EDGE_VOICE = "bn-BD-NabanitaNeural"
# ElevenLabs credentials and voice/model selection, read from the environment
# (load_dotenv() has already been called at this point). An empty API key
# means ElevenLabs cannot be used.
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY", "")
ELEVENLABS_VOICE_ID = os.getenv("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")
ELEVENLABS_MODEL_ID = os.getenv("ELEVENLABS_MODEL_ID", "eleven_multilingual_v2")
def _clamp(v: float, lo: float, hi: float) -> float:
return max(lo, min(hi, v))
def _parse_pct(text: str) -> float:
"""
Parse strings like '+10%', '-5%', '12%' into a multiplier delta.
Returns 0.0 when empty/invalid.
"""
raw = (text or "").strip()
if not raw:
return 0.0
if raw.endswith("%"):
raw = raw[:-1].strip()
try:
return float(raw) / 100.0
except Exception:
return 0.0
# ElevenLabs speed configuration:
# - `ELEVENLABS_SPEED` is the base speed (1.0 ≈ normal, >1.0 = faster);
#   the built-in default is 2.8.
# - `ELEVENLABS_SPEED_PCT` is an optional relative adjustment like "+10%" / "-5%".
#   Effective speed = base * (1 + pct).
# - `ELEVENLABS_SPEED_MAX` sets the upper clamp (default 3.5). If your ElevenLabs
#   model/voice rejects high values, lower this (e.g. 2.5).
# The effective speed is clamped to [0.5, ELEVENLABS_SPEED_MAX].
# NOTE(review): float() raises ValueError at import time if ELEVENLABS_SPEED or
# ELEVENLABS_SPEED_MAX is set to a non-numeric value.
# NOTE(review): confirm which range the ElevenLabs API accepts for
# `voice_settings.speed`; the defaults here may exceed it.
_ELEVEN_BASE_SPEED = float(os.getenv("ELEVENLABS_SPEED", "2.8"))
_ELEVEN_SPEED_PCT = _parse_pct(os.getenv("ELEVENLABS_SPEED_PCT", "0%"))
_ELEVEN_SPEED_MAX = float(os.getenv("ELEVENLABS_SPEED_MAX", "3.5"))
ELEVENLABS_SPEED = _clamp(_ELEVEN_BASE_SPEED * (1.0 + _ELEVEN_SPEED_PCT), 0.5, _ELEVEN_SPEED_MAX)
# Sent as the `output_format` query parameter of each ElevenLabs request.
ELEVENLABS_OUTPUT_FORMAT = "mp3_22050_32"
# Default `voice_settings` values sent with each ElevenLabs request.
ELEVENLABS_STABILITY = 0.45
ELEVENLABS_SIMILARITY = 0.80
ELEVENLABS_STYLE = 0.35
ELEVENLABS_SPEAKER_BOOST = True
# edge_tts is an optional dependency. Record whether it could be imported so
# the Edge backend can be skipped when the package is missing or broken; the
# name `edge_tts` is bound to None in that case.
try:
    import edge_tts # type: ignore
    EDGE_TTS_AVAILABLE = True
except Exception:
    edge_tts = None
    EDGE_TTS_AVAILABLE = False
    print("[TTS] edge_tts not available; will fall back to ElevenLabs if possible")
# Fail fast at import time when the configured backend cannot possibly work:
# ElevenLabs was selected without an API key, or no backend is usable at all.
if USE_ELEVENLABS and not ELEVENLABS_API_KEY:
    raise RuntimeError("[TTS] ELEVENLABS_API_KEY missing")
if not EDGE_TTS_AVAILABLE and not ELEVENLABS_API_KEY:
    raise RuntimeError("[TTS] Neither edge_tts nor ELEVENLABS_API_KEY is available")
# Startup banner. The Edge rate shown is the default `rate` ("+22%") used by
# _edge_tts_stream() and text_to_speech_stream(); keep the two in sync.
print(
    f"[TTS] Backend: {'ElevenLabs' if USE_ELEVENLABS else 'Edge-TTS'} | "
    f"edge rate: +22% | eleven speed: {ELEVENLABS_SPEED:.2f} (base {_ELEVEN_BASE_SPEED:.2f}, pct {_ELEVEN_SPEED_PCT:+.0%})"
)
def split_sentences(text: str) -> list[str]:
    """
    Split text into small synthesis chunks for low-latency streaming.

    Bracketed emotion/tone tags (e.g. "[calm]") are removed first. The text
    is then cut after every sentence or clause delimiter (। . ! ? , ; : — –)
    that is followed by whitespace, so each TTS task is a phrase rather than
    a full sentence. The delimiter stays attached to the chunk before it.

    Args:
        text: Text to split. ``None`` or an empty string yields no chunks.

    Returns:
        The stripped chunks in their original order. Chunks containing no
        letter or digit (e.g. "..." or a lone dash) are dropped because there
        is nothing to synthesise; one-character words such as "I" or "5" are
        kept.
    """
    if not text:
        return []
    # Strip any emotion/tone tags like "[calm]" "[neutral]" etc. These are
    # intended for UI display and can degrade/break some TTS backends.
    # 1) a complete "[tag]", 2) a dangling "[tag", 3) a dangling "tag]".
    text = re.sub(r"(?:(?<=^)|(?<=\s))\[[^\[\]\n]{1,24}\](?=\s|$)", "", text)
    text = re.sub(r"(?:(?<=^)|(?<=\s))\[[A-Za-z]{2,16}(?=\s|$)", "", text)
    text = re.sub(r"(?:(?<=^)|(?<=\s))[A-Za-z]{2,16}\](?=\s|$)", "", text)
    # Collapse the runs of spaces left behind by removed tags. Only newlines
    # and tabs are trimmed from the ends here; each chunk is stripped below.
    text = re.sub(r"[ \t]{2,}", " ", text).strip("\n\r\t")
    if not text:
        return []
    # Split on sentence-ending punctuation AND clause delimiters.
    # The lookbehind keeps the delimiter attached to the preceding chunk.
    parts = re.split(r'(?<=[।.!?,;:—–])\s+', text)
    chunks = (part.strip() for part in parts)
    # Keep a chunk only if it has something speakable. A pure length check
    # would discard one-letter words and let "..." through.
    return [chunk for chunk in chunks if any(ch.isalnum() for ch in chunk)]
async def _edge_tts_stream(text: str, voice: str = EDGE_VOICE, rate: str = "+22%"):
    """
    Yield the audio byte chunks Edge-TTS produces for one piece of text.

    Leading/trailing newlines, carriage returns and tabs are removed from
    `text`; if nothing is left the generator ends without yielding. Events
    from the Edge stream whose type is not "audio" are ignored.

    Raises:
        RuntimeError: if the edge_tts package could not be imported.
        Exception: any error raised by Edge-TTS is logged and re-raised.
    """
    if edge_tts is None:
        raise RuntimeError("edge_tts is not installed")
    cleaned = text.strip("\n\r\t")
    if not cleaned:
        return
    try:
        async for event in edge_tts.Communicate(cleaned, voice, rate=rate).stream():
            if event["type"] != "audio":
                continue
            yield event["data"]
            # Hand control back to the event loop between audio chunks.
            await asyncio.sleep(0)
    except Exception as exc:
        print(f"[TTS][Edge] {exc}")
        raise
async def _elevenlabs_stream(
    text: str,
    voice_id: str = ELEVENLABS_VOICE_ID,
    model_id: str = ELEVENLABS_MODEL_ID,
    output_format: str = ELEVENLABS_OUTPUT_FORMAT,
    speed: float = ELEVENLABS_SPEED,
    stability: float = ELEVENLABS_STABILITY,
    similarity: float = ELEVENLABS_SIMILARITY,
    style: float = ELEVENLABS_STYLE,
    speaker_boost: bool = ELEVENLABS_SPEAKER_BOOST,
):
    """
    Stream synthesised audio for one text chunk from the ElevenLabs HTTP API.

    Trailing sentence/clause punctuation is removed before the request. If
    nothing speakable remains (empty, whitespace-only or punctuation-only
    input) the generator ends without contacting the API.

    Args:
        text: The chunk to synthesise.
        voice_id: ElevenLabs voice id, used in the request URL.
        model_id: ElevenLabs model id, sent in the JSON payload.
        output_format: Sent as the `output_format` query parameter.
        speed, stability, similarity, style, speaker_boost: Sent as the
            request's `voice_settings`.

    Yields:
        Non-empty byte chunks of the response body, as they arrive.

    Raises:
        RuntimeError: on a non-200 response, or when the response body
            contained no bytes.
        Exception: any other error (e.g. from httpx) is logged and re-raised.
    """
    import httpx

    text = text.strip("\n\r\t")
    if not text:
        return
    # Reduce unnatural pauses for short streamed chunks.
    # ElevenLabs adds strong pauses on sentence-ending punctuation; for
    # low-latency streaming we prefer faster turn-taking.
    text = re.sub(r"[।.!?,;:—–]+$", "", text).strip("\n\r\t")
    # A chunk that consisted only of punctuation/whitespace is now empty.
    # Skip it instead of posting empty text and triggering a failed request.
    if not text.strip():
        return
    url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream"
    headers = {
        "xi-api-key": ELEVENLABS_API_KEY,
        "Content-Type": "application/json",
        "Accept": "audio/mpeg",
    }
    payload = {
        "text": text,
        "model_id": model_id,
        "voice_settings": {
            "stability": stability,
            "similarity_boost": similarity,
            "style": style,
            "use_speaker_boost": speaker_boost,
            "speed": speed,
        },
    }
    try:
        got_any = False
        # NOTE(review): read=None disables the read timeout, so a stalled
        # response stream will wait indefinitely — confirm this is intended.
        async with httpx.AsyncClient(timeout=httpx.Timeout(10.0, connect=5.0, read=None)) as client:
            async with client.stream(
                "POST", url, headers=headers, json=payload,
                params={"output_format": output_format}
            ) as resp:
                if resp.status_code != 200:
                    raise RuntimeError(f"[TTS][ElevenLabs] HTTP {resp.status_code}")
                async for chunk in resp.aiter_bytes(chunk_size=512):
                    if chunk:
                        got_any = True
                        yield chunk
                        # Hand control back to the event loop between chunks.
                        await asyncio.sleep(0)
        if not got_any:
            raise RuntimeError("[TTS][ElevenLabs] No audio received")
    except Exception as exc:
        print(f"[TTS][ElevenLabs] {exc}")
        raise
async def text_to_speech_stream(
    text: str,
    voice: str | None = None,
    rate: str = "+22%",
):
    """
    Stream TTS audio for `text`, one complete audio blob per phrase.

    The text is split into small clause-level parts with split_sentences().
    Each part is synthesised to completion, strictly in order, and yielded as
    a single `bytes` object. If the primary backend raises for a part, the
    other backend (or a retry with the default ElevenLabs voice when Edge-TTS
    is unavailable) is tried before the part is given up on.

    IMPORTANT:
    The browser playback path uses decodeAudioData(), which expects a
    self-contained audio buffer. Forwarding provider stream fragments
    directly causes decode buffering/stalls on the client. We therefore
    accumulate each phrase's bytes and only emit it once the part is fully
    synthesised. The phrases are kept intentionally small by
    services/streaming.py, so latency remains low.

    Args:
        text: Text to speak.
        voice: Voice for the primary backend (an ElevenLabs voice id when
            USE_ELEVENLABS is true, otherwise an Edge-TTS voice name).
            Defaults to that backend's configured voice. Fallback attempts
            always use the configured default voices.
        rate: Edge-TTS rate string; it is not passed to ElevenLabs.

    Yields:
        The full audio bytes of one part. Backend errors are logged, not
        raised; a part for which no audio was produced is logged and skipped.
    """
    # Preserve normal spaces inside/around streamed phrase chunks; don't
    # aggressively trim because it can glue words across chunk boundaries
    # (e.g. "দিয়ে" + "আপনার" → "দিয়েআপনার").
    text = text.strip("\n\r\t")
    if not text:
        return
    voice_to_use = voice or (ELEVENLABS_VOICE_ID if USE_ELEVENLABS else EDGE_VOICE)
    parts = split_sentences(text)
    if not parts:
        return

    async def _collect(stream) -> bytes:
        """Drain a backend stream into one bytes object."""
        buf = bytearray()
        async for chunk in stream:
            buf.extend(chunk)
        return bytes(buf)

    def _primary_stream(part: str):
        """Open a stream on the configured primary backend."""
        if USE_ELEVENLABS:
            return _elevenlabs_stream(part, voice_id=voice_to_use)
        return _edge_tts_stream(part, voice=voice_to_use, rate=rate)

    def _fallback_stream(part: str):
        """Open a stream on the other backend (else retry), or return None."""
        if USE_ELEVENLABS:
            if EDGE_TTS_AVAILABLE:
                return _edge_tts_stream(part, voice=EDGE_VOICE, rate=rate)
            if ELEVENLABS_API_KEY:
                return _elevenlabs_stream(part, voice_id=ELEVENLABS_VOICE_ID)
        else:
            if ELEVENLABS_API_KEY:
                return _elevenlabs_stream(part, voice_id=ELEVENLABS_VOICE_ID)
            if EDGE_TTS_AVAILABLE:
                return _edge_tts_stream(part, voice=EDGE_VOICE, rate=rate)
        return None

    async def _synth_part(part: str) -> bytes:
        """Return the audio for one part, or b"" when every attempt failed."""
        try:
            return await _collect(_primary_stream(part))
        except Exception as exc:
            print(f"[TTS] synth error: {exc}")
        # Primary backend failed. Try the other backend before giving up.
        # Partial audio from a failed attempt is discarded, never emitted.
        try:
            stream = _fallback_stream(part)
            if stream is not None:
                return await _collect(stream)
        except Exception as fallback_exc:
            print(f"[TTS] fallback synth error: {fallback_exc}")
        return b""

    # Sequential synthesis guarantees exact playback order.
    for part in parts:
        audio = await _synth_part(part)
        if audio:
            yield audio
        else:
            print(f"[TTS] no audio produced for chunk: {part[:60]!r}")
|