File size: 10,215 Bytes

75ee53d
 
5dabf9d
6a7bafa
fc967af
 
5dabf9d
 
 
 
e33d11d
75ee53d
f2ea5fc
75ee53d
f84481c
ed5b8b8
75ee53d
ed5b8b8
58fed26
f84481c
 
 
 
58fed26
 
 
42c497a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d514191
 
 
 
 
91c3bff
42c497a
91c3bff
d514191
6a7bafa
f84481c
75ee53d
f84481c
75ee53d
 
6a7bafa
 
 
 
 
 
 
 
75ee53d
 
 
6a7bafa
 
 
fc967af
 
42c497a
fc967af
75ee53d
 
f84481c
5dabf9d
 
 
 
 
 
 
2d19124
 
 
 
 
 
75ee53d
 
5dabf9d
 
 
 
75ee53d
 
 
91c3bff
5dabf9d
 
fc967af
5dabf9d
6a7bafa
 
2d19124
ed5b8b8
 
 
f84481c
ed5b8b8
 
75ee53d
 
f84481c
6a7bafa
75ee53d
 
 
 
 
 
 
fc967af
75ee53d
 
 
 
 
 
2d19124
75ee53d
 
58fed26
 
 
2d19124
5dabf9d
 
 
 
 
 
75ee53d
5dabf9d
 
 
 
 
 
 
fc967af
5dabf9d
75ee53d
 
58fed26
6a7bafa
5dabf9d
 
 
 
75ee53d
58fed26
75ee53d
 
58fed26
75ee53d
 
58fed26
 
75ee53d
f84481c
6a7bafa
75ee53d
 
5dabf9d
 
 
91c3bff
5dabf9d
 
 
 
e33d11d
 
6a7bafa
 
 
 
 
 
 
 
5dabf9d
2d19124
 
 
 
75ee53d
 
 
f84481c
 
 
 
75ee53d
f84481c
 
 
6a7bafa
 
f84481c
 
 
6a7bafa
f84481c
 
6a7bafa
 
 
 
f84481c
 
6a7bafa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f84481c
6a7bafa
 
f84481c
 
e33d11d

"""
services/tts.py — Ultra Low-Latency Dual TTS Backend

FIX-ISSUE4 (Normal-speed TTS):
  • Default rate is now slightly faster than normal for a more natural
    conversational pace.
  • split_sentences() now splits on ALL clause delimiters (commas, colons,
    em-dashes) in addition to sentence endings, so synthesis tasks are
    smaller and start sooner. This pairs with streaming.py's 2–3 word
    flush threshold for maximum low-latency playback.
  • Parts are synthesised sequentially to guarantee word order in playback.
"""

from dotenv import load_dotenv
import os, re, asyncio

# Populate os.environ from a local .env file (python-dotenv) so the
# os.getenv() lookups below can see values defined there.
load_dotenv()

# Primary backend switch: True selects ElevenLabs, False selects Edge-TTS.
USE_ELEVENLABS       = True
# Default Edge-TTS voice; also the voice used when falling back to Edge-TTS.
EDGE_VOICE           = "bn-BD-NabanitaNeural"
# API key sent as the `xi-api-key` request header; empty string when unset.
ELEVENLABS_API_KEY   = os.getenv("ELEVENLABS_API_KEY", "")
# Default ElevenLabs voice and model, each overridable through the environment.
ELEVENLABS_VOICE_ID  = os.getenv("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")
ELEVENLABS_MODEL_ID  = os.getenv("ELEVENLABS_MODEL_ID", "eleven_multilingual_v2")
def _clamp(v: float, lo: float, hi: float) -> float:
    return max(lo, min(hi, v))

def _parse_pct(text: str) -> float:
    """
    Parse strings like '+10%', '-5%', '12%' into a multiplier delta.
    Returns 0.0 when empty/invalid.
    """
    raw = (text or "").strip()
    if not raw:
        return 0.0
    if raw.endswith("%"):
        raw = raw[:-1].strip()
    try:
        return float(raw) / 100.0
    except Exception:
        return 0.0

# ElevenLabs speed configuration:
# - `ELEVENLABS_SPEED` is the base speed (1.0 ≈ normal, >1.0 = faster);
#   it defaults to 2.8 when the variable is unset.
# - `ELEVENLABS_SPEED_PCT` is an optional relative adjustment like "+10%" / "-5%".
#   Effective speed = base * (1 + pct).
# - `ELEVENLABS_SPEED_MAX` sets the upper clamp (default 3.5). If your ElevenLabs
#   model/voice rejects high values, lower this (e.g. 2.5).
# The effective speed is clamped to [0.5, ELEVENLABS_SPEED_MAX].
# NOTE(review): a non-numeric ELEVENLABS_SPEED or ELEVENLABS_SPEED_MAX raises
#   ValueError at import time (plain float() conversion) — confirm this is intended.
# NOTE(review): the 2.8 default is far above "normal" (1.0); verify the target
#   ElevenLabs model accepts `voice_settings.speed` values this high.
_ELEVEN_BASE_SPEED = float(os.getenv("ELEVENLABS_SPEED", "2.8"))
_ELEVEN_SPEED_PCT  = _parse_pct(os.getenv("ELEVENLABS_SPEED_PCT", "0%"))
_ELEVEN_SPEED_MAX  = float(os.getenv("ELEVENLABS_SPEED_MAX", "3.5"))
ELEVENLABS_SPEED   = _clamp(_ELEVEN_BASE_SPEED * (1.0 + _ELEVEN_SPEED_PCT), 0.5, _ELEVEN_SPEED_MAX)
# Value sent as the `output_format` query parameter of the streaming request.
ELEVENLABS_OUTPUT_FORMAT = "mp3_22050_32"
# Defaults for the `voice_settings` object of the ElevenLabs request payload
# (stability, similarity_boost, style, use_speaker_boost respectively).
ELEVENLABS_STABILITY  = 0.45
ELEVENLABS_SIMILARITY = 0.80
ELEVENLABS_STYLE      = 0.35
ELEVENLABS_SPEAKER_BOOST = True

# Edge-TTS is treated as an optional dependency: record whether it imported
# so the rest of the module can select or fall back between backends.
try:
    import edge_tts  # type: ignore
    EDGE_TTS_AVAILABLE = True
except Exception:
    # Any failure while importing (not only ImportError) marks Edge-TTS as
    # unavailable; `edge_tts` is set to None so later code can test for it.
    edge_tts = None
    EDGE_TTS_AVAILABLE = False
    print("[TTS] edge_tts not available; will fall back to ElevenLabs if possible")

# Fail fast at import time: ElevenLabs is the selected primary backend but
# no API key is configured. This raises even if Edge-TTS is importable.
if USE_ELEVENLABS and not ELEVENLABS_API_KEY:
    raise RuntimeError("[TTS] ELEVENLABS_API_KEY missing")

# No usable backend at all: Edge-TTS failed to import and there is no API key.
if not EDGE_TTS_AVAILABLE and not ELEVENLABS_API_KEY:
    raise RuntimeError("[TTS] Neither edge_tts nor ELEVENLABS_API_KEY is available")

# Startup banner: report the active backend and the effective speed settings.
# The Edge-TTS figure mirrors the "+22%" default `rate` of _edge_tts_stream()
# and text_to_speech_stream(); update it whenever that default changes.
print(
    f"[TTS] Backend: {'ElevenLabs' if USE_ELEVENLABS else 'Edge-TTS'} | "
    f"edge rate: +22% | eleven speed: {ELEVENLABS_SPEED:.2f} (base {_ELEVEN_BASE_SPEED:.2f}, pct {_ELEVEN_SPEED_PCT:+.0%})"
)


def split_sentences(text: str) -> list[str]:
    """
    Split text into small synthesis chunks for low-latency streaming.

    FIX-ISSUE4: Split on sentence boundaries AND clause boundaries so each
    TTS task is small (a phrase, not a full sentence). This allows synthesis
    to start sooner for later parts of a long response.

    Processing steps:
      1. Remove bracketed emotion/tone tags such as "[calm]", including
         half tags ("[calm" / "calm]") left behind when a tag was cut at a
         streaming chunk boundary.
      2. Collapse runs of spaces/tabs to a single space.
      3. Split after sentence/clause punctuation that is followed by
         whitespace; the punctuation stays attached to the preceding chunk.
      4. Drop chunks that contain nothing speakable.

    Args:
        text: Raw text to be spoken.

    Returns:
        The stripped chunks in their original order. Every chunk contains at
        least one letter or digit, so one-character words ("I", "a", "5")
        are kept while punctuation-only fragments ("...", "।") are removed.
        Returns an empty list when nothing speakable remains.
    """
    # Strip any emotion/tone tags like "[calm]" "[neutral]" etc. These are
    # intended for UI display and can degrade/break some TTS backends.
    text = re.sub(r"(?:(?<=^)|(?<=\s))\[[^\[\]\n]{1,24}\](?=\s|$)", "", text)
    # Half tags: an opening "[word" or a closing "word]" standing on its own.
    text = re.sub(r"(?:(?<=^)|(?<=\s))\[[A-Za-z]{2,16}(?=\s|$)", "", text)
    text = re.sub(r"(?:(?<=^)|(?<=\s))[A-Za-z]{2,16}\](?=\s|$)", "", text)
    text = re.sub(r"[ \t]{2,}", " ", text).strip("\n\r\t")
    if not text:
        return []
    # Split on sentence-ending punctuation AND clause delimiters.
    # The lookbehind keeps the delimiter attached to the preceding chunk.
    parts = re.split(r'(?<=[।.!?,;:—–])\s+', text)
    # Keep a chunk only if it has something to pronounce. A length test is
    # not enough: it would discard single-character words and still let
    # multi-character punctuation-only fragments through.
    chunks = (p.strip() for p in parts)
    return [c for c in chunks if any(ch.isalnum() for ch in c)]


async def _edge_tts_stream(text: str, voice: str = EDGE_VOICE, rate: str = "+22%"):
    """
    Yield the audio payloads Edge-TTS produces for one text chunk.

    Only stream items whose "type" is "audio" are forwarded; everything
    else is skipped. Nothing is yielded when `text` is empty after removing
    surrounding newlines and tabs. The default `rate` is slightly faster
    than normal speech.

    Raises:
        RuntimeError: if the `edge_tts` package could not be imported.
        Exception: any error raised while streaming is logged, then re-raised.
    """
    if edge_tts is None:
        raise RuntimeError("edge_tts is not installed")

    cleaned = text.strip("\n\r\t")
    if cleaned:
        try:
            communicator = edge_tts.Communicate(cleaned, voice, rate=rate)
            async for event in communicator.stream():
                if event["type"] != "audio":
                    continue
                yield event["data"]
                # Hand control back to the event loop between audio chunks.
                await asyncio.sleep(0)
        except Exception as err:
            print(f"[TTS][Edge] {err}")
            raise


async def _elevenlabs_stream(
    text: str,
    voice_id: str = ELEVENLABS_VOICE_ID,
    model_id: str = ELEVENLABS_MODEL_ID,
    output_format: str = ELEVENLABS_OUTPUT_FORMAT,
    speed: float = ELEVENLABS_SPEED,
    stability: float = ELEVENLABS_STABILITY,
    similarity: float = ELEVENLABS_SIMILARITY,
    style: float = ELEVENLABS_STYLE,
    speaker_boost: bool = ELEVENLABS_SPEAKER_BOOST,
):
    """
    Stream ElevenLabs audio for a single text chunk.

    POSTs the text to the ElevenLabs streaming text-to-speech endpoint for
    `voice_id` and yields the response body as byte fragments of up to 512
    bytes as they arrive. Trailing sentence/clause punctuation is removed
    from the text before it is sent.

    Yields nothing (and makes no request) when the text is empty, or when
    only whitespace is left once trailing punctuation has been removed.

    Args:
        text: The phrase to synthesise.
        voice_id: ElevenLabs voice identifier used in the request URL.
        model_id: Value of the payload's `model_id` field.
        output_format: Value of the `output_format` query parameter.
        speed, stability, similarity, style, speaker_boost: Values placed in
            the payload's `voice_settings` object.

    Raises:
        RuntimeError: if the response status is not 200, or if the response
            completed without delivering any bytes.
        Exception: any other error (e.g. from httpx) is logged and re-raised.
    """
    import httpx
    text = text.strip("\n\r\t")
    if not text:
        return
    # Reduce unnatural pauses for short streamed chunks.
    # ElevenLabs adds strong pauses on sentence-ending punctuation; for
    # low-latency streaming we prefer faster turn-taking.
    text = re.sub(r"[।.!?,;:—–]+$", "", text).strip("\n\r\t")
    # A punctuation-only chunk ("...", "।") is empty at this point. Sending
    # it would spend a request on text that cannot produce speech, so stop.
    if not text.strip():
        return
    url     = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream"
    headers = {
        "xi-api-key":   ELEVENLABS_API_KEY,
        "Content-Type": "application/json",
        "Accept":       "audio/mpeg",
    }
    payload = {
        "text":           text,
        "model_id":       model_id,
        "voice_settings": {
            "stability":        stability,
            "similarity_boost": similarity,
            "style":            style,
            "use_speaker_boost": speaker_boost,
            "speed":            speed,
        },
    }
    try:
        got_any = False
        # read=None disables the read timeout — presumably so that a slow
        # synthesis is not cut off mid-stream; connect is limited to 5 s.
        async with httpx.AsyncClient(timeout=httpx.Timeout(10.0, connect=5.0, read=None)) as client:
            async with client.stream(
                "POST", url, headers=headers, json=payload,
                params={"output_format": output_format}
            ) as resp:
                if resp.status_code != 200:
                    raise RuntimeError(f"[TTS][ElevenLabs] HTTP {resp.status_code}")
                async for chunk in resp.aiter_bytes(chunk_size=512):
                    if chunk:
                        got_any = True
                        yield chunk
                        # Hand control back to the event loop between chunks.
                        await asyncio.sleep(0)
        # A 200 response with an empty body is treated as a failure so the
        # caller can react to it (e.g. by trying another backend).
        if not got_any:
            raise RuntimeError("[TTS][ElevenLabs] No audio received")
    except Exception as exc:
        print(f"[TTS][ElevenLabs] {exc}")
        raise


async def text_to_speech_stream(
    text: str,
    voice: str | None = None,
    rate: str = "+22%",
):
    """
    Synthesise `text` and yield its audio phrase by phrase.

    The text is cut into clause-sized parts by split_sentences(). Parts are
    synthesised strictly one after another, and each part is yielded as a
    single `bytes` object holding that part's complete audio, so playback
    order always matches text order.

    Backend selection per part:
      * Primary: ElevenLabs when USE_ELEVENLABS is set, otherwise Edge-TTS,
        using `voice` (or that backend's default voice when `voice` is falsy).
      * If the primary raises, one fallback attempt is made with default
        voices: the other backend when it is usable, else the same backend
        again. A part that still yields no audio is logged and skipped.

    `rate` is only passed to Edge-TTS.

    IMPORTANT:
      The browser playback path uses decodeAudioData(), which expects a
      self-contained audio buffer. Forwarding provider stream fragments
      directly causes decode buffering/stalls on the client, which is why
      every phrase is buffered in full before it is emitted. The phrases
      are kept intentionally small by services/streaming.py, so latency
      remains low.
    """
    # Only newlines/tabs are trimmed. Ordinary spaces around streamed phrase
    # chunks are preserved on purpose: trimming them can glue words across
    # chunk boundaries (e.g. "দিয়ে" + "আপনার" → "দিয়েআপনার").
    text = text.strip("\n\r\t")
    if not text:
        return

    voice_to_use = voice or (ELEVENLABS_VOICE_ID if USE_ELEVENLABS else EDGE_VOICE)
    parts = split_sentences(text)
    if not parts:
        return

    async def _drain(source, sink: bytearray) -> None:
        # Append every fragment of an audio stream to `sink`.
        async for fragment in source:
            sink.extend(fragment)

    def _fallback_stream(phrase: str):
        # Choose the stream for the single retry; None when no backend is
        # usable. The other backend is preferred, the same one is the last
        # resort. Default voices are used here, not the caller's `voice`.
        if USE_ELEVENLABS:
            order = (("edge", EDGE_TTS_AVAILABLE), ("eleven", ELEVENLABS_API_KEY))
        else:
            order = (("eleven", ELEVENLABS_API_KEY), ("edge", EDGE_TTS_AVAILABLE))
        for backend, usable in order:
            if not usable:
                continue
            if backend == "edge":
                return _edge_tts_stream(phrase, voice=EDGE_VOICE, rate=rate)
            return _elevenlabs_stream(phrase, voice_id=ELEVENLABS_VOICE_ID)
        return None

    async def _render(phrase: str) -> bytes | None:
        # Produce the complete audio for one phrase, or None if there is none.
        audio = bytearray()
        succeeded = False
        blob = None
        try:
            if USE_ELEVENLABS:
                primary = _elevenlabs_stream(phrase, voice_id=voice_to_use)
            else:
                primary = _edge_tts_stream(phrase, voice=voice_to_use, rate=rate)
            await _drain(primary, audio)
            succeeded = True
            if audio:
                blob = bytes(audio)
        except Exception as primary_err:
            print(f"[TTS] synth error: {primary_err}")
            # Primary backend failed. Try the fallback before giving up.
            try:
                # Discard partial audio from the failed attempt.
                audio.clear()
                backup = _fallback_stream(phrase)
                if backup is not None:
                    await _drain(backup, audio)
                succeeded = bool(audio)
                if audio:
                    blob = bytes(audio)
            except Exception as backup_err:
                print(f"[TTS] fallback synth error: {backup_err}")
        finally:
            if not succeeded and not audio:
                print(f"[TTS] no audio produced for chunk: {phrase[:60]!r}")
        return blob

    # Sequential synthesis guarantees exact playback order.
    for phrase in parts:
        blob = await _render(phrase)
        if blob is not None:
            yield blob