HawkEyesAI
/

Voice-AI-Agent

Model card · Files and versions

xet

Community

rakib72642 committed 3 days ago

Commit

6a7bafa

1 Parent(s): ac8ab2c

fixed edge tts issue

Browse files

Files changed (1) hide show

services/tts.py +63 -14

services/tts.py CHANGED Viewed

@@ -1,9 +1,8 @@
 """
 services/tts.py — Ultra Low-Latency Dual TTS Backend
-FIX-ISSUE4 (Natural, slow TTS):
-  • Default rate changed from "-30%" to "-35%" — approximately 35% slower
-    than the Edge TTS default, giving a calm, natural speaking pace.
   • split_sentences() now splits on ALL clause delimiters (commas, colons,
     em-dashes) in addition to sentence endings, so synthesis tasks are
     smaller and start sooner. This pairs with streaming.py's 2–3 word
@@ -22,16 +21,27 @@ EDGE_VOICE           = "bn-BD-NabanitaNeural"
 ELEVENLABS_API_KEY   = os.getenv("ELEVENLABS_API_KEY", "")
 ELEVENLABS_VOICE_ID  = os.getenv("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")
 ELEVENLABS_MODEL_ID  = os.getenv("ELEVENLABS_MODEL_ID", "eleven_multilingual_v2")
-ELEVENLABS_OUTPUT_FORMAT = "pcm_16000"
 ELEVENLABS_STABILITY  = 0.45
 ELEVENLABS_SIMILARITY = 0.80
 ELEVENLABS_STYLE      = 0.35
 ELEVENLABS_SPEAKER_BOOST = True
 if USE_ELEVENLABS and not ELEVENLABS_API_KEY:
     raise RuntimeError("[TTS] ELEVENLABS_API_KEY missing")
-print(f"[TTS] Backend: {'ElevenLabs' if USE_ELEVENLABS else 'Edge-TTS'} | rate: -35%")
 def split_sentences(text: str) -> list[str]:
@@ -52,12 +62,13 @@ def split_sentences(text: str) -> list[str]:
     return [p.strip() for p in parts if len(p.strip()) > 1]
-async def _edge_tts_stream(text: str, voice: str = EDGE_VOICE, rate: str = "-35%"):
     """
     Stream Edge-TTS audio for a single text chunk.
-    FIX-ISSUE4: Default rate is now -35% (was -30%) for slower, natural speech.
     """
-    import edge_tts
     text = text.strip()
     if not text:
         return
@@ -68,6 +79,7 @@ async def _edge_tts_stream(text: str, voice: str = EDGE_VOICE, rate: str = "-35%
                 await asyncio.sleep(0)
     except Exception as exc:
         print(f"[TTS][Edge] {exc}")
 async def _elevenlabs_stream(
@@ -101,7 +113,7 @@ async def _elevenlabs_stream(
         },
     }
     try:
-        async with httpx.AsyncClient(timeout=httpx.Timeout(connect=5.0, read=None)) as client:
             async with client.stream(
                 "POST", url, headers=headers, json=payload,
                 params={"output_format": output_format}
@@ -115,19 +127,27 @@ async def _elevenlabs_stream(
                         await asyncio.sleep(0)
     except Exception as exc:
         print(f"[TTS][ElevenLabs] {exc}")
 async def text_to_speech_stream(
     text: str,
     voice: str | None = None,
-    rate: str = "-35%",   # FIX-ISSUE4: -35% default (was -30%)
 ):
     """
     Stream TTS audio for `text`.
     Splits text into small clause-level parts, synthesises all in parallel,
-    yields audio in order. This gives the lowest possible first-audio latency
-    while maintaining natural speech ordering.
     """
     text = text.strip()
     if not text:
@@ -141,16 +161,45 @@ async def text_to_speech_stream(
     _SENT = object()  # sentinel
     async def _synth_part(part: str, q: asyncio.Queue):
         try:
             if USE_ELEVENLABS:
                 async for chunk in _elevenlabs_stream(part, voice_id=voice_to_use):
-                    await q.put(chunk)
             else:
                 async for chunk in _edge_tts_stream(part, voice=voice_to_use, rate=rate):
-                    await q.put(chunk)
         except Exception as exc:
             print(f"[TTS] synth error: {exc}")
         finally:
             await q.put(_SENT)
     # Create one queue per part, synthesise all in parallel

 """
 services/tts.py — Ultra Low-Latency Dual TTS Backend
+FIX-ISSUE4 (Normal-speed TTS):
+  • Default rate changed from "-30%" to "+0%" for normal speech speed.
   • split_sentences() now splits on ALL clause delimiters (commas, colons,
     em-dashes) in addition to sentence endings, so synthesis tasks are
     smaller and start sooner. This pairs with streaming.py's 2–3 word
 ELEVENLABS_API_KEY   = os.getenv("ELEVENLABS_API_KEY", "")
 ELEVENLABS_VOICE_ID  = os.getenv("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")
 ELEVENLABS_MODEL_ID  = os.getenv("ELEVENLABS_MODEL_ID", "eleven_multilingual_v2")
+ELEVENLABS_OUTPUT_FORMAT = "mp3_22050_32"
 ELEVENLABS_STABILITY  = 0.45
 ELEVENLABS_SIMILARITY = 0.80
 ELEVENLABS_STYLE      = 0.35
 ELEVENLABS_SPEAKER_BOOST = True
+try:
+    import edge_tts  # type: ignore
+    EDGE_TTS_AVAILABLE = True
+except Exception:
+    edge_tts = None
+    EDGE_TTS_AVAILABLE = False
+    print("[TTS] edge_tts not available; will fall back to ElevenLabs if possible")
 if USE_ELEVENLABS and not ELEVENLABS_API_KEY:
     raise RuntimeError("[TTS] ELEVENLABS_API_KEY missing")
+if not EDGE_TTS_AVAILABLE and not ELEVENLABS_API_KEY:
+    raise RuntimeError("[TTS] Neither edge_tts nor ELEVENLABS_API_KEY is available")
+print(f"[TTS] Backend: {'ElevenLabs' if USE_ELEVENLABS else 'Edge-TTS'} | rate: +0%")
 def split_sentences(text: str) -> list[str]:
     return [p.strip() for p in parts if len(p.strip()) > 1]
+async def _edge_tts_stream(text: str, voice: str = EDGE_VOICE, rate: str = "+0%"):
     """
     Stream Edge-TTS audio for a single text chunk.
+    Default rate is normal speed.
     """
+    if edge_tts is None:
+        raise RuntimeError("edge_tts is not installed")
     text = text.strip()
     if not text:
         return
                 await asyncio.sleep(0)
     except Exception as exc:
         print(f"[TTS][Edge] {exc}")
+        raise
 async def _elevenlabs_stream(
         },
     }
     try:
+        async with httpx.AsyncClient(timeout=httpx.Timeout(10.0, connect=5.0, read=None)) as client:
             async with client.stream(
                 "POST", url, headers=headers, json=payload,
                 params={"output_format": output_format}
                         await asyncio.sleep(0)
     except Exception as exc:
         print(f"[TTS][ElevenLabs] {exc}")
+        raise
 async def text_to_speech_stream(
     text: str,
     voice: str | None = None,
+    rate: str = "+0%",   # normal speed
 ):
     """
     Stream TTS audio for `text`.
     Splits text into small clause-level parts, synthesises all in parallel,
+    yields one complete audio blob per part in order.
+    IMPORTANT:
+      The browser playback path uses decodeAudioData(), which expects a
+      self-contained audio buffer. Forwarding provider stream fragments
+      directly causes decode buffering/stalls on the client. We therefore
+      accumulate each phrase's bytes and only emit it once the part is fully
+      synthesised. The phrases are kept intentionally small by
+      services/streaming.py, so latency remains low.
     """
     text = text.strip()
     if not text:
     _SENT = object()  # sentinel
     async def _synth_part(part: str, q: asyncio.Queue):
+        buf = bytearray()
+        backend_ok = False
         try:
             if USE_ELEVENLABS:
                 async for chunk in _elevenlabs_stream(part, voice_id=voice_to_use):
+                    buf.extend(chunk)
             else:
                 async for chunk in _edge_tts_stream(part, voice=voice_to_use, rate=rate):
+                    buf.extend(chunk)
+            backend_ok = True
+            if buf:
+                await q.put(bytes(buf))
         except Exception as exc:
             print(f"[TTS] synth error: {exc}")
+            # Primary backend failed. Try the other backend before giving up.
+            try:
+                buf.clear()
+                if USE_ELEVENLABS:
+                    if EDGE_TTS_AVAILABLE:
+                        async for chunk in _edge_tts_stream(part, voice=EDGE_VOICE, rate=rate):
+                            buf.extend(chunk)
+                    elif ELEVENLABS_API_KEY:
+                        async for chunk in _elevenlabs_stream(part, voice_id=ELEVENLABS_VOICE_ID):
+                            buf.extend(chunk)
+                else:
+                    if ELEVENLABS_API_KEY:
+                        async for chunk in _elevenlabs_stream(part, voice_id=ELEVENLABS_VOICE_ID):
+                            buf.extend(chunk)
+                    elif EDGE_TTS_AVAILABLE:
+                        async for chunk in _edge_tts_stream(part, voice=EDGE_VOICE, rate=rate):
+                            buf.extend(chunk)
+                backend_ok = bool(buf)
+                if buf:
+                    await q.put(bytes(buf))
+            except Exception as fallback_exc:
+                print(f"[TTS] fallback synth error: {fallback_exc}")
         finally:
+            if not backend_ok and not buf:
+                print(f"[TTS] no audio produced for chunk: {part[:60]!r}")
             await q.put(_SENT)
     # Create one queue per part, synthesise all in parallel