rakib72642 committed on
Commit
6a7bafa
·
1 Parent(s): ac8ab2c

fixed edge tts issue

Browse files
Files changed (1) hide show
  1. services/tts.py +63 -14
services/tts.py CHANGED
@@ -1,9 +1,8 @@
1
  """
2
  services/tts.py — Ultra Low-Latency Dual TTS Backend
3
 
4
- FIX-ISSUE4 (Natural, slow TTS):
5
- • Default rate changed from "-30%" to "-35%" approximately 35% slower
6
- than the Edge TTS default, giving a calm, natural speaking pace.
7
  • split_sentences() now splits on ALL clause delimiters (commas, colons,
8
  em-dashes) in addition to sentence endings, so synthesis tasks are
9
  smaller and start sooner. This pairs with streaming.py's 2–3 word
@@ -22,16 +21,27 @@ EDGE_VOICE = "bn-BD-NabanitaNeural"
22
  ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY", "")
23
  ELEVENLABS_VOICE_ID = os.getenv("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")
24
  ELEVENLABS_MODEL_ID = os.getenv("ELEVENLABS_MODEL_ID", "eleven_multilingual_v2")
25
- ELEVENLABS_OUTPUT_FORMAT = "pcm_16000"
26
  ELEVENLABS_STABILITY = 0.45
27
  ELEVENLABS_SIMILARITY = 0.80
28
  ELEVENLABS_STYLE = 0.35
29
  ELEVENLABS_SPEAKER_BOOST = True
30
 
 
 
 
 
 
 
 
 
31
  if USE_ELEVENLABS and not ELEVENLABS_API_KEY:
32
  raise RuntimeError("[TTS] ELEVENLABS_API_KEY missing")
33
 
34
- print(f"[TTS] Backend: {'ElevenLabs' if USE_ELEVENLABS else 'Edge-TTS'} | rate: -35%")
 
 
 
35
 
36
 
37
  def split_sentences(text: str) -> list[str]:
@@ -52,12 +62,13 @@ def split_sentences(text: str) -> list[str]:
52
  return [p.strip() for p in parts if len(p.strip()) > 1]
53
 
54
 
55
- async def _edge_tts_stream(text: str, voice: str = EDGE_VOICE, rate: str = "-35%"):
56
  """
57
  Stream Edge-TTS audio for a single text chunk.
58
- FIX-ISSUE4: Default rate is now -35% (was -30%) for slower, natural speech.
59
  """
60
- import edge_tts
 
61
  text = text.strip()
62
  if not text:
63
  return
@@ -68,6 +79,7 @@ async def _edge_tts_stream(text: str, voice: str = EDGE_VOICE, rate: str = "-35%
68
  await asyncio.sleep(0)
69
  except Exception as exc:
70
  print(f"[TTS][Edge] {exc}")
 
71
 
72
 
73
  async def _elevenlabs_stream(
@@ -101,7 +113,7 @@ async def _elevenlabs_stream(
101
  },
102
  }
103
  try:
104
- async with httpx.AsyncClient(timeout=httpx.Timeout(connect=5.0, read=None)) as client:
105
  async with client.stream(
106
  "POST", url, headers=headers, json=payload,
107
  params={"output_format": output_format}
@@ -115,19 +127,27 @@ async def _elevenlabs_stream(
115
  await asyncio.sleep(0)
116
  except Exception as exc:
117
  print(f"[TTS][ElevenLabs] {exc}")
 
118
 
119
 
120
  async def text_to_speech_stream(
121
  text: str,
122
  voice: str | None = None,
123
- rate: str = "-35%", # FIX-ISSUE4: -35% default (was -30%)
124
  ):
125
  """
126
  Stream TTS audio for `text`.
127
 
128
  Splits text into small clause-level parts, synthesises all in parallel,
129
- yields audio in order. This gives the lowest possible first-audio latency
130
- while maintaining natural speech ordering.
 
 
 
 
 
 
 
131
  """
132
  text = text.strip()
133
  if not text:
@@ -141,16 +161,45 @@ async def text_to_speech_stream(
141
  _SENT = object() # sentinel
142
 
143
  async def _synth_part(part: str, q: asyncio.Queue):
 
 
144
  try:
145
  if USE_ELEVENLABS:
146
  async for chunk in _elevenlabs_stream(part, voice_id=voice_to_use):
147
- await q.put(chunk)
148
  else:
149
  async for chunk in _edge_tts_stream(part, voice=voice_to_use, rate=rate):
150
- await q.put(chunk)
 
 
 
151
  except Exception as exc:
152
  print(f"[TTS] synth error: {exc}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  finally:
 
 
154
  await q.put(_SENT)
155
 
156
  # Create one queue per part, synthesise all in parallel
 
1
  """
2
  services/tts.py — Ultra Low-Latency Dual TTS Backend
3
 
4
+ FIX-ISSUE4 (Normal-speed TTS):
5
+ • Default rate changed from "-30%" to "+0%" for normal speech speed.
 
6
  • split_sentences() now splits on ALL clause delimiters (commas, colons,
7
  em-dashes) in addition to sentence endings, so synthesis tasks are
8
  smaller and start sooner. This pairs with streaming.py's 2–3 word
 
21
  ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY", "")
22
  ELEVENLABS_VOICE_ID = os.getenv("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")
23
  ELEVENLABS_MODEL_ID = os.getenv("ELEVENLABS_MODEL_ID", "eleven_multilingual_v2")
24
+ ELEVENLABS_OUTPUT_FORMAT = "mp3_22050_32"
25
  ELEVENLABS_STABILITY = 0.45
26
  ELEVENLABS_SIMILARITY = 0.80
27
  ELEVENLABS_STYLE = 0.35
28
  ELEVENLABS_SPEAKER_BOOST = True
29
 
30
+ try:
31
+ import edge_tts # type: ignore
32
+ EDGE_TTS_AVAILABLE = True
33
+ except Exception:
34
+ edge_tts = None
35
+ EDGE_TTS_AVAILABLE = False
36
+ print("[TTS] edge_tts not available; will fall back to ElevenLabs if possible")
37
+
38
  if USE_ELEVENLABS and not ELEVENLABS_API_KEY:
39
  raise RuntimeError("[TTS] ELEVENLABS_API_KEY missing")
40
 
41
+ if not EDGE_TTS_AVAILABLE and not ELEVENLABS_API_KEY:
42
+ raise RuntimeError("[TTS] Neither edge_tts nor ELEVENLABS_API_KEY is available")
43
+
44
+ print(f"[TTS] Backend: {'ElevenLabs' if USE_ELEVENLABS else 'Edge-TTS'} | rate: +0%")
45
 
46
 
47
  def split_sentences(text: str) -> list[str]:
 
62
  return [p.strip() for p in parts if len(p.strip()) > 1]
63
 
64
 
65
+ async def _edge_tts_stream(text: str, voice: str = EDGE_VOICE, rate: str = "+0%"):
66
  """
67
  Stream Edge-TTS audio for a single text chunk.
68
+ Default rate is normal speed.
69
  """
70
+ if edge_tts is None:
71
+ raise RuntimeError("edge_tts is not installed")
72
  text = text.strip()
73
  if not text:
74
  return
 
79
  await asyncio.sleep(0)
80
  except Exception as exc:
81
  print(f"[TTS][Edge] {exc}")
82
+ raise
83
 
84
 
85
  async def _elevenlabs_stream(
 
113
  },
114
  }
115
  try:
116
+ async with httpx.AsyncClient(timeout=httpx.Timeout(10.0, connect=5.0, read=None)) as client:
117
  async with client.stream(
118
  "POST", url, headers=headers, json=payload,
119
  params={"output_format": output_format}
 
127
  await asyncio.sleep(0)
128
  except Exception as exc:
129
  print(f"[TTS][ElevenLabs] {exc}")
130
+ raise
131
 
132
 
133
  async def text_to_speech_stream(
134
  text: str,
135
  voice: str | None = None,
136
+ rate: str = "+0%", # normal speed
137
  ):
138
  """
139
  Stream TTS audio for `text`.
140
 
141
  Splits text into small clause-level parts, synthesises all in parallel,
142
+ yields one complete audio blob per part in order.
143
+
144
+ IMPORTANT:
145
+ The browser playback path uses decodeAudioData(), which expects a
146
+ self-contained audio buffer. Forwarding provider stream fragments
147
+ directly causes decode buffering/stalls on the client. We therefore
148
+ accumulate each phrase's bytes and only emit it once the part is fully
149
+ synthesised. The phrases are kept intentionally small by
150
+ services/streaming.py, so latency remains low.
151
  """
152
  text = text.strip()
153
  if not text:
 
161
  _SENT = object() # sentinel
162
 
163
  async def _synth_part(part: str, q: asyncio.Queue):
164
+ buf = bytearray()
165
+ backend_ok = False
166
  try:
167
  if USE_ELEVENLABS:
168
  async for chunk in _elevenlabs_stream(part, voice_id=voice_to_use):
169
+ buf.extend(chunk)
170
  else:
171
  async for chunk in _edge_tts_stream(part, voice=voice_to_use, rate=rate):
172
+ buf.extend(chunk)
173
+ backend_ok = True
174
+ if buf:
175
+ await q.put(bytes(buf))
176
  except Exception as exc:
177
  print(f"[TTS] synth error: {exc}")
178
+ # Primary backend failed. Try the other backend before giving up.
179
+ try:
180
+ buf.clear()
181
+ if USE_ELEVENLABS:
182
+ if EDGE_TTS_AVAILABLE:
183
+ async for chunk in _edge_tts_stream(part, voice=EDGE_VOICE, rate=rate):
184
+ buf.extend(chunk)
185
+ elif ELEVENLABS_API_KEY:
186
+ async for chunk in _elevenlabs_stream(part, voice_id=ELEVENLABS_VOICE_ID):
187
+ buf.extend(chunk)
188
+ else:
189
+ if ELEVENLABS_API_KEY:
190
+ async for chunk in _elevenlabs_stream(part, voice_id=ELEVENLABS_VOICE_ID):
191
+ buf.extend(chunk)
192
+ elif EDGE_TTS_AVAILABLE:
193
+ async for chunk in _edge_tts_stream(part, voice=EDGE_VOICE, rate=rate):
194
+ buf.extend(chunk)
195
+ backend_ok = bool(buf)
196
+ if buf:
197
+ await q.put(bytes(buf))
198
+ except Exception as fallback_exc:
199
+ print(f"[TTS] fallback synth error: {fallback_exc}")
200
  finally:
201
+ if not backend_ok and not buf:
202
+ print(f"[TTS] no audio produced for chunk: {part[:60]!r}")
203
  await q.put(_SENT)
204
 
205
  # Create one queue per part, synthesise all in parallel