rakib72642 committed on
Commit
caa1385
·
1 Parent(s): 17cb949

Refactor voice handling for barge-in support and update UI text labels

Browse files
Files changed (4) hide show
  1. app.py +14 -2
  2. frontend/index.html +1 -1
  3. frontend/script.js +158 -30
  4. services/tts.py +2 -2
app.py CHANGED
@@ -392,18 +392,25 @@ async def ws_voice(ws: WebSocket):
392
  await _safe_text(ws, {"type": "end"})
393
 
394
  async def _utterance_worker():
 
395
  while True:
396
  audio_bytes = await _utterance_q.get()
397
  if audio_bytes is None:
398
  break
399
  try:
400
- await _handle_utterance(audio_bytes)
 
 
 
401
  except asyncio.CancelledError:
402
- raise
 
403
  except Exception as exc:
404
  print(f"[VOICE] Utterance worker error: {exc}")
405
  await _safe_text(ws, {"type": "error", "text": str(exc)})
406
  await _safe_text(ws, {"type": "end"})
 
 
407
 
408
  try:
409
  _worker_task = asyncio.create_task(_utterance_worker())
@@ -424,6 +431,11 @@ async def ws_voice(ws: WebSocket):
424
  if "bytes" in data and data["bytes"]:
425
  audio_bytes = data["bytes"]
426
  print(f"[VOICE] [{user_id}] Utterance: {len(audio_bytes):,} bytes")
 
 
 
 
 
427
  await _utterance_q.put(audio_bytes)
428
 
429
  elif "text" in data and data["text"]:
 
392
  await _safe_text(ws, {"type": "end"})
393
 
394
  async def _utterance_worker():
395
+ nonlocal _active_task
396
  while True:
397
  audio_bytes = await _utterance_q.get()
398
  if audio_bytes is None:
399
  break
400
  try:
401
+ # Run each utterance as a cancellable task so barge-in can
402
+ # immediately interrupt LLM+TTS mid-turn.
403
+ _active_task = asyncio.create_task(_handle_utterance(audio_bytes))
404
+ await _active_task
405
  except asyncio.CancelledError:
406
+ # Interruption is normal (client barge-in / cancel).
407
+ pass
408
  except Exception as exc:
409
  print(f"[VOICE] Utterance worker error: {exc}")
410
  await _safe_text(ws, {"type": "error", "text": str(exc)})
411
  await _safe_text(ws, {"type": "end"})
412
+ finally:
413
+ _active_task = None
414
 
415
  try:
416
  _worker_task = asyncio.create_task(_utterance_worker())
 
431
  if "bytes" in data and data["bytes"]:
432
  audio_bytes = data["bytes"]
433
  print(f"[VOICE] [{user_id}] Utterance: {len(audio_bytes):,} bytes")
434
+ # If a turn is currently speaking, treat a new utterance as
435
+ # barge-in: cancel current output and drop any queued audio.
436
+ if _active_task is not None and not _active_task.done():
437
+ await _cancel_active()
438
+ await _drain_utterance_queue()
439
  await _utterance_q.put(audio_bytes)
440
 
441
  elif "text" in data and data["text"]:
frontend/index.html CHANGED
@@ -247,7 +247,7 @@
247
  <div class="voice-row">
248
  <button id="mic-btn" class="mic-btn mic-off">
249
  <span class="mic-icon">🎤</span>
250
- <span class="mic-label">Voice শুরু করুন</span>
251
  </button>
252
  <button id="stop-btn" class="stop-btn" title="Stop AI speech">
253
  <svg width="18" height="18" viewBox="0 0 24 24" fill="currentColor">
 
247
  <div class="voice-row">
248
  <button id="mic-btn" class="mic-btn mic-off">
249
  <span class="mic-icon">🎤</span>
250
+ <span class="mic-label">Start</span>
251
  </button>
252
  <button id="stop-btn" class="stop-btn" title="Stop AI speech">
253
  <svg width="18" height="18" viewBox="0 0 24 24" fill="currentColor">
frontend/script.js CHANGED
@@ -1,5 +1,3 @@
1
-
2
-
3
  'use strict';
4
 
5
  // ─── DOM refs ─────────────────────────────────────────────────────────────────
@@ -58,9 +56,10 @@ const USER_ID = (() => {
58
  const WS_BASES = (() => {
59
  const scheme = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
60
  const bases = [];
61
- const host = window.location.host && window.location.host !== 'null'
62
- ? `${scheme}//${window.location.host}`
63
- : '';
 
64
  const push = (base) => {
65
  if (base && !bases.includes(base)) bases.push(base);
66
  };
@@ -83,10 +82,10 @@ let _chatRetryTimer = null;
83
  let _voiceRetryTimer = null;
84
 
85
  // ─── VAD / recording settings ─────────────────────────────────────────────────
86
- let SILENCE_MS = 1200; // BUG-FIX-B: was 450 ms
87
  let SILENCE_DB = -38;
88
- const VAD_MS = 80;
89
- const MIN_SPEECH_MS = 400; // discard noise bursts shorter than this
90
 
91
  // ─── Playback state ───────────────────────────────────────────────────────────
92
  let _ctx = null;
@@ -95,6 +94,12 @@ let _endTimer = null;
95
  let _cancelled = false;
96
  let _inFlight = 0;
97
  let _ttsPlaying = false;
 
 
 
 
 
 
98
  let brainMode = false;
99
  let brainVoiceActive = false;
100
  let brainRestartTimer = null;
@@ -293,8 +298,18 @@ function onChatMsg(ev) {
293
  // ── Voice WS handler ──────────────────────────────────────────────────────────
294
  function onVoiceMsg(ev) {
295
  if (ev.data instanceof ArrayBuffer) {
 
296
  _ttsPlaying = true;
297
- enqueueAudio(ev.data);
 
 
 
 
 
 
 
 
 
298
  return;
299
  }
300
 
@@ -439,6 +454,19 @@ function _ctxEnsure() {
439
  return _ctx;
440
  }
441
 
 
 
 
 
 
 
 
 
 
 
 
 
 
442
  async function enqueueAudio(buf) {
443
  if (_cancelled) return;
444
  _inFlight++;
@@ -470,13 +498,23 @@ async function enqueueAudio(buf) {
470
  src.buffer = decoded;
471
  src.connect(ctx.destination);
472
  const now = ctx.currentTime;
473
- const start = Math.max(now + 0.01, _schedEnd);
 
 
 
 
 
 
 
 
474
  src.start(start);
475
  _schedEnd = start + decoded.duration;
476
 
477
  src.onended = () => {
478
  _inFlight = Math.max(0, _inFlight - 1);
479
  _vizQ();
 
 
480
  };
481
 
482
  setState('speaking');
@@ -522,7 +560,13 @@ function _done() {
522
  if (brainMode && brainVoiceActive) {
523
  clearTimeout(brainAutoRestartTimer);
524
  brainAutoRestartTimer = setTimeout(() => {
525
- if (!brainMode || !brainVoiceActive || isListening || isProcessing || isRecordingLocked) {
 
 
 
 
 
 
526
  return;
527
  }
528
  _brainResumeListening();
@@ -534,17 +578,55 @@ function _done() {
534
  function stopAllAudio() {
535
  _cancelled = true;
536
  _ttsPlaying = false;
 
 
 
 
537
  clearTimeout(_endTimer);
538
  _endTimer = null;
539
  _schedEnd = 0;
540
  _inFlight = 0;
541
  _vizQ();
542
- if (_ctx && _ctx.state === 'running') _ctx.suspend().catch(() => {});
 
 
 
 
543
  if (voiceWS && voiceWS.readyState === WebSocket.OPEN) {
544
  voiceWS.send(JSON.stringify({ type: 'cancel' }));
545
  }
546
  }
547
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
548
  // ═══════════════════════════════════════════════════════════════════════════════
549
  // TEXT CHAT
550
  // ═══════════════════════════════════════════════════════════════════════════════
@@ -707,8 +789,9 @@ function _resetVoiceState() {
707
  // ── VAD tick ──────────────────────────────────────────────────────────────────
708
  function vadTick() {
709
  if (!analyser) return;
710
- if (_ttsPlaying) return; // mute during TTS playback
711
- if (isProcessing || isRecordingLocked) return; // hard lock
 
712
 
713
  const buf = new Float32Array(analyser.frequencyBinCount);
714
  analyser.getFloatTimeDomainData(buf);
@@ -718,6 +801,27 @@ function vadTick() {
718
  const speech = db > SILENCE_DB;
719
 
720
  if (speech) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
721
  clearTimeout(silenceTimer);
722
  silenceTimer = null;
723
 
@@ -733,6 +837,7 @@ function vadTick() {
733
  console.log('[VAD] Speech detected — recording');
734
  }
735
  } else {
 
736
  if (isSpeaking && !silenceTimer) {
737
  silenceTimer = setTimeout(_onSilenceTimeout, SILENCE_MS);
738
  }
@@ -764,14 +869,19 @@ function _onSilenceTimeout() {
764
  `[VAD] Silence after ${speechDuration} ms — finalising utterance`,
765
  );
766
 
767
- // Stop VAD before stopRecorder so no new speech detection during processing
768
- clearInterval(vadInt);
769
- clearInterval(vizInt);
770
- vadInt = vizInt = null;
 
 
 
 
 
771
 
772
  // Lock state BEFORE stopRecorder (onstop may fire almost immediately)
773
  isSpeaking = false;
774
- isListening = false;
775
  isProcessing = true;
776
  isRecordingLocked = true;
777
  _cancelled = false;
@@ -780,9 +890,9 @@ function _onSilenceTimeout() {
780
  tLlm = 0;
781
  tTts = 0;
782
 
783
- micBtn.disabled = true;
784
- setMic('processing');
785
- setState('processing');
786
 
787
  stopRecorder(); // → triggers onstop asynchronously
788
  }
@@ -958,14 +1068,14 @@ function setState(s) {
958
  }
959
 
960
  const MIC_MAP = {
961
- off: { cls: 'mic-off', label: 'Voice শুরু করুন', icon: '🎤' },
962
  listening: {
963
  cls: 'mic-listening',
964
- label: 'শুনছি… (বাতিল করতে ক্লিক)',
965
  icon: '🟒',
966
  },
967
- recording: { cls: 'mic-recording', label: 'বলছেন…', icon: '🔴' },
968
- processing: { cls: 'mic-processing', label: 'প্রক্রিয়া করছে…', icon: '⏳' },
969
  };
970
 
971
  function setMic(s) {
@@ -1022,10 +1132,18 @@ function setBrainMode(on) {
1022
  sidebarToggle.textContent = '›';
1023
  chatBox.scrollTop = chatBox.scrollHeight;
1024
  textInput.blur();
1025
- _brainModeSetSearch(isProcessing || isListening || isSpeaking || _ttsPlaying);
 
 
1026
  if (!isListening && !isProcessing && !isRecordingLocked) {
1027
  setTimeout(() => {
1028
- if (brainMode && brainVoiceActive && !isListening && !isProcessing && !isRecordingLocked) {
 
 
 
 
 
 
1029
  _brainResumeListening();
1030
  }
1031
  }, 180);
@@ -1064,7 +1182,13 @@ function _brainSetTtsBubble(text, active = true) {
1064
  }
1065
 
1066
  function _brainResumeListening() {
1067
- if (!brainMode || !brainVoiceActive || isListening || isProcessing || isRecordingLocked) {
 
 
 
 
 
 
1068
  return;
1069
  }
1070
  if (micStream && analyserCtx && analyser) {
@@ -1093,7 +1217,11 @@ function _queueBrainReconnect() {
1093
  }
1094
 
1095
  function _flushVoicePendingPackets() {
1096
- if (!voiceWS || voiceWS.readyState !== WebSocket.OPEN || !voicePendingPackets.length) {
 
 
 
 
1097
  return;
1098
  }
1099
  const packets = voicePendingPackets.splice(0);
 
 
 
1
  'use strict';
2
 
3
  // ─── DOM refs ─────────────────────────────────────────────────────────────────
 
56
  const WS_BASES = (() => {
57
  const scheme = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
58
  const bases = [];
59
+ const host =
60
+ window.location.host && window.location.host !== 'null'
61
+ ? `${scheme}//${window.location.host}`
62
+ : '';
63
  const push = (base) => {
64
  if (base && !bases.includes(base)) bases.push(base);
65
  };
 
82
  let _voiceRetryTimer = null;
83
 
84
  // ─── VAD / recording settings ─────────────────────────────────────────────────
85
+ let SILENCE_MS = 900; // default; user-adjustable in UI
86
  let SILENCE_DB = -38;
87
+ const VAD_MS = 60;
88
+ const MIN_SPEECH_MS = 320; // discard noise bursts shorter than this
89
 
90
  // ─── Playback state ───────────────────────────────────────────────────────────
91
  let _ctx = null;
 
94
  let _cancelled = false;
95
  let _inFlight = 0;
96
  let _ttsPlaying = false;
97
+ let _activeSources = [];
98
+ let _bargeInArmedAt = 0;
99
+ let _bargeInFiredAt = 0;
100
+ let _dropAudioUntil = 0;
101
+ let _audioChain = Promise.resolve();
102
+ let _playbackGen = 0;
103
  let brainMode = false;
104
  let brainVoiceActive = false;
105
  let brainRestartTimer = null;
 
298
  // ── Voice WS handler ──────────────────────────────────────────────────────────
299
  function onVoiceMsg(ev) {
300
  if (ev.data instanceof ArrayBuffer) {
301
+ if (Date.now() < _dropAudioUntil) return; // drop late packets after cancel
302
  _ttsPlaying = true;
303
+ // Ensure decode+schedule happens strictly in arrival order.
304
+ // decodeAudioData is async and can complete out-of-order otherwise.
305
+ const gen = _playbackGen;
306
+ _audioChain = _audioChain
307
+ .catch(() => {})
308
+ .then(() => {
309
+ if (gen !== _playbackGen) return;
310
+ if (_cancelled) return;
311
+ return enqueueAudio(ev.data);
312
+ });
313
  return;
314
  }
315
 
 
454
  return _ctx;
455
  }
456
 
457
+ function _stopAllSources() {
458
+ const sources = _activeSources.splice(0);
459
+ for (const src of sources) {
460
+ try {
461
+ src.onended = null;
462
+ src.stop(0);
463
+ } catch {}
464
+ try {
465
+ src.disconnect();
466
+ } catch {}
467
+ }
468
+ }
469
+
470
  async function enqueueAudio(buf) {
471
  if (_cancelled) return;
472
  _inFlight++;
 
498
  src.buffer = decoded;
499
  src.connect(ctx.destination);
500
  const now = ctx.currentTime;
501
+ // Tiny gap between chunks improves perceived naturalness (less "machine-gun").
502
+ const GAP_S = 0.015;
503
+ const start = Math.max(now + 0.01, _schedEnd + GAP_S);
504
+ if (_cancelled) {
505
+ _inFlight = Math.max(0, _inFlight - 1);
506
+ _vizQ();
507
+ return;
508
+ }
509
+ _activeSources.push(src);
510
  src.start(start);
511
  _schedEnd = start + decoded.duration;
512
 
513
  src.onended = () => {
514
  _inFlight = Math.max(0, _inFlight - 1);
515
  _vizQ();
516
+ const idx = _activeSources.indexOf(src);
517
+ if (idx >= 0) _activeSources.splice(idx, 1);
518
  };
519
 
520
  setState('speaking');
 
560
  if (brainMode && brainVoiceActive) {
561
  clearTimeout(brainAutoRestartTimer);
562
  brainAutoRestartTimer = setTimeout(() => {
563
+ if (
564
+ !brainMode ||
565
+ !brainVoiceActive ||
566
+ isListening ||
567
+ isProcessing ||
568
+ isRecordingLocked
569
+ ) {
570
  return;
571
  }
572
  _brainResumeListening();
 
578
  function stopAllAudio() {
579
  _cancelled = true;
580
  _ttsPlaying = false;
581
+ _dropAudioUntil = Date.now() + 700;
582
+ _playbackGen++;
583
+ _audioChain = Promise.resolve();
584
+ _stopAllSources();
585
  clearTimeout(_endTimer);
586
  _endTimer = null;
587
  _schedEnd = 0;
588
  _inFlight = 0;
589
  _vizQ();
590
+ if (_ctx && _ctx.state !== 'closed') {
591
+ // Close releases scheduled audio immediately; a new ctx is created on demand.
592
+ _ctx.close().catch(() => {});
593
+ }
594
+ _ctx = null;
595
  if (voiceWS && voiceWS.readyState === WebSocket.OPEN) {
596
  voiceWS.send(JSON.stringify({ type: 'cancel' }));
597
  }
598
  }
599
 
600
+ function _bargeInNow(reason = 'speech') {
601
+ const now = Date.now();
602
+ if (now - _bargeInFiredAt < 500) return; // debounce
603
+ _bargeInFiredAt = now;
604
+
605
+ console.log('[BargeIn] interrupt:', reason);
606
+ stopAllAudio();
607
+
608
+ // Unlock immediately so the user can speak right away.
609
+ isProcessing = false;
610
+ isRecordingLocked = false;
611
+ _cancelled = false;
612
+ aiEl = null;
613
+ aiTxt = '';
614
+ _setCaption('');
615
+ _removeThinking();
616
+ micBtn.disabled = false;
617
+
618
+ // If mic is already warm (brain continuous mode), just re-arm VAD.
619
+ if (brainMode && brainVoiceActive) {
620
+ _brainModeSetSearch(false);
621
+ // If analyser/mic are already active, VAD tick will immediately
622
+ // transition into recording on the next speech sample.
623
+ _brainResumeListening();
624
+ return;
625
+ }
626
+ // Otherwise, start listening fresh (user initiated by speaking).
627
+ startListening().catch(() => {});
628
+ }
629
+
630
  // ═══════════════════════════════════════════════════════════════════════════════
631
  // TEXT CHAT
632
  // ═══════════════════════════════════════════════════════════════════════════════
 
789
  // ── VAD tick ──────────────────────────────────────────────────────────────────
790
  function vadTick() {
791
  if (!analyser) return;
792
+ // In brain mode we allow "barge-in": user speech interrupts TTS playback.
793
+ // In non-brain mode we still keep the hard lock to prevent overlapping turns.
794
+ if (!brainMode && (isProcessing || isRecordingLocked)) return;
795
 
796
  const buf = new Float32Array(analyser.frequencyBinCount);
797
  analyser.getFloatTimeDomainData(buf);
 
801
  const speech = db > SILENCE_DB;
802
 
803
  if (speech) {
804
+ // ── Barge-in detector ────────────────────────────────────────────────
805
+ if (brainMode && brainVoiceActive && (_ttsPlaying || isProcessing || isRecordingLocked)) {
806
+ // Stricter threshold reduces false triggers from echo + noise.
807
+ const loud = db > SILENCE_DB + 4;
808
+ if (loud) {
809
+ if (!_bargeInArmedAt) _bargeInArmedAt = Date.now();
810
+ if (Date.now() - _bargeInArmedAt >= 90) {
811
+ _bargeInArmedAt = 0;
812
+ _bargeInNow(_ttsPlaying ? 'vad_tts' : 'vad_thinking');
813
+ // After barge-in unlock, continue into the normal recording start
814
+ // path in this same tick.
815
+ } else {
816
+ // Don't start recording until we confirm it's real barge-in speech.
817
+ return;
818
+ }
819
+ } else {
820
+ _bargeInArmedAt = 0;
821
+ return;
822
+ }
823
+ }
824
+
825
  clearTimeout(silenceTimer);
826
  silenceTimer = null;
827
 
 
837
  console.log('[VAD] Speech detected — recording');
838
  }
839
  } else {
840
+ _bargeInArmedAt = 0;
841
  if (isSpeaking && !silenceTimer) {
842
  silenceTimer = setTimeout(_onSilenceTimeout, SILENCE_MS);
843
  }
 
869
  `[VAD] Silence after ${speechDuration} ms — finalising utterance`,
870
  );
871
 
872
+ const keepBrainMicWarm = brainMode && brainVoiceActive;
873
+
874
+ // In brain mode we keep VAD running so we can detect barge-in while the AI is
875
+ // thinking/speaking. Outside brain mode we stop VAD during processing.
876
+ if (!keepBrainMicWarm) {
877
+ clearInterval(vadInt);
878
+ clearInterval(vizInt);
879
+ vadInt = vizInt = null;
880
+ }
881
 
882
  // Lock state BEFORE stopRecorder (onstop may fire almost immediately)
883
  isSpeaking = false;
884
+ isListening = keepBrainMicWarm; // mic stays "hot" in brain mode
885
  isProcessing = true;
886
  isRecordingLocked = true;
887
  _cancelled = false;
 
890
  tLlm = 0;
891
  tTts = 0;
892
 
893
+ micBtn.disabled = !keepBrainMicWarm;
894
+ setMic(keepBrainMicWarm ? 'listening' : 'processing');
895
+ setState(keepBrainMicWarm ? 'listening' : 'processing');
896
 
897
  stopRecorder(); // → triggers onstop asynchronously
898
  }
 
1068
  }
1069
 
1070
  const MIC_MAP = {
1071
+ off: { cls: 'mic-off', label: 'Press to Start talking', icon: '🎤' },
1072
  listening: {
1073
  cls: 'mic-listening',
1074
+ label: 'Listening...',
1075
  icon: '🟢',
1076
  },
1077
+ recording: { cls: 'mic-recording', label: 'Listening..', icon: '🔴' },
1078
+ processing: { cls: 'mic-processing', label: 'Please wait !!!', icon: '⏳' },
1079
  };
1080
 
1081
  function setMic(s) {
 
1132
  sidebarToggle.textContent = '›';
1133
  chatBox.scrollTop = chatBox.scrollHeight;
1134
  textInput.blur();
1135
+ _brainModeSetSearch(
1136
+ isProcessing || isListening || isSpeaking || _ttsPlaying,
1137
+ );
1138
  if (!isListening && !isProcessing && !isRecordingLocked) {
1139
  setTimeout(() => {
1140
+ if (
1141
+ brainMode &&
1142
+ brainVoiceActive &&
1143
+ !isListening &&
1144
+ !isProcessing &&
1145
+ !isRecordingLocked
1146
+ ) {
1147
  _brainResumeListening();
1148
  }
1149
  }, 180);
 
1182
  }
1183
 
1184
  function _brainResumeListening() {
1185
+ if (
1186
+ !brainMode ||
1187
+ !brainVoiceActive ||
1188
+ isListening ||
1189
+ isProcessing ||
1190
+ isRecordingLocked
1191
+ ) {
1192
  return;
1193
  }
1194
  if (micStream && analyserCtx && analyser) {
 
1217
  }
1218
 
1219
  function _flushVoicePendingPackets() {
1220
+ if (
1221
+ !voiceWS ||
1222
+ voiceWS.readyState !== WebSocket.OPEN ||
1223
+ !voicePendingPackets.length
1224
+ ) {
1225
  return;
1226
  }
1227
  const packets = voicePendingPackets.splice(0);
services/tts.py CHANGED
@@ -21,7 +21,7 @@ EDGE_VOICE = "bn-BD-NabanitaNeural"
21
  ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY", "")
22
  ELEVENLABS_VOICE_ID = os.getenv("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")
23
  ELEVENLABS_MODEL_ID = os.getenv("ELEVENLABS_MODEL_ID", "eleven_multilingual_v2")
24
- ELEVENLABS_SPEED = float(os.getenv("ELEVENLABS_SPEED", "1.08"))
25
  ELEVENLABS_OUTPUT_FORMAT = "mp3_22050_32"
26
  ELEVENLABS_STABILITY = 0.45
27
  ELEVENLABS_SIMILARITY = 0.80
@@ -139,7 +139,7 @@ async def _elevenlabs_stream(
139
  async def text_to_speech_stream(
140
  text: str,
141
  voice: str | None = None,
142
- rate: str = "+8%",
143
  ):
144
  """
145
  Stream TTS audio for `text`.
 
21
  ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY", "")
22
  ELEVENLABS_VOICE_ID = os.getenv("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")
23
  ELEVENLABS_MODEL_ID = os.getenv("ELEVENLABS_MODEL_ID", "eleven_multilingual_v2")
24
+ ELEVENLABS_SPEED = float(os.getenv("ELEVENLABS_SPEED", "1.05"))
25
  ELEVENLABS_OUTPUT_FORMAT = "mp3_22050_32"
26
  ELEVENLABS_STABILITY = 0.45
27
  ELEVENLABS_SIMILARITY = 0.80
 
139
  async def text_to_speech_stream(
140
  text: str,
141
  voice: str | None = None,
142
+ rate: str = "+4%",
143
  ):
144
  """
145
  Stream TTS audio for `text`.