Commit ·
caa1385
1
Parent(s): 17cb949
Refactor voice handling for barge-in support and update UI text labels
Browse files
- app.py +14 -2
- frontend/index.html +1 -1
- frontend/script.js +158 -30
- services/tts.py +2 -2
app.py
CHANGED
|
@@ -392,18 +392,25 @@ async def ws_voice(ws: WebSocket):
|
|
| 392 |
await _safe_text(ws, {"type": "end"})
|
| 393 |
|
| 394 |
async def _utterance_worker():
|
|
|
|
| 395 |
while True:
|
| 396 |
audio_bytes = await _utterance_q.get()
|
| 397 |
if audio_bytes is None:
|
| 398 |
break
|
| 399 |
try:
|
| 400 |
-
|
|
|
|
|
|
|
|
|
|
| 401 |
except asyncio.CancelledError:
|
| 402 |
-
|
|
|
|
| 403 |
except Exception as exc:
|
| 404 |
print(f"[VOICE] Utterance worker error: {exc}")
|
| 405 |
await _safe_text(ws, {"type": "error", "text": str(exc)})
|
| 406 |
await _safe_text(ws, {"type": "end"})
|
|
|
|
|
|
|
| 407 |
|
| 408 |
try:
|
| 409 |
_worker_task = asyncio.create_task(_utterance_worker())
|
|
@@ -424,6 +431,11 @@ async def ws_voice(ws: WebSocket):
|
|
| 424 |
if "bytes" in data and data["bytes"]:
|
| 425 |
audio_bytes = data["bytes"]
|
| 426 |
print(f"[VOICE] [{user_id}] Utterance: {len(audio_bytes):,} bytes")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 427 |
await _utterance_q.put(audio_bytes)
|
| 428 |
|
| 429 |
elif "text" in data and data["text"]:
|
|
|
|
| 392 |
await _safe_text(ws, {"type": "end"})
|
| 393 |
|
| 394 |
async def _utterance_worker():
|
| 395 |
+
nonlocal _active_task
|
| 396 |
while True:
|
| 397 |
audio_bytes = await _utterance_q.get()
|
| 398 |
if audio_bytes is None:
|
| 399 |
break
|
| 400 |
try:
|
| 401 |
+
# Run each utterance as a cancellable task so barge-in can
|
| 402 |
+
# immediately interrupt LLM+TTS mid-turn.
|
| 403 |
+
_active_task = asyncio.create_task(_handle_utterance(audio_bytes))
|
| 404 |
+
await _active_task
|
| 405 |
except asyncio.CancelledError:
|
| 406 |
+
# Interruption is normal (client barge-in / cancel).
|
| 407 |
+
pass
|
| 408 |
except Exception as exc:
|
| 409 |
print(f"[VOICE] Utterance worker error: {exc}")
|
| 410 |
await _safe_text(ws, {"type": "error", "text": str(exc)})
|
| 411 |
await _safe_text(ws, {"type": "end"})
|
| 412 |
+
finally:
|
| 413 |
+
_active_task = None
|
| 414 |
|
| 415 |
try:
|
| 416 |
_worker_task = asyncio.create_task(_utterance_worker())
|
|
|
|
| 431 |
if "bytes" in data and data["bytes"]:
|
| 432 |
audio_bytes = data["bytes"]
|
| 433 |
print(f"[VOICE] [{user_id}] Utterance: {len(audio_bytes):,} bytes")
|
| 434 |
+
# If a turn is currently speaking, treat a new utterance as
|
| 435 |
+
# barge-in: cancel current output and drop any queued audio.
|
| 436 |
+
if _active_task is not None and not _active_task.done():
|
| 437 |
+
await _cancel_active()
|
| 438 |
+
await _drain_utterance_queue()
|
| 439 |
await _utterance_q.put(audio_bytes)
|
| 440 |
|
| 441 |
elif "text" in data and data["text"]:
|
frontend/index.html
CHANGED
|
@@ -247,7 +247,7 @@
|
|
| 247 |
<div class="voice-row">
|
| 248 |
<button id="mic-btn" class="mic-btn mic-off">
|
| 249 |
<span class="mic-icon">🎤</span>
|
| 250 |
-
<span class="mic-label">
|
| 251 |
</button>
|
| 252 |
<button id="stop-btn" class="stop-btn" title="Stop AI speech">
|
| 253 |
<svg width="18" height="18" viewBox="0 0 24 24" fill="currentColor">
|
|
|
|
| 247 |
<div class="voice-row">
|
| 248 |
<button id="mic-btn" class="mic-btn mic-off">
|
| 249 |
<span class="mic-icon">🎤</span>
|
| 250 |
+
<span class="mic-label">Start</span>
|
| 251 |
</button>
|
| 252 |
<button id="stop-btn" class="stop-btn" title="Stop AI speech">
|
| 253 |
<svg width="18" height="18" viewBox="0 0 24 24" fill="currentColor">
|
frontend/script.js
CHANGED
|
@@ -1,5 +1,3 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
'use strict';
|
| 4 |
|
| 5 |
// ─── DOM refs ────────────────────────────────────────────────────────────────
|
|
@@ -58,9 +56,10 @@ const USER_ID = (() => {
|
|
| 58 |
const WS_BASES = (() => {
|
| 59 |
const scheme = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
|
| 60 |
const bases = [];
|
| 61 |
-
const host =
|
| 62 |
-
|
| 63 |
-
|
|
|
|
| 64 |
const push = (base) => {
|
| 65 |
if (base && !bases.includes(base)) bases.push(base);
|
| 66 |
};
|
|
@@ -83,10 +82,10 @@ let _chatRetryTimer = null;
|
|
| 83 |
let _voiceRetryTimer = null;
|
| 84 |
|
| 85 |
// ─── VAD / recording settings ────────────────────────────────────────────────
|
| 86 |
-
let SILENCE_MS =
|
| 87 |
let SILENCE_DB = -38;
|
| 88 |
-
const VAD_MS =
|
| 89 |
-
const MIN_SPEECH_MS =
|
| 90 |
|
| 91 |
// ─── Playback state ──────────────────────────────────────────────────────────
|
| 92 |
let _ctx = null;
|
|
@@ -95,6 +94,12 @@ let _endTimer = null;
|
|
| 95 |
let _cancelled = false;
|
| 96 |
let _inFlight = 0;
|
| 97 |
let _ttsPlaying = false;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
let brainMode = false;
|
| 99 |
let brainVoiceActive = false;
|
| 100 |
let brainRestartTimer = null;
|
|
@@ -293,8 +298,18 @@ function onChatMsg(ev) {
|
|
| 293 |
// ── Voice WS handler ─────────────────────────────────────────────────────────
|
| 294 |
function onVoiceMsg(ev) {
|
| 295 |
if (ev.data instanceof ArrayBuffer) {
|
|
|
|
| 296 |
_ttsPlaying = true;
|
| 297 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 298 |
return;
|
| 299 |
}
|
| 300 |
|
|
@@ -439,6 +454,19 @@ function _ctxEnsure() {
|
|
| 439 |
return _ctx;
|
| 440 |
}
|
| 441 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 442 |
async function enqueueAudio(buf) {
|
| 443 |
if (_cancelled) return;
|
| 444 |
_inFlight++;
|
|
@@ -470,13 +498,23 @@ async function enqueueAudio(buf) {
|
|
| 470 |
src.buffer = decoded;
|
| 471 |
src.connect(ctx.destination);
|
| 472 |
const now = ctx.currentTime;
|
| 473 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 474 |
src.start(start);
|
| 475 |
_schedEnd = start + decoded.duration;
|
| 476 |
|
| 477 |
src.onended = () => {
|
| 478 |
_inFlight = Math.max(0, _inFlight - 1);
|
| 479 |
_vizQ();
|
|
|
|
|
|
|
| 480 |
};
|
| 481 |
|
| 482 |
setState('speaking');
|
|
@@ -522,7 +560,13 @@ function _done() {
|
|
| 522 |
if (brainMode && brainVoiceActive) {
|
| 523 |
clearTimeout(brainAutoRestartTimer);
|
| 524 |
brainAutoRestartTimer = setTimeout(() => {
|
| 525 |
-
if (
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 526 |
return;
|
| 527 |
}
|
| 528 |
_brainResumeListening();
|
|
@@ -534,17 +578,55 @@ function _done() {
|
|
| 534 |
function stopAllAudio() {
|
| 535 |
_cancelled = true;
|
| 536 |
_ttsPlaying = false;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 537 |
clearTimeout(_endTimer);
|
| 538 |
_endTimer = null;
|
| 539 |
_schedEnd = 0;
|
| 540 |
_inFlight = 0;
|
| 541 |
_vizQ();
|
| 542 |
-
if (_ctx && _ctx.state ==
|
|
|
|
|
|
|
|
|
|
|
|
|
| 543 |
if (voiceWS && voiceWS.readyState === WebSocket.OPEN) {
|
| 544 |
voiceWS.send(JSON.stringify({ type: 'cancel' }));
|
| 545 |
}
|
| 546 |
}
|
| 547 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 548 |
// βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 549 |
// TEXT CHAT
|
| 550 |
// βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -707,8 +789,9 @@ function _resetVoiceState() {
|
|
| 707 |
// ── VAD tick ─────────────────────────────────────────────────────────────────
|
| 708 |
function vadTick() {
|
| 709 |
if (!analyser) return;
|
| 710 |
-
|
| 711 |
-
|
|
|
|
| 712 |
|
| 713 |
const buf = new Float32Array(analyser.frequencyBinCount);
|
| 714 |
analyser.getFloatTimeDomainData(buf);
|
|
@@ -718,6 +801,27 @@ function vadTick() {
|
|
| 718 |
const speech = db > SILENCE_DB;
|
| 719 |
|
| 720 |
if (speech) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 721 |
clearTimeout(silenceTimer);
|
| 722 |
silenceTimer = null;
|
| 723 |
|
|
@@ -733,6 +837,7 @@ function vadTick() {
|
|
| 733 |
console.log('[VAD] Speech detected β recording');
|
| 734 |
}
|
| 735 |
} else {
|
|
|
|
| 736 |
if (isSpeaking && !silenceTimer) {
|
| 737 |
silenceTimer = setTimeout(_onSilenceTimeout, SILENCE_MS);
|
| 738 |
}
|
|
@@ -764,14 +869,19 @@ function _onSilenceTimeout() {
|
|
| 764 |
`[VAD] Silence after ${speechDuration} ms β finalising utterance`,
|
| 765 |
);
|
| 766 |
|
| 767 |
-
|
| 768 |
-
|
| 769 |
-
|
| 770 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 771 |
|
| 772 |
// Lock state BEFORE stopRecorder (onstop may fire almost immediately)
|
| 773 |
isSpeaking = false;
|
| 774 |
-
isListening =
|
| 775 |
isProcessing = true;
|
| 776 |
isRecordingLocked = true;
|
| 777 |
_cancelled = false;
|
|
@@ -780,9 +890,9 @@ function _onSilenceTimeout() {
|
|
| 780 |
tLlm = 0;
|
| 781 |
tTts = 0;
|
| 782 |
|
| 783 |
-
micBtn.disabled =
|
| 784 |
-
setMic('processing');
|
| 785 |
-
setState('processing');
|
| 786 |
|
| 787 |
stopRecorder(); // → triggers onstop asynchronously
|
| 788 |
}
|
|
@@ -958,14 +1068,14 @@ function setState(s) {
|
|
| 958 |
}
|
| 959 |
|
| 960 |
const MIC_MAP = {
|
| 961 |
-
off: { cls: 'mic-off', label: '
|
| 962 |
listening: {
|
| 963 |
cls: 'mic-listening',
|
| 964 |
-
label: '
|
| 965 |
icon: '🟢',
|
| 966 |
},
|
| 967 |
-
recording: { cls: 'mic-recording', label: '
|
| 968 |
-
processing: { cls: 'mic-processing', label: '
|
| 969 |
};
|
| 970 |
|
| 971 |
function setMic(s) {
|
|
@@ -1022,10 +1132,18 @@ function setBrainMode(on) {
|
|
| 1022 |
sidebarToggle.textContent = 'βΊ';
|
| 1023 |
chatBox.scrollTop = chatBox.scrollHeight;
|
| 1024 |
textInput.blur();
|
| 1025 |
-
_brainModeSetSearch(
|
|
|
|
|
|
|
| 1026 |
if (!isListening && !isProcessing && !isRecordingLocked) {
|
| 1027 |
setTimeout(() => {
|
| 1028 |
-
if (
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1029 |
_brainResumeListening();
|
| 1030 |
}
|
| 1031 |
}, 180);
|
|
@@ -1064,7 +1182,13 @@ function _brainSetTtsBubble(text, active = true) {
|
|
| 1064 |
}
|
| 1065 |
|
| 1066 |
function _brainResumeListening() {
|
| 1067 |
-
if (
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1068 |
return;
|
| 1069 |
}
|
| 1070 |
if (micStream && analyserCtx && analyser) {
|
|
@@ -1093,7 +1217,11 @@ function _queueBrainReconnect() {
|
|
| 1093 |
}
|
| 1094 |
|
| 1095 |
function _flushVoicePendingPackets() {
|
| 1096 |
-
if (
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1097 |
return;
|
| 1098 |
}
|
| 1099 |
const packets = voicePendingPackets.splice(0);
|
|
|
|
|
|
|
|
|
|
| 1 |
'use strict';
|
| 2 |
|
| 3 |
// ─── DOM refs ────────────────────────────────────────────────────────────────
|
|
|
|
| 56 |
const WS_BASES = (() => {
|
| 57 |
const scheme = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
|
| 58 |
const bases = [];
|
| 59 |
+
const host =
|
| 60 |
+
window.location.host && window.location.host !== 'null'
|
| 61 |
+
? `${scheme}//${window.location.host}`
|
| 62 |
+
: '';
|
| 63 |
const push = (base) => {
|
| 64 |
if (base && !bases.includes(base)) bases.push(base);
|
| 65 |
};
|
|
|
|
| 82 |
let _voiceRetryTimer = null;
|
| 83 |
|
| 84 |
// ─── VAD / recording settings ────────────────────────────────────────────────
|
| 85 |
+
let SILENCE_MS = 900; // default; user-adjustable in UI
|
| 86 |
let SILENCE_DB = -38;
|
| 87 |
+
const VAD_MS = 60;
|
| 88 |
+
const MIN_SPEECH_MS = 320; // discard noise bursts shorter than this
|
| 89 |
|
| 90 |
// ─── Playback state ──────────────────────────────────────────────────────────
|
| 91 |
let _ctx = null;
|
|
|
|
| 94 |
let _cancelled = false;
|
| 95 |
let _inFlight = 0;
|
| 96 |
let _ttsPlaying = false;
|
| 97 |
+
let _activeSources = [];
|
| 98 |
+
let _bargeInArmedAt = 0;
|
| 99 |
+
let _bargeInFiredAt = 0;
|
| 100 |
+
let _dropAudioUntil = 0;
|
| 101 |
+
let _audioChain = Promise.resolve();
|
| 102 |
+
let _playbackGen = 0;
|
| 103 |
let brainMode = false;
|
| 104 |
let brainVoiceActive = false;
|
| 105 |
let brainRestartTimer = null;
|
|
|
|
| 298 |
// ── Voice WS handler ─────────────────────────────────────────────────────────
|
| 299 |
function onVoiceMsg(ev) {
|
| 300 |
if (ev.data instanceof ArrayBuffer) {
|
| 301 |
+
if (Date.now() < _dropAudioUntil) return; // drop late packets after cancel
|
| 302 |
_ttsPlaying = true;
|
| 303 |
+
// Ensure decode+schedule happens strictly in arrival order.
|
| 304 |
+
// decodeAudioData is async and can complete out-of-order otherwise.
|
| 305 |
+
const gen = _playbackGen;
|
| 306 |
+
_audioChain = _audioChain
|
| 307 |
+
.catch(() => {})
|
| 308 |
+
.then(() => {
|
| 309 |
+
if (gen !== _playbackGen) return;
|
| 310 |
+
if (_cancelled) return;
|
| 311 |
+
return enqueueAudio(ev.data);
|
| 312 |
+
});
|
| 313 |
return;
|
| 314 |
}
|
| 315 |
|
|
|
|
| 454 |
return _ctx;
|
| 455 |
}
|
| 456 |
|
| 457 |
+
function _stopAllSources() {
|
| 458 |
+
const sources = _activeSources.splice(0);
|
| 459 |
+
for (const src of sources) {
|
| 460 |
+
try {
|
| 461 |
+
src.onended = null;
|
| 462 |
+
src.stop(0);
|
| 463 |
+
} catch {}
|
| 464 |
+
try {
|
| 465 |
+
src.disconnect();
|
| 466 |
+
} catch {}
|
| 467 |
+
}
|
| 468 |
+
}
|
| 469 |
+
|
| 470 |
async function enqueueAudio(buf) {
|
| 471 |
if (_cancelled) return;
|
| 472 |
_inFlight++;
|
|
|
|
| 498 |
src.buffer = decoded;
|
| 499 |
src.connect(ctx.destination);
|
| 500 |
const now = ctx.currentTime;
|
| 501 |
+
// Tiny gap between chunks improves perceived naturalness (less "machine-gun").
|
| 502 |
+
const GAP_S = 0.015;
|
| 503 |
+
const start = Math.max(now + 0.01, _schedEnd + GAP_S);
|
| 504 |
+
if (_cancelled) {
|
| 505 |
+
_inFlight = Math.max(0, _inFlight - 1);
|
| 506 |
+
_vizQ();
|
| 507 |
+
return;
|
| 508 |
+
}
|
| 509 |
+
_activeSources.push(src);
|
| 510 |
src.start(start);
|
| 511 |
_schedEnd = start + decoded.duration;
|
| 512 |
|
| 513 |
src.onended = () => {
|
| 514 |
_inFlight = Math.max(0, _inFlight - 1);
|
| 515 |
_vizQ();
|
| 516 |
+
const idx = _activeSources.indexOf(src);
|
| 517 |
+
if (idx >= 0) _activeSources.splice(idx, 1);
|
| 518 |
};
|
| 519 |
|
| 520 |
setState('speaking');
|
|
|
|
| 560 |
if (brainMode && brainVoiceActive) {
|
| 561 |
clearTimeout(brainAutoRestartTimer);
|
| 562 |
brainAutoRestartTimer = setTimeout(() => {
|
| 563 |
+
if (
|
| 564 |
+
!brainMode ||
|
| 565 |
+
!brainVoiceActive ||
|
| 566 |
+
isListening ||
|
| 567 |
+
isProcessing ||
|
| 568 |
+
isRecordingLocked
|
| 569 |
+
) {
|
| 570 |
return;
|
| 571 |
}
|
| 572 |
_brainResumeListening();
|
|
|
|
| 578 |
function stopAllAudio() {
|
| 579 |
_cancelled = true;
|
| 580 |
_ttsPlaying = false;
|
| 581 |
+
_dropAudioUntil = Date.now() + 700;
|
| 582 |
+
_playbackGen++;
|
| 583 |
+
_audioChain = Promise.resolve();
|
| 584 |
+
_stopAllSources();
|
| 585 |
clearTimeout(_endTimer);
|
| 586 |
_endTimer = null;
|
| 587 |
_schedEnd = 0;
|
| 588 |
_inFlight = 0;
|
| 589 |
_vizQ();
|
| 590 |
+
if (_ctx && _ctx.state !== 'closed') {
|
| 591 |
+
// Close releases scheduled audio immediately; a new ctx is created on demand.
|
| 592 |
+
_ctx.close().catch(() => {});
|
| 593 |
+
}
|
| 594 |
+
_ctx = null;
|
| 595 |
if (voiceWS && voiceWS.readyState === WebSocket.OPEN) {
|
| 596 |
voiceWS.send(JSON.stringify({ type: 'cancel' }));
|
| 597 |
}
|
| 598 |
}
|
| 599 |
|
| 600 |
+
function _bargeInNow(reason = 'speech') {
|
| 601 |
+
const now = Date.now();
|
| 602 |
+
if (now - _bargeInFiredAt < 500) return; // debounce
|
| 603 |
+
_bargeInFiredAt = now;
|
| 604 |
+
|
| 605 |
+
console.log('[BargeIn] interrupt:', reason);
|
| 606 |
+
stopAllAudio();
|
| 607 |
+
|
| 608 |
+
// Unlock immediately so the user can speak right away.
|
| 609 |
+
isProcessing = false;
|
| 610 |
+
isRecordingLocked = false;
|
| 611 |
+
_cancelled = false;
|
| 612 |
+
aiEl = null;
|
| 613 |
+
aiTxt = '';
|
| 614 |
+
_setCaption('');
|
| 615 |
+
_removeThinking();
|
| 616 |
+
micBtn.disabled = false;
|
| 617 |
+
|
| 618 |
+
// If mic is already warm (brain continuous mode), just re-arm VAD.
|
| 619 |
+
if (brainMode && brainVoiceActive) {
|
| 620 |
+
_brainModeSetSearch(false);
|
| 621 |
+
// If analyser/mic are already active, VAD tick will immediately
|
| 622 |
+
// transition into recording on the next speech sample.
|
| 623 |
+
_brainResumeListening();
|
| 624 |
+
return;
|
| 625 |
+
}
|
| 626 |
+
// Otherwise, start listening fresh (user initiated by speaking).
|
| 627 |
+
startListening().catch(() => {});
|
| 628 |
+
}
|
| 629 |
+
|
| 630 |
// βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 631 |
// TEXT CHAT
|
| 632 |
// βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 789 |
// ── VAD tick ─────────────────────────────────────────────────────────────────
|
| 790 |
function vadTick() {
|
| 791 |
if (!analyser) return;
|
| 792 |
+
// In brain mode we allow "barge-in": user speech interrupts TTS playback.
|
| 793 |
+
// In non-brain mode we still keep the hard lock to prevent overlapping turns.
|
| 794 |
+
if (!brainMode && (isProcessing || isRecordingLocked)) return;
|
| 795 |
|
| 796 |
const buf = new Float32Array(analyser.frequencyBinCount);
|
| 797 |
analyser.getFloatTimeDomainData(buf);
|
|
|
|
| 801 |
const speech = db > SILENCE_DB;
|
| 802 |
|
| 803 |
if (speech) {
|
| 804 |
+
// ── Barge-in detector ────────────────────────────────────────────────
|
| 805 |
+
if (brainMode && brainVoiceActive && (_ttsPlaying || isProcessing || isRecordingLocked)) {
|
| 806 |
+
// Stricter threshold reduces false triggers from echo + noise.
|
| 807 |
+
const loud = db > SILENCE_DB + 4;
|
| 808 |
+
if (loud) {
|
| 809 |
+
if (!_bargeInArmedAt) _bargeInArmedAt = Date.now();
|
| 810 |
+
if (Date.now() - _bargeInArmedAt >= 90) {
|
| 811 |
+
_bargeInArmedAt = 0;
|
| 812 |
+
_bargeInNow(_ttsPlaying ? 'vad_tts' : 'vad_thinking');
|
| 813 |
+
// After barge-in unlock, continue into the normal recording start
|
| 814 |
+
// path in this same tick.
|
| 815 |
+
} else {
|
| 816 |
+
// Don't start recording until we confirm it’s real barge-in speech.
|
| 817 |
+
return;
|
| 818 |
+
}
|
| 819 |
+
} else {
|
| 820 |
+
_bargeInArmedAt = 0;
|
| 821 |
+
return;
|
| 822 |
+
}
|
| 823 |
+
}
|
| 824 |
+
|
| 825 |
clearTimeout(silenceTimer);
|
| 826 |
silenceTimer = null;
|
| 827 |
|
|
|
|
| 837 |
console.log('[VAD] Speech detected β recording');
|
| 838 |
}
|
| 839 |
} else {
|
| 840 |
+
_bargeInArmedAt = 0;
|
| 841 |
if (isSpeaking && !silenceTimer) {
|
| 842 |
silenceTimer = setTimeout(_onSilenceTimeout, SILENCE_MS);
|
| 843 |
}
|
|
|
|
| 869 |
`[VAD] Silence after ${speechDuration} ms β finalising utterance`,
|
| 870 |
);
|
| 871 |
|
| 872 |
+
const keepBrainMicWarm = brainMode && brainVoiceActive;
|
| 873 |
+
|
| 874 |
+
// In brain mode we keep VAD running so we can detect barge-in while the AI is
|
| 875 |
+
// thinking/speaking. Outside brain mode we stop VAD during processing.
|
| 876 |
+
if (!keepBrainMicWarm) {
|
| 877 |
+
clearInterval(vadInt);
|
| 878 |
+
clearInterval(vizInt);
|
| 879 |
+
vadInt = vizInt = null;
|
| 880 |
+
}
|
| 881 |
|
| 882 |
// Lock state BEFORE stopRecorder (onstop may fire almost immediately)
|
| 883 |
isSpeaking = false;
|
| 884 |
+
isListening = keepBrainMicWarm; // mic stays "hot" in brain mode
|
| 885 |
isProcessing = true;
|
| 886 |
isRecordingLocked = true;
|
| 887 |
_cancelled = false;
|
|
|
|
| 890 |
tLlm = 0;
|
| 891 |
tTts = 0;
|
| 892 |
|
| 893 |
+
micBtn.disabled = !keepBrainMicWarm;
|
| 894 |
+
setMic(keepBrainMicWarm ? 'listening' : 'processing');
|
| 895 |
+
setState(keepBrainMicWarm ? 'listening' : 'processing');
|
| 896 |
|
| 897 |
stopRecorder(); // → triggers onstop asynchronously
|
| 898 |
}
|
|
|
|
| 1068 |
}
|
| 1069 |
|
| 1070 |
const MIC_MAP = {
|
| 1071 |
+
off: { cls: 'mic-off', label: 'Press to Start talking', icon: '🎤' },
|
| 1072 |
listening: {
|
| 1073 |
cls: 'mic-listening',
|
| 1074 |
+
label: 'Listening...',
|
| 1075 |
icon: '🟢',
|
| 1076 |
},
|
| 1077 |
+
recording: { cls: 'mic-recording', label: 'Listening..', icon: '🔴' },
|
| 1078 |
+
processing: { cls: 'mic-processing', label: 'Please wait !!!', icon: '⏳' },
|
| 1079 |
};
|
| 1080 |
|
| 1081 |
function setMic(s) {
|
|
|
|
| 1132 |
sidebarToggle.textContent = 'βΊ';
|
| 1133 |
chatBox.scrollTop = chatBox.scrollHeight;
|
| 1134 |
textInput.blur();
|
| 1135 |
+
_brainModeSetSearch(
|
| 1136 |
+
isProcessing || isListening || isSpeaking || _ttsPlaying,
|
| 1137 |
+
);
|
| 1138 |
if (!isListening && !isProcessing && !isRecordingLocked) {
|
| 1139 |
setTimeout(() => {
|
| 1140 |
+
if (
|
| 1141 |
+
brainMode &&
|
| 1142 |
+
brainVoiceActive &&
|
| 1143 |
+
!isListening &&
|
| 1144 |
+
!isProcessing &&
|
| 1145 |
+
!isRecordingLocked
|
| 1146 |
+
) {
|
| 1147 |
_brainResumeListening();
|
| 1148 |
}
|
| 1149 |
}, 180);
|
|
|
|
| 1182 |
}
|
| 1183 |
|
| 1184 |
function _brainResumeListening() {
|
| 1185 |
+
if (
|
| 1186 |
+
!brainMode ||
|
| 1187 |
+
!brainVoiceActive ||
|
| 1188 |
+
isListening ||
|
| 1189 |
+
isProcessing ||
|
| 1190 |
+
isRecordingLocked
|
| 1191 |
+
) {
|
| 1192 |
return;
|
| 1193 |
}
|
| 1194 |
if (micStream && analyserCtx && analyser) {
|
|
|
|
| 1217 |
}
|
| 1218 |
|
| 1219 |
function _flushVoicePendingPackets() {
|
| 1220 |
+
if (
|
| 1221 |
+
!voiceWS ||
|
| 1222 |
+
voiceWS.readyState !== WebSocket.OPEN ||
|
| 1223 |
+
!voicePendingPackets.length
|
| 1224 |
+
) {
|
| 1225 |
return;
|
| 1226 |
}
|
| 1227 |
const packets = voicePendingPackets.splice(0);
|
services/tts.py
CHANGED
|
@@ -21,7 +21,7 @@ EDGE_VOICE = "bn-BD-NabanitaNeural"
|
|
| 21 |
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY", "")
|
| 22 |
ELEVENLABS_VOICE_ID = os.getenv("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")
|
| 23 |
ELEVENLABS_MODEL_ID = os.getenv("ELEVENLABS_MODEL_ID", "eleven_multilingual_v2")
|
| 24 |
-
ELEVENLABS_SPEED = float(os.getenv("ELEVENLABS_SPEED", "1.
|
| 25 |
ELEVENLABS_OUTPUT_FORMAT = "mp3_22050_32"
|
| 26 |
ELEVENLABS_STABILITY = 0.45
|
| 27 |
ELEVENLABS_SIMILARITY = 0.80
|
|
@@ -139,7 +139,7 @@ async def _elevenlabs_stream(
|
|
| 139 |
async def text_to_speech_stream(
|
| 140 |
text: str,
|
| 141 |
voice: str | None = None,
|
| 142 |
-
rate: str = "+
|
| 143 |
):
|
| 144 |
"""
|
| 145 |
Stream TTS audio for `text`.
|
|
|
|
| 21 |
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY", "")
|
| 22 |
ELEVENLABS_VOICE_ID = os.getenv("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")
|
| 23 |
ELEVENLABS_MODEL_ID = os.getenv("ELEVENLABS_MODEL_ID", "eleven_multilingual_v2")
|
| 24 |
+
ELEVENLABS_SPEED = float(os.getenv("ELEVENLABS_SPEED", "1.05"))
|
| 25 |
ELEVENLABS_OUTPUT_FORMAT = "mp3_22050_32"
|
| 26 |
ELEVENLABS_STABILITY = 0.45
|
| 27 |
ELEVENLABS_SIMILARITY = 0.80
|
|
|
|
| 139 |
async def text_to_speech_stream(
|
| 140 |
text: str,
|
| 141 |
voice: str | None = None,
|
| 142 |
+
rate: str = "+4%",
|
| 143 |
):
|
| 144 |
"""
|
| 145 |
Stream TTS audio for `text`.
|