Commit ·
1ce5806
1
Parent(s): caa1385
voice trigger fixed
Browse files
- app.py +13 -3
- frontend/script.js +33 -9
app.py
CHANGED
|
@@ -26,6 +26,7 @@ LLM+TTS) preserved.
|
|
| 26 |
import asyncio
|
| 27 |
import json
|
| 28 |
import os
|
|
|
|
| 29 |
import uuid
|
| 30 |
from contextlib import asynccontextmanager
|
| 31 |
from pathlib import Path
|
|
@@ -365,6 +366,7 @@ async def ws_voice(ws: WebSocket):
|
|
| 365 |
# ── LLM + TTS (concurrent) ─────────────────────────────────────────────
|
| 366 |
tts_streamer = ParallelTTSStreamer()
|
| 367 |
_active_streamer = tts_streamer
|
|
|
|
| 368 |
|
| 369 |
async def run_llm():
|
| 370 |
try:
|
|
@@ -382,12 +384,20 @@ async def ws_voice(ws: WebSocket):
|
|
| 382 |
finally:
|
| 383 |
await tts_streamer.flush()
|
| 384 |
|
| 385 |
-
async def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 386 |
async for chunk in tts_streamer.stream_audio():
|
| 387 |
-
|
|
|
|
| 388 |
break
|
|
|
|
| 389 |
|
| 390 |
-
await asyncio.gather(run_llm(),
|
| 391 |
_active_streamer = None
|
| 392 |
await _safe_text(ws, {"type": "end"})
|
| 393 |
|
|
|
|
| 26 |
import asyncio
|
| 27 |
import json
|
| 28 |
import os
|
| 29 |
+
import struct
|
| 30 |
import uuid
|
| 31 |
from contextlib import asynccontextmanager
|
| 32 |
from pathlib import Path
|
|
|
|
| 366 |
# ── LLM + TTS (concurrent) ─────────────────────────────────────────────
|
| 367 |
tts_streamer = ParallelTTSStreamer()
|
| 368 |
_active_streamer = tts_streamer
|
| 369 |
+
audio_seq = 0
|
| 370 |
|
| 371 |
async def run_llm():
|
| 372 |
try:
|
|
|
|
| 384 |
finally:
|
| 385 |
await tts_streamer.flush()
|
| 386 |
|
| 387 |
+
async def run_tts_framed():
|
| 388 |
+
"""
|
| 389 |
+
Send audio as: 4-byte big-endian sequence id + raw audio bytes.
|
| 390 |
+
This lets the browser reorder deterministically even if decode
|
| 391 |
+
completes out-of-order.
|
| 392 |
+
"""
|
| 393 |
+
nonlocal audio_seq
|
| 394 |
async for chunk in tts_streamer.stream_audio():
|
| 395 |
+
framed = struct.pack(">I", audio_seq) + chunk
|
| 396 |
+
if not await _safe_bytes(ws, framed):
|
| 397 |
break
|
| 398 |
+
audio_seq += 1
|
| 399 |
|
| 400 |
+
await asyncio.gather(run_llm(), run_tts_framed(), return_exceptions=True)
|
| 401 |
_active_streamer = None
|
| 402 |
await _safe_text(ws, {"type": "end"})
|
| 403 |
|
frontend/script.js
CHANGED
|
@@ -100,6 +100,8 @@ let _bargeInFiredAt = 0;
|
|
| 100 |
let _dropAudioUntil = 0;
|
| 101 |
let _audioChain = Promise.resolve();
|
| 102 |
let _playbackGen = 0;
|
|
|
|
|
|
|
| 103 |
let brainMode = false;
|
| 104 |
let brainVoiceActive = false;
|
| 105 |
let brainRestartTimer = null;
|
|
@@ -300,16 +302,29 @@ function onVoiceMsg(ev) {
|
|
| 300 |
if (ev.data instanceof ArrayBuffer) {
|
| 301 |
if (Date.now() < _dropAudioUntil) return; // drop late packets after cancel
|
| 302 |
_ttsPlaying = true;
|
| 303 |
-
//
|
| 304 |
-
//
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 305 |
const gen = _playbackGen;
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
.
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 313 |
return;
|
| 314 |
}
|
| 315 |
|
|
@@ -327,6 +342,9 @@ function onVoiceMsg(ev) {
|
|
| 327 |
break;
|
| 328 |
|
| 329 |
case 'stt':
|
|
|
|
|
|
|
|
|
|
| 330 |
tStt = Date.now();
|
| 331 |
if (tSend > 0) mStt.textContent = tStt - tSend + ' ms';
|
| 332 |
_removeThinking();
|
|
@@ -372,6 +390,8 @@ function onVoiceMsg(ev) {
|
|
| 372 |
aiEl = null;
|
| 373 |
aiTxt = '';
|
| 374 |
_setCaption('');
|
|
|
|
|
|
|
| 375 |
if (tSend > 0) mTotal.textContent = Date.now() - tSend + ' ms';
|
| 376 |
tSend = tStt = tLlm = tTts = 0;
|
| 377 |
isProcessing = false;
|
|
@@ -386,6 +406,8 @@ function onVoiceMsg(ev) {
|
|
| 386 |
aiEl = null;
|
| 387 |
aiTxt = '';
|
| 388 |
_setCaption('');
|
|
|
|
|
|
|
| 389 |
_brainSetTtsBubble('', false);
|
| 390 |
_brainModeSetSearch(false);
|
| 391 |
isProcessing = false;
|
|
@@ -581,6 +603,8 @@ function stopAllAudio() {
|
|
| 581 |
_dropAudioUntil = Date.now() + 700;
|
| 582 |
_playbackGen++;
|
| 583 |
_audioChain = Promise.resolve();
|
|
|
|
|
|
|
| 584 |
_stopAllSources();
|
| 585 |
clearTimeout(_endTimer);
|
| 586 |
_endTimer = null;
|
|
|
|
| 100 |
let _dropAudioUntil = 0;
|
| 101 |
let _audioChain = Promise.resolve();
|
| 102 |
let _playbackGen = 0;
|
| 103 |
+
let _expectedSeq = 0;
|
| 104 |
+
let _pendingAudio = new Map();
|
| 105 |
let brainMode = false;
|
| 106 |
let brainVoiceActive = false;
|
| 107 |
let brainRestartTimer = null;
|
|
|
|
| 302 |
if (ev.data instanceof ArrayBuffer) {
|
| 303 |
if (Date.now() < _dropAudioUntil) return; // drop late packets after cancel
|
| 304 |
_ttsPlaying = true;
|
| 305 |
+
// Framed audio: 4-byte big-endian seq id + raw audio bytes.
|
| 306 |
+
// We buffer/reorder by seq so playback always matches text order.
|
| 307 |
+
const u8 = new Uint8Array(ev.data);
|
| 308 |
+
if (u8.length <= 4) return;
|
| 309 |
+
const seq =
|
| 310 |
+
(u8[0] << 24) | (u8[1] << 16) | (u8[2] << 8) | (u8[3] << 0);
|
| 311 |
+
const payload = ev.data.slice(4);
|
| 312 |
+
_pendingAudio.set(seq >>> 0, payload);
|
| 313 |
+
|
| 314 |
const gen = _playbackGen;
|
| 315 |
+
while (_pendingAudio.has(_expectedSeq)) {
|
| 316 |
+
const buf = _pendingAudio.get(_expectedSeq);
|
| 317 |
+
_pendingAudio.delete(_expectedSeq);
|
| 318 |
+
const playBuf = buf;
|
| 319 |
+
_audioChain = _audioChain
|
| 320 |
+
.catch(() => {})
|
| 321 |
+
.then(() => {
|
| 322 |
+
if (gen !== _playbackGen) return;
|
| 323 |
+
if (_cancelled) return;
|
| 324 |
+
return enqueueAudio(playBuf);
|
| 325 |
+
});
|
| 326 |
+
_expectedSeq++;
|
| 327 |
+
}
|
| 328 |
return;
|
| 329 |
}
|
| 330 |
|
|
|
|
| 342 |
break;
|
| 343 |
|
| 344 |
case 'stt':
|
| 345 |
+
// New turn: reset audio ordering/buffers.
|
| 346 |
+
_expectedSeq = 0;
|
| 347 |
+
_pendingAudio.clear();
|
| 348 |
tStt = Date.now();
|
| 349 |
if (tSend > 0) mStt.textContent = tStt - tSend + ' ms';
|
| 350 |
_removeThinking();
|
|
|
|
| 390 |
aiEl = null;
|
| 391 |
aiTxt = '';
|
| 392 |
_setCaption('');
|
| 393 |
+
_expectedSeq = 0;
|
| 394 |
+
_pendingAudio.clear();
|
| 395 |
if (tSend > 0) mTotal.textContent = Date.now() - tSend + ' ms';
|
| 396 |
tSend = tStt = tLlm = tTts = 0;
|
| 397 |
isProcessing = false;
|
|
|
|
| 406 |
aiEl = null;
|
| 407 |
aiTxt = '';
|
| 408 |
_setCaption('');
|
| 409 |
+
_expectedSeq = 0;
|
| 410 |
+
_pendingAudio.clear();
|
| 411 |
_brainSetTtsBubble('', false);
|
| 412 |
_brainModeSetSearch(false);
|
| 413 |
isProcessing = false;
|
|
|
|
| 603 |
_dropAudioUntil = Date.now() + 700;
|
| 604 |
_playbackGen++;
|
| 605 |
_audioChain = Promise.resolve();
|
| 606 |
+
_expectedSeq = 0;
|
| 607 |
+
_pendingAudio.clear();
|
| 608 |
_stopAllSources();
|
| 609 |
clearTimeout(_endTimer);
|
| 610 |
_endTimer = null;
|