rakib72642 committed on
Commit
1ce5806
·
1 Parent(s): caa1385

voice trigger fixed

Browse files
Files changed (2) hide show
  1. app.py +13 -3
  2. frontend/script.js +33 -9
app.py CHANGED
@@ -26,6 +26,7 @@ LLM+TTS) preserved.
26
  import asyncio
27
  import json
28
  import os
 
29
  import uuid
30
  from contextlib import asynccontextmanager
31
  from pathlib import Path
@@ -365,6 +366,7 @@ async def ws_voice(ws: WebSocket):
365
  # ── LLM + TTS (concurrent) ─────────────────────────────────────────────
366
  tts_streamer = ParallelTTSStreamer()
367
  _active_streamer = tts_streamer
 
368
 
369
  async def run_llm():
370
  try:
@@ -382,12 +384,20 @@ async def ws_voice(ws: WebSocket):
382
  finally:
383
  await tts_streamer.flush()
384
 
385
- async def run_tts():
 
 
 
 
 
 
386
  async for chunk in tts_streamer.stream_audio():
387
- if not await _safe_bytes(ws, chunk):
 
388
  break
 
389
 
390
- await asyncio.gather(run_llm(), run_tts(), return_exceptions=True)
391
  _active_streamer = None
392
  await _safe_text(ws, {"type": "end"})
393
 
 
26
  import asyncio
27
  import json
28
  import os
29
+ import struct
30
  import uuid
31
  from contextlib import asynccontextmanager
32
  from pathlib import Path
 
366
  # ── LLM + TTS (concurrent) ─────────────────────────────────────────────
367
  tts_streamer = ParallelTTSStreamer()
368
  _active_streamer = tts_streamer
369
+ audio_seq = 0
370
 
371
  async def run_llm():
372
  try:
 
384
  finally:
385
  await tts_streamer.flush()
386
 
387
+ async def run_tts_framed():
388
+ """
389
+ Send audio as: 4-byte big-endian sequence id + raw audio bytes.
390
+ This lets the browser reorder deterministically even if decode
391
+ completes out-of-order.
392
+ """
393
+ nonlocal audio_seq
394
  async for chunk in tts_streamer.stream_audio():
395
+ framed = struct.pack(">I", audio_seq) + chunk
396
+ if not await _safe_bytes(ws, framed):
397
  break
398
+ audio_seq += 1
399
 
400
+ await asyncio.gather(run_llm(), run_tts_framed(), return_exceptions=True)
401
  _active_streamer = None
402
  await _safe_text(ws, {"type": "end"})
403
 
frontend/script.js CHANGED
@@ -100,6 +100,8 @@ let _bargeInFiredAt = 0;
100
  let _dropAudioUntil = 0;
101
  let _audioChain = Promise.resolve();
102
  let _playbackGen = 0;
 
 
103
  let brainMode = false;
104
  let brainVoiceActive = false;
105
  let brainRestartTimer = null;
@@ -300,16 +302,29 @@ function onVoiceMsg(ev) {
300
  if (ev.data instanceof ArrayBuffer) {
301
  if (Date.now() < _dropAudioUntil) return; // drop late packets after cancel
302
  _ttsPlaying = true;
303
- // Ensure decode+schedule happens strictly in arrival order.
304
- // decodeAudioData is async and can complete out-of-order otherwise.
 
 
 
 
 
 
 
305
  const gen = _playbackGen;
306
- _audioChain = _audioChain
307
- .catch(() => {})
308
- .then(() => {
309
- if (gen !== _playbackGen) return;
310
- if (_cancelled) return;
311
- return enqueueAudio(ev.data);
312
- });
 
 
 
 
 
 
313
  return;
314
  }
315
 
@@ -327,6 +342,9 @@ function onVoiceMsg(ev) {
327
  break;
328
 
329
  case 'stt':
 
 
 
330
  tStt = Date.now();
331
  if (tSend > 0) mStt.textContent = tStt - tSend + ' ms';
332
  _removeThinking();
@@ -372,6 +390,8 @@ function onVoiceMsg(ev) {
372
  aiEl = null;
373
  aiTxt = '';
374
  _setCaption('');
 
 
375
  if (tSend > 0) mTotal.textContent = Date.now() - tSend + ' ms';
376
  tSend = tStt = tLlm = tTts = 0;
377
  isProcessing = false;
@@ -386,6 +406,8 @@ function onVoiceMsg(ev) {
386
  aiEl = null;
387
  aiTxt = '';
388
  _setCaption('');
 
 
389
  _brainSetTtsBubble('', false);
390
  _brainModeSetSearch(false);
391
  isProcessing = false;
@@ -581,6 +603,8 @@ function stopAllAudio() {
581
  _dropAudioUntil = Date.now() + 700;
582
  _playbackGen++;
583
  _audioChain = Promise.resolve();
 
 
584
  _stopAllSources();
585
  clearTimeout(_endTimer);
586
  _endTimer = null;
 
100
  let _dropAudioUntil = 0;
101
  let _audioChain = Promise.resolve();
102
  let _playbackGen = 0;
103
+ let _expectedSeq = 0;
104
+ let _pendingAudio = new Map();
105
  let brainMode = false;
106
  let brainVoiceActive = false;
107
  let brainRestartTimer = null;
 
302
  if (ev.data instanceof ArrayBuffer) {
303
  if (Date.now() < _dropAudioUntil) return; // drop late packets after cancel
304
  _ttsPlaying = true;
305
+ // Framed audio: 4-byte big-endian seq id + raw audio bytes.
306
+ // We buffer/reorder by seq so playback always matches text order.
307
+ const u8 = new Uint8Array(ev.data);
308
+ if (u8.length <= 4) return;
309
+ const seq =
310
+ (u8[0] << 24) | (u8[1] << 16) | (u8[2] << 8) | (u8[3] << 0);
311
+ const payload = ev.data.slice(4);
312
+ _pendingAudio.set(seq >>> 0, payload);
313
+
314
  const gen = _playbackGen;
315
+ while (_pendingAudio.has(_expectedSeq)) {
316
+ const buf = _pendingAudio.get(_expectedSeq);
317
+ _pendingAudio.delete(_expectedSeq);
318
+ const playBuf = buf;
319
+ _audioChain = _audioChain
320
+ .catch(() => {})
321
+ .then(() => {
322
+ if (gen !== _playbackGen) return;
323
+ if (_cancelled) return;
324
+ return enqueueAudio(playBuf);
325
+ });
326
+ _expectedSeq++;
327
+ }
328
  return;
329
  }
330
 
 
342
  break;
343
 
344
  case 'stt':
345
+ // New turn: reset audio ordering/buffers.
346
+ _expectedSeq = 0;
347
+ _pendingAudio.clear();
348
  tStt = Date.now();
349
  if (tSend > 0) mStt.textContent = tStt - tSend + ' ms';
350
  _removeThinking();
 
390
  aiEl = null;
391
  aiTxt = '';
392
  _setCaption('');
393
+ _expectedSeq = 0;
394
+ _pendingAudio.clear();
395
  if (tSend > 0) mTotal.textContent = Date.now() - tSend + ' ms';
396
  tSend = tStt = tLlm = tTts = 0;
397
  isProcessing = false;
 
406
  aiEl = null;
407
  aiTxt = '';
408
  _setCaption('');
409
+ _expectedSeq = 0;
410
+ _pendingAudio.clear();
411
  _brainSetTtsBubble('', false);
412
  _brainModeSetSearch(false);
413
  isProcessing = false;
 
603
  _dropAudioUntil = Date.now() + 700;
604
  _playbackGen++;
605
  _audioChain = Promise.resolve();
606
+ _expectedSeq = 0;
607
+ _pendingAudio.clear();
608
  _stopAllSources();
609
  clearTimeout(_endTimer);
610
  _endTimer = null;