HawkEyesAI
/

Voice-AI-Agent

Model card Files Files and versions

xet

Community

rakib72642 committed on 2 days ago

Commit

1ce5806

1 Parent(s): caa1385

voice trigger fixed

Browse files

Files changed (2) hide show

app.py +13 -3
frontend/script.js +33 -9

app.py CHANGED Viewed

@@ -26,6 +26,7 @@ LLM+TTS) preserved.
 import asyncio
 import json
 import os
 import uuid
 from contextlib import asynccontextmanager
 from pathlib import Path
@@ -365,6 +366,7 @@ async def ws_voice(ws: WebSocket):
         # ── LLM + TTS (concurrent) ─────────────────────────────────────────────
         tts_streamer     = ParallelTTSStreamer()
         _active_streamer = tts_streamer
         async def run_llm():
             try:
@@ -382,12 +384,20 @@ async def ws_voice(ws: WebSocket):
             finally:
                 await tts_streamer.flush()
-        async def run_tts():
             async for chunk in tts_streamer.stream_audio():
-                if not await _safe_bytes(ws, chunk):
                     break
-        await asyncio.gather(run_llm(), run_tts(), return_exceptions=True)
         _active_streamer = None
         await _safe_text(ws, {"type": "end"})

 import asyncio
 import json
 import os
+import struct
 import uuid
 from contextlib import asynccontextmanager
 from pathlib import Path
         # ── LLM + TTS (concurrent) ─────────────────────────────────────────────
         tts_streamer     = ParallelTTSStreamer()
         _active_streamer = tts_streamer
+        audio_seq        = 0
         async def run_llm():
             try:
             finally:
                 await tts_streamer.flush()
+        async def run_tts_framed():
+            """
+            Send audio as: 4-byte big-endian sequence id + raw audio bytes.
+            This lets the browser reorder deterministically even if decode
+            completes out-of-order.
+            """
+            nonlocal audio_seq
             async for chunk in tts_streamer.stream_audio():
+                framed = struct.pack(">I", audio_seq) + chunk
+                if not await _safe_bytes(ws, framed):
                     break
+                audio_seq += 1
+        await asyncio.gather(run_llm(), run_tts_framed(), return_exceptions=True)
         _active_streamer = None
         await _safe_text(ws, {"type": "end"})

frontend/script.js CHANGED Viewed

@@ -100,6 +100,8 @@ let _bargeInFiredAt = 0;
 let _dropAudioUntil = 0;
 let _audioChain = Promise.resolve();
 let _playbackGen = 0;
 let brainMode = false;
 let brainVoiceActive = false;
 let brainRestartTimer = null;
@@ -300,16 +302,29 @@ function onVoiceMsg(ev) {
   if (ev.data instanceof ArrayBuffer) {
     if (Date.now() < _dropAudioUntil) return; // drop late packets after cancel
     _ttsPlaying = true;
-    // Ensure decode+schedule happens strictly in arrival order.
-    // decodeAudioData is async and can complete out-of-order otherwise.
     const gen = _playbackGen;
-    _audioChain = _audioChain
-      .catch(() => {})
-      .then(() => {
-        if (gen !== _playbackGen) return;
-        if (_cancelled) return;
-        return enqueueAudio(ev.data);
-      });
     return;
   }
@@ -327,6 +342,9 @@ function onVoiceMsg(ev) {
       break;
     case 'stt':
       tStt = Date.now();
       if (tSend > 0) mStt.textContent = tStt - tSend + ' ms';
       _removeThinking();
@@ -372,6 +390,8 @@ function onVoiceMsg(ev) {
       aiEl = null;
       aiTxt = '';
       _setCaption('');
       if (tSend > 0) mTotal.textContent = Date.now() - tSend + ' ms';
       tSend = tStt = tLlm = tTts = 0;
       isProcessing = false;
@@ -386,6 +406,8 @@ function onVoiceMsg(ev) {
       aiEl = null;
       aiTxt = '';
       _setCaption('');
       _brainSetTtsBubble('', false);
       _brainModeSetSearch(false);
       isProcessing = false;
@@ -581,6 +603,8 @@ function stopAllAudio() {
   _dropAudioUntil = Date.now() + 700;
   _playbackGen++;
   _audioChain = Promise.resolve();
   _stopAllSources();
   clearTimeout(_endTimer);
   _endTimer = null;

 let _dropAudioUntil = 0;
 let _audioChain = Promise.resolve();
 let _playbackGen = 0;
+let _expectedSeq = 0;
+let _pendingAudio = new Map();
 let brainMode = false;
 let brainVoiceActive = false;
 let brainRestartTimer = null;
   if (ev.data instanceof ArrayBuffer) {
     if (Date.now() < _dropAudioUntil) return; // drop late packets after cancel
     _ttsPlaying = true;
+    // Framed audio: 4-byte big-endian seq id + raw audio bytes.
+    // We buffer/reorder by seq so playback always matches text order.
+    const u8 = new Uint8Array(ev.data);
+    if (u8.length <= 4) return;
+    const seq =
+      (u8[0] << 24) | (u8[1] << 16) | (u8[2] << 8) | (u8[3] << 0);
+    const payload = ev.data.slice(4);
+    _pendingAudio.set(seq >>> 0, payload);
     const gen = _playbackGen;
+    while (_pendingAudio.has(_expectedSeq)) {
+      const buf = _pendingAudio.get(_expectedSeq);
+      _pendingAudio.delete(_expectedSeq);
+      const playBuf = buf;
+      _audioChain = _audioChain
+        .catch(() => {})
+        .then(() => {
+          if (gen !== _playbackGen) return;
+          if (_cancelled) return;
+          return enqueueAudio(playBuf);
+        });
+      _expectedSeq++;
+    }
     return;
   }
       break;
     case 'stt':
+      // New turn: reset audio ordering/buffers.
+      _expectedSeq = 0;
+      _pendingAudio.clear();
       tStt = Date.now();
       if (tSend > 0) mStt.textContent = tStt - tSend + ' ms';
       _removeThinking();
       aiEl = null;
       aiTxt = '';
       _setCaption('');
+      _expectedSeq = 0;
+      _pendingAudio.clear();
       if (tSend > 0) mTotal.textContent = Date.now() - tSend + ' ms';
       tSend = tStt = tLlm = tTts = 0;
       isProcessing = false;
       aiEl = null;
       aiTxt = '';
       _setCaption('');
+      _expectedSeq = 0;
+      _pendingAudio.clear();
       _brainSetTtsBubble('', false);
       _brainModeSetSearch(false);
       isProcessing = false;
   _dropAudioUntil = Date.now() + 700;
   _playbackGen++;
   _audioChain = Promise.resolve();
+  _expectedSeq = 0;
+  _pendingAudio.clear();
   _stopAllSources();
   clearTimeout(_endTimer);
   _endTimer = null;