HawkEyesAI
/

Voice-AI-Agent

Model card · Files and versions

xet

Community

rakib72642 committed about 13 hours ago

Commit

496a69a

1 Parent(s): d4d01e4

checkpoint 3 stable

Browse files

Files changed (2) hide show

app.py +72 -41
frontend/script.js +16 -3

app.py CHANGED Viewed

@@ -416,6 +416,7 @@ async def ws_voice(ws: WebSocket):
     _utterance_q: asyncio.Queue[object | None] = asyncio.Queue()
     _worker_task: asyncio.Task | None = None
     _turn_id: int = 0
     async def _cancel_active():
         nonlocal _active_streamer, _active_task
@@ -482,6 +483,7 @@ async def ws_voice(ws: WebSocket):
     async def _handle_utterance(audio_bytes: bytes):
         nonlocal _active_streamer
         nonlocal _turn_id
         # ── STT ───────────────────────────────────────────────────────────────
         transcript = await stt.transcribe(audio_bytes)
@@ -498,51 +500,78 @@ async def ws_voice(ws: WebSocket):
         if not await _safe_text(ws, {"type": "stt", "text": transcript, "turn": my_turn}):
             return
-        # ── LLM + TTS (concurrent) ─────────────────────────────────────────────
-        tts_streamer     = ParallelTTSStreamer()
-        _active_streamer = tts_streamer
-        audio_seq        = 0
-        async def run_llm():
-            full_text = ""
-            try:
-                stream = await ai.main(user_id, transcript)
-                async for token in stream:
-                    if not token:
-                        continue
-                    token = _normalize_ai_text(token)
-                    full_text += token
-                    if not await _safe_text(ws, {"type": "llm_token", "token": token, "turn": my_turn}):
                         break
-            except asyncio.CancelledError:
-                raise
-            except Exception as exc:
-                print(f"[VOICE] LLM error: {exc}")
-            finally:
-                # Best-effort: send the full text once at the end so the UI can
-                # recover if it missed any streamed tokens.
-                if full_text:
-                    await _safe_text(ws, {"type": "llm_full", "text": _normalize_ai_text(full_text), "turn": my_turn})
-                    # Voice synthesis uses the completed response so TTS gets
-                    # full sentence context instead of fragmentary token chunks.
-                    await tts_streamer.add_token(full_text)
-                await tts_streamer.flush()
-        async def run_tts_framed():
-            """
-            Send audio as: 4-byte big-endian sequence id + raw audio bytes.
-            This lets the browser reorder deterministically even if decode
-            completes out-of-order.
-            """
-            nonlocal audio_seq
-            async for chunk in tts_streamer.stream_audio():
-                framed = struct.pack(">II", my_turn, audio_seq) + chunk
-                if not await _safe_bytes(ws, framed):
-                    break
-                audio_seq += 1
-        await asyncio.gather(run_llm(), run_tts_framed(), return_exceptions=True)
-        _active_streamer = None
         await _safe_text(ws, {"type": "end"})
     async def _utterance_worker():
@@ -608,6 +637,8 @@ async def ws_voice(ws: WebSocket):
                             user_id = claimed
                             await _register_user(user_id)
                         await _safe_text(ws, {"type": "init_ack", "user_id": user_id})
                     elif t == "ping":
                         await _safe_text(ws, {"type": "pong"})
                     elif t == "cancel":

     _utterance_q: asyncio.Queue[object | None] = asyncio.Queue()
     _worker_task: asyncio.Task | None = None
     _turn_id: int = 0
+    brain_mode_enabled = False
     async def _cancel_active():
         nonlocal _active_streamer, _active_task
     async def _handle_utterance(audio_bytes: bytes):
         nonlocal _active_streamer
         nonlocal _turn_id
+        nonlocal brain_mode_enabled
         # ── STT ───────────────────────────────────────────────────────────────
         transcript = await stt.transcribe(audio_bytes)
         if not await _safe_text(ws, {"type": "stt", "text": transcript, "turn": my_turn}):
             return
+        if brain_mode_enabled:
+            # Brain mode prioritizes immediacy. Stream tokens and TTS together.
+            tts_streamer = ParallelTTSStreamer()
+            _active_streamer = tts_streamer
+            audio_seq = 0
+            async def run_llm():
+                full_text = ""
+                try:
+                    stream = await ai.main(user_id, transcript)
+                    async for token in stream:
+                        if not token:
+                            continue
+                        token = _normalize_ai_text(token)
+                        full_text += token
+                        if not await _safe_text(ws, {"type": "llm_token", "token": token, "turn": my_turn}):
+                            break
+                except asyncio.CancelledError:
+                    raise
+                except Exception as exc:
+                    print(f"[VOICE] LLM error: {exc}")
+                finally:
+                    if full_text:
+                        await _safe_text(ws, {"type": "llm_full", "text": _normalize_ai_text(full_text), "turn": my_turn})
+                        await tts_streamer.add_token(full_text)
+                    await tts_streamer.flush()
+            async def run_tts_framed():
+                nonlocal audio_seq
+                async for chunk in tts_streamer.stream_audio():
+                    framed = struct.pack(">II", my_turn, audio_seq) + chunk
+                    if not await _safe_bytes(ws, framed):
                         break
+                    audio_seq += 1
+            await asyncio.gather(run_llm(), run_tts_framed(), return_exceptions=True)
+            _active_streamer = None
+        else:
+            # Normal mode keeps audio silent until the full response is ready.
+            audio_seq = 0
+            async def run_llm():
+                full_text = ""
+                try:
+                    stream = await ai.main(user_id, transcript)
+                    async for token in stream:
+                        if not token:
+                            continue
+                        token = _normalize_ai_text(token)
+                        full_text += token
+                        if not await _safe_text(ws, {"type": "llm_token", "token": token, "turn": my_turn}):
+                            break
+                except asyncio.CancelledError:
+                    raise
+                except Exception as exc:
+                    print(f"[VOICE] LLM error: {exc}")
+                return full_text
+            full_text = await run_llm()
+            if full_text:
+                await _safe_text(ws, {"type": "llm_full", "text": _normalize_ai_text(full_text), "turn": my_turn})
+                tts_streamer = ParallelTTSStreamer()
+                _active_streamer = tts_streamer
+                await tts_streamer.add_token(full_text)
+                await tts_streamer.flush()
+                async for chunk in tts_streamer.stream_audio():
+                    framed = struct.pack(">II", my_turn, audio_seq) + chunk
+                    if not await _safe_bytes(ws, framed):
+                        break
+                    audio_seq += 1
+            _active_streamer = None
         await _safe_text(ws, {"type": "end"})
     async def _utterance_worker():
                             user_id = claimed
                             await _register_user(user_id)
                         await _safe_text(ws, {"type": "init_ack", "user_id": user_id})
+                    elif t in ("brain_mode", "mode"):
+                        brain_mode_enabled = bool(msg.get("enabled", False))
                     elif t == "ping":
                         await _safe_text(ws, {"type": "pong"})
                     elif t == "cancel":

frontend/script.js CHANGED Viewed

@@ -675,7 +675,7 @@ function _done() {
         return;
       }
       _brainResumeListening();
-    }, 180);
   }
   console.log('[Voice] Idle — ready for next manual press');
 }
@@ -1240,6 +1240,7 @@ function setBrainMode(on) {
   brainBtn.setAttribute('aria-pressed', String(brainMode));
   if (brainStage) brainStage.setAttribute('aria-hidden', String(!brainMode));
   if (voiceCaption) voiceCaption.textContent = '';
   if (brainMode) {
     brainBubbleSttText.textContent = 'Listening…';
     brainBubbleTtsText.textContent =
@@ -1259,7 +1260,7 @@ function setBrainMode(on) {
         if (!brainMode || !brainVoiceActive) return;
         if (isProcessing || isSpeaking || _ttsPlaying) return;
         _brainSendWelcome();
-      }, 220);
     }
     if (!isListening && !isProcessing && !isRecordingLocked) {
       setTimeout(() => {
@@ -1272,7 +1273,7 @@ function setBrainMode(on) {
         ) {
           _brainResumeListening();
         }
-      }, 180);
     }
   } else {
     brainVoiceActive = false;
@@ -1287,6 +1288,18 @@ function setBrainMode(on) {
   }
 }
 function _brainSendWelcome() {
   const payload = JSON.stringify({ type: 'speak', text: BRAIN_WELCOME_TEXT });
   if (!voiceWS || voiceWS.readyState !== WebSocket.OPEN) {

         return;
       }
       _brainResumeListening();
+    }, 0);
   }
   console.log('[Voice] Idle — ready for next manual press');
 }
   brainBtn.setAttribute('aria-pressed', String(brainMode));
   if (brainStage) brainStage.setAttribute('aria-hidden', String(!brainMode));
   if (voiceCaption) voiceCaption.textContent = '';
+  _sendVoiceControl({ type: 'brain_mode', enabled: brainMode });
   if (brainMode) {
     brainBubbleSttText.textContent = 'Listening…';
     brainBubbleTtsText.textContent =
         if (!brainMode || !brainVoiceActive) return;
         if (isProcessing || isSpeaking || _ttsPlaying) return;
         _brainSendWelcome();
+      }, 0);
     }
     if (!isListening && !isProcessing && !isRecordingLocked) {
       setTimeout(() => {
         ) {
           _brainResumeListening();
         }
+      }, 0);
     }
   } else {
     brainVoiceActive = false;
   }
 }
+/**
+ * Send a control message over the voice WebSocket, queueing it if it cannot
+ * be sent right now.
+ *
+ * The payload is serialized to JSON once. If `voiceWS` exists and is OPEN the
+ * packet is sent immediately and the function returns. Otherwise, or if
+ * `send()` throws, the packet is appended to `voicePendingPackets` and
+ * `_connectVoice()` is called.
+ *
+ * NOTE(review): presumably `_connectVoice()` flushes `voicePendingPackets`
+ * once the socket opens; that is not visible here, so confirm.
+ *
+ * @param {object} payload - JSON-serializable control message, e.g.
+ *   `{ type: 'brain_mode', enabled: true }`.
+ */
+function _sendVoiceControl(payload) {
+  const packet = JSON.stringify(payload);
+  if (voiceWS && voiceWS.readyState === WebSocket.OPEN) {
+    // A send failure is swallowed on purpose: control falls through to the
+    // queue-and-reconnect path below instead of surfacing the error.
+    try {
+      voiceWS.send(packet);
+      return;
+    } catch {}
+  }
+  // Socket missing, not open, or send failed: keep the packet and reconnect.
+  voicePendingPackets.push(packet);
+  _connectVoice();
+}
 function _brainSendWelcome() {
   const payload = JSON.stringify({ type: 'speak', text: BRAIN_WELCOME_TEXT });
   if (!voiceWS || voiceWS.readyState !== WebSocket.OPEN) {