Commit ·
496a69a
1
Parent(s): d4d01e4
checkpoint 3 stable
Browse files- app.py +72 -41
- frontend/script.js +16 -3
app.py
CHANGED
|
@@ -416,6 +416,7 @@ async def ws_voice(ws: WebSocket):
|
|
| 416 |
_utterance_q: asyncio.Queue[object | None] = asyncio.Queue()
|
| 417 |
_worker_task: asyncio.Task | None = None
|
| 418 |
_turn_id: int = 0
|
|
|
|
| 419 |
|
| 420 |
async def _cancel_active():
|
| 421 |
nonlocal _active_streamer, _active_task
|
|
@@ -482,6 +483,7 @@ async def ws_voice(ws: WebSocket):
|
|
| 482 |
async def _handle_utterance(audio_bytes: bytes):
|
| 483 |
nonlocal _active_streamer
|
| 484 |
nonlocal _turn_id
|
|
|
|
| 485 |
|
| 486 |
# ── STT ───────────────────────────────────────────────────────────────
|
| 487 |
transcript = await stt.transcribe(audio_bytes)
|
|
@@ -498,51 +500,78 @@ async def ws_voice(ws: WebSocket):
|
|
| 498 |
if not await _safe_text(ws, {"type": "stt", "text": transcript, "turn": my_turn}):
|
| 499 |
return
|
| 500 |
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
|
|
|
|
| 505 |
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
|
| 515 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 516 |
break
|
| 517 |
-
|
| 518 |
-
raise
|
| 519 |
-
except Exception as exc:
|
| 520 |
-
print(f"[VOICE] LLM error: {exc}")
|
| 521 |
-
finally:
|
| 522 |
-
# Best-effort: send the full text once at the end so the UI can
|
| 523 |
-
# recover if it missed any streamed tokens.
|
| 524 |
-
if full_text:
|
| 525 |
-
await _safe_text(ws, {"type": "llm_full", "text": _normalize_ai_text(full_text), "turn": my_turn})
|
| 526 |
-
# Voice synthesis uses the completed response so TTS gets
|
| 527 |
-
# full sentence context instead of fragmentary token chunks.
|
| 528 |
-
await tts_streamer.add_token(full_text)
|
| 529 |
-
await tts_streamer.flush()
|
| 530 |
|
| 531 |
-
|
| 532 |
-
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 543 |
|
| 544 |
-
await asyncio.gather(run_llm(), run_tts_framed(), return_exceptions=True)
|
| 545 |
-
_active_streamer = None
|
| 546 |
await _safe_text(ws, {"type": "end"})
|
| 547 |
|
| 548 |
async def _utterance_worker():
|
|
@@ -608,6 +637,8 @@ async def ws_voice(ws: WebSocket):
|
|
| 608 |
user_id = claimed
|
| 609 |
await _register_user(user_id)
|
| 610 |
await _safe_text(ws, {"type": "init_ack", "user_id": user_id})
|
|
|
|
|
|
|
| 611 |
elif t == "ping":
|
| 612 |
await _safe_text(ws, {"type": "pong"})
|
| 613 |
elif t == "cancel":
|
|
|
|
| 416 |
_utterance_q: asyncio.Queue[object | None] = asyncio.Queue()
|
| 417 |
_worker_task: asyncio.Task | None = None
|
| 418 |
_turn_id: int = 0
|
| 419 |
+
brain_mode_enabled = False
|
| 420 |
|
| 421 |
async def _cancel_active():
|
| 422 |
nonlocal _active_streamer, _active_task
|
|
|
|
| 483 |
async def _handle_utterance(audio_bytes: bytes):
|
| 484 |
nonlocal _active_streamer
|
| 485 |
nonlocal _turn_id
|
| 486 |
+
nonlocal brain_mode_enabled
|
| 487 |
|
| 488 |
# ── STT ───────────────────────────────────────────────────────────────
|
| 489 |
transcript = await stt.transcribe(audio_bytes)
|
|
|
|
| 500 |
if not await _safe_text(ws, {"type": "stt", "text": transcript, "turn": my_turn}):
|
| 501 |
return
|
| 502 |
|
| 503 |
+
if brain_mode_enabled:
|
| 504 |
+
# Brain mode prioritizes immediacy. Stream tokens and TTS together.
|
| 505 |
+
tts_streamer = ParallelTTSStreamer()
|
| 506 |
+
_active_streamer = tts_streamer
|
| 507 |
+
audio_seq = 0
|
| 508 |
|
| 509 |
+
async def run_llm():
|
| 510 |
+
full_text = ""
|
| 511 |
+
try:
|
| 512 |
+
stream = await ai.main(user_id, transcript)
|
| 513 |
+
async for token in stream:
|
| 514 |
+
if not token:
|
| 515 |
+
continue
|
| 516 |
+
token = _normalize_ai_text(token)
|
| 517 |
+
full_text += token
|
| 518 |
+
if not await _safe_text(ws, {"type": "llm_token", "token": token, "turn": my_turn}):
|
| 519 |
+
break
|
| 520 |
+
except asyncio.CancelledError:
|
| 521 |
+
raise
|
| 522 |
+
except Exception as exc:
|
| 523 |
+
print(f"[VOICE] LLM error: {exc}")
|
| 524 |
+
finally:
|
| 525 |
+
if full_text:
|
| 526 |
+
await _safe_text(ws, {"type": "llm_full", "text": _normalize_ai_text(full_text), "turn": my_turn})
|
| 527 |
+
await tts_streamer.add_token(full_text)
|
| 528 |
+
await tts_streamer.flush()
|
| 529 |
+
|
| 530 |
+
async def run_tts_framed():
|
| 531 |
+
nonlocal audio_seq
|
| 532 |
+
async for chunk in tts_streamer.stream_audio():
|
| 533 |
+
framed = struct.pack(">II", my_turn, audio_seq) + chunk
|
| 534 |
+
if not await _safe_bytes(ws, framed):
|
| 535 |
break
|
| 536 |
+
audio_seq += 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 537 |
|
| 538 |
+
await asyncio.gather(run_llm(), run_tts_framed(), return_exceptions=True)
|
| 539 |
+
_active_streamer = None
|
| 540 |
+
else:
|
| 541 |
+
# Normal mode keeps audio silent until the full response is ready.
|
| 542 |
+
audio_seq = 0
|
| 543 |
+
|
| 544 |
+
async def run_llm():
|
| 545 |
+
full_text = ""
|
| 546 |
+
try:
|
| 547 |
+
stream = await ai.main(user_id, transcript)
|
| 548 |
+
async for token in stream:
|
| 549 |
+
if not token:
|
| 550 |
+
continue
|
| 551 |
+
token = _normalize_ai_text(token)
|
| 552 |
+
full_text += token
|
| 553 |
+
if not await _safe_text(ws, {"type": "llm_token", "token": token, "turn": my_turn}):
|
| 554 |
+
break
|
| 555 |
+
except asyncio.CancelledError:
|
| 556 |
+
raise
|
| 557 |
+
except Exception as exc:
|
| 558 |
+
print(f"[VOICE] LLM error: {exc}")
|
| 559 |
+
return full_text
|
| 560 |
+
|
| 561 |
+
full_text = await run_llm()
|
| 562 |
+
if full_text:
|
| 563 |
+
await _safe_text(ws, {"type": "llm_full", "text": _normalize_ai_text(full_text), "turn": my_turn})
|
| 564 |
+
tts_streamer = ParallelTTSStreamer()
|
| 565 |
+
_active_streamer = tts_streamer
|
| 566 |
+
await tts_streamer.add_token(full_text)
|
| 567 |
+
await tts_streamer.flush()
|
| 568 |
+
async for chunk in tts_streamer.stream_audio():
|
| 569 |
+
framed = struct.pack(">II", my_turn, audio_seq) + chunk
|
| 570 |
+
if not await _safe_bytes(ws, framed):
|
| 571 |
+
break
|
| 572 |
+
audio_seq += 1
|
| 573 |
+
_active_streamer = None
|
| 574 |
|
|
|
|
|
|
|
| 575 |
await _safe_text(ws, {"type": "end"})
|
| 576 |
|
| 577 |
async def _utterance_worker():
|
|
|
|
| 637 |
user_id = claimed
|
| 638 |
await _register_user(user_id)
|
| 639 |
await _safe_text(ws, {"type": "init_ack", "user_id": user_id})
|
| 640 |
+
elif t in ("brain_mode", "mode"):
|
| 641 |
+
brain_mode_enabled = bool(msg.get("enabled", False))
|
| 642 |
elif t == "ping":
|
| 643 |
await _safe_text(ws, {"type": "pong"})
|
| 644 |
elif t == "cancel":
|
frontend/script.js
CHANGED
|
@@ -675,7 +675,7 @@ function _done() {
|
|
| 675 |
return;
|
| 676 |
}
|
| 677 |
_brainResumeListening();
|
| 678 |
-
},
|
| 679 |
}
|
| 680 |
console.log('[Voice] Idle — ready for next manual press');
|
| 681 |
}
|
|
@@ -1240,6 +1240,7 @@ function setBrainMode(on) {
|
|
| 1240 |
brainBtn.setAttribute('aria-pressed', String(brainMode));
|
| 1241 |
if (brainStage) brainStage.setAttribute('aria-hidden', String(!brainMode));
|
| 1242 |
if (voiceCaption) voiceCaption.textContent = '';
|
|
|
|
| 1243 |
if (brainMode) {
|
| 1244 |
brainBubbleSttText.textContent = 'Listening…';
|
| 1245 |
brainBubbleTtsText.textContent =
|
|
@@ -1259,7 +1260,7 @@ function setBrainMode(on) {
|
|
| 1259 |
if (!brainMode || !brainVoiceActive) return;
|
| 1260 |
if (isProcessing || isSpeaking || _ttsPlaying) return;
|
| 1261 |
_brainSendWelcome();
|
| 1262 |
-
},
|
| 1263 |
}
|
| 1264 |
if (!isListening && !isProcessing && !isRecordingLocked) {
|
| 1265 |
setTimeout(() => {
|
|
@@ -1272,7 +1273,7 @@ function setBrainMode(on) {
|
|
| 1272 |
) {
|
| 1273 |
_brainResumeListening();
|
| 1274 |
}
|
| 1275 |
-
},
|
| 1276 |
}
|
| 1277 |
} else {
|
| 1278 |
brainVoiceActive = false;
|
|
@@ -1287,6 +1288,18 @@ function setBrainMode(on) {
|
|
| 1287 |
}
|
| 1288 |
}
|
| 1289 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1290 |
function _brainSendWelcome() {
|
| 1291 |
const payload = JSON.stringify({ type: 'speak', text: BRAIN_WELCOME_TEXT });
|
| 1292 |
if (!voiceWS || voiceWS.readyState !== WebSocket.OPEN) {
|
|
|
|
| 675 |
return;
|
| 676 |
}
|
| 677 |
_brainResumeListening();
|
| 678 |
+
}, 0);
|
| 679 |
}
|
| 680 |
console.log('[Voice] Idle — ready for next manual press');
|
| 681 |
}
|
|
|
|
| 1240 |
brainBtn.setAttribute('aria-pressed', String(brainMode));
|
| 1241 |
if (brainStage) brainStage.setAttribute('aria-hidden', String(!brainMode));
|
| 1242 |
if (voiceCaption) voiceCaption.textContent = '';
|
| 1243 |
+
_sendVoiceControl({ type: 'brain_mode', enabled: brainMode });
|
| 1244 |
if (brainMode) {
|
| 1245 |
brainBubbleSttText.textContent = 'Listening…';
|
| 1246 |
brainBubbleTtsText.textContent =
|
|
|
|
| 1260 |
if (!brainMode || !brainVoiceActive) return;
|
| 1261 |
if (isProcessing || isSpeaking || _ttsPlaying) return;
|
| 1262 |
_brainSendWelcome();
|
| 1263 |
+
}, 0);
|
| 1264 |
}
|
| 1265 |
if (!isListening && !isProcessing && !isRecordingLocked) {
|
| 1266 |
setTimeout(() => {
|
|
|
|
| 1273 |
) {
|
| 1274 |
_brainResumeListening();
|
| 1275 |
}
|
| 1276 |
+
}, 0);
|
| 1277 |
}
|
| 1278 |
} else {
|
| 1279 |
brainVoiceActive = false;
|
|
|
|
| 1288 |
}
|
| 1289 |
}
|
| 1290 |
|
| 1291 |
+
function _sendVoiceControl(payload) {
|
| 1292 |
+
const packet = JSON.stringify(payload);
|
| 1293 |
+
if (voiceWS && voiceWS.readyState === WebSocket.OPEN) {
|
| 1294 |
+
try {
|
| 1295 |
+
voiceWS.send(packet);
|
| 1296 |
+
return;
|
| 1297 |
+
} catch {}
|
| 1298 |
+
}
|
| 1299 |
+
voicePendingPackets.push(packet);
|
| 1300 |
+
_connectVoice();
|
| 1301 |
+
}
|
| 1302 |
+
|
| 1303 |
function _brainSendWelcome() {
|
| 1304 |
const payload = JSON.stringify({ type: 'speak', text: BRAIN_WELCOME_TEXT });
|
| 1305 |
if (!voiceWS || voiceWS.readyState !== WebSocket.OPEN) {
|