rakib72642 committed on
Commit
496a69a
·
1 Parent(s): d4d01e4

checkpoint 3 stable

Browse files
Files changed (2) hide show
  1. app.py +72 -41
  2. frontend/script.js +16 -3
app.py CHANGED
@@ -416,6 +416,7 @@ async def ws_voice(ws: WebSocket):
416
  _utterance_q: asyncio.Queue[object | None] = asyncio.Queue()
417
  _worker_task: asyncio.Task | None = None
418
  _turn_id: int = 0
 
419
 
420
  async def _cancel_active():
421
  nonlocal _active_streamer, _active_task
@@ -482,6 +483,7 @@ async def ws_voice(ws: WebSocket):
482
  async def _handle_utterance(audio_bytes: bytes):
483
  nonlocal _active_streamer
484
  nonlocal _turn_id
 
485
 
486
  # ── STT ───────────────────────────────────────────────────────────────
487
  transcript = await stt.transcribe(audio_bytes)
@@ -498,51 +500,78 @@ async def ws_voice(ws: WebSocket):
498
  if not await _safe_text(ws, {"type": "stt", "text": transcript, "turn": my_turn}):
499
  return
500
 
501
- # ── LLM + TTS (concurrent) ─────────────────────────────────────────────
502
- tts_streamer = ParallelTTSStreamer()
503
- _active_streamer = tts_streamer
504
- audio_seq = 0
 
505
 
506
- async def run_llm():
507
- full_text = ""
508
- try:
509
- stream = await ai.main(user_id, transcript)
510
- async for token in stream:
511
- if not token:
512
- continue
513
- token = _normalize_ai_text(token)
514
- full_text += token
515
- if not await _safe_text(ws, {"type": "llm_token", "token": token, "turn": my_turn}):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
516
  break
517
- except asyncio.CancelledError:
518
- raise
519
- except Exception as exc:
520
- print(f"[VOICE] LLM error: {exc}")
521
- finally:
522
- # Best-effort: send the full text once at the end so the UI can
523
- # recover if it missed any streamed tokens.
524
- if full_text:
525
- await _safe_text(ws, {"type": "llm_full", "text": _normalize_ai_text(full_text), "turn": my_turn})
526
- # Voice synthesis uses the completed response so TTS gets
527
- # full sentence context instead of fragmentary token chunks.
528
- await tts_streamer.add_token(full_text)
529
- await tts_streamer.flush()
530
 
531
- async def run_tts_framed():
532
- """
533
- Send audio as: 4-byte big-endian sequence id + raw audio bytes.
534
- This lets the browser reorder deterministically even if decode
535
- completes out-of-order.
536
- """
537
- nonlocal audio_seq
538
- async for chunk in tts_streamer.stream_audio():
539
- framed = struct.pack(">II", my_turn, audio_seq) + chunk
540
- if not await _safe_bytes(ws, framed):
541
- break
542
- audio_seq += 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
543
 
544
- await asyncio.gather(run_llm(), run_tts_framed(), return_exceptions=True)
545
- _active_streamer = None
546
  await _safe_text(ws, {"type": "end"})
547
 
548
  async def _utterance_worker():
@@ -608,6 +637,8 @@ async def ws_voice(ws: WebSocket):
608
  user_id = claimed
609
  await _register_user(user_id)
610
  await _safe_text(ws, {"type": "init_ack", "user_id": user_id})
 
 
611
  elif t == "ping":
612
  await _safe_text(ws, {"type": "pong"})
613
  elif t == "cancel":
 
416
  _utterance_q: asyncio.Queue[object | None] = asyncio.Queue()
417
  _worker_task: asyncio.Task | None = None
418
  _turn_id: int = 0
419
+ brain_mode_enabled = False
420
 
421
  async def _cancel_active():
422
  nonlocal _active_streamer, _active_task
 
483
  async def _handle_utterance(audio_bytes: bytes):
484
  nonlocal _active_streamer
485
  nonlocal _turn_id
486
+ nonlocal brain_mode_enabled
487
 
488
  # ── STT ───────────────────────────────────────────────────────────────
489
  transcript = await stt.transcribe(audio_bytes)
 
500
  if not await _safe_text(ws, {"type": "stt", "text": transcript, "turn": my_turn}):
501
  return
502
 
503
+ if brain_mode_enabled:
504
+ # Brain mode prioritizes immediacy. Stream tokens and TTS together.
505
+ tts_streamer = ParallelTTSStreamer()
506
+ _active_streamer = tts_streamer
507
+ audio_seq = 0
508
 
509
+ async def run_llm():
510
+ full_text = ""
511
+ try:
512
+ stream = await ai.main(user_id, transcript)
513
+ async for token in stream:
514
+ if not token:
515
+ continue
516
+ token = _normalize_ai_text(token)
517
+ full_text += token
518
+ if not await _safe_text(ws, {"type": "llm_token", "token": token, "turn": my_turn}):
519
+ break
520
+ except asyncio.CancelledError:
521
+ raise
522
+ except Exception as exc:
523
+ print(f"[VOICE] LLM error: {exc}")
524
+ finally:
525
+ if full_text:
526
+ await _safe_text(ws, {"type": "llm_full", "text": _normalize_ai_text(full_text), "turn": my_turn})
527
+ await tts_streamer.add_token(full_text)
528
+ await tts_streamer.flush()
529
+
530
+ async def run_tts_framed():
531
+ nonlocal audio_seq
532
+ async for chunk in tts_streamer.stream_audio():
533
+ framed = struct.pack(">II", my_turn, audio_seq) + chunk
534
+ if not await _safe_bytes(ws, framed):
535
  break
536
+ audio_seq += 1
 
 
 
 
 
 
 
 
 
 
 
 
537
 
538
+ await asyncio.gather(run_llm(), run_tts_framed(), return_exceptions=True)
539
+ _active_streamer = None
540
+ else:
541
+ # Normal mode keeps audio silent until the full response is ready.
542
+ audio_seq = 0
543
+
544
+ async def run_llm():
545
+ full_text = ""
546
+ try:
547
+ stream = await ai.main(user_id, transcript)
548
+ async for token in stream:
549
+ if not token:
550
+ continue
551
+ token = _normalize_ai_text(token)
552
+ full_text += token
553
+ if not await _safe_text(ws, {"type": "llm_token", "token": token, "turn": my_turn}):
554
+ break
555
+ except asyncio.CancelledError:
556
+ raise
557
+ except Exception as exc:
558
+ print(f"[VOICE] LLM error: {exc}")
559
+ return full_text
560
+
561
+ full_text = await run_llm()
562
+ if full_text:
563
+ await _safe_text(ws, {"type": "llm_full", "text": _normalize_ai_text(full_text), "turn": my_turn})
564
+ tts_streamer = ParallelTTSStreamer()
565
+ _active_streamer = tts_streamer
566
+ await tts_streamer.add_token(full_text)
567
+ await tts_streamer.flush()
568
+ async for chunk in tts_streamer.stream_audio():
569
+ framed = struct.pack(">II", my_turn, audio_seq) + chunk
570
+ if not await _safe_bytes(ws, framed):
571
+ break
572
+ audio_seq += 1
573
+ _active_streamer = None
574
 
 
 
575
  await _safe_text(ws, {"type": "end"})
576
 
577
  async def _utterance_worker():
 
637
  user_id = claimed
638
  await _register_user(user_id)
639
  await _safe_text(ws, {"type": "init_ack", "user_id": user_id})
640
+ elif t in ("brain_mode", "mode"):
641
+ brain_mode_enabled = bool(msg.get("enabled", False))
642
  elif t == "ping":
643
  await _safe_text(ws, {"type": "pong"})
644
  elif t == "cancel":
frontend/script.js CHANGED
@@ -675,7 +675,7 @@ function _done() {
675
  return;
676
  }
677
  _brainResumeListening();
678
- }, 180);
679
  }
680
  console.log('[Voice] Idle — ready for next manual press');
681
  }
@@ -1240,6 +1240,7 @@ function setBrainMode(on) {
1240
  brainBtn.setAttribute('aria-pressed', String(brainMode));
1241
  if (brainStage) brainStage.setAttribute('aria-hidden', String(!brainMode));
1242
  if (voiceCaption) voiceCaption.textContent = '';
 
1243
  if (brainMode) {
1244
  brainBubbleSttText.textContent = 'Listening…';
1245
  brainBubbleTtsText.textContent =
@@ -1259,7 +1260,7 @@ function setBrainMode(on) {
1259
  if (!brainMode || !brainVoiceActive) return;
1260
  if (isProcessing || isSpeaking || _ttsPlaying) return;
1261
  _brainSendWelcome();
1262
- }, 220);
1263
  }
1264
  if (!isListening && !isProcessing && !isRecordingLocked) {
1265
  setTimeout(() => {
@@ -1272,7 +1273,7 @@ function setBrainMode(on) {
1272
  ) {
1273
  _brainResumeListening();
1274
  }
1275
- }, 180);
1276
  }
1277
  } else {
1278
  brainVoiceActive = false;
@@ -1287,6 +1288,18 @@ function setBrainMode(on) {
1287
  }
1288
  }
1289
 
 
 
 
 
 
 
 
 
 
 
 
 
1290
  function _brainSendWelcome() {
1291
  const payload = JSON.stringify({ type: 'speak', text: BRAIN_WELCOME_TEXT });
1292
  if (!voiceWS || voiceWS.readyState !== WebSocket.OPEN) {
 
675
  return;
676
  }
677
  _brainResumeListening();
678
+ }, 0);
679
  }
680
  console.log('[Voice] Idle — ready for next manual press');
681
  }
 
1240
  brainBtn.setAttribute('aria-pressed', String(brainMode));
1241
  if (brainStage) brainStage.setAttribute('aria-hidden', String(!brainMode));
1242
  if (voiceCaption) voiceCaption.textContent = '';
1243
+ _sendVoiceControl({ type: 'brain_mode', enabled: brainMode });
1244
  if (brainMode) {
1245
  brainBubbleSttText.textContent = 'Listening…';
1246
  brainBubbleTtsText.textContent =
 
1260
  if (!brainMode || !brainVoiceActive) return;
1261
  if (isProcessing || isSpeaking || _ttsPlaying) return;
1262
  _brainSendWelcome();
1263
+ }, 0);
1264
  }
1265
  if (!isListening && !isProcessing && !isRecordingLocked) {
1266
  setTimeout(() => {
 
1273
  ) {
1274
  _brainResumeListening();
1275
  }
1276
+ }, 0);
1277
  }
1278
  } else {
1279
  brainVoiceActive = false;
 
1288
  }
1289
  }
1290
 
1291
+ function _sendVoiceControl(payload) {
1292
+ const packet = JSON.stringify(payload);
1293
+ if (voiceWS && voiceWS.readyState === WebSocket.OPEN) {
1294
+ try {
1295
+ voiceWS.send(packet);
1296
+ return;
1297
+ } catch {}
1298
+ }
1299
+ voicePendingPackets.push(packet);
1300
+ _connectVoice();
1301
+ }
1302
+
1303
  function _brainSendWelcome() {
1304
  const payload = JSON.stringify({ type: 'speak', text: BRAIN_WELCOME_TEXT });
1305
  if (!voiceWS || voiceWS.readyState !== WebSocket.OPEN) {