"""Buddy CLI — headless push-to-talk AI assistant.

Run without a GUI, for testing or as a daemon.

Usage:
    python cli.py
"""

import sys
import threading
import time

from pynput import keyboard

from audio_io import PushToTalkRecorder, TTSEngine
from screen_capture import capture_primary_monitor
from vision_llm import MultimodalAssistant


def main() -> None:
    """Run the interactive push-to-talk loop until Esc is pressed.

    Holding F9 starts the recorder; releasing it stops the recorder and hands
    the captured audio to a background thread that transcribes it, grabs a
    screenshot, asks the assistant, prints the answer and speaks it.

    Exits the process with status 1 if ``MultimodalAssistant()`` raises.
    """
    print("=" * 60)
    print(" Buddy — Push-to-Talk AI Assistant (CLI)")
    print("=" * 60)
    print("\nTaster:")
    print(" F9 — Hold for aa snakke, slipp for aa sende")
    print(" Esc — Avslutt\n")
    print("Laster modeller (dette tar et minutt...)...")

    recorder = PushToTalkRecorder()
    tts = TTSEngine(voice="nb-NO-FinnNeural")
    try:
        assistant = MultimodalAssistant()
    except Exception as e:
        print("Feil ved modelllasting: " + str(e))
        sys.exit(1)
    print("Klar!\n")

    # True while the recorder has been started and not yet stopped.
    recording = False
    # Guards `recording` together with the recorder start/stop calls.
    lock = threading.Lock()

    def process_turn(audio_bytes):
        """Handle one utterance: transcribe, look at the screen, answer, speak.

        Runs on a worker thread; any exception is reported and swallowed so a
        failed turn does not take the program down.
        """
        try:
            print("[STT] Transkriberer...")
            transcript = assistant.transcribe_audio(audio_bytes)
            print("Du: " + transcript)
            print("[VLM] Ser paa skjermbildet og tenker...")
            screenshot = capture_primary_monitor()
            response = assistant.ask_with_image(screenshot, transcript)
            print("Buddy: " + response)
            print("-" * 60)
            tts.speak(response, blocking=False)
        except Exception as e:
            print("[feil] " + str(e))

    def on_press(key):
        """Start recording on F9; return False on Esc to stop the listener."""
        nonlocal recording
        if key == keyboard.Key.f9:
            with lock:
                # Key auto-repeat delivers many press events while F9 is held;
                # only the first one may start the recorder.
                if not recording:
                    recorder.start()
                    # Flag is set only after start() succeeded, so the flag
                    # never claims a recording that was never started.
                    recording = True
                    print("\n[Lytter... slipp F9 for aa sende]")
        if key == keyboard.Key.esc:
            print("\nAvslutter...")
            return False

    def on_release(key):
        """Stop recording on F9 release and dispatch the audio for processing."""
        nonlocal recording
        if key == keyboard.Key.f9:
            with lock:
                if recording:
                    recording = False
                    audio = recorder.stop()
                    if audio:
                        # Process off the listener thread so key events keep
                        # being handled while the models run.
                        threading.Thread(
                            target=process_turn,
                            args=(audio,),
                            daemon=True,
                        ).start()
                    else:
                        print("[Ingen lyd fanget opp]\n")

    try:
        with keyboard.Listener(on_press=on_press, on_release=on_release) as listener:
            listener.join()
    finally:
        # Esc while F9 is still held stops the listener before any release
        # event arrives, which would leave the recorder running. Stop it here
        # and discard the partial audio.
        with lock:
            if recording:
                recording = False
                try:
                    recorder.stop()
                except Exception as e:
                    print("[feil] " + str(e))
    print("Ha det!")


if __name__ == "__main__":
    main()