File size: 2,477 Bytes
887bb38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
"""
Buddy CLI — Headless Push-to-Talk AI Assistant

Run without a GUI for testing, or as a daemon:
    python cli.py
"""
import sys
import threading
import time
from pynput import keyboard

from audio_io import PushToTalkRecorder, TTSEngine
from screen_capture import capture_primary_monitor
from vision_llm import MultimodalAssistant


def main():
    """Run the headless push-to-talk assistant until Esc is pressed.

    Creates the recorder, the TTS engine and the multimodal assistant, then
    blocks on a global keyboard listener:

    * Pressing F9 starts the recorder; releasing it stops the recorder and,
      if any audio was returned, hands it to a background thread that
      transcribes it, captures the primary monitor, queries the assistant
      with the screenshot and transcript, and speaks the reply.
    * Pressing Esc stops the listener and ends the program.

    Exits the process with status 1 if constructing the assistant fails.
    """
    print("=" * 60)
    print("  Buddy — Push-to-Talk AI Assistant (CLI)")
    print("=" * 60)
    print("\nTaster:")
    print("  F9  — Hold for aa snakke, slipp for aa sende")
    print("  Esc — Avslutt\n")

    print("Laster modeller (dette tar et minutt...)...")
    recorder = PushToTalkRecorder()
    tts = TTSEngine(voice="nb-NO-FinnNeural")

    try:
        assistant = MultimodalAssistant()
    except Exception as e:
        # Top-level boundary: report the failure and exit instead of dumping a traceback.
        print(f"Feil ved modelllasting: {e}")
        sys.exit(1)

    print("Klar!\n")
    # True while F9 is held and the recorder is running; guarded by `lock`
    # because it is shared between the press/release callbacks and shutdown.
    recording = False
    lock = threading.Lock()

    def process_turn(audio_bytes):
        """Handle one utterance: transcribe, look at the screen, answer, speak.

        Runs on a worker thread, so every failure is caught and printed
        rather than allowed to kill the thread with a traceback.
        """
        try:
            print("[STT] Transkriberer...")
            transcript = assistant.transcribe_audio(audio_bytes)
            print("Du: " + transcript)

            print("[VLM] Ser paa skjermbildet og tenker...")
            screenshot = capture_primary_monitor()
            response = assistant.ask_with_image(screenshot, transcript)

            print("Buddy: " + response)
            print("-" * 60)
            tts.speak(response, blocking=False)
        except Exception as e:
            print(f"[feil] {e}")

    def on_press(key):
        """Start recording on F9; return False on Esc to stop the listener."""
        nonlocal recording
        if key == keyboard.Key.f9:
            with lock:
                # The `recording` flag makes repeated press events for a
                # held key start the recorder only once.
                if not recording:
                    recording = True
                    recorder.start()
                    print("\n[Lytter... slipp F9 for aa sende]")

        if key == keyboard.Key.esc:
            print("\nAvslutter...")
            return False

    def on_release(key):
        """Stop recording on F9 release and process the captured audio."""
        nonlocal recording
        if key == keyboard.Key.f9:
            with lock:
                if recording:
                    recording = False
                    audio = recorder.stop()
                    if audio:
                        # Process off the listener thread so key handling stays responsive.
                        threading.Thread(target=process_turn, args=(audio,), daemon=True).start()
                    else:
                        print("[Ingen lyd fanget opp]\n")

    try:
        with keyboard.Listener(on_press=on_press, on_release=on_release) as listener:
            listener.join()
    finally:
        # Esc (or an interrupt) can end the listener while F9 is still held.
        # In that case on_release never runs, so stop the recorder here
        # instead of leaving it running at exit. The audio is discarded.
        with lock:
            if recording:
                recording = False
                try:
                    recorder.stop()
                except Exception as e:
                    print(f"[feil] {e}")

    print("Ha det!")


# Start the assistant only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()