|
|
"""
Buddy — Desktop Push-to-Talk AI Assistant

Hold F9 to talk. The assistant looks at your screen, transcribes your
speech, answers through a multimodal VLM, and speaks the reply back.

Requirements:
    pip install -r requirements.txt

Usage:
    python main.py

Hotkeys:
    F9  — hold to talk, release to send
    Esc — quit
"""
import queue
import sys
import threading
import time
import tkinter as tk
from tkinter import ttk, scrolledtext

# Preflight dependency check.
#
# This must run BEFORE the third-party / project imports below: importing
# ``pynput`` (or a project module that itself needs one of these packages)
# first would raise a bare ImportError and the friendly message would never
# be shown.
# NOTE(review): presumably ``audio_io`` is what needs sounddevice/soundfile —
# confirm against that module.
_MISSING_DEPS = []
try:
    import sounddevice
except ImportError:
    _MISSING_DEPS.append("sounddevice")
try:
    import soundfile
except ImportError:
    _MISSING_DEPS.append("soundfile")
try:
    import pynput
except ImportError:
    _MISSING_DEPS.append("pynput")

if _MISSING_DEPS:
    # Tell the user exactly which packages are missing and how to get them,
    # then stop with a non-zero exit code.
    print("=" * 60)
    print("FEIL: Mangler Python-pakker: %s" % ", ".join(_MISSING_DEPS))
    print()
    print("Kj\u00f8r f\u00f8rst:")
    print(" pip install -r requirements.txt")
    print()
    print("Windows-brukere: sounddevice kan kreve wheel fra:")
    print(" https://www.lfd.uci.edu/~gohlke/pythonlibs/#sounddevice")
    print("=" * 60)
    sys.exit(1)

# Third-party imports (deliberately placed after the preflight check above).
from PIL import ImageTk, Image  # noqa: E402
from pynput import keyboard  # noqa: E402

# Project-local modules.
from audio_io import PushToTalkRecorder, TTSEngine  # noqa: E402
from screen_capture import capture_primary_monitor  # noqa: E402
from vision_llm import MultimodalAssistant  # noqa: E402
|
|
|
|
class BuddyApp:
    """Tkinter front end for the Buddy push-to-talk assistant.

    Threading model: Tk widgets are only touched from the Tk main loop.
    The model-loader thread, the per-turn worker threads and the pynput
    listener thread talk to the UI by putting ``(kind, payload)`` tuples on
    ``msg_queue``; ``_process_queue`` drains that queue every 50 ms on the
    Tk thread.

    Message kinds understood by the UI:
        ``status``           payload is the status text (normal style)
        ``status_recording`` payload is the status text (recording style)
        ``chat_user``        payload is the user's transcript
        ``chat_buddy``       payload is the assistant's reply
        ``screenshot``       payload is an image to preview, or ``None`` to
                             capture a fresh one
        ``start_recording``  payload unused
        ``stop_recording``   payload unused
        ``quit``             payload unused
    """

    def __init__(self):
        """Create the window, wire up input handling and start loading the model."""
        self.root = tk.Tk()
        self.root.title("Buddy — Push-to-Talk AI Assistant")
        self.root.geometry("900x750")
        self.root.configure(bg="#1e1e2e")

        # Dark theme shared by all ttk widgets.
        self.style = ttk.Style()
        self.style.theme_use("clam")
        self.style.configure("TFrame", background="#1e1e2e")
        self.style.configure("TLabel", background="#1e1e2e", foreground="#cdd6f4", font=("Inter", 11))
        self.style.configure("Status.TLabel", background="#1e1e2e", foreground="#a6e3a1", font=("Inter", 12, "bold"))
        self.style.configure("Recording.TLabel", background="#1e1e2e", foreground="#f38ba8", font=("Inter", 12, "bold"))

        # Audio in/out. The assistant itself is created on a background
        # thread (see _load_model); until then model_loaded stays False.
        self.recorder = PushToTalkRecorder()
        self.tts = TTSEngine(voice="nb-NO-FinnNeural")
        self.assistant = None
        self.model_loaded = False

        # Cross-thread mailbox drained by _process_queue.
        self.msg_queue = queue.Queue()
        # Not read or written anywhere else in this class.
        self.history = []

        self._build_ui()
        self._start_keyboard_listener()
        self._process_queue()

        # Load the model off the Tk thread so the window stays responsive.
        threading.Thread(target=self._load_model, daemon=True).start()

    def _build_ui(self):
        """Build the header, screenshot preview, chat log, talk button and hint."""
        # Header: title on the left, status text on the right.
        header = ttk.Frame(self.root)
        header.pack(fill="x", padx=20, pady=(20, 10))

        ttk.Label(header, text="Buddy", font=("Inter", 24, "bold"), foreground="#89b4fa").pack(side="left")
        self.status_label = ttk.Label(header, text="Laster modeller...", style="Status.TLabel")
        self.status_label.pack(side="right")

        # Screenshot preview (placeholder text until the first capture).
        self.screenshot_label = ttk.Label(self.root, text="Skjermbilde vises her")
        self.screenshot_label.pack(padx=20, pady=10)

        # Read-only chat transcript.
        chat_frame = ttk.Frame(self.root)
        chat_frame.pack(fill="both", expand=True, padx=20, pady=10)

        self.chat_text = scrolledtext.ScrolledText(
            chat_frame,
            wrap=tk.WORD,
            bg="#181825",
            fg="#cdd6f4",
            insertbackground="#cdd6f4",
            font=("Inter", 11),
            borderwidth=0,
            padx=10,
            pady=10,
        )
        self.chat_text.pack(fill="both", expand=True)
        self.chat_text.config(state=tk.DISABLED)

        # Push-to-talk button: mouse press/release mirrors the F9 hotkey.
        controls = ttk.Frame(self.root)
        controls.pack(fill="x", padx=20, pady=(10, 20))

        self.ptt_btn = tk.Button(
            controls,
            text="Hold F9 for å snakke (eller trykk her)",
            font=("Inter", 12, "bold"),
            bg="#89b4fa",
            fg="#1e1e2e",
            activebackground="#b4befe",
            relief=tk.FLAT,
            padx=20,
            pady=10,
            cursor="hand2",
        )
        self.ptt_btn.pack(side="left", fill="x", expand=True)

        self.ptt_btn.bind("<ButtonPress-1>", lambda e: self._start_recording())
        self.ptt_btn.bind("<ButtonRelease-1>", lambda e: self._stop_recording())

        # Footer hint.
        hint = ttk.Label(
            self.root,
            text="F9 = Push-to-Talk | Esc = Avslutt | Assistenten ser automatisk skjermbildet ditt",
            font=("Inter", 9),
            foreground="#6c7086",
        )
        hint.pack(pady=(0, 15))

    def _append_chat(self, sender, text):
        """Append one chat line and scroll to the end.

        ``sender == "Du"`` is shown as the user; any other value is shown
        as Buddy.
        """
        prefix = "Du: " if sender == "Du" else "Buddy: "
        # Build the line before unlocking the widget so a bad ``text`` cannot
        # leave the log editable.
        line = prefix + text + "\n\n"
        self.chat_text.config(state=tk.NORMAL)
        self.chat_text.insert(tk.END, line)
        self.chat_text.config(state=tk.DISABLED)
        self.chat_text.see(tk.END)

    def _update_screenshot(self, img=None):
        """Show a screenshot thumbnail in the preview label.

        Args:
            img: Image to display. When ``None`` (the default) a fresh
                capture of the primary monitor is taken.

        Any failure is shown as text in the preview label instead of
        being raised.
        """
        try:
            if img is None:
                img = capture_primary_monitor()
            else:
                # thumbnail() resizes in place, and the caller may still be
                # using the full-size image (e.g. feeding it to the model).
                # NOTE(review): assumes a PIL image, which provides copy().
                img = img.copy()
            img.thumbnail((800, 450))
            # Keep a reference on self; Tk does not hold one for us.
            self.tk_screenshot = ImageTk.PhotoImage(img)
            self.screenshot_label.config(image=self.tk_screenshot, text="")
        except Exception as e:
            self.screenshot_label.config(text="Kunne ikke ta skjermbilde: " + str(e))

    def _load_model(self):
        """Create the MultimodalAssistant (runs on a background thread).

        On a CUDA out-of-memory error a second attempt is made with
        ``BUDDY_VLM_MODEL`` set to ``qwen2-vl-2b``. Every outcome is reported
        to the UI through a ``status`` message; nothing is raised.
        """
        try:
            import torch
        except ImportError as e:
            # Without this the thread would die silently and the status
            # would stay on "Laster modeller..." forever.
            self.msg_queue.put(("status", "Feil ved lasting: " + str(e)))
            print("[init] Model load error: " + str(e))
            return

        # Fall back to an empty tuple (``except ():`` matches nothing) if
        # this torch build does not define the exception class.
        oom_error = getattr(torch.cuda, "OutOfMemoryError", ())

        out_of_memory = False
        try:
            self.assistant = MultimodalAssistant()
        except oom_error:
            out_of_memory = True
        except Exception as e:
            self.msg_queue.put(("status", "Feil ved lasting: " + str(e)))
            print("[init] Model load error: " + str(e))
            return

        if not out_of_memory:
            self.model_loaded = True
            self.msg_queue.put(("status", "Klar! Hold F9 for å snakke"))
            return

        # Recovery runs OUTSIDE the except clause on purpose: while the
        # handler is active the exception keeps the failing frames (and the
        # tensors they reference) alive, so the retry could not reuse that
        # memory.
        self.msg_queue.put(("status", "OOM: Vil tvinge qwen2-vl-2b..."))
        print("[init] OOM! Prøver fallback til qwen2-vl-2b...")
        try:
            import gc
            import os

            gc.collect()
            torch.cuda.empty_cache()
            # NOTE(review): presumably MultimodalAssistant reads this
            # variable to pick the model — confirm in vision_llm.
            os.environ["BUDDY_VLM_MODEL"] = "qwen2-vl-2b"
            self.assistant = MultimodalAssistant()
            self.model_loaded = True
            self.msg_queue.put(("status", "Klar! (bruker lett modell) Hold F9 for å snakke"))
        except Exception as e2:
            self.msg_queue.put(("status", "Også OOM med 2B: " + str(e2)))
            print("[init] Fallback model load error: " + str(e2))

    def _start_recording(self):
        """Begin recording, unless the model is not ready or we already record."""
        if not self.model_loaded:
            self.msg_queue.put(("status", "Vent, modell laster..."))
            return
        if self.recorder.is_recording():
            return

        self.recorder.start()
        self.msg_queue.put(("status_recording", "Lytter... slipp for å sende"))
        # Refresh the preview with what is on screen right now.
        self._update_screenshot()

    def _stop_recording(self):
        """Stop recording and hand the audio to a worker thread.

        Does nothing when no recording is in progress. When the recorder
        returns no audio, only a status message is shown.
        """
        if not self.recorder.is_recording():
            return

        audio_bytes = self.recorder.stop()
        if not audio_bytes:
            self.msg_queue.put(("status", "Ingen lyd fanget opp"))
            return

        self.msg_queue.put(("status", "Transkriberer..."))
        threading.Thread(
            target=self._process_turn,
            args=(audio_bytes,),
            daemon=True,
        ).start()

    def _process_turn(self, audio_bytes):
        """Run one question/answer turn (worker thread).

        Transcribes ``audio_bytes``, captures the primary monitor, asks the
        assistant about the capture and the transcript, posts the reply to
        the chat and speaks it. Errors are reported as a ``status`` message
        and printed; nothing is raised.
        """
        try:
            transcript = self.assistant.transcribe_audio(audio_bytes)
            self.msg_queue.put(("chat_user", transcript))

            self.msg_queue.put(("status", "Ser på skjermbildet..."))
            screenshot = capture_primary_monitor()
            # Send the very image the model will see, so the preview cannot
            # drift from what is actually analysed.
            self.msg_queue.put(("screenshot", screenshot))

            self.msg_queue.put(("status", "Tenker..."))
            response = self.assistant.ask_with_image(screenshot, transcript)

            self.msg_queue.put(("chat_buddy", response))
            self.msg_queue.put(("status", "Klar!"))
            self.tts.speak(response, blocking=False)
        except Exception as e:
            self.msg_queue.put(("status", "Feil: " + str(e)))
            print("[turn] Error: " + str(e))

    def _start_keyboard_listener(self):
        """Start the pynput listener for F9 (push-to-talk) and Esc (quit).

        The callbacks run on the listener's thread, so they only enqueue
        messages; the actual work happens in _process_queue.
        """
        def on_press(key):
            if key == keyboard.Key.f9 and self.model_loaded and not self.recorder.is_recording():
                self.msg_queue.put(("start_recording", None))
            if key == keyboard.Key.esc:
                self.msg_queue.put(("quit", None))

        def on_release(key):
            if key == keyboard.Key.f9 and self.recorder.is_recording():
                self.msg_queue.put(("stop_recording", None))

        self.listener = keyboard.Listener(on_press=on_press, on_release=on_release)
        self.listener.start()

    def _dispatch_message(self, kind, payload):
        """Apply a single queued message to the UI (Tk thread only)."""
        if kind == "status":
            self.status_label.config(text=payload, style="Status.TLabel")
        elif kind == "status_recording":
            self.status_label.config(text=payload, style="Recording.TLabel")
        elif kind == "chat_user":
            self._append_chat("Du", payload)
        elif kind == "chat_buddy":
            self._append_chat("Buddy", payload)
        elif kind == "screenshot":
            self._update_screenshot(payload)
        elif kind == "start_recording":
            self._start_recording()
        elif kind == "stop_recording":
            self._stop_recording()
        elif kind == "quit":
            self._quit()

    def _process_queue(self):
        """Drain ``msg_queue`` and re-arm itself to run again in 50 ms.

        Each message is handled under its own guard: a failing handler is
        printed and surfaced as a status message, but can no longer prevent
        the next poll from being scheduled. ``SystemExit`` from ``_quit`` is
        not an ``Exception`` and therefore still propagates.
        """
        while True:
            try:
                kind, payload = self.msg_queue.get_nowait()
            except queue.Empty:
                break
            try:
                self._dispatch_message(kind, payload)
            except Exception as e:
                print("[ui] Error handling %r: %s" % (kind, e))
                # Do not re-enqueue a status for a failing status update,
                # otherwise this loop could feed itself forever.
                if kind not in ("status", "status_recording"):
                    self.msg_queue.put(("status", "Feil: " + str(e)))

        self.root.after(50, self._process_queue)

    def _quit(self):
        """Stop the keyboard listener, destroy the window and exit the process."""
        self.listener.stop()
        self.root.destroy()
        sys.exit(0)

    def run(self):
        """Enter the Tk main loop; returns when the window is destroyed."""
        self.root.mainloop()
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build the application window and block in its
    # event loop until the user quits.
    BuddyApp().run()
|
|