carbonx committed on
Commit
93a741b
·
verified ·
1 Parent(s): cea569d

Add main.py desktop GUI

Browse files
Files changed (1) hide show
  1. main.py +260 -0
main.py ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Buddy — Desktop Push-to-Talk AI Assistant
3
+
4
+ Hold F9 for å snakke. Assistenten ser skjermbildet ditt,
5
+ transkriberer talen, svarer via multimodal VLM,
6
+ og snakker tilbake.
7
+
8
+ Krav:
9
+ pip install -r requirements.txt
10
+
11
+ Bruk:
12
+ python main.py
13
+
14
+ Hotkeys:
15
+ F9 — Hold for å snakke, slipp for å sende
16
+ Esc — Avslutt
17
+ """
18
+ import tkinter as tk
19
+ from tkinter import ttk, scrolledtext
20
+ from PIL import ImageTk, Image
21
+ import threading
22
+ import queue
23
+ import time
24
+ import sys
25
+
26
+ from pynput import keyboard
27
+
28
+ from audio_io import PushToTalkRecorder, TTSEngine
29
+ from screen_capture import capture_primary_monitor
30
+ from vision_llm import MultimodalAssistant
31
+
32
+
33
class BuddyApp:
    """Tkinter front end for the push-to-talk assistant.

    All widget updates happen on the Tk main thread. Background threads and
    the global keyboard listener never touch widgets directly; they post
    ``(kind, payload)`` tuples to ``self.msg_queue``, which
    ``_process_queue`` drains every 50 ms on the main thread.
    """

    def __init__(self):
        """Create the window, wire up input handling and start model loading."""
        self.root = tk.Tk()
        self.root.title("Buddy — Push-to-Talk AI Assistant")
        self.root.geometry("900x750")
        self.root.configure(bg="#1e1e2e")

        # --- Styling ---
        self.style = ttk.Style()
        self.style.theme_use("clam")
        self.style.configure("TFrame", background="#1e1e2e")
        self.style.configure("TLabel", background="#1e1e2e", foreground="#cdd6f4", font=("Inter", 11))
        self.style.configure("Status.TLabel", background="#1e1e2e", foreground="#a6e3a1", font=("Inter", 12, "bold"))
        self.style.configure("Recording.TLabel", background="#1e1e2e", foreground="#f38ba8", font=("Inter", 12, "bold"))

        # --- State ---
        self.recorder = PushToTalkRecorder()
        self.tts = TTSEngine(voice="nb-NO-FinnNeural")
        self.assistant = None  # set by _load_model once construction succeeds
        self.model_loaded = False

        self.msg_queue = queue.Queue()
        self.history = []

        self._build_ui()
        self._start_keyboard_listener()
        self._process_queue()

        # Load model in background so the window appears immediately.
        threading.Thread(target=self._load_model, daemon=True).start()

    def _build_ui(self):
        """Build the header, screenshot preview, chat log, button and hint."""
        # Header
        header = ttk.Frame(self.root)
        header.pack(fill="x", padx=20, pady=(20, 10))

        ttk.Label(header, text="Buddy", font=("Inter", 24, "bold"), foreground="#89b4fa").pack(side="left")
        self.status_label = ttk.Label(header, text="Laster modeller...", style="Status.TLabel")
        self.status_label.pack(side="right")

        # Screenshot preview
        self.screenshot_label = ttk.Label(self.root, text="Skjermbilde vises her")
        self.screenshot_label.pack(padx=20, pady=10)

        # Chat log
        chat_frame = ttk.Frame(self.root)
        chat_frame.pack(fill="both", expand=True, padx=20, pady=10)

        self.chat_text = scrolledtext.ScrolledText(
            chat_frame,
            wrap=tk.WORD,
            bg="#181825",
            fg="#cdd6f4",
            insertbackground="#cdd6f4",
            font=("Inter", 11),
            borderwidth=0,
            padx=10,
            pady=10,
        )
        self.chat_text.pack(fill="both", expand=True)
        # Read-only for the user; _append_chat re-enables it briefly to write.
        self.chat_text.config(state=tk.DISABLED)

        # Controls
        controls = ttk.Frame(self.root)
        controls.pack(fill="x", padx=20, pady=(10, 20))

        self.ptt_btn = tk.Button(
            controls,
            text="Hold F9 for å snakke (eller trykk her)",
            font=("Inter", 12, "bold"),
            bg="#89b4fa",
            fg="#1e1e2e",
            activebackground="#b4befe",
            relief=tk.FLAT,
            padx=20,
            pady=10,
            cursor="hand2",
        )
        self.ptt_btn.pack(side="left", fill="x", expand=True)

        # Mouse push-to-talk: press starts recording, release sends.
        self.ptt_btn.bind("<ButtonPress-1>", lambda e: self._start_recording())
        self.ptt_btn.bind("<ButtonRelease-1>", lambda e: self._stop_recording())

        # Hint
        hint = ttk.Label(
            self.root,
            text="F9 = Push-to-Talk | Esc = Avslutt | Assistenten ser automatisk skjermbildet ditt",
            font=("Inter", 9),
            foreground="#6c7086",
        )
        hint.pack(pady=(0, 15))

    def _append_chat(self, sender, text):
        """Append one chat line and scroll to it.

        ``sender == "Du"`` is rendered as the user; any other value is
        rendered as Buddy.
        """
        prefix = "Du: " if sender == "Du" else "Buddy: "
        self.chat_text.config(state=tk.NORMAL)
        self.chat_text.insert(tk.END, prefix + text + "\n\n")
        self.chat_text.config(state=tk.DISABLED)
        self.chat_text.see(tk.END)

    def _update_screenshot(self, img=None):
        """Show a screenshot thumbnail in the preview label.

        Args:
            img: Image to display. When ``None`` a fresh capture of the
                primary monitor is taken. The image is downsized IN PLACE by
                ``thumbnail``, so pass a copy if the caller still needs the
                full-size image.
        """
        try:
            if img is None:
                img = capture_primary_monitor()
            img.thumbnail((800, 450))
            # Keep a reference on self; otherwise the PhotoImage is garbage
            # collected and the label goes blank.
            self.tk_screenshot = ImageTk.PhotoImage(img)
            self.screenshot_label.config(image=self.tk_screenshot, text="")
        except Exception as e:
            self.screenshot_label.config(text="Kunne ikke ta skjermbilde: " + str(e))

    def _load_model(self):
        """Construct the assistant (runs on a background thread)."""
        try:
            self.assistant = MultimodalAssistant()
            self.model_loaded = True
            self.msg_queue.put(("status", "Klar! Hold F9 for å snakke"))
        except Exception as e:
            self.msg_queue.put(("status", "Feil ved lasting: " + str(e)))
            print("[init] Model load error: " + str(e))

    def _start_recording(self):
        """Begin recording unless the model is not ready or already recording."""
        if not self.model_loaded:
            self.msg_queue.put(("status", "Vent, modell laster ennaa..."))
            return
        if self.recorder.is_recording():
            return

        self.recorder.start()
        self.msg_queue.put(("status_recording", "Lytter... slipp for aa sende"))
        self._update_screenshot()

    def _stop_recording(self):
        """Stop recording and hand the captured audio to a worker thread."""
        if not self.recorder.is_recording():
            return

        audio_bytes = self.recorder.stop()
        if not audio_bytes:
            # Nothing to transcribe: report it without first announcing a
            # transcription that never happens.
            self.msg_queue.put(("status", "Ingen lyd fanget opp"))
            return

        self.msg_queue.put(("status", "Transkriberer..."))

        # Process in background thread so the UI stays responsive.
        threading.Thread(
            target=self._process_turn,
            args=(audio_bytes,),
            daemon=True,
        ).start()

    def _process_turn(self, audio_bytes):
        """Run one speech -> screenshot -> model -> speech turn.

        Runs on a background thread; every UI change goes through
        ``self.msg_queue``.
        """
        try:
            # 1. STT
            transcript = self.assistant.transcribe_audio(audio_bytes)
            self.msg_queue.put(("chat_user", transcript))

            # 2. Capture fresh screenshot
            self.msg_queue.put(("status", "Ser paa skjermbildet..."))
            screenshot = capture_primary_monitor()
            # Preview exactly the image the model receives. A copy is sent
            # because the preview downsizes its image in place on the UI
            # thread while the full-size original is still in use here.
            self.msg_queue.put(("screenshot", screenshot.copy()))

            # 3. VLM
            self.msg_queue.put(("status", "Tenker..."))
            response = self.assistant.ask_with_image(screenshot, transcript)

            # 4. Update UI + TTS
            self.msg_queue.put(("chat_buddy", response))
            self.msg_queue.put(("status", "Klar!"))
            self.tts.speak(response, blocking=False)

        except Exception as e:
            self.msg_queue.put(("status", "Feil: " + str(e)))
            print("[turn] Error: " + str(e))

    def _start_keyboard_listener(self):
        """Start the pynput listener for F9 (push-to-talk) and Esc (quit).

        The callbacks run on the listener's thread, so they only enqueue
        messages for the main thread.
        """
        def on_press(key):
            if key == keyboard.Key.f9 and self.model_loaded and not self.recorder.is_recording():
                self.msg_queue.put(("start_recording", None))
            if key == keyboard.Key.esc:
                self.msg_queue.put(("quit", None))

        def on_release(key):
            if key == keyboard.Key.f9 and self.recorder.is_recording():
                self.msg_queue.put(("stop_recording", None))

        self.listener = keyboard.Listener(on_press=on_press, on_release=on_release)
        self.listener.start()

    def _handle_message(self, kind, payload):
        """Apply a single queued message on the Tk main thread.

        Unknown ``kind`` values are ignored.
        """
        if kind == "status":
            self.status_label.config(text=payload, style="Status.TLabel")
        elif kind == "status_recording":
            self.status_label.config(text=payload, style="Recording.TLabel")
        elif kind == "chat_user":
            self._append_chat("Du", payload)
        elif kind == "chat_buddy":
            self._append_chat("Buddy", payload)
        elif kind == "screenshot":
            # payload is the image to preview, or None for a fresh capture.
            self._update_screenshot(payload)
        elif kind == "start_recording":
            self._start_recording()
        elif kind == "stop_recording":
            self._stop_recording()
        elif kind == "quit":
            self._quit()

    def _process_queue(self):
        """Drain pending messages, then reschedule itself in 50 ms.

        A failing handler is reported and skipped, so one bad message cannot
        stop the polling loop. ``SystemExit`` raised by ``_quit`` is not an
        ``Exception`` and still propagates to end the program.
        """
        while True:
            try:
                kind, payload = self.msg_queue.get_nowait()
            except queue.Empty:
                break
            try:
                self._handle_message(kind, payload)
            except Exception as e:
                print("[queue] Error handling " + str(kind) + ": " + str(e))
                self.status_label.config(text="Feil: " + str(e), style="Status.TLabel")

        self.root.after(50, self._process_queue)

    def _quit(self):
        """Stop the keyboard listener, destroy the window and exit."""
        self.listener.stop()
        self.root.destroy()
        sys.exit(0)

    def run(self):
        """Enter the Tk main loop (blocks until the window closes)."""
        self.root.mainloop()
256
+
257
+
258
if __name__ == "__main__":
    # Script entry point: build the GUI and block in its main loop.
    BuddyApp().run()