Matzan committed
Commit bfcb2a0 · verified · 1 Parent(s): c08e478

Upload agent.py with huggingface_hub

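For context, a minimal sketch of how a single file like this is typically pushed with huggingface_hub; the repo_id below is a placeholder, not the actual destination of this commit:

# Hypothetical upload sketch using huggingface_hub's HfApi.upload_file
from huggingface_hub import HfApi

api = HfApi()  # uses the token from `huggingface-cli login` or the HF_TOKEN env var
api.upload_file(
    path_or_fileobj="agent.py",
    path_in_repo="agent.py",
    repo_id="your-username/your-repo",  # placeholder; add repo_type="space" if the target is a Space
    commit_message="Upload agent.py with huggingface_hub",
)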
Files changed (1)
  1. agent.py +289 -0
agent.py ADDED
@@ -0,0 +1,289 @@
"""
Desktop Agent: Eyes (screenshot) + Brain (VLM) + Hands (pyautogui)
Recommended model: huihui-ai/Huihui-Qwen3.5-35B-A3B-abliterated (uncensored, MoE)
"""

import os
import json
import time
import base64
import io
from datetime import datetime
from pathlib import Path
from typing import Optional, List, Dict

import pyautogui
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor, BitsAndBytesConfig
import torch

# Configuration
MODEL_ID = os.getenv("AGENT_MODEL", "huihui-ai/Huihui-Qwen3.5-35B-A3B-abliterated")
SAVE_DIR = Path("/app/agent_logs")
SAVE_DIR.mkdir(exist_ok=True)

# Keep pyautogui's failsafe enabled (recommended!)
pyautogui.FAILSAFE = True  # Move the mouse to the top-left corner to abort


class DesktopAgent:
    def __init__(self, model_id: str = MODEL_ID, load_in_4bit: bool = True):
        self.model_id = model_id
        self.session_id = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.history: List[Dict] = []

        print(f"🧠 Loading model: {model_id}")

        # Quantization to save VRAM
        if load_in_4bit:
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.bfloat16,
                bnb_4bit_use_double_quant=True,
            )
            self.model = AutoModelForCausalLM.from_pretrained(
                model_id,
                quantization_config=bnb_config,
                device_map="auto",
                trust_remote_code=True,
                torch_dtype="auto",
            )
        else:
            self.model = AutoModelForCausalLM.from_pretrained(
                model_id,
                device_map="auto",
                trust_remote_code=True,
                torch_dtype="auto",
            )

        self.processor = AutoProcessor.from_pretrained(
            model_id,
            trust_remote_code=True,
        )

        self.device = next(self.model.parameters()).device
        print(f"✅ Model loaded on {self.device}")

    def capture_screen(self, region: Optional[tuple] = None) -> Image.Image:
        """👁️ SCREEN CAPTURE: the agent's eyes"""
        screenshot = pyautogui.screenshot(region=region)
        return screenshot

    def save_screenshot(self, img: Image.Image, step: int) -> str:
        path = SAVE_DIR / f"session_{self.session_id}_step{step:04d}.png"
        img.save(path)
        return str(path)

    def encode_image(self, img: Image.Image) -> str:
        """Encode the image as base64 for the VLM"""
        buffer = io.BytesIO()
        img.save(buffer, format="PNG")
        return base64.b64encode(buffer.getvalue()).decode("utf-8")

    def think(self, img: Image.Image, task: str, previous_actions: str = "") -> str:
        """🧠 BRAIN: the model analyzes the screen and decides"""

        # Build the prompt, including the action history
        system_prompt = (
            "You are an autonomous desktop agent. You can see the screen and decide actions.\n"
            "Available actions:\n"
            "- click(x, y): Click at normalized coordinates (0-1)\n"
            "- type(text): Type text\n"
            "- scroll(x, y, direction): Scroll at position\n"
            "- key(key_name): Press a key (enter, escape, etc.)\n"
            "- done(reason): Task completed\n"
            "- fail(reason): Cannot complete task\n"
            "\nRespond ONLY with the action. Be precise."
        )

        user_text = f"Task: {task}\n"
        if previous_actions:
            user_text += f"Previous actions:\n{previous_actions}\n"
        user_text += "What do you see? What action should you take next?"

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": [
                {"type": "image", "image": img},
                {"type": "text", "text": user_text},
            ]},
        ]

        # Preprocess
        text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = self.processor(
            text=[text],
            images=[img],
            return_tensors="pt",
            padding=True,
        ).to(self.device)

        # Generate
        with torch.no_grad():
            output = self.model.generate(
                **inputs,
                max_new_tokens=256,
                temperature=0.3,
                do_sample=True,
                top_p=0.9,
            )

        response = self.processor.decode(output[0], skip_special_tokens=True)
        # Keep only the assistant's reply
        if "assistant" in response:
            response = response.split("assistant")[-1].strip()

        return response

    def execute_action(self, action_text: str) -> bool:
        """🖐️ HANDS: execute the action on the desktop"""
        import re

        screen_w, screen_h = pyautogui.size()
        action_text = action_text.strip()
        # Lowercase only for matching the command name, so type() keeps the original casing
        action_lower = action_text.lower()

        try:
            # Click: click(0.5, 0.3)
            if action_lower.startswith("click("):
                match = re.search(r'click\(([-+]?[0-9]*\.?[0-9]+),\s*([-+]?[0-9]*\.?[0-9]+)\)', action_lower)
                if match:
                    x_norm, y_norm = float(match.group(1)), float(match.group(2))
                    x = int(x_norm * screen_w)
                    y = int(y_norm * screen_h)
                    pyautogui.click(x, y)
                    print(f" 🖱️ Click at ({x}, {y})")
                    return True

            # Type: type("hello world")
            elif action_lower.startswith("type("):
                match = re.search(r'type\("(.+?)"\)', action_text, re.IGNORECASE)
                if match:
                    text = match.group(1)
                    pyautogui.typewrite(text, interval=0.01)
                    print(f" ⌨️ Type: {text}")
                    return True

            # Key: key("enter")
            elif action_lower.startswith("key("):
                match = re.search(r'key\("(.+?)"\)', action_lower)
                if match:
                    key = match.group(1)
                    pyautogui.press(key)
                    print(f" ⌨️ Key: {key}")
                    return True

            # Scroll: scroll(0.5, 0.5, "down")
            elif action_lower.startswith("scroll("):
                match = re.search(r'scroll\(([-+]?[0-9]*\.?[0-9]+),\s*([-+]?[0-9]*\.?[0-9]+),\s*"(.+?)"\)', action_lower)
                if match:
                    x_norm, y_norm, direction = float(match.group(1)), float(match.group(2)), match.group(3)
                    x = int(x_norm * screen_w)
                    y = int(y_norm * screen_h)
                    clicks = -500 if direction == "down" else 500
                    pyautogui.scroll(clicks, x, y)
                    print(f" 🖱️ Scroll {direction} at ({x}, {y})")
                    return True

            # Done / Fail
            elif action_lower.startswith("done(") or action_lower.startswith("fail("):
                print(f" 🏁 {action_text}")
                return False  # Ends the loop

            else:
                print(f" ⚠️ Unrecognized action: {action_text}")
                return False

        except Exception as e:
            print(f" ❌ Error executing action: {e}")
            return False

        return False

    def run(self, task: str, max_steps: int = 50, delay: float = 2.0):
        """
        🚀 MAIN AGENT LOOP

        1. Capture the screen
        2. Think (VLM)
        3. Execute the action
        4. Repeat
        """
        print(f"\n{'='*60}")
        print("🚀 AUTONOMOUS AGENT STARTED")
        print(f"📋 Task: {task}")
        print(f"🔢 Max steps: {max_steps}")
        print(f"{'='*60}\n")

        previous_actions = "None"

        for step in range(1, max_steps + 1):
            print(f"\n--- Step {step}/{max_steps} ---")

            # 1. EYES: capture the screen
            print("👁️ Capturing screen...")
            screenshot = self.capture_screen()
            img_path = self.save_screenshot(screenshot, step)

            # 2. BRAIN: think
            print("🧠 Thinking...")
            action = self.think(screenshot, task, previous_actions)
            print(f"💭 Decision: {action}")

            # Save to the history
            self.history.append({
                "step": step,
                "timestamp": datetime.now().isoformat(),
                "screenshot": img_path,
                "action": action,
                "task": task,
            })

            # 3. HANDS: execute
            print("🖐️ Executing...")
            should_continue = self.execute_action(action)

            # Update the action history for the next step
            previous_actions += f"\nStep {step}: {action}"

            # Save the log
            log_path = SAVE_DIR / f"session_{self.session_id}_log.json"
            with open(log_path, "w") as f:
                json.dump(self.history, f, indent=2)

            if not should_continue:
                print("\n🏁 Agent finished the task.")
                break

            # Wait between actions
            time.sleep(delay)

        print(f"\n{'='*60}")
        print("✅ SESSION COMPLETED")
        print(f"📁 Logs saved to: {SAVE_DIR}")
        print(f"{'='*60}\n")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Autonomous Desktop Agent")
    parser.add_argument("--task", default="Open Chrome and search for 'Hugging Face'", help="Task to perform")
    parser.add_argument("--model", default=MODEL_ID, help="VLM model to use")
    parser.add_argument("--steps", type=int, default=20, help="Maximum number of steps")
    parser.add_argument("--delay", type=float, default=3.0, help="Seconds between actions")
    parser.add_argument("--no-4bit", action="store_true", help="Load without 4-bit quantization (more VRAM)")

    args = parser.parse_args()

    agent = DesktopAgent(
        model_id=args.model,
        load_in_4bit=not args.no_4bit,
    )

    agent.run(
        task=args.task,
        max_steps=args.steps,
        delay=args.delay,
    )
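
For reference, a minimal sketch of driving the agent programmatically instead of through the CLI above; it assumes agent.py is importable from the working directory and that the default model fits in available VRAM with 4-bit quantization:

# Hypothetical usage: import the uploaded module and run a single task.
from agent import DesktopAgent

agent = DesktopAgent(load_in_4bit=True)
agent.run(
    task="Open Chrome and search for 'Hugging Face'",
    max_steps=20,
    delay=3.0,
)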