| """ |
| Desktop Agent: Ojos (screenshot) + Cerebro (VLM) + Manos (pyautogui) |
| Modelo recomendado: huihui-ai/Huihui-Qwen3.5-35B-A3B-abliterated (sin censura, MoE) |
| """ |
|
|
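
# Example invocation (the CLI flags are defined in the __main__ block at the bottom;
# the filename "desktop_agent.py" is illustrative, and pyautogui needs a live
# graphical session to capture and control the screen):
#   python desktop_agent.py --task "Open Chrome and search for 'Hugging Face'" --steps 20 --delay 3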

import os
import json
import time
import base64
import io
import re
from datetime import datetime
from pathlib import Path
from typing import Optional, List, Dict

import pyautogui
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor, BitsAndBytesConfig
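
# Runtime dependencies beyond the imports above: bitsandbytes (for the 4-bit
# BitsAndBytesConfig path) and accelerate (for device_map="auto").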

MODEL_ID = os.getenv("AGENT_MODEL", "huihui-ai/Huihui-Qwen3.5-35B-A3B-abliterated")
SAVE_DIR = Path("/app/agent_logs")
SAVE_DIR.mkdir(parents=True, exist_ok=True)

# Abort hatch: with FAILSAFE enabled, slamming the mouse into a screen corner
# raises pyautogui.FailSafeException and stops the agent.
pyautogui.FAILSAFE = True


class DesktopAgent:
    def __init__(self, model_id: str = MODEL_ID, load_in_4bit: bool = True):
        self.model_id = model_id
        self.session_id = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.history: List[Dict] = []

        print(f"🧠 Loading model: {model_id}")

        if load_in_4bit:
            # Optional 4-bit quantization via bitsandbytes to cut VRAM usage.
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.bfloat16,
                bnb_4bit_use_double_quant=True,
            )
            self.model = AutoModelForCausalLM.from_pretrained(
                model_id,
                quantization_config=bnb_config,
                device_map="auto",
                trust_remote_code=True,
                torch_dtype="auto",
            )
        else:
            self.model = AutoModelForCausalLM.from_pretrained(
                model_id,
                device_map="auto",
                trust_remote_code=True,
                torch_dtype="auto",
            )

        self.processor = AutoProcessor.from_pretrained(
            model_id,
            trust_remote_code=True,
        )

        self.device = next(self.model.parameters()).device
        print(f"✅ Model loaded on {self.device}")

    def capture_screen(self, region: Optional[tuple] = None) -> Image.Image:
        """👁️ SCREEN CAPTURE: the agent's eyes."""
        screenshot = pyautogui.screenshot(region=region)
        return screenshot

    def save_screenshot(self, img: Image.Image, step: int) -> str:
        path = SAVE_DIR / f"session_{self.session_id}_step{step:04d}.png"
        img.save(path)
        return str(path)

    def encode_image(self, img: Image.Image) -> str:
        """Encode an image as a base64 PNG string for a VLM backend."""
        buffer = io.BytesIO()
        img.save(buffer, format="PNG")
        return base64.b64encode(buffer.getvalue()).decode("utf-8")
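
    # Note: encode_image() is not used by the local-inference path in think(); it
    # is useful if the brain is swapped for a remote OpenAI-compatible vision
    # endpoint. A minimal, hypothetical sketch (the client, base_url and model
    # name below are assumptions, not part of this script):
    #
    #   from openai import OpenAI
    #   client = OpenAI(base_url="http://localhost:8000/v1", api_key="none")
    #   b64 = agent.encode_image(agent.capture_screen())
    #   resp = client.chat.completions.create(
    #       model="local-vlm",
    #       messages=[{"role": "user", "content": [
    #           {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}},
    #           {"type": "text", "text": "Describe the screen."},
    #       ]}],
    #   )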

    def think(self, img: Image.Image, task: str, previous_actions: str = "") -> str:
        """🧠 BRAIN: the model looks at the screen and decides the next action."""

        system_prompt = (
            "You are an autonomous desktop agent. You can see the screen and decide actions.\n"
            "Available actions:\n"
            "- click(x, y): Click at normalized coordinates (0-1)\n"
            '- type("text"): Type literal text (double quotes required)\n'
            '- scroll(x, y, "up"|"down"): Scroll at the given position\n'
            '- key("key_name"): Press a key (enter, escape, etc.)\n'
            '- done("reason"): Task completed\n'
            '- fail("reason"): Cannot complete the task\n'
            "\nRespond ONLY with the action. Be precise."
        )
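
        # The parser in execute_action() expects exactly this shape, e.g.:
        #   click(0.42, 0.17)
        #   type("hugging face")
        #   scroll(0.5, 0.5, "down")
        #   done("search results are visible")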

        user_text = f"Task: {task}\n"
        if previous_actions:
            user_text += f"Previous actions:\n{previous_actions}\n"
        user_text += "What do you see? What action should you take next?"

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": [
                {"type": "image", "image": img},
                {"type": "text", "text": user_text},
            ]},
        ]

        text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = self.processor(
            text=[text],
            images=[img],
            return_tensors="pt",
            padding=True,
        ).to(self.device)

        with torch.no_grad():
            output = self.model.generate(
                **inputs,
                max_new_tokens=256,
                temperature=0.3,
                do_sample=True,
                top_p=0.9,
            )

        # Decode only the newly generated tokens, not the echoed prompt.
        generated = output[0][inputs["input_ids"].shape[1]:]
        response = self.processor.decode(generated, skip_special_tokens=True).strip()

        return response

    def execute_action(self, action_text: str) -> bool:
        """🖐️ HANDS: execute the chosen action on the desktop."""
        screen_w, screen_h = pyautogui.size()
        action_text = action_text.strip()
        # Match the action name case-insensitively, but keep the original text so
        # that type(...) payloads preserve their casing.
        action_lower = action_text.lower()

        try:
            if action_lower.startswith("click("):
                match = re.search(r'click\(([-+]?[0-9]*\.?[0-9]+),\s*([-+]?[0-9]*\.?[0-9]+)\)', action_lower)
                if match:
                    x_norm, y_norm = float(match.group(1)), float(match.group(2))
                    x = int(x_norm * screen_w)
                    y = int(y_norm * screen_h)
                    pyautogui.click(x, y)
                    print(f"  🖱️ Click at ({x}, {y})")
                    return True

            elif action_lower.startswith("type("):
                match = re.search(r'type\("(.+?)"\)', action_text, re.IGNORECASE)
                if match:
                    text = match.group(1)
                    pyautogui.typewrite(text, interval=0.01)
                    print(f"  ⌨️ Type: {text}")
                    return True

            elif action_lower.startswith("key("):
                match = re.search(r'key\("(.+?)"\)', action_lower)
                if match:
                    key = match.group(1)
                    pyautogui.press(key)
                    print(f"  ⌨️ Key: {key}")
                    return True

            elif action_lower.startswith("scroll("):
                match = re.search(r'scroll\(([-+]?[0-9]*\.?[0-9]+),\s*([-+]?[0-9]*\.?[0-9]+),\s*"(.+?)"\)', action_lower)
                if match:
                    x_norm, y_norm, direction = float(match.group(1)), float(match.group(2)), match.group(3)
                    x = int(x_norm * screen_w)
                    y = int(y_norm * screen_h)
                    clicks = -500 if direction == "down" else 500
                    pyautogui.scroll(clicks, x, y)
                    print(f"  🖱️ Scroll {direction} at ({x}, {y})")
                    return True

            elif action_lower.startswith("done(") or action_lower.startswith("fail("):
                print(f"  🏁 {action_text}")
                return False

            else:
                print(f"  ⚠️ Unrecognized action: {action_text}")
                return False

        except Exception as e:
            print(f"  ❌ Error executing action: {e}")
            return False

        # An action keyword matched but its arguments could not be parsed.
        return False
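
    # Parsing sanity check (note that a real click is performed, so a display is
    # required, and constructing DesktopAgent loads the full model first):
    #   agent.execute_action('click(0.5, 0.5)')   # clicks the screen centre
    #   agent.execute_action('done("finished")')  # returns False, ending the loop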

    def run(self, task: str, max_steps: int = 50, delay: float = 2.0):
        """
        🚀 MAIN AGENT LOOP

        1. Capture the screen
        2. Think (VLM)
        3. Execute the action
        4. Repeat
        """
        print(f"\n{'='*60}")
        print("🚀 AUTONOMOUS AGENT STARTED")
        print(f"📋 Task: {task}")
        print(f"🔢 Max steps: {max_steps}")
        print(f"{'='*60}\n")

        previous_actions = "None"

        for step in range(1, max_steps + 1):
            print(f"\n--- Step {step}/{max_steps} ---")

            print("👁️ Capturing screen...")
            screenshot = self.capture_screen()
            img_path = self.save_screenshot(screenshot, step)

            print("🧠 Thinking...")
            action = self.think(screenshot, task, previous_actions)
            print(f"💭 Decision: {action}")

            self.history.append({
                "step": step,
                "timestamp": datetime.now().isoformat(),
                "screenshot": img_path,
                "action": action,
                "task": task,
            })

            print("🖐️ Executing...")
            should_continue = self.execute_action(action)

            previous_actions += f"\nStep {step}: {action}"

            # Persist the log after every step so a crash loses nothing.
            log_path = SAVE_DIR / f"session_{self.session_id}_log.json"
            with open(log_path, "w") as f:
                json.dump(self.history, f, indent=2)

            if not should_continue:
                print("\n🏁 Agent finished the task.")
                break

            time.sleep(delay)

        print(f"\n{'='*60}")
        print("✅ SESSION COMPLETE")
        print(f"📁 Logs saved to: {SAVE_DIR}")
        print(f"{'='*60}\n")
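
    # Each record appended to session_<id>_log.json has the shape:
    #   {"step": 3, "timestamp": "...", "screenshot": ".../step0003.png",
    #    "action": "click(0.42, 0.17)", "task": "..."}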


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Autonomous desktop agent")
    parser.add_argument("--task", default="Open Chrome and search for 'Hugging Face'", help="Task to perform")
    parser.add_argument("--model", default=MODEL_ID, help="VLM model to use")
    parser.add_argument("--steps", type=int, default=20, help="Maximum number of steps")
    parser.add_argument("--delay", type=float, default=3.0, help="Seconds between actions")
    parser.add_argument("--no-4bit", action="store_true", help="Load in fp16 (needs more VRAM)")

    args = parser.parse_args()

    agent = DesktopAgent(
        model_id=args.model,
        load_in_4bit=not args.no_4bit,
    )

    agent.run(
        task=args.task,
        max_steps=args.steps,
        delay=args.delay,
    )
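
    # SAVE_DIR points at /app, which suggests a containerised setup; pyautogui
    # still needs a display there. One common (assumed) arrangement is a virtual
    # X server, e.g.:
    #   xvfb-run -s "-screen 0 1920x1080x24" python desktop_agent.py --task "..."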