Matzan committed
Commit bfcb2a0 · verified · 1 Parent(s): c08e478

Upload agent.py with huggingface_hub

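For context, a minimal sketch of how a single file like this is typically pushed with huggingface_hub; the repo_id below is a placeholder, not the actual destination of this commit:

# Hypothetical upload sketch using huggingface_hub's HfApi.upload_file
from huggingface_hub import HfApi

api = HfApi()  # uses the token from `huggingface-cli login` or the HF_TOKEN env var
api.upload_file(
    path_or_fileobj="agent.py",
    path_in_repo="agent.py",
    repo_id="your-username/your-repo",  # placeholder; add repo_type="space" if the target is a Space
    commit_message="Upload agent.py with huggingface_hub",
)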
Files changed (1)
  1. agent.py +289 -0
agent.py ADDED
@@ -0,0 +1,289 @@
"""
Desktop Agent: Eyes (screenshot) + Brain (VLM) + Hands (pyautogui)
Recommended model: huihui-ai/Huihui-Qwen3.5-35B-A3B-abliterated (uncensored, MoE)
"""

import os
import json
import time
import base64
import io
from datetime import datetime
from pathlib import Path
from typing import Optional, List, Dict

import pyautogui
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor, BitsAndBytesConfig
import torch

# Configuration
MODEL_ID = os.getenv("AGENT_MODEL", "huihui-ai/Huihui-Qwen3.5-35B-A3B-abliterated")
SAVE_DIR = Path("/app/agent_logs")
SAVE_DIR.mkdir(exist_ok=True)

# Keep pyautogui's failsafe enabled (recommended!)
pyautogui.FAILSAFE = True  # Move the mouse to the top-left corner to abort


class DesktopAgent:
    def __init__(self, model_id: str = MODEL_ID, load_in_4bit: bool = True):
        self.model_id = model_id
        self.session_id = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.history: List[Dict] = []

        print(f"🧠 Loading model: {model_id}")

        # Quantization to save VRAM
        if load_in_4bit:
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.bfloat16,
                bnb_4bit_use_double_quant=True,
            )
            self.model = AutoModelForCausalLM.from_pretrained(
                model_id,
                quantization_config=bnb_config,
                device_map="auto",
                trust_remote_code=True,
                torch_dtype="auto",
            )
        else:
            self.model = AutoModelForCausalLM.from_pretrained(
                model_id,
                device_map="auto",
                trust_remote_code=True,
                torch_dtype="auto",
            )

        self.processor = AutoProcessor.from_pretrained(
            model_id,
            trust_remote_code=True,
        )

        self.device = next(self.model.parameters()).device
        print(f"✅ Model loaded on {self.device}")

    def capture_screen(self, region: Optional[tuple] = None) -> Image.Image:
        """👁️ SCREEN CAPTURE: the agent's eyes"""
        screenshot = pyautogui.screenshot(region=region)
        return screenshot

    def save_screenshot(self, img: Image.Image, step: int) -> str:
        path = SAVE_DIR / f"session_{self.session_id}_step{step:04d}.png"
        img.save(path)
        return str(path)

    def encode_image(self, img: Image.Image) -> str:
        """Encode the image as base64 for the VLM"""
        buffer = io.BytesIO()
        img.save(buffer, format="PNG")
        return base64.b64encode(buffer.getvalue()).decode("utf-8")

    def think(self, img: Image.Image, task: str, previous_actions: str = "") -> str:
        """🧠 BRAIN: the model analyzes the screen and decides"""

        # Build the prompt, including the action history
        system_prompt = (
            "You are an autonomous desktop agent. You can see the screen and decide actions.\n"
            "Available actions:\n"
            "- click(x, y): Click at normalized coordinates (0-1)\n"
            "- type(text): Type text\n"
            "- scroll(x, y, direction): Scroll at position\n"
            "- key(key_name): Press a key (enter, escape, etc.)\n"
            "- done(reason): Task completed\n"
            "- fail(reason): Cannot complete task\n"
            "\nRespond ONLY with the action. Be precise."
        )

        user_text = f"Task: {task}\n"
        if previous_actions:
            user_text += f"Previous actions:\n{previous_actions}\n"
        user_text += "What do you see? What action should you take next?"

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": [
                {"type": "image", "image": img},
                {"type": "text", "text": user_text},
            ]},
        ]

        # Preprocess
        text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = self.processor(
            text=[text],
            images=[img],
            return_tensors="pt",
            padding=True,
        ).to(self.device)

        # Generate
        with torch.no_grad():
            output = self.model.generate(
                **inputs,
                max_new_tokens=256,
                temperature=0.3,
                do_sample=True,
                top_p=0.9,
            )

        response = self.processor.decode(output[0], skip_special_tokens=True)
        # Keep only the assistant's reply
        if "assistant" in response:
            response = response.split("assistant")[-1].strip()

        return response

    def execute_action(self, action_text: str) -> bool:
        """🖐️ HANDS: execute the action on the desktop"""
        import re

        screen_w, screen_h = pyautogui.size()
        action_text = action_text.strip()
        # Lowercase only for matching the command name, so type() keeps the original casing
        action_lower = action_text.lower()

        try:
            # Click: click(0.5, 0.3)
            if action_lower.startswith("click("):
                match = re.search(r'click\(([-+]?[0-9]*\.?[0-9]+),\s*([-+]?[0-9]*\.?[0-9]+)\)', action_lower)
                if match:
                    x_norm, y_norm = float(match.group(1)), float(match.group(2))
                    x = int(x_norm * screen_w)
                    y = int(y_norm * screen_h)
                    pyautogui.click(x, y)
                    print(f" 🖱️ Click at ({x}, {y})")
                    return True

            # Type: type("hello world")
            elif action_lower.startswith("type("):
                match = re.search(r'type\("(.+?)"\)', action_text, re.IGNORECASE)
                if match:
                    text = match.group(1)
                    pyautogui.typewrite(text, interval=0.01)
                    print(f" ⌨️ Type: {text}")
                    return True

            # Key: key("enter")
            elif action_lower.startswith("key("):
                match = re.search(r'key\("(.+?)"\)', action_lower)
                if match:
                    key = match.group(1)
                    pyautogui.press(key)
                    print(f" ⌨️ Key: {key}")
                    return True

            # Scroll: scroll(0.5, 0.5, "down")
            elif action_lower.startswith("scroll("):
                match = re.search(r'scroll\(([-+]?[0-9]*\.?[0-9]+),\s*([-+]?[0-9]*\.?[0-9]+),\s*"(.+?)"\)', action_lower)
                if match:
                    x_norm, y_norm, direction = float(match.group(1)), float(match.group(2)), match.group(3)
                    x = int(x_norm * screen_w)
                    y = int(y_norm * screen_h)
                    clicks = -500 if direction == "down" else 500
                    pyautogui.scroll(clicks, x, y)
                    print(f" 🖱️ Scroll {direction} at ({x}, {y})")
                    return True

            # Done / Fail
            elif action_lower.startswith("done(") or action_lower.startswith("fail("):
                print(f" 🏁 {action_text}")
                return False  # Ends the loop

            else:
                print(f" ⚠️ Unrecognized action: {action_text}")
                return False

        except Exception as e:
            print(f" ❌ Error executing action: {e}")
            return False

        return False

    def run(self, task: str, max_steps: int = 50, delay: float = 2.0):
        """
        🚀 MAIN AGENT LOOP

        1. Capture the screen
        2. Think (VLM)
        3. Execute the action
        4. Repeat
        """
        print(f"\n{'='*60}")
        print("🚀 AUTONOMOUS AGENT STARTED")
        print(f"📋 Task: {task}")
        print(f"🔢 Max steps: {max_steps}")
        print(f"{'='*60}\n")

        previous_actions = "None"

        for step in range(1, max_steps + 1):
            print(f"\n--- Step {step}/{max_steps} ---")

            # 1. EYES: capture the screen
            print("👁️ Capturing screen...")
            screenshot = self.capture_screen()
            img_path = self.save_screenshot(screenshot, step)

            # 2. BRAIN: think
            print("🧠 Thinking...")
            action = self.think(screenshot, task, previous_actions)
            print(f"💭 Decision: {action}")

            # Save to the history
            self.history.append({
                "step": step,
                "timestamp": datetime.now().isoformat(),
                "screenshot": img_path,
                "action": action,
                "task": task,
            })

            # 3. HANDS: execute
            print("🖐️ Executing...")
            should_continue = self.execute_action(action)

            # Update the action history for the next step
            previous_actions += f"\nStep {step}: {action}"

            # Save the log
            log_path = SAVE_DIR / f"session_{self.session_id}_log.json"
            with open(log_path, "w") as f:
                json.dump(self.history, f, indent=2)

            if not should_continue:
                print("\n🏁 Agent finished the task.")
                break

            # Wait between actions
            time.sleep(delay)

        print(f"\n{'='*60}")
        print("✅ SESSION COMPLETED")
        print(f"📁 Logs saved to: {SAVE_DIR}")
        print(f"{'='*60}\n")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Autonomous Desktop Agent")
    parser.add_argument("--task", default="Open Chrome and search for 'Hugging Face'", help="Task to perform")
    parser.add_argument("--model", default=MODEL_ID, help="VLM model to use")
    parser.add_argument("--steps", type=int, default=20, help="Maximum number of steps")
    parser.add_argument("--delay", type=float, default=3.0, help="Seconds between actions")
    parser.add_argument("--no-4bit", action="store_true", help="Load without 4-bit quantization (more VRAM)")

    args = parser.parse_args()

    agent = DesktopAgent(
        model_id=args.model,
        load_in_4bit=not args.no_4bit,
    )

    agent.run(
        task=args.task,
        max_steps=args.steps,
        delay=args.delay,
    )
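
For reference, a minimal sketch of driving the agent programmatically instead of through the CLI above; it assumes agent.py is importable from the working directory and that the default model fits in available VRAM with 4-bit quantization:

# Hypothetical usage: import the uploaded module and run a single task.
from agent import DesktopAgent

agent = DesktopAgent(load_in_4bit=True)
agent.run(
    task="Open Chrome and search for 'Hugging Face'",
    max_steps=20,
    delay=3.0,
)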