| import spaces |
|
|
| import re |
| from typing import Tuple, Optional |
|
|
| import gradio as gr |
| import numpy as np |
| from PIL import Image, ImageDraw, ImageFont |
| from smolvlm_inference import TransformersModel |
|
|
| from prompt import OS_SYSTEM_PROMPT |
|
|
| |
| MODEL_ID = "smolagents/SmolVLM2-2.2B-Instruct-Agentic-GUI" |
|
|
| |
| print(f"Loading model and processor for {MODEL_ID}...") |
| model = None |
| processor = None |
| model_loaded = False |
| load_error_message = "" |
|
|
|
|
|
|
| model = TransformersModel( |
| model_id=MODEL_ID, |
| to_device="cuda:0", |
| ) |
|
|
|
|
| title = "Smol2Operator Demo" |
|
|
| description = """ |
| This is a demo of the Smol2Operator model designed to interact with graphical user interfaces (GUIs) and perform actions within them. |
| This proof-of-concept (POC) version, described in [blogpost], showcases the model’s core capabilities. |
| This compact release is intentionally scoped to fundamental tasks, with complex workflows planned for future iterations. :hugging_face: |
| """ |
|
|
|
|
|
|
| SYSTEM_PROMPT: str = OS_SYSTEM_PROMPT |
|
|
|
|
def get_navigation_prompt(task, image, step=1):
    """Build the two-message chat prompt for one navigation turn.

    Args:
        task: Natural-language instruction the agent should complete.
        image: Current screenshot of the UI (passed through as-is).
        step: Step counter; currently unused — the user message always
            reports no previous actions.

    Returns:
        A list of role/content message dicts in chat-template format:
        a system message carrying SYSTEM_PROMPT, then a user message with
        the screenshot and the instruction text.
    """
    instruction_text = f"Please generate the next move according to the UI screenshot, instruction and previous actions.\n\nInstruction: {task}\n\nPrevious actions:\nNone"

    system_message = {
        "role": "system",
        "content": [
            {"type": "text", "text": SYSTEM_PROMPT},
        ],
    }
    user_message = {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": image,
            },
            {"type": "text", "text": instruction_text},
        ],
    }
    return [system_message, user_message]
|
|
|
|
def array_to_image(image_array: np.ndarray) -> Image.Image:
    """Convert a numpy screenshot array into a PIL image.

    Raises:
        ValueError: If *image_array* is None (i.e. no image was uploaded).
    """
    if image_array is None:
        raise ValueError("No image provided. Please upload an image before submitting.")
    return Image.fromarray(np.uint8(image_array))
|
|
|
|
def parse_actions_from_response(response: str) -> list[str]:
    """Return the contents of every ``<code>...</code>`` block in *response*.

    Matching is non-greedy and spans newlines, so each returned string is
    the body of one code block with its surrounding tags stripped.
    """
    code_block_re = re.compile(r"<code>\n(.*?)\n</code>", re.DOTALL)
    return code_block_re.findall(response)
|
|
|
|
def extract_coordinates_from_action(action_code: str) -> list[dict]:
    """Extract normalized coordinates from generated action code.

    Scans *action_code* for ``click``, ``double_click``, ``move_mouse`` and
    ``drag`` calls. Each located point becomes a dict with keys ``type``,
    ``x``, ``y`` and ``action``; a drag contributes two entries
    (``drag_from`` then ``drag_to``).

    Args:
        action_code: Python-like action snippet emitted by the model.
            Coordinates are assumed normalized to [0, 1] — the drawing code
            scales them by image size; TODO confirm against the model spec.

    Returns:
        List of coordinate dicts, grouped by action type in pattern order.
    """
    localization_actions = []

    # BUGFIX: the bare pattern r'click\(' also matched the tail of
    # 'double_click(...)', emitting a duplicate 'click' entry for every
    # double-click. The lookbehind rejects matches preceded by a word
    # character or dot (e.g. '_click', 'obj.click' still matches as before
    # only when genuinely standalone 'click').
    patterns = {
        'click': r'(?<![\w.])click\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)',
        'double_click': r'double_click\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)',
        'move_mouse': r'move_mouse\((?:self,\s*)?(?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))\)',
        'drag': r'drag\(\[([0-9.]+),\s*([0-9.]+)\],\s*\[([0-9.]+),\s*([0-9.]+)\]\)'
    }

    for action_type, pattern in patterns.items():
        for match in re.finditer(pattern, action_code):
            if action_type == 'drag':
                # drag([x1, y1], [x2, y2]) -> start and end markers.
                from_x, from_y, to_x, to_y = match.groups()
                localization_actions.append({
                    'type': 'drag_from',
                    'x': float(from_x),
                    'y': float(from_y),
                    'action': action_type
                })
                localization_actions.append({
                    'type': 'drag_to',
                    'x': float(to_x),
                    'y': float(to_y),
                    'action': action_type
                })
            else:
                x_val = match.group(1)
                # Single-argument form (e.g. click(0.5)): reuse x for y.
                y_val = match.group(2) if match.group(2) else x_val
                if x_val and y_val:
                    localization_actions.append({
                        'type': action_type,
                        'x': float(x_val),
                        'y': float(y_val),
                        'action': action_type
                    })

    return localization_actions
|
|
|
|
def create_localized_image(original_image: Image.Image, coordinates: list[dict]) -> Optional[Image.Image]:
    """Create an image with localization markers drawn on it.

    Args:
        original_image: Screenshot to annotate (not mutated; a copy is drawn on).
        coordinates: Dicts with keys 'type', 'x', 'y' where x/y are
            normalized to [0, 1], as produced by extract_coordinates_from_action.

    Returns:
        The annotated copy, or None when there is nothing to draw.
    """
    if not coordinates:
        return None

    # Draw on a copy so the caller's image stays untouched.
    img_copy = original_image.copy()
    draw = ImageDraw.Draw(img_copy)

    width, height = img_copy.size

    font = ImageFont.load_default()

    # Marker color per action type; unknown types fall back to red below.
    colors = {
        'click': 'red',
        'double_click': 'blue',
        'move_mouse': 'green',
        'drag_from': 'orange',
        'drag_to': 'purple'
    }

    for i, coord in enumerate(coordinates):
        # Scale normalized [0, 1] coordinates to pixel positions.
        pixel_x = int(coord['x'] * width)
        pixel_y = int(coord['y'] * height)

        color = colors.get(coord['type'], 'red')

        # Filled circle marking the action point.
        circle_radius = 8
        draw.ellipse([
            pixel_x - circle_radius, pixel_y - circle_radius,
            pixel_x + circle_radius, pixel_y + circle_radius
        ], fill=color, outline='white', width=2)

        # Label with the action type and the normalized coordinates.
        label = f"{coord['type']}({coord['x']:.2f},{coord['y']:.2f})"
        if font:
            draw.text((pixel_x + 10, pixel_y - 10), label, fill=color, font=font)
        else:
            draw.text((pixel_x + 10, pixel_y - 10), label, fill=color)

        # For drags, connect a drag_from point to the immediately following
        # drag_to point with a line and an arrowhead.
        if coord['type'] == 'drag_from' and i + 1 < len(coordinates) and coordinates[i + 1]['type'] == 'drag_to':
            next_coord = coordinates[i + 1]
            end_x = int(next_coord['x'] * width)
            end_y = int(next_coord['y'] * height)

            draw.line([pixel_x, pixel_y, end_x, end_y], fill='orange', width=3)

            # Arrowhead: two points stepped back from the end along the unit
            # direction vector and spread half an arrow-length to either side.
            arrow_size = 10
            dx = end_x - pixel_x
            dy = end_y - pixel_y
            length = (dx**2 + dy**2)**0.5
            if length > 0:  # zero-length drags get a line but no arrowhead
                dx_norm = dx / length
                dy_norm = dy / length

                arrow_x1 = end_x - arrow_size * dx_norm + arrow_size * dy_norm * 0.5
                arrow_y1 = end_y - arrow_size * dy_norm - arrow_size * dx_norm * 0.5
                arrow_x2 = end_x - arrow_size * dx_norm - arrow_size * dy_norm * 0.5
                arrow_y2 = end_y - arrow_size * dy_norm + arrow_size * dx_norm * 0.5

                draw.polygon([end_x, end_y, arrow_x1, arrow_y1, arrow_x2, arrow_y2], fill='orange')

    return img_copy
|
|
|
|
| |
@spaces.GPU
def navigate(input_numpy_image: np.ndarray, task: str) -> Tuple[str, Optional[Image.Image]]:
    """Run one agent step: generate the next action for *task* on a screenshot.

    Args:
        input_numpy_image: Screenshot as a numpy array (from the Gradio Image component).
        task: Natural-language instruction for the agent.

    Returns:
        Tuple of the raw model response string and, when the response contains
        coordinate-bearing actions, an annotated copy of the screenshot
        (otherwise None).

    Raises:
        ValueError: If no image was uploaded or the model is not loaded.
    """
    screenshot = array_to_image(input_numpy_image)
    assert isinstance(screenshot, Image.Image)

    messages = get_navigation_prompt(task, screenshot)

    if model is None:
        raise ValueError("Model not loaded")

    navigation_str = model.generate(messages, max_new_tokens=500)
    print(f"Navigation string: {navigation_str}")
    navigation_str = navigation_str.strip()

    # Gather coordinates from every <code> block in the response.
    all_coordinates = []
    for snippet in parse_actions_from_response(navigation_str):
        all_coordinates.extend(extract_coordinates_from_action(snippet))

    annotated_image = None
    if all_coordinates:
        annotated_image = create_localized_image(screenshot, all_coordinates)
        print(f"Found {len(all_coordinates)} localization actions")

    return navigation_str, annotated_image
|
|
|
|
| |
# Example inputs for the Gradio demo. Images are loaded eagerly at import
# time so gr.Examples can cache them.
# FIX: the originals annotated these variables as `str` and then immediately
# rebound them to PIL images — the annotations were false. Final values are
# identical; only the misleading two-step rebinding is removed.
example_1_image = Image.open("./assets/google.png")
example_1_task = "Search for the name of the current UK Prime Minister."

example_2_image = Image.open("./assets/huggingface.png")
example_2_task = "Find the most trending model."
|
|
|
|
# ---- Gradio UI: image + task in, agent response (and annotated image) out.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(f"<h1 style='text-align: center;'>{title}</h1>")

    with gr.Row():
        ui_image = gr.Image(label="UI Image", height=500)
    with gr.Row():
        with gr.Column():
            ui_task = gr.Textbox(
                label="task",
                placeholder="e.g., Search for the name of the current UK Prime Minister.",
                info="Type the task you want the model to complete.",
            )
            ui_submit = gr.Button("Call Agent", variant="primary")

        with gr.Column():
            ui_output = gr.Textbox(label="Agent Output", lines=10)

    # The annotated screenshot is written back into the input image component.
    ui_submit.click(navigate, [ui_image, ui_task], [ui_output, ui_image])

    gr.Examples(
        examples=[[example_1_image, example_1_task], [example_2_image, example_2_task]],
        inputs=[ui_image, ui_task],
        outputs=[ui_output, ui_image],
        fn=navigate,
        cache_examples=True,
    )

demo.queue(api_open=False)
demo.launch(debug=True, share=True)
|
|