| import torch |
| import re |
| from transformers import AutoProcessor, AutoModelForCausalLM |
| from PIL import Image |
| import gradio as gr |
|
|
# Hugging Face Hub id of the checkpoint to load.
MODEL_ID = "ByteDance-Seed/UI-TARS-1.5-7B"


# Processor bundles the tokenizer and image preprocessor for this checkpoint.
processor = AutoProcessor.from_pretrained(MODEL_ID)


# CPU-only load: full fp32 weights, staged loading to keep peak RAM down.
# NOTE(review): UI-TARS is a vision-language model; AutoModelForCausalLM may
# not wire up the vision tower -- confirm AutoModelForVision2Seq (or the
# model's dedicated class) isn't required for image inputs.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True
)


# Inference only: switch off dropout / training-mode layers.
model.eval()
|
|
| |
| |
| |
def extract_coordinates(text, image_size):
    """Parse a point ``(x, y)`` or a box ``[x1, y1, x2, y2]`` from model text.

    Values that are all <= 1 are treated as normalized coordinates and scaled
    to pixels using *image_size* ``(width, height)``.

    Returns a 2-tuple for a point, a 4-tuple for a box, or ``None`` when
    nothing parseable is found. Never raises on malformed numbers (the regex
    character classes admit strings like ``"1.2.3"`` that ``float`` rejects).
    """
    width, height = image_size

    # Point such as "(0.52, 0.13)" or "(340, 120)".
    match = re.search(r"\(([\d\.]+),\s*([\d\.]+)\)", text)
    if match:
        try:
            x, y = float(match.group(1)), float(match.group(2))
        except ValueError:
            # Malformed number (e.g. "1.2.3") -- fall through to box parsing.
            pass
        else:
            if x <= 1 and y <= 1:
                # Normalized [0, 1] coordinates -> pixel space.
                x, y = int(x * width), int(y * height)
            else:
                x, y = int(x), int(y)
            return (x, y)

    # Bounding box such as "[0.1, 0.2, 0.8, 0.9]" or "[10, 20, 300, 400]".
    match_box = re.search(r"\[([\d\.,\s]+)\]", text)
    if match_box:
        try:
            nums = [float(part) for part in match_box.group(1).split(",")]
        except ValueError:
            # Empty field from a stray comma, or a malformed number.
            return None
        if len(nums) == 4:
            if max(nums) <= 1:
                # Normalized box -> pixel space (x scales by width, y by height).
                x1, y1, x2, y2 = (
                    int(nums[0] * width),
                    int(nums[1] * height),
                    int(nums[2] * width),
                    int(nums[3] * height),
                )
            else:
                x1, y1, x2, y2 = map(int, nums)
            return (x1, y1, x2, y2)

    return None
|
|
|
|
| |
| |
| |
def predict(image, prompt):
    """Run the model on a screenshot + instruction and report coordinates.

    *image* is an H x W x C numpy array from the Gradio widget (or None when
    nothing was uploaded). Returns ``(raw model text, coordinate summary)``.
    """
    if image is None:
        return "Upload image", "No coordinates"

    pil_img = Image.fromarray(image).convert("RGB")

    # NOTE(review): the prompt is passed verbatim; confirm whether this
    # checkpoint expects a chat-template-formatted prompt instead.
    model_inputs = processor(
        images=pil_img,
        text=prompt,
        return_tensors="pt",
    )

    with torch.no_grad():
        generated = model.generate(**model_inputs, max_new_tokens=150)

    decoded = processor.batch_decode(generated, skip_special_tokens=True)[0]

    # PIL's .size is already (width, height), which the parser expects.
    parsed = extract_coordinates(decoded, pil_img.size)
    if parsed:
        coord_text = f"{parsed} (origin: top-left, x→right, y↓)"
    else:
        coord_text = "No coordinates detected"

    return decoded, coord_text
|
|
|
|
| |
| |
| |
# Two-column UI: screenshot + instruction in, raw model text + parsed
# coordinates out. Blocking launch at the bottom starts the local server.
with gr.Blocks() as demo:
    gr.Markdown("# UI-TARS CPU Demo (Slow ⚠️)")

    with gr.Row():
        screenshot_in = gr.Image(type="numpy", label="Image")
        prompt_in = gr.Textbox(label="Prompt")

    run_btn = gr.Button("Run")

    model_out = gr.Textbox(label="Model Output")
    coords_out = gr.Textbox(label="Coordinates")

    run_btn.click(
        fn=predict,
        inputs=[screenshot_in, prompt_in],
        outputs=[model_out, coords_out],
    )

demo.launch()