import re

import torch
import gradio as gr
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

MODEL_ID = "ByteDance-Seed/UI-TARS-1.5-7B"

# ----------------------------
# Load model (CPU optimized)
# ----------------------------
# NOTE(review): UI-TARS is a vision-language model. AutoModelForCausalLM may
# not attach the vision tower on every transformers version — confirm whether
# AutoModelForVision2Seq (or the model-specific class) is required here.
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,  # CPU safe: fp16/bf16 kernels are unreliable on CPU
    low_cpu_mem_usage=True,
)
model.eval()


# ----------------------------
# Coordinate Extraction
# ----------------------------
def extract_coordinates(text, image_size):
    """Parse the first point ``(x, y)`` or box ``[x1, y1, x2, y2]`` in *text*.

    Values <= 1 are treated as normalized and scaled to *image_size*
    (``(width, height)``). Returns an int 2-tuple (point), an int 4-tuple
    (box), or ``None`` when nothing matches.
    """
    width, height = image_size

    # Point form "(x, y)" — checked first, so a point wins over a box.
    match = re.search(r"\(([\d\.]+),\s*([\d\.]+)\)", text)
    if match:
        x, y = float(match.group(1)), float(match.group(2))
        if x <= 1 and y <= 1:
            # Normalized coordinates: scale to pixel space.
            x, y = int(x * width), int(y * height)
        else:
            x, y = int(x), int(y)
        return (x, y)

    # Box form "[x1, y1, x2, y2]" — anything but exactly 4 numbers is ignored.
    match_box = re.search(r"\[([\d\.,\s]+)\]", text)
    if match_box:
        nums = list(map(float, match_box.group(1).split(",")))
        if len(nums) == 4:
            x1, y1, x2, y2 = nums
            if max(nums) <= 1:
                # Normalized box: scale x's by width, y's by height.
                x1, x2 = int(x1 * width), int(x2 * width)
                y1, y2 = int(y1 * height), int(y2 * height)
            else:
                x1, y1, x2, y2 = map(int, nums)
            return (x1, y1, x2, y2)

    return None


# ----------------------------
# Prediction
# ----------------------------
def predict(image, prompt):
    """Run the model on *image* (HxWx3 numpy array) with *prompt*.

    Returns ``(model_text, coordinate_description)`` for the two output
    textboxes. Missing image or empty prompt short-circuits without
    touching the model.
    """
    if image is None:
        return "Upload image", "No coordinates"
    if not prompt:
        return "Enter a prompt", "No coordinates"

    image_pil = Image.fromarray(image).convert("RGB")
    width, height = image_pil.size

    inputs = processor(
        images=image_pil,
        text=prompt,
        return_tensors="pt",
    )

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=150,
        )

    # Decode ONLY the newly generated tokens. Decoder-only models echo the
    # prompt in `output`; decoding it all would (a) show the prompt in the
    # result box and (b) let coordinates written in the prompt be falsely
    # extracted as the model's answer.
    prompt_len = inputs["input_ids"].shape[1]
    result = processor.batch_decode(
        output[:, prompt_len:], skip_special_tokens=True
    )[0]

    coords = extract_coordinates(result, (width, height))
    coord_text = (
        f"{coords} (origin: top-left, x→right, y↓)"
        if coords
        else "No coordinates detected"
    )
    return result, coord_text


# ----------------------------
# UI
# ----------------------------
with gr.Blocks() as demo:
    gr.Markdown("# UI-TARS CPU Demo (Slow ⚠️)")
    with gr.Row():
        image_input = gr.Image(type="numpy", label="Image")
        text_input = gr.Textbox(label="Prompt")
    btn = gr.Button("Run")
    output_text = gr.Textbox(label="Model Output")
    coord_output = gr.Textbox(label="Coordinates")
    btn.click(
        fn=predict,
        inputs=[image_input, text_input],
        outputs=[output_text, coord_output],
    )

demo.launch()