# app.py — UI-TARS CPU demo Space (author: dpv007, commit 68be95b)
import torch
import re
from transformers import AutoProcessor, AutoModelForCausalLM
from PIL import Image
import gradio as gr
# ----------------------------
# Model loading (CPU-only configuration)
# ----------------------------
MODEL_ID = "ByteDance-Seed/UI-TARS-1.5-7B"

# Keyword arguments chosen for CPU inference: full float32 precision
# (half precision is unsafe/unsupported on many CPUs) and reduced peak
# RAM during weight loading.
_LOAD_KWARGS = {
    "torch_dtype": torch.float32,
    "low_cpu_mem_usage": True,
}

processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **_LOAD_KWARGS)
model.eval()  # inference mode: disables dropout/batch-norm updates
# ----------------------------
# Coordinate Extraction
# ----------------------------
def extract_coordinates(text, image_size):
    """Extract the first point or bounding box mentioned in model output.

    Looks first for a point "(x, y)", then for a 4-number box "[x1, y1, x2, y2]".
    Values that are all <= 1 are treated as normalized and scaled to pixels.

    Args:
        text: Decoded model output to scan.
        image_size: (width, height) of the source image in pixels.

    Returns:
        (x, y) for a point, (x1, y1, x2, y2) for a box, or None when no
        well-formed coordinates are found.
    """
    width, height = image_size

    # Point form "(x, y)". NOTE: the character class [\d\.]+ also matches a
    # bare "." — float() would raise ValueError on it, so parsing is guarded
    # and a malformed match falls through to the box check instead of crashing.
    match = re.search(r"\(([\d\.]+),\s*([\d\.]+)\)", text)
    if match:
        try:
            x, y = float(match.group(1)), float(match.group(2))
        except ValueError:
            x = y = None
        if x is not None:
            if x <= 1 and y <= 1:
                # Normalized coordinates → scale to pixel space.
                x = int(x * width)
                y = int(y * height)
            else:
                x, y = int(x), int(y)
            return (x, y)

    # Box form "[x1, y1, x2, y2]". Skip empty tokens (e.g. trailing comma in
    # "[1, 2, ]") and reject any token float() cannot parse.
    match_box = re.search(r"\[([\d\.,\s]+)\]", text)
    if match_box:
        tokens = [t for t in match_box.group(1).split(",") if t.strip()]
        try:
            nums = [float(t) for t in tokens]
        except ValueError:
            return None
        if len(nums) == 4:
            x1, y1, x2, y2 = nums
            if max(nums) <= 1:
                # Normalized box → scale each corner to pixel space.
                x1, x2 = int(x1 * width), int(x2 * width)
                y1, y2 = int(y1 * height), int(y2 * height)
            else:
                x1, y1, x2, y2 = map(int, nums)
            return (x1, y1, x2, y2)

    return None
# ----------------------------
# Prediction
# ----------------------------
def predict(image, prompt):
    """Run UI-TARS on an uploaded screenshot plus a text prompt.

    Args:
        image: numpy array from the Gradio image widget, or None.
        prompt: free-form instruction for the model.

    Returns:
        (model_output_text, coordinate_description) strings for the two
        Gradio output textboxes.
    """
    if image is None:
        return "Upload image", "No coordinates"

    pil_img = Image.fromarray(image).convert("RGB")
    dims = pil_img.size  # (width, height), used to scale normalized coords

    batch = processor(
        images=pil_img,
        text=prompt,
        return_tensors="pt",
    )

    # Greedy-ish default generation; no_grad avoids building autograd state.
    with torch.no_grad():
        generated = model.generate(**batch, max_new_tokens=150)

    decoded = processor.batch_decode(generated, skip_special_tokens=True)[0]

    coords = extract_coordinates(decoded, dims)
    if coords:
        coord_text = f"{coords} (origin: top-left, x→right, y↓)"
    else:
        coord_text = "No coordinates detected"
    return decoded, coord_text
# ----------------------------
# Gradio interface
# ----------------------------
with gr.Blocks() as demo:
    gr.Markdown("# UI-TARS CPU Demo (Slow ⚠️)")

    # Input widgets side by side: screenshot upload + instruction text.
    with gr.Row():
        screenshot = gr.Image(type="numpy", label="Image")
        instruction = gr.Textbox(label="Prompt")

    run_button = gr.Button("Run")
    model_output = gr.Textbox(label="Model Output")
    coords_display = gr.Textbox(label="Coordinates")

    run_button.click(
        fn=predict,
        inputs=[screenshot, instruction],
        outputs=[model_output, coords_display],
    )

demo.launch()