| import torch |
| import re |
| from transformers import AutoProcessor, AutoModelForCausalLM |
| from PIL import Image |
| import gradio as gr |
|
|
# Hugging Face Hub id of the checkpoint to load.
MODEL_ID = "ByteDance-Seed/UI-TARS-1.5-7B"


# Processor bundles the tokenizer and image preprocessor for this checkpoint.
processor = AutoProcessor.from_pretrained(MODEL_ID)


# CPU-only load: full fp32 weights, staged loading to keep peak RAM down.
# NOTE(review): UI-TARS is a vision-language model; AutoModelForCausalLM may
# not wire up the vision tower -- confirm AutoModelForVision2Seq (or the
# model's dedicated class) isn't required for image inputs.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True
)


# Inference only: switch off dropout / training-mode layers.
model.eval()
|
|
| |
| |
| |
def extract_coordinates(text, image_size):
    """Parse a point ``(x, y)`` or a box ``[x1, y1, x2, y2]`` from model text.

    Values that are all <= 1 are treated as normalized coordinates and scaled
    to pixels using *image_size* ``(width, height)``.

    Returns a 2-tuple for a point, a 4-tuple for a box, or ``None`` when
    nothing parseable is found. Never raises on malformed numbers (the regex
    character classes admit strings like ``"1.2.3"`` that ``float`` rejects).
    """
    width, height = image_size

    # Point such as "(0.52, 0.13)" or "(340, 120)".
    match = re.search(r"\(([\d\.]+),\s*([\d\.]+)\)", text)
    if match:
        try:
            x, y = float(match.group(1)), float(match.group(2))
        except ValueError:
            # Malformed number (e.g. "1.2.3") -- fall through to box parsing.
            pass
        else:
            if x <= 1 and y <= 1:
                # Normalized [0, 1] coordinates -> pixel space.
                x, y = int(x * width), int(y * height)
            else:
                x, y = int(x), int(y)
            return (x, y)

    # Bounding box such as "[0.1, 0.2, 0.8, 0.9]" or "[10, 20, 300, 400]".
    match_box = re.search(r"\[([\d\.,\s]+)\]", text)
    if match_box:
        try:
            nums = [float(part) for part in match_box.group(1).split(",")]
        except ValueError:
            # Empty field from a stray comma, or a malformed number.
            return None
        if len(nums) == 4:
            if max(nums) <= 1:
                # Normalized box -> pixel space (x scales by width, y by height).
                x1, y1, x2, y2 = (
                    int(nums[0] * width),
                    int(nums[1] * height),
                    int(nums[2] * width),
                    int(nums[3] * height),
                )
            else:
                x1, y1, x2, y2 = map(int, nums)
            return (x1, y1, x2, y2)

    return None
|
|
|
|
| |
| |
| |
def predict(image, prompt):
    """Run the model on a screenshot + instruction and report coordinates.

    *image* is an H x W x C numpy array from the Gradio widget (or None when
    nothing was uploaded). Returns ``(raw model text, coordinate summary)``.
    """
    if image is None:
        return "Upload image", "No coordinates"

    pil_img = Image.fromarray(image).convert("RGB")

    # NOTE(review): the prompt is passed verbatim; confirm whether this
    # checkpoint expects a chat-template-formatted prompt instead.
    model_inputs = processor(
        images=pil_img,
        text=prompt,
        return_tensors="pt",
    )

    with torch.no_grad():
        generated = model.generate(**model_inputs, max_new_tokens=150)

    decoded = processor.batch_decode(generated, skip_special_tokens=True)[0]

    # PIL's .size is already (width, height), which the parser expects.
    parsed = extract_coordinates(decoded, pil_img.size)
    if parsed:
        coord_text = f"{parsed} (origin: top-left, x→right, y↓)"
    else:
        coord_text = "No coordinates detected"

    return decoded, coord_text
|
|
|
|
| |
| |
| |
# Two-column UI: screenshot + instruction in, raw model text + parsed
# coordinates out. Blocking launch at the bottom starts the local server.
with gr.Blocks() as demo:
    gr.Markdown("# UI-TARS CPU Demo (Slow ⚠️)")

    with gr.Row():
        screenshot_in = gr.Image(type="numpy", label="Image")
        prompt_in = gr.Textbox(label="Prompt")

    run_btn = gr.Button("Run")

    model_out = gr.Textbox(label="Model Output")
    coords_out = gr.Textbox(label="Coordinates")

    run_btn.click(
        fn=predict,
        inputs=[screenshot_in, prompt_in],
        outputs=[model_out, coords_out],
    )

demo.launch()