import re

import torch
import gradio as gr
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

MODEL_ID = "ByteDance-Seed/UI-TARS-1.5-7B"

# ----------------------------
# Load model (CPU optimized)
# ----------------------------
# NOTE(review): UI-TARS is a vision-language model. AutoModelForCausalLM may
# not attach the vision tower on every transformers version — confirm whether
# AutoModelForVision2Seq (or the model-specific class) is required here.
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,  # CPU safe: fp16/bf16 kernels are unreliable on CPU
    low_cpu_mem_usage=True,
)
model.eval()


# ----------------------------
# Coordinate Extraction
# ----------------------------
def extract_coordinates(text, image_size):
    """Parse the first point ``(x, y)`` or box ``[x1, y1, x2, y2]`` in *text*.

    Values <= 1 are treated as normalized and scaled to *image_size*
    (``(width, height)``). Returns an int 2-tuple (point), an int 4-tuple
    (box), or ``None`` when nothing matches.
    """
    width, height = image_size

    # Point form "(x, y)" — checked first, so a point wins over a box.
    match = re.search(r"\(([\d\.]+),\s*([\d\.]+)\)", text)
    if match:
        x, y = float(match.group(1)), float(match.group(2))
        if x <= 1 and y <= 1:
            # Normalized coordinates: scale to pixel space.
            x, y = int(x * width), int(y * height)
        else:
            x, y = int(x), int(y)
        return (x, y)

    # Box form "[x1, y1, x2, y2]" — anything but exactly 4 numbers is ignored.
    match_box = re.search(r"\[([\d\.,\s]+)\]", text)
    if match_box:
        nums = list(map(float, match_box.group(1).split(",")))
        if len(nums) == 4:
            x1, y1, x2, y2 = nums
            if max(nums) <= 1:
                # Normalized box: scale x's by width, y's by height.
                x1, x2 = int(x1 * width), int(x2 * width)
                y1, y2 = int(y1 * height), int(y2 * height)
            else:
                x1, y1, x2, y2 = map(int, nums)
            return (x1, y1, x2, y2)

    return None


# ----------------------------
# Prediction
# ----------------------------
def predict(image, prompt):
    """Run the model on *image* (HxWx3 numpy array) with *prompt*.

    Returns ``(model_text, coordinate_description)`` for the two output
    textboxes. Missing image or empty prompt short-circuits without
    touching the model.
    """
    if image is None:
        return "Upload image", "No coordinates"
    if not prompt:
        return "Enter a prompt", "No coordinates"

    image_pil = Image.fromarray(image).convert("RGB")
    width, height = image_pil.size

    inputs = processor(
        images=image_pil,
        text=prompt,
        return_tensors="pt",
    )

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=150,
        )

    # Decode ONLY the newly generated tokens. Decoder-only models echo the
    # prompt in `output`; decoding it all would (a) show the prompt in the
    # result box and (b) let coordinates written in the prompt be falsely
    # extracted as the model's answer.
    prompt_len = inputs["input_ids"].shape[1]
    result = processor.batch_decode(
        output[:, prompt_len:], skip_special_tokens=True
    )[0]

    coords = extract_coordinates(result, (width, height))
    coord_text = (
        f"{coords} (origin: top-left, x→right, y↓)"
        if coords
        else "No coordinates detected"
    )
    return result, coord_text


# ----------------------------
# UI
# ----------------------------
with gr.Blocks() as demo:
    gr.Markdown("# UI-TARS CPU Demo (Slow ⚠️)")
    with gr.Row():
        image_input = gr.Image(type="numpy", label="Image")
        text_input = gr.Textbox(label="Prompt")
    btn = gr.Button("Run")
    output_text = gr.Textbox(label="Model Output")
    coord_output = gr.Textbox(label="Coordinates")
    btn.click(
        fn=predict,
        inputs=[image_input, text_input],
        outputs=[output_text, coord_output],
    )

demo.launch()