"""Gradio demo for microsoft/GUI-Actor-Verifier-2B, forced onto CPU.

Loads the image-text model once at import time, then serves a simple
"upload an image + ask a question" interface.  Generation is capped at a
small number of new tokens because CPU inference is slow.
"""

import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText

# =========================
# Load model (CPU optimized)
# =========================
model_id = "microsoft/GUI-Actor-Verifier-2B"

processor = AutoProcessor.from_pretrained(
    model_id,
    trust_remote_code=True,
)

model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype=torch.float32,  # CPU needs float32
    device_map="cpu",           # force CPU
    low_cpu_mem_usage=True,
)
model.eval()


# =========================
# Inference
# =========================
def run_model(image, prompt):
    """Answer *prompt* about the uploaded *image* using the loaded model.

    Parameters
    ----------
    image : PIL.Image.Image | None
        Image from the Gradio upload widget; ``None`` if nothing uploaded.
    prompt : str
        User question; a blank/whitespace prompt falls back to a default.

    Returns
    -------
    str
        The generated answer, or a "❌ ..."-prefixed message on bad input
        or any runtime failure (this function is the UI boundary, so it
        deliberately catches broadly and reports instead of raising).
    """
    try:
        if image is None:
            return "❌ Please upload an image."

        if not prompt or prompt.strip() == "":
            prompt = "Describe this image."

        # Chat-template message: one user turn with an image and a text part.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        inputs = processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        )

        # Move tensors to CPU explicitly (model is pinned to CPU above).
        inputs = {k: v.to("cpu") for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=50,  # IMPORTANT: keep small for CPU
                do_sample=False,
            )

        # Decode only the newly generated tokens, skipping the prompt echo.
        result = processor.decode(
            outputs[0][inputs["input_ids"].shape[-1]:],
            skip_special_tokens=True,
        )
        return result

    except Exception as e:
        return f"❌ Error: {str(e)}"


# =========================
# UI
# =========================
demo = gr.Interface(
    fn=run_model,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(label="Your Question"),
    ],
    outputs=gr.Textbox(label="Model Output"),
    title="GUI Actor Verifier (CPU Mode)",
    description="⚠️ Running on CPU — responses may be slow.",
)

# Guard the server launch so the module can be imported (e.g. for tests)
# without immediately starting Gradio.
if __name__ == "__main__":
    demo.launch()