| import gradio as gr |
| import torch |
| from transformers import AutoProcessor, AutoModelForImageTextToText |
|
|
| |
| |
| |
# Hugging Face model repository to load.
model_id = "microsoft/GUI-Actor-Verifier-2B"

# The processor handles both image preprocessing and chat-template tokenization.
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

# Load the weights in full precision on the CPU; low_cpu_mem_usage streams
# checkpoint shards to keep peak RAM down while loading.
model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype=torch.float32,
    device_map="cpu",
    low_cpu_mem_usage=True,
)

# Inference only: disable dropout and other training-time behavior.
model.eval()
|
|
|
|
| |
| |
| |
def run_model(image, prompt):
    """Answer a question about an uploaded image with the loaded VLM.

    Args:
        image: PIL image from the Gradio component, or None if none uploaded.
        prompt: user question; falls back to a generic caption request when
            empty or whitespace-only.

    Returns:
        The decoded model response (whitespace-stripped), or an error
        message prefixed with "❌" — errors are returned rather than raised
        so the Gradio UI can display them.
    """
    try:
        if image is None:
            return "❌ Please upload an image."

        # Fall back to a generic prompt when the textbox is empty/blank.
        if not prompt or not prompt.strip():
            prompt = "Describe this image."

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        inputs = processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        )

        # Some processors return non-tensor entries (e.g. lists of image
        # grid sizes) alongside the tensors; calling .to() on those would
        # raise. Only move actual tensors to the CPU device.
        inputs = {
            k: v.to("cpu") if isinstance(v, torch.Tensor) else v
            for k, v in inputs.items()
        }

        # Greedy decoding, no gradients needed for inference.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=50,
                do_sample=False,
            )

        # Decode only the newly generated tokens (skip the prompt portion).
        new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
        result = processor.decode(new_tokens, skip_special_tokens=True)

        return result.strip()

    except Exception as e:
        # Surface the failure in the UI instead of crashing the app.
        return f"❌ Error: {str(e)}"
|
|
|
|
| |
| |
| |
# Wire the inference function into a simple Gradio UI.
image_input = gr.Image(type="pil", label="Upload Image")
question_input = gr.Textbox(label="Your Question")
answer_output = gr.Textbox(label="Model Output")

demo = gr.Interface(
    fn=run_model,
    inputs=[image_input, question_input],
    outputs=answer_output,
    title="GUI Actor Verifier (CPU Mode)",
    description="⚠️ Running on CPU — responses may be slow.",
)

demo.launch()