"""Gradio demo for microsoft/GUI-Actor-Verifier-2B, forced onto CPU.

Loads the image-text model once at import time, then serves a simple
"upload an image + ask a question" interface.  Generation is capped at a
small number of new tokens because CPU inference is slow.
"""

import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText

# =========================
# Load model (CPU optimized)
# =========================
model_id = "microsoft/GUI-Actor-Verifier-2B"

processor = AutoProcessor.from_pretrained(
    model_id,
    trust_remote_code=True,
)

model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype=torch.float32,  # CPU needs float32
    device_map="cpu",           # force CPU
    low_cpu_mem_usage=True,
)
model.eval()


# =========================
# Inference
# =========================
def run_model(image, prompt):
    """Answer *prompt* about the uploaded *image* using the loaded model.

    Parameters
    ----------
    image : PIL.Image.Image | None
        Image from the Gradio upload widget; ``None`` if nothing uploaded.
    prompt : str
        User question; a blank/whitespace prompt falls back to a default.

    Returns
    -------
    str
        The generated answer, or a "❌ ..."-prefixed message on bad input
        or any runtime failure (this function is the UI boundary, so it
        deliberately catches broadly and reports instead of raising).
    """
    try:
        if image is None:
            return "❌ Please upload an image."

        if not prompt or prompt.strip() == "":
            prompt = "Describe this image."

        # Chat-template message: one user turn with an image and a text part.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        inputs = processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        )

        # Move tensors to CPU explicitly (model is pinned to CPU above).
        inputs = {k: v.to("cpu") for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=50,  # IMPORTANT: keep small for CPU
                do_sample=False,
            )

        # Decode only the newly generated tokens, skipping the prompt echo.
        result = processor.decode(
            outputs[0][inputs["input_ids"].shape[-1]:],
            skip_special_tokens=True,
        )
        return result

    except Exception as e:
        return f"❌ Error: {str(e)}"


# =========================
# UI
# =========================
demo = gr.Interface(
    fn=run_model,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(label="Your Question"),
    ],
    outputs=gr.Textbox(label="Model Output"),
    title="GUI Actor Verifier (CPU Mode)",
    description="⚠️ Running on CPU — responses may be slow.",
)

# Guard the server launch so the module can be imported (e.g. for tests)
# without immediately starting Gradio.
if __name__ == "__main__":
    demo.launch()