| import os |
| import requests |
| from huggingface_hub import login, hf_hub_url |
| from datasets import load_dataset |
| from PIL import Image |
| from io import BytesIO |
| import gradio as gr |
| from transformers import pipeline |
|
|
| |
# Authenticate against the Hugging Face Hub at import time.
# Requires HF_TOKEN in the environment; raises KeyError if it is missing.
login(token=os.environ["HF_TOKEN"])
|
|
| |
def resolve_image_url(path):
    """Return the Hub download URL for *path* inside the Jize1/GTA dataset repo."""
    location = {
        "repo_id": "Jize1/GTA",
        "filename": path,
        "repo_type": "dataset",
    }
    return hf_hub_url(**location)
|
|
| |
def download_image(url):
    """Download an image over HTTP and return it as an RGB PIL image.

    The request is authenticated with the HF_TOKEN environment variable.

    Raises:
        requests.HTTPError: on a non-2xx response (e.g. 401/404), instead of
            letting PIL fail opaquely on the error page's bytes.
        requests.Timeout: if the server does not respond within 30 seconds,
            instead of hanging the evaluation loop indefinitely.
    """
    headers = {"Authorization": f"Bearer {os.environ['HF_TOKEN']}"}
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()  # fail fast rather than feeding an HTML error page to PIL
    image = Image.open(BytesIO(response.content)).convert("RGB")
    return image
|
|
| |
# --- Module-level loading (runs once at import) -----------------------------
print("Loading GTA dataset...")
# `use_auth_token` was deprecated and later removed from `datasets`;
# `token=True` is the supported, behaviorally identical replacement.
gta_data = load_dataset("Jize1/GTA", split="train", token=True)

print("Loading vision models...")
image_captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
# NOTE(review): this checkpoint is an image-classification model, not a true
# OCR model, and `ocr_pipeline` is never called in this file (the OCR tool
# branch returns a dummy string) — confirm whether it is still needed.
ocr_pipeline = pipeline("image-classification", model="microsoft/dit-base-finetuned-iiit5k")
|
|
def evaluate_model(model_name):
    """Run a simulated GTA-style evaluation over up to 10 dataset examples.

    For each example, real or dummy tool calls (captioning, OCR, counting)
    are replayed and simple accuracy counters are accumulated.

    Args:
        model_name: display name of the model being "evaluated". Currently a
            placeholder — the simulation does not actually query this model.

    Returns:
        dict with ``InstAcc`` / ``ToolAcc`` / ``SummAcc`` percentages, rounded
        to 2 decimals. All zeros when no example could be scored (instead of
        raising ZeroDivisionError).
    """
    total = 0
    inst_acc = 0
    tool_acc = 0
    summ_acc = 0

    # Guard: select(range(10)) raises on splits with fewer than 10 rows.
    sample_size = min(10, len(gta_data))
    for example in gta_data.select(range(sample_size)):
        dialogs = example["dialogs"]
        gt_answer = example["gt_answer"]

        user_query = dialogs[0]["content"]  # currently unused; kept for future real inference
        files = example["files"]
        tool_calls = [d for d in dialogs if d.get("tool_calls")]

        if not files:
            # No attached image — nothing to run vision tools on; skip
            # rather than crash with IndexError on files[0].
            continue
        image_path = files[0]["path"]
        image_url = resolve_image_url(image_path)
        image = download_image(image_url)

        # Replay each tool call against the (partly dummy) tool backends.
        result = ""
        for tool_call in tool_calls:
            tool = tool_call["tool_calls"][0]["function"]["name"]
            if tool == "ImageDescription":
                caption = image_captioner(image)[0]["generated_text"]
                result += f"[Caption] {caption}\n"
            elif tool == "OCR":
                result += f"[OCR] dummy OCR result for {image_path}\n"
            elif tool == "CountGivenObject":
                result += f"[Count] dummy count result\n"

        inst_acc += 1  # simulated: every scored example counts as instruction-followed
        tool_acc += 1 if len(tool_calls) > 0 else 0
        # Guard: gt_answer may be None or lack "whitelist" on some examples.
        summ_acc += 1 if (gt_answer or {}).get("whitelist") else 0
        total += 1

    if total == 0:
        # Empty split or every example skipped — report zeros, don't divide by 0.
        return {"InstAcc": 0.0, "ToolAcc": 0.0, "SummAcc": 0.0}

    return {
        "InstAcc": round(inst_acc / total * 100, 2),
        "ToolAcc": round(tool_acc / total * 100, 2),
        "SummAcc": round(summ_acc / total * 100, 2),
    }
|
|
|
|
def run_evaluation(model_name):
    """Evaluate *model_name* and format the metrics as a display string."""
    metrics = evaluate_model(model_name)
    metric_lines = [f"{key}: {value}%" for key, value in metrics.items()]
    header = f"Results for {model_name}:\n"
    return header + "\n".join(metric_lines)
|
|
| |
# Assemble the Gradio UI: one textbox in, one textbox out.
model_input = gr.Textbox(label="Hugging Face Model Name", placeholder="e.g. Qwen/Qwen2.5-3B")
metrics_output = gr.Textbox(label="GTA Evaluation Metrics")

demo = gr.Interface(
    fn=run_evaluation,
    inputs=model_input,
    outputs=metrics_output,
    title="GTA LLM Evaluation",
    description="Enter a model name from Hugging Face to simulate tool use and get GTA-style metrics.",
    allow_flagging="never",
)

demo.launch()