Oculus / benchmark_vlm.py

Upload benchmark_vlm.py with huggingface_hub

d6e0b94 verified 3 months ago

18.5 kB

	#!/usr/bin/env python3
	"""
	Oculus VLM Benchmark Suite

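	Evaluates Oculus on VLM benchmarks modelled on those reported for Isaac and Moondream.
	Benchmarks:
	1. VQA v2 style (questions generated from COCO annotations)
	2. RefCOCO Grounding
	3. Counting (CountBench-style)
	4. COCO Detection (mAP)
	5. Captioning (BLEU) (listed here, but not currently run by run_all_benchmarks)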
	"""

	import os
	import sys
	import json
	import random
	import time
	from pathlib import Path
	from dataclasses import dataclass
	from typing import List, Dict, Optional
	from collections import defaultdict

	import numpy as np
	import torch
	from PIL import Image
	from tqdm import tqdm

	OCULUS_ROOT = Path(__file__).parent
	sys.path.insert(0, str(OCULUS_ROOT))

	from oculus_inference import OculusPredictor


	# ============================================================================
	# Benchmark Utilities
	# ============================================================================

	def compute_iou(box1, box2):
	    """Return the intersection-over-union of two ``[x1, y1, x2, y2]`` boxes.

	    Only indices 0-3 of each box are read. A small epsilon is added to the
	    denominator so that two zero-area boxes yield 0 instead of dividing by
	    zero.
	    """
	    # Corners of the overlap rectangle; inverted when the boxes are disjoint.
	    ix1 = max(box1[0], box2[0])
	    iy1 = max(box1[1], box2[1])
	    ix2 = min(box1[2], box2[2])
	    iy2 = min(box1[3], box2[3])

	    # Clamp at zero so disjoint boxes contribute no overlap.
	    overlap_w = max(0, ix2 - ix1)
	    overlap_h = max(0, iy2 - iy1)
	    inter = overlap_w * overlap_h

	    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
	    area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
	    union = area1 + area2 - inter

	    return inter / (union + 1e-8)


	# ============================================================================
	# Benchmark 1: VQA v2 Style
	# ============================================================================

	class VQABenchmark:
	    """Visual Question Answering benchmark using COCO-derived questions.

	    One question per image is synthesised from the COCO instance annotations
	    by picking a category annotated in that image and a random question
	    template. Only annotated categories are asked about, so every yes/no
	    question has the expected answer "yes".
	    """

	    def __init__(self, data_dir="data/coco", max_samples=200, seed=None):
	        """Build up to ``max_samples`` question/answer samples.

	        Args:
	            data_dir: Directory holding ``annotations/instances_train2017.json``
	                and an ``images/`` folder.
	            max_samples: Stop once this many samples have been collected.
	            seed: Optional seed for a private random generator, making the
	                sampled questions reproducible. When ``None`` the shared
	                module-level ``random`` state is used.

	        If the annotation file is missing, a warning is printed and
	        ``self.samples`` is left empty.
	        """
	        self.samples = []

	        # Load COCO annotations to generate VQA-style questions
	        ann_file = Path(data_dir) / "annotations" / "instances_train2017.json"

	        if not ann_file.exists():
	            print("⚠️ COCO annotations not found")
	            return

	        with open(ann_file, encoding="utf-8") as f:
	            instances = json.load(f)

	        rng = random.Random(seed) if seed is not None else random

	        cat_map = {c['id']: c['name'] for c in instances['categories']}
	        img_cats = defaultdict(set)
	        for ann in instances['annotations']:
	            img_cats[ann['image_id']].add(cat_map.get(ann['category_id'], 'object'))

	        # Generate VQA samples
	        images_dir = Path(data_dir) / "images"
	        for img in instances['images']:
	            img_path = images_dir / img['file_name']
	            if not img_path.exists():
	                continue

	            # Sort before choosing: set iteration order depends on string hash
	            # randomisation, which would defeat seeding.
	            cats = sorted(img_cats.get(img['id'], ()))
	            if not cats:
	                continue

	            cat = rng.choice(cats)

	            # Create different question types
	            templates = [
	                (f"Is there a {cat} in this image?", "yes"),
	                ("What type of object is visible?", cat),
	                (f"Does this image contain a {cat}?", "yes"),
	            ]

	            q, a = rng.choice(templates)
	            self.samples.append({
	                'path': str(img_path),
	                'question': q,
	                'answer': a.lower(),
	                'category': cat,
	            })

	            if len(self.samples) >= max_samples:
	                break

	        print(f" VQA: Loaded {len(self.samples)} samples")

	    def evaluate(self, model: "OculusPredictor") -> Dict:
	        """Run VQA evaluation.

	        An answer is correct when the expected answer occurs as a
	        case-insensitive substring of the model response. Samples whose
	        inference raises are scored as incorrect, so they stay in the
	        denominator, and are reported under ``'failed'``.

	        Returns:
	            Dict with ``accuracy``, ``correct``, ``total`` and ``failed``.
	        """
	        print("\n📊 VQA v2 Style Benchmark")
	        print("-" * 50)

	        correct, total, failed = self._score(model, tqdm(self.samples, desc="VQA"))

	        accuracy = correct / total if total > 0 else 0
	        print(f" Accuracy: {accuracy:.2%} ({correct}/{total})")
	        if failed:
	            print(f" ⚠️ {failed} sample(s) raised during inference and were scored as incorrect")

	        return {
	            'accuracy': float(accuracy),
	            'correct': correct,
	            'total': total,
	            'failed': failed,
	        }

	    @staticmethod
	    def _score(model, samples):
	        """Ask ``model`` every question in ``samples``; return (correct, total, failed)."""
	        correct = 0
	        total = 0
	        failed = 0

	        for sample in samples:
	            total += 1
	            try:
	                answer = model.ask(sample['path'], sample['question'])
	                # Check if expected answer is in response
	                hit = sample['answer'] in answer.lower()
	            except Exception:
	                # The model API is opaque here; keep the run going but count
	                # the sample instead of silently dropping it.
	                failed += 1
	                continue
	            if hit:
	                correct += 1

	        return correct, total, failed


	# ============================================================================
	# Benchmark 2: RefCOCO Grounding
	# ============================================================================

	class RefCOCOBenchmark:
	    """Referring Expression Grounding using COCO boxes.

	    Each sample pairs an image with the expression ``"the <category>"`` for
	    one randomly chosen annotation, plus that annotation's box converted from
	    COCO ``[x, y, w, h]`` pixels to ``[x1, y1, x2, y2]`` divided by the image
	    width/height.
	    """

	    def __init__(self, data_dir="data/coco", max_samples=100, seed=None):
	        """Build up to ``max_samples`` grounding samples.

	        Args:
	            data_dir: Directory holding ``annotations/instances_train2017.json``
	                and an ``images/`` folder.
	            max_samples: Stop once this many samples have been collected.
	            seed: Optional seed for a private random generator, making the
	                chosen objects reproducible. When ``None`` the shared
	                module-level ``random`` state is used.

	        If the annotation file is missing, a warning is printed and
	        ``self.samples`` is left empty.
	        """
	        self.samples = []

	        ann_file = Path(data_dir) / "annotations" / "instances_train2017.json"
	        if not ann_file.exists():
	            print("⚠️ COCO annotations not found")
	            return

	        with open(ann_file, encoding="utf-8") as f:
	            instances = json.load(f)

	        rng = random.Random(seed) if seed is not None else random

	        cat_map = {c['id']: c['name'] for c in instances['categories']}
	        img_map = {img['id']: img for img in instances['images']}

	        # Group annotations by image
	        img_anns = defaultdict(list)
	        for ann in instances['annotations']:
	            img_anns[ann['image_id']].append(ann)

	        images_dir = Path(data_dir) / "images"
	        for img_id, anns in img_anns.items():
	            img = img_map.get(img_id)
	            if not img:
	                continue

	            img_path = images_dir / img['file_name']
	            if not img_path.exists():
	                continue

	            # Pick a random object
	            ann = rng.choice(anns)
	            cat = cat_map.get(ann['category_id'], 'object')

	            # Normalize bbox
	            x, y, w, h = ann['bbox']
	            box = [
	                x / img['width'],
	                y / img['height'],
	                (x + w) / img['width'],
	                (y + h) / img['height'],
	            ]

	            self.samples.append({
	                'path': str(img_path),
	                'expression': f"the {cat}",
	                'gt_box': box,
	            })

	            if len(self.samples) >= max_samples:
	                break

	        print(f" RefCOCO: Loaded {len(self.samples)} samples")

	    def evaluate(self, model: "OculusPredictor") -> Dict:
	        """Run grounding evaluation.

	        Every sample contributes one IoU value; a sample with no predicted
	        box, or whose inference raises, contributes 0.

	        Returns:
	            Dict with ``mean_iou``, ``accuracy_50`` (share of samples with
	            IoU >= 0.5), ``num_samples`` and ``failed``.
	        """
	        print("\n📊 RefCOCO Grounding Benchmark")
	        print("-" * 50)

	        ious, acc_50, failed = self._score(model, tqdm(self.samples, desc="RefCOCO"))

	        mean_iou = np.mean(ious) if ious else 0
	        accuracy = acc_50 / len(self.samples) if self.samples else 0

	        print(f" Mean IoU: {mean_iou:.4f}")
	        print(f" Acc@0.5: {accuracy:.2%}")
	        if failed:
	            print(f" ⚠️ {failed} sample(s) raised during inference and were scored as IoU 0")

	        return {
	            'mean_iou': float(mean_iou),
	            'accuracy_50': float(accuracy),
	            'num_samples': len(self.samples),
	            'failed': failed,
	        }

	    @staticmethod
	    def _score(model, samples):
	        """Ground every sample; return (per-sample IoUs, hits at IoU >= 0.5, failures)."""
	        ious = []
	        hits = 0
	        failed = 0

	        for sample in samples:
	            iou = 0
	            try:
	                results = model.detect(sample['path'], prompt=f"Find {sample['expression']}")
	                if len(results['boxes']) > 0:
	                    # Take the first box; assumes the model returns boxes
	                    # sorted by confidence and in the same normalised
	                    # [x1, y1, x2, y2] format as gt_box -- TODO confirm.
	                    iou = compute_iou(sample['gt_box'], results['boxes'][0])
	            except Exception:
	                # Not a bare ``except``: Ctrl-C and SystemExit must still be
	                # able to stop the run.
	                failed += 1
	                iou = 0

	            ious.append(iou)
	            if iou >= 0.5:
	                hits += 1

	        return ious, hits, failed


	# ============================================================================
	# Benchmark 3: Counting (CountBench Style)
	# ============================================================================

	class CountBenchmark:
	    """Object counting benchmark.

	    Each sample is an image together with one category that has between 2
	    and 10 non-crowd annotations in that image; the model is asked how many
	    such objects are present.
	    """

	    # Number words recognised when the model spells the count out.
	    _WORD_TO_NUM = {'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4,
	                    'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10}

	    # Plurals that the suffix rules in _pluralize would get wrong.
	    _IRREGULAR_PLURALS = {
	        'person': 'people',
	        'mouse': 'mice',
	        'knife': 'knives',
	        'sheep': 'sheep',
	        'skis': 'skis',
	        'scissors': 'scissors',
	        'broccoli': 'broccoli',
	    }

	    # Characters stripped from both ends of a token before parsing it.
	    _PUNCTUATION = ".,;:!?()[]{}\"'*"

	    def __init__(self, data_dir="data/coco", max_samples=100):
	        """Build up to ``max_samples`` counting samples.

	        Args:
	            data_dir: Directory holding ``annotations/instances_train2017.json``
	                and an ``images/`` folder.
	            max_samples: Stop once this many samples have been collected.

	        If the annotation file is missing, a warning is printed and
	        ``self.samples`` is left empty.
	        """
	        self.samples = []

	        ann_file = Path(data_dir) / "annotations" / "instances_train2017.json"
	        if not ann_file.exists():
	            print("⚠️ COCO annotations not found")
	            return

	        with open(ann_file, encoding="utf-8") as f:
	            instances = json.load(f)

	        cat_map = {c['id']: c['name'] for c in instances['categories']}
	        img_map = {img['id']: img for img in instances['images']}

	        # Count objects per image per category
	        img_counts = defaultdict(lambda: defaultdict(int))
	        for ann in instances['annotations']:
	            if not ann.get('iscrowd', 0):
	                cat = cat_map.get(ann['category_id'], 'object')
	                img_counts[ann['image_id']][cat] += 1

	        images_dir = Path(data_dir) / "images"
	        for img_id, counts in img_counts.items():
	            img = img_map.get(img_id)
	            if not img:
	                continue

	            img_path = images_dir / img['file_name']
	            if not img_path.exists():
	                continue

	            # Pick category with 2-10 objects (reasonable counting range);
	            # only the first qualifying category per image is used.
	            for cat, count in counts.items():
	                if 2 <= count <= 10:
	                    self.samples.append({
	                        'path': str(img_path),
	                        'category': cat,
	                        'count': count,
	                    })
	                    break

	            if len(self.samples) >= max_samples:
	                break

	        print(f" CountBench: Loaded {len(self.samples)} samples")

	    @classmethod
	    def _pluralize(cls, noun):
	        """Return the English plural of ``noun``, inflecting only its last word."""
	        prefix, sep, word = noun.rpartition(' ')
	        irregular = cls._IRREGULAR_PLURALS.get(word)
	        if irregular is not None:
	            plural = irregular
	        elif word.endswith(('s', 'x', 'z', 'ch', 'sh')):
	            plural = word + 'es'
	        elif len(word) > 1 and word.endswith('y') and word[-2] not in 'aeiou':
	            plural = word[:-1] + 'ies'
	        else:
	            plural = word + 's'
	        return prefix + sep + plural

	    @classmethod
	    def _parse_count(cls, answer) -> Optional[int]:
	        """Return the first number (digits or a word zero-ten) in ``answer``, else None."""
	        for token in answer.lower().split():
	            # "3." or "three," must still parse, so drop surrounding punctuation.
	            token = token.strip(cls._PUNCTUATION)
	            try:
	                return int(token)
	            except ValueError:
	                if token in cls._WORD_TO_NUM:
	                    return cls._WORD_TO_NUM[token]
	        return None

	    def evaluate(self, model: "OculusPredictor") -> Dict:
	        """Run counting evaluation.

	        Accuracies are computed over all samples, so an unparseable or failed
	        answer counts as wrong. MAE is computed only over the answers that
	        could be parsed.

	        Returns:
	            Dict with ``exact_accuracy``, ``within_one_accuracy``, ``mae``,
	            ``total``, ``parsed`` (answers containing a number) and
	            ``failed`` (samples whose inference raised).
	        """
	        print("\n📊 CountBench Benchmark")
	        print("-" * 50)

	        exact = 0
	        within_one = 0
	        errors = []
	        failed = 0

	        for sample in tqdm(self.samples, desc="Counting"):
	            question = (
	                f"How many {self._pluralize(sample['category'])} are in this image? "
	                "Answer with a number."
	            )
	            try:
	                pred = self._parse_count(model.ask(sample['path'], question))
	            except Exception:
	                # Not a bare ``except``: Ctrl-C must still stop the run.
	                failed += 1
	                continue

	            if pred is None:
	                continue

	            gt = sample['count']
	            if pred == gt:
	                exact += 1
	            if abs(pred - gt) <= 1:
	                within_one += 1
	            errors.append(abs(pred - gt))

	        total = len(self.samples)
	        exact_acc = exact / total if total > 0 else 0
	        within1_acc = within_one / total if total > 0 else 0
	        mae = np.mean(errors) if errors else 0

	        print(f" Exact Accuracy: {exact_acc:.2%}")
	        print(f" Within-1 Accuracy: {within1_acc:.2%}")
	        print(f" MAE: {mae:.2f}")
	        if failed:
	            print(f" ⚠️ {failed} sample(s) raised during inference and were scored as incorrect")

	        return {
	            'exact_accuracy': float(exact_acc),
	            'within_one_accuracy': float(within1_acc),
	            'mae': float(mae),
	            'total': total,
	            'parsed': len(errors),
	            'failed': failed,
	        }


	# ============================================================================
	# Benchmark 4: COCO Detection (mAP)
	# ============================================================================

	class DetectionBenchmark:
	    """Object Detection benchmark.

	    Ground-truth boxes are COCO ``[x, y, w, h]`` boxes converted to
	    ``[x1, y1, x2, y2]`` and divided by the image width/height. Labels are
	    the position of the annotation's category in the annotation file's
	    ``categories`` list.
	    """

	    def __init__(self, data_dir="data/coco", max_samples=100):
	        """Build up to ``max_samples`` detection samples.

	        Args:
	            data_dir: Directory holding ``annotations/instances_train2017.json``
	                and an ``images/`` folder.
	            max_samples: Stop once this many samples have been collected.

	        If the annotation file is missing, a warning is printed and
	        ``self.samples`` is left empty.
	        """
	        self.samples = []

	        ann_file = Path(data_dir) / "annotations" / "instances_train2017.json"
	        if not ann_file.exists():
	            print("⚠️ COCO annotations not found")
	            return

	        with open(ann_file, encoding="utf-8") as f:
	            instances = json.load(f)

	        cat_idx = {c['id']: i for i, c in enumerate(instances['categories'])}
	        img_map = {img['id']: img for img in instances['images']}

	        img_anns = defaultdict(list)
	        for ann in instances['annotations']:
	            img_anns[ann['image_id']].append(ann)

	        images_dir = Path(data_dir) / "images"
	        for img_id, anns in img_anns.items():
	            img = img_map.get(img_id)
	            if not img:
	                continue

	            img_path = images_dir / img['file_name']
	            if not img_path.exists():
	                continue

	            boxes = []
	            labels = []
	            for ann in anns:
	                if 'bbox' not in ann:
	                    continue
	                x, y, w, h = ann['bbox']
	                boxes.append([
	                    x / img['width'],
	                    y / img['height'],
	                    (x + w) / img['width'],
	                    (y + h) / img['height'],
	                ])
	                # Unknown category ids fall back to index 0.
	                labels.append(cat_idx.get(ann['category_id'], 0))

	            if boxes:
	                self.samples.append({
	                    'path': str(img_path),
	                    'boxes': boxes,
	                    'labels': labels,
	                })

	            if len(self.samples) >= max_samples:
	                break

	        print(f" Detection: Loaded {len(self.samples)} samples")

	    def evaluate(self, model: "OculusPredictor") -> Dict:
	        """Run detection evaluation.

	        For every ground-truth box the prediction with the highest IoU is
	        found. ``mean_iou`` averages those best IoUs. ``map_50`` is the share
	        of ground-truth boxes whose best-IoU prediction has IoU >= 0.5 and
	        the same label; it is a per-box hit rate, not a precision-recall
	        integrated mAP. When inference raises for an image, all of its
	        ground-truth boxes are scored as missed instead of being dropped.

	        Returns:
	            Dict with ``mean_iou``, ``map_50``, ``num_samples`` and ``failed``.
	        """
	        print("\n📊 COCO Detection Benchmark")
	        print("-" * 50)

	        all_ious = []
	        all_correct = []
	        failed = 0

	        for sample in tqdm(self.samples, desc="Detection"):
	            try:
	                results = model.detect(sample['path'])
	                pred_boxes = results['boxes']
	                # assumes model labels are integer-convertible indices in the
	                # same order as the annotation file's categories -- TODO confirm
	                pred_labels = [int(label) for label in results['labels']]
	                ious, correct = self._match(
	                    sample['boxes'], sample['labels'], pred_boxes, pred_labels
	                )
	            except Exception:
	                # Not a bare ``except``: Ctrl-C must still stop the run.
	                failed += 1
	                ious = [0] * len(sample['boxes'])
	                correct = [False] * len(sample['boxes'])

	            all_ious.extend(ious)
	            all_correct.extend(correct)

	        mean_iou = np.mean(all_ious) if all_ious else 0
	        accuracy = np.mean(all_correct) if all_correct else 0

	        print(f" Mean IoU: {mean_iou:.4f}")
	        print(f" mAP@0.5: {accuracy:.4f}")
	        if failed:
	            print(f" ⚠️ {failed} image(s) raised during inference; their boxes were scored as missed")

	        return {
	            'mean_iou': float(mean_iou),
	            'map_50': float(accuracy),
	            'num_samples': len(self.samples),
	            'failed': failed,
	        }

	    @staticmethod
	    def _match(gt_boxes, gt_labels, pred_boxes, pred_labels):
	        """Return (best IoU, hit flag) lists with one entry per ground-truth box."""
	        ious = []
	        hits = []

	        for gt_box, gt_label in zip(gt_boxes, gt_labels):
	            best_iou = 0
	            correct = False

	            for pred_box, pred_label in zip(pred_boxes, pred_labels):
	                iou = compute_iou(gt_box, list(pred_box))
	                if iou > best_iou:
	                    best_iou = iou
	                    # Only the best-overlapping prediction decides the hit.
	                    correct = (iou >= 0.5) and (pred_label == gt_label)

	            ious.append(best_iou)
	            hits.append(correct)

	        return ious, hits


	# ============================================================================
	# Main Runner
	# ============================================================================

	def run_all_benchmarks():
	    """Run every benchmark against one model instance and save the scores.

	    Loads the model once, then builds and evaluates each benchmark in turn
	    (VQA, RefCOCO, counting, detection), prints a summary table and writes
	    the collected result dicts to ``benchmark_results.json`` next to this
	    file.

	    Returns:
	        Dict mapping benchmark key to the dict returned by its ``evaluate``.
	    """
	    rule = "=" * 60

	    print(rule)
	    print("🔮 OCULUS VLM BENCHMARK SUITE")
	    print(rule)

	    # Initialize model
	    print("\n[Loading Oculus Model]")
	    model = OculusPredictor()

	    # Run benchmarks
	    print("\n[Running Benchmarks]")

	    # (result key, benchmark class, sample cap), executed in this order.
	    suite = (
	        ('vqa_v2', VQABenchmark, 200),
	        ('refcoco', RefCOCOBenchmark, 100),
	        ('countbench', CountBenchmark, 100),
	        ('coco_detection', DetectionBenchmark, 100),
	    )

	    results = {}
	    for key, benchmark_cls, cap in suite:
	        # Each dataset is built right before it is evaluated.
	        benchmark = benchmark_cls(max_samples=cap)
	        results[key] = benchmark.evaluate(model)

	    # Summary
	    print("\n" + rule)
	    print("📊 BENCHMARK RESULTS SUMMARY")
	    print(rule)

	    print(f"""
	╔═══════════════════════════════════════════════════════════╗
	║ OCULUS BENCHMARKS ║
	╠═══════════════════════════════════════════════════════════╣
	║ VQA v2 (Style) ║
	║ Accuracy: {results['vqa_v2']['accuracy']:.2%} ║
	╠═══════════════════════════════════════════════════════════╣
	║ RefCOCO Grounding ║
	║ Mean IoU: {results['refcoco']['mean_iou']:.4f} ║
	║ Acc@0.5: {results['refcoco']['accuracy_50']:.2%} ║
	╠═══════════════════════════════════════════════════════════╣
	║ CountBench ║
	║ Exact Accuracy: {results['countbench']['exact_accuracy']:.2%} ║
	║ Within-1 Acc: {results['countbench']['within_one_accuracy']:.2%} ║
	╠═══════════════════════════════════════════════════════════╣
	║ COCO Detection ║
	║ Mean IoU: {results['coco_detection']['mean_iou']:.4f} ║
	║ mAP@0.5: {results['coco_detection']['map_50']:.4f} ║
	╚═══════════════════════════════════════════════════════════╝
	""")

	    # Save results
	    output_path = OCULUS_ROOT / "benchmark_results.json"
	    with output_path.open("w") as handle:
	        json.dump(results, handle, indent=2)
	    print(f"💾 Results saved to: {output_path}")

	    return results


	# Script entry point: run the full benchmark suite when this file is executed directly.
	if __name__ == "__main__":
	run_all_benchmarks()