| |
| """ |
| Oculus VLM Benchmark Suite |
| |
| Evaluates Oculus on industry-standard VLM benchmarks similar to Isaac/Moondream. |
| Benchmarks: |
| 1. VQA v2 (subset) |
| 2. RefCOCO Grounding |
| 3. Counting (CVBench-style) |
| 4. COCO Detection (mAP) |
| 5. Captioning (BLEU) |
| """ |
|
|
| import os |
| import sys |
| import json |
| import random |
| import time |
| from pathlib import Path |
| from dataclasses import dataclass |
| from typing import List, Dict, Optional |
| from collections import defaultdict |
|
|
| import numpy as np |
| import torch |
| from PIL import Image |
| from tqdm import tqdm |
|
|
# Directory that contains this script; the results JSON is written here.
OCULUS_ROOT = Path(__file__).parent
# Put that directory first on the import path so modules located next to this
# script (presumably `oculus_inference`, imported below) can be found.
sys.path.insert(0, str(OCULUS_ROOT))
|
|
| from oculus_inference import OculusPredictor |
|
|
|
|
| |
| |
| |
|
|
def compute_iou(box1, box2):
    """Compute the intersection-over-union of two axis-aligned boxes.

    Args:
        box1: Box given as ``[x1, y1, x2, y2]``.
        box2: Box in the same format and coordinate system as ``box1``.

    Returns:
        The IoU as a float in ``[0.0, 1.0]``. ``0.0`` is returned when the
        boxes do not overlap or when their union has no area.
    """
    inter_w = max(0.0, min(box1[2], box2[2]) - max(box1[0], box2[0]))
    inter_h = max(0.0, min(box1[3], box2[3]) - max(box1[1], box2[1]))
    inter = inter_w * inter_h

    # Clamp each side so a malformed box (x2 < x1 or y2 < y1) counts as empty
    # instead of contributing a negative or spuriously positive area.
    area1 = max(0.0, box1[2] - box1[0]) * max(0.0, box1[3] - box1[1])
    area2 = max(0.0, box2[2] - box2[0]) * max(0.0, box2[3] - box2[1])

    union = area1 + area2 - inter
    # Guard the division explicitly rather than adding an epsilon: an epsilon
    # pushes every score down, so identical boxes would score below 1.0 and an
    # exact 0.5 overlap would fail a ">= 0.5" threshold.
    if union <= 0:
        return 0.0
    return float(inter / union)
|
|
|
|
| |
| |
| |
|
|
class VQABenchmark:
    """Visual Question Answering benchmark using COCO-derived questions.

    Every sample pairs an image with one templated question about a category
    annotated in that image. Two of the three templates expect the answer
    "yes"; the third expects the category name.
    """

    def __init__(self, data_dir="data/coco", max_samples=200, seed: Optional[int] = None):
        """Build up to ``max_samples`` question/answer samples.

        Args:
            data_dir: Directory holding ``annotations/instances_train2017.json``
                and an ``images/`` folder.
            max_samples: Upper bound on the number of samples; values <= 0
                produce an empty benchmark.
            seed: Optional seed making category/template selection
                reproducible. When ``None`` the global ``random`` module is
                used, so callers that seed it themselves keep working.
        """
        self.samples = []
        rng = random.Random(seed) if seed is not None else random

        ann_file = Path(data_dir) / "annotations" / "instances_train2017.json"
        if not ann_file.exists():
            print("โ ๏ธ COCO annotations not found")
            return

        with open(ann_file, encoding="utf-8") as f:
            instances = json.load(f)

        cat_map = {c['id']: c['name'] for c in instances['categories']}

        # Category names present in each image.
        img_cats = defaultdict(set)
        for ann in instances['annotations']:
            img_cats[ann['image_id']].add(cat_map.get(ann['category_id'], 'object'))

        for img in instances['images']:
            # Checked before appending so that max_samples=0 yields no samples.
            if len(self.samples) >= max_samples:
                break

            img_path = Path(data_dir) / "images" / img['file_name']
            if not img_path.exists():
                continue

            # Sorted because set order varies between interpreter runs, which
            # would defeat a fixed seed.
            cats = sorted(img_cats.get(img['id'], ()))
            if not cats:
                continue

            cat = rng.choice(cats)
            templates = [
                (f"Is there a {cat} in this image?", "yes"),
                ("What type of object is visible?", cat),
                (f"Does this image contain a {cat}?", "yes"),
            ]
            question, expected = rng.choice(templates)

            self.samples.append({
                'path': str(img_path),
                'question': question,
                'answer': expected.lower(),
                'category': cat,
            })

        print(f" VQA: Loaded {len(self.samples)} samples")

    def evaluate(self, model: "OculusPredictor") -> Dict:
        """Ask every question and score the answers.

        An answer is counted correct when the expected answer occurs as a
        substring of the lower-cased model answer.
        NOTE(review): substring matching also accepts "yes" inside words such
        as "eyes"; confirm this leniency is intended.

        Args:
            model: Object exposing ``ask(image_path, question)`` returning text.

        Returns:
            Dict with ``accuracy`` (correct / total), ``correct``, ``total``
            (samples answered without error) and ``failed`` (samples whose
            evaluation raised and were skipped).
        """
        print("\n๐ VQA v2 Style Benchmark")
        print("-" * 50)

        correct = 0
        total = 0
        failed = 0
        first_error = None

        for sample in tqdm(self.samples, desc="VQA"):
            try:
                answer = model.ask(sample['path'], sample['question'])
                hit = sample['answer'] in answer.lower()
            except Exception as exc:
                # Best effort: one bad sample must not abort the run, but it
                # is counted so that failures are visible in the report.
                failed += 1
                if first_error is None:
                    first_error = exc
                continue

            total += 1
            if hit:
                correct += 1

        accuracy = correct / total if total > 0 else 0
        print(f" Accuracy: {accuracy:.2%} ({correct}/{total})")
        if failed:
            print(f" Failed: {failed} sample(s) raised errors and were skipped (first: {first_error!r})")

        return {
            'accuracy': float(accuracy),
            'correct': correct,
            'total': total,
            'failed': failed,
        }
|
|
|
|
| |
| |
| |
|
|
class RefCOCOBenchmark:
    """Referring Expression Grounding using COCO boxes.

    The referring expression is simply ``"the <category>"`` for one annotated
    object per image; the ground-truth box is that object's COCO box scaled
    to the ``[0, 1]`` range by the image width and height.
    """

    def __init__(self, data_dir="data/coco", max_samples=100, seed: Optional[int] = None):
        """Build up to ``max_samples`` grounding samples.

        Args:
            data_dir: Directory holding ``annotations/instances_train2017.json``
                and an ``images/`` folder.
            max_samples: Upper bound on the number of samples; values <= 0
                produce an empty benchmark.
            seed: Optional seed making the per-image annotation choice
                reproducible. When ``None`` the global ``random`` module is
                used, so callers that seed it themselves keep working.
        """
        self.samples = []
        rng = random.Random(seed) if seed is not None else random

        ann_file = Path(data_dir) / "annotations" / "instances_train2017.json"
        if not ann_file.exists():
            return

        with open(ann_file, encoding="utf-8") as f:
            instances = json.load(f)

        cat_map = {c['id']: c['name'] for c in instances['categories']}
        img_map = {img['id']: img for img in instances['images']}

        # Group annotations by image; those without a box cannot be grounded.
        img_anns = defaultdict(list)
        for ann in instances['annotations']:
            if 'bbox' in ann:
                img_anns[ann['image_id']].append(ann)

        for img_id, anns in img_anns.items():
            # Checked before appending so that max_samples=0 yields no samples.
            if len(self.samples) >= max_samples:
                break

            img = img_map.get(img_id)
            if not img:
                continue

            img_path = Path(data_dir) / "images" / img['file_name']
            if not img_path.exists():
                continue

            ann = rng.choice(anns)
            cat = cat_map.get(ann['category_id'], 'object')

            # COCO stores [x, y, width, height] in pixels; convert to
            # normalised [x1, y1, x2, y2].
            x, y, w, h = ann['bbox']
            box = [
                x / img['width'],
                y / img['height'],
                (x + w) / img['width'],
                (y + h) / img['height'],
            ]

            self.samples.append({
                'path': str(img_path),
                'expression': f"the {cat}",
                'gt_box': box,
            })

        print(f" RefCOCO: Loaded {len(self.samples)} samples")

    def evaluate(self, model: "OculusPredictor") -> Dict:
        """Score the first predicted box of each sample against ground truth.

        A sample with no predicted box, or whose evaluation raises, scores an
        IoU of 0. Exactly one IoU entry is recorded per sample.

        Args:
            model: Object exposing ``detect(image_path, prompt=...)`` returning
                a mapping with a ``'boxes'`` sequence.

        Returns:
            Dict with ``mean_iou``, ``accuracy_50`` (share of samples with
            IoU >= 0.5), ``num_samples`` and ``failed`` (samples that raised).
        """
        print("\n๐ RefCOCO Grounding Benchmark")
        print("-" * 50)

        ious = []
        acc_50 = 0
        failed = 0

        for sample in tqdm(self.samples, desc="RefCOCO"):
            iou = 0.0
            try:
                results = model.detect(sample['path'], prompt=f"Find {sample['expression']}")
                if len(results['boxes']) > 0:
                    # Only the first returned box is scored.
                    iou = float(compute_iou(sample['gt_box'], results['boxes'][0]))
            except Exception:
                # Narrower than a bare except so Ctrl-C still stops the run;
                # the sample is scored as a miss.
                failed += 1
                iou = 0.0

            ious.append(iou)
            if iou >= 0.5:
                acc_50 += 1

        mean_iou = np.mean(ious) if ious else 0
        accuracy = acc_50 / len(self.samples) if self.samples else 0

        print(f" Mean IoU: {mean_iou:.4f}")
        print(f" Acc@0.5: {accuracy:.2%}")
        if failed:
            print(f" Failed: {failed} sample(s) raised errors and were scored as 0")

        return {
            'mean_iou': float(mean_iou),
            'accuracy_50': float(accuracy),
            'num_samples': len(self.samples),
            'failed': failed,
        }
|
|
|
|
| |
| |
| |
|
|
class CountBenchmark:
    """Object counting benchmark.

    Each sample asks how many objects of one category are in an image, where
    the ground-truth count (non-crowd COCO annotations) is between 2 and 10.
    """

    # Number words recognised in model answers.
    _WORD_TO_NUM = {'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4,
                    'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10}
    # Punctuation removed from both ends of a token before parsing it.
    _STRIP_CHARS = ".,;:!?()[]{}\"'"

    def __init__(self, data_dir="data/coco", max_samples=100):
        """Build up to ``max_samples`` counting samples.

        Args:
            data_dir: Directory holding ``annotations/instances_train2017.json``
                and an ``images/`` folder.
            max_samples: Upper bound on the number of samples; values <= 0
                produce an empty benchmark.
        """
        self.samples = []

        ann_file = Path(data_dir) / "annotations" / "instances_train2017.json"
        if not ann_file.exists():
            return

        with open(ann_file, encoding="utf-8") as f:
            instances = json.load(f)

        cat_map = {c['id']: c['name'] for c in instances['categories']}
        img_map = {img['id']: img for img in instances['images']}

        # image_id -> category name -> number of non-crowd instances
        img_counts = defaultdict(lambda: defaultdict(int))
        for ann in instances['annotations']:
            if not ann.get('iscrowd', 0):
                cat = cat_map.get(ann['category_id'], 'object')
                img_counts[ann['image_id']][cat] += 1

        for img_id, counts in img_counts.items():
            # Checked before appending so that max_samples=0 yields no samples.
            if len(self.samples) >= max_samples:
                break

            img = img_map.get(img_id)
            if not img:
                continue

            img_path = Path(data_dir) / "images" / img['file_name']
            if not img_path.exists():
                continue

            # At most one sample per image: the first category in range.
            for cat, count in counts.items():
                if 2 <= count <= 10:
                    self.samples.append({
                        'path': str(img_path),
                        'category': cat,
                        'count': count,
                    })
                    break

        print(f" CountBench: Loaded {len(self.samples)} samples")

    @staticmethod
    def _parse_count(answer):
        """Return the first number in ``answer`` (digits or a word), else None."""
        for token in answer.lower().split():
            # Strip punctuation so answers like "3." or "three!" still parse.
            token = token.strip(CountBenchmark._STRIP_CHARS)
            if not token:
                continue
            try:
                return int(token)
            except ValueError:
                if token in CountBenchmark._WORD_TO_NUM:
                    return CountBenchmark._WORD_TO_NUM[token]
        return None

    def evaluate(self, model: "OculusPredictor") -> Dict:
        """Ask the counting question for every sample and score the answers.

        Accuracies are computed over all samples, so answers that fail or
        cannot be parsed count as wrong. MAE is computed only over answers
        that yielded a number.

        Args:
            model: Object exposing ``ask(image_path, question)`` returning text.

        Returns:
            Dict with ``exact_accuracy``, ``within_one_accuracy``, ``mae``,
            ``total``, ``failed`` (samples that raised) and ``unparsed``
            (answers containing no recognisable number).
        """
        print("\n๐ CountBench Benchmark")
        print("-" * 50)

        exact = 0
        within_one = 0
        errors = []
        failed = 0
        unparsed = 0

        for sample in tqdm(self.samples, desc="Counting"):
            question = f"How many {sample['category']}s are in this image? Answer with a number."
            try:
                pred = self._parse_count(model.ask(sample['path'], question))
            except Exception:
                # Narrower than a bare except so Ctrl-C still stops the run.
                failed += 1
                continue

            if pred is None:
                unparsed += 1
                continue

            diff = abs(pred - sample['count'])
            if diff == 0:
                exact += 1
            if diff <= 1:
                within_one += 1
            errors.append(diff)

        total = len(self.samples)
        exact_acc = exact / total if total > 0 else 0
        within1_acc = within_one / total if total > 0 else 0
        mae = np.mean(errors) if errors else 0

        print(f" Exact Accuracy: {exact_acc:.2%}")
        print(f" Within-1 Accuracy: {within1_acc:.2%}")
        print(f" MAE: {mae:.2f}")
        if failed or unparsed:
            print(f" Failed: {failed} sample(s) raised errors; {unparsed} answer(s) had no number")

        return {
            'exact_accuracy': float(exact_acc),
            'within_one_accuracy': float(within1_acc),
            'mae': float(mae),
            'total': total,
            'failed': failed,
            'unparsed': unparsed,
        }
|
|
|
|
| |
| |
| |
|
|
class DetectionBenchmark:
    """Object Detection benchmark.

    Ground truth is every COCO box of an image, normalised to ``[0, 1]`` by
    the image size, with each label being the position of the category in
    the annotation file's ``categories`` list.
    """

    def __init__(self, data_dir="data/coco", max_samples=100):
        """Build up to ``max_samples`` detection samples (one per image).

        Args:
            data_dir: Directory holding ``annotations/instances_train2017.json``
                and an ``images/`` folder.
            max_samples: Upper bound on the number of samples; values <= 0
                produce an empty benchmark.
        """
        self.samples = []

        ann_file = Path(data_dir) / "annotations" / "instances_train2017.json"
        if not ann_file.exists():
            return

        with open(ann_file, encoding="utf-8") as f:
            instances = json.load(f)

        # Category id -> index of that category in the annotation file.
        cat_idx = {c['id']: i for i, c in enumerate(instances['categories'])}
        img_map = {img['id']: img for img in instances['images']}

        img_anns = defaultdict(list)
        for ann in instances['annotations']:
            img_anns[ann['image_id']].append(ann)

        for img_id, anns in img_anns.items():
            # Checked before appending so that max_samples=0 yields no samples.
            if len(self.samples) >= max_samples:
                break

            img = img_map.get(img_id)
            if not img:
                continue

            img_path = Path(data_dir) / "images" / img['file_name']
            if not img_path.exists():
                continue

            boxes = []
            labels = []
            for ann in anns:
                if 'bbox' not in ann:
                    continue
                # COCO stores [x, y, width, height] in pixels; convert to
                # normalised [x1, y1, x2, y2].
                x, y, w, h = ann['bbox']
                boxes.append([
                    x / img['width'],
                    y / img['height'],
                    (x + w) / img['width'],
                    (y + h) / img['height'],
                ])
                # Unknown category ids fall back to index 0.
                labels.append(cat_idx.get(ann['category_id'], 0))

            if boxes:
                self.samples.append({
                    'path': str(img_path),
                    'boxes': boxes,
                    'labels': labels,
                })

        print(f" Detection: Loaded {len(self.samples)} samples")

    def evaluate(self, model: "OculusPredictor") -> Dict:
        """Match every ground-truth box to its highest-IoU prediction.

        A ground-truth box counts as correct when its best-overlapping
        prediction has IoU >= 0.5 and the same integer label. The value
        reported as ``map_50`` is the share of such boxes; it is not a
        precision/recall-based mean average precision.

        Results of a sample are only included when the whole sample was
        evaluated without error.

        Args:
            model: Object exposing ``detect(image_path)`` returning a mapping
                with ``'boxes'`` and ``'labels'`` sequences.

        Returns:
            Dict with ``mean_iou``, ``map_50``, ``num_samples`` and ``failed``
            (samples that raised and were skipped).
        """
        print("\n๐ COCO Detection Benchmark")
        print("-" * 50)

        all_ious = []
        all_correct = []
        failed = 0

        for sample in tqdm(self.samples, desc="Detection"):
            sample_ious = []
            sample_correct = []
            try:
                results = model.detect(sample['path'])
                pred_boxes = results['boxes']
                pred_labels = [int(label) for label in results['labels']]

                for gt_box, gt_label in zip(sample['boxes'], sample['labels']):
                    best_iou = 0
                    correct = False

                    for pred_box, pred_label in zip(pred_boxes, pred_labels):
                        iou = float(compute_iou(gt_box, list(pred_box)))
                        if iou > best_iou:
                            best_iou = iou
                            correct = (iou >= 0.5) and (pred_label == gt_label)

                    sample_ious.append(best_iou)
                    sample_correct.append(correct)
            except Exception:
                # Narrower than a bare except so Ctrl-C still stops the run.
                # The sample is dropped as a whole so that a failure halfway
                # through leaves no partial results in the aggregate.
                failed += 1
                continue

            all_ious.extend(sample_ious)
            all_correct.extend(sample_correct)

        mean_iou = np.mean(all_ious) if all_ious else 0
        accuracy = np.mean(all_correct) if all_correct else 0

        print(f" Mean IoU: {mean_iou:.4f}")
        print(f" mAP@0.5: {accuracy:.4f}")
        if failed:
            print(f" Failed: {failed} sample(s) raised errors and were skipped")

        return {
            'mean_iou': float(mean_iou),
            'map_50': float(accuracy),
            'num_samples': len(self.samples),
            'failed': failed,
        }
|
|
|
|
| |
| |
| |
|
|
def run_all_benchmarks():
    """Run every benchmark against one Oculus model and report the scores.

    Loads the model once, builds and evaluates the VQA, RefCOCO, counting and
    detection benchmarks in that order, prints a summary table and writes all
    metrics to ``benchmark_results.json`` in ``OCULUS_ROOT``.

    Returns:
        Dict mapping benchmark key (``vqa_v2``, ``refcoco``, ``countbench``,
        ``coco_detection``) to that benchmark's metrics dict.
    """
    rule = "=" * 60
    print(rule)
    print("๐ฎ OCULUS VLM BENCHMARK SUITE")
    print(rule)

    print("\n[Loading Oculus Model]")
    model = OculusPredictor()

    print("\n[Running Benchmarks]")

    # (result key, benchmark class, sample limit); each benchmark is built
    # and evaluated before the next one is constructed.
    suite = (
        ('vqa_v2', VQABenchmark, 200),
        ('refcoco', RefCOCOBenchmark, 100),
        ('countbench', CountBenchmark, 100),
        ('coco_detection', DetectionBenchmark, 100),
    )
    results = {}
    for key, benchmark_cls, limit in suite:
        results[key] = benchmark_cls(max_samples=limit).evaluate(model)

    print("\n" + rule)
    print("๐ BENCHMARK RESULTS SUMMARY")
    print(rule)

    vqa_scores = results['vqa_v2']
    ref_scores = results['refcoco']
    count_scores = results['countbench']
    det_scores = results['coco_detection']

    print(f"""
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โ OCULUS BENCHMARKS โ
โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฃ
โ VQA v2 (Style) โ
โ Accuracy: {vqa_scores['accuracy']:.2%} โ
โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฃ
โ RefCOCO Grounding โ
โ Mean IoU: {ref_scores['mean_iou']:.4f} โ
โ Acc@0.5: {ref_scores['accuracy_50']:.2%} โ
โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฃ
โ CountBench โ
โ Exact Accuracy: {count_scores['exact_accuracy']:.2%} โ
โ Within-1 Acc: {count_scores['within_one_accuracy']:.2%} โ
โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฃ
โ COCO Detection โ
โ Mean IoU: {det_scores['mean_iou']:.4f} โ
โ mAP@0.5: {det_scores['map_50']:.4f} โ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
""")

    # Persist the raw metrics next to this script.
    output_path = OCULUS_ROOT / "benchmark_results.json"
    with open(output_path, "w") as out_file:
        json.dump(results, out_file, indent=2)
    print(f"๐พ Results saved to: {output_path}")

    return results
|
|
|
|
# Run the full benchmark suite only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    run_all_benchmarks()
|
|