"""Evaluate MiniCPM-o 4.5 on Video-MME.

Reuses the data loader and metrics from CleverHans-Evaluation's Qwen3-Omni
eval_videomme.py and swaps out the inference with MiniCPM-o. Video-MME is
video-only (no audio), so we do NOT pass audio in.
"""
from __future__ import annotations
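
# Example invocation (script name and paths illustrative, not prescribed):
#   python eval_videomme_minicpmo.py --video-dir /data/videomme \
#       --output-dir /data/eval_results --max-frames 64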
|
|
import _common  # local shim exposing the shared CleverHans-Evaluation helpers
|
|
import argparse
import contextlib
import gc
import io
import json
import traceback
from pathlib import Path
|
|
| import torch |
| from tqdm import tqdm |
|
|
# Shared Video-MME helpers (data loader, answer parser, metrics, defaults)
# reused from the Qwen3-Omni eval script, as described in the module docstring.
ch = _common.ch("videomme")
load_videomme = ch.load_videomme
extract_answer = ch.extract_answer
compute_metrics = ch.compute_metrics
print_summary = ch.print_summary
DEFAULT_VIDEO_DIR = ch.DEFAULT_VIDEO_DIR
DEFAULT_OUTPUT_DIR = ch.DEFAULT_OUTPUT_DIR
|
|
| from minicpmo_inference import load_model, run_inference |
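
# load_model/run_inference wrap MiniCPM-o; run_inference takes a video path and
# a text prompt (audio optional) and returns the model's raw text output.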
|
|
|
|
| def parse_args() -> argparse.Namespace: |
| p = argparse.ArgumentParser(description="Evaluate MiniCPM-o on Video-MME.") |
| p.add_argument("--model-id", type=str, default="openbmb/MiniCPM-o-4_5") |
| p.add_argument("--video-dir", type=Path, default=DEFAULT_VIDEO_DIR) |
| p.add_argument("--output-dir", type=Path, |
| default=Path("/home/ubuntu/eval_results/videomme_minicpmo")) |
| p.add_argument("--max-samples", type=int, default=-1) |
| p.add_argument("--max-new-tokens", type=int, default=32) |
| p.add_argument("--temperature", type=float, default=0.0) |
| p.add_argument("--label", type=str, default="minicpmo_videomme") |
| p.add_argument("--max-frames", type=int, default=64, |
| help="Max frames sampled from each video (MiniCPM-o uses " |
| "PIL images).") |
| p.add_argument("--fps", type=float, default=1.0) |
| p.add_argument("--attn", type=str, default="flash_attention_2", |
| choices=["sdpa", "flash_attention_2", "eager"]) |
| |
| p.add_argument("--vllm", action="store_true", default=False, |
| help="(no-op for MiniCPM-o 4.5; auto-falls back to transformers).") |
| p.add_argument("--tp", type=int, default=None) |
| p.add_argument("--gpu-memory-utilization", type=float, default=0.90) |
| p.add_argument("--max-model-len", type=int, default=65536) |
| p.add_argument("--batch-size", type=int, default=32) |
| |
| p.add_argument("--shard", type=int, default=0) |
| p.add_argument("--num-shards", type=int, default=1) |
| return p.parse_args() |
|
|
|
|
| def main() -> None: |
| args = parse_args() |
| out_dir = args.output_dir / args.label |
| out_dir.mkdir(parents=True, exist_ok=True) |
| shard_suffix = (f".shard{args.shard}of{args.num_shards}" |
| if args.num_shards > 1 else "") |
| results_jsonl = out_dir / f"eval_results{shard_suffix}.jsonl" |
| metrics_json = out_dir / "metrics.json" |
| summary_txt = out_dir / "summary.txt" |
|
|
| if args.vllm: |
| print("[warn] --vllm requested but MiniCPM-o 4.5 multimodal vLLM is not " |
| "supported upstream yet; falling back to transformers.") |
| print("[data] Loading Video-MME dataset...") |
| test_data = load_videomme(args.video_dir, args.max_samples) |
    if args.num_shards > 1:
        # Round-robin split: question i is handled by shard i % num_shards.
        test_data = [x for i, x in enumerate(test_data)
                     if i % args.num_shards == args.shard]
        print(f"[shard] shard {args.shard}/{args.num_shards}: "
              f"{len(test_data)} questions")
| else: |
| print(f"[data] {len(test_data)} questions ready") |
|
|
    # Resume support: skip question_ids already written to the results JSONL.
    processed: set[str] = set()
    if results_jsonl.exists():
        with open(results_jsonl) as f:
            for line in f:
                obj = json.loads(line)
                processed.add(obj["question_id"])
        print(f"[resume] {len(processed)} already processed, skipping")
|
|
    # Video-MME is video-only, so skip initializing the audio components.
    model, tokenizer = load_model(args.model_id, attn_implementation=args.attn,
                                  init_audio=False)
|
|
| for item in tqdm(test_data, desc="Video-MME", unit="q"): |
| if item["question_id"] in processed: |
| continue |
| try: |
| raw_output = run_inference( |
| model, tokenizer, |
| video_path=item["video_path"], |
| audio_path=None, |
| prompt=item["prompt"], |
| max_new_tokens=args.max_new_tokens, |
| temperature=args.temperature, |
| max_frames=args.max_frames, |
| fps=args.fps, |
| ) |
        except Exception as exc:
            print(f" [error] {item['question_id']}: {exc}")
            traceback.print_exc()
            raw_output = ""
|
|
| pred = extract_answer(raw_output) |
| result = { |
| "question_id": item["question_id"], |
| "video_id": item["video_id"], |
| "duration": item["duration"], |
| "domain": item["domain"], |
| "sub_category": item["sub_category"], |
| "task_type": item["task_type"], |
| "question": item["question"], |
| "options": item["options"], |
| "gt_answer": item["gt_answer"], |
| "pred_answer": pred, |
| "raw_output": raw_output, |
| } |
| with open(results_jsonl, "a", encoding="utf-8") as f: |
| f.write(json.dumps(result, ensure_ascii=False) + "\n") |
|
|
        processed.add(item["question_id"])
        # Drop Python refs and return cached CUDA memory between videos.
        gc.collect()
        torch.cuda.empty_cache()
|
|
| if args.num_shards > 1: |
| print(f"\n[shard {args.shard}/{args.num_shards}] Done. Results: {results_jsonl}") |
| print(f"[shard] Run merge_shards.py --bench videomme --label-dir {out_dir}") |
| return |
|
|
    # Re-read the full JSONL so metrics also cover rows from resumed runs.
    all_results = []
    if results_jsonl.exists():
        with open(results_jsonl) as f:
            for line in f:
                all_results.append(json.loads(line))
|
|
| metrics = compute_metrics(all_results) |
| metrics["eval_config"] = { |
| "model_id": args.model_id, |
| "video_dir": str(args.video_dir), |
| "max_new_tokens": args.max_new_tokens, |
| "temperature": args.temperature, |
| "max_frames": args.max_frames, |
| "fps": args.fps, |
| "attn": args.attn, |
| } |
|
|
| with open(metrics_json, "w", encoding="utf-8") as f: |
| json.dump(metrics, f, indent=2, ensure_ascii=False) |
|
|
    # Render the summary once, then both echo it to stdout and persist it.
    buf = io.StringIO()
    with contextlib.redirect_stdout(buf):
        print_summary(metrics, args.label)
    summary = buf.getvalue()
    print(summary, end="")
    with open(summary_txt, "w", encoding="utf-8") as f:
        f.write(summary)
|
|
| print(f"\n[output] Results: {results_jsonl}") |
| print(f"[output] Metrics: {metrics_json}") |
| print(f"[output] Summary: {summary_txt}") |
|
|
|
|
| if __name__ == "__main__": |
| main() |
|
|