#!/usr/bin/env bash
# Run all 6 benchmarks for MiniCPM-o 4.5 with 4-GPU data parallelism.
#
# For each bench, launches NUM_SHARDS python workers simultaneously (one per
# GPU), each processes 1/NUM_SHARDS of the samples. After all shards finish,
# merge_shards.py aggregates the per-shard jsonls and computes metrics.
# Only ONE bench runs at a time; benches run sequentially.
#
# Two sync benches use freetext + GPT judge (matches Qwen3-Omni reference).
#
# Usage:
#   export OPENAI_API_KEY=sk-...
#   bash run_all.sh
#
# Override via env vars, e.g.:
#   CUDA_VISIBLE_DEVICES=4,5,6,7 LABEL=minicpmo_ckpt200 bash run_all.sh
#   NUM_SHARDS=2 CUDA_VISIBLE_DEVICES=6,7 bash run_all.sh

set -uo pipefail  # no -e: one bench failure shouldn't block the rest

# ── Config ─────────────────────────────────────────────────────────────────────
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-4,5,6,7}"
MODEL="${MODEL:-openbmb/MiniCPM-o-4_5}"
LABEL="${LABEL:-minicpmo_4_5}"
SCRIPTS="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/scripts"
CONDA_ENV="${CONDA_ENV:-minicpmo}"

# Data parallel: how many shards (= number of GPUs to use)
IFS=',' read -ra GPU_ARR <<< "$CUDA_VISIBLE_DEVICES"
NUM_SHARDS="${NUM_SHARDS:-${#GPU_ARR[@]}}"

# Data paths (match Qwen3-Omni reference config)
DATA_ROOT="${DATA_ROOT:-/opt/dlami/nvme/video_source}"
SYNC_TEST_JSONL="${SYNC_TEST_JSONL:-/home/ubuntu/CleverHans-Evaluation/data/kto_training_data_v2_test.jsonl}"
VGG_TEST_JSONL="${VGG_TEST_JSONL:-/opt/dlami/nvme/vggsoundsync_test/test_3k.jsonl}"
WORLDSENSE_DIR="${WORLDSENSE_DIR:-/opt/dlami/nvme/worldsense}"
DAILY_OMNI_DIR="${DAILY_OMNI_DIR:-/opt/dlami/nvme/daily_omni}"
VIDEOMME_DIR="${VIDEOMME_DIR:-/opt/dlami/nvme/videomme/data/data}"
LVBENCH_DIR="${LVBENCH_DIR:-/opt/dlami/nvme/lvbench}"
EVAL_ROOT="${EVAL_ROOT:-/home/ubuntu/eval_results}"

# ── Conda ──────────────────────────────────────────────────────────────────────
if [[ -f "${HOME}/anaconda3/etc/profile.d/conda.sh" ]]; then
  source "${HOME}/anaconda3/etc/profile.d/conda.sh"
fi
conda activate "${CONDA_ENV}"

echo "=== Model: $MODEL | Label: $LABEL"
echo "=== GPUs: ${GPU_ARR[*]} | Shards: $NUM_SHARDS"

# ── Helper: run one bench with data-parallel sharding ─────────────────────────
#######################################
# Run one benchmark across NUM_SHARDS GPU workers, then merge results.
# Globals:   SCRIPTS, GPU_ARR, NUM_SHARDS (read)
# Arguments:
#   $1  - bench short name (matches merge_shards.py --bench)
#   $2  - eval script path
#   $3  - full label (e.g. sync_minicpmo_4_5)
#   $4  - output-dir root (e.g. $EVAL_ROOT/sync)
#   $5+ - extra args passed to each eval script
# Outputs:   per-shard logs under <out_root>/<full_label>/logs/
#######################################
run_bench_dp() {
  local bench="$1"; shift
  local script="$1"; shift
  local full_label="$1"; shift
  local out_root="$1"; shift
  local label_dir="${out_root}/${full_label}"
  mkdir -p "${label_dir}/logs"

  echo ""
  echo "==== [$(date +%T)] Bench: $bench | Label: $full_label ===="

  local pids=()
  local i
  for (( i = 0; i < NUM_SHARDS; i++ )); do
    # Pin each shard worker to a single GPU from the visible set; logs go to
    # one file per shard so failures are diagnosable independently.
    local log="${label_dir}/logs/shard_${i}.log"
    # NOTE(review): the original worker invocation was truncated in this file;
    # the per-shard flags below were reconstructed — confirm against the eval
    # scripts' actual CLI before relying on them.
    CUDA_VISIBLE_DEVICES="${GPU_ARR[$i]}" \
      python "$script" \
        --label "$full_label" \
        --output-dir "$out_root" \
        --num-shards "$NUM_SHARDS" \
        --shard-id "$i" \
        "$@" > "$log" 2>&1 &
    pids+=($!)
  done

  # Wait for all shard workers; count failures instead of aborting so the
  # merge step can still salvage whatever shards succeeded.
  local fail=0
  local pid
  for pid in "${pids[@]}"; do
    wait "$pid" || fail=$((fail+1))
  done
  if (( fail > 0 )); then
    echo " !! $fail shard(s) exited with error; check ${label_dir}/logs/"
  fi

  # Merge per-shard jsonls and compute metrics.
  echo " → merging shards ..."
  python "$SCRIPTS/merge_shards.py" \
    --bench "$bench" \
    --label-dir "$label_dir" || echo " !! merge failed"
}

# ── 1/6 Sync (in-domain) — freetext + GPT judge ────────────────────────────
run_bench_dp dpo_sync "$SCRIPTS/eval_dpo_sync.py" \
  "sync_${LABEL}" "$EVAL_ROOT/sync" \
  --model-id "$MODEL" \
  --data-root "$DATA_ROOT" \
  --test-jsonl "$SYNC_TEST_JSONL" \
  --gpt-judge

# ── 2/6 VGGSoundSync — freetext + GPT judge ────────────────────────────────
run_bench_dp vggsoundsync "$SCRIPTS/eval_vggsoundsync.py" \
  "vggsync_freetext_${LABEL}_3k" "$EVAL_ROOT/vggsoundsync" \
  --model-id "$MODEL" \
  --test-jsonl "$VGG_TEST_JSONL" \
  --mode freetext --gpt-judge

# ── 3/6 WorldSense ───────────────────────────────────────────────────────────
run_bench_dp worldsense "$SCRIPTS/eval_worldsense.py" \
  "ws_${LABEL}" "$EVAL_ROOT/worldsense" \
  --model-id "$MODEL" \
  --data-dir "$WORLDSENSE_DIR" \
  --max-samples -1

# ── 4/6 Daily-Omni ───────────────────────────────────────────────────────────
run_bench_dp daily_omni "$SCRIPTS/eval_daily_omni.py" \
  "do_${LABEL}" "$EVAL_ROOT/daily_omni" \
  --model-id "$MODEL" \
  --data-dir "$DAILY_OMNI_DIR" \
  --max-samples -1

# ── 5/6 Video-MME ────────────────────────────────────────────────────────────
run_bench_dp videomme "$SCRIPTS/eval_videomme.py" \
  "vmme_${LABEL}" "$EVAL_ROOT/videomme" \
  --model-id "$MODEL" \
  --video-dir "$VIDEOMME_DIR" \
  --max-samples -1

# ── 6/6 LVBench ──────────────────────────────────────────────────────────────
run_bench_dp lvbench "$SCRIPTS/eval_lvbench.py" \
  "lvb_${LABEL}" "$EVAL_ROOT/lvbench" \
  --model-id "$MODEL" \
  --video-dir "$LVBENCH_DIR" \
  --max-samples -1

echo ""
echo "=== All done: $LABEL ==="
for b_out in \
  "$EVAL_ROOT/sync/sync_${LABEL}" \
  "$EVAL_ROOT/vggsoundsync/vggsync_freetext_${LABEL}_3k" \
  "$EVAL_ROOT/worldsense/ws_${LABEL}" \
  "$EVAL_ROOT/daily_omni/do_${LABEL}" \
  "$EVAL_ROOT/videomme/vmme_${LABEL}" \
  "$EVAL_ROOT/lvbench/lvb_${LABEL}"; do
  echo " ${b_out}/metrics.json"
done