#!/usr/bin/env bash
# Run all 6 benchmarks for MiniCPM-o 4.5 with 4-GPU data parallelism.
#
# For each bench, this script launches NUM_SHARDS Python workers concurrently
# (one per GPU); each worker processes 1/NUM_SHARDS of the samples. After all
# shards finish, merge_shards.py aggregates the per-shard JSONLs and computes
# metrics. Benches run sequentially: only ONE bench is active at a time.
#
# The two sync benches (1/6 and 2/6) use freetext answers plus a GPT judge
# (matching the Qwen3-Omni reference setup).
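#
# (Sharding convention, assumed here rather than verified against the eval
# scripts: a worker keeps the samples whose index satisfies
# idx % num_shards == shard, i.e. a round-robin split driven by the
# --shard/--num-shards flags passed below.)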
#
# Usage:
#   export OPENAI_API_KEY=sk-...
#   bash run_all.sh
#
# Override via env vars, e.g.:
#   CUDA_VISIBLE_DEVICES=4,5,6,7 LABEL=minicpmo_ckpt200 bash run_all.sh
#   NUM_SHARDS=2 CUDA_VISIBLE_DEVICES=6,7 bash run_all.sh
set -uo pipefail # no -e: one bench failure shouldn't block the rest
# ── Config ─────────────────────────────────────────────────────────────────────
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-4,5,6,7}"
MODEL="${MODEL:-openbmb/MiniCPM-o-4_5}"
LABEL="${LABEL:-minicpmo_4_5}"
SCRIPTS="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/scripts"
CONDA_ENV="${CONDA_ENV:-minicpmo}"
# Data parallel: how many shards (= number of GPUs to use)
IFS=',' read -ra GPU_ARR <<< "$CUDA_VISIBLE_DEVICES"
NUM_SHARDS="${NUM_SHARDS:-${#GPU_ARR[@]}}"
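
# Minimal sanity check (added guard): run_bench_dp indexes GPU_ARR by shard id,
# so NUM_SHARDS must not exceed the number of visible GPUs.
if (( NUM_SHARDS > ${#GPU_ARR[@]} )); then
  echo "!! NUM_SHARDS=$NUM_SHARDS exceeds ${#GPU_ARR[@]} visible GPU(s)" >&2
  exit 1
fi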
# Data paths (match Qwen3-Omni reference config)
DATA_ROOT="${DATA_ROOT:-/opt/dlami/nvme/video_source}"
SYNC_TEST_JSONL="${SYNC_TEST_JSONL:-/home/ubuntu/CleverHans-Evaluation/data/kto_training_data_v2_test.jsonl}"
VGG_TEST_JSONL="${VGG_TEST_JSONL:-/opt/dlami/nvme/vggsoundsync_test/test_3k.jsonl}"
WORLDSENSE_DIR="${WORLDSENSE_DIR:-/opt/dlami/nvme/worldsense}"
DAILY_OMNI_DIR="${DAILY_OMNI_DIR:-/opt/dlami/nvme/daily_omni}"
VIDEOMME_DIR="${VIDEOMME_DIR:-/opt/dlami/nvme/videomme/data/data}"
LVBENCH_DIR="${LVBENCH_DIR:-/opt/dlami/nvme/lvbench}"
EVAL_ROOT="${EVAL_ROOT:-/home/ubuntu/eval_results}"
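
# Benches 1/6 and 2/6 pass --gpt-judge and need an OpenAI key (see Usage).
# Warn early rather than exiting, since the other benches run without it.
if [[ -z "${OPENAI_API_KEY:-}" ]]; then
  echo "!! OPENAI_API_KEY is not set; GPT-judge benches (sync, vggsoundsync) will fail" >&2
fi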
# ── Conda ──────────────────────────────────────────────────────────────────────
if [[ -f "${HOME}/anaconda3/etc/profile.d/conda.sh" ]]; then
  source "${HOME}/anaconda3/etc/profile.d/conda.sh"
elif [[ -f "${HOME}/miniconda3/etc/profile.d/conda.sh" ]]; then
  source "${HOME}/miniconda3/etc/profile.d/conda.sh"
fi
conda activate "${CONDA_ENV}" || { echo "!! could not activate conda env '${CONDA_ENV}'" >&2; exit 1; }
echo "=== Model: $MODEL | Label: $LABEL"
echo "=== GPUs: ${GPU_ARR[*]} | Shards: $NUM_SHARDS"
# ── Helper: run one bench with data-parallel sharding ─────────────────────────
# $1 -- bench short name (matches merge_shards.py --bench)
# $2 -- eval script path
# $3 -- full label (e.g. sync_minicpmo_4_5)
# $4 -- output-dir root (e.g. $EVAL_ROOT/sync)
# $5+ -- extra args passed to each eval script
run_bench_dp() {
  local bench="$1"; shift
  local script="$1"; shift
  local full_label="$1"; shift
  local out_root="$1"; shift
  local label_dir="${out_root}/${full_label}"
  mkdir -p "${label_dir}/logs"

  echo ""
  echo "==== [$(date +%T)] Bench: $bench | Label: $full_label ===="

  local pids=()
  for (( i=0; i<NUM_SHARDS; i++ )); do
    local gpu="${GPU_ARR[$i]}"
    local log="${label_dir}/logs/shard${i}of${NUM_SHARDS}.log"
    echo "  → shard $i on GPU $gpu (log: $log)"
    CUDA_VISIBLE_DEVICES="$gpu" python "$script" \
      "$@" \
      --output-dir "$out_root" \
      --label "$full_label" \
      --shard "$i" --num-shards "$NUM_SHARDS" \
      > "$log" 2>&1 &
    pids+=($!)
  done

  # Wait for all shard workers
  local fail=0
  for pid in "${pids[@]}"; do
    wait "$pid" || fail=$((fail+1))
  done
  if (( fail > 0 )); then
    echo "  !! $fail shard(s) exited with error; check ${label_dir}/logs/"
  fi

  # Merge
  echo "  → merging shards ..."
  python "$SCRIPTS/merge_shards.py" \
    --bench "$bench" \
    --label-dir "$label_dir" || echo "  !! merge failed"
}
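
# To re-run a single failed shard by hand, mirror the flags run_bench_dp
# passes (hypothetical example: WorldSense shard 1 of 4 on GPU 5):
#   CUDA_VISIBLE_DEVICES=5 python "$SCRIPTS/eval_worldsense.py" \
#     --model-id "$MODEL" --data-dir "$WORLDSENSE_DIR" --max-samples -1 \
#     --output-dir "$EVAL_ROOT/worldsense" --label "ws_${LABEL}" \
#     --shard 1 --num-shards 4
# then rerun merge_shards.py with --bench worldsense and the same label dir.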
# ── 1/6 Sync (in-domain): freetext + GPT judge ──────────────────────────────
run_bench_dp dpo_sync "$SCRIPTS/eval_dpo_sync.py" \
  "sync_${LABEL}" "$EVAL_ROOT/sync" \
  --model-id "$MODEL" \
  --data-root "$DATA_ROOT" \
  --test-jsonl "$SYNC_TEST_JSONL" \
  --gpt-judge
# ── 2/6 VGGSoundSync: freetext + GPT judge ──────────────────────────────────
run_bench_dp vggsoundsync "$SCRIPTS/eval_vggsoundsync.py" \
  "vggsync_freetext_${LABEL}_3k" "$EVAL_ROOT/vggsoundsync" \
  --model-id "$MODEL" \
  --test-jsonl "$VGG_TEST_JSONL" \
  --mode freetext --gpt-judge
# ── 3/6 WorldSense ───────────────────────────────────────────────────────────
run_bench_dp worldsense "$SCRIPTS/eval_worldsense.py" \
  "ws_${LABEL}" "$EVAL_ROOT/worldsense" \
  --model-id "$MODEL" \
  --data-dir "$WORLDSENSE_DIR" \
  --max-samples -1
# ── 4/6 Daily-Omni ───────────────────────────────────────────────────────────
run_bench_dp daily_omni "$SCRIPTS/eval_daily_omni.py" \
  "do_${LABEL}" "$EVAL_ROOT/daily_omni" \
  --model-id "$MODEL" \
  --data-dir "$DAILY_OMNI_DIR" \
  --max-samples -1
# ── 5/6 Video-MME ────────────────────────────────────────────────────────────
run_bench_dp videomme "$SCRIPTS/eval_videomme.py" \
  "vmme_${LABEL}" "$EVAL_ROOT/videomme" \
  --model-id "$MODEL" \
  --video-dir "$VIDEOMME_DIR" \
  --max-samples -1
# ── 6/6 LVBench ──────────────────────────────────────────────────────────────
run_bench_dp lvbench "$SCRIPTS/eval_lvbench.py" \
  "lvb_${LABEL}" "$EVAL_ROOT/lvbench" \
  --model-id "$MODEL" \
  --video-dir "$LVBENCH_DIR" \
  --max-samples -1
echo ""
echo "=== All done: $LABEL ==="
for b_out in \
  "$EVAL_ROOT/sync/sync_${LABEL}" \
  "$EVAL_ROOT/vggsoundsync/vggsync_freetext_${LABEL}_3k" \
  "$EVAL_ROOT/worldsense/ws_${LABEL}" \
  "$EVAL_ROOT/daily_omni/do_${LABEL}" \
  "$EVAL_ROOT/videomme/vmme_${LABEL}" \
  "$EVAL_ROOT/lvbench/lvb_${LABEL}"; do
  # A bench that failed before the merge step leaves no metrics.json; flag it.
  if [[ -f "${b_out}/metrics.json" ]]; then
    echo "  ${b_out}/metrics.json"
  else
    echo "  !! missing: ${b_out}/metrics.json"
  fi
done
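
# Optional compact summary (a sketch: assumes jq is installed and that each
# metrics.json is a flat JSON object; key names differ per bench).
if command -v jq >/dev/null 2>&1; then
  for b_out in "$EVAL_ROOT"/*/*"${LABEL}"*; do
    [[ -f "${b_out}/metrics.json" ]] && \
      echo "${b_out##*/}: $(jq -c . "${b_out}/metrics.json")"
  done
fi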