#!/bin/bash
# Run HumanEval / MT-Bench / GSM8K benchmarks for DFLASH speculative decoding.
# Run AFTER start_server_dflash.sh is up.
#
# Usage:
#   bash run_bench_dflash.sh                  # all three benches, full dataset
#   bash run_bench_dflash.sh humaneval        # only humaneval
#   bash run_bench_dflash.sh mtbench gsm8k    # pick any subset
#
# Arguments:
#   Zero or more bench names out of: humaneval mtbench gsm8k.
#   With no arguments all three are run.
#
# Outputs:
#   Benchmark output is shown on the terminal and copied to
#   $RESULT_DIR/bench_dflash_b16_<timestamp>.log.
#
# Exit status:
#   Non-zero when the server is unreachable, a bench name is unknown, or the
#   benchmark command itself fails.

# -e: stop on the first failing command.
# -u: treat unset variables as errors.
# pipefail: without it the status of `python ... | tee` is tee's status, so a
#           crashed benchmark would still fall through to "Done." and exit 0.
set -euo pipefail

readonly INTRANET_IP=10.1.1.22
readonly PORT=30000
readonly BASE_MODEL=/workspace/models/Qwen3-8B
readonly DRAFT_MODEL=/workspace/models/Qwen3-8B-DFlash-b16
readonly BENCH_DIR=/workspace/hanrui/syxin_old/Specforge/benchmarks
readonly RESULT_DIR="$BENCH_DIR/results"

# ---- sanity check ----
echo "Checking server at http://$INTRANET_IP:$PORT ..."
# -s hides the progress meter, -f turns HTTP error responses into a failing
# exit status; the response body itself is not needed.
if ! curl -sf "http://$INTRANET_IP:$PORT/v1/models" > /dev/null; then
  echo "[ERROR] Server not reachable. Start it first: bash start_server_dflash.sh" >&2
  exit 1
fi
echo "Server OK."

mkdir -p "$RESULT_DIR"
cd "$BENCH_DIR"

# Prepend the Specforge checkout. The ${VAR:+...} form appends the previous
# value only when one exists, so an unset PYTHONPATH neither trips `set -u`
# nor leaves a trailing empty path entry.
export PYTHONPATH="/workspace/hanrui/syxin_old/Specforge${PYTHONPATH:+:$PYTHONPATH}"

# ---- decide which benches to run ----
TARGETS=("$@")
if [[ ${#TARGETS[@]} -eq 0 ]]; then
  TARGETS=(humaneval mtbench gsm8k)
fi

# One "<bench>:<count>" entry per requested bench. Kept as an array so every
# entry reaches the benchmark script as its own argument without relying on
# unquoted word splitting.
# NOTE(review): the counts are presumably the full dataset sizes (the usage
# text says "full dataset") - confirm against bench_eagle3.py.
BENCH_ARGS=()
for t in "${TARGETS[@]}"; do
  case "$t" in
    humaneval)
      BENCH_ARGS+=("humaneval:164")
      ;;
    mtbench)
      BENCH_ARGS+=("mtbench:80")
      ;;
    gsm8k)
      BENCH_ARGS+=("gsm8k:1319")
      ;;
    *)
      echo "[ERROR] Unknown bench: $t (choices: humaneval mtbench gsm8k)" >&2
      exit 1
      ;;
  esac
done

# A single timestamp is shared by the run name and the log file name.
TIMESTAMP=$(date +%Y%m%d_%H%M%S)

echo "Running: ${BENCH_ARGS[*]}"
echo "Results -> $RESULT_DIR"
echo ""

# stderr is merged into stdout so the log captures both streams.
# --skip-launch-server: the server checked above is reused.
/workspace/miniconda3/envs/dflash/bin/python3 bench_eagle3.py \
  --model-path "$BASE_MODEL" \
  --speculative-draft-model-path "$DRAFT_MODEL" \
  --host "$INTRANET_IP" \
  --port "$PORT" \
  --config-list "16,4,1,4" \
  --benchmark-list "${BENCH_ARGS[@]}" \
  --output-dir "$RESULT_DIR" \
  --name "dflash_b16_${TIMESTAMP}" \
  --skip-launch-server \
  2>&1 | tee "$RESULT_DIR/bench_dflash_b16_${TIMESTAMP}.log"

echo ""
echo "Done. Latest result files:"
# Best-effort listing of the five newest result files. Under pipefail a
# missing *.jsonl (ls fails) or head closing the pipe early must not turn a
# successful run into a failure, hence the deliberate `|| true`.
ls -lht -- "$RESULT_DIR"/*.jsonl 2>/dev/null | head -n 5 || true