Accept Length 测试指南
0. 准备工作
创建目录
# Work from the project root; create output directories (idempotent via -p).
cd /workspace/hanrui/SpecForge-ext
mkdir -p logs results
下载数据集(首次运行)
# One-time setup: download the MT-Bench / GSM8K / HumanEval test sets
# to /workspace/hanrui/datasets/ (see the locations listed below).
cd /workspace/hanrui/SpecForge-ext
python download_datasets.py
数据保存位置:
- MT-Bench: /workspace/hanrui/datasets/mtbench/question.jsonl
- GSM8K: /workspace/hanrui/datasets/gsm8k/test.jsonl
- HumanEval: /workspace/hanrui/datasets/humaneval/test.jsonl
1. 测试 Baseline 模型
启动服务器(终端1)
cd /workspace/hanrui/SpecForge-ext
# Set proxy-bypass env vars so requests to local/cluster addresses skip the HTTP proxy
export NO_PROXY="localhost,127.0.0.1,::1,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16"
export no_proxy="localhost,127.0.0.1,::1,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16"
# Launch the baseline server: Qwen3-8B target model with the stock EAGLE3 draft model.
# NOTE(review): num-steps 3 / topk 1 / draft-tokens 4 presumably must match the
# benchmark's --config-list 1,3,1,4 — confirm against bench_eagle3.py.
python3 -m sglang.launch_server \
--model /workspace/Qwen3-8B \
--speculative-algorithm EAGLE3 \
--speculative-draft-model-path /workspace/qwen3_8b_eagle3 \
--speculative-num-steps 3 \
--speculative-eagle-topk 1 \
--speculative-num-draft-tokens 4 \
--mem-fraction-static 0.75 \
--cuda-graph-max-bs 1 \
--tp 1 \
--trust-remote-code \
--host 0.0.0.0 \
--port 30000 \
--dtype bfloat16 \
--skip-server-warmup
等待看到 Application startup complete 后,继续下一步。
运行三个 Benchmark(终端2)
cd /workspace/hanrui/SpecForge-ext
# NOTE(review): path case ("Hanrui") differs from /workspace/hanrui used elsewhere — confirm
conda activate /workspace/Hanrui/
# Bypass the HTTP proxy for local/cluster addresses
export NO_PROXY="localhost,127.0.0.1,::1,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16"
export no_proxy="localhost,127.0.0.1,::1,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16"

#######################################
# Run one benchmark against the already-running baseline server.
# Arguments:
#   $1 - display name for the progress banner (e.g. "MT-Bench")
#   $2 - benchmark spec "dataset:num_questions" for --benchmark-list
# Outputs: benchmark log teed to logs/, results written to ./results
#######################################
run_baseline_bench() {
  local display=$1 spec=$2
  local ds=${spec%%:*}   # dataset name, e.g. "mtbench" from "mtbench:80"
  echo "=== Running ${display} (Baseline) ==="
  python benchmarks/bench_eagle3.py \
    --model-path /workspace/Qwen3-8B \
    --host 10.1.1.31 \
    --port 30000 \
    --config-list 1,3,1,4 \
    --benchmark-list "$spec" \
    --dtype bfloat16 \
    --skip-launch-server \
    --name "baseline_${ds}" \
    --output-dir ./results \
    2>&1 | tee "logs/baseline_${ds}_$(date +%Y%m%d_%H%M%S).log"
}

run_baseline_bench "MT-Bench"  mtbench:80
run_baseline_bench "GSM8K"     gsm8k:100
run_baseline_bench "HumanEval" humaneval:164
echo "=== Baseline 测试完成 ==="
2. 测试训练后的模型
停止 Baseline 服务器并启动训练后的服务器(终端1)
cd /workspace/hanrui/SpecForge-ext
# Stop the old (baseline) server and give it a moment to release the port/GPU
pkill -f "sglang.launch_server"
sleep 5
# Set proxy-bypass env vars so requests to local/cluster addresses skip the HTTP proxy
export NO_PROXY="localhost,127.0.0.1,::1,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16"
export no_proxy="localhost,127.0.0.1,::1,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16"
# Launch the server with the locally trained EAGLE3 draft checkpoint;
# all other flags are identical to the baseline launch for a fair comparison.
python3 -m sglang.launch_server \
--model /workspace/Qwen3-8B \
--speculative-algorithm EAGLE3 \
--speculative-draft-model-path /workspace/hanrui/SpecForge-ext/outputs/qwen3-8b-qwen3eagle-5layer/epoch_9_step_12310 \
--speculative-num-steps 3 \
--speculative-eagle-topk 1 \
--speculative-num-draft-tokens 4 \
--mem-fraction-static 0.75 \
--cuda-graph-max-bs 1 \
--tp 1 \
--trust-remote-code \
--host 0.0.0.0 \
--port 30000 \
--dtype bfloat16 \
--skip-server-warmup
等待看到 Application startup complete 后,继续下一步。
运行三个 Benchmark(终端2)
cd /workspace/hanrui/SpecForge-ext
# NOTE(review): path case ("Hanrui") differs from /workspace/hanrui used elsewhere — confirm
conda activate /workspace/Hanrui/
# Bypass the HTTP proxy for local/cluster addresses
export NO_PROXY="localhost,127.0.0.1,::1,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16"
export no_proxy="localhost,127.0.0.1,::1,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16"

#######################################
# Run one benchmark against the already-running trained-model server.
# Arguments:
#   $1 - display name for the progress banner (e.g. "MT-Bench")
#   $2 - benchmark spec "dataset:num_questions" for --benchmark-list
# Outputs: benchmark log teed to logs/, results written to ./results
#######################################
run_trained_bench() {
  local display=$1 spec=$2
  local ds=${spec%%:*}   # dataset name, e.g. "gsm8k" from "gsm8k:100"
  echo "=== Running ${display} (Trained) ==="
  python benchmarks/bench_eagle3.py \
    --model-path /workspace/Qwen3-8B \
    --host 10.1.1.31 \
    --port 30000 \
    --config-list 1,3,1,4 \
    --benchmark-list "$spec" \
    --dtype bfloat16 \
    --skip-launch-server \
    --name "trained_${ds}" \
    --output-dir ./results \
    2>&1 | tee "logs/trained_${ds}_$(date +%Y%m%d_%H%M%S).log"
}

run_trained_bench "MT-Bench"  mtbench:80
run_trained_bench "GSM8K"     gsm8k:100
run_trained_bench "HumanEval" humaneval:164
echo "=== Trained 测试完成 ==="
3. 查看结果
日志文件位置
所有日志保存在:/workspace/hanrui/SpecForge-ext/logs/
- baseline_mtbench_*.log
- baseline_gsm8k_*.log
- baseline_humaneval_*.log
- trained_mtbench_*.log
- trained_gsm8k_*.log
- trained_humaneval_*.log
所有结果保存在:/workspace/hanrui/SpecForge-ext/results/
- baseline_mtbench_*.jsonl
- baseline_gsm8k_*.jsonl
- baseline_humaneval_*.jsonl
- trained_mtbench_*.jsonl
- trained_gsm8k_*.jsonl
- trained_humaneval_*.jsonl
生成对比报告
cd /workspace/hanrui/SpecForge-ext
# Compare the newest baseline vs trained result file for each dataset.
python3 << 'EOF'
import glob
import json

SEP = "=" * 80


def latest_result(pattern):
    """Load and parse the most recent file matching pattern, or None if absent."""
    files = sorted(glob.glob(pattern))
    if not files:
        return None
    # NOTE(review): files carry a .jsonl suffix but are parsed as one JSON
    # object — confirm against bench_eagle3.py's output format.
    with open(files[-1], 'r') as f:
        return json.load(f)


def pct_change(delta, base):
    """Relative change in percent; 0.0 when base is 0 (avoids ZeroDivisionError)."""
    return (delta / base) * 100 if base else 0.0


def show_metrics(label, m):
    """Print accept length, throughput and (if present) accuracy for one run."""
    print(f"\n{label}:")
    print(f" Accept Length: {m['accept_length']:.4f}")
    print(f" Output Throughput: {m['output_throughput']:.2f} tokens/s")
    if m.get('accuracy') is not None:
        print(f" Accuracy: {m['accuracy']:.2%}")


print(SEP)
print("Accept Length 对比报告")
print(SEP)

for dataset in ('mtbench', 'gsm8k', 'humaneval'):
    print(f"\n{SEP}")
    print(f"{dataset.upper()} 结果对比")
    print(SEP)
    baseline = latest_result(f'results/baseline_{dataset}_*.jsonl')
    trained = latest_result(f'results/trained_{dataset}_*.jsonl')
    if baseline is None or trained is None:
        print(f" 未找到 {dataset} 的结果文件")
        continue
    bm = baseline[dataset][0]['metrics'][0]
    tm = trained[dataset][0]['metrics'][0]
    show_metrics("Baseline", bm)
    show_metrics("Trained", tm)
    accept_diff = tm['accept_length'] - bm['accept_length']
    throughput_diff = tm['output_throughput'] - bm['output_throughput']
    print(f"\n差异:")
    print(f" Accept Length: {accept_diff:+.4f} "
          f"({pct_change(accept_diff, bm['accept_length']):+.2f}%)")
    print(f" Throughput: {throughput_diff:+.2f} tokens/s "
          f"({pct_change(throughput_diff, bm['output_throughput']):+.2f}%)")
    # Guard both sides: the original crashed if only the trained run lacked accuracy.
    if bm.get('accuracy') is not None and tm.get('accuracy') is not None:
        acc_pct = (tm['accuracy'] - bm['accuracy']) * 100
        print(f" Accuracy: {acc_pct:+.2f} percentage points")

print("\n" + SEP)
EOF
4. 快速查看单个结果
cd /workspace/hanrui/SpecForge-ext
# Print just the accept_length from each result file (requires jq).
# jq reads the files directly — no need to pipe through cat.
# Baseline runs:
jq '.mtbench[0].metrics[0].accept_length' results/baseline_mtbench_*.jsonl
jq '.gsm8k[0].metrics[0].accept_length' results/baseline_gsm8k_*.jsonl
jq '.humaneval[0].metrics[0].accept_length' results/baseline_humaneval_*.jsonl
# Trained runs:
jq '.mtbench[0].metrics[0].accept_length' results/trained_mtbench_*.jsonl
jq '.gsm8k[0].metrics[0].accept_length' results/trained_gsm8k_*.jsonl
jq '.humaneval[0].metrics[0].accept_length' results/trained_humaneval_*.jsonl