# Accept Length 测试指南 ## 0. 准备工作 ### 创建目录 ```bash cd /workspace/hanrui/SpecForge-ext mkdir -p logs results ``` ### 下载数据集(首次运行) ```bash cd /workspace/hanrui/SpecForge-ext python download_datasets.py ``` 数据保存位置: - MT-Bench: `/workspace/hanrui/datasets/mtbench/question.jsonl` - GSM8K: `/workspace/hanrui/datasets/gsm8k/test.jsonl` - HumanEval: `/workspace/hanrui/datasets/humaneval/test.jsonl` --- ## 1. 测试 Baseline 模型 ### 启动服务器(终端1) ```bash cd /workspace/hanrui/SpecForge-ext # 设置环境变量 export NO_PROXY="localhost,127.0.0.1,::1,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16" export no_proxy="localhost,127.0.0.1,::1,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16" # 启动 baseline 服务器 python3 -m sglang.launch_server \ --model /workspace/Qwen3-8B \ --speculative-algorithm EAGLE3 \ --speculative-draft-model-path /workspace/qwen3_8b_eagle3 \ --speculative-num-steps 3 \ --speculative-eagle-topk 1 \ --speculative-num-draft-tokens 4 \ --mem-fraction-static 0.75 \ --cuda-graph-max-bs 1 \ --tp 1 \ --trust-remote-code \ --host 0.0.0.0 \ --port 30000 \ --dtype bfloat16 \ --skip-server-warmup ``` 等待看到 `Application startup complete` 后,继续下一步。 ### 运行三个 Benchmark(终端2) ```bash cd /workspace/hanrui/SpecForge-ext conda activate /workspace/Hanrui/ # 设置环境变量 export NO_PROXY="localhost,127.0.0.1,::1,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16" export no_proxy="localhost,127.0.0.1,::1,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16" # 1. MT-Bench echo "=== Running MT-Bench (Baseline) ===" python benchmarks/bench_eagle3.py \ --model-path /workspace/Qwen3-8B \ --host 10.1.1.31 \ --port 30000 \ --config-list 1,3,1,4 \ --benchmark-list mtbench:80 \ --dtype bfloat16 \ --skip-launch-server \ --name baseline_mtbench \ --output-dir ./results \ 2>&1 | tee logs/baseline_mtbench_$(date +%Y%m%d_%H%M%S).log # 2. 
GSM8K echo "=== Running GSM8K (Baseline) ===" python benchmarks/bench_eagle3.py \ --model-path /workspace/Qwen3-8B \ --host 10.1.1.31 \ --port 30000 \ --config-list 1,3,1,4 \ --benchmark-list gsm8k:100 \ --dtype bfloat16 \ --skip-launch-server \ --name baseline_gsm8k \ --output-dir ./results \ 2>&1 | tee logs/baseline_gsm8k_$(date +%Y%m%d_%H%M%S).log # 3. HumanEval echo "=== Running HumanEval (Baseline) ===" python benchmarks/bench_eagle3.py \ --model-path /workspace/Qwen3-8B \ --host 10.1.1.31 \ --port 30000 \ --config-list 1,3,1,4 \ --benchmark-list humaneval:164 \ --dtype bfloat16 \ --skip-launch-server \ --name baseline_humaneval \ --output-dir ./results \ 2>&1 | tee logs/baseline_humaneval_$(date +%Y%m%d_%H%M%S).log echo "=== Baseline 测试完成 ===" ``` --- ## 2. 测试训练后的模型 ### 停止 Baseline 服务器并启动训练后的服务器(终端1) ```bash cd /workspace/hanrui/SpecForge-ext # 停止旧服务器 pkill -f "sglang.launch_server" sleep 5 # 设置环境变量 export NO_PROXY="localhost,127.0.0.1,::1,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16" export no_proxy="localhost,127.0.0.1,::1,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16" # 启动训练后的服务器 python3 -m sglang.launch_server \ --model /workspace/Qwen3-8B \ --speculative-algorithm EAGLE3 \ --speculative-draft-model-path /workspace/hanrui/SpecForge-ext/outputs/qwen3-8b-qwen3eagle-5layer/epoch_9_step_12310 \ --speculative-num-steps 3 \ --speculative-eagle-topk 1 \ --speculative-num-draft-tokens 4 \ --mem-fraction-static 0.75 \ --cuda-graph-max-bs 1 \ --tp 1 \ --trust-remote-code \ --host 0.0.0.0 \ --port 30000 \ --dtype bfloat16 \ --skip-server-warmup ``` 等待看到 `Application startup complete` 后,继续下一步。 ### 运行三个 Benchmark(终端2) ```bash cd /workspace/hanrui/SpecForge-ext conda activate /workspace/Hanrui/ # 设置环境变量 export NO_PROXY="localhost,127.0.0.1,::1,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16" export no_proxy="localhost,127.0.0.1,::1,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16" # 1. 
MT-Bench echo "=== Running MT-Bench (Trained) ===" python benchmarks/bench_eagle3.py \ --model-path /workspace/Qwen3-8B \ --host 10.1.1.31 \ --port 30000 \ --config-list 1,3,1,4 \ --benchmark-list mtbench:80 \ --dtype bfloat16 \ --skip-launch-server \ --name trained_mtbench \ --output-dir ./results \ 2>&1 | tee logs/trained_mtbench_$(date +%Y%m%d_%H%M%S).log # 2. GSM8K echo "=== Running GSM8K (Trained) ===" python benchmarks/bench_eagle3.py \ --model-path /workspace/Qwen3-8B \ --host 10.1.1.31 \ --port 30000 \ --config-list 1,3,1,4 \ --benchmark-list gsm8k:100 \ --dtype bfloat16 \ --skip-launch-server \ --name trained_gsm8k \ --output-dir ./results \ 2>&1 | tee logs/trained_gsm8k_$(date +%Y%m%d_%H%M%S).log # 3. HumanEval echo "=== Running HumanEval (Trained) ===" python benchmarks/bench_eagle3.py \ --model-path /workspace/Qwen3-8B \ --host 10.1.1.31 \ --port 30000 \ --config-list 1,3,1,4 \ --benchmark-list humaneval:164 \ --dtype bfloat16 \ --skip-launch-server \ --name trained_humaneval \ --output-dir ./results \ 2>&1 | tee logs/trained_humaneval_$(date +%Y%m%d_%H%M%S).log echo "=== Trained 测试完成 ===" ``` --- ## 3. 
查看结果 ### 日志文件位置 所有日志保存在:`/workspace/hanrui/SpecForge-ext/logs/` - `baseline_mtbench_*.log` - `baseline_gsm8k_*.log` - `baseline_humaneval_*.log` - `trained_mtbench_*.log` - `trained_gsm8k_*.log` - `trained_humaneval_*.log` 所有结果保存在:`/workspace/hanrui/SpecForge-ext/results/` - `baseline_mtbench_*.jsonl` - `baseline_gsm8k_*.jsonl` - `baseline_humaneval_*.jsonl` - `trained_mtbench_*.jsonl` - `trained_gsm8k_*.jsonl` - `trained_humaneval_*.jsonl` ### 生成对比报告 ```bash cd /workspace/hanrui/SpecForge-ext python3 << 'EOF' import json import glob print("=" * 80) print("Accept Length 对比报告") print("=" * 80) datasets = ['mtbench', 'gsm8k', 'humaneval'] for dataset in datasets: print(f"\n{'=' * 80}") print(f"{dataset.upper()} 结果对比") print('=' * 80) baseline_files = sorted(glob.glob(f'results/baseline_{dataset}_*.jsonl')) trained_files = sorted(glob.glob(f'results/trained_{dataset}_*.jsonl')) if not baseline_files or not trained_files: print(f" 未找到 {dataset} 的结果文件") continue with open(baseline_files[-1], 'r') as f: baseline = json.load(f) with open(trained_files[-1], 'r') as f: trained = json.load(f) baseline_metrics = baseline[dataset][0]['metrics'][0] trained_metrics = trained[dataset][0]['metrics'][0] print(f"\nBaseline:") print(f" Accept Length: {baseline_metrics['accept_length']:.4f}") print(f" Output Throughput: {baseline_metrics['output_throughput']:.2f} tokens/s") if 'accuracy' in baseline_metrics and baseline_metrics['accuracy'] is not None: print(f" Accuracy: {baseline_metrics['accuracy']:.2%}") print(f"\nTrained:") print(f" Accept Length: {trained_metrics['accept_length']:.4f}") print(f" Output Throughput: {trained_metrics['output_throughput']:.2f} tokens/s") if 'accuracy' in trained_metrics and trained_metrics['accuracy'] is not None: print(f" Accuracy: {trained_metrics['accuracy']:.2%}") accept_diff = trained_metrics['accept_length'] - baseline_metrics['accept_length'] accept_pct = (accept_diff / baseline_metrics['accept_length']) * 100 throughput_diff = 
trained_metrics['output_throughput'] - baseline_metrics['output_throughput'] throughput_pct = (throughput_diff / baseline_metrics['output_throughput']) * 100 print(f"\n差异:") print(f" Accept Length: {accept_diff:+.4f} ({accept_pct:+.2f}%)") print(f" Throughput: {throughput_diff:+.2f} tokens/s ({throughput_pct:+.2f}%)") if baseline_metrics.get('accuracy') is not None and trained_metrics.get('accuracy') is not None: acc_diff = trained_metrics['accuracy'] - baseline_metrics['accuracy'] acc_pct = acc_diff * 100 print(f" Accuracy: {acc_pct:+.2f} percentage points") print("\n" + "=" * 80) EOF ``` --- ## 4. 快速查看单个结果 ```bash cd /workspace/hanrui/SpecForge-ext # 查看 baseline 的 accept_length jq '.mtbench[0].metrics[0].accept_length' results/baseline_mtbench_*.jsonl jq '.gsm8k[0].metrics[0].accept_length' results/baseline_gsm8k_*.jsonl jq '.humaneval[0].metrics[0].accept_length' results/baseline_humaneval_*.jsonl # 查看 trained 的 accept_length jq '.mtbench[0].metrics[0].accept_length' results/trained_mtbench_*.jsonl jq '.gsm8k[0].metrics[0].accept_length' results/trained_gsm8k_*.jsonl jq '.humaneval[0].metrics[0].accept_length' results/trained_humaneval_*.jsonl ```