surrogate-1 / bin /v2 /eagle3-setup.sh
Ashira Pitchayapakayakul
rename: drop '-lora-' segment from all model names + capitalize v1.5 size
b772ad8
#!/usr/bin/env bash
# Surrogate-1 v2 — EAGLE-3 speculative-decoding setup.
#
# EAGLE-3 (2025-Q1, Li et al.) — 3.5-5.6× wall-clock speedup vs vanilla
# autoregressive decoding by training a small draft head that proposes
# multiple tokens, verified in parallel by the target model.
#
# Architecture (Qwen2.5-Coder-7B target):
#   target → axentx/surrogate-1-coder-7b-v2-merged
#   draft  → Qwen/Qwen2.5-Coder-1.5B-Instruct (≈ same tokenizer family)
#   method → eagle3 head trained on 50K self-generated traces
#
# Output: serve-vllm-eagle3.sh, a generated launcher that runs `vllm serve`
# directly with spec-decoding flags; meant as a drop-in replacement for
# serve-vllm.sh.
#
# Reqs: vLLM ≥ 0.10 (has --speculative-config schema), torch ≥ 2.5.
# Unset variables are errors and a pipeline fails if any stage fails.
# `-e` is not enabled, so the mkdir below is checked explicitly.
set -uo pipefail

# Tunables: each one keeps a value already present in the environment and
# otherwise falls back to the default shown.
VLLM_BIN="${VLLM_BIN:-vllm}"                               # vllm executable (name or path)
TARGET="${TARGET:-axentx/surrogate-1-coder-7b-v2-merged}"  # model passed to `vllm serve`
DRAFT="${DRAFT:-Qwen/Qwen2.5-Coder-1.5B-Instruct}"         # "model" of the speculative config
NUM_SPEC="${NUM_SPEC:-5}"                                  # tokens proposed per step
PORT="${PORT:-8000}"                                       # value for --port
MAX_LEN="${MAX_LEN:-131072}"                               # value for --max-model-len
GPU_MEM="${GPU_MEM:-0.85}"                                 # value for --gpu-memory-utilization

# The generated wrapper appends its output to a file in this directory, so a
# failure to create it is fatal instead of being silently ignored.
LOG_DIR="$HOME/.surrogate/logs"
if ! mkdir -p -- "$LOG_DIR"; then
  echo "cannot create log directory: $LOG_DIR" >&2
  exit 1
fi
# Sanity: make sure the vllm executable can be found before going further.
# The version is only logged here; whether the CLI knows --speculative-config
# is probed separately further down via `serve --help`.
if ! command -v "$VLLM_BIN" >/dev/null 2>&1; then
  # The requirement is single-quoted in the hint so that pasting it into a
  # shell does not turn `>=0.10` into an output redirection.
  echo "❌ vllm not found. pip install 'vllm>=0.10'" >&2
  exit 1
fi

# First "<digits>.<digits>" token printed on stdout by `--version`. Stays empty
# (and is logged as "unknown") when the command fails or nothing matches.
VLLM_VER=$("$VLLM_BIN" --version 2>/dev/null | grep -oE '[0-9]+\.[0-9]+' | head -n 1)
echo "[$(date +%H:%M:%S)] vllm version: ${VLLM_VER:-unknown}"
# Render the wrapper to ~/.surrogate/hf-space/bin/v2/serve-vllm-eagle3.sh.
# The here-doc delimiter is unquoted, so every $VAR below is expanded NOW: the
# generated script contains literal values and does not read this script's
# environment when it is launched later.
# NOTE(review): values are pasted between quotes as-is; an override containing
# a quote, `$`, backtick or backslash would need escaping — confirm none do.
# NOTE(review): method "eagle3" presumably expects an EAGLE-3 draft-head
# checkpoint as "model"; confirm the DRAFT in use is one.
WRAPPER="$HOME/.surrogate/hf-space/bin/v2/serve-vllm-eagle3.sh"

# NUM_SPEC is embedded as a bare JSON number; anything but digits would yield
# a speculative config that is not valid JSON.
if ! [[ "$NUM_SPEC" =~ ^[0-9]+$ ]]; then
  echo "NUM_SPEC must be a non-negative integer, got: '$NUM_SPEC'" >&2
  exit 1
fi

# The destination directory may not exist yet (e.g. on a fresh machine), in
# which case the redirection below would fail.
if ! mkdir -p -- "$(dirname -- "$WRAPPER")"; then
  echo "cannot create directory for $WRAPPER" >&2
  exit 1
fi

# `\\` at end of line becomes a single `\` (line continuation) in the output.
cat > "$WRAPPER" <<EOF || { echo "cannot write $WRAPPER" >&2; exit 1; }
#!/usr/bin/env bash
# Auto-generated by eagle3-setup.sh — vLLM + EAGLE-3 spec decoding.
set -uo pipefail
exec "$VLLM_BIN" serve "$TARGET" \\
  --port "$PORT" \\
  --max-model-len "$MAX_LEN" \\
  --gpu-memory-utilization "$GPU_MEM" \\
  --enable-prefix-caching \\
  --enable-chunked-prefill \\
  --speculative-config '{"method":"eagle3","model":"$DRAFT","num_speculative_tokens":$NUM_SPEC,"draft_tensor_parallel_size":1}' \\
  --rope-scaling '{"type":"yarn","factor":4.0,"original_max_position_embeddings":32768}' \\
  --guided-decoding-backend xgrammar \\
  --enable-lora \\
  --max-loras 4 \\
  --max-lora-rank 64 \\
  2>&1 | tee -a "$LOG_DIR/serve-vllm-eagle3.log"
EOF
chmod +x "$WRAPPER" || { echo "cannot make $WRAPPER executable" >&2; exit 1; }
# Capability probe (needs no GPU): check that `vllm serve --help` mentions the
# --speculative-config flag. This does not validate the generated config.
echo "[$(date +%H:%M:%S)] dry-run spec-config parse"
# The help text is captured first rather than piped into `grep -q`: grep -q
# quits at the first match, and with `set -o pipefail` a writer cut off by the
# closed pipe makes the pipeline report failure, which printed a false warning.
# The exit status of the help command is deliberately ignored; only its text
# matters for this check.
serve_help=$("$VLLM_BIN" serve --help 2>&1) || true
if [[ "$serve_help" != *speculative-config* ]]; then
  echo "⚠️ vllm version may not support --speculative-config; bumped to 0.10+ recommended" >&2
fi

# Final summary for the operator.
echo "[$(date +%H:%M:%S)] eagle3 wrapper at: $WRAPPER"
echo "[$(date +%H:%M:%S)] launch with: bash $WRAPPER"
echo "[$(date +%H:%M:%S)] expected speedup: 3.5-5.6× over autoregressive baseline"