#!/usr/bin/env bash
# Surrogate-1 v2 — EAGLE-3 speculative-decoding setup.
#
# EAGLE-3 (2026-Q1, Li et al.) — 3.5-5.6× wall-clock speedup vs vanilla
# autoregressive decoding by training a small draft head that proposes
# multiple tokens, verified in parallel by the target model.
#
# Architecture (Qwen2.5-Coder-7B target):
#   target → axentx/surrogate-1-coder-7b-v2-merged
#   draft  → Qwen/Qwen2.5-Coder-1.5B-Instruct  (≈ same tokenizer family)
#   method → eagle3 head trained on 50K self-generated traces
#
# Output: serve-vllm-eagle3.sh that wraps the existing serve-vllm.sh with
# spec-decoding flags. Drop-in replacement.
#
# Reqs: vLLM ≥ 0.10 (has --speculative-config schema), torch ≥ 2.5.
#
# Optional environment overrides:
#   VLLM_BIN  vllm executable                      (default: vllm)
#   TARGET    target model                         (default: see above)
#   DRAFT     draft / speculative model            (default: see above)
#   NUM_SPEC  tokens proposed per step             (default: 5)
#   PORT      port passed to the server            (default: 8000)
#   MAX_LEN   value for --max-model-len            (default: 131072)
#   GPU_MEM   value for --gpu-memory-utilization   (default: 0.85)

# `-e` is intentionally not enabled: the version probe and the --help probe
# are best-effort. Steps that must succeed are checked explicitly via die().
set -uo pipefail

VLLM_BIN="${VLLM_BIN:-vllm}"
TARGET="${TARGET:-axentx/surrogate-1-coder-7b-v2-merged}"
DRAFT="${DRAFT:-Qwen/Qwen2.5-Coder-1.5B-Instruct}"
NUM_SPEC="${NUM_SPEC:-5}"   # tokens proposed per step
PORT="${PORT:-8000}"
MAX_LEN="${MAX_LEN:-131072}"
GPU_MEM="${GPU_MEM:-0.85}"
LOG_DIR="$HOME/.surrogate/logs"
WRAPPER="$HOME/.surrogate/hf-space/bin/v2/serve-vllm-eagle3.sh"

# Print a timestamped progress line to stdout.
log() {
  printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*"
}

# Print an error to stderr and abort the script with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# NUM_SPEC is spliced unquoted into the JSON spec config below, so it must be
# a plain positive integer or the rendered wrapper would carry invalid JSON.
[[ "$NUM_SPEC" =~ ^[1-9][0-9]*$ ]] \
  || die "NUM_SPEC must be a positive integer (got: '$NUM_SPEC')"

mkdir -p "$LOG_DIR" || die "cannot create log dir: $LOG_DIR"

# Sanity: verify vllm is present and report its version.
if ! command -v "$VLLM_BIN" >/dev/null 2>&1; then
  # The requirement is quoted so a copy-pasted command does not redirect
  # pip's output into a file named "=0.10".
  die "❌ vllm not found. pip install 'vllm>=0.10'"
fi
# Best-effort: an unparsable version is reported as "unknown".
VLLM_VER=$("$VLLM_BIN" --version 2>/dev/null | grep -oE '[0-9]+\.[0-9]+' | head -n 1)
log "vllm version: ${VLLM_VER:-unknown}"

# Render the wrapper to ~/.surrogate/hf-space/bin/v2/serve-vllm-eagle3.sh.
# The parent directory may not exist on a fresh machine, so create it first.
mkdir -p "$(dirname "$WRAPPER")" \
  || die "cannot create wrapper dir: $(dirname "$WRAPPER")"

# NOTE(review): the heredoc body was lost in the mangled source; only its
# tail (`2>&1 | tee -a "$LOG_DIR/serve-vllm-eagle3.log"`) survived. The
# command below is reconstructed from the header comment and the variables
# defined above — confirm flags and the speculative method against the
# intended deployment (the header mentions wrapping serve-vllm.sh).
# The delimiter is unquoted, so every $VAR is expanded now and baked into
# the wrapper; only "\$@" is deferred to the wrapper's own runtime.
cat > "$WRAPPER" <<EOF || die "failed to write wrapper: $WRAPPER"
#!/usr/bin/env bash
# Auto-generated by the Surrogate-1 v2 EAGLE-3 setup script.
# Re-run the setup script to regenerate; manual edits will be overwritten.
set -o pipefail
"$VLLM_BIN" serve "$TARGET" \\
  --speculative-config '{"method": "eagle3", "model": "$DRAFT", "num_speculative_tokens": $NUM_SPEC}' \\
  --port "$PORT" \\
  --max-model-len "$MAX_LEN" \\
  --gpu-memory-utilization "$GPU_MEM" \\
  "\$@" \\
  2>&1 | tee -a "$LOG_DIR/serve-vllm-eagle3.log"
EOF
chmod +x "$WRAPPER" || die "cannot mark wrapper executable: $WRAPPER"

# Kick a quick dry-run to verify spec config parses (does not need GPU).
# The help text is captured first instead of piped into `grep -q`: under
# pipefail, grep exiting on the first match can kill the producer with
# SIGPIPE and make the pipeline look failed even though the flag exists.
log "dry-run spec-config parse"
HELP_TEXT=$("$VLLM_BIN" serve --help 2>&1)
if [[ "$HELP_TEXT" != *speculative-config* ]]; then
  echo "⚠️ vllm version may not support --speculative-config; bumped to 0.10+ recommended" >&2
fi

log "eagle3 wrapper at: $WRAPPER"
log "launch with: bash $WRAPPER"
log "expected speedup: 3.5-5.6× over autoregressive baseline"