Spaces:
Runtime error
Runtime error
Ashira Pitchayapakayakul
rename: drop '-lora-' segment from all model names + capitalize v1.5 size
b772ad8 | # Surrogate-1 v2 — EAGLE-3 speculative-decoding setup. | |
| # | |
| # EAGLE-3 (2026-Q1, Li et al.) — 3.5-5.6× wall-clock speedup vs vanilla | |
| # autoregressive decoding by training a small draft head that proposes | |
| # multiple tokens, verified in parallel by the target model. | |
| # | |
| # Architecture (Qwen2.5-Coder-7B target): | |
| # target: axentx/surrogate-1-coder-7b-v2-merged | |
| # draft:  Qwen/Qwen2.5-Coder-1.5B-Instruct (same tokenizer family) | |
| # method: eagle3 head trained on 50K self-generated traces | |
| # | |
| # Output: serve-vllm-eagle3.sh that wraps the existing serve-vllm.sh with | |
| # spec-decoding flags. Drop-in replacement. | |
| # | |
| # Reqs: vLLM ≥ 0.10 (has --speculative-config schema), torch ≥ 2.5. | |
# Strict-ish mode: unset variables are errors and a pipeline fails if any
# stage fails. NOTE(review): `-e` is not enabled, so a failing command does
# not abort the script unless it is checked explicitly.
set -uo pipefail

# Tunables. Each one may be overridden from the environment; the value after
# `:-` is the default used when the variable is unset or empty.
VLLM_BIN="${VLLM_BIN:-vllm}"                               # vllm executable (name or path)
TARGET="${TARGET:-axentx/surrogate-1-coder-7b-v2-merged}"  # model being served
DRAFT="${DRAFT:-Qwen/Qwen2.5-Coder-1.5B-Instruct}"         # speculative draft model
NUM_SPEC="${NUM_SPEC:-5}"                                  # tokens proposed per step
PORT="${PORT:-8000}"                                       # passed to --port
MAX_LEN="${MAX_LEN:-131072}"                               # passed to --max-model-len
GPU_MEM="${GPU_MEM:-0.85}"                                 # passed to --gpu-memory-utilization
LOG_DIR="$HOME/.surrogate/logs"

# NUM_SPEC is later spliced *unquoted* into a JSON literal, so anything other
# than a plain integer would produce a wrapper with invalid JSON. Fail fast.
case "$NUM_SPEC" in
  '' | *[!0-9]*)
    echo "NUM_SPEC must be a non-negative integer, got: '$NUM_SPEC'" >&2
    exit 2
    ;;
esac

mkdir -p "$LOG_DIR"
# Sanity: the vllm executable must be resolvable (on PATH or as a path);
# otherwise there is nothing to wrap, so abort with a non-zero status.
if ! command -v "$VLLM_BIN" >/dev/null 2>&1; then
  # The requirement is quoted so the hint can be pasted into a shell as-is;
  # unquoted, `>=0.10` would be parsed as an output redirection.
  echo "❌ vllm not found. pip install 'vllm>=0.10'" >&2
  exit 1
fi

# Informational only: log the first MAJOR.MINOR found in `--version` output.
# The version is not compared against a minimum here; if nothing matches,
# "unknown" is printed instead.
VLLM_VER=$("$VLLM_BIN" --version 2>/dev/null | grep -oE '[0-9]+\.[0-9]+' | head -1)
echo "[$(date +%H:%M:%S)] vllm version: ${VLLM_VER:-unknown}"
# Render the launch wrapper to ~/.surrogate/hf-space/bin/v2/serve-vllm-eagle3.sh
WRAPPER="$HOME/.surrogate/hf-space/bin/v2/serve-vllm-eagle3.sh"

# The wrapper's parent directory is not created anywhere else in this script,
# so create it here; without it the redirection below fails on a fresh machine.
if ! mkdir -p "${WRAPPER%/*}"; then
  echo "cannot create wrapper directory: ${WRAPPER%/*}" >&2
  exit 1
fi

# The here-doc delimiter is deliberately unquoted: $VLLM_BIN, $TARGET, $PORT,
# $MAX_LEN, $GPU_MEM, $DRAFT, $NUM_SPEC and $LOG_DIR are expanded NOW, so the
# generated script carries the resolved values. `\\` emits a single literal
# backslash (a line continuation in the generated file).
cat > "$WRAPPER" <<EOF || { echo "failed to write wrapper: $WRAPPER" >&2; exit 1; }
#!/usr/bin/env bash
# Auto-generated by eagle3-setup.sh — vLLM + EAGLE-3 spec decoding.
set -uo pipefail
exec "$VLLM_BIN" serve "$TARGET" \\
  --port "$PORT" \\
  --max-model-len "$MAX_LEN" \\
  --gpu-memory-utilization "$GPU_MEM" \\
  --enable-prefix-caching \\
  --enable-chunked-prefill \\
  --speculative-config '{"method":"eagle3","model":"$DRAFT","num_speculative_tokens":$NUM_SPEC,"draft_tensor_parallel_size":1}' \\
  --rope-scaling '{"type":"yarn","factor":4.0,"original_max_position_embeddings":32768}' \\
  --guided-decoding-backend xgrammar \\
  --enable-lora \\
  --max-loras 4 \\
  --max-lora-rank 64 \\
  2>&1 | tee -a "$LOG_DIR/serve-vllm-eagle3.log"
EOF

if ! chmod +x "$WRAPPER"; then
  echo "failed to mark wrapper executable: $WRAPPER" >&2
  exit 1
fi
# Capability probe (no GPU or model load involved): check whether this vllm
# build lists --speculative-config in `serve --help`. This only inspects the
# help text; it does not validate the JSON written into the wrapper.
# Best-effort: a miss prints a warning but does not abort.
echo "[$(date +%H:%M:%S)] dry-run spec-config parse"

# Capture the help text first instead of piping straight into `grep -q`:
# grep -q exits on the first match, the producer can then fail on a broken
# pipe, and under `pipefail` that would report a spurious "not supported".
vllm_help=$("$VLLM_BIN" serve --help 2>&1) || true
if ! grep -q "speculative-config" <<<"$vllm_help"; then
  echo "⚠️ vllm version may not support --speculative-config; upgrading to 0.10+ is recommended" >&2
fi

# Final report: where the wrapper lives and how to launch it.
echo "[$(date +%H:%M:%S)] eagle3 wrapper at: $WRAPPER"
echo "[$(date +%H:%M:%S)] launch with: bash $WRAPPER"
echo "[$(date +%H:%M:%S)] expected speedup: 3.5-5.6× over autoregressive baseline"