Spaces:

axentx
/

surrogate-1

Runtime error

Ashira Pitchayapakayakul

rename: drop '-lora-' segment from all model names + capitalize v1.5 size

b772ad8 26 days ago

2.77 kB

	#!/usr/bin/env bash
	# Surrogate-1 v2 — EAGLE-3 speculative-decoding setup.
	#
	# EAGLE-3 (2025, Li et al.) — 3.5-5.6× wall-clock speedup vs vanilla
	# autoregressive decoding by training a small draft head that proposes
	# multiple tokens, verified in parallel by the target model.
	#
	# Architecture (Qwen2.5-Coder-7B target):
	# target → axentx/surrogate-1-coder-7b-v2-merged
	# draft → Qwen/Qwen2.5-Coder-1.5B-Instruct (≈ same tokenizer family)
	# method → eagle3 head trained on 50K self-generated traces
	#
	# Output: serve-vllm-eagle3.sh, a standalone launcher that execs `vllm serve`
	# with spec-decoding flags. Intended as a drop-in replacement for serve-vllm.sh.
	#
	# Reqs: vLLM ≥ 0.10 (has --speculative-config schema), torch ≥ 2.5.
	# Shell options: referencing an unset variable is an error, and a pipeline
	# reports failure when any of its stages fails. errexit is NOT enabled.
	set -u
	set -o pipefail

	# Tunables. Each one keeps whatever value is already present in the
	# environment and only falls back to the default when unset or empty.
	: "${VLLM_BIN:=vllm}"
	: "${TARGET:=axentx/surrogate-1-coder-7b-v2-merged}"
	: "${DRAFT:=Qwen/Qwen2.5-Coder-1.5B-Instruct}"
	: "${NUM_SPEC:=5}" # tokens proposed per step
	: "${PORT:=8000}"
	: "${MAX_LEN:=131072}"
	: "${GPU_MEM:=0.85}"

	# Log directory under the user's home; created up front if it is missing.
	LOG_DIR="${HOME}/.surrogate/logs"
	mkdir -p "${LOG_DIR}"

	# Sanity: make sure the vllm CLI named by $VLLM_BIN is resolvable, then log
	# its version. The version is only reported in this step; it is not compared
	# against a minimum here.
	if ! command -v "$VLLM_BIN" >/dev/null 2>&1; then
	  # The requirement is quoted so the hint can be pasted into a shell as-is
	  # (an unquoted `>=0.10` would be parsed as a redirection).
	  echo "❌ vllm not found. pip install 'vllm>=0.10'" >&2
	  exit 1
	fi
	# Keep the first MAJOR.MINOR-looking token of `--version` output. stderr is
	# discarded; if nothing matches, VLLM_VER stays empty and "unknown" is shown.
	VLLM_VER=$("$VLLM_BIN" --version 2>/dev/null | grep -oE '[0-9]+\.[0-9]+' | head -n 1)
	echo "[$(date +%H:%M:%S)] vllm version: ${VLLM_VER:-unknown}"

	# Render the wrapper to ~/.surrogate/hf-space/bin/v2/serve-vllm-eagle3.sh.
	#
	# The heredoc delimiter is unquoted on purpose: $VLLM_BIN, $TARGET, $PORT,
	# $MAX_LEN, $GPU_MEM, $DRAFT, $NUM_SPEC and $LOG_DIR are expanded NOW, so the
	# generated wrapper has their current values baked in. A doubled backslash in
	# the heredoc renders as a single line-continuation backslash in the wrapper.
	# `<<-` strips leading tabs, so the wrapper's shebang lands in column 0 and
	# the terminator is recognised even though this script is tab-indented.
	WRAPPER="$HOME/.surrogate/hf-space/bin/v2/serve-vllm-eagle3.sh"
	# The target directory may not exist yet; without it the redirection below
	# fails and no wrapper is written.
	mkdir -p "${WRAPPER%/*}" || {
	  echo "❌ cannot create directory ${WRAPPER%/*}" >&2
	  exit 1
	}
	cat > "$WRAPPER" <<-EOF || { echo "❌ failed to write $WRAPPER" >&2; exit 1; }
	#!/usr/bin/env bash
	# Auto-generated by eagle3-setup.sh — vLLM + EAGLE-3 spec decoding.
	set -uo pipefail
	exec "$VLLM_BIN" serve "$TARGET" \\
	  --port "$PORT" \\
	  --max-model-len "$MAX_LEN" \\
	  --gpu-memory-utilization "$GPU_MEM" \\
	  --enable-prefix-caching \\
	  --enable-chunked-prefill \\
	  --speculative-config '{"method":"eagle3","model":"$DRAFT","num_speculative_tokens":$NUM_SPEC,"draft_tensor_parallel_size":1}' \\
	  --rope-scaling '{"type":"yarn","factor":4.0,"original_max_position_embeddings":32768}' \\
	  --guided-decoding-backend xgrammar \\
	  --enable-lora \\
	  --max-loras 4 \\
	  --max-lora-rank 64 \\
	  2>&1 | tee -a "$LOG_DIR/serve-vllm-eagle3.log"
	EOF
	chmod +x "$WRAPPER" || {
	  echo "❌ cannot mark $WRAPPER executable" >&2
	  exit 1
	}

	# Feature probe (does not need a GPU): check whether this vllm build lists
	# the --speculative-config flag in `serve --help`. This only detects the
	# flag; it does not validate the JSON rendered into the wrapper.
	echo "[$(date +%H:%M:%S)] dry-run spec-config parse"
	# Capture the help text first rather than piping straight into `grep -q`:
	# grep -q exits at the first match, which can SIGPIPE the producer and, with
	# `set -o pipefail` active, make a successful match look like a failure.
	serve_help="$("$VLLM_BIN" serve --help 2>&1)"
	if [[ "$serve_help" != *speculative-config* ]]; then
	  # Non-fatal: warn and still print the wrapper location below.
	  echo "⚠️ vllm version may not support --speculative-config; bumped to 0.10+ recommended" >&2
	fi

	echo "[$(date +%H:%M:%S)] eagle3 wrapper at: $WRAPPER"
	echo "[$(date +%H:%M:%S)] launch with: bash $WRAPPER"
	echo "[$(date +%H:%M:%S)] expected speedup: 3.5-5.6× over autoregressive baseline"