#!/bin/bash
# Step 2: Launch SGLang server with STANDALONE speculative decoding.
#
# Usage:
#   bash start_server.sh      # --tp-size defaults to 2
#   bash start_server.sh 8    # use tp=8
#
# Arguments:
#   $1 - value passed to --tp-size (optional, default 2; must be a
#        positive integer)
#
# Exit status:
#   1 if the tp argument is invalid or the merged draft model directory is
#   missing; otherwise the exit status of the sglang.launch_server process.

set -euo pipefail

TP=${1:-2}

readonly BASE_MODEL=/workspace/models/Qwen3-8B
readonly MERGED=/workspace/hanrui/syxin_old/Specforge/outputs/qwen3-8b-sft-32gpu-v2-merged
readonly INTRANET_IP=10.1.1.72
readonly PORT=30000
readonly PYTHON_BIN=/workspace/miniconda3/envs/sglang/bin/python3

# Reject a malformed tp value here instead of letting the server start up
# and fail later on an argument-parsing error.
if [[ ! "$TP" =~ ^[1-9][0-9]*$ ]]; then
  echo "[ERROR] tp must be a positive integer, got: $TP" >&2
  exit 1
fi

# The draft model is the merged checkpoint; without it there is nothing to
# pass to --speculative-draft-model-path, so stop with a hint on how to
# produce it.
if [[ ! -d "$MERGED" ]]; then
  echo "[ERROR] Merged model not found: $MERGED" >&2
  echo " Run: conda activate sglang && python3 merge_lora.py" >&2
  exit 1
fi

# Startup banner summarising the effective configuration.
printf '%s\n' \
  "============================================" \
  " SGLang STANDALONE Speculative Decoding" \
  " target : $BASE_MODEL" \
  " draft : $MERGED" \
  " host : $INTRANET_IP:$PORT" \
  " tp : $TP" \
  "============================================"

# Build the command as an array so every argument reaches the server as a
# single word regardless of its content.
server_cmd=(
  "$PYTHON_BIN" -m sglang.launch_server
  --model-path "$BASE_MODEL"
  --speculative-algorithm STANDALONE
  --speculative-draft-model-path "$MERGED"
  --speculative-num-steps 4
  --speculative-eagle-topk 1
  --speculative-num-draft-tokens 4
  --tp-size "$TP"
  --mem-fraction-static 0.30
  --trust-remote-code
  --host "$INTRANET_IP"
  --port "$PORT"
  --dtype bfloat16
)

# exec replaces this shell with the server process, so signals sent to the
# script's PID (e.g. SIGTERM from a supervisor) are delivered to the server
# itself rather than to an intermediate bash wrapper.
exec "${server_cmd[@]}"