#!/bin/bash
# Step 2: Launch SGLang server with STANDALONE speculative decoding.
# Usage:
#   bash start_server.sh       # tensor-parallel size defaults to 2
#   bash start_server.sh 8     # optional first argument overrides it (tp=8)

# Strict mode: abort on command failure, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -euo pipefail

# Returns 0 when $1 is a positive decimal integer (no sign, no leading zero),
# non-zero otherwise. Used to reject a bad tensor-parallel size up front.
is_positive_int() {
    [[ $1 =~ ^[1-9][0-9]*$ ]]
}

# Tensor-parallel size: first positional argument, defaulting to 2 when the
# argument is missing or empty.
TP=${1:-2}
if ! is_positive_int "$TP"; then
    # Fail before anything is launched instead of letting the server reject it.
    echo "[ERROR] tp must be a positive integer, got: $TP" >&2
    echo "        Usage: bash start_server.sh [TP]" >&2
    exit 1
fi

# Target model, passed to the server as --model-path.
BASE_MODEL=/workspace/models/Qwen3-8B
# Draft model (merged checkpoint), passed as --speculative-draft-model-path.
MERGED=/workspace/hanrui/syxin_old/Specforge/outputs/qwen3-8b-sft-32gpu-v2-merged
# Address and port the server is told to listen on (--host / --port).
INTRANET_IP=10.1.1.72
PORT=30000

# Refuse to start when the draft model directory is missing. Diagnostics go
# to stderr so they are not mixed into captured stdout.
if [[ ! -d "$MERGED" ]]; then
    echo "[ERROR] Merged model not found: $MERGED" >&2
    echo "        Run: conda activate sglang && python3 merge_lora.py" >&2
    exit 1
fi

# Print a summary of the launch configuration before starting the server.
cat <<EOF
============================================
 SGLang STANDALONE Speculative Decoding
 target : $BASE_MODEL
 draft  : $MERGED
 host   : $INTRANET_IP:$PORT
 tp     : $TP
============================================
EOF

# Start the SGLang server using the interpreter from the "sglang" conda env,
# with BASE_MODEL as the target model and MERGED as the speculative draft
# model. Every expansion is quoted so a value containing whitespace or glob
# characters is passed through as a single argument.
# NOTE(review): with --speculative-eagle-topk 1, SGLang may adjust
# --speculative-num-draft-tokens to num-steps + 1 at startup — confirm
# against the installed SGLang version.
/workspace/miniconda3/envs/sglang/bin/python3 -m sglang.launch_server \
    --model-path                    "$BASE_MODEL" \
    --speculative-algorithm         STANDALONE \
    --speculative-draft-model-path  "$MERGED" \
    --speculative-num-steps         4 \
    --speculative-eagle-topk        1 \
    --speculative-num-draft-tokens  4 \
    --tp-size                       "$TP" \
    --mem-fraction-static           0.30 \
    --trust-remote-code \
    --host                          "$INTRANET_IP" \
    --port                          "$PORT" \
    --dtype                         bfloat16