#!/bin/bash
# Launch SGLang server with DFLASH speculative decoding.
# Usage:
#   bash start_server_dflash.sh
#   bash start_server_dflash.sh 4   # use tp=4

# Strict mode: abort on a failing command (-e), on expansion of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Tensor-parallel size: first positional argument; falls back to 2 when the
# argument is missing or empty.
TP=${1:-2}

# Fail fast on a malformed value so a typo is reported here, with a usage
# hint, rather than surfacing later from the server's own argument parser.
# The 10# prefix forces base 10 so a value such as "08" is not read as octal.
if ! [[ "$TP" =~ ^[0-9]+$ ]] || (( 10#$TP < 1 )); then
  printf 'error: tp size must be a positive integer, got: %s\n' "$TP" >&2
  printf 'usage: bash %s [tp_size]\n' "${0##*/}" >&2
  exit 2
fi

# Endpoint the server binds to; these feed the --host and --port flags of the
# launch command.
PORT="30000"
INTRANET_IP="10.1.1.22"

# Model directories: BASE_MODEL is the target model (--model-path) and
# DRAFT_MODEL is the draft model (--speculative-draft-model-path).
DRAFT_MODEL="/workspace/models/Qwen3-8B-DFlash-b16"
BASE_MODEL="/workspace/models/Qwen3-8B"

# Startup banner: print the launch configuration (target model, draft model,
# bind address and tensor-parallel size) to stdout, one item per line.
printf '%s\n' \
  "============================================" \
  " SGLang DFLASH Speculative Decoding" \
  " target : $BASE_MODEL" \
  " draft  : $DRAFT_MODEL" \
  " host   : $INTRANET_IP:$PORT" \
  " tp     : $TP" \
  "============================================"

# Environment toggle read by SGLang. Judging by its name it allows a context
# length longer than the model's configured one to be used -- TODO confirm
# against the SGLang documentation.
export SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1

# Collect the server flags in an array with every expansion quoted, so each
# value reaches Python as exactly one argument. Unquoted expansions would be
# word-split and glob-expanded, which matters because TP comes straight from
# the command line.
server_args=(
  --model-path                   "$BASE_MODEL"
  --speculative-algorithm        DFLASH
  --speculative-draft-model-path "$DRAFT_MODEL"
  --tp-size                      "$TP"
  --dtype                        bfloat16
  --attention-backend            fa3
  --mem-fraction-static          0.30
  --trust-remote-code
  --host                         "$INTRANET_IP"
  --port                         "$PORT"
)

# Start the SGLang server using the interpreter from the "dflash" conda env.
/workspace/miniconda3/envs/dflash/bin/python -m sglang.launch_server "${server_args[@]}"