#!/bin/bash
# Launch SGLang server with DFLASH speculative decoding.
#
# Usage:
#   bash start_server_dflash.sh        # use the default tp=2
#   bash start_server_dflash.sh 4      # use tp=4
#
# Arguments:
#   $1 - tensor-parallel size passed to --tp-size (optional, default 2);
#        must be a positive integer.
#
# Exit status:
#   2 if the tp argument is not a positive integer; otherwise the exit
#   status of the sglang.launch_server process.

set -euo pipefail

TP=${1:-2}

# Reject anything that is not a positive integer before handing it to the
# server, so a typo fails fast with a clear message.
if ! [[ "$TP" =~ ^[1-9][0-9]*$ ]]; then
  printf 'error: tp must be a positive integer, got: %s\n' "$TP" >&2
  printf 'Usage: bash %s [tp]\n' "${0##*/}" >&2
  exit 2
fi

readonly TP
readonly BASE_MODEL=/workspace/models/Qwen3-8B
readonly DRAFT_MODEL=/workspace/models/Qwen3-8B-DFlash-b16
readonly INTRANET_IP=10.1.1.22
readonly PORT=30000

# Print the effective configuration before launching.
echo "============================================"
echo " SGLang DFLASH Speculative Decoding"
echo " target : $BASE_MODEL"
echo " draft : $DRAFT_MODEL"
echo " host : $INTRANET_IP:$PORT"
echo " tp : $TP"
echo "============================================"

# NOTE(review): presumably allows SGLang to accept a context length longer
# than the one declared by the model config -- confirm against SGLang docs.
export SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1

# All expansions are quoted so values are passed as single arguments
# regardless of whitespace or glob characters.
/workspace/miniconda3/envs/dflash/bin/python -m sglang.launch_server \
  --model-path "$BASE_MODEL" \
  --speculative-algorithm DFLASH \
  --speculative-draft-model-path "$DRAFT_MODEL" \
  --tp-size "$TP" \
  --dtype bfloat16 \
  --attention-backend fa3 \
  --mem-fraction-static 0.30 \
  --trust-remote-code \
  --host "$INTRANET_IP" \
  --port "$PORT"