#!/usr/bin/env bash
# ============================================================
# OncoAgent — vLLM Startup Script for AMD Instinct MI300X
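# Serves Tier 1 (Qwen3.5-9B), Tier 2 (Qwen3.6-27B), or both through vLLM's
# OpenAI-compatible API. Single-model modes listen on port 8000 (VLLM_PORT);
# in "both" mode Tier 1 keeps that port and Tier 2 uses the next one.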
#
# Usage:
# chmod +x deploy/start_vllm.sh
# ./deploy/start_vllm.sh [tier1|tier2|both]
#
# Default: tier1 (single-model mode for demos)
# ============================================================
# Strict mode: stop on a failing command, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# --- Configuration ---
# Every setting falls back to its built-in default when the corresponding
# environment variable is unset or empty.
TIER1_MODEL=${TIER1_MODEL_ID:-Qwen/Qwen3.5-9B}
TIER2_MODEL=${TIER2_MODEL_ID:-Qwen/Qwen3.6-27B}
VLLM_PORT=${VLLM_PORT:-8000}
TENSOR_PARALLEL=${TENSOR_PARALLEL_SIZE:-1}
MAX_MODEL_LEN=${MAX_MODEL_LEN:-8192}
GPU_MEMORY_UTIL=${GPU_MEMORY_UTILIZATION:-0.90}
# Empty when LOCAL_ADAPTER_PATH is not provided.
ADAPTER_PATH=${LOCAL_ADAPTER_PATH:-}

# ROCm environment, exported so the server processes inherit it. The GFX
# override can be supplied by the caller; the PyTorch arch is fixed.
HSA_OVERRIDE_GFX_VERSION=${HSA_OVERRIDE_GFX_VERSION:-9.4.2}
PYTORCH_ROCM_ARCH="gfx942"
export HSA_OVERRIDE_GFX_VERSION PYTORCH_ROCM_ARCH

# First positional argument selects the mode; tier1 when omitted.
MODE=${1:-tier1}

# Startup banner.
printf '%s\n' \
  "============================================" \
  " OncoAgent vLLM Server — AMD MI300X" \
  " Mode: ${MODE}" \
  "============================================"
# --- Server launch ---
# Every server command line is assembled in the VLLM_ARGS array instead of a
# string handed to eval, so model IDs and adapter paths reach vLLM verbatim
# even if they contain spaces or shell metacharacters.
VLLM_ARGS=()

#######################################
# Fill VLLM_ARGS with the flags shared by every server instance.
# Globals:   TENSOR_PARALLEL, MAX_MODEL_LEN (read); VLLM_ARGS (overwritten)
# Arguments: $1 - model ID
#            $2 - port to listen on
#            $3 - value for --gpu-memory-utilization
#######################################
build_vllm_args() {
  local model=$1 port=$2 gpu_util=$3
  VLLM_ARGS=(
    -m vllm.entrypoints.openai.api_server
    --model "${model}"
    --port "${port}"
    --tensor-parallel-size "${TENSOR_PARALLEL}"
    --max-model-len "${MAX_MODEL_LEN}"
    --gpu-memory-utilization "${gpu_util}"
    --dtype bfloat16
    --trust-remote-code
  )
}

#######################################
# Append the automatic tool-choice flag used by the single-model modes.
# Globals:   TOOL_CALL_PARSER (read, optional); VLLM_ARGS (appended)
#######################################
append_tool_choice_args() {
  VLLM_ARGS+=(--enable-auto-tool-choice)
  # NOTE(review): vLLM's tool-calling docs describe --tool-call-parser as a
  # required companion of --enable-auto-tool-choice — confirm against the
  # deployed vLLM version. It is only passed when TOOL_CALL_PARSER is set, so
  # the command line is unchanged for existing callers.
  if [[ -n "${TOOL_CALL_PARSER:-}" ]]; then
    VLLM_ARGS+=(--tool-call-parser "${TOOL_CALL_PARSER}")
  fi
}

#######################################
# Append the Tier 1 LoRA flags when an adapter directory is configured.
# A configured path that is not a directory is reported on stderr and the
# server still starts without LoRA.
# Globals:   ADAPTER_PATH (read); VLLM_ARGS (appended)
#######################################
append_lora_args() {
  [[ -n "${ADAPTER_PATH}" ]] || return 0
  if [[ ! -d "${ADAPTER_PATH}" ]]; then
    printf 'Warning: adapter path is not a directory, starting without LoRA: %s\n' \
      "${ADAPTER_PATH}" >&2
    return 0
  fi
  echo "Loading LoRA adapters from: ${ADAPTER_PATH}"
  VLLM_ARGS+=(--enable-lora --lora-modules "oncoagent-tier1=${ADAPTER_PATH}")
}

#######################################
# Terminate whichever background servers were started, then reap them.
# Globals:   T1_PID, T2_PID (read; either may be unset)
#######################################
stop_servers() {
  local pid
  for pid in "${T1_PID:-}" "${T2_PID:-}"; do
    if [[ -n "${pid}" ]]; then
      # A server that has already exited is not an error here.
      kill "${pid}" 2>/dev/null || true
    fi
  done
  wait 2>/dev/null || true
}

case "${MODE}" in
  tier1)
    echo "Starting Tier 1: ${TIER1_MODEL}"
    build_vllm_args "${TIER1_MODEL}" "${VLLM_PORT}" "${GPU_MEMORY_UTIL}"
    append_tool_choice_args
    append_lora_args
    python "${VLLM_ARGS[@]}"
    ;;
  tier2)
    echo "Starting Tier 2: ${TIER2_MODEL}"
    build_vllm_args "${TIER2_MODEL}" "${VLLM_PORT}" "${GPU_MEMORY_UTIL}"
    append_tool_choice_args
    python "${VLLM_ARGS[@]}"
    ;;
  both)
    # The Tier 2 port is computed arithmetically; accept plain digits only so
    # an arbitrary environment value is never evaluated as an expression.
    if [[ ! "${VLLM_PORT}" =~ ^[0-9]+$ ]]; then
      printf 'VLLM_PORT must be a decimal number, got: %s\n' "${VLLM_PORT}" >&2
      exit 1
    fi
    # 10# forces base 10 so a value with a leading zero is not read as octal.
    TIER2_PORT=$((10#${VLLM_PORT} + 1))

    echo "Multi-model mode: serving both tiers"
    echo " Tier 1 (port ${VLLM_PORT}): ${TIER1_MODEL}"
    echo " Tier 2 (port ${TIER2_PORT}): ${TIER2_MODEL}"
    echo ""

    # Make sure the background servers never outlive this script: any exit
    # path stops them, and INT/TERM are turned into an exit.
    trap stop_servers EXIT
    trap 'exit 130' INT
    trap 'exit 143' TERM

    echo "Starting Tier 1..."
    # Each server is capped at 0.45 of GPU memory so the two fit side by side.
    build_vllm_args "${TIER1_MODEL}" "${VLLM_PORT}" 0.45
    append_lora_args
    python "${VLLM_ARGS[@]}" &
    T1_PID=$!

    # Fixed head start for Tier 1 before the second server is launched.
    sleep 10

    echo "Starting Tier 2..."
    build_vllm_args "${TIER2_MODEL}" "${TIER2_PORT}" 0.45
    python "${VLLM_ARGS[@]}" &
    T2_PID=$!
    echo "Both models running. PIDs: Tier1=${T1_PID} Tier2=${T2_PID}"

    # Wait on each PID explicitly: a bare `wait` always returns 0 and would
    # hide a crashed server from whoever supervises this script.
    T1_STATUS=0
    T2_STATUS=0
    wait "${T1_PID}" || T1_STATUS=$?
    wait "${T2_PID}" || T2_STATUS=$?

    # Both servers are gone; drop the traps so no stale PID is signalled.
    trap - EXIT INT TERM
    if ((T1_STATUS != 0 || T2_STATUS != 0)); then
      printf 'Server exit status: Tier1=%d Tier2=%d\n' \
        "${T1_STATUS}" "${T2_STATUS}" >&2
      exit "$((T1_STATUS != 0 ? T1_STATUS : T2_STATUS))"
    fi
    ;;
  *)
    echo "Usage: $0 [tier1|tier2|both]" >&2
    exit 1
    ;;
esac
|