#!/usr/bin/env bash
#
# Launch vLLM OpenAI-compatible API server(s) for the OncoAgent model tiers.
#
# Usage: <this script> [tier1|tier2|both]        (default: tier1)
#
#   tier1  Serve the Tier 1 model on VLLM_PORT, with optional LoRA adapters.
#   tier2  Serve the Tier 2 model on VLLM_PORT.
#   both   Serve Tier 1 on VLLM_PORT and Tier 2 on VLLM_PORT + 1, each with a
#          fixed GPU memory fraction of 0.45.
#
# Environment overrides (default in parentheses):
#   TIER1_MODEL_ID           Tier 1 model id            (Qwen/Qwen3.5-9B)
#   TIER2_MODEL_ID           Tier 2 model id            (Qwen/Qwen3.6-27B)
#   VLLM_PORT                Base listen port           (8000)
#   TENSOR_PARALLEL_SIZE     --tensor-parallel-size     (1)
#   MAX_MODEL_LEN            --max-model-len            (8192)
#   GPU_MEMORY_UTILIZATION   --gpu-memory-utilization   (0.90, single-tier modes)
#   LOCAL_ADAPTER_PATH       Directory with Tier 1 LoRA adapters (unset)
#   TOOL_CALL_PARSER         Optional --tool-call-parser value   (unset)
#   HSA_OVERRIDE_GFX_VERSION ROCm GFX version override  (9.4.2)
#
# Exit status: single-tier modes exec the server, so its status is returned.
# `both` returns 0 only if both servers exit successfully. Unknown mode: 1.

set -euo pipefail

TIER1_MODEL="${TIER1_MODEL_ID:-Qwen/Qwen3.5-9B}"
TIER2_MODEL="${TIER2_MODEL_ID:-Qwen/Qwen3.6-27B}"
VLLM_PORT="${VLLM_PORT:-8000}"
TENSOR_PARALLEL="${TENSOR_PARALLEL_SIZE:-1}"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-8192}"
GPU_MEMORY_UTIL="${GPU_MEMORY_UTILIZATION:-0.90}"
ADAPTER_PATH="${LOCAL_ADAPTER_PATH:-}"
# NOTE(review): vLLM's CLI appears to require --tool-call-parser whenever
# --enable-auto-tool-choice is given; confirm and set this for the deployed
# models. When empty, the command line matches the previous behaviour.
TOOL_CALL_PARSER="${TOOL_CALL_PARSER:-}"

# ROCm target settings; presumably the gfx942 architecture of the MI300X
# named in the banner below.
export HSA_OVERRIDE_GFX_VERSION="${HSA_OVERRIDE_GFX_VERSION:-9.4.2}"
export PYTORCH_ROCM_ARCH="gfx942"

# Per-server GPU memory fraction in `both` mode (presumably chosen so the two
# servers can share the same device).
readonly SHARED_GPU_MEMORY_UTIL="0.45"
# Head start, in seconds, given to Tier 1 before Tier 2 is launched.
readonly TIER2_START_DELAY_SECS=10

# argv of the server currently being assembled. An array (rather than a string
# passed to eval) keeps every argument intact regardless of spaces or shell
# metacharacters in model ids and paths.
SERVER_CMD=()

# Print an error to stderr and abort with status 1.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

#######################################
# Reset SERVER_CMD to the base server invocation shared by every mode.
# Globals:   SERVER_CMD (written), TENSOR_PARALLEL, MAX_MODEL_LEN (read)
# Arguments: $1 - model id, $2 - port, $3 - GPU memory utilization fraction
#######################################
build_server_cmd() {
  local model=$1 port=$2 gpu_util=$3
  SERVER_CMD=(
    python -m vllm.entrypoints.openai.api_server
    --model "${model}"
    --port "${port}"
    --tensor-parallel-size "${TENSOR_PARALLEL}"
    --max-model-len "${MAX_MODEL_LEN}"
    --gpu-memory-utilization "${gpu_util}"
    --dtype bfloat16
    --trust-remote-code
  )
}

# Append the automatic tool-choice flag (plus the parser, when configured).
append_tool_choice_args() {
  SERVER_CMD+=(--enable-auto-tool-choice)
  if [[ -n "${TOOL_CALL_PARSER}" ]]; then
    SERVER_CMD+=(--tool-call-parser "${TOOL_CALL_PARSER}")
  fi
}

# Append LoRA flags when ADAPTER_PATH names an existing directory. A path that
# is set but unusable is reported on stderr instead of being silently ignored;
# the server is still started without LoRA, as before.
append_lora_args() {
  if [[ -z "${ADAPTER_PATH}" ]]; then
    return 0
  fi
  if [[ ! -d "${ADAPTER_PATH}" ]]; then
    printf 'warning: LOCAL_ADAPTER_PATH is not a directory, LoRA disabled: %s\n' \
      "${ADAPTER_PATH}" >&2
    return 0
  fi
  echo "Loading LoRA adapters from: ${ADAPTER_PATH}"
  SERVER_CMD+=(--enable-lora --lora-modules "oncoagent-tier1=${ADAPTER_PATH}")
}

# Terminate any background servers that are still running and reap them, so
# they do not outlive this script.
stop_servers() {
  local running
  running=$(jobs -pr) || true
  if [[ -n "${running}" ]]; then
    # Word splitting is intended here: one PID per word.
    # shellcheck disable=SC2086
    kill ${running} 2>/dev/null || true
    wait 2>/dev/null || true
  fi
}

#######################################
# Run Tier 1 and Tier 2 side by side on consecutive ports.
# Globals:   TIER1_MODEL, TIER2_MODEL, VLLM_PORT (read); SERVER_CMD (written)
# Returns:   0 if both servers exit successfully, 1 otherwise
#######################################
serve_both() {
  # Validate before doing arithmetic: $(( )) evaluates arbitrary expressions.
  [[ "${VLLM_PORT}" =~ ^[0-9]+$ ]] \
    || die "VLLM_PORT must be a non-negative integer, got '${VLLM_PORT}'"
  # 10# forces base 10 so a leading zero is not read as octal.
  local tier2_port=$((10#${VLLM_PORT} + 1))
  local t1_pid t2_pid pid status=0

  echo "Multi-model mode: serving both tiers"
  echo " Tier 1 (port ${VLLM_PORT}): ${TIER1_MODEL}"
  echo " Tier 2 (port ${tier2_port}): ${TIER2_MODEL}"
  echo ""
  echo "Starting Tier 1..."

  # Without job control, background jobs ignore SIGINT and are not signalled
  # when this script is terminated, so stop them explicitly on every exit path.
  trap stop_servers EXIT
  trap 'exit 130' INT
  trap 'exit 143' TERM

  build_server_cmd "${TIER1_MODEL}" "${VLLM_PORT}" "${SHARED_GPU_MEMORY_UTIL}"
  append_lora_args
  "${SERVER_CMD[@]}" &
  t1_pid=$!
  sleep "${TIER2_START_DELAY_SECS}"

  echo "Starting Tier 2..."
  build_server_cmd "${TIER2_MODEL}" "${tier2_port}" "${SHARED_GPU_MEMORY_UTIL}"
  "${SERVER_CMD[@]}" &
  t2_pid=$!

  echo "Both models running. PIDs: Tier1=${t1_pid} Tier2=${t2_pid}"

  # A bare `wait` always returns 0; wait on each PID so a failed server is
  # reflected in the exit status.
  for pid in "${t1_pid}" "${t2_pid}"; do
    wait "${pid}" || status=1
  done
  return "${status}"
}

# Entry point: $1 selects the mode (tier1 | tier2 | both), default tier1.
main() {
  local mode="${1:-tier1}"

  echo "============================================"
  echo " OncoAgent vLLM Server — AMD MI300X"
  echo " Mode: ${mode}"
  echo "============================================"

  case "${mode}" in
    tier1)
      echo "Starting Tier 1: ${TIER1_MODEL}"
      build_server_cmd "${TIER1_MODEL}" "${VLLM_PORT}" "${GPU_MEMORY_UTIL}"
      append_tool_choice_args
      append_lora_args
      # exec replaces this shell so signals reach the server directly.
      exec "${SERVER_CMD[@]}"
      ;;
    tier2)
      echo "Starting Tier 2: ${TIER2_MODEL}"
      build_server_cmd "${TIER2_MODEL}" "${VLLM_PORT}" "${GPU_MEMORY_UTIL}"
      append_tool_choice_args
      exec "${SERVER_CMD[@]}"
      ;;
    both)
      # NOTE(review): this mode has never passed --enable-auto-tool-choice,
      # unlike the single-tier modes; confirm whether that is intentional.
      serve_both
      ;;
    *)
      echo "Usage: $0 [tier1|tier2|both]" >&2
      exit 1
      ;;
  esac
}

main "$@"