File size: 3,834 Bytes
e1624f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/env bash
# ============================================================
# OncoAgent — vLLM Startup Script for AMD Instinct MI300X
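# Serves Tier 1 (Qwen3.5-9B) and/or Tier 2 (Qwen3.6-27B) through vLLM's
# OpenAI-compatible API. The port defaults to 8000 (override with VLLM_PORT);
# in `both` mode Tier 2 listens on the next port up (VLLM_PORT + 1).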
#
# Usage:
#   chmod +x deploy/start_vllm.sh
#   ./deploy/start_vllm.sh [tier1|tier2|both]
#
# Default: tier1 (single-model mode for demos)
# ============================================================

# Strict mode: stop on a failing command, on expansion of an unset variable,
# and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# --- Configuration ---
# Every setting reads the environment variable named inside the expansion and
# falls back to the default after ":-" when that variable is unset or empty.
TIER1_MODEL="${TIER1_MODEL_ID:-Qwen/Qwen3.5-9B}"         # Tier 1 model ID
TIER2_MODEL="${TIER2_MODEL_ID:-Qwen/Qwen3.6-27B}"        # Tier 2 model ID
VLLM_PORT="${VLLM_PORT:-8000}"                           # listening port
TENSOR_PARALLEL="${TENSOR_PARALLEL_SIZE:-1}"             # --tensor-parallel-size
MAX_MODEL_LEN="${MAX_MODEL_LEN:-8192}"                   # --max-model-len
GPU_MEMORY_UTIL="${GPU_MEMORY_UTILIZATION:-0.90}"        # --gpu-memory-utilization
ADAPTER_PATH="${LOCAL_ADAPTER_PATH:-}"                   # optional LoRA adapter directory

# ROCm environment, exported so the server processes inherit it. The GFX
# version override can be supplied by the caller; the PyTorch arch is fixed.
export HSA_OVERRIDE_GFX_VERSION="${HSA_OVERRIDE_GFX_VERSION:-9.4.2}" \
       PYTORCH_ROCM_ARCH="gfx942"

# Launch mode comes from the first positional argument; default is tier1.
MODE="${1:-tier1}"

# Startup banner (one argument per output line).
printf '%s\n' \
    "============================================" \
    "  OncoAgent vLLM Server — AMD MI300X" \
    "  Mode: ${MODE}" \
    "============================================"

# ------------------------------------------------------------
# Server launch
#
# Dispatches on MODE (first CLI argument, default tier1):
#   tier1 - serve TIER1_MODEL on VLLM_PORT, adding LoRA adapters when
#           ADAPTER_PATH names an existing directory
#   tier2 - serve TIER2_MODEL on VLLM_PORT
#   both  - serve Tier 1 on VLLM_PORT and Tier 2 on VLLM_PORT + 1 as
#           background jobs, then wait for them
# Any other value prints the usage line and exits with status 1.
#
# Commands are assembled as bash arrays and executed directly (no eval),
# so model IDs and adapter paths containing spaces or shell
# metacharacters reach vLLM as single, literal arguments.
# ------------------------------------------------------------

# Parser passed together with --enable-auto-tool-choice; vLLM's API server
# rejects that flag at startup when no --tool-call-parser accompanies it.
# NOTE(review): "hermes" is the parser the vLLM docs list for Qwen chat
# templates — confirm it matches the deployed models, or override via env.
TOOL_CALL_PARSER="${TOOL_CALL_PARSER:-hermes}"

# Argument vector of the server about to be launched (see build_server_cmd).
SERVER_CMD=()

# PIDs of the background servers in `both` mode; empty until launched.
T1_PID=""
T2_PID=""

# Print an error message to stderr and exit with status 1.
die() {
    printf 'ERROR: %s\n' "$*" >&2
    exit 1
}

#######################################
# Fill SERVER_CMD with the base vLLM API-server command line.
# Globals:   SERVER_CMD (written), TENSOR_PARALLEL, MAX_MODEL_LEN (read)
# Arguments: $1 - model ID or path
#            $2 - port to listen on
#            $3 - value for --gpu-memory-utilization
#######################################
build_server_cmd() {
    local model="$1" port="$2" gpu_util="$3"
    SERVER_CMD=(
        python -m vllm.entrypoints.openai.api_server
        --model "${model}"
        --port "${port}"
        --tensor-parallel-size "${TENSOR_PARALLEL}"
        --max-model-len "${MAX_MODEL_LEN}"
        --gpu-memory-utilization "${gpu_util}"
        --dtype bfloat16
        --trust-remote-code
    )
}

#######################################
# Append the automatic tool-choice flags to SERVER_CMD.
# Globals: SERVER_CMD (written), TOOL_CALL_PARSER (read)
#######################################
append_tool_choice_flags() {
    SERVER_CMD+=(--enable-auto-tool-choice --tool-call-parser "${TOOL_CALL_PARSER}")
}

#######################################
# Append the Tier 1 LoRA flags to SERVER_CMD when ADAPTER_PATH is usable.
# An empty ADAPTER_PATH is a no-op. A non-empty value that is not a
# directory is reported on stderr and the server starts without adapters.
# Globals: SERVER_CMD (written), ADAPTER_PATH (read)
#######################################
append_lora_flags() {
    if [[ -z "${ADAPTER_PATH}" ]]; then
        return 0
    fi
    if [[ ! -d "${ADAPTER_PATH}" ]]; then
        printf 'WARNING: LOCAL_ADAPTER_PATH=%s is not a directory; starting without LoRA adapters\n' \
            "${ADAPTER_PATH}" >&2
        return 0
    fi
    echo "Loading LoRA adapters from: ${ADAPTER_PATH}"
    SERVER_CMD+=(--enable-lora --lora-modules "oncoagent-tier1=${ADAPTER_PATH}")
}

#######################################
# Send SIGTERM to whichever background servers were started.
# Best effort: a server that has already exited is skipped silently.
# Globals: T1_PID, T2_PID (read)
#######################################
stop_servers() {
    local pid
    for pid in "${T1_PID}" "${T2_PID}"; do
        if [[ -n "${pid}" ]]; then
            kill "${pid}" 2>/dev/null || true
        fi
    done
}

case "${MODE}" in
    tier1)
        echo "Starting Tier 1: ${TIER1_MODEL}"
        build_server_cmd "${TIER1_MODEL}" "${VLLM_PORT}" "${GPU_MEMORY_UTIL}"
        append_tool_choice_flags
        append_lora_flags
        # exec replaces this shell, so signals sent to the script's PID are
        # delivered to the server itself.
        exec "${SERVER_CMD[@]}"
        ;;

    tier2)
        echo "Starting Tier 2: ${TIER2_MODEL}"
        build_server_cmd "${TIER2_MODEL}" "${VLLM_PORT}" "${GPU_MEMORY_UTIL}"
        append_tool_choice_flags
        exec "${SERVER_CMD[@]}"
        ;;

    both)
        # VLLM_PORT is used in shell arithmetic below, so reject anything
        # that is not a plain positive decimal integer first.
        [[ "${VLLM_PORT}" =~ ^[1-9][0-9]*$ ]] \
            || die "VLLM_PORT must be a positive integer, got '${VLLM_PORT}'"
        TIER2_PORT=$((VLLM_PORT + 1))

        echo "Multi-model mode: serving both tiers"
        echo "  Tier 1 (port ${VLLM_PORT}): ${TIER1_MODEL}"
        echo "  Tier 2 (port ${TIER2_PORT}): ${TIER2_MODEL}"
        echo ""
        echo "Starting Tier 1..."

        # Background jobs of a non-interactive shell ignore SIGINT, so the
        # servers are stopped explicitly whenever this script exits or is
        # interrupted/terminated.
        trap stop_servers EXIT
        trap 'exit 130' INT
        trap 'exit 143' TERM

        # Each server is capped at 0.45 of GPU memory, so two servers sharing
        # a device stay within 0.90 in total.
        # NOTE(review): unlike the single-tier modes, neither server here is
        # given --enable-auto-tool-choice — confirm that is intentional.
        build_server_cmd "${TIER1_MODEL}" "${VLLM_PORT}" 0.45
        append_lora_flags
        "${SERVER_CMD[@]}" &
        T1_PID=$!

        # Fixed head start for Tier 1 before Tier 2 is launched. This is a
        # delay, not a readiness check.
        sleep 10
        if ! kill -0 "${T1_PID}" 2>/dev/null; then
            die "Tier 1 server exited during startup; not starting Tier 2"
        fi

        echo "Starting Tier 2..."
        build_server_cmd "${TIER2_MODEL}" "${TIER2_PORT}" 0.45
        "${SERVER_CMD[@]}" &
        T2_PID=$!

        echo "Both models running. PIDs: Tier1=${T1_PID} Tier2=${T2_PID}"

        # A bare `wait` always returns 0; waiting on each PID lets a server
        # failure surface as this script's exit status.
        status=0
        wait "${T1_PID}" || status=$?
        wait "${T2_PID}" || status=$?
        exit "${status}"
        ;;

    *)
        echo "Usage: $0 [tier1|tier2|both]" >&2
        exit 1
        ;;
esac