Bloody hell!! running perfectly on 3x 3090 at 160k context, speeds between 65tk/s to 30tk/s (depending on length), my script:
Ubuntu 24.04 LTS, nvidia driver 575.
op@op:~$ cat vllm_qwen3_80b_starter.sh
#!/bin/bash
# vLLM Qwen3 Model Starter Script
# Starts the vLLM OpenAI-compatible server with the Qwen3-Next-80B AWQ model,
# split across 3 GPUs via pipeline parallelism.
set -euo pipefail

echo "Starting vLLM server with Qwen3 model..."

# Activate the vllm conda environment (that's where the running instance is).
source /home/op/miniconda3/etc/profile.d/conda.sh

# Check the activation directly; `if ! cmd` is also safe under `set -e`.
if ! conda activate vllm; then
  echo "Error: Failed to activate conda environment 'vllm'" >&2
  exit 1
fi
echo "Conda environment 'vllm' activated successfully"

# Start the vLLM server with the Qwen3 model.
# NOTE: this summary is kept in sync with the flags actually passed below.
echo "Starting vLLM server with the following parameters:"
echo "  Model: cpatonn/Qwen3-Next-80B-A3B-Instruct-AWQ-4bit"
echo "  Pipeline parallel size: 3"
echo "  Tensor parallel size: 1"
echo "  Dtype: float16"
echo "  KV cache dtype: auto"
echo "  GPU memory utilization: 0.92"
echo "  Max num seqs: 1"
echo "  Max model length: 160000"
echo "  Port: 8030"
echo ""

# Trailing backslashes are required so every flag reaches one vllm invocation.
vllm serve cpatonn/Qwen3-Next-80B-A3B-Instruct-AWQ-4bit \
  --pipeline-parallel-size 3 \
  --tensor-parallel-size 1 \
  --dtype float16 \
  --kv-cache-dtype auto \
  --gpu-memory-utilization 0.92 \
  --max-num-seqs 1 \
  --max-model-len 160000 \
  --trust-remote-code \
  --port 8030

echo "vLLM server stopped"