Fast Start - Docker Compose
#3
by Bellesteck - opened
Save the following as a docker-compose.yml file, start Docker Desktop, and then run `docker compose up -d` in a terminal from the directory that contains the file.
# Docker Compose definition for a single vLLM OpenAI-compatible server
# serving an AWQ 4-bit Devstral model on an NVIDIA GPU.
services:
  vllm-awq:
    image: vllm/vllm-openai:nightly
    container_name: vllm-server-awq
    # host port 8005 -> container port 8000 (matches "--port" in command below)
    ports:
      - "8005:8000"
    environment:
      - CUDA_VISIBLE_DEVICES=0
      # Substituted by Compose from the shell environment or a .env file.
      - HF_TOKEN=${HF_TOKEN}
    # Reserve NVIDIA GPU devices for the container.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # Arguments passed to the image's entrypoint; each flag is kept on the
    # same line as its value for readability.
    command:
      [
        "--dtype", "half",
        "--enable-auto-tool-choice",
        "--gpu-memory-utilization", "0.93",
        "--host", "0.0.0.0",
        "--kv-cache-dtype", "fp8",
        "--max-model-len", "100000",
        "--max-num-batched-tokens", "10240",
        "--max-num-seqs", "6",
        "--model", "cyankiwi/Devstral-Small-2-24B-Instruct-2512-AWQ-4bit",
        "--port", "8000",
        "--quantization", "compressed-tensors",
        "--served-model-name", "devstral",
        "--tool-call-parser", "mistral",
        "--tensor-parallel-size", "1"
      ]
    restart: unless-stopped
    # NOTE(review): with "ipc: host" below the container presumably uses the
    # host's shared memory, so shm_size may have no effect - confirm.
    shm_size: '2gb'
    ulimits:
      # -1 = unlimited locked memory
      memlock: -1
      # 67108864 bytes = 64 MiB stack
      stack: 67108864
    ipc: host
To stop it, run `docker compose down` from the same directory.
EZ
FYI this fills an RTX 5090 to the brim.