Fast Start - Docker Compose

#3
by Bellesteck - opened

Drop the following into a `docker-compose.yml` file, start Docker Desktop, and then run `docker compose up -d` in a terminal in the directory containing that file.

services:
  # vLLM OpenAI-compatible API server running an AWQ-quantized Devstral model.
  vllm-awq:
    image: vllm/vllm-openai:nightly
    container_name: vllm-server-awq
    ports:
      # Host port 8005 -> container port 8000 (matches the --port flag below).
      - "8005:8000"
    environment:
      # Pin vLLM to the first GPU; HF_TOKEN is expanded from the host
      # environment (or a .env file) for gated Hugging Face model downloads.
      - CUDA_VISIBLE_DEVICES=0
      - HF_TOKEN=${HF_TOKEN}
    deploy:
      resources:
        reservations:
          devices:
            # Reserve NVIDIA GPUs for this container.
            # NOTE(review): requires the NVIDIA Container Toolkit on the host.
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # vLLM engine arguments (appended to the image's entrypoint).
    command: [
      "--dtype", "half",                    # fp16 model weights
      "--enable-auto-tool-choice",          # let the server pick tool calls automatically
      "--gpu-memory-utilization", "0.93",   # fraction of VRAM vLLM may claim
      "--host", "0.0.0.0",                  # listen on all interfaces inside the container
      "--kv-cache-dtype", "fp8",            # fp8 KV cache to fit a longer context
      "--max-model-len", "100000",          # context window (tokens)
      "--max-num-batched-tokens", "10240",
      "--max-num-seqs", "6",                # max concurrent sequences
      "--model", "cyankiwi/Devstral-Small-2-24B-Instruct-2512-AWQ-4bit",
      "--port", "8000",                     # container-internal port (see ports mapping above)
      "--quantization", "compressed-tensors",
      "--served-model-name", "devstral",    # model name exposed via the OpenAI API
      "--tool-call-parser", "mistral",
      "--tensor-parallel-size", "1"         # single-GPU inference
    ]
    restart: unless-stopped
    # Larger shared memory segment; the PyTorch/NCCL stack commonly needs
    # more than Docker's 64 MB default.
    shm_size: '2gb'
    ulimits:
      memlock: -1        # unlimited locked memory
      stack: 67108864    # 64 MiB stack size
    # Share the host IPC namespace — presumably for multi-process shared
    # memory; verify it is actually required for a single-GPU setup.
    ipc: host

If you want to stop it, run `docker compose down`.

EZ

FYI this fills an RTX 5090 to the brim.

Sign up or log in to comment