# Chakravyuh — vLLM serving harness
#
# Boots a vLLM 0.6+ server with the Chakravyuh Analyzer LoRA pre-loaded
# against Qwen2.5-7B-Instruct. Exposes an OpenAI-compatible
# `/v1/chat/completions` endpoint on port 8000.
#
# Usage:
#   docker compose -f serving/vllm_compose.yml up
#
# Hardware: requires an NVIDIA GPU with at least 14 GB VRAM (A10G+, A100, H100).
#
# To run with AWQ 4-bit quantization (~8 GB VRAM), uncomment the
# --quantization lines below and point --model at an AWQ-quantized
# checkpoint (e.g. Qwen/Qwen2.5-7B-Instruct-AWQ); vLLM will refuse to
# apply --quantization awq to full-precision weights.

version: "3.9"

services:
  chakravyuh-vllm:
    image: vllm/vllm-openai:v0.6.4
    container_name: chakravyuh-analyzer
    restart: unless-stopped
    ports:
      - "8000:8000"
    environment:
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN:-}
      - HF_HOME=/root/.cache/huggingface
    volumes:
      - hf_cache:/root/.cache/huggingface
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    command:
      - "--model"
      - "Qwen/Qwen2.5-7B-Instruct"
      - "--enable-lora"
      - "--lora-modules"
      - "chakravyuh-analyzer-lora-v2=ujjwalpardeshi/chakravyuh-analyzer-lora-v2"
      - "--max-loras"
      - "1"
      - "--max-lora-rank"
      - "64"
      - "--dtype"
      - "bfloat16"
      - "--gpu-memory-utilization"
      - "0.85"
      - "--max-model-len"
      - "4096"
      # Uncomment for AWQ 4-bit quantization (~8 GB VRAM); requires swapping
      # --model above for an AWQ-quantized checkpoint:
      # - "--quantization"
      # - "awq"
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:8000/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 120s

volumes:
  hf_cache:
    driver: local
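
# ---------------------------------------------------------------------------
# Smoke test (a minimal sketch): once the healthcheck passes, the adapter is
# addressable by the name registered via --lora-modules above. The prompt and
# max_tokens value below are illustrative, not part of this repo.
#
#   curl http://localhost:8000/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{
#           "model": "chakravyuh-analyzer-lora-v2",
#           "messages": [{"role": "user", "content": "Analyze this sample."}],
#           "max_tokens": 256
#         }'
#
# Requesting "Qwen/Qwen2.5-7B-Instruct" as the model instead serves the base
# weights with no adapter applied.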