# Chakravyuh — vLLM serving harness
#
# Boots a vLLM 0.6+ server with the Chakravyuh Analyzer LoRA pre-loaded
# on top of Qwen2.5-7B-Instruct. Exposes an OpenAI-compatible
# `/v1/chat/completions` endpoint on port 8000.
#
# Usage:
# docker compose -f serving/vllm_compose.yml up
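#
# If the LoRA adapter repo is private, export HUGGING_FACE_HUB_TOKEN before
# bringing the stack up; it is forwarded into the container below.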
#
# Hardware: NVIDIA GPU required. The bf16 weights alone occupy ~15 GB, so a
# 24 GB-class card (A10G, A100, H100) is recommended.
#
# To run in roughly 8 GB of VRAM with AWQ 4-bit quantization, see the
# commented --quantization lines below; the base model must also be swapped
# for an AWQ checkpoint.
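#
# Example request once the server reports healthy (a sketch; the prompt is
# illustrative). The adapter is selected by passing its served name, i.e. the
# left-hand side of --lora-modules below, as the "model" field:
#
#   curl http://localhost:8000/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{
#           "model": "chakravyuh-analyzer-lora-v2",
#           "messages": [{"role": "user", "content": "Analyze this payload."}]
#         }'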
version: "3.9"

services:
  chakravyuh-vllm:
    image: vllm/vllm-openai:v0.6.4
    container_name: chakravyuh-analyzer
    restart: unless-stopped
    ports:
      - "8000:8000"
    environment:
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN:-}
      - HF_HOME=/root/.cache/huggingface
    volumes:
      - hf_cache:/root/.cache/huggingface
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
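    # The nvidia device reservation above requires the NVIDIA Container
    # Toolkit on the host; Compose cannot schedule the GPU without it.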
    command:
      - "--model"
      - "Qwen/Qwen2.5-7B-Instruct"
      - "--enable-lora"
      - "--lora-modules"
      - "chakravyuh-analyzer-lora-v2=ujjwalpardeshi/chakravyuh-analyzer-lora-v2"
      - "--max-loras"
      - "1"
      - "--max-lora-rank"
      - "64"
      - "--dtype"
      - "bfloat16"
      - "--gpu-memory-utilization"
      - "0.85"
      - "--max-model-len"
      - "4096"
      # Uncomment for AWQ 4-bit quantization (~8 GB VRAM). vLLM does not
      # quantize on the fly, so also switch --model above to a pre-quantized
      # checkpoint such as Qwen/Qwen2.5-7B-Instruct-AWQ (AWQ kernels generally
      # expect --dtype float16):
      # - "--quantization"
      # - "awq"
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:8000/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 120s
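    # Once healthy, the adapter should be listed alongside the base model by:
    #   curl http://localhost:8000/v1/models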

volumes:
  hf_cache:
    driver: local