---
# Chakravyuh — vLLM serving harness
#
# Boots a vLLM 0.6+ server with the Chakravyuh Analyzer LoRA pre-loaded
# against Qwen2.5-7B-Instruct. Exposes an OpenAI-compatible
# `/v1/chat/completions` endpoint on port 8000.
#
# Usage:
#   docker compose -f serving/vllm_compose.yml up
#
# Hardware: requires NVIDIA GPU with at least 14 GB VRAM (A10G+, A100, H100).
#
# To use AWQ quantization (~8 GB VRAM), uncomment the --quantization line below.
version: "3.9"

services:
  chakravyuh-vllm:
    image: vllm/vllm-openai:v0.6.4
    container_name: chakravyuh-analyzer
    restart: unless-stopped
    # NOTE(review): vLLM's Docker docs recommend `ipc: host` (or a large
    # shm-size) for PyTorch shared memory; uncomment if workers crash on
    # shared-memory allocation:
    # ipc: host
    ports:
      # Quoted to avoid YAML 1.1 sexagesimal parsing of colon-separated digits.
      - "8000:8000"
    environment:
      # Token is optional (defaults to empty) — only needed for gated repos.
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN:-}
      - HF_HOME=/root/.cache/huggingface
    volumes:
      # Persist model downloads across container restarts.
      - hf_cache:/root/.cache/huggingface
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    # Args are appended to the image's entrypoint
    # (`python3 -m vllm.entrypoints.openai.api_server`).
    command:
      - "--model"
      - "Qwen/Qwen2.5-7B-Instruct"
      - "--enable-lora"
      - "--lora-modules"
      # Served adapter name = HF repo of the LoRA weights.
      - "chakravyuh-analyzer-lora-v2=ujjwalpardeshi/chakravyuh-analyzer-lora-v2"
      - "--max-loras"
      - "1"
      - "--max-lora-rank"
      - "64"
      - "--dtype"
      - "bfloat16"
      - "--gpu-memory-utilization"
      - "0.85"
      - "--max-model-len"
      - "4096"
      # Uncomment for AWQ 4-bit quantization (~8 GB VRAM):
      # - "--quantization"
      # - "awq"
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:8000/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 5
      # Generous grace period: first boot downloads ~15 GB of weights.
      start_period: 120s

volumes:
  hf_cache:
    driver: local