# Chakravyuh — vLLM serving harness
#
# Boots a vLLM 0.6+ server with the Chakravyuh Analyzer LoRA pre-loaded
# on top of Qwen2.5-7B-Instruct. Exposes an OpenAI-compatible
# `/v1/chat/completions` endpoint on port 8000.
#
# Usage:
# docker compose -f serving/vllm_compose.yml up
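#
# If the LoRA adapter repo is private, export HUGGING_FACE_HUB_TOKEN before
# bringing the stack up; it is forwarded into the container below.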
#
# Hardware: NVIDIA GPU required. The bf16 weights alone occupy ~15 GB, so a
# 24 GB-class card (A10G, A100, H100) is recommended.
#
# To run in roughly 8 GB of VRAM with AWQ 4-bit quantization, see the
# commented --quantization lines below; the base model must also be swapped
# for an AWQ checkpoint.
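#
# Example request once the server reports healthy (a sketch; the prompt is
# illustrative). The adapter is selected by passing its served name, i.e. the
# left-hand side of --lora-modules below, as the "model" field:
#
#   curl http://localhost:8000/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{
#           "model": "chakravyuh-analyzer-lora-v2",
#           "messages": [{"role": "user", "content": "Analyze this payload."}]
#         }'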
version: "3.9"

services:
  chakravyuh-vllm:
    image: vllm/vllm-openai:v0.6.4
    container_name: chakravyuh-analyzer
    restart: unless-stopped
    ports:
      - "8000:8000"
    environment:
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN:-}
      - HF_HOME=/root/.cache/huggingface
    volumes:
      - hf_cache:/root/.cache/huggingface
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
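    # The nvidia device reservation above requires the NVIDIA Container
    # Toolkit on the host; Compose cannot schedule the GPU without it.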
    command:
      - "--model"
      - "Qwen/Qwen2.5-7B-Instruct"
      - "--enable-lora"
      - "--lora-modules"
      - "chakravyuh-analyzer-lora-v2=ujjwalpardeshi/chakravyuh-analyzer-lora-v2"
      - "--max-loras"
      - "1"
      - "--max-lora-rank"
      - "64"
      - "--dtype"
      - "bfloat16"
      - "--gpu-memory-utilization"
      - "0.85"
      - "--max-model-len"
      - "4096"
      # Uncomment for AWQ 4-bit quantization (~8 GB VRAM). vLLM does not
      # quantize on the fly, so also switch --model above to a pre-quantized
      # checkpoint such as Qwen/Qwen2.5-7B-Instruct-AWQ (AWQ kernels generally
      # expect --dtype float16):
      # - "--quantization"
      # - "awq"
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:8000/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 120s
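    # Once healthy, the adapter should be listed alongside the base model by:
    #   curl http://localhost:8000/v1/models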

volumes:
  hf_cache:
    driver: local