# vLLM serving on AMD MI300X via ROCm.
#
# Build:
#   docker build -t document-intelligence-vllm:latest -f infra/vllm/Dockerfile .
#
# Run on the AMD Developer Cloud MI300X instance:
#   docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video \
#     -p 8000:8000 \
#     -e VLLM_MODEL=Qwen/Qwen2.5-14B-Instruct \
#     -e HF_TOKEN=<your-hf-token-if-gated> \
#     document-intelligence-vllm:latest
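#
# Smoke test once the server is up (a minimal sketch: vLLM exposes an
# OpenAI-compatible API, and the "model" field must match $VLLM_MODEL):
#   curl http://localhost:8000/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model":"Qwen/Qwen2.5-14B-Instruct","messages":[{"role":"user","content":"Hello"}]}'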
#
# The base image (rocm/vllm) bundles ROCm, vLLM, and the PyTorch ROCm wheels.
# Qwen 2.5 weights are NOT preloaded — vLLM downloads them on first run (~28 GB).
# To preload at build time, uncomment the snapshot_download block below.
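# Alternatively, keep the image slim and persist downloads across container
# runs by mounting the host's Hugging Face cache (path assumes the container
# runs as root, the rocm/vllm default):
#   -v ~/.cache/huggingface:/root/.cache/huggingface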
FROM rocm/vllm:latest
ENV PYTHONUNBUFFERED=1
# Optional: preload Qwen 2.5 14B Instruct weights (~28 GB) at build time.
# This bloats the image but eliminates first-run download latency.
# RUN python -c "from huggingface_hub import snapshot_download; \
#     snapshot_download('Qwen/Qwen2.5-14B-Instruct')"
EXPOSE 8000
# Serve via "sh -c" so $VLLM_MODEL expands at runtime.
# --tensor-parallel-size 1: a single MI300X has 192 GB of HBM3, which fits a
#   14B model comfortably.
# --gpu-memory-utilization 0.9: vLLM budgets 90% of GPU memory for weights
#   plus KV cache, leaving 10% headroom for other allocations.
# --max-model-len 32768: Qwen 2.5 supports up to 128K context (beyond 32K via
#   YaRN rope scaling), but 32K is plenty for document workloads and saves
#   KV-cache memory.
| CMD ["sh", "-c", "vllm serve $VLLM_MODEL \ | |
| --host 0.0.0.0 \ | |
| --port 8000 \ | |
| --tensor-parallel-size 1 \ | |
| --dtype auto \ | |
| --gpu-memory-utilization 0.9 \ | |
| --max-model-len 32768"] | |
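# Optional: container-level health probe against vLLM's /health endpoint,
# which returns 200 once the server is ready. Commented out because it
# assumes curl is available in the base image (verify before enabling); the
# generous start period covers first-run model download and load.
# HEALTHCHECK --interval=30s --timeout=5s --start-period=180s --retries=3 \
#   CMD curl -sf http://localhost:8000/health || exit 1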