# vLLM serving on AMD MI300X via ROCm.
#
# Build:
# docker build -t document-intelligence-vllm:latest -f infra/vllm/Dockerfile .
#
# Run on the AMD Developer Cloud MI300X instance:
# docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video \
# -p 8000:8000 \
# -e VLLM_MODEL=Qwen/Qwen2.5-14B-Instruct \
# -e HF_TOKEN=<your-hf-token-if-gated> \
# document-intelligence-vllm:latest
#
# The base image (rocm/vllm) bundles ROCm + vLLM + PyTorch ROCm wheels.
# Qwen 2.5 weights are NOT preloaded — vLLM downloads on first run (~28 GB).
# To preload at build time, uncomment the snapshot_download block below.
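#
# Tip: weights download into the container's Hugging Face cache and vanish
# when the container exits. To keep them across runs without baking them into
# the image, add a cache mount to the run command above (a sketch, assuming
# the base image runs as root with the default cache path):
#   -v ~/.cache/huggingface:/root/.cache/huggingface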
# NOTE: the :latest tag moves over time; pin a dated tag for reproducible builds.
FROM rocm/vllm:latest
ENV PYTHONUNBUFFERED=1
# Default model (matches the run example above); override with -e VLLM_MODEL=...
# Without a default, an unset VLLM_MODEL would leave `vllm serve` with no
# model argument and the container would exit at startup.
ENV VLLM_MODEL=Qwen/Qwen2.5-14B-Instruct
# Optional: preload Qwen 2.5 14B Instruct weights (~28 GB) at build time.
# This bloats the image but eliminates first-run download latency.
# RUN python -c "from huggingface_hub import snapshot_download; \
# snapshot_download('Qwen/Qwen2.5-14B-Instruct')"
EXPOSE 8000
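# Optional: surface readiness to the orchestrator via vLLM's /health endpoint.
# A sketch, left commented out because it assumes curl exists in the base
# image (verify for your rocm/vllm tag before enabling).
# HEALTHCHECK --interval=30s --timeout=5s --start-period=180s --retries=3 \
#   CMD curl -sf http://localhost:8000/health || exit 1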
# Launch through `sh -c` so $VLLM_MODEL expands at runtime (plain exec-form
# CMD does no variable expansion).
# --tensor-parallel-size 1: a single MI300X has 192 GB HBM, fits 14B easily.
# --gpu-memory-utilization 0.9: vLLM claims 90% of HBM for weights plus KV
#   cache, leaving 10% headroom for other runtime overhead.
# --max-model-len 32768: Qwen 2.5 native context is 128K, but 32K is plenty
# for document workloads and saves KV memory.
CMD ["sh", "-c", "vllm serve $VLLM_MODEL \
--host 0.0.0.0 \
--port 8000 \
--tensor-parallel-size 1 \
--dtype auto \
--gpu-memory-utilization 0.9 \
--max-model-len 32768"]
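# Smoke test once the server is up (vLLM exposes an OpenAI-compatible API;
# the "model" field in the request must match $VLLM_MODEL):
#   curl http://localhost:8000/v1/models
#   curl http://localhost:8000/v1/chat/completions \
#     -H 'Content-Type: application/json' \
#     -d '{"model":"Qwen/Qwen2.5-14B-Instruct","messages":[{"role":"user","content":"Say hello."}]}'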