# vLLM serving on AMD MI300X via ROCm.
#
# Build:
#   docker build -t document-intelligence-vllm:latest -f infra/vllm/Dockerfile .
#
# Run on the AMD Developer Cloud MI300X instance:
#   docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video \
#     -p 8000:8000 \
#     -e VLLM_MODEL=Qwen/Qwen2.5-14B-Instruct \
#     -e HF_TOKEN= \
#     document-intelligence-vllm:latest
#
# HF_TOKEN is only needed for gated or private model repos.
#
# The base image (rocm/vllm) bundles ROCm, vLLM, and the PyTorch ROCm wheels.
# Qwen 2.5 weights are NOT preloaded; vLLM downloads them on first run (~28 GB).
# To preload at build time, uncomment the snapshot_download block below.

FROM rocm/vllm:latest

ENV PYTHONUNBUFFERED=1

# Optional: preload Qwen 2.5 14B Instruct weights (~28 GB) at build time.
# This bloats the image but eliminates first-run download latency.
# RUN python -c "from huggingface_hub import snapshot_download; \
#     snapshot_download('Qwen/Qwen2.5-14B-Instruct')"

EXPOSE 8000

# vLLM serve runs via a shell so $VLLM_MODEL expands at runtime.
# --tensor-parallel-size 1:     a single MI300X has 192 GB of HBM3, which
#                               holds a 14B model comfortably.
# --gpu-memory-utilization 0.9: vLLM budgets 90% of HBM for weights plus the
#                               KV cache, leaving 10% headroom for other
#                               processes and allocator overhead.
# --max-model-len 32768:        Qwen 2.5 supports up to 128K context, but 32K
#                               is plenty for document workloads and shrinks
#                               the per-sequence KV-cache footprint.
CMD ["sh", "-c", "vllm serve $VLLM_MODEL \
    --host 0.0.0.0 \
    --port 8000 \
    --tensor-parallel-size 1 \
    --dtype auto \
    --gpu-memory-utilization 0.9 \
    --max-model-len 32768"]
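
# To keep weights across container restarts without baking them into the
# image, mount a host Hugging Face cache into the container. A minimal
# sketch; the in-container path assumes the base image runs as root with the
# default HF cache location:
#
#   docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video \
#     -p 8000:8000 \
#     -v ~/.cache/huggingface:/root/.cache/huggingface \
#     -e VLLM_MODEL=Qwen/Qwen2.5-14B-Instruct \
#     document-intelligence-vllm:latest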
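
# Smoke test once the server is up. vLLM exposes an OpenAI-compatible API;
# the "model" field must match whatever $VLLM_MODEL was set to:
#
#   curl http://localhost:8000/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "Qwen/Qwen2.5-14B-Instruct",
#          "messages": [{"role": "user", "content": "Say hello."}]}'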
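
# Optional container-level liveness probe against vLLM's /health endpoint.
# Uncomment if your orchestrator consumes Docker health status (this assumes
# curl is available in the base image; the long start period covers the
# first-run model download):
#
# HEALTHCHECK --interval=30s --timeout=5s --start-period=10m --retries=3 \
#   CMD curl -sf http://localhost:8000/health || exit 1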