# PhysiX-Infer — dual-model OpenAI-compatible inference Space.
#
# Hosts BOTH:
#   * Qwen/Qwen2.5-3B-Instruct        (untrained baseline)
#   * Pratyush-01/physix-3b-rl        (GRPO-trained variant)
#
# Why this Space exists:
#   The HF Inference Router does not serve Qwen/Qwen2.5-3B-Instruct (no
#   provider has it loaded), and won't serve a private fine-tune unless
#   the owner pays for an Inference Endpoint. Both checkpoints we want
#   to compare are 3B Qwen2.5 fp16 models, and on a single 24 GB L4 we
#   can keep two vLLM processes resident at ~40% of GPU memory each and
#   never pay router/endpoint fees.
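#
#   Back-of-envelope fit check: 3B params × 2 bytes (fp16) ≈ 6 GB of
#   weights per model. At --gpu-memory-utilization 0.40, each vLLM
#   claims 0.40 × 24 GB ≈ 9.6 GB, leaving ~3.6 GB per server for KV
#   cache once weights are loaded, and ~4.8 GB of total headroom
#   (24 - 2 × 9.6) for CUDA context and fragmentation.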
#
# Architecture (one container, three processes):
#   :8001  vllm serve  Qwen/Qwen2.5-3B-Instruct       --gpu-memory-utilization 0.40
#   :8002  vllm serve  Pratyush-01/physix-3b-rl       --gpu-memory-utilization 0.40
#   :7860  uvicorn proxy:app    routes by the JSON `model` field (sketched below)
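#
#   In spirit (a sketch, not the actual proxy.py; names are illustrative):
#
#       from fastapi import FastAPI, Request, Response
#       import httpx
#
#       app = FastAPI()
#       PORTS = {"Qwen/Qwen2.5-3B-Instruct": 8001,
#                "Pratyush-01/physix-3b-rl": 8002}
#
#       @app.post("/v1/chat/completions")
#       async def chat(request: Request):
#           body = await request.json()
#           port = PORTS[body["model"]]          # route on the `model` field
#           async with httpx.AsyncClient(timeout=None) as client:
#               r = await client.post(
#                   f"http://127.0.0.1:{port}/v1/chat/completions", json=body)
#           return Response(content=r.content, media_type="application/json")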
#
# Boot order matters: the two vLLM servers come up SEQUENTIALLY, not in
# parallel. Both probe free GPU memory at startup; if they race, the
# second crashes with "No available memory for the cache blocks." See
# entrypoint.sh for the full reasoning; its shape is sketched below.
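#
# In spirit (a sketch, not the actual entrypoint.sh):
#
#     vllm serve Qwen/Qwen2.5-3B-Instruct --port 8001 \
#         --gpu-memory-utilization 0.40 &
#     until curl -fsS http://127.0.0.1:8001/health; do sleep 2; done
#     vllm serve Pratyush-01/physix-3b-rl --port 8002 \
#         --gpu-memory-utilization 0.40 &
#     until curl -fsS http://127.0.0.1:8002/health; do sleep 2; done
#     exec uvicorn proxy:app --host 0.0.0.0 --port "$PORT"
#
# Launching the second server only after :8001/health answers is what
# serializes the GPU-memory probes.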
#
# Why the official vllm/vllm-openai image:
#   vLLM ships pre-compiled CUDA kernels that target the CUDA toolkit
#   and PyTorch versions it was built against. Building from a generic
#   nvidia/cuda image means recompiling vLLM's C++ kernels (~20 min,
#   often fragile across CUDA minor versions). Starting from
#   vllm/vllm-openai:<tag> guarantees torch / CUDA / NCCL / vLLM are all
#   ABI-compatible. We just layer fastapi + uvicorn + httpx for the
#   proxy on top.
#
# Cold start on a fresh HF Spaces L4 (no persistent /data):
#   * Image pull:           ~30 s
#   * Weight download:      ~45 s for both models from the Hub CDN
#   * vLLM startup:         ~30 s after weights are local
#   ── total ~90-120 s before /health flips green ──

FROM vllm/vllm-openai:v0.7.3

# vllm/vllm-openai sets ENTRYPOINT to `python -m vllm.entrypoints.openai.api_server`.
# We need to launch our own multi-process entrypoint instead, so we reset it.
ENTRYPOINT []

ENV PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    HOME=/tmp/home \
    HF_HOME=/tmp/hf_cache \
    XDG_CACHE_HOME=/tmp/xdg-cache \
    # vLLM's torch.compile cache must land somewhere writable. The image's
    # default ($HOME/.cache/vllm) breaks on HF Spaces because the runtime
    # user has no writable home.
    VLLM_CACHE_ROOT=/tmp/vllm_cache \
    TORCH_HOME=/tmp/torch_cache \
    TRITON_CACHE_DIR=/tmp/triton_cache \
    PORT=7860

# fastapi/uvicorn/httpx for the routing proxy. The image already has them
# transitively (vllm depends on fastapi), but pin minimums to be safe.
# `pip install --no-deps` would be tighter but trades safety for ~5 MB.
RUN pip install \
        "fastapi>=0.110" \
        "uvicorn[standard]>=0.29" \
        "httpx>=0.27"

WORKDIR /app

COPY proxy.py entrypoint.sh ./
RUN chmod +x /app/entrypoint.sh

# HF Spaces runs containers as a non-root UID with no /etc/passwd entry,
# so any cache path under $HOME must exist and be world-writable BEFORE
# the runtime user shows up. Pre-creating /tmp subdirs (which Spaces
# always lets us write to) is the standard workaround.
RUN mkdir -p \
        "$HOME" "$HF_HOME" "$XDG_CACHE_HOME" \
        "$VLLM_CACHE_ROOT" "$TORCH_HOME" "$TRITON_CACHE_DIR" \
        /tmp/logs \
    && chmod -R 0777 /tmp

EXPOSE 7860

# /health is served by proxy.py and returns 200 only when BOTH vLLMs are
# up. The generous start-period covers the ~120 s cold boot.
HEALTHCHECK --interval=30s --timeout=10s --start-period=180s --retries=3 \
    CMD curl -fsS "http://127.0.0.1:${PORT}/health" || exit 1

CMD ["/app/entrypoint.sh"]
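
# Smoke test once /health is green (the Space URL below is illustrative;
# substitute your own):
#   curl -s https://<owner>-physix-infer.hf.space/v1/chat/completions \
#     -H 'Content-Type: application/json' \
#     -d '{"model": "Pratyush-01/physix-3b-rl",
#          "messages": [{"role": "user", "content": "Define torque."}]}'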