#!/usr/bin/env bash
# Boot two vLLM processes + the FastAPI proxy, all in one container.
#
# Lifecycle:
#   1. Launch vLLM(qwen) on :8001, wait until /health returns 200.
#   2. THEN launch vLLM(physix) on :8002, wait until /health returns 200.
#      (Sequential, not parallel — see below.)
#   3. Exec uvicorn proxy in the foreground (PID 1) — when HF Spaces sends
#      SIGTERM at sleep time, uvicorn exits cleanly. Note that `exec`
#      discards the signal trap below, so the trap only covers SIGTERM
#      during this boot script; once uvicorn is PID 1, the kernel SIGKILLs
#      the vLLM children when PID 1 exits (teardown of the container's
#      PID namespace), so nothing is left behind.
#
# Why sequential, not parallel:
#   The first deploy attempt booted both vLLMs in parallel and the second
#   one died with `ValueError: No available memory for the cache blocks.
#   Try increasing gpu_memory_utilization`. Cause: vLLM reads the GPU's
#   *currently free* memory at startup and then reserves
#   `--gpu-memory-utilization * (free at this moment)`. When two processes
#   start simultaneously, both read "all 24 GB free" and both try to grab
#   ~10 GB; whichever one finalises last loses, because by then there's
#   only ~10-12 GB actually free.
#
#   Sequential boot makes the second vLLM observe the post-first-process
#   free memory, so its allocation gets sized correctly.
#
# Why --gpu-memory-utilization 0.40 each (= 80% total reserved):
#   On L4 (24 GB), 0.40 * 24 ≈ 9.6 GB per process. Qwen2.5-3B fp16 weights
#   are ~6.2 GB; that leaves ~3.4 GB for KV cache + activations, which
#   sustains max_model_len=4096 with comfortable margin. The 20% reserve
#   covers the CUDA workspace + Python/uvicorn heap + the second vLLM's
#   own ~600 MB CUDA context overhead. We deliberately do NOT push to
#   0.45 each — the previous deploy proved the residual headroom isn't
#   there once both contexts coexist.

set -euo pipefail

QWEN_MODEL="${QWEN_MODEL:-Qwen/Qwen2.5-3B-Instruct}"
PHYSIX_MODEL="${PHYSIX_MODEL:-Pratyush-01/physix-3b-rl}"
QWEN_GPU_FRAC="${QWEN_GPU_FRAC:-0.40}"
PHYSIX_GPU_FRAC="${PHYSIX_GPU_FRAC:-0.40}"

# 4096 is enough for the PhysiX prompt (~1500 tok) + completion (~512 tok)
# with comfortable headroom, and tightening it materially shrinks the KV
# cache footprint vs vLLM's default of model.max_position_embeddings
# (32k for Qwen2.5).
MAX_LEN="${MAX_LEN:-4096}"

# PORT is expected from the environment; default to 7860 (HF Spaces'
# conventional app port) so `set -u` doesn't abort when it's missing,
# e.g. on a local run.
PORT="${PORT:-7860}"

LOG_DIR=/tmp/logs
mkdir -p "$LOG_DIR"

# Track child PIDs so the signal trap can terminate them all on
# SIGTERM/SIGINT. HF Spaces sends SIGTERM when pausing the Space.
PIDS=()

cleanup() {
  echo "[entrypoint] SIGTERM/SIGINT — killing children: ${PIDS[*]:-}" >&2
  for pid in "${PIDS[@]:-}"; do
    kill -TERM "$pid" 2>/dev/null || true
  done
  wait || true
  exit 0
}
trap cleanup TERM INT

wait_healthy() {
  local name="$1" port="$2" pid="$3" budget="${4:-480}"
  local deadline=$((SECONDS + budget))
  while (( SECONDS < deadline )); do
    # If the child died, surface its log and bail out — silently
    # waiting forever for a corpse is the worst failure mode.
    if ! kill -0 "$pid" 2>/dev/null; then
      echo "[entrypoint] FATAL: $name (pid $pid) died during boot. Tail of log:" >&2
      tail -n 80 "$LOG_DIR/vllm-${name}.log" >&2 || true
      return 1
    fi
    if curl -fsS "http://127.0.0.1:${port}/health" >/dev/null 2>&1; then
      echo "[entrypoint] $name healthy on :$port (after ${SECONDS}s)"
      return 0
    fi
    sleep 5
  done
  echo "[entrypoint] FATAL: $name failed to become healthy in ${budget}s" >&2
  tail -n 80 "$LOG_DIR/vllm-${name}.log" >&2 || true
  return 1
}

echo "[entrypoint] step 1/3 — booting vLLM(qwen) = $QWEN_MODEL on :8001 (gpu=${QWEN_GPU_FRAC})"
# The vllm/vllm-openai base image ships only `python3` (no `python`
# symlink), so use python3 explicitly. Using `python -m vllm...` here
# cost us a full failed deploy on the first try.
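# Optional debugging aid for the sizing math above: record how much VRAM
# is actually free just before this first launch (and, if repeated before
# the physix launch, after it). Assumes nvidia-smi is visible inside the
# container at runtime, which the NVIDIA container runtime normally
# provides; the guard makes this a no-op where it is not.
if command -v nvidia-smi >/dev/null 2>&1; then
  nvidia-smi --query-gpu=memory.free --format=csv,noheader \
    >> "$LOG_DIR/gpu-free.log" || true
fi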
python3 -m vllm.entrypoints.openai.api_server \
  --model "$QWEN_MODEL" \
  --served-model-name "$QWEN_MODEL" \
  --host 0.0.0.0 --port 8001 \
  --gpu-memory-utilization "$QWEN_GPU_FRAC" \
  --max-model-len "$MAX_LEN" \
  --dtype auto \
  --disable-log-requests \
  > "$LOG_DIR/vllm-qwen.log" 2>&1 &
QWEN_PID=$!
PIDS+=("$QWEN_PID")
wait_healthy qwen 8001 "$QWEN_PID"

echo "[entrypoint] step 2/3 — booting vLLM(physix) = $PHYSIX_MODEL on :8002 (gpu=${PHYSIX_GPU_FRAC})"
python3 -m vllm.entrypoints.openai.api_server \
  --model "$PHYSIX_MODEL" \
  --served-model-name "$PHYSIX_MODEL" \
  --host 0.0.0.0 --port 8002 \
  --gpu-memory-utilization "$PHYSIX_GPU_FRAC" \
  --max-model-len "$MAX_LEN" \
  --dtype auto \
  --disable-log-requests \
  > "$LOG_DIR/vllm-physix.log" 2>&1 &
PHYSIX_PID=$!
PIDS+=("$PHYSIX_PID")
wait_healthy physix 8002 "$PHYSIX_PID"

echo "[entrypoint] step 3/3 — both vLLMs healthy; starting proxy on :${PORT}"
# `exec` so uvicorn replaces this script as PID 1 and receives HF Spaces'
# SIGTERM directly. Note that `exec` also discards the trap above: it
# only protects the boot phase. After the handover, the kernel SIGKILLs
# the vLLM children when PID 1 exits (PID-namespace teardown), so they
# still go away when the Space is paused, just not gracefully.
exec python3 -m uvicorn proxy:app \
  --host 0.0.0.0 --port "${PORT}" \
  --log-level info
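
# Smoke test once everything is up (never reached by the script itself,
# since `exec` above replaces the process). The proxy routes here are
# illustrative, adjust to whatever proxy:app actually exposes; /v1/models
# on :8001 and :8002 is vLLM's real OpenAI-compatible endpoint:
#   curl -fsS "http://127.0.0.1:${PORT}/health"
#   curl -fsS http://127.0.0.1:8001/v1/models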