#!/usr/bin/env bash
# Boot the two in-Space vLLMs SEQUENTIALLY, then exec uvicorn for the
# physix FastAPI server (which serves the API + the React SPA).
#
# Why sequential, not parallel:
#   The first deploy attempt booted both vLLMs in parallel and the second
#   one died with "No available memory for the cache blocks." Reason: vLLM
#   reads `nvidia-smi`-style free memory at startup and reserves
#   `--gpu-memory-utilization * (free at this moment)` worth of VRAM.
#   When two processes start simultaneously, both see "all 24 GB free" and
#   both try to grab ~10 GB; the second one to finalize loses. Booting
#   sequentially makes the second one observe the post-first-process free
#   memory, so its allocation is sized correctly.
#
# Why --gpu-memory-utilization 0.40 each (= 80% total):
#   On L4 (24 GB), 40% = ~9.6 GB per process. Qwen2.5-3B fp16 weights are
#   ~6.2 GB; that leaves ~3.4 GB per process for KV cache + activations,
#   which sustains max_model_len=4096 with comfortable margin. The 20%
#   reserve covers CUDA workspace + uvicorn + Python heap. Pushing this
#   much higher (e.g. 0.45 each) is what failed on the first deploy
#   because once you account for the ~600 MB CUDA context + the second
#   process's overhead, weights+KV no longer fit.

set -euo pipefail

# Every knob is overridable from the Space's environment; these are the
# defaults justified in the header comment above.
QWEN_MODEL="${QWEN_MODEL:-Qwen/Qwen2.5-3B-Instruct}"
PHYSIX_MODEL="${PHYSIX_MODEL:-Pratyush-01/physix-3b-rl}"
QWEN_GPU_FRAC="${QWEN_GPU_FRAC:-0.40}"
PHYSIX_GPU_FRAC="${PHYSIX_GPU_FRAC:-0.40}"
MAX_LEN="${MAX_LEN:-4096}"

# Per-process boot logs live here; wait_healthy tails them on failure.
LOG_DIR=/tmp/logs
mkdir -p "$LOG_DIR"

# Forward signals so HF's "Pause" / "Restart" actually shuts everything
# down cleanly — otherwise CUDA memory leaks across container restarts.
# PIDs of background vLLM children, so cleanup can TERM them all.
PIDS=()

# cleanup — signal handler for TERM/INT.
# TERMs every recorded child, reaps them, and exits 0 so the container
# shuts down cleanly instead of leaking CUDA memory across restarts.
cleanup() {
  echo "[entrypoint] SIGTERM/SIGINT — killing children: ${PIDS[*]:-}" >&2
  for pid in "${PIDS[@]:-}"; do
    kill -TERM "$pid" 2>/dev/null || true
  done
  wait || true
  exit 0
}
trap cleanup TERM INT

# wait_healthy NAME PORT PID [BUDGET]
# Poll http://127.0.0.1:PORT/health every 5s until it responds, the
# process dies, or BUDGET seconds (default 300) elapse. Tails the
# service's log on failure. Returns 0 healthy / 1 dead-or-timed-out;
# under `set -e` a non-zero return aborts the whole boot.
wait_healthy() {
  local name="$1" port="$2" pid="$3" budget="${4:-300}"
  local deadline=$((SECONDS + budget))
  while (( SECONDS < deadline )); do
    # Fail fast if the process died instead of burning the whole budget.
    if ! kill -0 "$pid" 2>/dev/null; then
      echo "[entrypoint] FATAL: $name (pid $pid) died during boot. Tail of log:" >&2
      tail -n 80 "$LOG_DIR/${name}.log" >&2 || true
      return 1
    fi
    if curl -fsS "http://127.0.0.1:${port}/health" >/dev/null 2>&1; then
      echo "[entrypoint] $name healthy on :$port (after ${SECONDS}s)"
      return 0
    fi
    sleep 5
  done
  echo "[entrypoint] FATAL: $name failed to become healthy in ${budget}s" >&2
  tail -n 80 "$LOG_DIR/${name}.log" >&2 || true
  return 1
}

echo "[entrypoint] step 1/3 — booting vLLM(qwen) = $QWEN_MODEL on :8001 (gpu=${QWEN_GPU_FRAC})"
# vllm/vllm-openai image only ships `python3` — no `python` symlink.
python3 -m vllm.entrypoints.openai.api_server \
  --model "$QWEN_MODEL" \
  --served-model-name "$QWEN_MODEL" \
  --host 0.0.0.0 --port 8001 \
  --gpu-memory-utilization "$QWEN_GPU_FRAC" \
  --max-model-len "$MAX_LEN" \
  --dtype auto \
  --disable-log-requests \
  > "$LOG_DIR/qwen.log" 2>&1 &
QWEN_PID=$!
PIDS+=("$QWEN_PID")
wait_healthy qwen 8001 "$QWEN_PID" 300

echo "[entrypoint] step 2/3 — booting vLLM(physix) = $PHYSIX_MODEL on :8002 (gpu=${PHYSIX_GPU_FRAC})"
python3 -m vllm.entrypoints.openai.api_server \
  --model "$PHYSIX_MODEL" \
  --served-model-name "$PHYSIX_MODEL" \
  --host 0.0.0.0 --port 8002 \
  --gpu-memory-utilization "$PHYSIX_GPU_FRAC" \
  --max-model-len "$MAX_LEN" \
  --dtype auto \
  --disable-log-requests \
  > "$LOG_DIR/physix.log" 2>&1 &
PHYSIX_PID=$!
PIDS+=("$PHYSIX_PID")
wait_healthy physix 8002 "$PHYSIX_PID" 300

# BUGFIX: default PORT here too. The script runs under `set -u`, so a
# bare ${PORT} would abort the boot right before the exec whenever the
# platform doesn't export PORT (the exec below already defaulted it).
echo "[entrypoint] step 3/3 — both vLLMs healthy; starting uvicorn on :${PORT:-7860}"
# `exec` so uvicorn becomes PID 1's foreground job and HF Spaces sees
# our process as healthy. The trap above forwards termination back to
# the vLLM children when the Space is paused.
exec python3 -m uvicorn _space_app:app \
  --host 0.0.0.0 --port "${PORT:-7860}" \
  --log-level info