#!/usr/bin/env bash
#
# Boot two vLLM processes + the FastAPI proxy, all in one container.
#
# Lifecycle:
#   1. Launch vLLM(qwen) on :8001, wait until /health returns 200.
#   2. THEN launch vLLM(physix) on :8002, wait until /health returns 200.
#      (Sequential, not parallel — see below.)
#   3. Exec uvicorn proxy in the foreground (PID 1). When HF Spaces sends
#      SIGTERM at sleep time, uvicorn shuts down cleanly and the vLLM
#      children go down with the container; the signal trap below covers
#      SIGTERM/SIGINT only during the boot phase, before the exec.
#
# Why sequential, not parallel:
# The first deploy attempt booted both vLLMs in parallel and the second
# one died with `ValueError: No available memory for the cache blocks.
# Try increasing gpu_memory_utilization`. Cause: vLLM reads the GPU's
# *currently free* memory at startup and then reserves
# `--gpu-memory-utilization * (free at this moment)`. When two processes
# start simultaneously, both read "all 24 GB free" and both try to grab
# ~10 GB; whichever one finalises last loses, because by then there's
# only ~10-12 GB actually free.
#
# Sequential boot makes the second vLLM observe the post-first-process
# free memory, so its allocation gets sized correctly.
#
# Why --gpu-memory-utilization 0.40 each (= 80% total reserved):
# On L4 (24 GB), 0.40 * 24 ≈ 9.6 GB per process. Qwen2.5-3B fp16 weights
# are ~6.2 GB; that leaves ~3.4 GB for KV cache + activations, which
# sustains max_model_len=4096 with comfortable margin. The 20% reserve
# covers the CUDA workspace + Python/uvicorn heap + the second vLLM's
# own ~600 MB CUDA context overhead. We deliberately do NOT push to
# 0.45 each — the previous deploy proved the residual headroom isn't
# there once both contexts coexist.
set -euo pipefail

QWEN_MODEL="${QWEN_MODEL:-Qwen/Qwen2.5-3B-Instruct}"
PHYSIX_MODEL="${PHYSIX_MODEL:-Pratyush-01/physix-3b-rl}"
QWEN_GPU_FRAC="${QWEN_GPU_FRAC:-0.40}"
PHYSIX_GPU_FRAC="${PHYSIX_GPU_FRAC:-0.40}"
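# The defaults above can be overridden per deployment (Space Variables in
# the Settings tab, or `-e` flags when running locally). Illustrative local
# run only; the image name is a placeholder, not the real tag:
#   docker run --gpus all -e PORT=7860 -p 7860:7860 \
#     -e QWEN_GPU_FRAC=0.35 -e MAX_LEN=2048 <image>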
# 4096 is enough for the PhysiX prompt (~1500 tok) + completion (~512 tok)
# with comfortable headroom, and tightening it materially shrinks the KV
# cache footprint vs vLLM's default of model.max_position_embeddings
# (32k for Qwen2.5).
MAX_LEN="${MAX_LEN:-4096}"
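# Rough per-sequence KV-cache arithmetic behind that choice (illustrative
# only; the layer/head figures are assumptions about Qwen2.5-3B, so verify
# against the model's config.json): with ~36 layers, 2 KV heads of head_dim
# 128, and fp16 cache,
#   bytes per token ≈ 2 (K+V) * 36 * 2 * 128 * 2 ≈ 36 KB
# so one full 4096-token sequence costs ~0.15 GB of cache, versus ~1.2 GB
# for a single 32k-token sequence at the default length.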
LOG_DIR=/tmp/logs
mkdir -p "$LOG_DIR"

# Track child PIDs so the signal trap can terminate them all if SIGTERM/
# SIGINT arrives during boot. HF Spaces sends SIGTERM when pausing the
# Space; see the note at the exec below for what happens after boot.
PIDS=()

cleanup() {
  echo "[entrypoint] SIGTERM/SIGINT — killing children: ${PIDS[*]:-}" >&2
  for pid in "${PIDS[@]:-}"; do
    kill -TERM "$pid" 2>/dev/null || true
  done
  wait || true
  exit 0
}
trap cleanup TERM INT
wait_healthy() {
  local name="$1" port="$2" pid="$3" budget="${4:-480}"
  local deadline=$((SECONDS + budget))
  while (( SECONDS < deadline )); do
    # If the child died, surface its log and bail out — silently
    # waiting forever for a corpse is the worst failure mode.
    if ! kill -0 "$pid" 2>/dev/null; then
      echo "[entrypoint] FATAL: $name (pid $pid) died during boot. Tail of log:" >&2
      tail -n 80 "$LOG_DIR/vllm-${name}.log" >&2 || true
      return 1
    fi
    if curl -fsS "http://127.0.0.1:${port}/health" >/dev/null 2>&1; then
      echo "[entrypoint] $name healthy on :$port (after ${SECONDS}s)"
      return 0
    fi
    sleep 5
  done
  echo "[entrypoint] FATAL: $name failed to become healthy in ${budget}s" >&2
  tail -n 80 "$LOG_DIR/vllm-${name}.log" >&2 || true
  return 1
}
| echo "[entrypoint] step 1/3 — booting vLLM(qwen) = $QWEN_MODEL on :8001 (gpu=${QWEN_GPU_FRAC})" | |
| # vllm/vllm-openai base image ships only `python3` (no `python` symlink), | |
| # so use python3 explicitly. Using `python -m vllm...` here cost us a | |
| # full failed deploy on first try. | |
| python3 -m vllm.entrypoints.openai.api_server \ | |
| --model "$QWEN_MODEL" \ | |
| --served-model-name "$QWEN_MODEL" \ | |
| --host 0.0.0.0 --port 8001 \ | |
| --gpu-memory-utilization "$QWEN_GPU_FRAC" \ | |
| --max-model-len "$MAX_LEN" \ | |
| --dtype auto \ | |
| --disable-log-requests \ | |
| > "$LOG_DIR/vllm-qwen.log" 2>&1 & | |
| QWEN_PID=$! | |
| PIDS+=("$QWEN_PID") | |
| wait_healthy qwen 8001 "$QWEN_PID" | |
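# Optional sanity check before step 2 (a sketch, assuming `nvidia-smi` is
# present in the runtime image, which is not guaranteed; hence the guard):
# log the VRAM still free now that the first vLLM has reserved its share,
# i.e. the pool the second process will see at startup.
if command -v nvidia-smi >/dev/null 2>&1; then
  free_mib="$(nvidia-smi --query-gpu=memory.free --format=csv,noheader,nounits | head -n1 || true)"
  echo "[entrypoint] free VRAM before physix boot: ${free_mib:-unknown} MiB"
fi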
| echo "[entrypoint] step 2/3 — booting vLLM(physix) = $PHYSIX_MODEL on :8002 (gpu=${PHYSIX_GPU_FRAC})" | |
| python3 -m vllm.entrypoints.openai.api_server \ | |
| --model "$PHYSIX_MODEL" \ | |
| --served-model-name "$PHYSIX_MODEL" \ | |
| --host 0.0.0.0 --port 8002 \ | |
| --gpu-memory-utilization "$PHYSIX_GPU_FRAC" \ | |
| --max-model-len "$MAX_LEN" \ | |
| --dtype auto \ | |
| --disable-log-requests \ | |
| > "$LOG_DIR/vllm-physix.log" 2>&1 & | |
| PHYSIX_PID=$! | |
| PIDS+=("$PHYSIX_PID") | |
| wait_healthy physix 8002 "$PHYSIX_PID" | |
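# Optional smoke test (a sketch; /v1/models is part of vLLM's OpenAI-compatible
# API, but whether the proxy needs this extra check is an assumption):
# confirm each backend answers on its OpenAI route before taking traffic.
for backend_port in 8001 8002; do
  curl -fsS "http://127.0.0.1:${backend_port}/v1/models" >/dev/null \
    || echo "[entrypoint] WARNING: /v1/models not answering on :${backend_port}" >&2
done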
| echo "[entrypoint] step 3/3 — both vLLMs healthy; starting proxy on :${PORT}" | |
| # `exec` so uvicorn becomes PID 1's foreground job and HF Spaces sees | |
| # our process as healthy. The trap above forwards termination back to | |
| # the vLLM children when the Space is paused. | |
| exec python3 -m uvicorn proxy:app \ | |
| --host 0.0.0.0 --port "${PORT}" \ | |
| --log-level info | |