physix-infer / entrypoint.sh
Re-create physix-infer: sequential vLLM boot, gpu_mem 0.40 each, python3 fix
#!/usr/bin/env bash
# Boot two vLLM processes + the FastAPI proxy, all in one container.
#
# Lifecycle:
# 1. Launch vLLM(qwen) on :8001, wait until /health returns 200.
# 2. THEN launch vLLM(physix) on :8002, wait until /health returns 200.
# (Sequential, not parallel — see below.)
# 3. Exec the uvicorn proxy as PID 1. When HF Spaces sends SIGTERM at
#    sleep time, uvicorn (now PID 1) exits cleanly, and the kernel then
#    SIGKILLs the leftover vLLM processes as the container's PID
#    namespace is torn down. (The signal trap below only covers the
#    boot window; see the note above the final exec.)
#
# Why sequential, not parallel:
# The first deploy attempt booted both vLLMs in parallel and the second
# one died with `ValueError: No available memory for the cache blocks.
# Try increasing gpu_memory_utilization`. Cause: vLLM reads the GPU's
# *currently free* memory at startup and then reserves
# `--gpu-memory-utilization * (free at this moment)`. When two processes
# start simultaneously, both read "all 24 GB free" and both try to grab
# ~10 GB; whichever one finalises last loses, because by then there's
# only ~10-12 GB actually free.
#
# Sequential boot makes the second vLLM observe the post-first-process
# free memory, so its allocation gets sized correctly.
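#
# Timeline of that failure (illustrative, reconstructing the numbers
# above; not measured):
#   t0: process A reads "24 GB free", budgets ~10 GB for itself
#   t0: process B reads the same "24 GB free", budgets the same ~10 GB
#   t1: A loads ~6 GB of weights; real free memory keeps shrinking
#   t2: B finalises its cache blocks against the stale t0 reading and
#       dies with the ValueError above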
#
# Why --gpu-memory-utilization 0.40 each (= 80% total reserved):
# On L4 (24 GB), 0.40 * 24 ≈ 9.6 GB per process. Qwen2.5-3B fp16 weights
# are ~6.2 GB; that leaves ~3.4 GB for KV cache + activations, which
# sustains max_model_len=4096 with comfortable margin. The 20% reserve
# covers the CUDA workspace + Python/uvicorn heap + the second vLLM's
# own ~600 MB CUDA context overhead. We deliberately do NOT push to
# 0.45 each — the previous deploy proved the residual headroom isn't
# there once both contexts coexist.
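#
# The same budget as a ledger (illustrative, reusing the figures above):
#   reserved per vLLM process:   0.40 * 24 GB     ≈ 9.6 GB
#     fp16 weights (~3.1 B params * 2 bytes)      ≈ 6.2 GB
#     KV cache + activations (the remainder)      ≈ 3.4 GB
#   left unreserved:             0.20 * 24 GB     ≈ 4.8 GB
#     covers 2x ~600 MB CUDA contexts, workspaces, host processes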
set -euo pipefail
QWEN_MODEL="${QWEN_MODEL:-Qwen/Qwen2.5-3B-Instruct}"
PHYSIX_MODEL="${PHYSIX_MODEL:-Pratyush-01/physix-3b-rl}"
QWEN_GPU_FRAC="${QWEN_GPU_FRAC:-0.40}"
PHYSIX_GPU_FRAC="${PHYSIX_GPU_FRAC:-0.40}"
# 4096 is enough for the PhysiX prompt (~1500 tok) + completion (~512 tok)
# with comfortable headroom, and tightening it materially shrinks the KV
# cache footprint vs vLLM's default of model.max_position_embeddings
# (32k for Qwen2.5).
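# Per-sequence KV math behind that (assumes Qwen2.5-3B's published
# config: 36 layers, 2 KV heads via GQA, head_dim 128, fp16 cache):
#   bytes/token = 2 (K+V) * 36 * 2 * 128 * 2 B ≈ 36 KiB
#   a full 4096-token sequence needs ~144 MiB of KV cache;
#   a full 32k-token sequence needs ~1.1 GiB, a third of the ~3.4 GB
#   left after weights.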
MAX_LEN="${MAX_LEN:-4096}"
# HF Spaces injects PORT at runtime (7860 is the Spaces default); give it
# a fallback so `set -u` doesn't abort the script when run outside Spaces.
PORT="${PORT:-7860}"
LOG_DIR=/tmp/logs
mkdir -p "$LOG_DIR"
# Track child PIDs so the signal trap can terminate them all on
# SIGTERM/SIGINT. HF Spaces sends SIGTERM when pausing the Space.
PIDS=()
cleanup() {
echo "[entrypoint] SIGTERM/SIGINT — killing children: ${PIDS[*]:-}" >&2
for pid in "${PIDS[@]:-}"; do
kill -TERM "$pid" 2>/dev/null || true
done
wait || true
exit 0
}
trap cleanup TERM INT
wait_healthy() {
local name="$1" port="$2" pid="$3" budget="${4:-480}"
# Mark the start so the "healthy after Ns" message reports this wait's
# duration; SECONDS alone keeps counting across the first model's boot.
local start=$SECONDS
local deadline=$((start + budget))
while (( SECONDS < deadline )); do
# If the child died, surface its log and bail out — silently
# waiting forever for a corpse is the worst failure mode.
if ! kill -0 "$pid" 2>/dev/null; then
echo "[entrypoint] FATAL: $name (pid $pid) died during boot. Tail of log:" >&2
tail -n 80 "$LOG_DIR/vllm-${name}.log" >&2 || true
return 1
fi
if curl -fsS "http://127.0.0.1:${port}/health" >/dev/null 2>&1; then
echo "[entrypoint] $name healthy on :$port (after $((SECONDS - start))s)"
return 0
fi
sleep 5
done
echo "[entrypoint] FATAL: $name failed to become healthy in ${budget}s" >&2
tail -n 80 "$LOG_DIR/vllm-${name}.log" >&2 || true
return 1
}
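# Optional diagnostic, assuming `nvidia-smi` is injected by the NVIDIA
# container runtime on GPU Spaces (the guard makes it a no-op anywhere
# it is not). Logging free VRAM right before each vLLM boot turns a
# future memory regression into a readable log line instead of a bare
# cache-blocks crash.
log_free_vram() {
command -v nvidia-smi >/dev/null 2>&1 || return 0
local free_mib
free_mib=$(nvidia-smi --query-gpu=memory.free --format=csv,noheader,nounits | head -n1)
echo "[entrypoint] free VRAM: ${free_mib} MiB" >&2
}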
echo "[entrypoint] step 1/3 — booting vLLM(qwen) = $QWEN_MODEL on :8001 (gpu=${QWEN_GPU_FRAC})"
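log_free_vram  # pre-boot VRAM snapshot (helper above; no-op without nvidia-smi)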
# The vllm/vllm-openai base image ships only `python3` (no `python`
# symlink), so invoke python3 explicitly; using `python -m vllm ...`
# here cost us a full failed deploy on the first attempt.
python3 -m vllm.entrypoints.openai.api_server \
--model "$QWEN_MODEL" \
--served-model-name "$QWEN_MODEL" \
--host 0.0.0.0 --port 8001 \
--gpu-memory-utilization "$QWEN_GPU_FRAC" \
--max-model-len "$MAX_LEN" \
--dtype auto \
--disable-log-requests \
> "$LOG_DIR/vllm-qwen.log" 2>&1 &
QWEN_PID=$!
PIDS+=("$QWEN_PID")
wait_healthy qwen 8001 "$QWEN_PID"
echo "[entrypoint] step 2/3 — booting vLLM(physix) = $PHYSIX_MODEL on :8002 (gpu=${PHYSIX_GPU_FRAC})"
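log_free_vram  # expect roughly 24 GB minus the ~9.6 GB qwen already reserved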
python3 -m vllm.entrypoints.openai.api_server \
--model "$PHYSIX_MODEL" \
--served-model-name "$PHYSIX_MODEL" \
--host 0.0.0.0 --port 8002 \
--gpu-memory-utilization "$PHYSIX_GPU_FRAC" \
--max-model-len "$MAX_LEN" \
--dtype auto \
--disable-log-requests \
> "$LOG_DIR/vllm-physix.log" 2>&1 &
PHYSIX_PID=$!
PIDS+=("$PHYSIX_PID")
wait_healthy physix 8002 "$PHYSIX_PID"
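# Manual smoke test once both backends are up (hypothetical prompt;
# /v1/completions is the stock OpenAI-compatible route vLLM serves):
#   curl -s http://127.0.0.1:8002/v1/completions \
#     -H 'Content-Type: application/json' \
#     -d "{\"model\": \"$PHYSIX_MODEL\", \"prompt\": \"F = 6 N, m = 2 kg. a?\", \"max_tokens\": 32}"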
echo "[entrypoint] step 3/3 — both vLLMs healthy; starting proxy on :${PORT}"
# `exec` replaces this shell with uvicorn, so uvicorn inherits PID 1 and
# receives HF Spaces' SIGTERM directly when the Space is paused. Note
# that the trap above does NOT survive the exec; it only protects the
# boot phase. After the exec, the vLLM children are still cleaned up:
# when PID 1 exits, the kernel SIGKILLs every process remaining in the
# container's PID namespace.
exec python3 -m uvicorn proxy:app \
--host 0.0.0.0 --port "${PORT}" \
--log-level info