# (HF Spaces dashboard status residue — "Spaces: Sleeping" — captured by the
# copy/paste; not part of the entrypoint script itself.)
# Boot the two in-Space vLLMs SEQUENTIALLY, then exec uvicorn for the
# physix FastAPI server (which serves the API + the React SPA).
#
# Why sequential, not parallel:
#   The first deploy attempt booted both vLLMs in parallel and the second
#   one died with "No available memory for the cache blocks." Reason: vLLM
#   reads `nvidia-smi`-style free memory at startup and reserves
#   `--gpu-memory-utilization * (free at this moment)` worth of VRAM.
#   When two processes start simultaneously, both see "all 24 GB free" and
#   both try to grab ~10 GB; the second one to finalize loses. Booting
#   sequentially makes the second one observe the post-first-process free
#   memory, so its allocation is sized correctly.
#
# Why --gpu-memory-utilization 0.40 each (= 80% total):
#   On L4 (24 GB), 40% = ~9.6 GB per process. Qwen2.5-3B fp16 weights are
#   ~6.2 GB; that leaves ~3.4 GB per process for KV cache + activations,
#   which sustains max_model_len=4096 with comfortable margin. The 20%
#   reserve covers CUDA workspace + uvicorn + Python heap. Pushing this
#   much higher (e.g. 0.45 each) is what failed on the first deploy
#   because once you account for the ~600 MB CUDA context + the second
#   process's overhead, weights+KV no longer fit.
set -euo pipefail

# Every knob below is overridable from the Space's environment settings;
# `: "${VAR:=default}"` assigns only when the variable is unset/empty.
: "${QWEN_MODEL:=Qwen/Qwen2.5-3B-Instruct}"
: "${PHYSIX_MODEL:=Pratyush-01/physix-3b-rl}"
: "${QWEN_GPU_FRAC:=0.40}"
: "${PHYSIX_GPU_FRAC:=0.40}"
: "${MAX_LEN:=4096}"

LOG_DIR=/tmp/logs
mkdir -p "$LOG_DIR"

# Forward signals so HF's "Pause" / "Restart" actually shuts everything
# down cleanly — otherwise CUDA memory leaks across container restarts.
PIDS=()
# Fan the termination signal out to every child we spawned, reap them all,
# then exit 0 so the container stops cleanly.
cleanup() {
  echo "[entrypoint] SIGTERM/SIGINT — killing children: ${PIDS[*]:-}" >&2
  local child
  for child in "${PIDS[@]:-}"; do
    kill -TERM "$child" 2>/dev/null || true
  done
  wait || true
  exit 0
}
trap cleanup TERM INT
# Poll until the service answers its /health endpoint, failing fast if the
# backing process dies or the time budget runs out.
#   $1 name (also the log-file stem under $LOG_DIR)
#   $2 port to probe on 127.0.0.1
#   $3 pid to watch
#   $4 budget in seconds (default 300)
# Returns 0 once healthy, 1 on death or timeout (after dumping the log tail).
wait_healthy() {
  local svc="$1" svc_port="$2" svc_pid="$3" limit="${4:-300}"
  local stop_at=$((SECONDS + limit))
  until (( SECONDS >= stop_at )); do
    # A dead process will never become healthy — bail immediately.
    if ! kill -0 "$svc_pid" 2>/dev/null; then
      echo "[entrypoint] FATAL: $svc (pid $svc_pid) died during boot. Tail of log:" >&2
      tail -n 80 "$LOG_DIR/${svc}.log" >&2 || true
      return 1
    fi
    if curl -fsS "http://127.0.0.1:${svc_port}/health" >/dev/null 2>&1; then
      echo "[entrypoint] $svc healthy on :$svc_port (after ${SECONDS}s)"
      return 0
    fi
    sleep 5
  done
  echo "[entrypoint] FATAL: $svc failed to become healthy in ${limit}s" >&2
  tail -n 80 "$LOG_DIR/${svc}.log" >&2 || true
  return 1
}
# Launch one OpenAI-compatible vLLM server in the background.
#   $1 log-file stem   $2 port   $3 model id   $4 gpu-memory fraction
# Side effects: appends the child's PID to PIDS (for the signal trap) and
# publishes it in the global VLLM_PID. Deliberately NOT called via $(…):
# command substitution would run this in a subshell and the PIDS+= update
# would be lost, so cleanup() could never kill the child.
start_vllm() {
  local stem="$1" port="$2" model="$3" frac="$4"
  # vllm/vllm-openai image only ships `python3` — no `python` symlink.
  python3 -m vllm.entrypoints.openai.api_server \
    --model "$model" \
    --served-model-name "$model" \
    --host 0.0.0.0 --port "$port" \
    --gpu-memory-utilization "$frac" \
    --max-model-len "$MAX_LEN" \
    --dtype auto \
    --disable-log-requests \
    > "$LOG_DIR/${stem}.log" 2>&1 &
  VLLM_PID=$!
  PIDS+=("$VLLM_PID")
}

echo "[entrypoint] step 1/3 — booting vLLM(qwen) = $QWEN_MODEL on :8001 (gpu=${QWEN_GPU_FRAC})"
start_vllm qwen 8001 "$QWEN_MODEL" "$QWEN_GPU_FRAC"
QWEN_PID=$VLLM_PID
wait_healthy qwen 8001 "$QWEN_PID" 300

echo "[entrypoint] step 2/3 — booting vLLM(physix) = $PHYSIX_MODEL on :8002 (gpu=${PHYSIX_GPU_FRAC})"
start_vllm physix 8002 "$PHYSIX_MODEL" "$PHYSIX_GPU_FRAC"
PHYSIX_PID=$VLLM_PID
wait_healthy physix 8002 "$PHYSIX_PID" 300
# HF Spaces injects $PORT; default to 7860 (the Spaces convention) for
# local runs. Resolve it ONCE before use: the original referenced bare
# ${PORT} in the echo below, which aborts under `set -u` when PORT is
# unset even though the uvicorn line defaulted it.
PORT="${PORT:-7860}"
echo "[entrypoint] step 3/3 — both vLLMs healthy; starting uvicorn on :${PORT}"
# `exec` so uvicorn becomes PID 1's foreground job and HF Spaces sees
# our process as healthy. The trap above forwards termination back to
# the vLLM children when the Space is paused.
exec python3 -m uvicorn _space_app:app \
  --host 0.0.0.0 --port "$PORT" \
  --log-level info