Pratyush-01 committed (verified)
Commit d2b2154 · 1 Parent(s): 08f8699

Upload folder using huggingface_hub
Dockerfile CHANGED
@@ -1,16 +1,32 @@
- # PhysiX-Live env Space — FastAPI server + built React UI on port 7860.
  #
- # Two-stage build:
- #   1. node:20 builds the Vite/React frontend into frontend/dist with
- #      same-origin API base URL (VITE_PHYSIX_API_URL=""), so the SPA
- #      fetches /interactive/* relative to the Space's own host.
- #   2. python:3.11-slim installs physix as an editable package and serves
- #      both the FastAPI routes (/reset, /step, /interactive/*) AND the
- #      built SPA as static assets from a single uvicorn process.
  #
- # We deliberately do NOT bundle the training stack (torch/unsloth/trl);
- # this is the env Space, not the training Space. Training lives in
- # `train/` and runs on HF Jobs; see train/README.md.

  ############################
  # Stage 1: build the SPA
@@ -19,35 +35,27 @@ FROM node:20-alpine AS frontend
  WORKDIR /build
  RUN corepack enable

- # Install deps separately from sources so layer cache survives source edits.
  COPY frontend/package.json frontend/pnpm-lock.yaml ./
  RUN pnpm install --frozen-lockfile --silent

  COPY frontend/ ./
  # Same-origin API fetches (relative paths). The Space serves both API and UI.
  ENV VITE_PHYSIX_API_URL=""
- # Cache-bust: HF Spaces' BuildKit occasionally reuses the previous
- # image's stage-1 output even when frontend/ source changed (the layer
- # hash is keyed on more than just file content). Bumping this comment
- # is the documented workaround — increment when you push a UI change
- # and the Space is still serving the previous SPA bundle hash.
- # physix-spa-rebuild: 2
- # Run typecheck + vite build separately so we can pass --base=/web/ to
- # vite without it landing on tsc. The SPA is mounted at /web/ in the
- # Space and the redirect from / -> /web/ is registered by the wrapper
- # in _space_app.py below. All asset URLs in the built index.html
- # include the /web/ prefix.
  RUN pnpm exec tsc -b \
      && pnpm exec vite build --base=/web/

  ############################
- # Stage 2: runtime
  ############################
- FROM python:3.11-slim AS runtime

- # HF Spaces convention: write everything under /tmp (only writable path
- # at runtime). The container also runs as UID 1000 with no /etc/passwd
- # entry, so set USER/HOME so getpass.getuser() and pathlib don't crash.
  ENV PYTHONUNBUFFERED=1 \
      PIP_NO_CACHE_DIR=1 \
      PIP_DISABLE_PIP_VERSION_CHECK=1 \
@@ -56,103 +64,73 @@ ENV PYTHONUNBUFFERED=1 \
      LOGNAME=physix \
      HF_HOME=/tmp/hf_cache \
      XDG_CACHE_HOME=/tmp/xdg-cache \
      PORT=7860 \
      PHYSIX_HOST=0.0.0.0 \
      PHYSIX_CORS_ORIGINS=*

- WORKDIR /app
-
- # System deps: build-essential briefly for any pip wheels that need it
- # (sympy/scipy ship wheels for linux_x86_64 so this is mostly a safety
- # net), curl for the healthcheck.
  RUN apt-get update \
      && apt-get install -y --no-install-recommends curl \
      && rm -rf /var/lib/apt/lists/*

- # Install python deps separately from sources for cache friendliness.
- COPY pyproject.toml ./
- RUN pip install --upgrade pip \
-     && pip install \
      "openenv-core[core]>=0.2.2" \
      "numpy>=1.24" \
      "scipy>=1.10" \
      "sympy>=1.12" \
-     "fastapi>=0.110" \
-     "uvicorn>=0.29" \
-     "pydantic>=2.5" \
      "requests>=2.31"

- # Install physix (no-deps so we don't re-resolve the stack we just installed).
  COPY physix ./physix
  COPY README.md ./
  RUN pip install --no-deps -e .

  COPY --from=frontend /build/dist /app/static

- # Space wrapper: re-exports physix.server.app:app with the React SPA
- # mounted at /web/ (overriding OpenEnv's default Gradio UI). The base
- # /health endpoint provided by OpenEnv is reused for the Docker
- # HEALTHCHECK below; we don't need to add a custom one.
- #
- # Kept in the Dockerfile rather than physix/server/app.py so the package
- # stays UI-free for the cloud training jobs that import it.
- RUN cat > /app/_space_app.py <<'PY'
- """Space entrypoint: physix.server.app:app + static UI mount."""
-
- from pathlib import Path
-
- from fastapi.responses import RedirectResponse
- from fastapi.staticfiles import StaticFiles
-
- from physix.server.app import app
-
- _STATIC_DIR = Path("/app/static")
-
-
- # OpenEnv's `create_fastapi_app` (which physix.server.app uses directly)
- # does NOT register a `/` redirect — that's only added by the higher-level
- # `create_web_interface_app` wrapper, which we deliberately don't use
- # because it would mount Gradio at /web and clobber our React SPA. So the
- # redirects have to live here, otherwise:
- #
- # * `https://<space>.hf.space/` -> 404
- # * `https://<space>.hf.space/web` -> 404 (no trailing slash)
- #
- # Both are landing pages users hit (the bare URL is the canonical Space
- # link the HF UI exposes), so without these the Space appears blank.
- @app.get("/", include_in_schema=False)
- async def _root_redirect() -> RedirectResponse:
-     return RedirectResponse(url="/web/")
-
-
- @app.get("/web", include_in_schema=False)
- async def _web_no_slash_redirect() -> RedirectResponse:
-     return RedirectResponse(url="/web/")
-
-
- if _STATIC_DIR.is_dir():
-     # html=True makes StaticFiles serve index.html for directory hits and
-     # fall back to it for unknown sub-paths (so client-side React routing
-     # works). Mounted last so registered API routes (/web/metadata,
-     # /web/reset, /web/step from OpenEnv; /interactive/* from physix)
-     # always win. The vite build was run with --base=/web/ so asset
-     # URLs in index.html already include the prefix.
-     app.mount(
-         "/web",
-         StaticFiles(directory=str(_STATIC_DIR), html=True),
-         name="ui",
-     )
- PY
-
- # Pre-create writable dirs so the first request doesn't crash on a
- # missing cache path.
- RUN mkdir -p "$HOME" "$HF_HOME" "$XDG_CACHE_HOME" \
-     && chmod -R 0777 /tmp/home /tmp/hf_cache /tmp/xdg-cache /app

  EXPOSE 7860

- HEALTHCHECK --interval=30s --timeout=5s --start-period=20s --retries=3 \
  CMD curl -fsS "http://127.0.0.1:${PORT}/health" || exit 1

  ENV ENABLE_WEB_INTERFACE=true
- CMD ["sh", "-c", "uvicorn _space_app:app --host 0.0.0.0 --port ${PORT:-7860} --log-level info"]
 
+ # PhysiX-Live Space — combined env + UI + dual-model GPU inference.
  #
+ # Single L4 Space hosts EVERYTHING the demo needs:
  #
+ #   :8001  vllm serve Qwen/Qwen2.5-3B-Instruct  (--gpu-memory-util 0.40)
+ #   :8002  vllm serve Pratyush-01/physix-3b-rl  (--gpu-memory-util 0.40)
+ #   :7860  uvicorn physix.server.app:app
+ #          ├─ /reset, /step      (OpenEnv stateless API)
+ #          ├─ /interactive/*     (browser session API)
+ #          ├─ /web/              (built React SPA)
+ #          └─ /interactive/.../llm-step  (LLM-driven episode)
+ #             └─ when base_url=local://router, dispatches by `model`
+ #                to one of the localhost vLLMs above. The browser
+ #                never sees those ports — that prevents anyone from
+ #                bypassing the demo to run free GPU calls.
+ #
+ # Why one Space, not two:
+ #   * Same sleep timer — when the demo sleeps, the GPU sleeps. No "demo
+ #     is awake but inference is asleep" UX gap.
+ #   * No CORS between SPA and inference — same origin.
+ #   * One thing to babysit, one URL to share.
+ #
+ # Why vllm/vllm-openai as the base, not nvidia/cuda:
+ #   * vLLM ships pre-compiled CUDA kernels for a specific cu+torch combo.
+ #     Building from scratch on cu12.4 means recompiling vLLM (~20 min,
+ #     fragile across minor versions). The official image guarantees the
+ #     ABI is right out of the box.
+ #   * Already includes Python 3.12, torch, nccl, and the FastAPI
+ #     dependencies — we just layer the physix app + frontend on top.

  ############################
  # Stage 1: build the SPA
  ############################
  FROM node:20-alpine AS frontend
  WORKDIR /build
  RUN corepack enable

  COPY frontend/package.json frontend/pnpm-lock.yaml ./
  RUN pnpm install --frozen-lockfile --silent

  COPY frontend/ ./
  # Same-origin API fetches (relative paths). The Space serves both API and UI.
  ENV VITE_PHYSIX_API_URL=""
+ # Cache-bust marker. Bump when an SPA change isn't taking on the Space —
+ # HF BuildKit occasionally reuses stage-1 output even when sources changed.
+ # physix-spa-rebuild: 3
  RUN pnpm exec tsc -b \
      && pnpm exec vite build --base=/web/

  ############################
+ # Stage 2: runtime (vLLM + physix server + SPA)
  ############################
+ FROM vllm/vllm-openai:v0.7.3 AS runtime
+
+ # vllm/vllm-openai sets ENTRYPOINT to `python3 -m vllm.entrypoints.openai.api_server`.
+ # We need our own multi-process supervisor, so reset it.
+ ENTRYPOINT []

  ENV PYTHONUNBUFFERED=1 \
      PIP_NO_CACHE_DIR=1 \
      PIP_DISABLE_PIP_VERSION_CHECK=1 \
      LOGNAME=physix \
      HF_HOME=/tmp/hf_cache \
      XDG_CACHE_HOME=/tmp/xdg-cache \
+     VLLM_CACHE_ROOT=/tmp/vllm_cache \
+     TORCH_HOME=/tmp/torch_cache \
+     TRITON_CACHE_DIR=/tmp/triton_cache \
      PORT=7860 \
      PHYSIX_HOST=0.0.0.0 \
      PHYSIX_CORS_ORIGINS=*

+ # Need curl for healthchecks; the vLLM image is Python-only, so apt is fine.
  RUN apt-get update \
      && apt-get install -y --no-install-recommends curl \
      && rm -rf /var/lib/apt/lists/*

+ WORKDIR /app
+
+ # Physix backend deps. The vLLM image already has fastapi/uvicorn/pydantic
+ # transitively (vllm depends on them), so this is just the small physics
+ # stack plus openenv-core.
+ #
+ # IMPORTANT: install with --no-build-isolation if you ever switch to a
+ # package that needs torch at build time — you do NOT want pip to try
+ # rebuilding torch in this image.
+ RUN pip install \
      "openenv-core[core]>=0.2.2" \
      "numpy>=1.24" \
      "scipy>=1.10" \
      "sympy>=1.12" \
+     "openai>=1.40" \
      "requests>=2.31"

+ # Install physix as an editable package. --no-deps because we just
+ # installed the runtime stack above; pyproject's deps would reinstall
+ # pinned versions and likely conflict with vLLM's torch.
+ COPY pyproject.toml ./
  COPY physix ./physix
  COPY README.md ./
  RUN pip install --no-deps -e .

+ # Built SPA from stage 1.
  COPY --from=frontend /build/dist /app/static

+ # Space wrapper: mounts the React SPA at /web/ and registers the `/` -> `/web/`
+ # redirect (OpenEnv's create_fastapi_app doesn't do this for us). Same
+ # pattern as the previous CPU-only build, just kept in a real file now
+ # instead of a heredoc so syntax errors are caught at build time.
+ COPY scripts/space_app.py /app/_space_app.py
+
+ # Supervisor entrypoint that boots the two vLLMs sequentially (avoids
+ # the CUDA memory race we hit on the first push), then execs uvicorn.
+ COPY scripts/space_entrypoint.sh /app/entrypoint.sh
+ RUN chmod +x /app/entrypoint.sh
+
+ # Pre-create writable dirs. HF Spaces runs containers as a non-root UID
+ # with no /etc/passwd entry, so all cache paths under $HOME must exist
+ # and be world-writable BEFORE the runtime user shows up.
+ RUN mkdir -p \
      "$HOME" "$HF_HOME" "$XDG_CACHE_HOME" \
      "$VLLM_CACHE_ROOT" "$TORCH_HOME" "$TRITON_CACHE_DIR" \
      /tmp/logs \
      && chmod -R 0777 /tmp /app

  EXPOSE 7860

+ # /health is OpenEnv's stock endpoint and returns 200 once uvicorn binds.
+ # We give a generous start-period because vLLM cold-load + frontend serve
+ # can take up to ~150 s on first boot.
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=240s --retries=3 \
  CMD curl -fsS "http://127.0.0.1:${PORT}/health" || exit 1

  ENV ENABLE_WEB_INTERFACE=true
+ CMD ["/app/entrypoint.sh"]
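
A minimal sketch of the `local://router` dispatch described in the header comment, for orientation. Only the LOCAL_VLLM_PORTS name and the two model/port pairs appear in this commit (physix/server/providers.py is referenced but not shown); the function and its signature are illustrative assumptions, not the actual implementation:

# Hypothetical sketch; names other than LOCAL_VLLM_PORTS are assumptions.
LOCAL_VLLM_PORTS = {
    "Qwen/Qwen2.5-3B-Instruct": 8001,
    "Pratyush-01/physix-3b-rl": 8002,
}

def resolve_base_url(base_url: str, model: str) -> str:
    """Rewrite the magic base URL to the in-container vLLM for `model`."""
    if base_url != "local://router":
        return base_url  # ordinary remote endpoint; pass through untouched
    if model not in LOCAL_VLLM_PORTS:
        raise ValueError(f"unknown local model: {model!r}")
    # Loopback only: these ports are never exposed on the Space's public URL.
    return f"http://127.0.0.1:{LOCAL_VLLM_PORTS[model]}/v1"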
frontend/src/lib/llmPresets.ts CHANGED
@@ -19,13 +19,17 @@ export const OLLAMA_OPENAI_BASE_URL = "http://localhost:11434/v1";
  export const PHYSIX_MODEL_ID = "Pratyush-01/physix-3b-rl";
  export const QWEN_BASE_MODEL_ID = "Qwen/Qwen2.5-3B-Instruct";

- /** Magic in-container vLLM target. The browser sends `local://<name>`
-  * in `LlmStepRequest.base_url`; the server rewrites it to the right
-  * localhost vLLM (see physix/server/providers.py::LOCAL_VLLM_ENDPOINTS).
-  * Two name buckets: "qwen" -> Qwen2.5-3B-Instruct, "physix" -> physix-3b-rl.
-  * Both are served by the same Space, on the same L4, so swapping
-  * models doesn't pay a cold-start. */
- export const LOCAL_VLLM_BASE_URL = "local://router"; // model field picks the upstream

  export type EndpointId = "ollama" | "hf" | "openai" | "custom" | "physix";

@@ -63,25 +67,25 @@ export interface Endpoint {
  export const ENDPOINTS: readonly Endpoint[] = [
    {
      id: "physix",
-     label: "PhysiX-Infer GPU (both 3B models ✦)",
-     baseUrl: PHYSIX_INFER_BASE_URL,
-     // No auth: the Space is open access, bounded by its sleep timer.
-     // Setting needsKey:false keeps the API-key field dimmed by default;
-     // power users can still type one if they put auth in front of a
-     // forked deployment.
      needsKey: false,
      modelInputMode: "freeform-with-suggestions",
-     // Both models are served by the same Space. First entry pre-fills,
-     // and is the trained model, so the comparison story is "trained vs
-     // base" with one click.
      modelSuggestions: [
        { id: PHYSIX_MODEL_ID, tag: "trained ✦" },
        { id: QWEN_BASE_MODEL_ID, tag: "base (apples-to-apples)" },
      ],
      hint:
-       "Dedicated L4 Space hosting both physix-3b-rl and Qwen2.5-3B-Instruct " +
-       "via vLLM. No token needed. Sleeps after 5 min idle — first call after " +
-       "sleep is ~90-120 s while both models load; subsequent calls are fast.",
    },
    {
      id: "ollama",
@@ -182,24 +186,23 @@ export interface LlmConnection {
    apiKey: string;
  }

- /** Default A side: trained PhysiX-3B via the dedicated GPU Space. No
-  * token needed; first call may take ~90-120 s while the Space wakes
-  * from sleep, but subsequent calls run on a hot L4. */
  export const DEFAULT_CONNECTION_A: LlmConnection = {
    endpointId: "physix",
-   baseUrl: PHYSIX_INFER_BASE_URL,
    model: PHYSIX_MODEL_ID,
    apiKey: "",
  };

- /** Default B side: the same GPU Space but pointed at the Qwen 2.5 3B
-  * base model. Apples-to-apples comparison: identical architecture,
-  * identical hardware, identical generation params; only the weights
-  * differ. Same Space means second-side wake doesn't add cold-start
-  * cost (the L4 is already warm from side A). */
  export const DEFAULT_CONNECTION_B: LlmConnection = {
    endpointId: "physix",
-   baseUrl: PHYSIX_INFER_BASE_URL,
    model: QWEN_BASE_MODEL_ID,
    apiKey: "",
  };

  export const PHYSIX_MODEL_ID = "Pratyush-01/physix-3b-rl";
  export const QWEN_BASE_MODEL_ID = "Qwen/Qwen2.5-3B-Instruct";

+ /** Magic base URL that tells the server "use the in-container vLLMs".
+  * The actual upstream is picked server-side based on the model field
+  * (see physix/server/providers.py::LOCAL_VLLM_PORTS). The browser
+  * never sees the inference ports; keeping them off the public URL
+  * prevents abuse of the L4 GPU.
+  *
+  * Both models live on the same Space + same L4, so flipping between
+  * the trained fine-tune and the Qwen baseline is instant once both
+  * are warm. First call after a cold boot still costs ~90-120 s while
+  * vLLM loads weights. */
+ export const LOCAL_VLLM_BASE_URL = "local://router";

  export type EndpointId = "ollama" | "hf" | "openai" | "custom" | "physix";

  export const ENDPOINTS: readonly Endpoint[] = [
    {
      id: "physix",
+     label: "PhysiX GPU (in-Space vLLM ✦)",
+     // `local://router` is a magic value the server recognises. The
+     // browser never talks to the inference ports directly; the
+     // physix backend dispatches to the right vLLM by the `model` field.
+     // See physix/server/providers.py::LOCAL_VLLM_PORTS.
+     baseUrl: LOCAL_VLLM_BASE_URL,
      needsKey: false,
      modelInputMode: "freeform-with-suggestions",
+     // Both models live on the same in-Space L4 vLLM. First entry
+     // pre-fills, so the default comparison is "trained vs base" with
+     // identical hardware / generation params — only the weights differ.
      modelSuggestions: [
        { id: PHYSIX_MODEL_ID, tag: "trained ✦" },
        { id: QWEN_BASE_MODEL_ID, tag: "base (apples-to-apples)" },
      ],
      hint:
+       "Both 3B models hosted on the Space's own L4 GPU via vLLM. No token. " +
+       "Space sleeps after 5 min idle — first call after sleep is ~90-120 s " +
+       "while weights load; subsequent calls are fast.",
    },
    {
      id: "ollama",

    apiKey: string;
  }

+ /** Default A side: trained PhysiX-3B served by the in-Space L4 vLLM.
+  * No token needed; first call after sleep is ~90-120 s while weights
+  * load, then fast. */
  export const DEFAULT_CONNECTION_A: LlmConnection = {
    endpointId: "physix",
+   baseUrl: LOCAL_VLLM_BASE_URL,
    model: PHYSIX_MODEL_ID,
    apiKey: "",
  };

+ /** Default B side: the same in-Space vLLM, pointed at Qwen 2.5 3B.
+  * Apples-to-apples: identical architecture, identical hardware,
+  * identical generation params; only the weights differ. Both models
+  * share the same GPU, so warming side A also warms side B. */
  export const DEFAULT_CONNECTION_B: LlmConnection = {
    endpointId: "physix",
+   baseUrl: LOCAL_VLLM_BASE_URL,
    model: QWEN_BASE_MODEL_ID,
    apiKey: "",
  };
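
For orientation, roughly what the SPA ends up sending when the "physix" endpoint is selected, sketched in Python with requests. The llm-step path is elided in this commit ("/interactive/.../llm-step" in the Dockerfile comment) and the LlmStepRequest field names are not shown, so the URL shape and payload keys below are placeholders and assumptions:

import requests

SPACE_URL = "https://your-space.hf.space"  # placeholder, not from this commit
SESSION_ID = "demo"                        # placeholder; real ids come from the session API

payload = {
    "base_url": "local://router",          # magic value; server picks the upstream
    "model": "Pratyush-01/physix-3b-rl",   # or "Qwen/Qwen2.5-3B-Instruct"
    "api_key": "",                         # needsKey: false for this endpoint
}
resp = requests.post(
    f"{SPACE_URL}/interactive/{SESSION_ID}/llm-step",  # hypothetical path shape
    json=payload,
    timeout=180,  # first call after sleep can take ~90-120 s while weights load
)
resp.raise_for_status()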
scripts/space_app.py ADDED
@@ -0,0 +1,55 @@
+ """Space entrypoint: physix.server.app:app + static UI mount.
+
+ Imported at runtime by the Dockerfile's CMD via ``uvicorn _space_app:app``.
+
+ What this wrapper adds on top of ``physix.server.app:app``:
+
+ 1. ``GET /``    -> 307 to ``/web/`` (so the bare Space URL
+    doesn't 404 — OpenEnv's ``create_fastapi_app``
+    does NOT add a root redirect; that's only in
+    the higher-level wrapper, which mounts Gradio
+    at ``/web`` and would clobber our React SPA).
+ 2. ``GET /web`` -> 307 to ``/web/`` (same reason; users hit the
+    no-trailing-slash variant from outside links).
+ 3. ``StaticFiles`` mount at ``/web/`` serving the built Vite SPA. The
+    vite build was run with ``--base=/web/`` so all asset URLs in the
+    emitted ``index.html`` already include the prefix.
+
+ Kept as a real .py file (not a heredoc inside the Dockerfile) so any
+ syntax error is caught by the build's static analysis rather than at
+ runtime — this saved several deploy-fail loops in earlier iterations.
+ """
+
+ from __future__ import annotations
+
+ from pathlib import Path
+
+ from fastapi.responses import RedirectResponse
+ from fastapi.staticfiles import StaticFiles
+
+ from physix.server.app import app
+
+ _STATIC_DIR = Path("/app/static")
+
+
+ @app.get("/", include_in_schema=False)
+ async def _root_redirect() -> RedirectResponse:
+     return RedirectResponse(url="/web/")
+
+
+ @app.get("/web", include_in_schema=False)
+ async def _web_no_slash_redirect() -> RedirectResponse:
+     return RedirectResponse(url="/web/")
+
+
+ if _STATIC_DIR.is_dir():
+     # html=True makes StaticFiles serve index.html for directory hits and
+     # fall back to it for unknown sub-paths (so client-side React routing
+     # works). Mounted last so registered API routes (/web/metadata,
+     # /web/reset, /web/step from OpenEnv; /interactive/* from physix)
+     # always win over the static handler.
+     app.mount(
+         "/web",
+         StaticFiles(directory=str(_STATIC_DIR), html=True),
+         name="ui",
+     )
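
A quick local sanity check of the wrapper above, sketched with FastAPI's TestClient. Not part of this commit; it assumes the physix package is importable (the static mount is conditional, so /app/static need not exist):

from fastapi.testclient import TestClient

from _space_app import app  # the wrapper module above

client = TestClient(app)

# Both the bare URL and /web (no trailing slash) should redirect to /web/
# rather than 404. RedirectResponse defaults to 307 Temporary Redirect.
for path in ("/", "/web"):
    r = client.get(path, follow_redirects=False)
    assert r.status_code == 307, (path, r.status_code)
    assert r.headers["location"] == "/web/"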
scripts/space_entrypoint.sh ADDED
@@ -0,0 +1,103 @@
+ #!/usr/bin/env bash
+ # Boot the two in-Space vLLMs SEQUENTIALLY, then exec uvicorn for the
+ # physix FastAPI server (which serves the API + the React SPA).
+ #
+ # Why sequential, not parallel:
+ # The first deploy attempt booted both vLLMs in parallel and the second
+ # one died with "No available memory for the cache blocks." Reason: vLLM
+ # reads `nvidia-smi`-style free memory at startup and reserves
+ # `--gpu-memory-utilization * (free at this moment)` worth of VRAM.
+ # When two processes start simultaneously, both see "all 24 GB free" and
+ # both try to grab ~10 GB; the second one to finalize loses. Booting
+ # sequentially makes the second one observe the post-first-process free
+ # memory, so its allocation is sized correctly.
+ #
+ # Why --gpu-memory-utilization 0.40 each (= 80% total):
+ # On L4 (24 GB), 40% = ~9.6 GB per process. Qwen2.5-3B fp16 weights are
+ # ~6.2 GB; that leaves ~3.4 GB per process for KV cache + activations,
+ # which sustains max_model_len=4096 with comfortable margin. The 20%
+ # reserve covers CUDA workspace + uvicorn + Python heap. Pushing this
+ # much higher (e.g. 0.45 each) is what failed on the first deploy,
+ # because once you account for the ~600 MB CUDA context + the second
+ # process's overhead, weights+KV no longer fit.
+
+ set -euo pipefail
+
+ QWEN_MODEL="${QWEN_MODEL:-Qwen/Qwen2.5-3B-Instruct}"
+ PHYSIX_MODEL="${PHYSIX_MODEL:-Pratyush-01/physix-3b-rl}"
+ QWEN_GPU_FRAC="${QWEN_GPU_FRAC:-0.40}"
+ PHYSIX_GPU_FRAC="${PHYSIX_GPU_FRAC:-0.40}"
+ MAX_LEN="${MAX_LEN:-4096}"
+
+ LOG_DIR=/tmp/logs
+ mkdir -p "$LOG_DIR"
+
+ # Forward signals so HF's "Pause" / "Restart" actually shuts everything
+ # down cleanly — otherwise CUDA memory leaks across container restarts.
+ PIDS=()
+ cleanup() {
+   echo "[entrypoint] SIGTERM/SIGINT — killing children: ${PIDS[*]:-}" >&2
+   for pid in "${PIDS[@]:-}"; do
+     kill -TERM "$pid" 2>/dev/null || true
+   done
+   wait || true
+   exit 0
+ }
+ trap cleanup TERM INT
+
+ wait_healthy() {
+   local name="$1" port="$2" pid="$3" budget="${4:-300}"
+   local deadline=$((SECONDS + budget))
+   while (( SECONDS < deadline )); do
+     if ! kill -0 "$pid" 2>/dev/null; then
+       echo "[entrypoint] FATAL: $name (pid $pid) died during boot. Tail of log:" >&2
+       tail -n 80 "$LOG_DIR/${name}.log" >&2 || true
+       return 1
+     fi
+     if curl -fsS "http://127.0.0.1:${port}/health" >/dev/null 2>&1; then
+       echo "[entrypoint] $name healthy on :$port (after ${SECONDS}s)"
+       return 0
+     fi
+     sleep 5
+   done
+   echo "[entrypoint] FATAL: $name failed to become healthy in ${budget}s" >&2
+   tail -n 80 "$LOG_DIR/${name}.log" >&2 || true
+   return 1
+ }
+
+ echo "[entrypoint] step 1/3 — booting vLLM(qwen) = $QWEN_MODEL on :8001 (gpu=${QWEN_GPU_FRAC})"
+ # vllm/vllm-openai image only ships `python3` — no `python` symlink.
+ python3 -m vllm.entrypoints.openai.api_server \
+   --model "$QWEN_MODEL" \
+   --served-model-name "$QWEN_MODEL" \
+   --host 0.0.0.0 --port 8001 \
+   --gpu-memory-utilization "$QWEN_GPU_FRAC" \
+   --max-model-len "$MAX_LEN" \
+   --dtype auto \
+   --disable-log-requests \
+   > "$LOG_DIR/qwen.log" 2>&1 &
+ QWEN_PID=$!
+ PIDS+=("$QWEN_PID")
+ wait_healthy qwen 8001 "$QWEN_PID" 300
+
+ echo "[entrypoint] step 2/3 — booting vLLM(physix) = $PHYSIX_MODEL on :8002 (gpu=${PHYSIX_GPU_FRAC})"
+ python3 -m vllm.entrypoints.openai.api_server \
+   --model "$PHYSIX_MODEL" \
+   --served-model-name "$PHYSIX_MODEL" \
+   --host 0.0.0.0 --port 8002 \
+   --gpu-memory-utilization "$PHYSIX_GPU_FRAC" \
+   --max-model-len "$MAX_LEN" \
+   --dtype auto \
+   --disable-log-requests \
+   > "$LOG_DIR/physix.log" 2>&1 &
+ PHYSIX_PID=$!
+ PIDS+=("$PHYSIX_PID")
+ wait_healthy physix 8002 "$PHYSIX_PID" 300
+
+ echo "[entrypoint] step 3/3 — both vLLMs healthy; starting uvicorn on :${PORT}"
+ # `exec` so uvicorn becomes PID 1's foreground job and HF Spaces sees
+ # our process as healthy. The trap above forwards termination back to
+ # the vLLM children when the Space is paused.
+ exec python3 -m uvicorn _space_app:app \
+   --host 0.0.0.0 --port "${PORT:-7860}" \
+   --log-level info
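
The VRAM split in the header comment can be sanity-checked with quick arithmetic; the figures below are the ones quoted in the comment (24 GB L4, 0.40 per process, ~6.2 GB fp16 weights), not fresh measurements:

# Back-of-envelope check of the 0.40 + 0.40 split (numbers from the comment).
TOTAL_VRAM_GB = 24.0   # NVIDIA L4
FRAC_PER_PROC = 0.40   # --gpu-memory-utilization per vLLM
WEIGHTS_GB = 6.2       # Qwen2.5-3B fp16 weights, per the comment

budget = TOTAL_VRAM_GB * FRAC_PER_PROC              # 9.6 GB per vLLM process
kv_and_activations = budget - WEIGHTS_GB            # ~3.4 GB for KV cache etc.
headroom = TOTAL_VRAM_GB * (1 - 2 * FRAC_PER_PROC)  # 4.8 GB for CUDA ctx, uvicorn, heap

print(f"per-process budget:     {budget:.1f} GB")
print(f"KV cache + activations: {kv_and_activations:.1f} GB")
print(f"unreserved headroom:    {headroom:.1f} GB")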