Pratyush-01 committed
Commit 27caebd · verified · 1 Parent(s): c5fa5d5

Upload folder using huggingface_hub
Dockerfile CHANGED
@@ -1,32 +1,27 @@
- # PhysiX-Live Space — combined env + UI + dual-model GPU inference.
  #
- # Single L4 Space hosts EVERYTHING the demo needs:
  #
- #   :8001  vllm serve Qwen/Qwen2.5-3B-Instruct  (--gpu-memory-util 0.40)
- #   :8002  vllm serve Pratyush-01/physix-3b-rl  (--gpu-memory-util 0.40)
- #   :7860  uvicorn physix.server.app:app
  #          ├─ /reset, /step              (OpenEnv stateless API)
  #          ├─ /interactive/*             (browser session API)
  #          ├─ /web/                      (built React SPA)
  #          └─ /interactive/.../llm-step  (LLM-driven episode)
- #             └─ when base_url=local://router, dispatches by `model`
- #                to one of the localhost vLLMs above. The browser
- #                never sees those ports — that prevents anyone from
- #                bypassing the demo to run free GPU calls.
  #
- # Why one Space, not two:
- #   * Same sleep timer: when the demo sleeps, GPU sleeps. No "demo
- #     is awake but inference is asleep" UX gap.
- #   * No CORS between SPA and inference — same origin.
- #   * One thing to babysit, one URL to share.
  #
- # Why vllm/vllm-openai as the base, not nvidia/cuda:
- #   * vLLM ships pre-compiled CUDA kernels for a specific cu+torch combo.
- #     Building from scratch on cu12.4 means recompiling vLLM (~20 min,
- #     fragile across minor versions). The official image guarantees the
- #     ABI is right out of the box.
- #   * Already includes Python 3.12, torch, nccl, FastAPI dependencies;
- #     we just layer the physix app + frontend on top.

  ############################
  # Stage 1: build the SPA
@@ -43,59 +38,46 @@ COPY frontend/ ./
  ENV VITE_PHYSIX_API_URL=""
  # Cache-bust marker. Bump when an SPA change isn't taking on the Space —
  # HF BuildKit occasionally reuses stage-1 output even when sources changed.
- # physix-spa-rebuild: 3
  RUN pnpm exec tsc -b \
      && pnpm exec vite build --base=/web/

  ############################
- # Stage 2: runtime (vLLM + physix server + SPA)
  ############################
- FROM vllm/vllm-openai:v0.7.3 AS runtime
-
- # vllm/vllm-openai sets ENTRYPOINT to `python3 -m vllm.entrypoints.openai.api_server`.
- # We need our own multi-process supervisor, so reset.
- ENTRYPOINT []

  ENV PYTHONUNBUFFERED=1 \
      PIP_NO_CACHE_DIR=1 \
      PIP_DISABLE_PIP_VERSION_CHECK=1 \
      HOME=/tmp/home \
-     USER=physix \
-     LOGNAME=physix \
      HF_HOME=/tmp/hf_cache \
      XDG_CACHE_HOME=/tmp/xdg-cache \
-     VLLM_CACHE_ROOT=/tmp/vllm_cache \
-     TORCH_HOME=/tmp/torch_cache \
-     TRITON_CACHE_DIR=/tmp/triton_cache \
      PORT=7860 \
      PHYSIX_HOST=0.0.0.0 \
      PHYSIX_CORS_ORIGINS=*

- # Need curl for healthchecks; the vLLM image is python-only so apt is fine.
  RUN apt-get update \
      && apt-get install -y --no-install-recommends curl \
      && rm -rf /var/lib/apt/lists/*

  WORKDIR /app

- # Physix backend deps. The vLLM image already has fastapi/uvicorn/pydantic
- # transitively (vllm depends on them), so this is the small physics stack
- # plus openenv-core.
- #
- # IMPORTANT: install with --no-build-isolation if you ever switch to a
- # package that needs torch at build time — you do NOT want pip to try
- # rebuilding torch in this image.
  RUN pip install \
      "openenv-core[core]>=0.2.2" \
      "numpy>=1.24" \
      "scipy>=1.10" \
      "sympy>=1.12" \
      "openai>=1.40" \
      "requests>=2.31"

- # Install physix as an editable package. --no-deps because we just
- # installed the runtime stack above; pyproject's deps would reinstall
- # pinned versions and likely conflict with vLLM's torch.
  COPY pyproject.toml ./
  COPY physix ./physix
  COPY README.md ./
@@ -104,33 +86,21 @@ RUN pip install --no-deps -e .
  # Built SPA from stage 1.
  COPY --from=frontend /build/dist /app/static

- # Space wrapper — mounts the React SPA at /web/, registers `/` -> `/web/`
- # redirect (OpenEnv's create_fastapi_app doesn't do this for us). Same
- # pattern as the previous CPU-only build, just kept in a real file now
- # instead of a heredoc so syntax errors are caught at build time.
  COPY scripts/space_app.py /app/_space_app.py

- # Supervisor entrypoint that boots the two vLLMs sequentially (avoids
- # the CUDA memory race we hit on the first push) then execs uvicorn.
- COPY scripts/space_entrypoint.sh /app/entrypoint.sh
- RUN chmod +x /app/entrypoint.sh
-
  # Pre-create writable dirs. HF Spaces runs containers as a non-root UID
- # with no /etc/passwd entry, so all cache paths under $HOME must exist
  # and be world-writable BEFORE the runtime user shows up.
- RUN mkdir -p \
-     "$HOME" "$HF_HOME" "$XDG_CACHE_HOME" \
-     "$VLLM_CACHE_ROOT" "$TORCH_HOME" "$TRITON_CACHE_DIR" \
-     /tmp/logs \
      && chmod -R 0777 /tmp /app

  EXPOSE 7860

  # /health is OpenEnv's stock endpoint and turns 200 once uvicorn binds.
- # We give a generous start-period because vLLM cold-load + frontend serve
- # is up to ~150 s on first boot.
- HEALTHCHECK --interval=30s --timeout=10s --start-period=240s --retries=3 \
  CMD curl -fsS "http://127.0.0.1:${PORT}/health" || exit 1

  ENV ENABLE_WEB_INTERFACE=true
- CMD ["/app/entrypoint.sh"]
 
+ # PhysiX-Live demo Space — CPU-only env + UI.
  #
+ # What this Space hosts:
  #
+ #   :7860  uvicorn _space_app:app
  #          ├─ /reset, /step              (OpenEnv stateless API)
  #          ├─ /interactive/*             (browser session API)
  #          ├─ /web/                      (built React SPA)
  #          └─ /interactive/.../llm-step  (LLM-driven episode)
  #
+ # What this Space does NOT host:
+ #   * Inference. The demo is CPU-only: no torch, no vLLM, no GPU. When
+ #     the UI calls `/interactive/.../llm-step` the server forwards to
+ #     whatever OpenAI-compatible base URL the browser handed us
+ #     (HF Router, OpenAI, Ollama, or our sister L4 Space at
+ #     `Pratyush-01/physix-infer` for the trained 3B + Qwen baseline).
  #
+ # Why a separate inference Space:
+ #   Keeps this CPU image tiny (sub-second cold-start) so the demo URL
+ #   never feels like it's stalled. The L4 Space pays GPU rates only
+ #   while it's actually serving requests; its `sleep_time=300s` shuts
+ #   it down between sessions. Two Spaces, two failure surfaces; if
+ #   inference is broken, the verifier-only demo (Custom URL, Ollama,
+ #   etc.) still works.

  ############################
  # Stage 1: build the SPA

  ENV VITE_PHYSIX_API_URL=""
  # Cache-bust marker. Bump when an SPA change isn't taking on the Space —
  # HF BuildKit occasionally reuses stage-1 output even when sources changed.
+ # physix-spa-rebuild: 4
  RUN pnpm exec tsc -b \
      && pnpm exec vite build --base=/web/

  ############################
+ # Stage 2: runtime (FastAPI + SPA)
  ############################
+ FROM python:3.11-slim AS runtime

  ENV PYTHONUNBUFFERED=1 \
      PIP_NO_CACHE_DIR=1 \
      PIP_DISABLE_PIP_VERSION_CHECK=1 \
      HOME=/tmp/home \
      HF_HOME=/tmp/hf_cache \
      XDG_CACHE_HOME=/tmp/xdg-cache \
      PORT=7860 \
      PHYSIX_HOST=0.0.0.0 \
      PHYSIX_CORS_ORIGINS=*

+ # curl for healthchecks; the slim image has neither curl nor build tools
+ # by default. Everything else (numpy, scipy, sympy) is a wheel install.
  RUN apt-get update \
      && apt-get install -y --no-install-recommends curl \
      && rm -rf /var/lib/apt/lists/*

  WORKDIR /app

+ # Pin the server-side runtime stack. NO torch / unsloth / trl here —
+ # this Space never trains and never runs a model locally.
  RUN pip install \
      "openenv-core[core]>=0.2.2" \
      "numpy>=1.24" \
      "scipy>=1.10" \
      "sympy>=1.12" \
+     "fastapi>=0.110" \
+     "uvicorn[standard]>=0.29" \
+     "pydantic>=2.5" \
      "openai>=1.40" \
      "requests>=2.31"

  COPY pyproject.toml ./
  COPY physix ./physix
  COPY README.md ./

  # Built SPA from stage 1.
  COPY --from=frontend /build/dist /app/static

+ # Space wrapper — mounts the React SPA at /web/, registers / -> /web/
+ # redirect (OpenEnv's create_fastapi_app doesn't add one for us).
  COPY scripts/space_app.py /app/_space_app.py

  # Pre-create writable dirs. HF Spaces runs containers as a non-root UID
+ # with no /etc/passwd entry, so any cache path under $HOME must exist
  # and be world-writable BEFORE the runtime user shows up.
+ RUN mkdir -p "$HOME" "$HF_HOME" "$XDG_CACHE_HOME" \
      && chmod -R 0777 /tmp /app

  EXPOSE 7860

  # /health is OpenEnv's stock endpoint and turns 200 once uvicorn binds.
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
  CMD curl -fsS "http://127.0.0.1:${PORT}/health" || exit 1

  ENV ENABLE_WEB_INTERFACE=true
+ CMD ["python3", "-m", "uvicorn", "_space_app:app", "--host", "0.0.0.0", "--port", "7860"]
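
A minimal sketch of what the forwarding described in the new header comment looks like from a client's point of view, assuming a local run of this image. The session id and any request fields beyond `base_url` / `model` / `api_key` (the fields visible on `LlmStepRequest` in the providers.py diff below) are placeholders, not the documented schema.

```python
# Hedged sketch: drive one LLM step against a local run of this CPU-only
# Space image. SESSION_ID is a hypothetical placeholder for whatever id
# the /interactive API hands back; fields beyond base_url/model/api_key
# are assumptions, not the documented request schema.
import requests

SPACE_URL = "http://127.0.0.1:7860"      # e.g. docker run -p 7860:7860 <image>
SESSION_ID = "session-id"                # hypothetical: created via /interactive/*

resp = requests.post(
    f"{SPACE_URL}/interactive/{SESSION_ID}/llm-step",
    json={
        # Any OpenAI-compatible upstream works; here, the sister GPU Space.
        "base_url": "https://pratyush-01-physix-infer.hf.space/v1",
        "model": "Pratyush-01/physix-3b-rl",
        "api_key": "",                   # sister Space is open access
    },
    timeout=180,                         # first call after sleep is slow
)
resp.raise_for_status()
print(resp.json())
```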
frontend/src/lib/llmPresets.ts CHANGED
@@ -19,17 +19,13 @@ export const OLLAMA_OPENAI_BASE_URL = "http://localhost:11434/v1";
  export const PHYSIX_MODEL_ID = "Pratyush-01/physix-3b-rl";
  export const QWEN_BASE_MODEL_ID = "Qwen/Qwen2.5-3B-Instruct";

- /** Magic base URL that tells the server "use the in-container vLLMs".
-  * The actual upstream is picked server-side based on the model field
-  * (see physix/server/providers.py::LOCAL_VLLM_PORTS). The browser
-  * never sees the inference ports; keeping them off the public URL
-  * prevents abuse of the L4 GPU.
-  *
-  * Both models live on the same Space + same L4, so flipping between
-  * the trained fine-tune and the Qwen baseline is instant once both
-  * are warm. First call after a cold-boot still costs ~90-120 s while
-  * vLLM loads weights. */
- export const LOCAL_VLLM_BASE_URL = "local://router";

  export type EndpointId = "ollama" | "hf" | "openai" | "custom" | "physix";
 
@@ -67,25 +63,23 @@ export interface Endpoint {
  export const ENDPOINTS: readonly Endpoint[] = [
    {
      id: "physix",
-     label: "PhysiX GPU (in-Space vLLM)",
-     // `local://router` is a magic value the server recognises. The
-     // browser never talks to the inference ports directly; the
-     // physix backend dispatches to the right vLLM by `model` field.
-     // See physix/server/providers.py::LOCAL_VLLM_PORTS.
-     baseUrl: LOCAL_VLLM_BASE_URL,
      needsKey: false,
      modelInputMode: "freeform-with-suggestions",
-     // Both models live on the same in-Space L4 vLLM. First entry
-     // pre-fills, so the default comparison is "trained vs base" with
-     // identical hardware / generation params — only the weights differ.
      modelSuggestions: [
        { id: PHYSIX_MODEL_ID, tag: "trained ✦" },
        { id: QWEN_BASE_MODEL_ID, tag: "base (apples-to-apples)" },
      ],
      hint:
-       "Both 3B models hosted on the Space's own L4 GPU via vLLM. No token. " +
-       "Space sleeps after 5 min idle; first call after sleep is ~90-120 s " +
-       "while weights load; subsequent calls are fast.",
    },
    {
      id: "ollama",
@@ -186,23 +180,22 @@ export interface LlmConnection {
    apiKey: string;
  }

- /** Default A side: trained PhysiX-3B served by the in-Space L4 vLLM.
-  * No token needed; first call after sleep is ~90-120 s while weights
-  * load, then fast. */
  export const DEFAULT_CONNECTION_A: LlmConnection = {
    endpointId: "physix",
-   baseUrl: LOCAL_VLLM_BASE_URL,
    model: PHYSIX_MODEL_ID,
    apiKey: "",
  };

- /** Default B side: the same in-Space vLLM, pointed at Qwen 2.5 3B.
-  * Apples-to-apples — identical architecture, identical hardware,
-  * identical generation params; only the weights differ. Both models
-  * share the same GPU, so warming side A also warms side B. */
  export const DEFAULT_CONNECTION_B: LlmConnection = {
    endpointId: "physix",
-   baseUrl: LOCAL_VLLM_BASE_URL,
    model: QWEN_BASE_MODEL_ID,
    apiKey: "",
  };
 
  export const PHYSIX_MODEL_ID = "Pratyush-01/physix-3b-rl";
  export const QWEN_BASE_MODEL_ID = "Qwen/Qwen2.5-3B-Instruct";

+ /** Sister GPU Space that hosts both the trained PhysiX-3B and the Qwen
+  * 2.5 3B baseline behind a single OpenAI-compatible URL. Open access
+  * (no token); routing on the `model` field happens inside the proxy.
+  * Sleeps after 5 min idle, so the first call after sleep is ~90-120 s
+  * while vLLM warms up; subsequent calls are fast. */
+ export const PHYSIX_INFER_BASE_URL =
+   "https://pratyush-01-physix-infer.hf.space/v1";

  export type EndpointId = "ollama" | "hf" | "openai" | "custom" | "physix";

  export const ENDPOINTS: readonly Endpoint[] = [
    {
      id: "physix",
+     label: "PhysiX-Infer GPU ✦",
+     // Sister L4 Space hosting both checkpoints behind one URL; the
+     // proxy there picks the right vLLM based on the `model` field.
+     baseUrl: PHYSIX_INFER_BASE_URL,
      needsKey: false,
      modelInputMode: "freeform-with-suggestions",
+     // First entry pre-fills, so the default comparison is "trained vs
+     // base" with identical hardware / generation params; only the
+     // weights differ.
      modelSuggestions: [
        { id: PHYSIX_MODEL_ID, tag: "trained ✦" },
        { id: QWEN_BASE_MODEL_ID, tag: "base (apples-to-apples)" },
      ],
      hint:
+       "Both 3B models on a sister L4 Space: no token, no key. The Space " +
+       "sleeps after 5 min idle, so the first call after sleep is ~90-120 s " +
+       "while vLLM loads weights; subsequent calls are fast.",
    },
    {
      id: "ollama",

    apiKey: string;
  }

+ /** Default A side: trained PhysiX-3B on the sister GPU Space.
+  * No token needed; first call after sleep is ~90-120 s, then fast. */
  export const DEFAULT_CONNECTION_A: LlmConnection = {
    endpointId: "physix",
+   baseUrl: PHYSIX_INFER_BASE_URL,
    model: PHYSIX_MODEL_ID,
    apiKey: "",
  };

+ /** Default B side: same sister Space, same L4 GPU, just the Qwen 2.5
+  * 3B baseline. Apples-to-apples — identical architecture, identical
+  * hardware, identical generation params; only the weights differ.
+  * Both models share the same Space, so warming side A also warms B. */
  export const DEFAULT_CONNECTION_B: LlmConnection = {
    endpointId: "physix",
+   baseUrl: PHYSIX_INFER_BASE_URL,
    model: QWEN_BASE_MODEL_ID,
    apiKey: "",
  };
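
The default A/B comparison these presets encode can be reproduced outside the SPA. A minimal sketch, assuming the sister Space's proxy really is OpenAI-compatible and accepts any non-empty API key; the prompt is illustrative only.

```python
# Hedged sketch: call the sister GPU Space directly with the OpenAI SDK.
# Same request, two checkpoints -> the "trained vs base" comparison the
# DEFAULT_CONNECTION_A/B presets set up.
from openai import OpenAI

client = OpenAI(
    base_url="https://pratyush-01-physix-infer.hf.space/v1",
    api_key="none",  # open access; the SDK just wants a non-empty string
)

for model in ("Pratyush-01/physix-3b-rl", "Qwen/Qwen2.5-3B-Instruct"):
    out = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": "State Newton's second law."}],
        max_tokens=64,
    )
    print(model, "->", out.choices[0].message.content)
```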
physix/server/providers.py CHANGED
@@ -30,46 +30,10 @@ _log = logging.getLogger(__name__)
  HF_ROUTER_BASE_URL = "https://router.huggingface.co/v1"
  OPENAI_BASE_URL = "https://api.openai.com/v1"
  OLLAMA_OPENAI_BASE_URL = "http://localhost:11434/v1"
-
- # Magic base-URL scheme. The browser sends ``local://router`` for both
- # in-container models; the server then picks the right localhost vLLM
- # based on the ``model`` field of the request. This keeps the public
- # Space URL from exposing the raw inference ports (so visitors can't
- # bypass the demo to run free GPU calls), and lets the user flip
- # between trained / baseline without changing the connection URL.
- #
- # Ports MUST match what the Space's entrypoint.sh launches — the two
- # are pinned to the same constants here for symmetry.
- _LOCAL_VLLM_SCHEME = "local://"
- LOCAL_VLLM_PORTS: dict[str, int] = {
-     # model id -> localhost port served by entrypoint.sh
-     "Qwen/Qwen2.5-3B-Instruct": 8001,
-     "Pratyush-01/physix-3b-rl": 8002,
- }
-
-
- def _maybe_rewrite_local_url(base_url: str, model: str) -> str:
-     """If the URL uses the magic ``local://`` scheme, swap in the real
-     localhost target keyed by ``model``. Returns the URL unchanged
-     otherwise.
-
-     Raises HTTPException(400) when the ``model`` isn't one of the in-
-     container vLLMs — without this a typo silently falls through to
-     OpenAI's SDK and surfaces as a generic 502.
-     """
-
-     if not base_url.startswith(_LOCAL_VLLM_SCHEME):
-         return base_url
-     port = LOCAL_VLLM_PORTS.get(model)
-     if port is None:
-         raise HTTPException(
-             status_code=400,
-             detail=(
-                 f"Model {model!r} is not hosted by the in-container vLLMs. "
-                 f"Available: {sorted(LOCAL_VLLM_PORTS)}."
-             ),
-         )
-     return f"http://127.0.0.1:{port}/v1"


  class LlmStepRequest(BaseModel):
@@ -128,9 +92,10 @@ def resolve_api_key(request: LlmStepRequest) -> Optional[str]:
      return request.api_key

  base_url = (request.base_url or "").lower()
- # `local://*` targets are the in-container vLLMs; no auth.
- if base_url.startswith(_LOCAL_VLLM_SCHEME):
-     return "local"
  if "huggingface" in base_url:
      return os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_API_KEY")
  if "openai.com" in base_url:
@@ -185,12 +150,8 @@ def default_openai_compat_policy_factory(request: LlmStepRequest) -> LlmPolicy:
      ) from exc

  api_key = resolve_api_key(request)
- # Resolve `local://*` -> the actual localhost vLLM URL. Keeps the
- # OpenAI client unaware of the indirection; auth/headers just flow
- # through normally.
- resolved_base_url = _maybe_rewrite_local_url(request.base_url, request.model)
  client = OpenAI(
-     base_url=resolved_base_url,
      api_key=api_key or "missing",
      timeout=request.request_timeout_s,
      # Identifies us to providers that rate-limit by UA. Cheap to
 
  HF_ROUTER_BASE_URL = "https://router.huggingface.co/v1"
  OPENAI_BASE_URL = "https://api.openai.com/v1"
  OLLAMA_OPENAI_BASE_URL = "http://localhost:11434/v1"
+ # Sister GPU Space hosting both the trained PhysiX-3B and the Qwen 2.5 3B
+ # baseline. Open access (no key); sleeps after 5 min idle. See the
+ # physix-infer/ directory in the repo for the Dockerfile + proxy code.
+ PHYSIX_INFER_BASE_URL = "https://pratyush-01-physix-infer.hf.space/v1"


  class LlmStepRequest(BaseModel):

      return request.api_key

  base_url = (request.base_url or "").lower()
+ # The PhysiX-Infer sister Space serves Qwen + the trained 3B with no
+ # auth — it's open-access by design (rate-limited only by sleep).
+ if "physix-infer" in base_url:
+     return "physix-infer"
  if "huggingface" in base_url:
      return os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_API_KEY")
  if "openai.com" in base_url:

  ) from exc

  api_key = resolve_api_key(request)
  client = OpenAI(
+     base_url=request.base_url,
      api_key=api_key or "missing",
      timeout=request.request_timeout_s,
      # Identifies us to providers that rate-limit by UA. Cheap to
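
A self-contained sketch of the key-resolution order the hunks above imply (explicit key first, then the physix-infer sentinel, then env-var lookups). The dataclass is a hypothetical stand-in for `LlmStepRequest` using only the fields visible in this diff, and the `OPENAI_API_KEY` lookup for the `openai.com` branch is an assumption.

```python
# Hedged sketch of resolve_api_key's routing, not the actual module.
import os
from dataclasses import dataclass
from typing import Optional


@dataclass
class StepRequestStub:
    # Stand-in for LlmStepRequest: only the fields shown in the diff.
    base_url: str
    model: str
    api_key: str = ""


def resolve_api_key_sketch(req: StepRequestStub) -> Optional[str]:
    if req.api_key:                    # an explicit key always wins
        return req.api_key
    base_url = (req.base_url or "").lower()
    if "physix-infer" in base_url:     # sister Space: open access, sentinel value
        return "physix-infer"
    if "huggingface" in base_url:      # HF Router: use the Space's own token
        return os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_API_KEY")
    if "openai.com" in base_url:       # assumed env var, mirroring the pattern above
        return os.environ.get("OPENAI_API_KEY")
    return None                        # Ollama / custom URLs: no key needed


req = StepRequestStub(
    base_url="https://pratyush-01-physix-infer.hf.space/v1",
    model="Pratyush-01/physix-3b-rl",
)
print(resolve_api_key_sketch(req))  # -> "physix-infer"
```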