Upload folder using huggingface_hub
- Dockerfile +80 -102
- frontend/src/lib/llmPresets.ts +32 -29
- scripts/space_app.py +55 -0
- scripts/space_entrypoint.sh +103 -0
Dockerfile
CHANGED
@@ -1,16 +1,32 @@
-# PhysiX-Live
+# PhysiX-Live Space — combined env + UI + dual-model GPU inference.
 #
-#
-# 1. node:20 builds the Vite/React frontend into frontend/dist with
-#    same-origin API base URL (VITE_PHYSIX_API_URL=""), so the SPA
-#    fetches /interactive/* relative to the Space's own host.
-# 2. python:3.11-slim installs physix as an editable package and serves
-#    both the FastAPI routes (/reset, /step, /interactive/*) AND the
-#    built SPA as static assets from a single uvicorn process.
+# Single L4 Space hosts EVERYTHING the demo needs:
 #
-#
-#
-#
+#   :8001  vllm serve Qwen/Qwen2.5-3B-Instruct   (--gpu-memory-util 0.40)
+#   :8002  vllm serve Pratyush-01/physix-3b-rl   (--gpu-memory-util 0.40)
+#   :7860  uvicorn physix.server.app:app
+#          ├─ /reset, /step               (OpenEnv stateless API)
+#          ├─ /interactive/*              (browser session API)
+#          ├─ /web/                       (built React SPA)
+#          └─ /interactive/.../llm-step   (LLM-driven episode)
+#             └─ when base_url=local://router, dispatches by `model`
+#                to one of the localhost vLLMs above. The browser
+#                never sees those ports — that prevents anyone from
+#                bypassing the demo to run free GPU calls.
+#
+# Why one Space, not two:
+#   * Same sleep timer — when the demo sleeps, GPU sleeps. No "demo
+#     is awake but inference is asleep" UX gap.
+#   * No CORS between SPA and inference — same origin.
+#   * One thing to babysit, one URL to share.
+#
+# Why vllm/vllm-openai as the base, not nvidia/cuda:
+#   * vLLM ships pre-compiled CUDA kernels for a specific cu+torch combo.
+#     Building from scratch on cu12.4 means recompiling vLLM (~20 min,
+#     fragile across minor versions). The official image guarantees the
+#     ABI is right out of the box.
+#   * Already includes Python 3.12, torch, nccl, FastAPI dependencies —
+#     we just layer the physix app + frontend on top.
 
 ############################
 # Stage 1: build the SPA
@@ -19,35 +35,27 @@ FROM node:20-alpine AS frontend
 WORKDIR /build
 RUN corepack enable
 
-# Install deps separately from sources so layer cache survives source edits.
 COPY frontend/package.json frontend/pnpm-lock.yaml ./
 RUN pnpm install --frozen-lockfile --silent
 
 COPY frontend/ ./
 # Same-origin API fetches (relative paths). The Space serves both API and UI.
 ENV VITE_PHYSIX_API_URL=""
-# Cache-bust
-#
-#
-# is the documented workaround — increment when you push a UI change
-# and the Space is still serving the previous SPA bundle hash.
-# physix-spa-rebuild: 2
-# Run typecheck + vite build separately so we can pass --base=/web/ to
-# vite without it landing on tsc. The SPA is mounted at /web/ in the
-# Space and the redirect from / -> /web/ is registered by the wrapper
-# in _space_app.py below. All asset URLs in the built index.html
-# include the /web/ prefix.
+# Cache-bust marker. Bump when an SPA change isn't taking on the Space —
+# HF BuildKit occasionally reuses stage-1 output even when sources changed.
+# physix-spa-rebuild: 3
 RUN pnpm exec tsc -b \
     && pnpm exec vite build --base=/web/
 
 ############################
-# Stage 2: runtime
+# Stage 2: runtime (vLLM + physix server + SPA)
 ############################
-FROM
+FROM vllm/vllm-openai:v0.7.3 AS runtime
+
+# vllm/vllm-openai sets ENTRYPOINT to `python3 -m vllm.entrypoints.openai.api_server`.
+# We need our own multi-process supervisor, so reset.
+ENTRYPOINT []
 
-# HF Spaces convention: write everything under /tmp (only writable path
-# at runtime). The container also runs as UID 1000 with no /etc/passwd
-# entry, so set USER/HOME so getpass.getuser() and pathlib don't crash.
 ENV PYTHONUNBUFFERED=1 \
     PIP_NO_CACHE_DIR=1 \
     PIP_DISABLE_PIP_VERSION_CHECK=1 \
@@ -56,103 +64,73 @@ ENV PYTHONUNBUFFERED=1 \
     LOGNAME=physix \
     HF_HOME=/tmp/hf_cache \
     XDG_CACHE_HOME=/tmp/xdg-cache \
+    VLLM_CACHE_ROOT=/tmp/vllm_cache \
+    TORCH_HOME=/tmp/torch_cache \
+    TRITON_CACHE_DIR=/tmp/triton_cache \
     PORT=7860 \
    PHYSIX_HOST=0.0.0.0 \
     PHYSIX_CORS_ORIGINS=*
 
-
-
-# System deps: build-essential briefly for any pip wheels that need it
-# (sympy/scipy ship wheels for linux_x86_64 so this is mostly a safety
-# net), curl for the healthcheck.
+# Need curl for healthchecks; the vLLM image is python-only so apt is fine.
 RUN apt-get update \
     && apt-get install -y --no-install-recommends curl \
     && rm -rf /var/lib/apt/lists/*
 
-
-
-
-
+WORKDIR /app
+
+# Physix backend deps. The vLLM image already has fastapi/uvicorn/pydantic
+# transitively (vllm depends on them), so this is the small physics stack
+# plus openenv-core.
+#
+# IMPORTANT: install with --no-build-isolation if you ever switch to a
+# package that needs torch at build time — you do NOT want pip to try
+# rebuilding torch in this image.
+RUN pip install \
     "openenv-core[core]>=0.2.2" \
     "numpy>=1.24" \
     "scipy>=1.10" \
     "sympy>=1.12" \
-    "
-    "uvicorn>=0.29" \
-    "pydantic>=2.5" \
+    "openai>=1.40" \
     "requests>=2.31"
 
-# Install physix
+# Install physix as an editable package. --no-deps because we just
+# installed the runtime stack above; pyproject's deps would reinstall
+# pinned versions and likely conflict with vLLM's torch.
+COPY pyproject.toml ./
 COPY physix ./physix
 COPY README.md ./
 RUN pip install --no-deps -e .
 
+# Built SPA from stage 1.
 COPY --from=frontend /build/dist /app/static
 
-# Space wrapper
-#
-#
-#
-
-
-#
-
-
-
-
-
-
-
-
-
-
-
-
-
-# OpenEnv's `create_fastapi_app` (which physix.server.app uses directly)
-# does NOT register a `/` redirect — that's only added by the higher-level
-# `create_web_interface_app` wrapper, which we deliberately don't use
-# because it would mount Gradio at /web and clobber our React SPA. So the
-# redirects have to live here, otherwise:
-#
-#   * `https://<space>.hf.space/`    -> 404
-#   * `https://<space>.hf.space/web` -> 404 (no trailing slash)
-#
-# Both are landing pages users hit (the bare URL is the canonical Space
-# link the HF UI exposes), so without these the Space appears blank.
-@app.get("/", include_in_schema=False)
-async def _root_redirect() -> RedirectResponse:
-    return RedirectResponse(url="/web/")
-
-
-@app.get("/web", include_in_schema=False)
-async def _web_no_slash_redirect() -> RedirectResponse:
-    return RedirectResponse(url="/web/")
-
-
-if _STATIC_DIR.is_dir():
-    # html=True makes StaticFiles serve index.html for directory hits and
-    # fall back to it for unknown sub-paths (so client-side React routing
-    # works). Mounted last so registered API routes (/web/metadata,
-    # /web/reset, /web/step from OpenEnv; /interactive/* from physix)
-    # always win. The vite build was run with --base=/web/ so asset
-    # URLs in index.html already include the prefix.
-    app.mount(
-        "/web",
-        StaticFiles(directory=str(_STATIC_DIR), html=True),
-        name="ui",
-    )
-PY
-
-# Pre-create writable dirs so the first request doesn't crash on a
-# missing cache path.
-RUN mkdir -p "$HOME" "$HF_HOME" "$XDG_CACHE_HOME" \
-    && chmod -R 0777 /tmp/home /tmp/hf_cache /tmp/xdg-cache /app
+# Space wrapper — mounts the React SPA at /web/, registers `/` -> `/web/`
+# redirect (OpenEnv's create_fastapi_app doesn't do this for us). Same
+# pattern as the previous CPU-only build, just kept in a real file now
+# instead of a heredoc so syntax errors are caught at build time.
+COPY scripts/space_app.py /app/_space_app.py
+
+# Supervisor entrypoint that boots the two vLLMs sequentially (avoids
+# the CUDA memory race we hit on the first push) then execs uvicorn.
+COPY scripts/space_entrypoint.sh /app/entrypoint.sh
+RUN chmod +x /app/entrypoint.sh
+
+# Pre-create writable dirs. HF Spaces runs containers as a non-root UID
+# with no /etc/passwd entry, so all cache paths under $HOME must exist
+# and be world-writable BEFORE the runtime user shows up.
+RUN mkdir -p \
+    "$HOME" "$HF_HOME" "$XDG_CACHE_HOME" \
+    "$VLLM_CACHE_ROOT" "$TORCH_HOME" "$TRITON_CACHE_DIR" \
+    /tmp/logs \
+    && chmod -R 0777 /tmp /app
 
 EXPOSE 7860
 
-
+# /health is OpenEnv's stock endpoint and turns 200 once uvicorn binds.
+# We give a generous start-period because vLLM cold-load + frontend serve
+# is up to ~150 s on first boot.
+HEALTHCHECK --interval=30s --timeout=10s --start-period=240s --retries=3 \
 CMD curl -fsS "http://127.0.0.1:${PORT}/health" || exit 1
 
 ENV ENABLE_WEB_INTERFACE=true
-CMD ["
+CMD ["/app/entrypoint.sh"]
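The `local://router` dispatch described in the header comment lives in physix/server/providers.py (referenced by the diff but not changed in this commit). As a rough sketch of the idea: only the LOCAL_VLLM_PORTS name and the two model ids are taken from the diff; the helper and its OpenAI-client usage are illustrative, not the repo's actual code.

    # Hypothetical sketch of the base_url="local://router" dispatch.
    # LOCAL_VLLM_PORTS and the model ids are quoted from the diff above;
    # everything else here is an assumption for illustration.
    from openai import OpenAI

    LOCAL_VLLM_PORTS = {
        "Qwen/Qwen2.5-3B-Instruct": 8001,   # base-model vLLM
        "Pratyush-01/physix-3b-rl": 8002,   # trained-model vLLM
    }

    def client_for(base_url: str, model: str, api_key: str = "") -> OpenAI:
        # The magic value routes to an in-container vLLM picked by model id;
        # any other base_url is treated as a normal OpenAI-compatible upstream.
        if base_url == "local://router":
            base_url = f"http://127.0.0.1:{LOCAL_VLLM_PORTS[model]}/v1"
        return OpenAI(base_url=base_url, api_key=api_key or "EMPTY")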
frontend/src/lib/llmPresets.ts
CHANGED
@@ -19,13 +19,17 @@ export const OLLAMA_OPENAI_BASE_URL = "http://localhost:11434/v1";
 export const PHYSIX_MODEL_ID = "Pratyush-01/physix-3b-rl";
 export const QWEN_BASE_MODEL_ID = "Qwen/Qwen2.5-3B-Instruct";
 
-/** Magic
- *
- *
- *
- *
- *
-
+/** Magic base URL that tells the server "use the in-container vLLMs".
+ * The actual upstream is picked server-side based on the model field
+ * (see physix/server/providers.py::LOCAL_VLLM_PORTS). The browser
+ * never sees the inference ports — keeping them off the public URL
+ * prevents abuse of the L4 GPU.
+ *
+ * Both models live on the same Space + same L4, so flipping between
+ * the trained fine-tune and the Qwen baseline is instant once both
+ * are warm. First call after a cold-boot still costs ~90-120s while
+ * vLLM loads weights. */
+export const LOCAL_VLLM_BASE_URL = "local://router";
 
 export type EndpointId = "ollama" | "hf" | "openai" | "custom" | "physix";
 
@@ -63,25 +67,25 @@ export interface Endpoint {
 export const ENDPOINTS: readonly Endpoint[] = [
   {
     id: "physix",
-    label: "PhysiX
-
-    //
-    //
-    //
-
+    label: "PhysiX GPU (in-Space vLLM ✦)",
+    // `local://router` is a magic value the server recognises. The
+    // browser never talks to the inference ports directly — the
+    // physix backend dispatches to the right vLLM by `model` field.
+    // See physix/server/providers.py::LOCAL_VLLM_PORTS.
+    baseUrl: LOCAL_VLLM_BASE_URL,
     needsKey: false,
     modelInputMode: "freeform-with-suggestions",
-    // Both models
-    //
-    //
+    // Both models live on the same in-Space L4 vLLM. First entry
+    // pre-fills, so the default comparison is "trained vs base" with
+    // identical hardware / generation params — only the weights differ.
     modelSuggestions: [
       { id: PHYSIX_MODEL_ID, tag: "trained ✦" },
      { id: QWEN_BASE_MODEL_ID, tag: "base (apples-to-apples)" },
     ],
     hint:
-      "
-      "
-      "
+      "Both 3B models hosted on the Space's own L4 GPU via vLLM. No token. " +
+      "Space sleeps after 5 min idle — first call after sleep is ~90-120 s " +
+      "while weights load; subsequent calls are fast.",
   },
   {
     id: "ollama",
@@ -182,24 +186,23 @@ export interface LlmConnection {
   apiKey: string;
 }
 
-/** Default A side: trained PhysiX-3B
- * token needed; first call
- *
+/** Default A side: trained PhysiX-3B served by the in-Space L4 vLLM.
+ * No token needed; first call after sleep is ~90-120 s while weights
+ * load, then fast. */
 export const DEFAULT_CONNECTION_A: LlmConnection = {
   endpointId: "physix",
-  baseUrl:
+  baseUrl: LOCAL_VLLM_BASE_URL,
   model: PHYSIX_MODEL_ID,
   apiKey: "",
 };
 
-/** Default B side: the same
- *
- * identical
- *
- * cost (the L4 is already warm from side A). */
+/** Default B side: the same in-Space vLLM, pointed at Qwen 2.5 3B.
+ * Apples-to-apples — identical architecture, identical hardware,
+ * identical generation params; only the weights differ. Both models
+ * share the same GPU, so warming side A also warms side B. */
 export const DEFAULT_CONNECTION_B: LlmConnection = {
   endpointId: "physix",
-  baseUrl:
+  baseUrl: LOCAL_VLLM_BASE_URL,
   model: QWEN_BASE_MODEL_ID,
   apiKey: "",
 };
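The two defaults above set up exactly the A/B comparison the hint promises: same prompt, same generation parameters, only the model id (and therefore the upstream vLLM) changes. From inside the container (the inference ports are deliberately not reachable from the browser) the comparison reduces to the sketch below; the endpoint is the standard OpenAI-compatible one vLLM serves, and the prompt is made up for illustration.

    # Apples-to-apples check from inside the Space container: identical
    # prompt and params on both sides, only `model` differs. Ports and
    # model ids match the Dockerfile; the script itself is illustrative.
    from openai import OpenAI

    PROMPT = "A 2 kg block slides down a frictionless 30 degree incline. Find its acceleration."

    def ask(port: int, model: str) -> str:
        client = OpenAI(base_url=f"http://127.0.0.1:{port}/v1", api_key="EMPTY")
        out = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": PROMPT}],
            temperature=0.0,   # hold generation params fixed on both sides
            max_tokens=256,
        )
        return out.choices[0].message.content

    trained = ask(8002, "Pratyush-01/physix-3b-rl")      # DEFAULT_CONNECTION_A
    baseline = ask(8001, "Qwen/Qwen2.5-3B-Instruct")     # DEFAULT_CONNECTION_B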
scripts/space_app.py
ADDED
@@ -0,0 +1,55 @@
+"""Space entrypoint: physix.server.app:app + static UI mount.
+
+Imported at runtime by the Dockerfile's CMD via ``uvicorn _space_app:app``.
+
+What this wrapper adds on top of ``physix.server.app:app``:
+
+1. ``GET /``    -> 302 to ``/web/``   (so the bare Space URL
+                   doesn't 404 — OpenEnv's ``create_fastapi_app``
+                   does NOT add a root redirect; that's only in
+                   the higher-level wrapper which mounts Gradio
+                   at ``/web`` and would clobber our React SPA).
+2. ``GET /web`` -> 302 to ``/web/``   (same reason; users hit the
+                   no-trailing-slash variant from outside links).
+3. ``StaticFiles`` mount at ``/web/`` serving the built Vite SPA. The
+   vite build was run with ``--base=/web/`` so all asset URLs in the
+   emitted ``index.html`` already include the prefix.
+
+Kept as a real .py file (not a heredoc inside the Dockerfile) so any
+syntax error is caught by the build's static analysis rather than at
+runtime — saved several deploy-fail loops in earlier iterations.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from fastapi.responses import RedirectResponse
+from fastapi.staticfiles import StaticFiles
+
+from physix.server.app import app
+
+_STATIC_DIR = Path("/app/static")
+
+
+@app.get("/", include_in_schema=False)
+async def _root_redirect() -> RedirectResponse:
+    return RedirectResponse(url="/web/")
+
+
+@app.get("/web", include_in_schema=False)
+async def _web_no_slash_redirect() -> RedirectResponse:
+    return RedirectResponse(url="/web/")
+
+
+if _STATIC_DIR.is_dir():
+    # html=True makes StaticFiles serve index.html for directory hits and
+    # fall back to it for unknown sub-paths (so client-side React routing
+    # works). Mounted last so registered API routes (/web/metadata,
+    # /web/reset, /web/step from OpenEnv; /interactive/* from physix)
+    # always win over the static handler.
+    app.mount(
+        "/web",
+        StaticFiles(directory=str(_STATIC_DIR), html=True),
+        name="ui",
+    )
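The redirect-plus-mount pattern is easy to sanity-check outside the Space with Starlette's TestClient. The sketch below swaps physix.server.app for a bare FastAPI() and a throwaway static directory, so it exercises the pattern only, not the real app.

    # Minimal check of the wrapper pattern: "/" redirects to "/web/", and
    # the /web mount serves index.html. A bare FastAPI() stands in for
    # physix.server.app, so this is a sketch, not a test of the real app.
    import tempfile
    from pathlib import Path

    from fastapi import FastAPI
    from fastapi.responses import RedirectResponse
    from fastapi.staticfiles import StaticFiles
    from fastapi.testclient import TestClient

    app = FastAPI()

    @app.get("/", include_in_schema=False)
    async def _root_redirect() -> RedirectResponse:
        return RedirectResponse(url="/web/")

    static_dir = Path(tempfile.mkdtemp())
    (static_dir / "index.html").write_text("<h1>SPA</h1>")
    app.mount("/web", StaticFiles(directory=str(static_dir), html=True), name="ui")

    client = TestClient(app)
    assert client.get("/", follow_redirects=False).headers["location"] == "/web/"
    assert "SPA" in client.get("/web/").text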
scripts/space_entrypoint.sh
ADDED
@@ -0,0 +1,103 @@
+#!/usr/bin/env bash
+# Boot the two in-Space vLLMs SEQUENTIALLY, then exec uvicorn for the
+# physix FastAPI server (which serves the API + the React SPA).
+#
+# Why sequential, not parallel:
+#   The first deploy attempt booted both vLLMs in parallel and the second
+#   one died with "No available memory for the cache blocks." Reason: vLLM
+#   reads `nvidia-smi`-style free memory at startup and reserves
+#   `--gpu-memory-utilization * (free at this moment)` worth of VRAM.
+#   When two processes start simultaneously, both see "all 24 GB free" and
+#   both try to grab ~10 GB; the second one to finalize loses. Booting
+#   sequentially makes the second one observe the post-first-process free
+#   memory, so its allocation is sized correctly.
+#
+# Why --gpu-memory-utilization 0.40 each (= 80% total):
+#   On L4 (24 GB), 40% = ~9.6 GB per process. Qwen2.5-3B fp16 weights are
+#   ~6.2 GB; that leaves ~3.4 GB per process for KV cache + activations,
+#   which sustains max_model_len=4096 with comfortable margin. The 20%
+#   reserve covers CUDA workspace + uvicorn + Python heap. Pushing this
+#   much higher (e.g. 0.45 each) is what failed on the first deploy
+#   because once you account for the ~600 MB CUDA context + the second
+#   process's overhead, weights+KV no longer fit.
+
+set -euo pipefail
+
+QWEN_MODEL="${QWEN_MODEL:-Qwen/Qwen2.5-3B-Instruct}"
+PHYSIX_MODEL="${PHYSIX_MODEL:-Pratyush-01/physix-3b-rl}"
+QWEN_GPU_FRAC="${QWEN_GPU_FRAC:-0.40}"
+PHYSIX_GPU_FRAC="${PHYSIX_GPU_FRAC:-0.40}"
+MAX_LEN="${MAX_LEN:-4096}"
+
+LOG_DIR=/tmp/logs
+mkdir -p "$LOG_DIR"
+
+# Forward signals so HF's "Pause" / "Restart" actually shuts everything
+# down cleanly — otherwise CUDA memory leaks across container restarts.
+PIDS=()
+cleanup() {
+    echo "[entrypoint] SIGTERM/SIGINT — killing children: ${PIDS[*]:-}" >&2
+    for pid in "${PIDS[@]:-}"; do
+        kill -TERM "$pid" 2>/dev/null || true
+    done
+    wait || true
+    exit 0
+}
+trap cleanup TERM INT
+
+wait_healthy() {
+    local name="$1" port="$2" pid="$3" budget="${4:-300}"
+    local deadline=$((SECONDS + budget))
+    while (( SECONDS < deadline )); do
+        if ! kill -0 "$pid" 2>/dev/null; then
+            echo "[entrypoint] FATAL: $name (pid $pid) died during boot. Tail of log:" >&2
+            tail -n 80 "$LOG_DIR/${name}.log" >&2 || true
+            return 1
+        fi
+        if curl -fsS "http://127.0.0.1:${port}/health" >/dev/null 2>&1; then
+            echo "[entrypoint] $name healthy on :$port (after ${SECONDS}s)"
+            return 0
+        fi
+        sleep 5
+    done
+    echo "[entrypoint] FATAL: $name failed to become healthy in ${budget}s" >&2
+    tail -n 80 "$LOG_DIR/${name}.log" >&2 || true
+    return 1
+}
+
+echo "[entrypoint] step 1/3 — booting vLLM(qwen) = $QWEN_MODEL on :8001 (gpu=${QWEN_GPU_FRAC})"
+# vllm/vllm-openai image only ships `python3` — no `python` symlink.
+python3 -m vllm.entrypoints.openai.api_server \
+    --model "$QWEN_MODEL" \
+    --served-model-name "$QWEN_MODEL" \
+    --host 0.0.0.0 --port 8001 \
+    --gpu-memory-utilization "$QWEN_GPU_FRAC" \
+    --max-model-len "$MAX_LEN" \
+    --dtype auto \
+    --disable-log-requests \
+    > "$LOG_DIR/qwen.log" 2>&1 &
+QWEN_PID=$!
+PIDS+=("$QWEN_PID")
+wait_healthy qwen 8001 "$QWEN_PID" 300
+
+echo "[entrypoint] step 2/3 — booting vLLM(physix) = $PHYSIX_MODEL on :8002 (gpu=${PHYSIX_GPU_FRAC})"
+python3 -m vllm.entrypoints.openai.api_server \
+    --model "$PHYSIX_MODEL" \
+    --served-model-name "$PHYSIX_MODEL" \
+    --host 0.0.0.0 --port 8002 \
+    --gpu-memory-utilization "$PHYSIX_GPU_FRAC" \
+    --max-model-len "$MAX_LEN" \
+    --dtype auto \
+    --disable-log-requests \
+    > "$LOG_DIR/physix.log" 2>&1 &
+PHYSIX_PID=$!
+PIDS+=("$PHYSIX_PID")
+wait_healthy physix 8002 "$PHYSIX_PID" 300
+
+echo "[entrypoint] step 3/3 — both vLLMs healthy; starting uvicorn on :${PORT}"
+# `exec` so uvicorn becomes PID 1's foreground job and HF Spaces sees
+# our process as healthy. The trap above forwards termination back to
+# the vLLM children when the Space is paused.
+exec python3 -m uvicorn _space_app:app \
+    --host 0.0.0.0 --port "${PORT:-7860}" \
+    --log-level info
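For reference, the VRAM split the comment justifies works out as below. The 24 GB, ~6.2 GB and ~600 MB figures are the ones quoted in the script; the snippet only does the arithmetic, it does not query a GPU.

    # Back-of-the-envelope version of the budget described in the comments.
    TOTAL_GB = 24.0      # L4
    WEIGHTS_GB = 6.2     # Qwen2.5-3B fp16 weights, per the comment
    CUDA_CTX_GB = 0.6    # per-process CUDA context, per the comment

    def budget(frac: float) -> tuple[float, float, float]:
        reserved = frac * TOTAL_GB        # what one vLLM tries to grab
        kv = reserved - WEIGHTS_GB        # left for KV cache + activations
        slack = TOTAL_GB - 2 * reserved   # unreserved: CUDA ctx, uvicorn, heap
        return reserved, kv, slack

    for frac in (0.40, 0.45):
        reserved, kv, slack = budget(frac)
        print(f"frac={frac:.2f}: {reserved:.1f} GB reserved per process, "
              f"{kv:.1f} GB KV headroom, {slack:.1f} GB unreserved "
              f"({slack - 2 * CUDA_CTX_GB:.1f} GB after both CUDA contexts)")
    # frac=0.40: 9.6 GB reserved per process, 3.4 GB KV headroom, 4.8 GB unreserved (3.6 GB after contexts)
    # frac=0.45: 10.8 GB reserved per process, 4.6 GB KV headroom, 2.4 GB unreserved (1.2 GB after contexts)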