Pratyush-01 committed
Commit 27caebd · verified · 1 Parent(s): c5fa5d5

Upload folder using huggingface_hub
Dockerfile CHANGED
@@ -1,32 +1,27 @@
- # PhysiX-Live Space — combined env + UI + dual-model GPU inference.
  #
- # Single L4 Space hosts EVERYTHING the demo needs:
  #
- #   :8001  vllm serve Qwen/Qwen2.5-3B-Instruct  (--gpu-memory-util 0.40)
- #   :8002  vllm serve Pratyush-01/physix-3b-rl  (--gpu-memory-util 0.40)
- #   :7860  uvicorn physix.server.app:app
  #          ├─ /reset, /step              (OpenEnv stateless API)
  #          ├─ /interactive/*             (browser session API)
  #          ├─ /web/                      (built React SPA)
  #          └─ /interactive/.../llm-step  (LLM-driven episode)
- #             └─ when base_url=local://router, dispatches by `model`
- #                to one of the localhost vLLMs above. The browser
- #                never sees those ports — that prevents anyone from
- #                bypassing the demo to run free GPU calls.
  #
- # Why one Space, not two:
- #   * Same sleep timer: when the demo sleeps, GPU sleeps. No "demo
- #     is awake but inference is asleep" UX gap.
- #   * No CORS between SPA and inference — same origin.
- #   * One thing to babysit, one URL to share.
  #
- # Why vllm/vllm-openai as the base, not nvidia/cuda:
- #   * vLLM ships pre-compiled CUDA kernels for a specific cu+torch combo.
- #     Building from scratch on cu12.4 means recompiling vLLM (~20 min,
- #     fragile across minor versions). The official image guarantees the
- #     ABI is right out of the box.
- #   * Already includes Python 3.12, torch, nccl, FastAPI dependencies;
- #     we just layer the physix app + frontend on top.

  ############################
  # Stage 1: build the SPA
@@ -43,59 +38,46 @@ COPY frontend/ ./
  ENV VITE_PHYSIX_API_URL=""
  # Cache-bust marker. Bump when an SPA change isn't taking on the Space —
  # HF BuildKit occasionally reuses stage-1 output even when sources changed.
- # physix-spa-rebuild: 3
  RUN pnpm exec tsc -b \
      && pnpm exec vite build --base=/web/

  ############################
- # Stage 2: runtime (vLLM + physix server + SPA)
  ############################
- FROM vllm/vllm-openai:v0.7.3 AS runtime
-
- # vllm/vllm-openai sets ENTRYPOINT to `python3 -m vllm.entrypoints.openai.api_server`.
- # We need our own multi-process supervisor, so reset.
- ENTRYPOINT []

  ENV PYTHONUNBUFFERED=1 \
      PIP_NO_CACHE_DIR=1 \
      PIP_DISABLE_PIP_VERSION_CHECK=1 \
      HOME=/tmp/home \
-     USER=physix \
-     LOGNAME=physix \
      HF_HOME=/tmp/hf_cache \
      XDG_CACHE_HOME=/tmp/xdg-cache \
-     VLLM_CACHE_ROOT=/tmp/vllm_cache \
-     TORCH_HOME=/tmp/torch_cache \
-     TRITON_CACHE_DIR=/tmp/triton_cache \
      PORT=7860 \
      PHYSIX_HOST=0.0.0.0 \
      PHYSIX_CORS_ORIGINS=*

- # Need curl for healthchecks; the vLLM image is python-only so apt is fine.
  RUN apt-get update \
      && apt-get install -y --no-install-recommends curl \
      && rm -rf /var/lib/apt/lists/*

  WORKDIR /app

- # Physix backend deps. The vLLM image already has fastapi/uvicorn/pydantic
- # transitively (vllm depends on them), so this is the small physics stack
- # plus openenv-core.
- #
- # IMPORTANT: install with --no-build-isolation if you ever switch to a
- # package that needs torch at build time — you do NOT want pip to try
- # rebuilding torch in this image.
  RUN pip install \
      "openenv-core[core]>=0.2.2" \
      "numpy>=1.24" \
      "scipy>=1.10" \
      "sympy>=1.12" \
      "openai>=1.40" \
      "requests>=2.31"

- # Install physix as an editable package. --no-deps because we just
- # installed the runtime stack above; pyproject's deps would reinstall
- # pinned versions and likely conflict with vLLM's torch.
  COPY pyproject.toml ./
  COPY physix ./physix
  COPY README.md ./
@@ -104,33 +86,21 @@ RUN pip install --no-deps -e .
  # Built SPA from stage 1.
  COPY --from=frontend /build/dist /app/static

- # Space wrapper — mounts the React SPA at /web/, registers `/` -> `/web/`
- # redirect (OpenEnv's create_fastapi_app doesn't do this for us). Same
- # pattern as the previous CPU-only build, just kept in a real file now
- # instead of a heredoc so syntax errors are caught at build time.
  COPY scripts/space_app.py /app/_space_app.py

- # Supervisor entrypoint that boots the two vLLMs sequentially (avoids
- # the CUDA memory race we hit on the first push) then execs uvicorn.
- COPY scripts/space_entrypoint.sh /app/entrypoint.sh
- RUN chmod +x /app/entrypoint.sh
-
  # Pre-create writable dirs. HF Spaces runs containers as a non-root UID
- # with no /etc/passwd entry, so all cache paths under $HOME must exist
  # and be world-writable BEFORE the runtime user shows up.
- RUN mkdir -p \
-     "$HOME" "$HF_HOME" "$XDG_CACHE_HOME" \
-     "$VLLM_CACHE_ROOT" "$TORCH_HOME" "$TRITON_CACHE_DIR" \
-     /tmp/logs \
      && chmod -R 0777 /tmp /app

  EXPOSE 7860

  # /health is OpenEnv's stock endpoint and turns 200 once uvicorn binds.
- # We give a generous start-period because vLLM cold-load + frontend serve
- # is up to ~150 s on first boot.
- HEALTHCHECK --interval=30s --timeout=10s --start-period=240s --retries=3 \
  CMD curl -fsS "http://127.0.0.1:${PORT}/health" || exit 1

  ENV ENABLE_WEB_INTERFACE=true
- CMD ["/app/entrypoint.sh"]
 
+ # PhysiX-Live demo Space — CPU-only env + UI.
  #
+ # What this Space hosts:
  #
+ #   :7860  uvicorn _space_app:app
  #          ├─ /reset, /step              (OpenEnv stateless API)
  #          ├─ /interactive/*             (browser session API)
  #          ├─ /web/                      (built React SPA)
  #          └─ /interactive/.../llm-step  (LLM-driven episode)
  #
+ # What this Space does NOT host:
+ #   * Inference. The demo is CPU-only: no torch, no vLLM, no GPU. When
+ #     the UI calls `/interactive/.../llm-step` the server forwards to
+ #     whatever OpenAI-compatible base URL the browser handed us
+ #     (HF Router, OpenAI, Ollama, or our sister L4 Space at
+ #     `Pratyush-01/physix-infer` for the trained 3B + Qwen baseline).
  #
+ # Why a separate inference Space:
+ #   Keeps this CPU image tiny (sub-second cold-start) so the demo URL
+ #   never feels like it's stalled. The L4 Space pays GPU rates only
+ #   while it's actually serving requests; its `sleep_time=300s` shuts
+ #   it down between sessions. Two Spaces, two failure surfaces; if
+ #   inference is broken, the verifier-only demo (Custom URL, Ollama,
+ #   etc.) still works.

  ############################
  # Stage 1: build the SPA

  ENV VITE_PHYSIX_API_URL=""
  # Cache-bust marker. Bump when an SPA change isn't taking on the Space —
  # HF BuildKit occasionally reuses stage-1 output even when sources changed.
+ # physix-spa-rebuild: 4
  RUN pnpm exec tsc -b \
      && pnpm exec vite build --base=/web/

  ############################
+ # Stage 2: runtime (FastAPI + SPA)
  ############################
+ FROM python:3.11-slim AS runtime

  ENV PYTHONUNBUFFERED=1 \
      PIP_NO_CACHE_DIR=1 \
      PIP_DISABLE_PIP_VERSION_CHECK=1 \
      HOME=/tmp/home \
      HF_HOME=/tmp/hf_cache \
      XDG_CACHE_HOME=/tmp/xdg-cache \
      PORT=7860 \
      PHYSIX_HOST=0.0.0.0 \
      PHYSIX_CORS_ORIGINS=*

+ # curl for healthchecks; the slim image has neither curl nor build tools
+ # by default. Everything else (numpy, scipy, sympy) is a wheel install.
  RUN apt-get update \
      && apt-get install -y --no-install-recommends curl \
      && rm -rf /var/lib/apt/lists/*

  WORKDIR /app

+ # Pin the server-side runtime stack. NO torch / unsloth / trl here —
+ # this Space never trains and never runs a model locally.
  RUN pip install \
      "openenv-core[core]>=0.2.2" \
      "numpy>=1.24" \
      "scipy>=1.10" \
      "sympy>=1.12" \
+     "fastapi>=0.110" \
+     "uvicorn[standard]>=0.29" \
+     "pydantic>=2.5" \
      "openai>=1.40" \
      "requests>=2.31"

  COPY pyproject.toml ./
  COPY physix ./physix
  COPY README.md ./

  # Built SPA from stage 1.
  COPY --from=frontend /build/dist /app/static

+ # Space wrapper — mounts the React SPA at /web/, registers / -> /web/
+ # redirect (OpenEnv's create_fastapi_app doesn't add one for us).
  COPY scripts/space_app.py /app/_space_app.py

  # Pre-create writable dirs. HF Spaces runs containers as a non-root UID
+ # with no /etc/passwd entry, so any cache path under $HOME must exist
  # and be world-writable BEFORE the runtime user shows up.
+ RUN mkdir -p "$HOME" "$HF_HOME" "$XDG_CACHE_HOME" \
      && chmod -R 0777 /tmp /app

  EXPOSE 7860

  # /health is OpenEnv's stock endpoint and turns 200 once uvicorn binds.
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
  CMD curl -fsS "http://127.0.0.1:${PORT}/health" || exit 1

  ENV ENABLE_WEB_INTERFACE=true
+ CMD ["python3", "-m", "uvicorn", "_space_app:app", "--host", "0.0.0.0", "--port", "7860"]
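
A minimal sketch of what the forwarding described in the new header comment looks like from a client's point of view, assuming a local run of this image. The session id and any request fields beyond `base_url` / `model` / `api_key` (the fields visible on `LlmStepRequest` in the providers.py diff below) are placeholders, not the documented schema.

```python
# Hedged sketch: drive one LLM step against a local run of this CPU-only
# Space image. SESSION_ID is a hypothetical placeholder for whatever id
# the /interactive API hands back; fields beyond base_url/model/api_key
# are assumptions, not the documented request schema.
import requests

SPACE_URL = "http://127.0.0.1:7860"      # e.g. docker run -p 7860:7860 <image>
SESSION_ID = "session-id"                # hypothetical: created via /interactive/*

resp = requests.post(
    f"{SPACE_URL}/interactive/{SESSION_ID}/llm-step",
    json={
        # Any OpenAI-compatible upstream works; here, the sister GPU Space.
        "base_url": "https://pratyush-01-physix-infer.hf.space/v1",
        "model": "Pratyush-01/physix-3b-rl",
        "api_key": "",                   # sister Space is open access
    },
    timeout=180,                         # first call after sleep is slow
)
resp.raise_for_status()
print(resp.json())
```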
frontend/src/lib/llmPresets.ts CHANGED
@@ -19,17 +19,13 @@ export const OLLAMA_OPENAI_BASE_URL = "http://localhost:11434/v1";
  export const PHYSIX_MODEL_ID = "Pratyush-01/physix-3b-rl";
  export const QWEN_BASE_MODEL_ID = "Qwen/Qwen2.5-3B-Instruct";

- /** Magic base URL that tells the server "use the in-container vLLMs".
-  * The actual upstream is picked server-side based on the model field
-  * (see physix/server/providers.py::LOCAL_VLLM_PORTS). The browser
-  * never sees the inference ports; keeping them off the public URL
-  * prevents abuse of the L4 GPU.
-  *
-  * Both models live on the same Space + same L4, so flipping between
-  * the trained fine-tune and the Qwen baseline is instant once both
-  * are warm. First call after a cold-boot still costs ~90-120 s while
-  * vLLM loads weights. */
- export const LOCAL_VLLM_BASE_URL = "local://router";

  export type EndpointId = "ollama" | "hf" | "openai" | "custom" | "physix";
 
@@ -67,25 +63,23 @@ export interface Endpoint {
  export const ENDPOINTS: readonly Endpoint[] = [
    {
      id: "physix",
-     label: "PhysiX GPU (in-Space vLLM)",
-     // `local://router` is a magic value the server recognises. The
-     // browser never talks to the inference ports directly; the
-     // physix backend dispatches to the right vLLM by `model` field.
-     // See physix/server/providers.py::LOCAL_VLLM_PORTS.
-     baseUrl: LOCAL_VLLM_BASE_URL,
      needsKey: false,
      modelInputMode: "freeform-with-suggestions",
-     // Both models live on the same in-Space L4 vLLM. First entry
-     // pre-fills, so the default comparison is "trained vs base" with
-     // identical hardware / generation params — only the weights differ.
      modelSuggestions: [
        { id: PHYSIX_MODEL_ID, tag: "trained ✦" },
        { id: QWEN_BASE_MODEL_ID, tag: "base (apples-to-apples)" },
      ],
      hint:
-       "Both 3B models hosted on the Space's own L4 GPU via vLLM. No token. " +
-       "Space sleeps after 5 min idle; first call after sleep is ~90-120 s " +
-       "while weights load; subsequent calls are fast.",
    },
    {
      id: "ollama",
@@ -186,23 +180,22 @@ export interface LlmConnection {
    apiKey: string;
  }

- /** Default A side: trained PhysiX-3B served by the in-Space L4 vLLM.
-  * No token needed; first call after sleep is ~90-120 s while weights
-  * load, then fast. */
  export const DEFAULT_CONNECTION_A: LlmConnection = {
    endpointId: "physix",
-   baseUrl: LOCAL_VLLM_BASE_URL,
    model: PHYSIX_MODEL_ID,
    apiKey: "",
  };

- /** Default B side: the same in-Space vLLM, pointed at Qwen 2.5 3B.
-  * Apples-to-apples — identical architecture, identical hardware,
-  * identical generation params; only the weights differ. Both models
-  * share the same GPU, so warming side A also warms side B. */
  export const DEFAULT_CONNECTION_B: LlmConnection = {
    endpointId: "physix",
-   baseUrl: LOCAL_VLLM_BASE_URL,
    model: QWEN_BASE_MODEL_ID,
    apiKey: "",
  };
 
  export const PHYSIX_MODEL_ID = "Pratyush-01/physix-3b-rl";
  export const QWEN_BASE_MODEL_ID = "Qwen/Qwen2.5-3B-Instruct";

+ /** Sister GPU Space that hosts both the trained PhysiX-3B and the Qwen
+  * 2.5 3B baseline behind a single OpenAI-compatible URL. Open access
+  * (no token); routing on the `model` field happens inside the proxy.
+  * Sleeps after 5 min idle, so the first call after sleep is ~90-120 s
+  * while vLLM warms up; subsequent calls are fast. */
+ export const PHYSIX_INFER_BASE_URL =
+   "https://pratyush-01-physix-infer.hf.space/v1";

  export type EndpointId = "ollama" | "hf" | "openai" | "custom" | "physix";

  export const ENDPOINTS: readonly Endpoint[] = [
    {
      id: "physix",
+     label: "PhysiX-Infer GPU ✦",
+     // Sister L4 Space hosting both checkpoints behind one URL; the
+     // proxy there picks the right vLLM based on the `model` field.
+     baseUrl: PHYSIX_INFER_BASE_URL,
      needsKey: false,
      modelInputMode: "freeform-with-suggestions",
+     // First entry pre-fills, so the default comparison is "trained vs
+     // base" with identical hardware / generation params; only the
+     // weights differ.
      modelSuggestions: [
        { id: PHYSIX_MODEL_ID, tag: "trained ✦" },
        { id: QWEN_BASE_MODEL_ID, tag: "base (apples-to-apples)" },
      ],
      hint:
+       "Both 3B models on a sister L4 Space: no token, no key. The Space " +
+       "sleeps after 5 min idle, so the first call after sleep is ~90-120 s " +
+       "while vLLM loads weights; subsequent calls are fast.",
    },
    {
      id: "ollama",

    apiKey: string;
  }

+ /** Default A side: trained PhysiX-3B on the sister GPU Space.
+  * No token needed; first call after sleep is ~90-120 s, then fast. */
  export const DEFAULT_CONNECTION_A: LlmConnection = {
    endpointId: "physix",
+   baseUrl: PHYSIX_INFER_BASE_URL,
    model: PHYSIX_MODEL_ID,
    apiKey: "",
  };

+ /** Default B side: same sister Space, same L4 GPU, just the Qwen 2.5
+  * 3B baseline. Apples-to-apples — identical architecture, identical
+  * hardware, identical generation params; only the weights differ.
+  * Both models share the same Space, so warming side A also warms B. */
  export const DEFAULT_CONNECTION_B: LlmConnection = {
    endpointId: "physix",
+   baseUrl: PHYSIX_INFER_BASE_URL,
    model: QWEN_BASE_MODEL_ID,
    apiKey: "",
  };
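
The default A/B comparison these presets encode can be reproduced outside the SPA. A minimal sketch, assuming the sister Space's proxy really is OpenAI-compatible and accepts any non-empty API key; the prompt is illustrative only.

```python
# Hedged sketch: call the sister GPU Space directly with the OpenAI SDK.
# Same request, two checkpoints -> the "trained vs base" comparison the
# DEFAULT_CONNECTION_A/B presets set up.
from openai import OpenAI

client = OpenAI(
    base_url="https://pratyush-01-physix-infer.hf.space/v1",
    api_key="none",  # open access; the SDK just wants a non-empty string
)

for model in ("Pratyush-01/physix-3b-rl", "Qwen/Qwen2.5-3B-Instruct"):
    out = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": "State Newton's second law."}],
        max_tokens=64,
    )
    print(model, "->", out.choices[0].message.content)
```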
physix/server/providers.py CHANGED
@@ -30,46 +30,10 @@ _log = logging.getLogger(__name__)
  HF_ROUTER_BASE_URL = "https://router.huggingface.co/v1"
  OPENAI_BASE_URL = "https://api.openai.com/v1"
  OLLAMA_OPENAI_BASE_URL = "http://localhost:11434/v1"
-
- # Magic base-URL scheme. The browser sends ``local://router`` for both
- # in-container models; the server then picks the right localhost vLLM
- # based on the ``model`` field of the request. This keeps the public
- # Space URL from exposing the raw inference ports (so visitors can't
- # bypass the demo to run free GPU calls), and lets the user flip
- # between trained / baseline without changing the connection URL.
- #
- # Ports MUST match what the Space's entrypoint.sh launches — the two
- # are pinned to the same constants here for symmetry.
- _LOCAL_VLLM_SCHEME = "local://"
- LOCAL_VLLM_PORTS: dict[str, int] = {
-     # model id -> localhost port served by entrypoint.sh
-     "Qwen/Qwen2.5-3B-Instruct": 8001,
-     "Pratyush-01/physix-3b-rl": 8002,
- }
-
-
- def _maybe_rewrite_local_url(base_url: str, model: str) -> str:
-     """If the URL uses the magic ``local://`` scheme, swap in the real
-     localhost target keyed by ``model``. Returns the URL unchanged
-     otherwise.
-
-     Raises HTTPException(400) when the ``model`` isn't one of the in-
-     container vLLMs — without this a typo silently falls through to
-     OpenAI's SDK and surfaces as a generic 502.
-     """
-
-     if not base_url.startswith(_LOCAL_VLLM_SCHEME):
-         return base_url
-     port = LOCAL_VLLM_PORTS.get(model)
-     if port is None:
-         raise HTTPException(
-             status_code=400,
-             detail=(
-                 f"Model {model!r} is not hosted by the in-container vLLMs. "
-                 f"Available: {sorted(LOCAL_VLLM_PORTS)}."
-             ),
-         )
-     return f"http://127.0.0.1:{port}/v1"


  class LlmStepRequest(BaseModel):
@@ -128,9 +92,10 @@ def resolve_api_key(request: LlmStepRequest) -> Optional[str]:
      return request.api_key

  base_url = (request.base_url or "").lower()
- # `local://*` targets are the in-container vLLMs; no auth.
- if base_url.startswith(_LOCAL_VLLM_SCHEME):
-     return "local"
  if "huggingface" in base_url:
      return os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_API_KEY")
  if "openai.com" in base_url:
@@ -185,12 +150,8 @@ def default_openai_compat_policy_factory(request: LlmStepRequest) -> LlmPolicy:
      ) from exc

  api_key = resolve_api_key(request)
- # Resolve `local://*` -> the actual localhost vLLM URL. Keeps the
- # OpenAI client unaware of the indirection; auth/headers just flow
- # through normally.
- resolved_base_url = _maybe_rewrite_local_url(request.base_url, request.model)
  client = OpenAI(
-     base_url=resolved_base_url,
      api_key=api_key or "missing",
      timeout=request.request_timeout_s,
      # Identifies us to providers that rate-limit by UA. Cheap to
 
  HF_ROUTER_BASE_URL = "https://router.huggingface.co/v1"
  OPENAI_BASE_URL = "https://api.openai.com/v1"
  OLLAMA_OPENAI_BASE_URL = "http://localhost:11434/v1"
+ # Sister GPU Space hosting both the trained PhysiX-3B and the Qwen 2.5 3B
+ # baseline. Open access (no key); sleeps after 5 min idle. See the
+ # physix-infer/ directory in the repo for the Dockerfile + proxy code.
+ PHYSIX_INFER_BASE_URL = "https://pratyush-01-physix-infer.hf.space/v1"


  class LlmStepRequest(BaseModel):

      return request.api_key

  base_url = (request.base_url or "").lower()
+ # The PhysiX-Infer sister Space serves Qwen + the trained 3B with no
+ # auth — it's open-access by design (rate-limited only by sleep).
+ if "physix-infer" in base_url:
+     return "physix-infer"
  if "huggingface" in base_url:
      return os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_API_KEY")
  if "openai.com" in base_url:

  ) from exc

  api_key = resolve_api_key(request)
  client = OpenAI(
+     base_url=request.base_url,
      api_key=api_key or "missing",
      timeout=request.request_timeout_s,
      # Identifies us to providers that rate-limit by UA. Cheap to
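
A self-contained sketch of the key-resolution order the hunks above imply (explicit key first, then the physix-infer sentinel, then env-var lookups). The dataclass is a hypothetical stand-in for `LlmStepRequest` using only the fields visible in this diff, and the `OPENAI_API_KEY` lookup for the `openai.com` branch is an assumption.

```python
# Hedged sketch of resolve_api_key's routing, not the actual module.
import os
from dataclasses import dataclass
from typing import Optional


@dataclass
class StepRequestStub:
    # Stand-in for LlmStepRequest: only the fields shown in the diff.
    base_url: str
    model: str
    api_key: str = ""


def resolve_api_key_sketch(req: StepRequestStub) -> Optional[str]:
    if req.api_key:                    # an explicit key always wins
        return req.api_key
    base_url = (req.base_url or "").lower()
    if "physix-infer" in base_url:     # sister Space: open access, sentinel value
        return "physix-infer"
    if "huggingface" in base_url:      # HF Router: use the Space's own token
        return os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_API_KEY")
    if "openai.com" in base_url:       # assumed env var, mirroring the pattern above
        return os.environ.get("OPENAI_API_KEY")
    return None                        # Ollama / custom URLs: no key needed


req = StepRequestStub(
    base_url="https://pratyush-01-physix-infer.hf.space/v1",
    model="Pratyush-01/physix-3b-rl",
)
print(resolve_api_key_sketch(req))  # -> "physix-infer"
```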