Pratyush-01 committed (verified)
Commit d2b2154 · 1 Parent(s): 08f8699

Upload folder using huggingface_hub
Dockerfile CHANGED
@@ -1,16 +1,32 @@
- # PhysiX-Live env Space — FastAPI server + built React UI on port 7860.
  #
- # Two-stage build:
- #   1. node:20 builds the Vite/React frontend into frontend/dist with
- #      same-origin API base URL (VITE_PHYSIX_API_URL=""), so the SPA
- #      fetches /interactive/* relative to the Space's own host.
- #   2. python:3.11-slim installs physix as an editable package and serves
- #      both the FastAPI routes (/reset, /step, /interactive/*) AND the
- #      built SPA as static assets from a single uvicorn process.
  #
- # We deliberately do NOT bundle the training stack (torch/unsloth/trl);
- # this is the env Space, not the training Space. Training lives in
- # `train/` and runs on HF Jobs; see train/README.md.

  ############################
  # Stage 1: build the SPA
@@ -19,35 +35,27 @@ FROM node:20-alpine AS frontend
  WORKDIR /build
  RUN corepack enable

- # Install deps separately from sources so layer cache survives source edits.
  COPY frontend/package.json frontend/pnpm-lock.yaml ./
  RUN pnpm install --frozen-lockfile --silent

  COPY frontend/ ./
  # Same-origin API fetches (relative paths). The Space serves both API and UI.
  ENV VITE_PHYSIX_API_URL=""
- # Cache-bust: HF Spaces' BuildKit occasionally reuses the previous
- # image's stage-1 output even when frontend/ source changed (the layer
- # hash is keyed on more than just file content). Bumping this comment
- # is the documented workaround — increment when you push a UI change
- # and the Space is still serving the previous SPA bundle hash.
- # physix-spa-rebuild: 2
- # Run typecheck + vite build separately so we can pass --base=/web/ to
- # vite without it landing on tsc. The SPA is mounted at /web/ in the
- # Space and the redirect from / -> /web/ is registered by the wrapper
- # in _space_app.py below. All asset URLs in the built index.html
- # include the /web/ prefix.
  RUN pnpm exec tsc -b \
      && pnpm exec vite build --base=/web/

  ############################
- # Stage 2: runtime
  ############################
- FROM python:3.11-slim AS runtime

- # HF Spaces convention: write everything under /tmp (only writable path
- # at runtime). The container also runs as UID 1000 with no /etc/passwd
- # entry, so set USER/HOME so getpass.getuser() and pathlib don't crash.
  ENV PYTHONUNBUFFERED=1 \
      PIP_NO_CACHE_DIR=1 \
      PIP_DISABLE_PIP_VERSION_CHECK=1 \
@@ -56,103 +64,73 @@ ENV PYTHONUNBUFFERED=1 \
      LOGNAME=physix \
      HF_HOME=/tmp/hf_cache \
      XDG_CACHE_HOME=/tmp/xdg-cache \
      PORT=7860 \
      PHYSIX_HOST=0.0.0.0 \
      PHYSIX_CORS_ORIGINS=*

- WORKDIR /app
-
- # System deps: build-essential briefly for any pip wheels that need it
- # (sympy/scipy ship wheels for linux_x86_64 so this is mostly a safety
- # net), curl for the healthcheck.
  RUN apt-get update \
      && apt-get install -y --no-install-recommends curl \
      && rm -rf /var/lib/apt/lists/*

- # Install python deps separately from sources for cache friendliness.
- COPY pyproject.toml ./
- RUN pip install --upgrade pip \
-     && pip install \
      "openenv-core[core]>=0.2.2" \
      "numpy>=1.24" \
      "scipy>=1.10" \
      "sympy>=1.12" \
-     "fastapi>=0.110" \
-     "uvicorn>=0.29" \
-     "pydantic>=2.5" \
      "requests>=2.31"

- # Install physix (no-deps so we don't re-resolve the stack we just installed).
  COPY physix ./physix
  COPY README.md ./
  RUN pip install --no-deps -e .

  COPY --from=frontend /build/dist /app/static

- # Space wrapper: re-exports physix.server.app:app with the React SPA
- # mounted at /web/ (overriding OpenEnv's default Gradio UI). The base
- # /health endpoint provided by OpenEnv is reused for the Docker
- # HEALTHCHECK below; we don't need to add a custom one.
- #
- # Kept in the Dockerfile rather than physix/server/app.py so the package
- # stays UI-free for the cloud training jobs that import it.
- RUN cat > /app/_space_app.py <<'PY'
- """Space entrypoint: physix.server.app:app + static UI mount."""
-
- from pathlib import Path
-
- from fastapi.responses import RedirectResponse
- from fastapi.staticfiles import StaticFiles
-
- from physix.server.app import app
-
- _STATIC_DIR = Path("/app/static")
-
-
- # OpenEnv's `create_fastapi_app` (which physix.server.app uses directly)
- # does NOT register a `/` redirect — that's only added by the higher-level
- # `create_web_interface_app` wrapper, which we deliberately don't use
- # because it would mount Gradio at /web and clobber our React SPA. So the
- # redirects have to live here, otherwise:
- #
- # * `https://<space>.hf.space/` -> 404
- # * `https://<space>.hf.space/web` -> 404 (no trailing slash)
- #
- # Both are landing pages users hit (the bare URL is the canonical Space
- # link the HF UI exposes), so without these the Space appears blank.
- @app.get("/", include_in_schema=False)
- async def _root_redirect() -> RedirectResponse:
-     return RedirectResponse(url="/web/")
-
-
- @app.get("/web", include_in_schema=False)
- async def _web_no_slash_redirect() -> RedirectResponse:
-     return RedirectResponse(url="/web/")
-
-
- if _STATIC_DIR.is_dir():
-     # html=True makes StaticFiles serve index.html for directory hits and
-     # fall back to it for unknown sub-paths (so client-side React routing
-     # works). Mounted last so registered API routes (/web/metadata,
-     # /web/reset, /web/step from OpenEnv; /interactive/* from physix)
-     # always win. The vite build was run with --base=/web/ so asset
-     # URLs in index.html already include the prefix.
-     app.mount(
-         "/web",
-         StaticFiles(directory=str(_STATIC_DIR), html=True),
-         name="ui",
-     )
- PY
-
- # Pre-create writable dirs so the first request doesn't crash on a
- # missing cache path.
- RUN mkdir -p "$HOME" "$HF_HOME" "$XDG_CACHE_HOME" \
-     && chmod -R 0777 /tmp/home /tmp/hf_cache /tmp/xdg-cache /app

  EXPOSE 7860

- HEALTHCHECK --interval=30s --timeout=5s --start-period=20s --retries=3 \
  CMD curl -fsS "http://127.0.0.1:${PORT}/health" || exit 1

  ENV ENABLE_WEB_INTERFACE=true
- CMD ["sh", "-c", "uvicorn _space_app:app --host 0.0.0.0 --port ${PORT:-7860} --log-level info"]
 
+ # PhysiX-Live Space — combined env + UI + dual-model GPU inference.
  #
+ # Single L4 Space hosts EVERYTHING the demo needs:
  #
+ #   :8001  vllm serve Qwen/Qwen2.5-3B-Instruct  (--gpu-memory-util 0.40)
+ #   :8002  vllm serve Pratyush-01/physix-3b-rl  (--gpu-memory-util 0.40)
+ #   :7860  uvicorn physix.server.app:app
+ #          ├─ /reset, /step      (OpenEnv stateless API)
+ #          ├─ /interactive/*     (browser session API)
+ #          ├─ /web/              (built React SPA)
+ #          └─ /interactive/.../llm-step  (LLM-driven episode)
+ #             └─ when base_url=local://router, dispatches by `model`
+ #                to one of the localhost vLLMs above. The browser
+ #                never sees those ports — that prevents anyone from
+ #                bypassing the demo to run free GPU calls.
+ #
+ # Why one Space, not two:
+ #   * Same sleep timer — when the demo sleeps, the GPU sleeps. No "demo
+ #     is awake but inference is asleep" UX gap.
+ #   * No CORS between SPA and inference — same origin.
+ #   * One thing to babysit, one URL to share.
+ #
+ # Why vllm/vllm-openai as the base, not nvidia/cuda:
+ #   * vLLM ships pre-compiled CUDA kernels for a specific cu+torch combo.
+ #     Building from scratch on cu12.4 means recompiling vLLM (~20 min,
+ #     fragile across minor versions). The official image guarantees the
+ #     ABI is right out of the box.
+ #   * Already includes Python 3.12, torch, nccl, and the FastAPI
+ #     dependencies — we just layer the physix app + frontend on top.

  ############################
  # Stage 1: build the SPA
  ############################
  FROM node:20-alpine AS frontend
  WORKDIR /build
  RUN corepack enable

  COPY frontend/package.json frontend/pnpm-lock.yaml ./
  RUN pnpm install --frozen-lockfile --silent

  COPY frontend/ ./
  # Same-origin API fetches (relative paths). The Space serves both API and UI.
  ENV VITE_PHYSIX_API_URL=""
+ # Cache-bust marker. Bump when an SPA change isn't taking on the Space —
+ # HF BuildKit occasionally reuses stage-1 output even when sources changed.
+ # physix-spa-rebuild: 3
  RUN pnpm exec tsc -b \
      && pnpm exec vite build --base=/web/

  ############################
+ # Stage 2: runtime (vLLM + physix server + SPA)
  ############################
+ FROM vllm/vllm-openai:v0.7.3 AS runtime
+
+ # vllm/vllm-openai sets ENTRYPOINT to `python3 -m vllm.entrypoints.openai.api_server`.
+ # We need our own multi-process supervisor, so reset it.
+ ENTRYPOINT []

  ENV PYTHONUNBUFFERED=1 \
      PIP_NO_CACHE_DIR=1 \
      PIP_DISABLE_PIP_VERSION_CHECK=1 \
      LOGNAME=physix \
      HF_HOME=/tmp/hf_cache \
      XDG_CACHE_HOME=/tmp/xdg-cache \
+     VLLM_CACHE_ROOT=/tmp/vllm_cache \
+     TORCH_HOME=/tmp/torch_cache \
+     TRITON_CACHE_DIR=/tmp/triton_cache \
      PORT=7860 \
      PHYSIX_HOST=0.0.0.0 \
      PHYSIX_CORS_ORIGINS=*

+ # Need curl for healthchecks; the vLLM image is Python-only, so apt is fine.
  RUN apt-get update \
      && apt-get install -y --no-install-recommends curl \
      && rm -rf /var/lib/apt/lists/*

+ WORKDIR /app
+
+ # Physix backend deps. The vLLM image already has fastapi/uvicorn/pydantic
+ # transitively (vllm depends on them), so this is just the small physics
+ # stack plus openenv-core.
+ #
+ # IMPORTANT: install with --no-build-isolation if you ever switch to a
+ # package that needs torch at build time — you do NOT want pip to try
+ # rebuilding torch in this image.
+ RUN pip install \
      "openenv-core[core]>=0.2.2" \
      "numpy>=1.24" \
      "scipy>=1.10" \
      "sympy>=1.12" \
+     "openai>=1.40" \
      "requests>=2.31"

+ # Install physix as an editable package. --no-deps because we just
+ # installed the runtime stack above; pyproject's deps would reinstall
+ # pinned versions and likely conflict with vLLM's torch.
+ COPY pyproject.toml ./
  COPY physix ./physix
  COPY README.md ./
  RUN pip install --no-deps -e .

+ # Built SPA from stage 1.
  COPY --from=frontend /build/dist /app/static

+ # Space wrapper: mounts the React SPA at /web/ and registers the `/` -> `/web/`
+ # redirect (OpenEnv's create_fastapi_app doesn't do this for us). Same
+ # pattern as the previous CPU-only build, just kept in a real file now
+ # instead of a heredoc so syntax errors are caught at build time.
+ COPY scripts/space_app.py /app/_space_app.py
+
+ # Supervisor entrypoint that boots the two vLLMs sequentially (avoids
+ # the CUDA memory race we hit on the first push), then execs uvicorn.
+ COPY scripts/space_entrypoint.sh /app/entrypoint.sh
+ RUN chmod +x /app/entrypoint.sh
+
+ # Pre-create writable dirs. HF Spaces runs containers as a non-root UID
+ # with no /etc/passwd entry, so all cache paths under $HOME must exist
+ # and be world-writable BEFORE the runtime user shows up.
+ RUN mkdir -p \
      "$HOME" "$HF_HOME" "$XDG_CACHE_HOME" \
      "$VLLM_CACHE_ROOT" "$TORCH_HOME" "$TRITON_CACHE_DIR" \
      /tmp/logs \
      && chmod -R 0777 /tmp /app

  EXPOSE 7860

+ # /health is OpenEnv's stock endpoint and returns 200 once uvicorn binds.
+ # We give a generous start-period because vLLM cold-load + frontend serve
+ # can take up to ~150 s on first boot.
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=240s --retries=3 \
  CMD curl -fsS "http://127.0.0.1:${PORT}/health" || exit 1

  ENV ENABLE_WEB_INTERFACE=true
+ CMD ["/app/entrypoint.sh"]
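
A minimal sketch of the `local://router` dispatch described in the header comment, for orientation. Only the LOCAL_VLLM_PORTS name and the two model/port pairs appear in this commit (physix/server/providers.py is referenced but not shown); the function and its signature are illustrative assumptions, not the actual implementation:

# Hypothetical sketch; names other than LOCAL_VLLM_PORTS are assumptions.
LOCAL_VLLM_PORTS = {
    "Qwen/Qwen2.5-3B-Instruct": 8001,
    "Pratyush-01/physix-3b-rl": 8002,
}

def resolve_base_url(base_url: str, model: str) -> str:
    """Rewrite the magic base URL to the in-container vLLM for `model`."""
    if base_url != "local://router":
        return base_url  # ordinary remote endpoint; pass through untouched
    if model not in LOCAL_VLLM_PORTS:
        raise ValueError(f"unknown local model: {model!r}")
    # Loopback only: these ports are never exposed on the Space's public URL.
    return f"http://127.0.0.1:{LOCAL_VLLM_PORTS[model]}/v1"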
frontend/src/lib/llmPresets.ts CHANGED
@@ -19,13 +19,17 @@ export const OLLAMA_OPENAI_BASE_URL = "http://localhost:11434/v1";
  export const PHYSIX_MODEL_ID = "Pratyush-01/physix-3b-rl";
  export const QWEN_BASE_MODEL_ID = "Qwen/Qwen2.5-3B-Instruct";

- /** Magic in-container vLLM target. The browser sends `local://<name>`
-  * in `LlmStepRequest.base_url`; the server rewrites it to the right
-  * localhost vLLM (see physix/server/providers.py::LOCAL_VLLM_ENDPOINTS).
-  * Two name buckets: "qwen" -> Qwen2.5-3B-Instruct, "physix" -> physix-3b-rl.
-  * Both are served by the same Space, on the same L4, so swapping
-  * models doesn't pay a cold-start. */
- export const LOCAL_VLLM_BASE_URL = "local://router"; // model field picks the upstream

  export type EndpointId = "ollama" | "hf" | "openai" | "custom" | "physix";

@@ -63,25 +67,25 @@ export interface Endpoint {
  export const ENDPOINTS: readonly Endpoint[] = [
    {
      id: "physix",
-     label: "PhysiX-Infer GPU (both 3B models ✦)",
-     baseUrl: PHYSIX_INFER_BASE_URL,
-     // No auth: the Space is open access, bounded by its sleep timer.
-     // Setting needsKey:false keeps the API-key field dimmed by default;
-     // power users can still type one if they put auth in front of a
-     // forked deployment.
      needsKey: false,
      modelInputMode: "freeform-with-suggestions",
-     // Both models are served by the same Space. First entry pre-fills,
-     // and is the trained model, so the comparison story is "trained vs
-     // base" with one click.
      modelSuggestions: [
        { id: PHYSIX_MODEL_ID, tag: "trained ✦" },
        { id: QWEN_BASE_MODEL_ID, tag: "base (apples-to-apples)" },
      ],
      hint:
-       "Dedicated L4 Space hosting both physix-3b-rl and Qwen2.5-3B-Instruct " +
-       "via vLLM. No token needed. Sleeps after 5 min idle — first call after " +
-       "sleep is ~90-120 s while both models load; subsequent calls are fast.",
    },
    {
      id: "ollama",
@@ -182,24 +186,23 @@ export interface LlmConnection {
    apiKey: string;
  }

- /** Default A side: trained PhysiX-3B via the dedicated GPU Space. No
-  * token needed; first call may take ~90-120 s while the Space wakes
-  * from sleep, but subsequent calls run on a hot L4. */
  export const DEFAULT_CONNECTION_A: LlmConnection = {
    endpointId: "physix",
-   baseUrl: PHYSIX_INFER_BASE_URL,
    model: PHYSIX_MODEL_ID,
    apiKey: "",
  };

- /** Default B side: the same GPU Space but pointed at the Qwen 2.5 3B
-  * base model. Apples-to-apples comparison: identical architecture,
-  * identical hardware, identical generation params; only the weights
-  * differ. Same Space means second-side wake doesn't add cold-start
-  * cost (the L4 is already warm from side A). */
  export const DEFAULT_CONNECTION_B: LlmConnection = {
    endpointId: "physix",
-   baseUrl: PHYSIX_INFER_BASE_URL,
    model: QWEN_BASE_MODEL_ID,
    apiKey: "",
  };

  export const PHYSIX_MODEL_ID = "Pratyush-01/physix-3b-rl";
  export const QWEN_BASE_MODEL_ID = "Qwen/Qwen2.5-3B-Instruct";

+ /** Magic base URL that tells the server "use the in-container vLLMs".
+  * The actual upstream is picked server-side based on the model field
+  * (see physix/server/providers.py::LOCAL_VLLM_PORTS). The browser
+  * never sees the inference ports; keeping them off the public URL
+  * prevents abuse of the L4 GPU.
+  *
+  * Both models live on the same Space + same L4, so flipping between
+  * the trained fine-tune and the Qwen baseline is instant once both
+  * are warm. First call after a cold boot still costs ~90-120 s while
+  * vLLM loads weights. */
+ export const LOCAL_VLLM_BASE_URL = "local://router";

  export type EndpointId = "ollama" | "hf" | "openai" | "custom" | "physix";

  export const ENDPOINTS: readonly Endpoint[] = [
    {
      id: "physix",
+     label: "PhysiX GPU (in-Space vLLM ✦)",
+     // `local://router` is a magic value the server recognises. The
+     // browser never talks to the inference ports directly; the
+     // physix backend dispatches to the right vLLM by the `model` field.
+     // See physix/server/providers.py::LOCAL_VLLM_PORTS.
+     baseUrl: LOCAL_VLLM_BASE_URL,
      needsKey: false,
      modelInputMode: "freeform-with-suggestions",
+     // Both models live on the same in-Space L4 vLLM. First entry
+     // pre-fills, so the default comparison is "trained vs base" with
+     // identical hardware / generation params — only the weights differ.
      modelSuggestions: [
        { id: PHYSIX_MODEL_ID, tag: "trained ✦" },
        { id: QWEN_BASE_MODEL_ID, tag: "base (apples-to-apples)" },
      ],
      hint:
+       "Both 3B models hosted on the Space's own L4 GPU via vLLM. No token. " +
+       "Space sleeps after 5 min idle — first call after sleep is ~90-120 s " +
+       "while weights load; subsequent calls are fast.",
    },
    {
      id: "ollama",

    apiKey: string;
  }

+ /** Default A side: trained PhysiX-3B served by the in-Space L4 vLLM.
+  * No token needed; first call after sleep is ~90-120 s while weights
+  * load, then fast. */
  export const DEFAULT_CONNECTION_A: LlmConnection = {
    endpointId: "physix",
+   baseUrl: LOCAL_VLLM_BASE_URL,
    model: PHYSIX_MODEL_ID,
    apiKey: "",
  };

+ /** Default B side: the same in-Space vLLM, pointed at Qwen 2.5 3B.
+  * Apples-to-apples: identical architecture, identical hardware,
+  * identical generation params; only the weights differ. Both models
+  * share the same GPU, so warming side A also warms side B. */
  export const DEFAULT_CONNECTION_B: LlmConnection = {
    endpointId: "physix",
+   baseUrl: LOCAL_VLLM_BASE_URL,
    model: QWEN_BASE_MODEL_ID,
    apiKey: "",
  };
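
For orientation, roughly what the SPA ends up sending when the "physix" endpoint is selected, sketched in Python with requests. The llm-step path is elided in this commit ("/interactive/.../llm-step" in the Dockerfile comment) and the LlmStepRequest field names are not shown, so the URL shape and payload keys below are placeholders and assumptions:

import requests

SPACE_URL = "https://your-space.hf.space"  # placeholder, not from this commit
SESSION_ID = "demo"                        # placeholder; real ids come from the session API

payload = {
    "base_url": "local://router",          # magic value; server picks the upstream
    "model": "Pratyush-01/physix-3b-rl",   # or "Qwen/Qwen2.5-3B-Instruct"
    "api_key": "",                         # needsKey: false for this endpoint
}
resp = requests.post(
    f"{SPACE_URL}/interactive/{SESSION_ID}/llm-step",  # hypothetical path shape
    json=payload,
    timeout=180,  # first call after sleep can take ~90-120 s while weights load
)
resp.raise_for_status()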
scripts/space_app.py ADDED
@@ -0,0 +1,55 @@
+ """Space entrypoint: physix.server.app:app + static UI mount.
+
+ Imported at runtime by the Dockerfile's CMD via ``uvicorn _space_app:app``.
+
+ What this wrapper adds on top of ``physix.server.app:app``:
+
+ 1. ``GET /``    -> 307 to ``/web/`` (so the bare Space URL
+    doesn't 404 — OpenEnv's ``create_fastapi_app``
+    does NOT add a root redirect; that's only in
+    the higher-level wrapper, which mounts Gradio
+    at ``/web`` and would clobber our React SPA).
+ 2. ``GET /web`` -> 307 to ``/web/`` (same reason; users hit the
+    no-trailing-slash variant from outside links).
+ 3. ``StaticFiles`` mount at ``/web/`` serving the built Vite SPA. The
+    vite build was run with ``--base=/web/`` so all asset URLs in the
+    emitted ``index.html`` already include the prefix.
+
+ Kept as a real .py file (not a heredoc inside the Dockerfile) so any
+ syntax error is caught by the build's static analysis rather than at
+ runtime — this saved several deploy-fail loops in earlier iterations.
+ """
+
+ from __future__ import annotations
+
+ from pathlib import Path
+
+ from fastapi.responses import RedirectResponse
+ from fastapi.staticfiles import StaticFiles
+
+ from physix.server.app import app
+
+ _STATIC_DIR = Path("/app/static")
+
+
+ @app.get("/", include_in_schema=False)
+ async def _root_redirect() -> RedirectResponse:
+     return RedirectResponse(url="/web/")
+
+
+ @app.get("/web", include_in_schema=False)
+ async def _web_no_slash_redirect() -> RedirectResponse:
+     return RedirectResponse(url="/web/")
+
+
+ if _STATIC_DIR.is_dir():
+     # html=True makes StaticFiles serve index.html for directory hits and
+     # fall back to it for unknown sub-paths (so client-side React routing
+     # works). Mounted last so registered API routes (/web/metadata,
+     # /web/reset, /web/step from OpenEnv; /interactive/* from physix)
+     # always win over the static handler.
+     app.mount(
+         "/web",
+         StaticFiles(directory=str(_STATIC_DIR), html=True),
+         name="ui",
+     )
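
A quick local sanity check of the wrapper above, sketched with FastAPI's TestClient. Not part of this commit; it assumes the physix package is importable (the static mount is conditional, so /app/static need not exist):

from fastapi.testclient import TestClient

from _space_app import app  # the wrapper module above

client = TestClient(app)

# Both the bare URL and /web (no trailing slash) should redirect to /web/
# rather than 404. RedirectResponse defaults to 307 Temporary Redirect.
for path in ("/", "/web"):
    r = client.get(path, follow_redirects=False)
    assert r.status_code == 307, (path, r.status_code)
    assert r.headers["location"] == "/web/"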
scripts/space_entrypoint.sh ADDED
@@ -0,0 +1,103 @@
+ #!/usr/bin/env bash
+ # Boot the two in-Space vLLMs SEQUENTIALLY, then exec uvicorn for the
+ # physix FastAPI server (which serves the API + the React SPA).
+ #
+ # Why sequential, not parallel:
+ # The first deploy attempt booted both vLLMs in parallel and the second
+ # one died with "No available memory for the cache blocks." Reason: vLLM
+ # reads `nvidia-smi`-style free memory at startup and reserves
+ # `--gpu-memory-utilization * (free at this moment)` worth of VRAM.
+ # When two processes start simultaneously, both see "all 24 GB free" and
+ # both try to grab ~10 GB; the second one to finalize loses. Booting
+ # sequentially makes the second one observe the post-first-process free
+ # memory, so its allocation is sized correctly.
+ #
+ # Why --gpu-memory-utilization 0.40 each (= 80% total):
+ # On L4 (24 GB), 40% = ~9.6 GB per process. Qwen2.5-3B fp16 weights are
+ # ~6.2 GB; that leaves ~3.4 GB per process for KV cache + activations,
+ # which sustains max_model_len=4096 with comfortable margin. The 20%
+ # reserve covers CUDA workspace + uvicorn + Python heap. Pushing this
+ # much higher (e.g. 0.45 each) is what failed on the first deploy,
+ # because once you account for the ~600 MB CUDA context + the second
+ # process's overhead, weights+KV no longer fit.
+
+ set -euo pipefail
+
+ QWEN_MODEL="${QWEN_MODEL:-Qwen/Qwen2.5-3B-Instruct}"
+ PHYSIX_MODEL="${PHYSIX_MODEL:-Pratyush-01/physix-3b-rl}"
+ QWEN_GPU_FRAC="${QWEN_GPU_FRAC:-0.40}"
+ PHYSIX_GPU_FRAC="${PHYSIX_GPU_FRAC:-0.40}"
+ MAX_LEN="${MAX_LEN:-4096}"
+
+ LOG_DIR=/tmp/logs
+ mkdir -p "$LOG_DIR"
+
+ # Forward signals so HF's "Pause" / "Restart" actually shuts everything
+ # down cleanly — otherwise CUDA memory leaks across container restarts.
+ PIDS=()
+ cleanup() {
+   echo "[entrypoint] SIGTERM/SIGINT — killing children: ${PIDS[*]:-}" >&2
+   for pid in "${PIDS[@]:-}"; do
+     kill -TERM "$pid" 2>/dev/null || true
+   done
+   wait || true
+   exit 0
+ }
+ trap cleanup TERM INT
+
+ wait_healthy() {
+   local name="$1" port="$2" pid="$3" budget="${4:-300}"
+   local deadline=$((SECONDS + budget))
+   while (( SECONDS < deadline )); do
+     if ! kill -0 "$pid" 2>/dev/null; then
+       echo "[entrypoint] FATAL: $name (pid $pid) died during boot. Tail of log:" >&2
+       tail -n 80 "$LOG_DIR/${name}.log" >&2 || true
+       return 1
+     fi
+     if curl -fsS "http://127.0.0.1:${port}/health" >/dev/null 2>&1; then
+       echo "[entrypoint] $name healthy on :$port (after ${SECONDS}s)"
+       return 0
+     fi
+     sleep 5
+   done
+   echo "[entrypoint] FATAL: $name failed to become healthy in ${budget}s" >&2
+   tail -n 80 "$LOG_DIR/${name}.log" >&2 || true
+   return 1
+ }
+
+ echo "[entrypoint] step 1/3 — booting vLLM(qwen) = $QWEN_MODEL on :8001 (gpu=${QWEN_GPU_FRAC})"
+ # vllm/vllm-openai image only ships `python3` — no `python` symlink.
+ python3 -m vllm.entrypoints.openai.api_server \
+   --model "$QWEN_MODEL" \
+   --served-model-name "$QWEN_MODEL" \
+   --host 0.0.0.0 --port 8001 \
+   --gpu-memory-utilization "$QWEN_GPU_FRAC" \
+   --max-model-len "$MAX_LEN" \
+   --dtype auto \
+   --disable-log-requests \
+   > "$LOG_DIR/qwen.log" 2>&1 &
+ QWEN_PID=$!
+ PIDS+=("$QWEN_PID")
+ wait_healthy qwen 8001 "$QWEN_PID" 300
+
+ echo "[entrypoint] step 2/3 — booting vLLM(physix) = $PHYSIX_MODEL on :8002 (gpu=${PHYSIX_GPU_FRAC})"
+ python3 -m vllm.entrypoints.openai.api_server \
+   --model "$PHYSIX_MODEL" \
+   --served-model-name "$PHYSIX_MODEL" \
+   --host 0.0.0.0 --port 8002 \
+   --gpu-memory-utilization "$PHYSIX_GPU_FRAC" \
+   --max-model-len "$MAX_LEN" \
+   --dtype auto \
+   --disable-log-requests \
+   > "$LOG_DIR/physix.log" 2>&1 &
+ PHYSIX_PID=$!
+ PIDS+=("$PHYSIX_PID")
+ wait_healthy physix 8002 "$PHYSIX_PID" 300
+
+ echo "[entrypoint] step 3/3 — both vLLMs healthy; starting uvicorn on :${PORT}"
+ # `exec` so uvicorn becomes PID 1's foreground job and HF Spaces sees
+ # our process as healthy. The trap above forwards termination back to
+ # the vLLM children when the Space is paused.
+ exec python3 -m uvicorn _space_app:app \
+   --host 0.0.0.0 --port "${PORT:-7860}" \
+   --log-level info
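
The VRAM split in the header comment can be sanity-checked with quick arithmetic; the figures below are the ones quoted in the comment (24 GB L4, 0.40 per process, ~6.2 GB fp16 weights), not fresh measurements:

# Back-of-envelope check of the 0.40 + 0.40 split (numbers from the comment).
TOTAL_VRAM_GB = 24.0   # NVIDIA L4
FRAC_PER_PROC = 0.40   # --gpu-memory-utilization per vLLM
WEIGHTS_GB = 6.2       # Qwen2.5-3B fp16 weights, per the comment

budget = TOTAL_VRAM_GB * FRAC_PER_PROC              # 9.6 GB per vLLM process
kv_and_activations = budget - WEIGHTS_GB            # ~3.4 GB for KV cache etc.
headroom = TOTAL_VRAM_GB * (1 - 2 * FRAC_PER_PROC)  # 4.8 GB for CUDA ctx, uvicorn, heap

print(f"per-process budget:     {budget:.1f} GB")
print(f"KV cache + activations: {kv_and_activations:.1f} GB")
print(f"unreserved headroom:    {headroom:.1f} GB")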