Upload folder using huggingface_hub
Files changed:
- Dockerfile +32 -62
- frontend/src/lib/llmPresets.ts +25 -32
- physix/server/providers.py +9 -48
Dockerfile
CHANGED
@@ -1,32 +1,27 @@
-# PhysiX-Live Space — …
+# PhysiX-Live demo Space — CPU-only env + UI.
 #
-# …
+# What this Space hosts:
 #
-# : …
-# :8002 vllm serve Pratyush-01/physix-3b-rl (--gpu-memory-util 0.40)
-# :7860 uvicorn physix.server.app:app
+# :7860 uvicorn _space_app:app
 #   ├─ /reset, /step              (OpenEnv stateless API)
 #   ├─ /interactive/*             (browser session API)
 #   ├─ /web/                      (built React SPA)
 #   └─ /interactive/.../llm-step  (LLM-driven episode)
-#        └─ when base_url=local://router, dispatches by `model`
-#           to one of the localhost vLLMs above. The browser
-#           never sees those ports — that prevents anyone from
-#           bypassing the demo to run free GPU calls.
 #
-# …
-# * …
-# …
-# …
-# …
+# What this Space does NOT host:
+# * Inference. The demo is CPU-only — no torch, no vLLM, no GPU. When
+#   the UI calls `/interactive/.../llm-step` the server forwards to
+#   whatever OpenAI-compatible base URL the browser handed us
+#   (HF Router, OpenAI, Ollama, or our sister L4 Space at
+#   `Pratyush-01/physix-infer` for the trained 3B + Qwen baseline).
 #
-# Why …
-# …
-# …
-# …
-# …
-# …
-# …
+# Why a separate inference Space:
+#   Keeps this CPU image tiny (sub-second cold-start) so the demo URL
+#   never feels like it's stalled. The L4 Space pays GPU rates only
+#   while it's actually serving requests — its `sleep_time=300s` shuts
+#   it down between sessions. Two Spaces, two failure surfaces; if
+#   inference is broken the verifier-only demo (Custom URL → Ollama
+#   etc.) still works.
 
 ############################
 # Stage 1: build the SPA

@@ -43,59 +38,46 @@ COPY frontend/ ./
 ENV VITE_PHYSIX_API_URL=""
 # Cache-bust marker. Bump when an SPA change isn't taking on the Space —
 # HF BuildKit occasionally reuses stage-1 output even when sources changed.
-# physix-spa-rebuild: …
+# physix-spa-rebuild: 4
 RUN pnpm exec tsc -b \
  && pnpm exec vite build --base=/web/
 
 ############################
-# Stage 2: runtime (…
+# Stage 2: runtime (FastAPI + SPA)
 ############################
-FROM …
-
-# vllm/vllm-openai sets ENTRYPOINT to `python3 -m vllm.entrypoints.openai.api_server`.
-# We need our own multi-process supervisor, so reset.
-ENTRYPOINT []
+FROM python:3.11-slim AS runtime
 
 ENV PYTHONUNBUFFERED=1 \
     PIP_NO_CACHE_DIR=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
     HOME=/tmp/home \
-    USER=physix \
-    LOGNAME=physix \
     HF_HOME=/tmp/hf_cache \
     XDG_CACHE_HOME=/tmp/xdg-cache \
-    VLLM_CACHE_ROOT=/tmp/vllm_cache \
-    TORCH_HOME=/tmp/torch_cache \
-    TRITON_CACHE_DIR=/tmp/triton_cache \
     PORT=7860 \
     PHYSIX_HOST=0.0.0.0 \
     PHYSIX_CORS_ORIGINS=*
 
-# …
+# curl for healthchecks; the slim image has neither curl nor build tools
+# by default. Everything else (numpy, scipy, sympy) is a wheel install.
 RUN apt-get update \
  && apt-get install -y --no-install-recommends curl \
  && rm -rf /var/lib/apt/lists/*
 
 WORKDIR /app
 
-# …
-# …
-# plus openenv-core.
-# …
-# IMPORTANT: install with --no-build-isolation if you ever switch to a
-# package that needs torch at build time — you do NOT want pip to try
-# rebuilding torch in this image.
+# Pin the server-side runtime stack. NO torch / unsloth / trl here —
+# this Space never trains and never runs a model locally.
 RUN pip install \
     "openenv-core[core]>=0.2.2" \
     "numpy>=1.24" \
     "scipy>=1.10" \
     "sympy>=1.12" \
+    "fastapi>=0.110" \
+    "uvicorn[standard]>=0.29" \
+    "pydantic>=2.5" \
     "openai>=1.40" \
     "requests>=2.31"
 
-# Install physix as an editable package. --no-deps because we just
-# installed the runtime stack above; pyproject's deps would reinstall
-# pinned versions and likely conflict with vLLM's torch.
 COPY pyproject.toml ./
 COPY physix ./physix
 COPY README.md ./

@@ -104,33 +86,21 @@ RUN pip install --no-deps -e .
 # Built SPA from stage 1.
 COPY --from=frontend /build/dist /app/static
 
-# Space wrapper — mounts the React SPA at /web/, registers …
-# redirect (OpenEnv's create_fastapi_app doesn't …
-# pattern as the previous CPU-only build, just kept in a real file now
-# instead of a heredoc so syntax errors are caught at build time.
+# Space wrapper — mounts the React SPA at /web/, registers / -> /web/
+# redirect (OpenEnv's create_fastapi_app doesn't add one for us).
 COPY scripts/space_app.py /app/_space_app.py
 
-# Supervisor entrypoint that boots the two vLLMs sequentially (avoids
-# the CUDA memory race we hit on the first push) then execs uvicorn.
-COPY scripts/space_entrypoint.sh /app/entrypoint.sh
-RUN chmod +x /app/entrypoint.sh
-
 # Pre-create writable dirs. HF Spaces runs containers as a non-root UID
-# with no /etc/passwd entry, so …
+# with no /etc/passwd entry, so any cache path under $HOME must exist
 # and be world-writable BEFORE the runtime user shows up.
-RUN mkdir -p \
-    "$HOME" "$HF_HOME" "$XDG_CACHE_HOME" \
-    "$VLLM_CACHE_ROOT" "$TORCH_HOME" "$TRITON_CACHE_DIR" \
-    /tmp/logs \
+RUN mkdir -p "$HOME" "$HF_HOME" "$XDG_CACHE_HOME" \
  && chmod -R 0777 /tmp /app
 
 EXPOSE 7860
 
 # /health is OpenEnv's stock endpoint and turns 200 once uvicorn binds.
-
-# … is up to ~150 s on first boot.
-HEALTHCHECK --interval=30s --timeout=10s --start-period=240s --retries=3 \
+HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
   CMD curl -fsS "http://127.0.0.1:${PORT}/health" || exit 1
 
 ENV ENABLE_WEB_INTERFACE=true
-CMD ["…
+CMD ["python3", "-m", "uvicorn", "_space_app:app", "--host", "0.0.0.0", "--port", "7860"]
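Note: scripts/space_app.py itself is not part of this diff. As a minimal sketch of what the comments above describe (assuming the OpenEnv app object is the `physix.server.app:app` the old image ran under uvicorn; the real wrapper may differ):

# Hypothetical sketch of scripts/space_app.py; illustrative, not the shipped file.
from fastapi.responses import RedirectResponse
from fastapi.staticfiles import StaticFiles

from physix.server.app import app  # the app the old image served under uvicorn

# Serve the SPA built in stage 1 and copied to /app/static in stage 2.
app.mount("/web", StaticFiles(directory="/app/static", html=True), name="web")


@app.get("/", include_in_schema=False)
def root() -> RedirectResponse:
    # OpenEnv's create_fastapi_app doesn't register a root redirect, so add one.
    return RedirectResponse(url="/web/")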
frontend/src/lib/llmPresets.ts
CHANGED
@@ -19,17 +19,13 @@ export const OLLAMA_OPENAI_BASE_URL = "http://localhost:11434/v1";
 export const PHYSIX_MODEL_ID = "Pratyush-01/physix-3b-rl";
 export const QWEN_BASE_MODEL_ID = "Qwen/Qwen2.5-3B-Instruct";
 
-/** …
- * …
- * ( …
- * …
- * …
- * …
- * …
- * the trained fine-tune and the Qwen baseline is instant once both
- * are warm. First call after a cold-boot still costs ~90-120s while
- * vLLM loads weights. */
-export const LOCAL_VLLM_BASE_URL = "local://router";
+/** Sister GPU Space that hosts both the trained PhysiX-3B and the Qwen
+ * 2.5 3B baseline behind a single OpenAI-compatible URL. Open access
+ * (no token); routing on the `model` field happens inside the proxy.
+ * Sleeps after 5 min idle, so the first call after sleep is ~90-120 s
+ * while vLLM warms up — subsequent calls are fast. */
+export const PHYSIX_INFER_BASE_URL =
+  "https://pratyush-01-physix-infer.hf.space/v1";
 
 export type EndpointId = "ollama" | "hf" | "openai" | "custom" | "physix";
 
@@ -67,25 +63,23 @@ export interface Endpoint {
 export const ENDPOINTS: readonly Endpoint[] = [
   {
     id: "physix",
-    label: "PhysiX GPU …
-    // …
-    // …
-    // …
-    // See physix/server/providers.py::LOCAL_VLLM_PORTS.
-    baseUrl: LOCAL_VLLM_BASE_URL,
+    label: "PhysiX-Infer GPU ✦",
+    // Sister L4 Space hosting both checkpoints behind one URL; the
+    // proxy there picks the right vLLM based on the `model` field.
+    baseUrl: PHYSIX_INFER_BASE_URL,
     needsKey: false,
     modelInputMode: "freeform-with-suggestions",
-    // …
-    // …
-    // …
+    // First entry pre-fills, so the default comparison is "trained vs
+    // base" with identical hardware / generation params — only the
+    // weights differ.
     modelSuggestions: [
       { id: PHYSIX_MODEL_ID, tag: "trained ✦" },
       { id: QWEN_BASE_MODEL_ID, tag: "base (apples-to-apples)" },
     ],
     hint:
-      "Both 3B models …
-      " …
-      "while …
+      "Both 3B models on a sister L4 Space — no token, no key. The Space " +
+      "sleeps after 5 min idle, so the first call after sleep is ~90-120 s " +
+      "while vLLM loads weights; subsequent calls are fast.",
   },
   {
     id: "ollama",
@@ -186,23 +180,22 @@ export interface LlmConnection {
   apiKey: string;
 }
 
-/** Default A side: trained PhysiX-3B …
- * No token needed; first call after sleep is ~90-120 s …
- * load, then fast. */
+/** Default A side: trained PhysiX-3B on the sister GPU Space.
+ * No token needed; first call after sleep is ~90-120 s, then fast. */
 export const DEFAULT_CONNECTION_A: LlmConnection = {
   endpointId: "physix",
-  baseUrl: …
+  baseUrl: PHYSIX_INFER_BASE_URL,
   model: PHYSIX_MODEL_ID,
   apiKey: "",
 };
 
-/** Default B side: …
- * Apples-to-apples — identical architecture, identical …
- * identical generation params; only the weights differ.
- * share the same …
+/** Default B side: same sister Space, same L4 GPU, just the Qwen 2.5
+ * 3B baseline. Apples-to-apples — identical architecture, identical
+ * hardware, identical generation params; only the weights differ.
+ * Both models share the same Space, so warming side A also warms B. */
 export const DEFAULT_CONNECTION_B: LlmConnection = {
   endpointId: "physix",
-  baseUrl: …
+  baseUrl: PHYSIX_INFER_BASE_URL,
   model: QWEN_BASE_MODEL_ID,
   apiKey: "",
 };
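Note: for illustration only, this is roughly the request body the DEFAULT_CONNECTION_A preset produces when the SPA calls the server's llm-step endpoint. The session path segment is a placeholder (the real id comes from the /interactive session API); the field names mirror LlmStepRequest in physix/server/providers.py.

import requests

session_id = "demo"  # placeholder; real ids come from the /interactive API
payload = {
    "base_url": "https://pratyush-01-physix-infer.hf.space/v1",
    "model": "Pratyush-01/physix-3b-rl",
    "api_key": "",  # open access; the server fills in a placeholder key
}
resp = requests.post(
    f"http://localhost:7860/interactive/{session_id}/llm-step",
    json=payload,
    timeout=150,  # generous: first call after Space sleep is ~90-120 s
)
print(resp.status_code)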
physix/server/providers.py
CHANGED
@@ -30,46 +30,10 @@ _log = logging.getLogger(__name__)
 HF_ROUTER_BASE_URL = "https://router.huggingface.co/v1"
 OPENAI_BASE_URL = "https://api.openai.com/v1"
 OLLAMA_OPENAI_BASE_URL = "http://localhost:11434/v1"
-
-# …
-# …
-# …
-# Space URL from exposing the raw inference ports (so visitors can't
-# bypass the demo to run free GPU calls), and lets the user flip
-# between trained / baseline without changing the connection URL.
-#
-# Ports MUST match what the Space's entrypoint.sh launches — the two
-# are pinned to the same constants here for symmetry.
-_LOCAL_VLLM_SCHEME = "local://"
-LOCAL_VLLM_PORTS: dict[str, int] = {
-    # model id -> localhost port served by entrypoint.sh
-    "Qwen/Qwen2.5-3B-Instruct": 8001,
-    "Pratyush-01/physix-3b-rl": 8002,
-}
-
-
-def _maybe_rewrite_local_url(base_url: str, model: str) -> str:
-    """If the URL uses the magic ``local://`` scheme, swap in the real
-    localhost target keyed by ``model``. Returns the URL unchanged
-    otherwise.
-
-    Raises HTTPException(400) when the ``model`` isn't one of the in-
-    container vLLMs — without this a typo silently falls through to
-    OpenAI's SDK and surfaces as a generic 502.
-    """
-
-    if not base_url.startswith(_LOCAL_VLLM_SCHEME):
-        return base_url
-    port = LOCAL_VLLM_PORTS.get(model)
-    if port is None:
-        raise HTTPException(
-            status_code=400,
-            detail=(
-                f"Model {model!r} is not hosted by the in-container vLLMs. "
-                f"Available: {sorted(LOCAL_VLLM_PORTS)}."
-            ),
-        )
-    return f"http://127.0.0.1:{port}/v1"
+# Sister GPU Space hosting both the trained PhysiX-3B and the Qwen 2.5 3B
+# baseline. Open access (no key); sleeps after 5 min idle. See the
+# physix-infer/ directory in the repo for the Dockerfile + proxy code.
+PHYSIX_INFER_BASE_URL = "https://pratyush-01-physix-infer.hf.space/v1"
 
 
 class LlmStepRequest(BaseModel):

@@ -128,9 +92,10 @@ def resolve_api_key(request: LlmStepRequest) -> Optional[str]:
         return request.api_key
 
     base_url = (request.base_url or "").lower()
-    # …
-    # …
-    # …
+    # The PhysiX-Infer sister Space serves Qwen + the trained 3B with no
+    # auth — it's open-access by design (rate-limited only by sleep).
+    if "physix-infer" in base_url:
+        return "physix-infer"
     if "huggingface" in base_url:
         return os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_API_KEY")
     if "openai.com" in base_url:

@@ -185,12 +150,8 @@ def default_openai_compat_policy_factory(request: LlmStepRequest) -> LlmPolicy:
         ) from exc
 
     api_key = resolve_api_key(request)
-    # Resolve `local://*` -> the actual localhost vLLM URL. Keeps the
-    # OpenAI client unaware of the indirection; auth/headers just flow
-    # through normally.
-    resolved_base_url = _maybe_rewrite_local_url(request.base_url, request.model)
     client = OpenAI(
-        base_url=resolved_base_url,
+        base_url=request.base_url,
         api_key=api_key or "missing",
         timeout=request.request_timeout_s,
         # Identifies us to providers that rate-limit by UA. Cheap to …
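Note: a minimal sketch (not repo code) of calling the sister Space directly with the OpenAI SDK. The key value is arbitrary because the Space is open access, and swapping `model` between the two ids picks the trained checkpoint or the Qwen baseline.

from openai import OpenAI

client = OpenAI(
    base_url="https://pratyush-01-physix-infer.hf.space/v1",
    api_key="physix-infer",  # any non-empty string; no real auth required
)
resp = client.chat.completions.create(
    model="Pratyush-01/physix-3b-rl",  # or "Qwen/Qwen2.5-3B-Instruct"
    messages=[{"role": "user", "content": "State the net force on the block."}],
)
print(resp.choices[0].message.content)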