Re-create physix-infer: sequential vLLM boot, gpu_mem 0.40 each, python3 fix
Files changed:
- .dockerignore +32 -0
- .gitattributes +5 -35
- Dockerfile +89 -0
- README.md +87 -5
- entrypoint.sh +123 -0
- proxy.py +260 -0
.dockerignore ADDED
@@ -0,0 +1,32 @@
+# Patterns excluded from the Docker build context.
+#
+# Keeps anything heavy/host-specific out of BuildKit. The image only
+# needs proxy.py + entrypoint.sh + the README; everything else is noise
+# or actively harmful (e.g. a host venv landing under /app would shadow
+# the image's own python install).
+
+# Python venv / caches.
+.venv
+**/__pycache__
+**/*.pyc
+**/*.pyo
+.pytest_cache
+.ruff_cache
+.mypy_cache
+
+# Build / packaging artefacts.
+*.egg-info
+build
+dist
+
+# Editor / OS detritus.
+.DS_Store
+*.swp
+.vscode
+.idea
+.git
+.github
+
+# scripts/ holds host-side deploy helpers (configure_space.py, deploy.py).
+# They run from your laptop, never inside the image.
+scripts
.gitattributes CHANGED
@@ -1,35 +1,5 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+# Make sure shell scripts and Dockerfile aren't treated as binary by HF's
+# diff viewer (default heuristic occasionally trips on `set -e` lines).
+*.sh text eol=lf
+Dockerfile text eol=lf
+*.py text eol=lf
Dockerfile ADDED
@@ -0,0 +1,89 @@
+# PhysiX-Infer — dual-model OpenAI-compatible inference Space.
+#
+# Hosts BOTH:
+#   * Qwen/Qwen2.5-3B-Instruct  (untrained baseline)
+#   * Pratyush-01/physix-3b-rl  (GRPO-trained variant)
+#
+# Why this Space exists:
+#   The HF Inference Router does not serve Qwen/Qwen2.5-3B-Instruct (no
+#   provider has it loaded), and won't serve a private fine-tune unless
+#   the owner pays for an Inference Endpoint. Both checkpoints we want
+#   to compare are 3B Qwen2 fp16 models, and on a single 24 GB L4 we can
+#   keep two vLLM processes resident at ~40% gpu_memory each and never
+#   pay router/endpoint fees.
+#
+# Architecture (one container, three processes):
+#   :8001  vllm serve Qwen/Qwen2.5-3B-Instruct  --gpu-memory-util 0.40
+#   :8002  vllm serve Pratyush-01/physix-3b-rl  --gpu-memory-util 0.40
+#   :7860  uvicorn proxy:app  (routes by JSON `model` field)
+#
+# Boot order matters: vLLMs come up SEQUENTIALLY, not in parallel. Both
+#   read `nvidia-smi` free-memory at startup; if they race, the second
+#   crashes with "No available memory for the cache blocks." See
+#   entrypoint.sh for the full reasoning.
+#
+# Why the official vllm/vllm-openai image:
+#   vLLM ships pre-compiled CUDA kernels that target the CUDA toolkit
+#   and PyTorch versions it was built against. Building from a generic
+#   nvidia/cuda image means recompiling vLLM's C++ kernels (~20 min,
+#   often fragile across CUDA minor versions). Starting from
+#   vllm/vllm-openai:<tag> guarantees torch / cu / nccl / vllm are all
+#   ABI-compatible. We just layer fastapi + httpx for the proxy on top.
+#
+# Cold start on a fresh HF Spaces L4 (no persistent /data):
+#   * Image pull:       ~30 s
+#   * vLLM startup:     ~30 s after weights are local
+#   * Weight download:  ~45 s for both models from Hub CDN
+#   ── total ~90-120 s before /health flips green ──
+
+FROM vllm/vllm-openai:v0.7.3
+
+# vllm/vllm-openai sets ENTRYPOINT to `python -m vllm.entrypoints.openai.api_server`.
+# We need to override that to launch our own multi-process entrypoint, so reset.
+ENTRYPOINT []
+
+ENV PYTHONUNBUFFERED=1 \
+    PIP_NO_CACHE_DIR=1 \
+    PIP_DISABLE_PIP_VERSION_CHECK=1 \
+    HOME=/tmp/home \
+    HF_HOME=/tmp/hf_cache \
+    XDG_CACHE_HOME=/tmp/xdg-cache \
+    # vLLM's torch.compile cache must land somewhere writable. The image's
+    # default ($HOME/.cache/vllm) breaks on HF Spaces because the runtime
+    # user has no writable home.
+    VLLM_CACHE_ROOT=/tmp/vllm_cache \
+    TORCH_HOME=/tmp/torch_cache \
+    TRITON_CACHE_DIR=/tmp/triton_cache \
+    PORT=7860
+
+# fastapi/uvicorn/httpx for the routing proxy. The image already has them
+# transitively (vllm depends on fastapi), but pin minimums to be safe.
+# `pip install --no-deps` would be tighter but trades safety for ~5 MB.
+RUN pip install \
+    "fastapi>=0.110" \
+    "uvicorn[standard]>=0.29" \
+    "httpx>=0.27"
+
+WORKDIR /app
+
+COPY proxy.py entrypoint.sh ./
+RUN chmod +x /app/entrypoint.sh
+
+# HF Spaces runs containers as a non-root UID with no /etc/passwd entry,
+# so any cache path under $HOME must exist and be world-writable BEFORE
+# the runtime user shows up. Pre-creating /tmp subdirs (which Spaces
+# always lets us write to) is the standard workaround.
+RUN mkdir -p \
+    "$HOME" "$HF_HOME" "$XDG_CACHE_HOME" \
+    "$VLLM_CACHE_ROOT" "$TORCH_HOME" "$TRITON_CACHE_DIR" \
+    /tmp/logs \
+    && chmod -R 0777 /tmp
+
+EXPOSE 7860
+
+# /health is served by proxy.py and turns 200 only when BOTH vLLMs are up.
+# Generous start-period covers the ~120 s cold boot.
+HEALTHCHECK --interval=30s --timeout=10s --start-period=180s --retries=3 \
+    CMD curl -fsS "http://127.0.0.1:${PORT}/health" || exit 1
+
+CMD ["/app/entrypoint.sh"]
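The HEALTHCHECK above only runs inside the container; when smoke-testing the image locally it can be handy to poll the same `/health` route from the host until both vLLMs report ready. A minimal sketch (not part of this commit) using httpx, assuming the proxy is published on localhost:7860 as in the README's `docker run` example:

```python
# wait_ready.py — poll the proxy's /health until both vLLMs report healthy.
# Illustrative helper only; the URL and time budget are assumptions for local use.
import time

import httpx


def wait_ready(base_url: str = "http://localhost:7860", budget_s: int = 180) -> bool:
    deadline = time.monotonic() + budget_s
    while time.monotonic() < deadline:
        try:
            r = httpx.get(f"{base_url}/health", timeout=5.0)
            if r.status_code == 200:
                print(r.json())  # {"status": "ok", "upstreams": {...}}
                return True
        except httpx.HTTPError:
            pass  # proxy not listening yet, keep waiting
        time.sleep(5)
    return False


if __name__ == "__main__":
    raise SystemExit(0 if wait_ready() else 1)
```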
README.md CHANGED
@@ -1,10 +1,92 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: PhysiX-Infer
+emoji: ⚡
+colorFrom: yellow
+colorTo: red
 sdk: docker
+app_port: 7860
 pinned: false
+license: apache-2.0
+short_description: Dual-model inference (Qwen 2.5 3B + physix-3b-rl)
+suggested_hardware: l4x1
+tags:
+  - inference
+  - vllm
+  - qwen2
+  - physix
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+<!--
+Note: `hardware:` and `sleep_time:` are NOT read from this frontmatter.
+Only `suggested_hardware:` is, and even that is informational (it shows up
+on the Space card but does not auto-upgrade). After the first push, run
+`scripts/configure_space.py` once to:
+  1. Upgrade the Space to L4 (l4x1)
+  2. Set sleep_time to 300 seconds
+See that script's docstring for details.
+-->
+
+
+# PhysiX-Infer — dual-model inference Space
+
+OpenAI-compatible inference for the two 3B Qwen2 checkpoints used by the [PhysiX-Live](https://huggingface.co/spaces/Pratyush-01/physix-live) demo:
+
+| Model id (use as `model` field) | Role |
+| --- | --- |
+| `Qwen/Qwen2.5-3B-Instruct` | Untrained baseline |
+| `Pratyush-01/physix-3b-rl` | GRPO-trained variant |
+
+## Why this Space exists
+
+The HF Inference Router does not currently serve `Qwen/Qwen2.5-3B-Instruct` (no provider has it loaded), and won't serve the fine-tune unless its owner runs a paid Inference Endpoint. Both checkpoints are small enough to share a single L4 (24 GB) — ~6.2 GB each in fp16, plus KV cache — so we just run two `vllm serve` processes side by side and dispatch on the `model` field.
+
+## Architecture
+
+```
+┌────────────────── Space (L4, 24 GB) ──────────────────┐
+│                                                        │
+│  :8001  vllm serve Qwen/Qwen2.5-3B-Instruct            │
+│  :8002  vllm serve Pratyush-01/physix-3b-rl            │
+│                                                        │
+│  :7860  proxy.py (FastAPI)                             │
+│         routes by JSON `model` field                   │
+└────────────────────────────────────────────────────────┘
+```
+
+Each vLLM gets `--gpu-memory-utilization 0.40` and `--max-model-len 4096`, and they're booted **sequentially** (Qwen first, then PhysiX) so the second process correctly observes the post-first-process free VRAM — booting in parallel caused a "No available memory for the cache blocks" crash on the first deploy attempt. The proxy is `~150` lines of FastAPI + httpx; streaming bytes are forwarded verbatim so SSE framing survives.
+
+## Sleep behavior
+
+The Space's sleep timeout is set to 300 seconds via `scripts/configure_space.py` (it is not read from the frontmatter): the Space pauses after **5 minutes** idle and stops billing immediately. First request after a sleep cold-boots both vLLMs, which takes **~90-120 s** on a warm Hub cache. The proxy's `/health` returns `503` while either upstream is still booting; the demo's frontend uses that to render a "warming up" badge.
+
+## Endpoints
+
+| Method | Path | Notes |
+| --- | --- | --- |
+| `POST` | `/v1/chat/completions` | OpenAI spec; `model` field selects upstream |
+| `POST` | `/v1/completions` | same routing, kept for older clients |
+| `GET` | `/v1/models` | lists both ids |
+| `GET` | `/health` | 200 iff both vLLMs healthy |
+| `GET` | `/` | plain HTML landing page |
+
+## Auth
+
+None. The Space is open access; cost exposure is bounded by the 5-minute sleep window: anyone can hit it, but it never keeps billing past one idle cycle.
+
+## Local smoke test
+
+You need a CUDA GPU with enough free VRAM for both models at the 0.40 split (the Space itself runs on a 24 GB L4).
+
+```bash
+docker build -t physix-infer .
+docker run --rm --gpus all -p 7860:7860 physix-infer
+# wait ~90s, then:
+curl -sS http://localhost:7860/health
+curl -sS -X POST http://localhost:7860/v1/chat/completions \
+  -H 'content-type: application/json' \
+  -d '{"model":"Qwen/Qwen2.5-3B-Instruct","messages":[{"role":"user","content":"hi"}]}'
+```
+
+## Wiring into the demo
+
+In the [physix-live](https://github.com/openenv-hackathon/physix-live) frontend, this Space is exposed as the **PhysiX-Infer (GPU)** preset. Select it from the endpoint dropdown and choose either model id from the suggestions. No API key required.
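The `scripts/configure_space.py` helper mentioned in the README's hidden comment is deliberately excluded from the image (see .dockerignore) and is not part of this commit. A plausible minimal sketch of what it does, using huggingface_hub; the hardware tier and sleep time come from the README, while the Space repo id is an assumption:

```python
# configure_space.py — one-shot host-side helper, run once after the first push.
# Sketch only: the real script is not shown in this commit.
from huggingface_hub import HfApi

SPACE_ID = "Pratyush-01/physix-infer"  # assumed repo id for this Space

api = HfApi()

# 1. Upgrade the Space to an L4; `suggested_hardware:` in the README is only a hint
#    shown on the Space card and never auto-upgrades the runtime.
api.request_space_hardware(repo_id=SPACE_ID, hardware="l4x1")

# 2. Pause after 5 minutes idle so the L4 is not billed around the clock.
api.set_space_sleep_time(repo_id=SPACE_ID, sleep_time=300)
```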
entrypoint.sh ADDED
@@ -0,0 +1,123 @@
+#!/usr/bin/env bash
+# Boot two vLLM processes + the FastAPI proxy, all in one container.
+#
+# Lifecycle:
+#   1. Launch vLLM(qwen)   on :8001, wait until /health returns 200.
+#   2. THEN launch vLLM(physix) on :8002, wait until /health returns 200.
+#      (Sequential, not parallel — see below.)
+#   3. Exec uvicorn proxy in the foreground (PID 1) — when HF Spaces sends
+#      SIGTERM at sleep time, uvicorn exits cleanly and the vLLM children
+#      go down with the container (the trap below covers the boot phase).
+#
+# Why sequential, not parallel:
+#   The first deploy attempt booted both vLLMs in parallel and the second
+#   one died with `ValueError: No available memory for the cache blocks.
+#   Try increasing gpu_memory_utilization`. Cause: vLLM reads the GPU's
+#   *currently free* memory at startup and then reserves
+#   `--gpu-memory-utilization * (free at this moment)`. When two processes
+#   start simultaneously, both read "all 24 GB free" and both try to grab
+#   ~10 GB; whichever one finalises last loses, because by then there's
+#   only ~10-12 GB actually free.
+#
+#   Sequential boot makes the second vLLM observe the post-first-process
+#   free memory, so its allocation gets sized correctly.
+#
+# Why --gpu-memory-utilization 0.40 each (= 80% total reserved):
+#   On L4 (24 GB), 0.40 * 24 ≈ 9.6 GB per process. Qwen2.5-3B fp16 weights
+#   are ~6.2 GB; that leaves ~3.4 GB for KV cache + activations, which
+#   sustains max_model_len=4096 with comfortable margin. The 20% reserve
+#   covers the CUDA workspace + Python/uvicorn heap + the second vLLM's
+#   own ~600 MB CUDA context overhead. We deliberately do NOT push to
+#   0.45 each — the previous deploy proved the residual headroom isn't
+#   there once both contexts coexist.
+
+set -euo pipefail
+
+QWEN_MODEL="${QWEN_MODEL:-Qwen/Qwen2.5-3B-Instruct}"
+PHYSIX_MODEL="${PHYSIX_MODEL:-Pratyush-01/physix-3b-rl}"
+
+QWEN_GPU_FRAC="${QWEN_GPU_FRAC:-0.40}"
+PHYSIX_GPU_FRAC="${PHYSIX_GPU_FRAC:-0.40}"
+
+# 4096 is enough for the PhysiX prompt (~1500 tok) + completion (~512 tok)
+# with comfortable headroom, and tightening it materially shrinks the KV
+# cache footprint vs vLLM's default of model.max_position_embeddings
+# (32k for Qwen2.5).
+MAX_LEN="${MAX_LEN:-4096}"
+
+LOG_DIR=/tmp/logs
+mkdir -p "$LOG_DIR"
+
+# Track child PIDs so the signal trap can terminate them all on
+# SIGTERM/SIGINT. HF Spaces sends SIGTERM when pausing the Space.
+PIDS=()
+cleanup() {
+  echo "[entrypoint] SIGTERM/SIGINT — killing children: ${PIDS[*]:-}" >&2
+  for pid in "${PIDS[@]:-}"; do
+    kill -TERM "$pid" 2>/dev/null || true
+  done
+  wait || true
+  exit 0
+}
+trap cleanup TERM INT
+
+wait_healthy() {
+  local name="$1" port="$2" pid="$3" budget="${4:-480}"
+  local deadline=$((SECONDS + budget))
+  while (( SECONDS < deadline )); do
+    # If the child died, surface its log and bail out — silently
+    # waiting forever for a corpse is the worst failure mode.
+    if ! kill -0 "$pid" 2>/dev/null; then
+      echo "[entrypoint] FATAL: $name (pid $pid) died during boot. Tail of log:" >&2
+      tail -n 80 "$LOG_DIR/vllm-${name}.log" >&2 || true
+      return 1
+    fi
+    if curl -fsS "http://127.0.0.1:${port}/health" >/dev/null 2>&1; then
+      echo "[entrypoint] $name healthy on :$port (after ${SECONDS}s)"
+      return 0
+    fi
+    sleep 5
+  done
+  echo "[entrypoint] FATAL: $name failed to become healthy in ${budget}s" >&2
+  tail -n 80 "$LOG_DIR/vllm-${name}.log" >&2 || true
+  return 1
+}
+
+echo "[entrypoint] step 1/3 — booting vLLM(qwen) = $QWEN_MODEL on :8001 (gpu=${QWEN_GPU_FRAC})"
+# vllm/vllm-openai base image ships only `python3` (no `python` symlink),
+# so use python3 explicitly. Using `python -m vllm...` here cost us a
+# full failed deploy on first try.
+python3 -m vllm.entrypoints.openai.api_server \
+  --model "$QWEN_MODEL" \
+  --served-model-name "$QWEN_MODEL" \
+  --host 0.0.0.0 --port 8001 \
+  --gpu-memory-utilization "$QWEN_GPU_FRAC" \
+  --max-model-len "$MAX_LEN" \
+  --dtype auto \
+  --disable-log-requests \
+  > "$LOG_DIR/vllm-qwen.log" 2>&1 &
+QWEN_PID=$!
+PIDS+=("$QWEN_PID")
+wait_healthy qwen 8001 "$QWEN_PID"
+
+echo "[entrypoint] step 2/3 — booting vLLM(physix) = $PHYSIX_MODEL on :8002 (gpu=${PHYSIX_GPU_FRAC})"
+python3 -m vllm.entrypoints.openai.api_server \
+  --model "$PHYSIX_MODEL" \
+  --served-model-name "$PHYSIX_MODEL" \
+  --host 0.0.0.0 --port 8002 \
+  --gpu-memory-utilization "$PHYSIX_GPU_FRAC" \
+  --max-model-len "$MAX_LEN" \
+  --dtype auto \
+  --disable-log-requests \
+  > "$LOG_DIR/vllm-physix.log" 2>&1 &
+PHYSIX_PID=$!
+PIDS+=("$PHYSIX_PID")
+wait_healthy physix 8002 "$PHYSIX_PID"
+
+echo "[entrypoint] step 3/3 — both vLLMs healthy; starting proxy on :${PORT}"
+# `exec` so uvicorn replaces this shell as PID 1 and HF Spaces sees our
+# process as healthy. The trap above covers termination during boot; after
+# the exec, container teardown takes the vLLM children down with the Space.
+exec python3 -m uvicorn proxy:app \
+  --host 0.0.0.0 --port "${PORT}" \
+  --log-level info
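The 0.40 + 0.40 split in the comments above boils down to simple arithmetic. A throwaway sanity check with the numbers copied from those comments (purely illustrative, not a vLLM measurement):

```python
# vram_budget.py — back-of-envelope check of the 0.40 + 0.40 split on a 24 GB L4.
# All figures come from the entrypoint comments above.
TOTAL_GB = 24.0    # L4 VRAM
FRAC = 0.40        # --gpu-memory-utilization per vLLM process
WEIGHTS_GB = 6.2   # Qwen2.5-3B fp16 weights (approx.)

per_process = FRAC * TOTAL_GB          # ~9.6 GB reserved by each server
kv_cache = per_process - WEIGHTS_GB    # ~3.4 GB left for KV cache + activations
headroom = TOTAL_GB - 2 * per_process  # ~4.8 GB for CUDA contexts, heap, etc.

print(f"per-process reservation: {per_process:.1f} GB")
print(f"KV cache per model:      {kv_cache:.1f} GB")
print(f"unreserved headroom:     {headroom:.1f} GB")
```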
proxy.py ADDED
@@ -0,0 +1,260 @@
+"""OpenAI-compatible proxy that fans out to two local vLLM servers.
+
+Why a custom proxy and not nginx:
+  * nginx routing on a JSON body field requires lua-nginx or njs (build
+    pain on a CUDA base image), and we need to PEEK at the body without
+    consuming it for streaming requests.
+  * httpx async streaming + FastAPI is ~80 LoC, debuggable in plain Python,
+    and reuses the same connection pool across requests.
+
+Endpoints exposed on :7860 (matches OpenAI spec):
+  * GET  /v1/models            — lists both registered model ids
+  * GET  /v1/models/{model_id} — single model lookup
+  * POST /v1/chat/completions  — main route. Reads `model` from body
+                                 and forwards to whichever vLLM owns it.
+  * POST /v1/completions       — same routing, kept for old clients.
+  * GET  /health               — 200 iff both upstreams are healthy.
+                                 HF's container monitor uses the Docker
+                                 HEALTHCHECK from the Dockerfile, but
+                                 we expose this for the demo's frontend
+                                 so it can show a "warming up..." badge
+                                 during cold starts.
+  * GET  /                     — friendly landing page so the bare
+                                 Space URL doesn't 404.
+
+Streaming is forwarded byte-for-byte (StreamingResponse over the upstream's
+chunks) so SSE `data: {...}\n\n` framing survives intact.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+from contextlib import asynccontextmanager
+from typing import AsyncIterator
+
+import httpx
+from fastapi import FastAPI, HTTPException, Request
+from fastapi.responses import HTMLResponse, JSONResponse, StreamingResponse
+
+logger = logging.getLogger("physix-infer-proxy")
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s")
+
+QWEN_MODEL = os.environ.get("QWEN_MODEL", "Qwen/Qwen2.5-3B-Instruct")
+PHYSIX_MODEL = os.environ.get("PHYSIX_MODEL", "Pratyush-01/physix-3b-rl")
+
+QWEN_UPSTREAM = "http://127.0.0.1:8001"
+PHYSIX_UPSTREAM = "http://127.0.0.1:8002"
+
+ROUTING: dict[str, str] = {
+    QWEN_MODEL: QWEN_UPSTREAM,
+    PHYSIX_MODEL: PHYSIX_UPSTREAM,
+}
+
+# Generous timeout — first request after a cold start can sit on the
+# upstream for ~30 s while CUDA graphs warm up. Streaming tokens come
+# back fast once that's done.
+TIMEOUT = httpx.Timeout(connect=10.0, read=600.0, write=60.0, pool=5.0)
+
+
+@asynccontextmanager
+async def lifespan(_app: FastAPI):
+    """Open one shared httpx client for the proxy's lifetime.
+
+    Keep-alive across requests matters: every chat completion otherwise
+    pays a TCP+HTTP/1.1 handshake (~1-2 ms localhost, but it adds up
+    under autoplay loops that fire 8 turns/episode).
+    """
+    async with httpx.AsyncClient(timeout=TIMEOUT) as client:
+        _app.state.http = client
+        yield
+
+
+app = FastAPI(
+    title="PhysiX-Infer",
+    description="Dual-model OpenAI-compatible inference (Qwen 2.5 3B + physix-3b-rl).",
+    lifespan=lifespan,
+)
+
+
+def _resolve_upstream(model: str | None) -> str:
+    if not model:
+        raise HTTPException(
+            status_code=400,
+            detail="Missing 'model' field. Pass either "
+            f"'{QWEN_MODEL}' or '{PHYSIX_MODEL}'.",
+        )
+    upstream = ROUTING.get(model)
+    if upstream is None:
+        raise HTTPException(
+            status_code=400,
+            detail=(
+                f"Model '{model}' is not served by this Space. "
+                f"Available: {list(ROUTING.keys())}."
+            ),
+        )
+    return upstream
+
+
+async def _proxy_json(request: Request, path: str) -> JSONResponse | StreamingResponse:
+    """Read body, route on `model`, forward, stream back if `stream=true`."""
+
+    raw = await request.body()
+    try:
+        payload = json.loads(raw) if raw else {}
+    except json.JSONDecodeError as exc:
+        raise HTTPException(status_code=400, detail=f"Invalid JSON: {exc}") from exc
+
+    upstream = _resolve_upstream(payload.get("model"))
+    is_stream = bool(payload.get("stream"))
+
+    # Strip any hop-by-hop headers; pass auth/content-type through.
+    fwd_headers = {
+        k: v
+        for k, v in request.headers.items()
+        if k.lower() in {"content-type", "accept", "authorization", "x-request-id"}
+    }
+    fwd_headers.setdefault("content-type", "application/json")
+
+    client: httpx.AsyncClient = request.app.state.http
+    upstream_url = f"{upstream}{path}"
+
+    if not is_stream:
+        try:
+            resp = await client.post(upstream_url, content=raw, headers=fwd_headers)
+        except httpx.HTTPError as exc:
+            logger.exception("upstream %s failed", upstream_url)
+            raise HTTPException(status_code=502, detail=f"Upstream error: {exc}") from exc
+        # Non-streaming branch: vLLM answers with application/json here; wrap
+        # anything else defensively so the client still receives valid JSON.
+        return JSONResponse(
+            status_code=resp.status_code,
+            content=resp.json() if resp.headers.get("content-type", "").startswith("application/json")
+            else {"raw": resp.text},
+        )
+
+    # Streaming path: open the upstream as a streaming request and pump
+    # chunks straight to the client. Note the `async with` lives INSIDE
+    # the generator so it stays open until StreamingResponse is done.
+    async def _gen() -> AsyncIterator[bytes]:
+        try:
+            async with client.stream(
+                "POST", upstream_url, content=raw, headers=fwd_headers
+            ) as upstream_resp:
+                if upstream_resp.status_code >= 400:
+                    body = await upstream_resp.aread()
+                    yield body
+                    return
+                async for chunk in upstream_resp.aiter_raw():
+                    if chunk:
+                        yield chunk
+        except httpx.HTTPError as exc:
+            logger.exception("upstream stream %s failed", upstream_url)
+            err = json.dumps({"error": {"message": str(exc), "type": "upstream_error"}})
+            yield f"data: {err}\n\n".encode()
+
+    return StreamingResponse(_gen(), media_type="text/event-stream")
+
+
+@app.post("/v1/chat/completions")
+async def chat_completions(request: Request):
+    return await _proxy_json(request, "/v1/chat/completions")
+
+
+@app.post("/v1/completions")
+async def completions(request: Request):
+    return await _proxy_json(request, "/v1/completions")
+
+
+@app.get("/v1/models")
+async def list_models():
+    """Static listing — vLLM exposes the same shape per-upstream, but we
+    union them here so a single GET covers both. `created` and `owned_by`
+    are filled with sensible placeholders since neither field is load-bearing
+    for any client we know of."""
+    return {
+        "object": "list",
+        "data": [
+            {
+                "id": QWEN_MODEL,
+                "object": "model",
+                "created": 0,
+                "owned_by": "Qwen",
+            },
+            {
+                "id": PHYSIX_MODEL,
+                "object": "model",
+                "created": 0,
+                "owned_by": "Pratyush-01",
+            },
+        ],
+    }
+
+
+@app.get("/v1/models/{model_id:path}")
+async def get_model(model_id: str):
+    if model_id not in ROUTING:
+        raise HTTPException(status_code=404, detail=f"Model '{model_id}' not found.")
+    owner = "Qwen" if model_id == QWEN_MODEL else "Pratyush-01"
+    return {"id": model_id, "object": "model", "created": 0, "owned_by": owner}
+
+
+@app.get("/health")
+async def health(request: Request):
+    """Both upstreams must answer /health — the demo frontend uses this
+    to decide whether to show a 'warming up' notice on cold start."""
+    client: httpx.AsyncClient = request.app.state.http
+    statuses = {}
+    overall_ok = True
+    for name, base in (("qwen", QWEN_UPSTREAM), ("physix", PHYSIX_UPSTREAM)):
+        try:
+            r = await client.get(f"{base}/health", timeout=5.0)
+            statuses[name] = "ok" if r.status_code == 200 else f"status={r.status_code}"
+            overall_ok = overall_ok and r.status_code == 200
+        except httpx.HTTPError as exc:
+            statuses[name] = f"unreachable: {exc.__class__.__name__}"
+            overall_ok = False
+    return JSONResponse(
+        status_code=200 if overall_ok else 503,
+        content={"status": "ok" if overall_ok else "starting", "upstreams": statuses},
+    )
+
+
+@app.get("/", response_class=HTMLResponse)
+async def root():
+    """Landing page so the bare Space URL doesn't 404. Plain HTML — no
+    framework, no static dir to manage."""
+    return f"""<!doctype html>
+<html><head><meta charset="utf-8"><title>PhysiX-Infer</title>
+<style>
+body{{font-family:system-ui,sans-serif;max-width:680px;margin:3em auto;padding:0 1em;color:#222}}
+code,pre{{background:#f4f4f4;padding:.2em .4em;border-radius:4px;font-size:.95em}}
+pre{{padding:1em;overflow-x:auto}}
+h1{{margin-bottom:.2em}}
+.muted{{color:#777}}
+</style>
+</head><body>
+<h1>PhysiX-Infer</h1>
+<p class="muted">OpenAI-compatible inference proxy for two 3B Qwen2 checkpoints.</p>
+
+<h3>Models served</h3>
+<ul>
+<li><code>{QWEN_MODEL}</code> — untrained baseline</li>
+<li><code>{PHYSIX_MODEL}</code> — GRPO-trained variant</li>
+</ul>
+
+<h3>Endpoints</h3>
+<ul>
+<li><code>GET /v1/models</code></li>
+<li><code>POST /v1/chat/completions</code> (set <code>model</code> to one of the ids above)</li>
+<li><code>GET /health</code></li>
+</ul>
+
+<h3>Example</h3>
+<pre>curl -X POST https://&lt;this-space&gt;.hf.space/v1/chat/completions \\
+  -H 'content-type: application/json' \\
+  -d '{{"model":"{PHYSIX_MODEL}","messages":[{{"role":"user","content":"hi"}}]}}'</pre>
+
+<p class="muted">No auth, but the Space sleeps after a short idle window — first request after sleep takes ~90 s while both vLLMs warm up.</p>
+</body></html>"""
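Because proxy.py speaks the OpenAI wire format and routes purely on the `model` field, any OpenAI-compatible client can talk to it by pointing `base_url` at the Space. A minimal sketch with the official `openai` package; the Space URL is a placeholder and the api_key value is arbitrary, since the proxy does no auth:

```python
# client_demo.py — exercise both upstreams through the proxy's routing-by-model.
# Sketch only; replace BASE_URL with the real Space URL.
from openai import OpenAI

BASE_URL = "https://your-space.hf.space/v1"  # placeholder
client = OpenAI(base_url=BASE_URL, api_key="not-needed")  # proxy ignores auth

for model in ("Qwen/Qwen2.5-3B-Instruct", "Pratyush-01/physix-3b-rl"):
    resp = client.chat.completions.create(
        model=model,  # this field decides which vLLM answers
        messages=[{"role": "user", "content": "hi"}],
        max_tokens=64,
    )
    print(model, "->", resp.choices[0].message.content)
```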