/** Status banner for the PhysiX-Infer GPU Space.
*
* Why this exists:
* The PhysiX-Infer Space sleeps after 5 min of idle to avoid burning
* GPU time. A cold-start takes 90-120 s while vLLM downloads / loads
* weights for both 3B models. Without warning, a user picks the
* PhysiX endpoint, hits Run, and stares at a spinner for 2 minutes
* convinced something is broken.
*
* This panel surfaces the underlying state so the wait is *expected*,
* not surprising — and offers a one-click "Prewarm" button so the
* user can kick the boot off before they pick a system / hit Run.
*
* Mechanics:
* - On mount, GET https://pratyush-01-physix-infer.hf.space/health.
* - HF Spaces' edge proxy returns one of three observable states:
* * 200 with body { upstreams: { qwen: "ok", physix: "ok" } }
* → both vLLMs are loaded and serving. Fast next call.
* * 200 with one upstream not "ok", or a non-2xx such as 503
* → container running but vLLM still warming. Some calls fast,
* some still slow. Treat as "warming".
* * no response within the 6 s probe timeout
* → Space is asleep. Whatever woke it (this very request) will
* now ride the cold-boot pipeline.
* - Re-poll every 15 s while the component is mounted so the badge
* stays accurate as the user thinks. Polling is cheap and the
* requests count as activity, which keeps the Space awake while
* they read — exactly the UX we want during a demo.
*
* Note on CORS: the physix-infer FastAPI uses default CORS. The
* /health endpoint returns plain JSON; modern browsers allow simple
* GETs across origins to read the status code, but reading the BODY
* needs Access-Control-Allow-Origin. If we can't read the body, we
* fall back to "container is up" (best-effort) on any successful
* response. A probe that times out renders as "asleep"; any other
* network failure renders as "error". */
import { useCallback, useEffect, useRef, useState } from "react";
import { cn } from "@/lib/cn";
import { PHYSIX_INFER_BASE_URL } from "@/lib/llmPresets";
// The health endpoint lives at the proxy root, not under the API
// prefix, so a trailing "/v1" (or "/v1/") is dropped from the base URL
// before "/health" is appended.
const HEALTH_URL = `${PHYSIX_INFER_BASE_URL.replace(/\/v1\/?$/, "")}/health`;

// Polling cadence. Slow enough not to hammer HF's edge, fast enough
// that a freshly warmed GPU shows up before the user finishes typing
// a prompt.
const POLL_INTERVAL_MS = 15 * 1_000;

// Deadline for one probe. HF keeps a request open for the whole boot
// (~120 s); we only want to *detect* that the Space is asleep, not sit
// through the boot, so a probe with no answer by this point is
// classified as "asleep" and the Prewarm button is offered.
const PROBE_TIMEOUT_MS = 6 * 1_000;
/** What the banner currently believes about the GPU Space,
 * discriminated on `kind`. */
type Status =
// Initial value, shown until the first probe has resolved.
| { kind: "unknown" }
// The health endpoint answered 2xx. `bothUpstreams` is true when the
// body listed at least one upstream and every one was "ok"; false
// when the body could not be read as JSON.
| { kind: "awake"; bothUpstreams: boolean }
// The endpoint answered, but with a non-2xx status or with an
// upstream that is not "ok". Also set optimistically by Prewarm.
| { kind: "warming" }
// The probe was aborted by its timeout before any response arrived.
| { kind: "asleep" }
// The request failed for a reason other than the timeout; `message`
// is taken from the thrown error.
| { kind: "error"; message: string };
/** Outcome of one health probe. */
interface ProbeResult {
/** Banner state derived from the response (or from the failure). */
status: Status;
/** True if the probe itself was successful enough to count as a
* wake-up signal — i.e. HF Spaces' edge proxy received it and
* routed it to the container. Set to true for every HTTP response
* and to false when the request timed out or failed outright. */
hitContainer: boolean;
}
// Module-level dedup. The Compare pane mounts TWO copies of this
// component (one per side), and without coalescing they'd each fire
// their own `/health` GET every 15 s — pointless duplicate load on
// the GPU Space's edge.
//
// `inFlight` holds the promise of the probe currently on the wire, or
// null when none is; concurrent callers share it.
let inFlight: Promise<ProbeResult> | null = null;
// `lastResult` remembers the most recent probe outcome and when it
// completed. Note that EVERY outcome is stored, not only healthy
// ones — an "asleep" or "error" result is replayed just the same.
let lastResult: { result: ProbeResult; at: number } | null = null;
// How long a stored outcome may be replayed to later callers before a
// fresh network request is issued.
const SHARED_RESULT_WINDOW_MS = 5_000;
/**
 * Shared entry point for health probes.
 *
 * Callers never trigger more than one network request at a time: a
 * call made while a probe is on the wire receives that probe's
 * promise, and a call made within SHARED_RESULT_WINDOW_MS of the last
 * completed probe receives its stored outcome. Otherwise a new probe
 * is started with a PROBE_TIMEOUT_MS deadline.
 */
async function probe(): Promise<ProbeResult> {
  // Someone else is already probing; ride along.
  if (inFlight !== null) {
    return inFlight;
  }

  // A recent outcome is still considered current; replay it.
  const cached = lastResult;
  if (cached !== null && Date.now() - cached.at < SHARED_RESULT_WINDOW_MS) {
    return cached.result;
  }

  // One probe, aborted if it overruns the deadline. The timer is
  // always cleared so a fast answer doesn't leave it pending.
  const probeWithDeadline = async (): Promise<ProbeResult> => {
    const aborter = new AbortController();
    const timer = window.setTimeout(() => aborter.abort(), PROBE_TIMEOUT_MS);
    try {
      return await runProbe(aborter.signal);
    } finally {
      window.clearTimeout(timer);
    }
  };

  const pending = probeWithDeadline();
  inFlight = pending;
  try {
    const outcome = await pending;
    lastResult = { result: outcome, at: Date.now() };
    return outcome;
  } finally {
    // Release the slot whether the probe resolved or rejected.
    inFlight = null;
  }
}
/**
 * Issue one GET against the health endpoint and classify the outcome.
 *
 * Never rejects: every failure is folded into the returned status.
 *
 * @param signal Abort signal; when it fires before a response arrives
 *   the result is "asleep".
 * @returns
 *   - non-2xx response          → "warming"
 *   - 2xx, all upstreams "ok"   → "awake" with `bothUpstreams: true`
 *   - 2xx, readable JSON but no upstream info or one not "ok" → "warming"
 *   - 2xx, body unreadable      → "awake" with `bothUpstreams: false`
 *   - aborted                   → "asleep"
 *   - any other failure         → "error" with the failure's message
 */
async function runProbe(signal: AbortSignal): Promise<ProbeResult> {
  try {
    const response = await fetch(HEALTH_URL, {
      method: "GET",
      mode: "cors",
      signal,
    });

    if (!response.ok) {
      // 503 from /health = at least one vLLM still booting. The
      // container answered, so this request did wake the Space.
      return { status: { kind: "warming" }, hitContainer: true };
    }

    // 2xx — try to read the body. Only a failure to READ it (CORS
    // stripped it, or it isn't JSON) falls back to "awake, detail
    // unknown"; a body that parses but is not the expected shape is
    // handled below instead of being mistaken for a read failure.
    let payload: unknown;
    try {
      payload = await response.json();
    } catch {
      return {
        status: { kind: "awake", bothUpstreams: false },
        hitContainer: true,
      };
    }

    // The payload is untrusted: narrow step by step rather than cast.
    const upstreams: unknown =
      typeof payload === "object" && payload !== null
        ? (payload as { upstreams?: unknown }).upstreams
        : undefined;
    const states: unknown[] =
      typeof upstreams === "object" && upstreams !== null
        ? Object.values(upstreams)
        : [];

    // "Fully awake" needs at least one upstream listed and all of them
    // "ok"; an empty or missing list is treated as still warming.
    if (states.length > 0 && states.every((state) => state === "ok")) {
      return {
        status: { kind: "awake", bothUpstreams: true },
        hitContainer: true,
      };
    }
    return { status: { kind: "warming" }, hitContainer: true };
  } catch (exc: unknown) {
    // Aborted → the probe timed out, most likely "asleep + slow
    // cold-boot", so render "asleep" (with a Prewarm button).
    const aborted =
      signal.aborted || (exc instanceof Error && exc.name === "AbortError");
    if (aborted) {
      return { status: { kind: "asleep" }, hitContainer: false };
    }
    // Anything else: DNS / offline / CORS preflight refused.
    const message = exc instanceof Error ? exc.message : String(exc);
    return { status: { kind: "error", message }, hitContainer: false };
  }
}
/**
 * Live status banner for the PhysiX-Infer GPU Space.
 *
 * Probes the health endpoint on mount and then every POLL_INTERVAL_MS
 * while mounted, and renders the latest state through StatusBanner
 * together with a Prewarm action.
 *
 * An "awake" banner tolerates exactly one unreachable probe ("asleep"
 * or "error") before it is downgraded, so a single network blip does
 * not flip the display but a Space that really went to sleep is
 * reported on the following poll.
 */
export function PhysixInferStatus(): JSX.Element {
  const [status, setStatus] = useState<Status>({ kind: "unknown" });
  const [prewarming, setPrewarming] = useState(false);

  // True while the last accepted probe said "awake" and its one-probe
  // grace has not been spent. It must be cleared when the grace is
  // used; otherwise every later asleep/error result would be dropped
  // and the banner could never leave "awake".
  const wasAwakeRef = useRef(false);

  const refresh = useCallback(async () => {
    const { status: next } = await probe();
    const unreachable = next.kind === "asleep" || next.kind === "error";

    if (unreachable && wasAwakeRef.current) {
      // First miss after a confirmed "awake": keep the current banner
      // and spend the grace. If the next probe agrees, it is applied.
      wasAwakeRef.current = false;
      return;
    }

    // The ref is updated here rather than inside a setState updater:
    // updaters must be pure and may be invoked more than once.
    wasAwakeRef.current = next.kind === "awake";
    setStatus(next);
  }, []);

  useEffect(() => {
    void refresh();
    const id = window.setInterval(() => void refresh(), POLL_INTERVAL_MS);
    return () => window.clearInterval(id);
  }, [refresh]);

  /** Send a wake-up request and show "warming" while it is pending. */
  async function handlePrewarm(): Promise<void> {
    if (prewarming) return;
    setPrewarming(true);
    // We are deliberately leaving any "awake" state behind.
    wasAwakeRef.current = false;
    setStatus({ kind: "warming" });
    try {
      // No timeout here — let the browser hold the connection until
      // HF Spaces wakes up and answers. The response itself is not
      // inspected; the probe below and the regular poll report the
      // actual state.
      await fetch(HEALTH_URL, { method: "GET", mode: "cors" });
    } catch {
      // Ignore — the polling loop will surface the real state.
    } finally {
      setPrewarming(false);
      void refresh();
    }
  }

  return (
    <StatusBanner
      status={status}
      onPrewarm={handlePrewarm}
      prewarming={prewarming}
    />
  );
}
// ---------------------------------------------------------------------
// Render
// ---------------------------------------------------------------------
/** Props for the presentational banner. */
interface StatusBannerProps {
/** State to render; selects the colours, title and description. */
status: Status;
/** Click handler for the Prewarm button. */
onPrewarm: () => void;
/** While true the Prewarm button is disabled and reads "Prewarming…". */
prewarming: boolean;
}
/**
 * Presentational banner: a coloured dot, a title, a one-line
 * description and — for states where it makes sense — a Prewarm
 * button. Holds no state of its own.
 */
function StatusBanner(props: StatusBannerProps): JSX.Element {
  const { status, onPrewarm, prewarming } = props;
  const { bg, border, dot, title } = toneFor(status);

  // The button is only offered in states where a wake-up can help.
  const prewarmButton = showsPrewarm(status) ? (
    <button
      type="button"
      onClick={onPrewarm}
      disabled={prewarming}
      className={cn(
        "shrink-0 rounded-md border border-border bg-surface px-2 py-1 text-[10px] font-medium uppercase tracking-wider transition",
        "hover:bg-surfaceMuted disabled:cursor-not-allowed disabled:opacity-60",
      )}
    >
      {prewarming ? "Prewarming…" : "Prewarm GPU"}
    </button>
  ) : null;

  return (
    <div
      className={cn(
        "rounded-lg border px-3 py-2 text-[11px] leading-relaxed",
        bg,
        border,
      )}
    >
      <div className="flex items-start gap-2">
        {/* Decorative status dot; the title carries the meaning. */}
        <span
          aria-hidden
          className={cn("mt-1 inline-block h-2 w-2 shrink-0 rounded-full", dot)}
        />
        <div className="flex-1 min-w-0">
          <p className={cn("font-medium", title)}>{labelFor(status)}</p>
          <p className="mt-0.5 text-textMuted">{descriptionFor(status)}</p>
        </div>
        {prewarmButton}
      </div>
    </div>
  );
}
/** CSS class names that colour the banner for one status. */
interface Tone {
/** Background class for the banner container. */
bg: string;
/** Border-colour class for the banner container. */
border: string;
/** Class(es) for the status dot. */
dot: string;
/** Text-colour class for the title line. */
title: string;
}
/**
 * Pick the colour classes for a status: emerald for awake, amber for
 * warming and asleep (the warming dot pulses), rose for error, and a
 * neutral pulsing tone for anything else, including "unknown".
 *
 * Class names are written out in full rather than assembled from
 * parts so that they remain plain string literals in the source.
 */
function toneFor(status: Status): Tone {
  const { kind } = status;

  if (kind === "awake") {
    return {
      bg: "bg-emerald-950/40",
      border: "border-emerald-800/60",
      dot: "bg-emerald-400",
      title: "text-emerald-200",
    };
  }

  if (kind === "warming") {
    return {
      bg: "bg-amber-950/40",
      border: "border-amber-800/60",
      dot: "bg-amber-400 animate-pulse",
      title: "text-amber-200",
    };
  }

  if (kind === "asleep") {
    return {
      bg: "bg-amber-950/40",
      border: "border-amber-800/60",
      dot: "bg-amber-500",
      title: "text-amber-200",
    };
  }

  if (kind === "error") {
    return {
      bg: "bg-rose-950/40",
      border: "border-rose-800/60",
      dot: "bg-rose-500",
      title: "text-rose-200",
    };
  }

  // "unknown" and any unrecognised kind share the neutral tone.
  return {
    bg: "bg-surfaceMuted",
    border: "border-border",
    dot: "bg-textMuted animate-pulse",
    title: "text-textPrimary",
  };
}
/**
 * Title line for the banner. "awake" distinguishes whether per-model
 * readiness was confirmed; every other recognised state has a fixed
 * title, and anything else (including "unknown") reads as still
 * checking.
 */
function labelFor(status: Status): string {
  if (status.kind === "awake") {
    if (status.bothUpstreams) {
      return "GPU is warm — both models loaded";
    }
    return "GPU is warm";
  }

  const fixedTitles = new Map<string, string>([
    ["warming", "GPU is warming up"],
    ["asleep", "GPU is asleep"],
    ["error", "Couldn't reach the GPU Space"],
  ]);
  return fixedTitles.get(status.kind) ?? "Checking GPU status…";
}
/**
 * Secondary line for the banner: what the user should expect from
 * their next request in the given state. Unrecognised kinds get the
 * same "probing" text as "unknown".
 */
function descriptionFor(status: Status): string {
  const probingText =
    "Probing https://pratyush-01-physix-infer.hf.space/health …";

  const texts = new Map<string, string>([
    [
      "awake",
      "Next request will respond in ~1-3 s. Sleeps again after 5 min idle.",
    ],
    [
      "warming",
      "vLLM is loading the 3B weights. First request will resolve in ~30-90 s; subsequent calls are fast.",
    ],
    [
      "asleep",
      "First request will trigger a cold boot (~90-120 s while vLLM loads two 3B models on the L4). Click Prewarm now if you'd rather not wait inside the episode.",
    ],
    [
      "error",
      "The Space might be temporarily unreachable. Episodes targeting PhysiX-Infer will fail until it recovers — try Hugging Face Router as a fallback.",
    ],
  ]);

  return texts.get(status.kind) ?? probingText;
}
/** Whether the Prewarm button is offered: only when the Space looks
 * asleep or could not be reached. */
function showsPrewarm(status: Status): boolean {
  switch (status.kind) {
    case "asleep":
    case "error":
      return true;
    default:
      return false;
  }
}
|