"""Real-network smoke test for the Hugging Face Router path. The judges will overwhelmingly run the demo through HF Router, so we verify it works against the real endpoint *before* shipping the Space. This script: 1. Confirms HF_TOKEN is set and has the right scope by listing accounts via /v1/models. (Cheap; doesn't bill credits.) 2. For every model in the demo's HF Router suggestion list, fires a single chat completion to confirm at least one serving provider is warm. Reports which ones serve and which 404. 3. Drives one full PhysiX episode end-to-end through the live server using whichever model served first, and prints the reward breakdown. Not part of pytest because it (a) needs the real HF_TOKEN, (b) burns real credits (~one cent per run), and (c) is tied to which providers are warm at any given moment, which is intrinsically flaky. Usage: export HF_TOKEN=hf_xxx # in one terminal: python -m physix.server.app --host 127.0.0.1 --port 8000 # in another: python scripts/verify_hf_router.py """ from __future__ import annotations import argparse import asyncio import os import sys from dataclasses import dataclass import openai import requests from physix.server.providers import HF_ROUTER_BASE_URL # Models the connection panel suggests under the HF Router endpoint. # Keep this list in sync with `frontend/src/lib/llmPresets.ts`. HF_SUGGESTED_MODELS: list[str] = [ "Pratyush-01/physix-3b-rl", "Pratyush-01/physix-3b-sft-merged", "Qwen/Qwen2.5-3B-Instruct", "Qwen/Qwen2.5-7B-Instruct", ] @dataclass class ModelCheckResult: model: str served: bool detail: str latency_s: float = 0.0 def _green(s: str) -> str: return f"\033[32m{s}\033[0m" def _red(s: str) -> str: return f"\033[31m{s}\033[0m" def _yellow(s: str) -> str: return f"\033[33m{s}\033[0m" def _bold(s: str) -> str: return f"\033[1m{s}\033[0m" def check_token() -> str: """Verify HF_TOKEN exists and has Inference Providers scope. Returns the token. Exits 1 on any auth-level failure with a clear remediation message — this is the most common reason the demo appears to "not work" for first-time visitors. """ token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_API_KEY") if not token: print(_red("HF_TOKEN is not set.")) print( " → Create one at https://huggingface.co/settings/tokens", file=sys.stderr, ) print( " with the 'Make calls to Inference Providers' fine-grained", file=sys.stderr, ) print( " permission, then `export HF_TOKEN=hf_...` and re-run.", file=sys.stderr, ) sys.exit(1) # /v1/models is the cheapest way to confirm the token has the # right scope; HF returns 200 with a paginated catalogue. try: response = requests.get( f"{HF_ROUTER_BASE_URL}/models", headers={"Authorization": f"Bearer {token}"}, timeout=15, ) except requests.RequestException as exc: print(_red(f"Could not reach {HF_ROUTER_BASE_URL}: {exc}")) sys.exit(1) if response.status_code == 401: print( _red( "HF_TOKEN was rejected (401). The token likely doesn't have " "the 'Make calls to Inference Providers' permission." ) ) print( " → Re-create the token at https://huggingface.co/settings/tokens", file=sys.stderr, ) print( " making sure that fine-grained scope is checked.", file=sys.stderr, ) sys.exit(1) if not response.ok: print(_red(f"HF Router rejected /models lookup: HTTP {response.status_code}")) print(response.text[:500], file=sys.stderr) sys.exit(1) print(_green("✓ HF_TOKEN is valid and has Inference Providers scope.")) return token def check_model(token: str, model: str, *, timeout_s: float = 60.0) -> ModelCheckResult: """Fire one tiny chat completion against a model. Returns a structured result indicating whether at least one provider is currently serving that model. We deliberately use a 1-token completion to keep credit usage minimal. """ client = openai.OpenAI( base_url=HF_ROUTER_BASE_URL, api_key=token, timeout=timeout_s, default_headers={"User-Agent": "physix-live-demo/0.1 (verify_hf_router)"}, ) import time t0 = time.perf_counter() try: response = client.chat.completions.create( model=model, messages=[{"role": "user", "content": "Reply with the single word OK."}], max_tokens=4, temperature=0.0, ) except openai.NotFoundError: return ModelCheckResult( model=model, served=False, detail=( "404 — no Inference Provider is currently serving this model. " "Check the model card's 'Deploy → Inference API' panel." ), ) except openai.AuthenticationError as exc: return ModelCheckResult( model=model, served=False, detail=f"401 — {exc}", ) except openai.BadRequestError as exc: return ModelCheckResult( model=model, served=False, detail=f"400 — {exc}", ) except (openai.APIConnectionError, openai.APITimeoutError) as exc: return ModelCheckResult( model=model, served=False, detail=f"connection/timeout — {exc}", ) except Exception as exc: # noqa: BLE001 return ModelCheckResult( model=model, served=False, detail=f"{type(exc).__name__}: {exc}", ) elapsed = time.perf_counter() - t0 content = (response.choices[0].message.content if response.choices else "") or "" return ModelCheckResult( model=model, served=True, detail=f"got: {content.strip()[:40]!r}", latency_s=elapsed, ) async def drive_one_episode(token: str, model: str, base_url: str) -> None: """Drive a single PhysiX episode end-to-end through the live server, using the chosen HF Router model. Confirms not just that the LLM responds, but that the full env+verifier+UI loop works.""" import httpx print() print(_bold(f"--- Driving one PhysiX episode through {model} ---")) timeout = httpx.Timeout(180.0, connect=10.0) async with httpx.AsyncClient(base_url=base_url, timeout=timeout) as http: # Sanity: the local server is up. try: await http.get("/interactive/systems") except httpx.HTTPError as exc: print(_red(f"Local PhysiX server unreachable at {base_url}: {exc}")) print( " → Start it with `python -m physix.server.app --host 127.0.0.1 --port 8000`", file=sys.stderr, ) return start = await http.post( "/interactive/sessions", json={"system_id": "free_fall_drag", "seed": 42, "max_turns": 4}, ) start.raise_for_status() session_id = start.json()["session_id"] print(f" session_id: {session_id}") for turn in range(4): step = await http.post( f"/interactive/sessions/{session_id}/llm-step", json={ "base_url": HF_ROUTER_BASE_URL, "model": model, "api_key": token, "temperature": 0.4, "max_tokens": 1024, }, ) if step.status_code != 200: print(_red(f" turn {turn + 1}: HTTP {step.status_code}")) try: detail = step.json().get("detail", step.text) except Exception: detail = step.text print(f" {detail}") break body = step.json() reward = body["observation"]["reward_breakdown"] print( f" turn {turn + 1}: " f"match={reward['match']:.2f} " f"format={reward['format']:.2f} " f"total={reward['total']:.2f} " f"({body['latency_s']:.1f}s)" ) print(f" equation: {body['action']['equation']!r}") if body["observation"]["done"]: print(_green(" done.")) break await http.delete(f"/interactive/sessions/{session_id}") def main() -> None: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( "--server-url", default="http://127.0.0.1:8000", help="Local PhysiX server (default: http://127.0.0.1:8000)", ) parser.add_argument( "--skip-episode", action="store_true", help="Skip the end-to-end episode drive; only do model probes.", ) args = parser.parse_args() print(_bold("=== Step 1: HF_TOKEN ===")) token = check_token() print() print(_bold("=== Step 2: probing each suggested HF model ===")) print( " (one tiny completion per model; non-served models will 404 quickly)" ) print() results: list[ModelCheckResult] = [] for model in HF_SUGGESTED_MODELS: print(f" → {model:50s}", end=" ", flush=True) result = check_model(token, model) results.append(result) if result.served: print(_green(f"OK ({result.latency_s:.1f}s) {result.detail}")) else: print(_red("NOT SERVED")) print(f" {result.detail}") served = [r for r in results if r.served] not_served = [r for r in results if not r.served] print() print(_bold("=== Summary ===")) print(f" {_green(f'{len(served)} served')} / {_yellow(f'{len(not_served)} not served')} of {len(results)}") if not_served: print() print(_yellow("Not served:")) for r in not_served: print(f" · {r.model}") print() print( "If the trained PhysiX model is in the not-served list, you have" ) print("a few options before shipping:") print(" 1. Open the model card → 'Deploy' → 'Inference Providers' →") print(" enable a provider that hosts it (Featherless / Together).") print(" 2. Append `:fastest` to the model id in the demo's preset") print(" to let HF auto-pick a provider.") print( " 3. Fall back to one of the served baselines — the comparison" ) print(" story still works.") if args.skip_episode or not served: return asyncio.run(drive_one_episode(token, served[0].model, args.server_url)) if __name__ == "__main__": main()