# physix/scripts/verify_hf_router.py
"""Real-network smoke test for the Hugging Face Router path.
The judges will overwhelmingly run the demo through HF Router, so we
verify it works against the real endpoint *before* shipping the Space.
This script:
1. Confirms HF_TOKEN is set and has the right scope by listing
   the model catalogue via /v1/models. (Cheap; doesn't bill credits.)
2. For every model in the demo's HF Router suggestion list, fires
a single chat completion to confirm at least one serving
provider is warm. Reports which ones serve and which 404.
3. Drives one full PhysiX episode end-to-end through the live
server using whichever model served first, and prints the
reward breakdown.
Not part of pytest because it (a) needs the real HF_TOKEN, (b) burns
real credits (~one cent per run), and (c) is tied to which providers
are warm at any given moment, which is intrinsically flaky.
Usage:
export HF_TOKEN=hf_xxx
# in one terminal:
python -m physix.server.app --host 127.0.0.1 --port 8000
# in another:
python scripts/verify_hf_router.py
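# or, to probe models only (skips the end-to-end episode):
python scripts/verify_hf_router.py --skip-episode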
"""
from __future__ import annotations
import argparse
import asyncio
import os
import sys
import time
from dataclasses import dataclass
import openai
import requests
from physix.server.providers import HF_ROUTER_BASE_URL
# Models the connection panel suggests under the HF Router endpoint.
# Keep this list in sync with `frontend/src/lib/llmPresets.ts`.
HF_SUGGESTED_MODELS: list[str] = [
"Pratyush-01/physix-3b-rl",
"Pratyush-01/physix-3b-sft-merged",
"Qwen/Qwen2.5-3B-Instruct",
"Qwen/Qwen2.5-7B-Instruct",
]
@dataclass
class ModelCheckResult:
model: str
served: bool
detail: str
latency_s: float = 0.0
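# Plain ANSI escape-code helpers for coloured terminal output.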
def _green(s: str) -> str:
return f"\033[32m{s}\033[0m"
def _red(s: str) -> str:
return f"\033[31m{s}\033[0m"
def _yellow(s: str) -> str:
return f"\033[33m{s}\033[0m"
def _bold(s: str) -> str:
return f"\033[1m{s}\033[0m"
def check_token() -> str:
"""Verify HF_TOKEN exists and has Inference Providers scope.
Returns the token. Exits 1 on any auth-level failure with a clear
remediation message — this is the most common reason the demo
appears to "not work" for first-time visitors.
"""
token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_API_KEY")
if not token:
print(_red("HF_TOKEN is not set."))
print(
" → Create one at https://huggingface.co/settings/tokens",
file=sys.stderr,
)
print(
" with the 'Make calls to Inference Providers' fine-grained",
file=sys.stderr,
)
print(
" permission, then `export HF_TOKEN=hf_...` and re-run.",
file=sys.stderr,
)
sys.exit(1)
# /v1/models is the cheapest way to confirm the token has the
# right scope; HF returns 200 with a paginated catalogue.
try:
response = requests.get(
f"{HF_ROUTER_BASE_URL}/models",
headers={"Authorization": f"Bearer {token}"},
timeout=15,
)
except requests.RequestException as exc:
print(_red(f"Could not reach {HF_ROUTER_BASE_URL}: {exc}"))
sys.exit(1)
if response.status_code == 401:
print(
_red(
"HF_TOKEN was rejected (401). The token likely doesn't have "
"the 'Make calls to Inference Providers' permission."
)
)
print(
" → Re-create the token at https://huggingface.co/settings/tokens",
file=sys.stderr,
)
print(
" making sure that fine-grained scope is checked.",
file=sys.stderr,
)
sys.exit(1)
if not response.ok:
print(_red(f"HF Router rejected /models lookup: HTTP {response.status_code}"))
print(response.text[:500], file=sys.stderr)
sys.exit(1)
print(_green(f"✓ HF_TOKEN is valid and has Inference Providers scope."))
return token
def check_model(token: str, model: str, *, timeout_s: float = 60.0) -> ModelCheckResult:
"""Fire one tiny chat completion against a model.
Returns a structured result indicating whether at least one
    provider is currently serving that model. We deliberately cap the
    completion at a few tokens to keep credit usage minimal.
"""
client = openai.OpenAI(
base_url=HF_ROUTER_BASE_URL,
api_key=token,
timeout=timeout_s,
default_headers={"User-Agent": "physix-live-demo/0.1 (verify_hf_router)"},
)
    t0 = time.perf_counter()
try:
response = client.chat.completions.create(
model=model,
messages=[{"role": "user", "content": "Reply with the single word OK."}],
max_tokens=4,
temperature=0.0,
)
except openai.NotFoundError:
return ModelCheckResult(
model=model,
served=False,
            detail=(
                "404 — no Inference Provider is currently serving this model. "
                "Check the model card's 'Deploy → Inference Providers' panel."
            ),
)
except openai.AuthenticationError as exc:
return ModelCheckResult(
model=model,
served=False,
detail=f"401 — {exc}",
)
except openai.BadRequestError as exc:
return ModelCheckResult(
model=model,
served=False,
detail=f"400 — {exc}",
)
except (openai.APIConnectionError, openai.APITimeoutError) as exc:
return ModelCheckResult(
model=model,
served=False,
detail=f"connection/timeout — {exc}",
)
except Exception as exc: # noqa: BLE001
return ModelCheckResult(
model=model,
served=False,
detail=f"{type(exc).__name__}: {exc}",
)
elapsed = time.perf_counter() - t0
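    # Some providers return an empty choices list or a null message content;
    # guard against both so the probe never raises here.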
content = (response.choices[0].message.content if response.choices else "") or ""
return ModelCheckResult(
model=model,
served=True,
detail=f"got: {content.strip()[:40]!r}",
latency_s=elapsed,
)
async def drive_one_episode(token: str, model: str, base_url: str) -> None:
    """Drive a single PhysiX episode end-to-end through the live
    server, using the chosen HF Router model. Confirms not just that
    the LLM responds, but that the full env + verifier loop behind
    the UI works."""
import httpx
print()
print(_bold(f"--- Driving one PhysiX episode through {model} ---"))
timeout = httpx.Timeout(180.0, connect=10.0)
async with httpx.AsyncClient(base_url=base_url, timeout=timeout) as http:
        # Sanity check: the local server is up and responding.
        try:
            health = await http.get("/interactive/systems")
            health.raise_for_status()
        except httpx.HTTPError as exc:
print(_red(f"Local PhysiX server unreachable at {base_url}: {exc}"))
print(
" → Start it with `python -m physix.server.app --host 127.0.0.1 --port 8000`",
file=sys.stderr,
)
return
        max_turns = 4
        start = await http.post(
            "/interactive/sessions",
            json={"system_id": "free_fall_drag", "seed": 42, "max_turns": max_turns},
        )
start.raise_for_status()
session_id = start.json()["session_id"]
print(f" session_id: {session_id}")
        for turn in range(max_turns):
step = await http.post(
f"/interactive/sessions/{session_id}/llm-step",
json={
"base_url": HF_ROUTER_BASE_URL,
"model": model,
"api_key": token,
"temperature": 0.4,
"max_tokens": 1024,
},
)
if step.status_code != 200:
print(_red(f" turn {turn + 1}: HTTP {step.status_code}"))
try:
detail = step.json().get("detail", step.text)
except Exception:
detail = step.text
print(f" {detail}")
break
body = step.json()
reward = body["observation"]["reward_breakdown"]
print(
f" turn {turn + 1}: "
f"match={reward['match']:.2f} "
f"format={reward['format']:.2f} "
f"total={reward['total']:.2f} "
f"({body['latency_s']:.1f}s)"
)
print(f" equation: {body['action']['equation']!r}")
if body["observation"]["done"]:
print(_green(" done."))
break
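        # Clean up the session even when the loop exits early.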
await http.delete(f"/interactive/sessions/{session_id}")
def main() -> None:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--server-url",
default="http://127.0.0.1:8000",
help="Local PhysiX server (default: http://127.0.0.1:8000)",
)
parser.add_argument(
"--skip-episode",
action="store_true",
help="Skip the end-to-end episode drive; only do model probes.",
)
args = parser.parse_args()
print(_bold("=== Step 1: HF_TOKEN ==="))
token = check_token()
print()
print(_bold("=== Step 2: probing each suggested HF model ==="))
print(
" (one tiny completion per model; non-served models will 404 quickly)"
)
print()
results: list[ModelCheckResult] = []
for model in HF_SUGGESTED_MODELS:
print(f" → {model:50s}", end=" ", flush=True)
result = check_model(token, model)
results.append(result)
if result.served:
print(_green(f"OK ({result.latency_s:.1f}s) {result.detail}"))
else:
print(_red("NOT SERVED"))
print(f" {result.detail}")
served = [r for r in results if r.served]
not_served = [r for r in results if not r.served]
print()
print(_bold("=== Summary ==="))
print(f" {_green(f'{len(served)} served')} / {_yellow(f'{len(not_served)} not served')} of {len(results)}")
if not_served:
print()
print(_yellow("Not served:"))
for r in not_served:
print(f" · {r.model}")
print()
        print("If the trained PhysiX model is in the not-served list, you have")
        print("a few options before shipping:")
        print("  1. Open the model card → 'Deploy' → 'Inference Providers' →")
        print("     enable a provider that hosts it (Featherless / Together).")
        print("  2. Append `:fastest` to the model id in the demo's preset")
        print("     to let HF auto-pick a provider.")
        print("  3. Fall back to one of the served baselines — the comparison")
        print("     story still works.")
if args.skip_episode or not served:
return
asyncio.run(drive_one_episode(token, served[0].model, args.server_url))
if __name__ == "__main__":
main()