"""Real-network smoke test for the Hugging Face Router path.

The judges will overwhelmingly run the demo through HF Router, so we
verify it works against the real endpoint *before* shipping the Space.
This script:

  1. Confirms HF_TOKEN is set and has the right scope by listing
     the model catalogue via /v1/models. (Cheap; doesn't bill credits.)
  2. For every model in the demo's HF Router suggestion list, fires
     a single chat completion to confirm at least one serving
     provider is warm. Reports which ones serve and which 404.
  3. Drives one full PhysiX episode end-to-end through the live
     server using whichever model served first, and prints the
     reward breakdown.

Not part of pytest because it (a) needs the real HF_TOKEN, (b) burns
real credits (~one cent per run), and (c) is tied to which providers
are warm at any given moment, which is intrinsically flaky.

Usage:

    export HF_TOKEN=hf_xxx
    # in one terminal:
    python -m physix.server.app --host 127.0.0.1 --port 8000
    # in another:
    python scripts/verify_hf_router.py
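
    # probe models only, without driving an episode:
    python scripts/verify_hf_router.py --skip-episode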
"""

from __future__ import annotations

import argparse
import asyncio
import os
import sys
import time
from dataclasses import dataclass

import openai
import requests

from physix.server.providers import HF_ROUTER_BASE_URL


# Models the connection panel suggests under the HF Router endpoint.
# Keep this list in sync with `frontend/src/lib/llmPresets.ts`.
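# A model id may also carry a ":fastest" suffix to let HF auto-pick a
# provider (see the remediation notes printed in main()).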
HF_SUGGESTED_MODELS: list[str] = [
    "Pratyush-01/physix-3b-rl",
    "Pratyush-01/physix-3b-sft-merged",
    "Qwen/Qwen2.5-3B-Instruct",
    "Qwen/Qwen2.5-7B-Instruct",
]


@dataclass
class ModelCheckResult:
    model: str
    served: bool
    detail: str
    latency_s: float = 0.0


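# Minimal ANSI colour helpers so pass/fail status stands out in the terminal.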
def _green(s: str) -> str:
    return f"\033[32m{s}\033[0m"


def _red(s: str) -> str:
    return f"\033[31m{s}\033[0m"


def _yellow(s: str) -> str:
    return f"\033[33m{s}\033[0m"


def _bold(s: str) -> str:
    return f"\033[1m{s}\033[0m"


def check_token() -> str:
    """Verify HF_TOKEN exists and has Inference Providers scope.

    Returns the token. Exits 1 on any auth-level failure with a clear
    remediation message — this is the most common reason the demo
    appears to "not work" for first-time visitors.
    """
    token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_API_KEY")
    if not token:
        print(_red("HF_TOKEN is not set."))
        print(
            "  → Create one at https://huggingface.co/settings/tokens",
            file=sys.stderr,
        )
        print(
            "    with the 'Make calls to Inference Providers' fine-grained",
            file=sys.stderr,
        )
        print(
            "    permission, then `export HF_TOKEN=hf_...` and re-run.",
            file=sys.stderr,
        )
        sys.exit(1)

    # /v1/models is the cheapest way to confirm the token has the
    # right scope; HF returns 200 with a paginated catalogue.
    try:
        response = requests.get(
            f"{HF_ROUTER_BASE_URL}/models",
            headers={"Authorization": f"Bearer {token}"},
            timeout=15,
        )
    except requests.RequestException as exc:
        print(_red(f"Could not reach {HF_ROUTER_BASE_URL}: {exc}"))
        sys.exit(1)

    if response.status_code == 401:
        print(
            _red(
                "HF_TOKEN was rejected (401). The token likely doesn't have "
                "the 'Make calls to Inference Providers' permission."
            )
        )
        print(
            "  → Re-create the token at https://huggingface.co/settings/tokens",
            file=sys.stderr,
        )
        print(
            "    making sure that fine-grained scope is checked.",
            file=sys.stderr,
        )
        sys.exit(1)
    if not response.ok:
        print(_red(f"HF Router rejected /models lookup: HTTP {response.status_code}"))
        print(response.text[:500], file=sys.stderr)
        sys.exit(1)

    print(_green("✓ HF_TOKEN is valid and has Inference Providers scope."))
    return token


def check_model(token: str, model: str, *, timeout_s: float = 60.0) -> ModelCheckResult:
    """Fire one tiny chat completion against a model.

    Returns a structured result indicating whether at least one
    provider is currently serving that model. We deliberately cap the
    completion at a few tokens to keep credit usage minimal.
    """
    client = openai.OpenAI(
        base_url=HF_ROUTER_BASE_URL,
        api_key=token,
        timeout=timeout_s,
        default_headers={"User-Agent": "physix-live-demo/0.1 (verify_hf_router)"},
    )

    t0 = time.perf_counter()
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": "Reply with the single word OK."}],
            max_tokens=4,
            temperature=0.0,
        )
    except openai.NotFoundError:
        return ModelCheckResult(
            model=model,
            served=False,
            detail=(
                "404 — no Inference Provider is currently serving this model. "
                "Check the model card's 'Deploy → Inference API' panel."
            ),
        )
    except openai.AuthenticationError as exc:
        return ModelCheckResult(
            model=model,
            served=False,
            detail=f"401 — {exc}",
        )
    except openai.BadRequestError as exc:
        return ModelCheckResult(
            model=model,
            served=False,
            detail=f"400 — {exc}",
        )
    except (openai.APIConnectionError, openai.APITimeoutError) as exc:
        return ModelCheckResult(
            model=model,
            served=False,
            detail=f"connection/timeout — {exc}",
        )
    except Exception as exc:  # noqa: BLE001
        return ModelCheckResult(
            model=model,
            served=False,
            detail=f"{type(exc).__name__}: {exc}",
        )

    elapsed = time.perf_counter() - t0
    content = (response.choices[0].message.content if response.choices else "") or ""
    return ModelCheckResult(
        model=model,
        served=True,
        detail=f"got: {content.strip()[:40]!r}",
        latency_s=elapsed,
    )


async def drive_one_episode(token: str, model: str, base_url: str) -> None:
    """Drive a single PhysiX episode end-to-end through the live
    server, using the chosen HF Router model. Confirms not just that
    the LLM responds, but that the full env+verifier+UI loop works."""
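    # httpx is only needed for this step, so it is imported lazily.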
    import httpx

    print()
    print(_bold(f"--- Driving one PhysiX episode through {model} ---"))

    timeout = httpx.Timeout(180.0, connect=10.0)
    async with httpx.AsyncClient(base_url=base_url, timeout=timeout) as http:
        # Sanity: the local server is up.
        try:
            await http.get("/interactive/systems")
        except httpx.HTTPError as exc:
            print(_red(f"Local PhysiX server unreachable at {base_url}: {exc}"))
            print(
                "  → Start it with `python -m physix.server.app --host 127.0.0.1 --port 8000`",
                file=sys.stderr,
            )
            return

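        # Fixed seed so repeat runs of this script request the same episode.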
        start = await http.post(
            "/interactive/sessions",
            json={"system_id": "free_fall_drag", "seed": 42, "max_turns": 4},
        )
        start.raise_for_status()
        session_id = start.json()["session_id"]
        print(f"  session_id: {session_id}")

        for turn in range(4):
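            # One llm-step = the server calls the model once and scores
            # the returned equation (the env+verifier loop).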
            step = await http.post(
                f"/interactive/sessions/{session_id}/llm-step",
                json={
                    "base_url": HF_ROUTER_BASE_URL,
                    "model": model,
                    "api_key": token,
                    "temperature": 0.4,
                    "max_tokens": 1024,
                },
            )
            if step.status_code != 200:
                print(_red(f"  turn {turn + 1}: HTTP {step.status_code}"))
                try:
                    detail = step.json().get("detail", step.text)
                except Exception:
                    detail = step.text
                print(f"    {detail}")
                break
            body = step.json()
            reward = body["observation"]["reward_breakdown"]
            print(
                f"  turn {turn + 1}: "
                f"match={reward['match']:.2f}  "
                f"format={reward['format']:.2f}  "
                f"total={reward['total']:.2f}  "
                f"({body['latency_s']:.1f}s)"
            )
            print(f"    equation: {body['action']['equation']!r}")
            if body["observation"]["done"]:
                print(_green("  done."))
                break

        await http.delete(f"/interactive/sessions/{session_id}")


def main() -> None:
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--server-url",
        default="http://127.0.0.1:8000",
        help="Local PhysiX server (default: http://127.0.0.1:8000)",
    )
    parser.add_argument(
        "--skip-episode",
        action="store_true",
        help="Skip the end-to-end episode drive; only do model probes.",
    )
    args = parser.parse_args()

    print(_bold("=== Step 1: HF_TOKEN ==="))
    token = check_token()

    print()
    print(_bold("=== Step 2: probing each suggested HF model ==="))
    print(
        "  (one tiny completion per model; non-served models will 404 quickly)"
    )
    print()

    results: list[ModelCheckResult] = []
    for model in HF_SUGGESTED_MODELS:
        print(f"  → {model:50s}", end="  ", flush=True)
        result = check_model(token, model)
        results.append(result)
        if result.served:
            print(_green(f"OK  ({result.latency_s:.1f}s)  {result.detail}"))
        else:
            print(_red("NOT SERVED"))
            print(f"     {result.detail}")

    served = [r for r in results if r.served]
    not_served = [r for r in results if not r.served]

    print()
    print(_bold("=== Summary ==="))
    print(f"  {_green(f'{len(served)} served')} / {_yellow(f'{len(not_served)} not served')} of {len(results)}")
    if not_served:
        print()
        print(_yellow("Not served:"))
        for r in not_served:
            print(f"  · {r.model}")
        print()
        print("If the trained PhysiX model is in the not-served list, you")
        print("have a few options before shipping:")
        print("  1. Open the model card → 'Deploy' → 'Inference Providers' →")
        print("     enable a provider that hosts it (Featherless / Together).")
        print("  2. Append `:fastest` to the model id in the demo's preset")
        print("     to let HF auto-pick a provider.")
        print("  3. Fall back to one of the served baselines — the comparison")
        print("     story still works.")

    if args.skip_episode or not served:
        return

    asyncio.run(drive_one_episode(token, served[0].model, args.server_url))


if __name__ == "__main__":
    main()