"""Real-network smoke test for the Hugging Face Router path.

The judges will overwhelmingly run the demo through HF Router, so we
verify it works against the real endpoint *before* shipping the Space.
This script:

  1. Confirms HF_TOKEN is set and has the right scope by listing
     the model catalogue via /v1/models. (Cheap; doesn't bill credits.)
  2. For every model in the demo's HF Router suggestion list, fires
     a single chat completion to confirm at least one serving
     provider is warm. Reports which ones serve and which 404.
  3. Drives one full PhysiX episode end-to-end through the live
     server using whichever model served first, and prints the
     reward breakdown.

Not part of pytest because it (a) needs the real HF_TOKEN, (b) burns
real credits (~one cent per run), and (c) is tied to which providers
are warm at any given moment, which is intrinsically flaky.

Usage:

    export HF_TOKEN=hf_xxx
    # in one terminal:
    python -m physix.server.app --host 127.0.0.1 --port 8000
    # in another:
    python scripts/verify_hf_router.py
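
    # probe models only, without driving an episode:
    python scripts/verify_hf_router.py --skip-episode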
"""

from __future__ import annotations

import argparse
import asyncio
import os
import sys
import time
from dataclasses import dataclass

import openai
import requests

from physix.server.providers import HF_ROUTER_BASE_URL


# Models the connection panel suggests under the HF Router endpoint.
# Keep this list in sync with `frontend/src/lib/llmPresets.ts`.
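# A model id may also carry a ":fastest" suffix to let HF auto-pick a
# provider (see the remediation notes printed in main()).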
HF_SUGGESTED_MODELS: list[str] = [
    "Pratyush-01/physix-3b-rl",
    "Pratyush-01/physix-3b-sft-merged",
    "Qwen/Qwen2.5-3B-Instruct",
    "Qwen/Qwen2.5-7B-Instruct",
]


@dataclass
class ModelCheckResult:
    model: str
    served: bool
    detail: str
    latency_s: float = 0.0


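# Minimal ANSI colour helpers so pass/fail status stands out in the terminal.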
def _green(s: str) -> str:
    return f"\033[32m{s}\033[0m"


def _red(s: str) -> str:
    return f"\033[31m{s}\033[0m"


def _yellow(s: str) -> str:
    return f"\033[33m{s}\033[0m"


def _bold(s: str) -> str:
    return f"\033[1m{s}\033[0m"


def check_token() -> str:
    """Verify HF_TOKEN exists and has Inference Providers scope.

    Returns the token. Exits 1 on any auth-level failure with a clear
    remediation message — this is the most common reason the demo
    appears to "not work" for first-time visitors.
    """
    token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_API_KEY")
    if not token:
        print(_red("HF_TOKEN is not set."))
        print(
            "  → Create one at https://huggingface.co/settings/tokens",
            file=sys.stderr,
        )
        print(
            "    with the 'Make calls to Inference Providers' fine-grained",
            file=sys.stderr,
        )
        print(
            "    permission, then `export HF_TOKEN=hf_...` and re-run.",
            file=sys.stderr,
        )
        sys.exit(1)

    # /v1/models is the cheapest way to confirm the token has the
    # right scope; HF returns 200 with a paginated catalogue.
    try:
        response = requests.get(
            f"{HF_ROUTER_BASE_URL}/models",
            headers={"Authorization": f"Bearer {token}"},
            timeout=15,
        )
    except requests.RequestException as exc:
        print(_red(f"Could not reach {HF_ROUTER_BASE_URL}: {exc}"))
        sys.exit(1)

    if response.status_code == 401:
        print(
            _red(
                "HF_TOKEN was rejected (401). The token likely doesn't have "
                "the 'Make calls to Inference Providers' permission."
            )
        )
        print(
            "  → Re-create the token at https://huggingface.co/settings/tokens",
            file=sys.stderr,
        )
        print(
            "    making sure that fine-grained scope is checked.",
            file=sys.stderr,
        )
        sys.exit(1)
    if not response.ok:
        print(_red(f"HF Router rejected /models lookup: HTTP {response.status_code}"))
        print(response.text[:500], file=sys.stderr)
        sys.exit(1)

    print(_green("✓ HF_TOKEN is valid and has Inference Providers scope."))
    return token


def check_model(token: str, model: str, *, timeout_s: float = 60.0) -> ModelCheckResult:
    """Fire one tiny chat completion against a model.

    Returns a structured result indicating whether at least one
    provider is currently serving that model. We deliberately cap the
    completion at a few tokens to keep credit usage minimal.
    """
    client = openai.OpenAI(
        base_url=HF_ROUTER_BASE_URL,
        api_key=token,
        timeout=timeout_s,
        default_headers={"User-Agent": "physix-live-demo/0.1 (verify_hf_router)"},
    )

    t0 = time.perf_counter()
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": "Reply with the single word OK."}],
            max_tokens=4,
            temperature=0.0,
        )
    except openai.NotFoundError:
        return ModelCheckResult(
            model=model,
            served=False,
            detail=(
                "404 — no Inference Provider is currently serving this model. "
                "Check the model card's 'Deploy → Inference API' panel."
            ),
        )
    except openai.AuthenticationError as exc:
        return ModelCheckResult(
            model=model,
            served=False,
            detail=f"401 — {exc}",
        )
    except openai.BadRequestError as exc:
        return ModelCheckResult(
            model=model,
            served=False,
            detail=f"400 — {exc}",
        )
    except (openai.APIConnectionError, openai.APITimeoutError) as exc:
        return ModelCheckResult(
            model=model,
            served=False,
            detail=f"connection/timeout — {exc}",
        )
    except Exception as exc:  # noqa: BLE001
        return ModelCheckResult(
            model=model,
            served=False,
            detail=f"{type(exc).__name__}: {exc}",
        )

    elapsed = time.perf_counter() - t0
    content = (response.choices[0].message.content if response.choices else "") or ""
    return ModelCheckResult(
        model=model,
        served=True,
        detail=f"got: {content.strip()[:40]!r}",
        latency_s=elapsed,
    )


async def drive_one_episode(token: str, model: str, base_url: str) -> None:
    """Drive a single PhysiX episode end-to-end through the live
    server, using the chosen HF Router model. Confirms not just that
    the LLM responds, but that the full env+verifier+UI loop works."""
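    # httpx is only needed for this step, so it is imported lazily.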
    import httpx

    print()
    print(_bold(f"--- Driving one PhysiX episode through {model} ---"))

    timeout = httpx.Timeout(180.0, connect=10.0)
    async with httpx.AsyncClient(base_url=base_url, timeout=timeout) as http:
        # Sanity: the local server is up.
        try:
            await http.get("/interactive/systems")
        except httpx.HTTPError as exc:
            print(_red(f"Local PhysiX server unreachable at {base_url}: {exc}"))
            print(
                "  → Start it with `python -m physix.server.app --host 127.0.0.1 --port 8000`",
                file=sys.stderr,
            )
            return

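        # Fixed seed so repeat runs of this script request the same episode.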
        start = await http.post(
            "/interactive/sessions",
            json={"system_id": "free_fall_drag", "seed": 42, "max_turns": 4},
        )
        start.raise_for_status()
        session_id = start.json()["session_id"]
        print(f"  session_id: {session_id}")

        for turn in range(4):
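            # One llm-step = the server calls the model once and scores
            # the returned equation (the env+verifier loop).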
            step = await http.post(
                f"/interactive/sessions/{session_id}/llm-step",
                json={
                    "base_url": HF_ROUTER_BASE_URL,
                    "model": model,
                    "api_key": token,
                    "temperature": 0.4,
                    "max_tokens": 1024,
                },
            )
            if step.status_code != 200:
                print(_red(f"  turn {turn + 1}: HTTP {step.status_code}"))
                try:
                    detail = step.json().get("detail", step.text)
                except Exception:
                    detail = step.text
                print(f"    {detail}")
                break
            body = step.json()
            reward = body["observation"]["reward_breakdown"]
            print(
                f"  turn {turn + 1}: "
                f"match={reward['match']:.2f}  "
                f"format={reward['format']:.2f}  "
                f"total={reward['total']:.2f}  "
                f"({body['latency_s']:.1f}s)"
            )
            print(f"    equation: {body['action']['equation']!r}")
            if body["observation"]["done"]:
                print(_green("  done."))
                break

        await http.delete(f"/interactive/sessions/{session_id}")


def main() -> None:
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--server-url",
        default="http://127.0.0.1:8000",
        help="Local PhysiX server (default: http://127.0.0.1:8000)",
    )
    parser.add_argument(
        "--skip-episode",
        action="store_true",
        help="Skip the end-to-end episode drive; only do model probes.",
    )
    args = parser.parse_args()

    print(_bold("=== Step 1: HF_TOKEN ==="))
    token = check_token()

    print()
    print(_bold("=== Step 2: probing each suggested HF model ==="))
    print(
        "  (one tiny completion per model; non-served models will 404 quickly)"
    )
    print()

    results: list[ModelCheckResult] = []
    for model in HF_SUGGESTED_MODELS:
        print(f"  → {model:50s}", end="  ", flush=True)
        result = check_model(token, model)
        results.append(result)
        if result.served:
            print(_green(f"OK  ({result.latency_s:.1f}s)  {result.detail}"))
        else:
            print(_red("NOT SERVED"))
            print(f"     {result.detail}")

    served = [r for r in results if r.served]
    not_served = [r for r in results if not r.served]

    print()
    print(_bold("=== Summary ==="))
    print(f"  {_green(f'{len(served)} served')} / {_yellow(f'{len(not_served)} not served')} of {len(results)}")
    if not_served:
        print()
        print(_yellow("Not served:"))
        for r in not_served:
            print(f"  · {r.model}")
        print()
        print("If the trained PhysiX model is in the not-served list, you")
        print("have a few options before shipping:")
        print("  1. Open the model card → 'Deploy' → 'Inference Providers' →")
        print("     enable a provider that hosts it (Featherless / Together).")
        print("  2. Append `:fastest` to the model id in the demo's preset")
        print("     to let HF auto-pick a provider.")
        print("  3. Fall back to one of the served baselines — the comparison")
        print("     story still works.")

    if args.skip_episode or not served:
        return

    asyncio.run(drive_one_episode(token, served[0].model, args.server_url))


if __name__ == "__main__":
    main()