Fix inference.py: correct log format, robust OpenAI client, always exit 0
- Use exact [START]/[STEP]/[END] format required by validator spec
- OpenAI client uses httpx.Client(trust_env=False) to avoid proxy-related
init failures in containerised environments
- [END] is always emitted via try/finally even on exception
- Rule-based fallback runs when LLM is unavailable; script always exits 0
unless the environment server itself is unreachable
- Update default API_BASE_URL to https://router.huggingface.co/v1
- Add httpx>=0.27.0 to requirements.txt
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- inference.py +183 -141
- requirements.txt +1 -0
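For reference, one short episode's stdout should look roughly like this under the new format (task name, action JSON, and all numbers are illustrative, not captured output):

    [START] task=easy env=DataSelectEnv model=meta-llama/Llama-3.1-8B-Instruct
    [STEP] step=1 action={"action_type": "select", "batch_size": 10} reward=0.12 done=false error=null
    [STEP] step=2 action={"action_type": "stop", "batch_size": 0} reward=0.00 done=true error=null
    [END] success=true steps=2 score=0.73 rewards=0.12,0.00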
inference.py
CHANGED
@@ -5,15 +5,19 @@ Connects to the environment via WebSocket (/ws) – the required transport
 on HF Spaces where HTTP /reset and /step are not accessible.
 
 Usage:
-    export HF_TOKEN=hf_...
+    export HF_TOKEN=hf_...
+    export API_BASE_URL=https://router.huggingface.co/v1
+    export MODEL_NAME=meta-llama/Llama-3.1-8B-Instruct
     export ENV_HOST=https://your-space.hf.space   # or http://localhost:7860
-    export API_BASE_URL=https://api-inference.huggingface.co/v1   # optional
-    export MODEL_NAME=meta-llama/Llama-3.1-8B-Instruct   # optional
     python inference.py [--host URL]
 
 Runs all 3 tasks sequentially using one WebSocket connection per task,
 calls POST /grader after each episode, prints scores and final summary.
-
+
+STDOUT FORMAT (required by validator):
+    [START] task=<task_name> env=DataSelectEnv model=<model_name>
+    [STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
+    [END] success=<true|false> steps=<n> score=<0.00> rewards=<r1,r2,...,rn>
 """
 
 import argparse
@@ -21,17 +25,21 @@ import asyncio
 import json
 import os
 import sys
+from typing import List, Optional
 
+import httpx
 import requests
 import websockets
+from openai import OpenAI
 
 # ---------------------------------------------------------------------------
 # Config – all overridable via environment variables
 # ---------------------------------------------------------------------------
 
 DEFAULT_HOST = os.environ.get("ENV_HOST", "http://localhost:7860")
-API_BASE_URL = os.environ.get("API_BASE_URL", "https://api-inference.huggingface.co/v1")
-MODEL_NAME = os.environ.get("MODEL_NAME", "meta-llama/Llama-3.1-8B-Instruct")
+API_BASE_URL = os.environ.get("API_BASE_URL", "https://router.huggingface.co/v1")
+MODEL_NAME = os.environ.get("MODEL_NAME", "meta-llama/Llama-3.1-8B-Instruct")
+BENCHMARK = "DataSelectEnv"
 SEED = 42
 TASKS = ["easy", "medium", "hard"]
 
@@ -71,21 +79,46 @@ Strategy rules:
 
 
 # ---------------------------------------------------------------------------
-# …
+# Structured log helpers (validator-required format)
+# ---------------------------------------------------------------------------
+
+def log_start(task: str, model: str) -> None:
+    print(f"[START] task={task} env={BENCHMARK} model={model}", flush=True)
+
+
+def log_step(step: int, action: dict, reward: float, done: bool,
+             error: Optional[str] = None) -> None:
+    error_val = error if error else "null"
+    done_val = str(done).lower()
+    print(
+        f"[STEP] step={step} action={json.dumps(action)} "
+        f"reward={reward:.2f} done={done_val} error={error_val}",
+        flush=True,
+    )
+
+
+def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
+    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
+    print(
+        f"[END] success={str(success).lower()} steps={steps} "
+        f"score={score:.2f} rewards={rewards_str}",
+        flush=True,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Rule-based fallback (used when LLM call fails)
 # ---------------------------------------------------------------------------
 
 def rule_based_action(obs: dict) -> dict:
-    """
-    …noise…
-    …diversity…
-    …budget…
-    …perf…
-
-
-    # Batch size: shrink near budget exhaustion
+    """Adaptive rule-based action derived from observation."""
+    noise = obs.get("noise_estimate", 0.1)
+    diversity = obs.get("diversity_score", 1.0)
+    budget = obs.get("remaining_budget", 100)
+    perf = obs.get("current_performance", 0.5)
+
     batch_size = 5 if budget < 30 else 10
 
-    # Weights: penalize uncertainty when noise is high
     if noise > 0.4:
         u, d, r = 0.05, 0.80, 0.15
     elif noise > 0.2:
@@ -95,8 +128,7 @@ def rule_based_action(obs: dict) -> dict:
     else:
         u, d, r = 0.40, 0.40, 0.20
 
-
-    if perf > 0.65 and budget < 20 and available > 0:
+    if perf > 0.65 and budget < 20:
        return {"action_type": "stop", "batch_size": 0,
                "strategy_weights": {"uncertainty": u, "diversity": d, "random": r}}
 
@@ -108,49 +140,48 @@ def rule_based_action(obs: dict) -> dict:
 
 
 # ---------------------------------------------------------------------------
-# …
+# OpenAI client factory – robust against proxy/env issues in containers
 # ---------------------------------------------------------------------------
 
-def query_llm(api_key: str, obs: dict) -> dict:
+def make_openai_client(api_key: str) -> OpenAI:
     """
-    …
-    …
-    …
+    Create the required OpenAI client.
+    Uses an explicit httpx.Client with trust_env=False to bypass proxy
+    auto-detection that commonly breaks SDK init in containerised environments.
     """
-    base_url = (API_BASE_URL or "https://api.openai.com/v1").rstrip("/")
-    url = f"{base_url}/chat/completions"
-
+    base_url = (API_BASE_URL or "https://router.huggingface.co/v1").strip().rstrip("/")
+    http_client = httpx.Client(trust_env=False)
+    try:
+        return OpenAI(api_key=api_key, base_url=base_url, http_client=http_client)
+    except Exception:
+        return OpenAI(api_key=api_key, http_client=http_client)
+
+
+# ---------------------------------------------------------------------------
+# LLM helper – uses the required OpenAI client
+# ---------------------------------------------------------------------------
+
+def query_llm(client: OpenAI, obs: dict) -> dict:
+    """Ask the LLM to produce an action given the current observation."""
     user_msg = (
         f"Current observation:\n{json.dumps(obs, indent=2)}\n\n"
         "What action do you take?"
     )
-    payload = {
-        "model": MODEL_NAME,
-        "messages": [
+    response = client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_msg},
        ],
-        …
-    }
-    headers = {
-        "Authorization": f"Bearer {api_key}",
-        "Content-Type": "application/json",
-    }
-
-    resp = requests.post(url, json=payload, headers=headers, timeout=30)
-    resp.raise_for_status()
-    raw = resp.json()["choices"][0]["message"]["content"].strip()
-
+        temperature=0.0,
+        max_tokens=200,
+    )
+    raw = response.choices[0].message.content.strip()
    # Strip markdown fences if model wraps JSON
    if raw.startswith("```"):
        raw = raw.split("```")[1]
        if raw.startswith("json"):
            raw = raw[4:]
-
    action = json.loads(raw.strip())
    assert "action_type" in action
    assert "batch_size" in action
@@ -175,98 +206,103 @@ def ws_url(host: str) -> str:
     return base + "/ws"
 
 
-async def run_task_ws(host: str, api_key: str | None, task_id: str) -> dict:
-    """Run one full episode for task_id over …"""
-    print(f"\n{'='*52}")
-    print(f"  Task: {task_id.upper()}")
-    print(f"{'='*52}")
-
+async def run_task_ws(host: str, client: Optional[OpenAI], task_id: str) -> dict:
+    """Run one full episode for task_id over WebSocket. Returns grader result."""
     url = ws_url(host)
 
-    print(f"  Connecting to {url} ...")
-    async with websockets.connect(url) as ws:
-        # ── reset ────────────────────────────────────────────────────
-        await ws.send(json.dumps({
-            "type": "reset",
-            "data": {"task_id": task_id, "seed": SEED},
-        }))
-        resp = json.loads(await ws.recv())
-        if resp["type"] == "error":
-            raise RuntimeError(f"reset error: {resp['data']['message']}")
-
-        episode_id = resp["data"]["episode_id"]
-        obs = resp["data"]["observation"]
-        print(f"  Episode ID: {episode_id}")
-        print(f"  Initial obs: {obs}")
-
-        step = 0
-        total_reward = 0.0
-        done = False
-
-        # ── step loop ────────────────────────────────────────────────
-        while not done:
-            step += 1
-
-            # Try LLM; fall back to rule-based on any failure
-            try:
-                action = query_llm(api_key, obs)
-            except Exception as e:
-                print(f"  Step {step}: LLM unavailable ({type(e).__name__}), using rule-based")
-                action = rule_based_action(obs)
-
-            await ws.send(json.dumps({"type": "step", "data": action}))
-            resp = json.loads(await ws.recv())
-            if resp["type"] == "error":
-                …
-            …
-    print(f"  Details: {grade['breakdown']}")
+    rewards: List[float] = []
+    steps_taken = 0
+    score = 0.0
+    success = False
+    obs = {}
+    episode_id = "unknown"
+
+    log_start(task=task_id, model=MODEL_NAME)
+
+    try:
+        async with websockets.connect(url, open_timeout=30, ping_interval=20) as ws:
+
+            # ── reset ────────────────────────────────────────────────
+            await ws.send(json.dumps({
+                "type": "reset",
+                "data": {"task_id": task_id, "seed": SEED},
+            }))
+            resp = json.loads(await ws.recv())
+            if resp["type"] == "error":
+                raise RuntimeError(f"reset error: {resp['data']['message']}")
+
+            episode_id = resp["data"]["episode_id"]
+            obs = resp["data"]["observation"]
+            done = False
+
+            # ── step loop ────────────────────────────────────────────
+            while not done:
+                step_num = len(rewards) + 1
+                last_error: Optional[str] = None
+
+                # Try LLM; fall back to rule-based on any failure
+                try:
+                    if client is None:
+                        raise ValueError("no LLM client")
+                    action = query_llm(client, obs)
+                except Exception as e:
+                    last_error = f"{type(e).__name__}: {e}"
+                    action = rule_based_action(obs)
+
+                await ws.send(json.dumps({"type": "step", "data": action}))
+                resp = json.loads(await ws.recv())
+
+                if resp["type"] == "error":
+                    err_msg = resp["data"]["message"]
+                    log_step(step_num, action, 0.0, True, error=err_msg)
+                    rewards.append(0.0)
+                    steps_taken = step_num
+                    break
+
+                data = resp["data"]
+                obs = data["observation"]
+                raw_reward = data["reward"]
+                reward = raw_reward["value"] if isinstance(raw_reward, dict) else float(raw_reward)
+                done = data["done"]
+
+                rewards.append(reward)
+                steps_taken = step_num
+
+                log_step(step_num, action, reward, done, error=last_error)
+
+            # ── close WebSocket cleanly ──────────────────────────────
+            await ws.send(json.dumps({"type": "close", "data": {}}))
+            try:
+                await asyncio.wait_for(ws.recv(), timeout=2.0)
+            except (asyncio.TimeoutError, websockets.exceptions.ConnectionClosed):
+                pass
+
+        # ── grade via HTTP ────────────────────────────────────────────
+        r = requests.post(
+            f"{http_base(host)}/grader",
+            json={"episode_id": episode_id, "task_id": task_id},
+            timeout=15,
+        )
+        r.raise_for_status()
+        grade = r.json()
+        score = float(grade["score"])
+        success = bool(grade["passed"])
+
+    except Exception as exc:
+        print(f"[DEBUG] Episode error for {task_id}: {exc}", flush=True)
+        score = 0.0
+        success = False
+
+    finally:
+        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
 
     return {
         "task_id": task_id,
-        "score": …,
-        "passed": …,
-        …
-        …
-        …
-        "final_performance": obs["current_performance"],
+        "score": score,
+        "passed": success,
+        "steps": steps_taken,
+        "total_reward": round(sum(rewards), 4),
+        "final_performance": obs.get("current_performance", 0.0),
     }
 
 
@@ -274,25 +310,26 @@ async def run_task_ws(host: str, api_key: str | None, task_id: str) -> dict:
 # Main
 # ---------------------------------------------------------------------------
 
-async def amain(host: str, api_key: str | None) -> None:
+async def amain(host: str, client: Optional[OpenAI]) -> None:
     results = {}
     for task_id in TASKS:
-        results[task_id] = await run_task_ws(host, api_key, task_id)
+        results[task_id] = await run_task_ws(host, client, task_id)
 
-    print(f"\n{'='*52}")
-    print(" INFERENCE RESULTS SUMMARY")
-    print(f"{'='*52}")
-    print(f"{'Task':<10} {'Score':<8} {'Passed':<8} {'Final Perf':<12} {'Steps'}")
-    print("-" * 52)
+    print(f"\n{'='*52}", flush=True)
+    print(" INFERENCE RESULTS SUMMARY", flush=True)
+    print(f"{'='*52}", flush=True)
+    print(f"{'Task':<10} {'Score':<8} {'Passed':<8} {'Final Perf':<12} {'Steps'}", flush=True)
+    print("-" * 52, flush=True)
     for task_id, r in results.items():
         print(
             f"{task_id:<10} {r['score']:<8.4f} {str(r['passed']):<8} "
-            f"{r['final_performance']:<12.4f} {r['steps']}"
+            f"{r['final_performance']:<12.4f} {r['steps']}",
+            flush=True,
        )
 
     overall = sum(r["score"] for r in results.values()) / len(results)
-    print(f"\nOverall mean score: {overall:.4f}")
-    print(json.dumps({"results": results, "mean_score": round(overall, 4)}, indent=2))
+    print(f"\nOverall mean score: {overall:.4f}", flush=True)
+    print(json.dumps({"results": results, "mean_score": round(overall, 4)}, indent=2), flush=True)
 
 
 def main() -> None:
@@ -301,23 +338,28 @@ def main() -> None:
                         help="Environment server base URL (http or https)")
     args = parser.parse_args()
 
-    # …
+    # Build OpenAI client (required by spec); warn and fall back if unavailable
     api_key = os.getenv("HF_TOKEN") or os.getenv("OPENAI_API_KEY")
-    if not api_key:
-        …
+    client: Optional[OpenAI] = None
+    if not api_key:
+        print("WARNING: No HF_TOKEN / OPENAI_API_KEY found – using rule-based fallback.", flush=True)
     else:
-        …
+        try:
+            client = make_openai_client(api_key)
+            print(f"OpenAI client ready | base_url={API_BASE_URL} | model={MODEL_NAME}", flush=True)
+        except Exception as e:
+            print(f"WARNING: Could not init OpenAI client ({e}); using rule-based fallback.", flush=True)
 
     # Health check – environment must be reachable
     try:
         r = requests.get(f"{http_base(args.host)}/health", timeout=15)
         r.raise_for_status()
-        print(f"…")
+        print(f"Environment health: {r.json()}", flush=True)
     except Exception as e:
-        print(f"ERROR: Could not reach environment at {args.host}: {e}")
+        print(f"ERROR: Could not reach environment at {args.host}: {e}", flush=True)
         sys.exit(1)
 
-    asyncio.run(amain(args.host, api_key))
+    asyncio.run(amain(args.host, client))
 
 
 if __name__ == "__main__":
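To spot-check the new log lines locally, a record emitted by log_step() can be parsed back with a small regex. The regex and sample below are only a sketch of the contract implied by the f-strings above, not the validator's actual parser:

    import re

    # Mirrors the f-string in log_step(); the validator's real parser may differ.
    STEP_RE = re.compile(
        r"^\[STEP\] step=(\d+) action=(\{.*\}) "
        r"reward=(-?\d+\.\d{2}) done=(true|false) error=(.*)$"
    )

    sample = ('[STEP] step=1 action={"action_type": "stop", "batch_size": 0} '
              'reward=0.00 done=true error=null')
    m = STEP_RE.match(sample)
    assert m is not None and m.group(4) == "true" and m.group(5) == "null"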
requirements.txt
CHANGED
@@ -4,5 +4,6 @@ pydantic==2.7.4
 numpy==1.26.4
 scikit-learn==1.5.1
 openai==1.40.0
+httpx>=0.27.0
 requests==2.32.3
 websockets>=12.0
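As a minimal repro of what trust_env=False buys, the snippet below constructs the client under a deliberately poisoned proxy environment and runs without any network access; the proxy URL and dummy key are made up for the demo:

    import os

    import httpx
    from openai import OpenAI

    # Simulate the kind of proxy configuration a container platform may inject.
    os.environ["HTTPS_PROXY"] = "http://bad-proxy.invalid:1"

    # trust_env=False makes httpx ignore HTTP(S)_PROXY entirely, so client
    # construction never depends on the ambient environment.
    client = OpenAI(
        api_key="sk-dummy",                     # placeholder; no request is made
        base_url="https://router.huggingface.co/v1",
        http_client=httpx.Client(trust_env=False),
    )
    print(client.base_url)                      # the router endpoint, proxy ignored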