Spaces:
Sleeping
Sleeping
Commit ·
11aa990
1
Parent(s): 2c28868
fix: add run_episode wrapper, use .2f score format, update test for strict bounds
Browse files
- inference.py +9 -1
- tests/test_inference.py +2 -2
inference.py
CHANGED
|
@@ -77,7 +77,7 @@ def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> No
|
|
| 77 |
rewards_str = ",".join(f"{r:.2f}" for r in rewards)
|
| 78 |
print(
|
| 79 |
f"[END] success={str(success).lower()} steps={steps} "
|
| 80 |
-
f"score={score:.
|
| 81 |
flush=True,
|
| 82 |
)
|
| 83 |
|
|
@@ -223,6 +223,14 @@ async def run_episode_async(
|
|
| 223 |
# ---------------------------------------------------------------------------
|
| 224 |
|
| 225 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
def run_episode_sync(
|
| 227 |
env: Any, llm_client: Any, model_name: str, task_id: str
|
| 228 |
) -> None:
|
|
|
|
| 77 |
rewards_str = ",".join(f"{r:.2f}" for r in rewards)
|
| 78 |
print(
|
| 79 |
f"[END] success={str(success).lower()} steps={steps} "
|
| 80 |
+
f"score={score:.2f} rewards={rewards_str}",
|
| 81 |
flush=True,
|
| 82 |
)
|
| 83 |
|
|
|
|
| 223 |
# ---------------------------------------------------------------------------
|
| 224 |
|
| 225 |
|
| 226 |
+
def run_episode(env: Any, llm_client: Any, model_name: str, task_id: str) -> None:
|
| 227 |
+
"""Public episode runner expected by tests.
|
| 228 |
+
|
| 229 |
+
Uses the synchronous env interface: reset(task_id=...), step(...), state().
|
| 230 |
+
"""
|
| 231 |
+
return run_episode_sync(env, llm_client, model_name, task_id)
|
| 232 |
+
|
| 233 |
+
|
| 234 |
def run_episode_sync(
|
| 235 |
env: Any, llm_client: Any, model_name: str, task_id: str
|
| 236 |
) -> None:
|
tests/test_inference.py
CHANGED
|
@@ -55,7 +55,7 @@ def test_run_episode_emits_start_step_end_logs(capsys) -> None:
|
|
| 55 |
total_reward=0.2,
|
| 56 |
done=True,
|
| 57 |
approved=True,
|
| 58 |
-
final_score=
|
| 59 |
)
|
| 60 |
|
| 61 |
class DummyCompletions:
|
|
@@ -80,5 +80,5 @@ def test_run_episode_emits_start_step_end_logs(capsys) -> None:
|
|
| 80 |
assert "[STEP]" in captured
|
| 81 |
assert "[END]" in captured
|
| 82 |
assert "success=true" in captured
|
| 83 |
-
assert "score=
|
| 84 |
|
|
|
|
| 55 |
total_reward=0.2,
|
| 56 |
done=True,
|
| 57 |
approved=True,
|
| 58 |
+
final_score=0.99,
|
| 59 |
)
|
| 60 |
|
| 61 |
class DummyCompletions:
|
|
|
|
| 80 |
assert "[STEP]" in captured
|
| 81 |
assert "[END]" in captured
|
| 82 |
assert "success=true" in captured
|
| 83 |
+
assert "score=0.99" in captured
|
| 84 |
|