Spaces:
Runtime error
Runtime error
| """Smoke tests for the strict inference output contract.""" | |
| from __future__ import annotations | |
| from dataclasses import dataclass, field | |
| from app.env.runner import InferenceRunner | |
| from app.models.inference import AgentDecision, InferenceConfig | |
| class _FakeObservation: | |
| task_id: str | |
| attempts_remaining: int | |
| score: float | |
| done: bool | |
| history: list[object] = field(default_factory=list) | |
| current_code: str = "print('broken')" | |
| last_action_error: str | None = None | |
| class _FakeEnv: | |
| def __init__(self) -> None: | |
| self._step = 0 | |
| def reset(self, *, task_id: str) -> _FakeObservation: | |
| return _FakeObservation(task_id=task_id, attempts_remaining=4, score=0.2, done=False) | |
| def step_result(self, action: object) -> tuple[_FakeObservation, float, bool, dict[str, object]]: | |
| self._step += 1 | |
| if self._step == 1: | |
| return ( | |
| _FakeObservation("demo_task", 3, 0.45, False, current_code="candidate"), | |
| 0.45, | |
| False, | |
| {"last_action_error": None}, | |
| ) | |
| if self._step == 2: | |
| return ( | |
| _FakeObservation("demo_task", 2, 0.97, True, current_code="reference"), | |
| 0.97, | |
| True, | |
| {"last_action_error": None}, | |
| ) | |
| raise AssertionError("runner stepped too many times") | |
class _FakeAgent:
    """Scripted agent double: ``run_tests`` on the first call, then ``submit_solution``."""

    def __init__(self) -> None:
        # Number of ``act`` calls made so far.
        self._step = 0

    def act(self, observation: object) -> AgentDecision:
        """Return the next scripted decision; ``observation`` is not inspected."""
        self._step += 1
        chosen_action = "run_tests" if self._step == 1 else "submit_solution"
        return AgentDecision(action_type=chosen_action)
def test_inference_runner_emits_strict_lines(capsys) -> None:
    """Check that ``run_task`` prints exactly the START / STEP / STEP / END lines."""
    runner = InferenceRunner(InferenceConfig.from_env())
    # Swap in the scripted doubles so the run is deterministic.
    runner.agent = _FakeAgent()
    runner._create_env = lambda: _FakeEnv()  # type: ignore[method-assign]

    runner.run_task("demo_task")

    stdout_text = capsys.readouterr().out
    emitted_lines = stdout_text.strip().splitlines()
    expected_lines = [
        f"[START] task=demo_task env={runner.config.benchmark_name} model={runner.config.model_name}",
        "[STEP] step=1 action=run_tests reward=0.45 done=false error=null",
        "[STEP] step=2 action=submit_solution reward=0.97 done=true error=null",
        "[END] success=true steps=2 rewards=0.45,0.97",
    ]
    assert emitted_lines == expected_lines