Spaces:
Running
Running
File size: 1,129 Bytes
3eae4cc | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 | #!/usr/bin/env python3
"""
Minimal smoke test for all benchmark tasks.
Runs one deterministic baseline episode per task and checks score bounds.
"""
from __future__ import annotations
import json
from pathlib import Path
from app.baselines import run_policy_episode
from app.tasks import list_tasks
def main() -> int:
results: list[dict] = []
for task_id in list_tasks():
result = run_policy_episode(task_id=task_id, policy_name="backlog_clearance")
score = float(result["score"])
if not (0.0 <= score <= 1.0):
print(f"[FAIL] {task_id}: score out of range {score}")
return 1
results.append(result)
print(
f"[OK] task={task_id} score={score:.4f} "
f"steps={result['steps']} completed={result['completed']}"
)
out_dir = Path("results")
out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir / "smoke_test_results.json"
out_path.write_text(json.dumps(results, indent=2), encoding="utf-8")
print(f"[DONE] wrote {out_path}")
return 0
if __name__ == "__main__":
raise SystemExit(main())
|