# tests/test_cli_eval_outputs.py
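"""Tests for the ``eval`` CLI command's output wiring.

Runs ``cli.main()`` with every collaborator monkeypatched and checks that
the evaluation payload is saved, the dashboard is exported, and only the
summary (not per-episode details) reaches stdout.
"""
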
from __future__ import annotations
import argparse
from osint_env import cli
from osint_env.domain.models import EnvironmentConfig
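

# Minimal stand-ins so cli.main() can run without real argument parsing
# or a real environment.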
class _DummyParser:
def __init__(self, namespace: argparse.Namespace):
self._namespace = namespace
def parse_args(self) -> argparse.Namespace:
return self._namespace
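

# Captures the config and LLM client that cli.main() passes through.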
class _DummyEnv:
def __init__(self, config: EnvironmentConfig, llm=None):
self.config = config
self.llm = llm


def test_eval_exports_dashboard_and_evaluation(monkeypatch, tmp_path, capsys):
dashboard_path = tmp_path / "eval_dashboard.html"
eval_path = tmp_path / "latest_evaluation.json"
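
    # Empty-string options exercise the fallbacks: the runtime config is
    # expected to supply the dashboard path and DEFAULT_EVALUATION_PATH
    # the save path.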
args = argparse.Namespace(
cmd="eval",
episodes=1,
leaderboard="",
dashboard="",
dashboard_dir="",
evaluation="",
)
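
    # Runtime settings that _resolve_environment_config would normally load.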
runtime = {
"default_episodes": 20,
"leaderboard_path": str(tmp_path / "leaderboard.json"),
"dashboard_path": str(dashboard_path),
"sweep_dashboard_dir": str(tmp_path / "sweep"),
}
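
    # Canned result returned by the patched run_evaluation.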
evaluation_payload = {
"summary": {
"avg_reward": 0.5,
"avg_graph_f1": 0.4,
"task_success_rate": 1.0,
"tool_efficiency": 0.7,
"avg_steps_to_solution": 3.0,
"deanonymization_accuracy": 1.0,
"leaderboard_score": 0.8,
},
"episodes": [
{
"task_id": "metaqa_1-hop_train_0",
"task_type": "metaqa_1-hop",
"question": "who directed [inception]?",
"task_answer": "christopher nolan",
"agent_answer": "christopher nolan",
"graph_f1": 1.0,
"reward": 1.0,
"steps": 2,
"tool_calls": 1,
"success": 1,
}
],
}
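
    # Records what the patched save/export hooks receive.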
calls: dict[str, object] = {}
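
    # Patch every collaborator cli.main() touches so the test stays in-memory.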
monkeypatch.setattr(cli, "build_parser", lambda: _DummyParser(args))
monkeypatch.setattr(cli, "_resolve_environment_config", lambda _args: (EnvironmentConfig(), runtime))
monkeypatch.setattr(cli, "build_llm_client", lambda _cfg: object())
monkeypatch.setattr(cli, "OSINTEnvironment", _DummyEnv)
monkeypatch.setattr(cli, "run_evaluation", lambda env, episodes, return_details, llm: evaluation_payload)
def _save(path: str, payload: dict) -> None:
calls["save_path"] = path
calls["save_payload"] = payload
def _export(env, evaluation, leaderboard_records, output_path):
calls["export_output_path"] = output_path
calls["export_eval"] = evaluation
calls["export_leaderboard"] = leaderboard_records
return output_path
monkeypatch.setattr(cli, "_save_evaluation", _save)
monkeypatch.setattr(cli, "load_leaderboard", lambda _path: [])
monkeypatch.setattr(cli, "export_dashboard", _export)
monkeypatch.setattr(cli, "DEFAULT_EVALUATION_PATH", str(eval_path))
cli.main()
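
    # The payload is saved to DEFAULT_EVALUATION_PATH and exported to the
    # runtime dashboard path with the (empty) leaderboard records.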
assert calls["save_path"] == str(eval_path)
assert calls["save_payload"] == evaluation_payload
assert calls["export_output_path"] == str(dashboard_path)
assert calls["export_eval"] == evaluation_payload
assert calls["export_leaderboard"] == []
output = capsys.readouterr().out
assert '"avg_reward": 0.5' in output
assert '"episodes"' not in output