from __future__ import annotations

import argparse

from osint_env import cli
from osint_env.domain.models import EnvironmentConfig


class _DummyParser:
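    """Parser stub whose parse_args() returns a pre-built namespace instead of reading sys.argv."""
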
    def __init__(self, namespace: argparse.Namespace):
        self._namespace = namespace

    def parse_args(self) -> argparse.Namespace:
        return self._namespace


class _DummyEnv:
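    """Environment stub that only records the config and LLM client it was constructed with."""
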
    def __init__(self, config: EnvironmentConfig, llm=None):
        self.config = config
        self.llm = llm


def test_eval_exports_dashboard_and_evaluation(monkeypatch, tmp_path, capsys):
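    """The eval command should save the evaluation payload, export the dashboard, and print the summary."""
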
    dashboard_path = tmp_path / "eval_dashboard.html"
    eval_path = tmp_path / "latest_evaluation.json"

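    # Simulated `eval` invocation; empty path arguments mean the runtime/default paths are used.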
    args = argparse.Namespace(
        cmd="eval",
        episodes=1,
        leaderboard="",
        dashboard="",
        dashboard_dir="",
        evaluation="",
    )

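    # Runtime settings normally produced by _resolve_environment_config, pointed at tmp_path.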
    runtime = {
        "default_episodes": 20,
        "leaderboard_path": str(tmp_path / "leaderboard.json"),
        "dashboard_path": str(dashboard_path),
        "sweep_dashboard_dir": str(tmp_path / "sweep"),
    }

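    # Canned result returned by the patched run_evaluation.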
    evaluation_payload = {
        "summary": {
            "avg_reward": 0.5,
            "avg_graph_f1": 0.4,
            "task_success_rate": 1.0,
            "tool_efficiency": 0.7,
            "avg_steps_to_solution": 3.0,
            "deanonymization_accuracy": 1.0,
            "leaderboard_score": 0.8,
        },
        "episodes": [
            {
                "task_id": "metaqa_1-hop_train_0",
                "task_type": "metaqa_1-hop",
                "question": "who directed [inception]?",
                "task_answer": "christopher nolan",
                "agent_answer": "christopher nolan",
                "graph_f1": 1.0,
                "reward": 1.0,
                "steps": 2,
                "tool_calls": 1,
                "success": 1,
            }
        ],
    }

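    # Records the arguments passed to the patched save/export hooks.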
    calls: dict[str, object] = {}

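    # Replace the CLI's collaborators so main() runs without a real parser, LLM, or environment.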
    monkeypatch.setattr(cli, "build_parser", lambda: _DummyParser(args))
    monkeypatch.setattr(cli, "_resolve_environment_config", lambda _args: (EnvironmentConfig(), runtime))
    monkeypatch.setattr(cli, "build_llm_client", lambda _cfg: object())
    monkeypatch.setattr(cli, "OSINTEnvironment", _DummyEnv)
    monkeypatch.setattr(cli, "run_evaluation", lambda env, episodes, return_details, llm: evaluation_payload)

    def _save(path: str, payload: dict) -> None:
        calls["save_path"] = path
        calls["save_payload"] = payload

    def _export(env, evaluation, leaderboard_records, output_path):
        calls["export_output_path"] = output_path
        calls["export_eval"] = evaluation
        calls["export_leaderboard"] = leaderboard_records
        return output_path

    monkeypatch.setattr(cli, "_save_evaluation", _save)
    monkeypatch.setattr(cli, "load_leaderboard", lambda _path: [])
    monkeypatch.setattr(cli, "export_dashboard", _export)

    monkeypatch.setattr(cli, "DEFAULT_EVALUATION_PATH", str(eval_path))

    cli.main()

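    # The evaluation goes to the default path; the dashboard goes to the runtime-configured path.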
    assert calls["save_path"] == str(eval_path)
    assert calls["save_payload"] == evaluation_payload
    assert calls["export_output_path"] == str(dashboard_path)
    assert calls["export_eval"] == evaluation_payload
    assert calls["export_leaderboard"] == []

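    # stdout should contain the summary but not the per-episode details.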
    output = capsys.readouterr().out
    assert '"avg_reward": 0.5' in output
    assert '"episodes"' not in output