Rohan03 committed on
Commit 52e6e2d · verified · 1 Parent(s): eef94ae

V2 merge: purpose_agent/evalport.py

Files changed (1)
  1. purpose_agent/evalport.py +128 -0
purpose_agent/evalport.py ADDED
@@ -0,0 +1,128 @@
+"""
+evalport.py — Evaluation port protocol for pluggable scoring.
+
+Defines the interface between the framework and evaluation backends.
+EvalPort is a protocol (structural typing) — any object with the right
+methods can serve as an evaluation backend.
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any, Protocol, runtime_checkable
+
+
+@dataclass
+class ScoreBundle:
+    """A collection of named scores for one evaluation."""
+    scores: dict[str, float] = field(default_factory=dict)
+    metadata: dict[str, Any] = field(default_factory=dict)
+    passed: bool = True
+    failure_reasons: list[str] = field(default_factory=list)
+
+    def __getitem__(self, key: str) -> float:
+        return self.scores[key]
+
+    def get(self, key: str, default: float = 0.0) -> float:
+        return self.scores.get(key, default)
+
+
+@dataclass
+class EvalCase:
+    """A single evaluation test case."""
+    id: str
+    input_purpose: str
+    input_state: dict[str, Any] = field(default_factory=dict)
+    expected: dict[str, Any] = field(default_factory=dict)
+    max_steps: int = 20
+    category: str = "general"
+    difficulty: str = "medium"
+    split: str = "test"  # "train", "validation", "test"
+    tags: list[str] = field(default_factory=list)
+
+
+@runtime_checkable
+class EvalPort(Protocol):
+    """
+    Protocol for evaluation backends.
+
+    Implement this to plug in custom scoring logic.
+    The framework calls evaluate() after each task and score_bundle()
+    to aggregate results.
+    """
+
+    def evaluate(self, case: EvalCase, result_state: dict[str, Any], trajectory: Any) -> ScoreBundle:
+        """Evaluate a single completed task."""
+        ...
+
+    def score_bundle(self, bundles: list[ScoreBundle]) -> dict[str, float]:
+        """Aggregate multiple ScoreBundles into summary metrics."""
+        ...
+
+
+class DictEvalPort:
+    """
+    Simple evaluation port that checks expected keys in the final state.
+
+    Usage:
+        port = DictEvalPort()
+        case = EvalCase(id="t1", input_purpose="...",
+                        expected={"task_complete": True})
+        bundle = port.evaluate(case, result_state={"task_complete": True}, trajectory=None)
+        assert bundle.passed
+    """
+
+    def evaluate(
+        self, case: EvalCase, result_state: dict[str, Any], trajectory: Any = None,
+    ) -> ScoreBundle:
+        scores = {}
+        failures = []
+
+        if not case.expected:
+            # No expected values — score by Φ
+            if trajectory and hasattr(trajectory, "final_phi"):
+                phi = trajectory.final_phi or 0
+                scores["phi"] = phi
+                if phi < 7.0:
+                    failures.append(f"final_phi={phi:.1f} < 7.0")
+            return ScoreBundle(
+                scores=scores, passed=len(failures) == 0,
+                failure_reasons=failures,
+            )
+
+        # Check each expected key
+        matches = 0
+        for key, expected_val in case.expected.items():
+            actual = result_state.get(key)
+            is_match = actual == expected_val
+            scores[f"match_{key}"] = 1.0 if is_match else 0.0
+            if is_match:
+                matches += 1
+            else:
+                failures.append(f"{key}: expected={expected_val}, got={actual}")
+
+        scores["exact_match"] = matches / len(case.expected) if case.expected else 1.0
+        return ScoreBundle(
+            scores=scores,
+            passed=len(failures) == 0,
+            failure_reasons=failures,
+        )
+
+    def score_bundle(self, bundles: list[ScoreBundle]) -> dict[str, float]:
+        if not bundles:
+            return {}
+        n = len(bundles)
+        pass_rate = sum(1 for b in bundles if b.passed) / n
+
+        # Average all score keys
+        all_keys = set()
+        for b in bundles:
+            all_keys.update(b.scores.keys())
+
+        avgs = {}
+        for key in all_keys:
+            vals = [b.scores.get(key, 0) for b in bundles]
+            avgs[f"avg_{key}"] = sum(vals) / len(vals)
+
+        avgs["pass_rate"] = pass_rate
+        avgs["n"] = float(n)
+        return avgs
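
A minimal usage sketch of the port defined above, assuming the module is importable as purpose_agent.evalport. ThresholdEvalPort and the sample cases are hypothetical illustrations, not part of this commit; they only show that any class with matching evaluate() and score_bundle() methods satisfies the EvalPort protocol structurally.

    # Hypothetical example, not part of the commit above.
    from typing import Any

    from purpose_agent.evalport import DictEvalPort, EvalCase, EvalPort, ScoreBundle


    class ThresholdEvalPort:
        """Hypothetical backend: pass a case when one numeric state key clears a threshold."""

        def __init__(self, key: str = "score", threshold: float = 0.5) -> None:
            self.key = key
            self.threshold = threshold

        def evaluate(self, case: EvalCase, result_state: dict[str, Any], trajectory: Any) -> ScoreBundle:
            value = float(result_state.get(self.key, 0.0))
            passed = value >= self.threshold
            reasons = [] if passed else [f"{self.key}={value} < {self.threshold}"]
            return ScoreBundle(scores={self.key: value}, passed=passed, failure_reasons=reasons)

        def score_bundle(self, bundles: list[ScoreBundle]) -> dict[str, float]:
            if not bundles:
                return {}
            return {"pass_rate": sum(1 for b in bundles if b.passed) / len(bundles)}


    # Structural typing: no inheritance from EvalPort is needed.
    assert isinstance(ThresholdEvalPort(), EvalPort)

    # End-to-end run with the built-in DictEvalPort.
    port = DictEvalPort()
    cases = [
        EvalCase(id="t1", input_purpose="finish the task", expected={"task_complete": True}),
        EvalCase(id="t2", input_purpose="finish the task", expected={"task_complete": True}),
    ]
    states = [{"task_complete": True}, {"task_complete": False}]
    bundles = [port.evaluate(c, s) for c, s in zip(cases, states)]
    print(port.score_bundle(bundles))
    # e.g. {'avg_match_task_complete': 0.5, 'avg_exact_match': 0.5, 'pass_rate': 0.5, 'n': 2.0}

Because EvalPort is declared runtime_checkable, the isinstance() check above verifies the structural match without any inheritance, which is what allows third-party scoring backends to plug into the framework.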