"""
evalport.py — Evaluation port protocol for pluggable scoring.

Defines the interface between the framework and evaluation backends.
EvalPort is a protocol (structural typing) — any object with the right
methods can serve as an evaluation backend.
"""
from __future__ import annotations

from dataclasses import dataclass, field
from typing import Any, Protocol, runtime_checkable


@dataclass
class ScoreBundle:
    """A collection of named scores for one evaluation."""
    scores: dict[str, float] = field(default_factory=dict)
    metadata: dict[str, Any] = field(default_factory=dict)
    passed: bool = True
    failure_reasons: list[str] = field(default_factory=list)

    def __getitem__(self, key: str) -> float:
        return self.scores[key]

    def get(self, key: str, default: float = 0.0) -> float:
        return self.scores.get(key, default)


@dataclass
class EvalCase:
    """A single evaluation test case."""
    id: str
    input_purpose: str
    input_state: dict[str, Any] = field(default_factory=dict)
    expected: dict[str, Any] = field(default_factory=dict)
    max_steps: int = 20
    category: str = "general"
    difficulty: str = "medium"
    split: str = "test"              # "train", "validation", "test"
    tags: list[str] = field(default_factory=list)


@runtime_checkable
class EvalPort(Protocol):
    """
    Protocol for evaluation backends.

    Implement this to plug in custom scoring logic.
    The framework calls evaluate() after each task and score_bundle()
    to aggregate results.
    """

    def evaluate(self, case: EvalCase, result_state: dict[str, Any], trajectory: Any) -> ScoreBundle:
        """Evaluate a single completed task."""
        ...

    def score_bundle(self, bundles: list[ScoreBundle]) -> dict[str, float]:
        """Aggregate multiple ScoreBundles into summary metrics."""
        ...


class DictEvalPort:
    """
    Simple evaluation port that checks expected keys in the final state.

    Usage:
        port = DictEvalPort()
        case = EvalCase(id="t1", input_purpose="...",
                        expected={"task_complete": True})
        bundle = port.evaluate(case, result_state={"task_complete": True}, trajectory=None)
        assert bundle.passed
    """

    def evaluate(
        self, case: EvalCase, result_state: dict[str, Any], trajectory: Any = None,
    ) -> ScoreBundle:
        scores: dict[str, float] = {}
        failures: list[str] = []

        if not case.expected:
            # No expected values: fall back to the trajectory's final Φ,
            # treating anything below 7.0 as a failure.
            if trajectory and hasattr(trajectory, "final_phi"):
                phi = trajectory.final_phi or 0.0
                scores["phi"] = phi
                if phi < 7.0:
                    failures.append(f"final_phi={phi:.1f} < 7.0")
            return ScoreBundle(
                scores=scores, passed=len(failures) == 0,
                failure_reasons=failures,
            )

        # Check each expected key
        matches = 0
        for key, expected_val in case.expected.items():
            actual = result_state.get(key)
            is_match = actual == expected_val
            scores[f"match_{key}"] = 1.0 if is_match else 0.0
            if is_match:
                matches += 1
            else:
                failures.append(f"{key}: expected={expected_val}, got={actual}")

        scores["exact_match"] = matches / len(case.expected) if case.expected else 1.0
        return ScoreBundle(
            scores=scores,
            passed=len(failures) == 0,
            failure_reasons=failures,
        )

    def score_bundle(self, bundles: list[ScoreBundle]) -> dict[str, float]:
        if not bundles:
            return {}
        n = len(bundles)
        pass_rate = sum(1 for b in bundles if b.passed) / n

        # Average every score key across bundles; a bundle missing a key counts as 0.0
        all_keys: set[str] = set()
        for b in bundles:
            all_keys.update(b.scores.keys())

        avgs: dict[str, float] = {}
        for key in all_keys:
            vals = [b.scores.get(key, 0.0) for b in bundles]
            avgs[f"avg_{key}"] = sum(vals) / len(vals)

        avgs["pass_rate"] = pass_rate
        avgs["n"] = float(n)
        return avgs
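

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the framework API).
# It relies solely on the classes defined above: it builds two EvalCases with
# hypothetical ids and final states, scores them with DictEvalPort, aggregates
# the bundles, and shows that the runtime_checkable protocol recognizes
# DictEvalPort structurally even though it never subclasses EvalPort.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    port = DictEvalPort()

    cases = [
        EvalCase(id="t1", input_purpose="finish the task",
                 expected={"task_complete": True}),
        EvalCase(id="t2", input_purpose="produce a summary",
                 expected={"task_complete": True, "summary_written": True}),
    ]
    # Hypothetical final states, as if returned by the framework after each run.
    states = [
        {"task_complete": True},
        {"task_complete": True, "summary_written": False},
    ]

    bundles = [port.evaluate(c, s, trajectory=None) for c, s in zip(cases, states)]
    summary = port.score_bundle(bundles)

    # Structural typing: DictEvalPort satisfies the protocol by shape alone.
    assert isinstance(port, EvalPort)

    for case, bundle in zip(cases, bundles):
        print(case.id, "passed" if bundle.passed else bundle.failure_reasons)
    print(summary)  # pass_rate, n, avg_exact_match, avg_match_* keys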