"""
evalport.py — Evaluation port protocol for pluggable scoring.

Defines the interface between the framework and evaluation backends.
EvalPort is a protocol (structural typing) — any object with the right
methods can serve as an evaluation backend.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Any, Protocol, runtime_checkable


@dataclass
class ScoreBundle:
    """A collection of named scores for one evaluation."""

    scores: dict[str, float] = field(default_factory=dict)
    metadata: dict[str, Any] = field(default_factory=dict)
    passed: bool = True
    failure_reasons: list[str] = field(default_factory=list)

    def __getitem__(self, key: str) -> float:
        return self.scores[key]

    def get(self, key: str, default: float = 0.0) -> float:
        return self.scores.get(key, default)
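
# Illustrative usage (invented values; not part of the module API):
#   bundle = ScoreBundle(scores={"accuracy": 0.9})
#   bundle["accuracy"]     # 0.9; raises KeyError for unknown keys
#   bundle.get("latency")  # 0.0, the default for missing scores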


@dataclass
class EvalCase:
    """A single evaluation test case."""

    id: str
    input_purpose: str
    input_state: dict[str, Any] = field(default_factory=dict)
    expected: dict[str, Any] = field(default_factory=dict)
    max_steps: int = 20
    category: str = "general"
    difficulty: str = "medium"
    split: str = "test"  # one of "train", "validation", "test"
    tags: list[str] = field(default_factory=list)
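
# Illustrative case (field names are real; values are invented):
#   case = EvalCase(
#       id="t1",
#       input_purpose="Summarize the document",
#       expected={"task_complete": True},
#       split="validation",
#       tags=["smoke"],
#   )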


@runtime_checkable
class EvalPort(Protocol):
    """
    Protocol for evaluation backends.

    Implement this to plug in custom scoring logic. The framework calls
    evaluate() after each task and score_bundle() to aggregate results.
    """

    def evaluate(
        self, case: EvalCase, result_state: dict[str, Any], trajectory: Any,
    ) -> ScoreBundle:
        """Evaluate a single completed task."""
        ...

    def score_bundle(self, bundles: list[ScoreBundle]) -> dict[str, float]:
        """Aggregate multiple ScoreBundles into summary metrics."""
        ...
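

# Illustrative sketch, not part of the framework: because EvalPort is a
# structural Protocol, any class with matching methods conforms without
# subclassing. PassRateEvalPort is a hypothetical minimal backend.
class PassRateEvalPort:
    """Toy backend: a case passes iff the final state is non-empty."""

    def evaluate(
        self, case: EvalCase, result_state: dict[str, Any], trajectory: Any = None,
    ) -> ScoreBundle:
        ok = bool(result_state)
        return ScoreBundle(scores={"non_empty": float(ok)}, passed=ok)

    def score_bundle(self, bundles: list[ScoreBundle]) -> dict[str, float]:
        if not bundles:
            return {}
        return {"pass_rate": sum(1 for b in bundles if b.passed) / len(bundles)}


# isinstance(PassRateEvalPort(), EvalPort) holds, thanks to @runtime_checkable.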


class DictEvalPort:
    """
    Simple evaluation port that checks expected keys in the final state.

    Usage:
        port = DictEvalPort()
        case = EvalCase(id="t1", input_purpose="...",
                        expected={"task_complete": True})
        bundle = port.evaluate(case, result_state={"task_complete": True},
                               trajectory=None)
        assert bundle.passed
    """

    def evaluate(
        self, case: EvalCase, result_state: dict[str, Any], trajectory: Any = None,
    ) -> ScoreBundle:
        scores: dict[str, float] = {}
        failures: list[str] = []

        if not case.expected:
            # No expected values: fall back to the trajectory's final Φ,
            # if available; otherwise the case passes vacuously.
            if trajectory is not None and hasattr(trajectory, "final_phi"):
                phi = trajectory.final_phi or 0.0
                scores["phi"] = phi
                if phi < 7.0:
                    failures.append(f"final_phi={phi:.1f} < 7.0")
            return ScoreBundle(
                scores=scores,
                passed=len(failures) == 0,
                failure_reasons=failures,
            )

        # Check each expected key against the final state. case.expected is
        # non-empty here (the empty case returned above).
        matches = 0
        for key, expected_val in case.expected.items():
            actual = result_state.get(key)
            is_match = actual == expected_val
            scores[f"match_{key}"] = 1.0 if is_match else 0.0
            if is_match:
                matches += 1
            else:
                failures.append(f"{key}: expected={expected_val}, got={actual}")
        scores["exact_match"] = matches / len(case.expected)

        return ScoreBundle(
            scores=scores,
            passed=len(failures) == 0,
            failure_reasons=failures,
        )

    def score_bundle(self, bundles: list[ScoreBundle]) -> dict[str, float]:
        if not bundles:
            return {}
        n = len(bundles)
        pass_rate = sum(1 for b in bundles if b.passed) / n

        # Average every score key across bundles; a bundle that lacks a key
        # contributes 0.0 to that key's average.
        all_keys: set[str] = set()
        for b in bundles:
            all_keys.update(b.scores.keys())
        avgs: dict[str, float] = {}
        for key in all_keys:
            vals = [b.scores.get(key, 0.0) for b in bundles]
            avgs[f"avg_{key}"] = sum(vals) / len(vals)
        avgs["pass_rate"] = pass_rate
        avgs["n"] = float(n)
        return avgs
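

if __name__ == "__main__":
    # Minimal smoke demo with invented values, showing the intended
    # evaluate() -> score_bundle() flow end to end.
    port = DictEvalPort()
    cases = [
        EvalCase(id="t1", input_purpose="demo", expected={"done": True}),
        EvalCase(id="t2", input_purpose="demo", expected={"done": True, "n": 3}),
    ]
    final_states = [{"done": True}, {"done": True, "n": 2}]
    results = [port.evaluate(c, s) for c, s in zip(cases, final_states)]
    # t1 passes; t2 fails on "n" (expected 3, got 2), so pass_rate == 0.5.
    print(port.score_bundle(results))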