| """ |
| evalport.py — Evaluation port protocol for pluggable scoring. |
| |
| Defines the interface between the framework and evaluation backends. |
| EvalPort is a protocol (structural typing) — any object with the right |
| methods can serve as an evaluation backend. |
| """ |
| from __future__ import annotations |
|
|
| from dataclasses import dataclass, field |
| from typing import Any, Protocol, runtime_checkable |
|
|
|
|
| @dataclass |
| class ScoreBundle: |
| """A collection of named scores for one evaluation.""" |
| scores: dict[str, float] = field(default_factory=dict) |
| metadata: dict[str, Any] = field(default_factory=dict) |
| passed: bool = True |
| failure_reasons: list[str] = field(default_factory=list) |
|
|
| def __getitem__(self, key: str) -> float: |
| return self.scores[key] |
|
|
| def get(self, key: str, default: float = 0.0) -> float: |
| return self.scores.get(key, default) |
|
|
|
|
| @dataclass |
| class EvalCase: |
| """A single evaluation test case.""" |
| id: str |
| input_purpose: str |
| input_state: dict[str, Any] = field(default_factory=dict) |
| expected: dict[str, Any] = field(default_factory=dict) |
| max_steps: int = 20 |
| category: str = "general" |
| difficulty: str = "medium" |
| split: str = "test" |
| tags: list[str] = field(default_factory=list) |
|
|
|
|
| @runtime_checkable |
| class EvalPort(Protocol): |
| """ |
| Protocol for evaluation backends. |
| |
| Implement this to plug in custom scoring logic. |
| The framework calls evaluate() after each task and score_bundle() |
| to aggregate results. |
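
    Because the protocol is @runtime_checkable, conformance is structural
    and isinstance() only checks that the methods exist. A minimal sketch
    of a custom backend (illustrative; the "steps" metric is invented for
    this example and is not part of the framework):

        class StepBudgetPort:
            def evaluate(self, case, result_state, trajectory):
                return ScoreBundle(scores={"steps": float(case.max_steps)})

            def score_bundle(self, bundles):
                return {"n": float(len(bundles))}

        assert isinstance(StepBudgetPort(), EvalPort)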
| """ |
|
|
| def evaluate(self, case: EvalCase, result_state: dict[str, Any], trajectory: Any) -> ScoreBundle: |
| """Evaluate a single completed task.""" |
| ... |
|
|
| def score_bundle(self, bundles: list[ScoreBundle]) -> dict[str, float]: |
| """Aggregate multiple ScoreBundles into summary metrics.""" |
| ... |
|
|
|
|
| class DictEvalPort: |
| """ |
| Simple evaluation port that checks expected keys in the final state. |
| |
| Usage: |
| port = DictEvalPort() |
| case = EvalCase(id="t1", input_purpose="...", |
| expected={"task_complete": True}) |
| bundle = port.evaluate(case, result_state={"task_complete": True}, trajectory=None) |
| assert bundle.passed |
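
    Aggregating across cases (continuing the example; "pass_rate" is one
    of the summary keys produced by score_bundle below):
        summary = port.score_bundle([bundle])
        assert summary["pass_rate"] == 1.0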
| """ |
|
|
| def evaluate( |
| self, case: EvalCase, result_state: dict[str, Any], trajectory: Any = None, |
| ) -> ScoreBundle: |
| scores = {} |
| failures = [] |
|
|
| if not case.expected: |
| |
| if trajectory and hasattr(trajectory, "final_phi"): |
| phi = trajectory.final_phi or 0 |
| scores["phi"] = phi |
| if phi < 7.0: |
| failures.append(f"final_phi={phi:.1f} < 7.0") |
| return ScoreBundle( |
| scores=scores, passed=len(failures) == 0, |
| failure_reasons=failures, |
| ) |
|
|
| |
| matches = 0 |
| for key, expected_val in case.expected.items(): |
| actual = result_state.get(key) |
| is_match = actual == expected_val |
| scores[f"match_{key}"] = 1.0 if is_match else 0.0 |
| if is_match: |
| matches += 1 |
| else: |
| failures.append(f"{key}: expected={expected_val}, got={actual}") |
|
|
| scores["exact_match"] = matches / len(case.expected) if case.expected else 1.0 |
| return ScoreBundle( |
| scores=scores, |
| passed=len(failures) == 0, |
| failure_reasons=failures, |
| ) |
|
|
    def score_bundle(self, bundles: list[ScoreBundle]) -> dict[str, float]:
        if not bundles:
            return {}
        n = len(bundles)
        pass_rate = sum(1 for b in bundles if b.passed) / n

        # Average every score key seen in any bundle; a bundle that lacks
        # a key contributes 0.0 to that key's average.
        all_keys = set()
        for b in bundles:
            all_keys.update(b.scores.keys())

        avgs = {}
        for key in all_keys:
            vals = [b.scores.get(key, 0.0) for b in bundles]
            avgs[f"avg_{key}"] = sum(vals) / len(vals)

        avgs["pass_rate"] = pass_rate
        avgs["n"] = float(n)
        return avgs
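

# Minimal smoke test when the module is run directly (illustrative only;
# the case id, input_purpose, and "task_complete" key are invented data).
if __name__ == "__main__":
    port = DictEvalPort()
    case = EvalCase(id="t1", input_purpose="finish the task",
                    expected={"task_complete": True})
    bundles = [
        port.evaluate(case, result_state={"task_complete": True}),
        port.evaluate(case, result_state={"task_complete": False}),
    ]
    assert bundles[0]["exact_match"] == 1.0  # __getitem__ reads from .scores
    summary = port.score_bundle(bundles)
    # Expect: pass_rate=0.5, avg_exact_match=0.5, n=2.0
    print(summary)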