""" evalport.py — Evaluation port protocol for pluggable scoring. Defines the interface between the framework and evaluation backends. EvalPort is a protocol (structural typing) — any object with the right methods can serve as an evaluation backend. """ from __future__ import annotations from dataclasses import dataclass, field from typing import Any, Protocol, runtime_checkable @dataclass class ScoreBundle: """A collection of named scores for one evaluation.""" scores: dict[str, float] = field(default_factory=dict) metadata: dict[str, Any] = field(default_factory=dict) passed: bool = True failure_reasons: list[str] = field(default_factory=list) def __getitem__(self, key: str) -> float: return self.scores[key] def get(self, key: str, default: float = 0.0) -> float: return self.scores.get(key, default) @dataclass class EvalCase: """A single evaluation test case.""" id: str input_purpose: str input_state: dict[str, Any] = field(default_factory=dict) expected: dict[str, Any] = field(default_factory=dict) max_steps: int = 20 category: str = "general" difficulty: str = "medium" split: str = "test" # "train", "validation", "test" tags: list[str] = field(default_factory=list) @runtime_checkable class EvalPort(Protocol): """ Protocol for evaluation backends. Implement this to plug in custom scoring logic. The framework calls evaluate() after each task and score_bundle() to aggregate results. """ def evaluate(self, case: EvalCase, result_state: dict[str, Any], trajectory: Any) -> ScoreBundle: """Evaluate a single completed task.""" ... def score_bundle(self, bundles: list[ScoreBundle]) -> dict[str, float]: """Aggregate multiple ScoreBundles into summary metrics.""" ... class DictEvalPort: """ Simple evaluation port that checks expected keys in the final state. Usage: port = DictEvalPort() case = EvalCase(id="t1", input_purpose="...", expected={"task_complete": True}) bundle = port.evaluate(case, result_state={"task_complete": True}, trajectory=None) assert bundle.passed """ def evaluate( self, case: EvalCase, result_state: dict[str, Any], trajectory: Any = None, ) -> ScoreBundle: scores = {} failures = [] if not case.expected: # No expected values — score by Φ if trajectory and hasattr(trajectory, "final_phi"): phi = trajectory.final_phi or 0 scores["phi"] = phi if phi < 7.0: failures.append(f"final_phi={phi:.1f} < 7.0") return ScoreBundle( scores=scores, passed=len(failures) == 0, failure_reasons=failures, ) # Check each expected key matches = 0 for key, expected_val in case.expected.items(): actual = result_state.get(key) is_match = actual == expected_val scores[f"match_{key}"] = 1.0 if is_match else 0.0 if is_match: matches += 1 else: failures.append(f"{key}: expected={expected_val}, got={actual}") scores["exact_match"] = matches / len(case.expected) if case.expected else 1.0 return ScoreBundle( scores=scores, passed=len(failures) == 0, failure_reasons=failures, ) def score_bundle(self, bundles: list[ScoreBundle]) -> dict[str, float]: if not bundles: return {} n = len(bundles) pass_rate = sum(1 for b in bundles if b.passed) / n # Average all score keys all_keys = set() for b in bundles: all_keys.update(b.scores.keys()) avgs = {} for key in all_keys: vals = [b.scores.get(key, 0) for b in bundles] avgs[f"avg_{key}"] = sum(vals) / len(vals) avgs["pass_rate"] = pass_rate avgs["n"] = float(n) return avgs