| """ |
| evalport.py — Evaluation port protocol for pluggable scoring. |
| |
| Defines the interface between the framework and evaluation backends. |
| EvalPort is a protocol (structural typing) — any object with the right |
| methods can serve as an evaluation backend. |
| """ |
| from __future__ import annotations |
|
|
| from dataclasses import dataclass, field |
| from typing import Any, Protocol, runtime_checkable |
|
|
|
|
| @dataclass |
| class ScoreBundle: |
| """A collection of named scores for one evaluation.""" |
| scores: dict[str, float] = field(default_factory=dict) |
| metadata: dict[str, Any] = field(default_factory=dict) |
| passed: bool = True |
| failure_reasons: list[str] = field(default_factory=list) |
|
|
| def __getitem__(self, key: str) -> float: |
| return self.scores[key] |
|
|
| def get(self, key: str, default: float = 0.0) -> float: |
| return self.scores.get(key, default) |
|
|
|
|
| @dataclass |
| class EvalCase: |
| """A single evaluation test case.""" |
| id: str |
| input_purpose: str |
| input_state: dict[str, Any] = field(default_factory=dict) |
| expected: dict[str, Any] = field(default_factory=dict) |
| max_steps: int = 20 |
| category: str = "general" |
| difficulty: str = "medium" |
| split: str = "test" |
| tags: list[str] = field(default_factory=list) |
|
|
|
|
| @runtime_checkable |
| class EvalPort(Protocol): |
| """ |
| Protocol for evaluation backends. |
| |
| Implement this to plug in custom scoring logic. |
| The framework calls evaluate() after each task and score_bundle() |
| to aggregate results. |
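
    Because the protocol is @runtime_checkable, conformance is structural
    and isinstance() only checks that the methods exist. A minimal sketch
    of a custom backend (illustrative; the "steps" metric is invented for
    this example and is not part of the framework):

        class StepBudgetPort:
            def evaluate(self, case, result_state, trajectory):
                return ScoreBundle(scores={"steps": float(case.max_steps)})

            def score_bundle(self, bundles):
                return {"n": float(len(bundles))}

        assert isinstance(StepBudgetPort(), EvalPort)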
| """ |
|
|
| def evaluate(self, case: EvalCase, result_state: dict[str, Any], trajectory: Any) -> ScoreBundle: |
| """Evaluate a single completed task.""" |
| ... |
|
|
| def score_bundle(self, bundles: list[ScoreBundle]) -> dict[str, float]: |
| """Aggregate multiple ScoreBundles into summary metrics.""" |
| ... |
|
|
|
|
| class DictEvalPort: |
| """ |
| Simple evaluation port that checks expected keys in the final state. |
| |
| Usage: |
| port = DictEvalPort() |
| case = EvalCase(id="t1", input_purpose="...", |
| expected={"task_complete": True}) |
| bundle = port.evaluate(case, result_state={"task_complete": True}, trajectory=None) |
| assert bundle.passed |
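
    Aggregating across cases (continuing the example; "pass_rate" is one
    of the summary keys produced by score_bundle below):
        summary = port.score_bundle([bundle])
        assert summary["pass_rate"] == 1.0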
| """ |
|
|
| def evaluate( |
| self, case: EvalCase, result_state: dict[str, Any], trajectory: Any = None, |
| ) -> ScoreBundle: |
| scores = {} |
| failures = [] |
|
|
| if not case.expected: |
| |
| if trajectory and hasattr(trajectory, "final_phi"): |
| phi = trajectory.final_phi or 0 |
| scores["phi"] = phi |
| if phi < 7.0: |
| failures.append(f"final_phi={phi:.1f} < 7.0") |
| return ScoreBundle( |
| scores=scores, passed=len(failures) == 0, |
| failure_reasons=failures, |
| ) |
|
|
| |
| matches = 0 |
| for key, expected_val in case.expected.items(): |
| actual = result_state.get(key) |
| is_match = actual == expected_val |
| scores[f"match_{key}"] = 1.0 if is_match else 0.0 |
| if is_match: |
| matches += 1 |
| else: |
| failures.append(f"{key}: expected={expected_val}, got={actual}") |
|
|
| scores["exact_match"] = matches / len(case.expected) if case.expected else 1.0 |
| return ScoreBundle( |
| scores=scores, |
| passed=len(failures) == 0, |
| failure_reasons=failures, |
| ) |
|
|
    def score_bundle(self, bundles: list[ScoreBundle]) -> dict[str, float]:
        if not bundles:
            return {}
        n = len(bundles)
        pass_rate = sum(1 for b in bundles if b.passed) / n

        # Average every score key seen in any bundle; a bundle that lacks
        # a key contributes 0.0 to that key's average.
        all_keys = set()
        for b in bundles:
            all_keys.update(b.scores.keys())

        avgs = {}
        for key in all_keys:
            vals = [b.scores.get(key, 0.0) for b in bundles]
            avgs[f"avg_{key}"] = sum(vals) / len(vals)

        avgs["pass_rate"] = pass_rate
        avgs["n"] = float(n)
        return avgs
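

# Minimal smoke test when the module is run directly (illustrative only;
# the case id, input_purpose, and "task_complete" key are invented data).
if __name__ == "__main__":
    port = DictEvalPort()
    case = EvalCase(id="t1", input_purpose="finish the task",
                    expected={"task_complete": True})
    bundles = [
        port.evaluate(case, result_state={"task_complete": True}),
        port.evaluate(case, result_state={"task_complete": False}),
    ]
    assert bundles[0]["exact_match"] == 1.0  # __getitem__ reads from .scores
    summary = port.score_bundle(bundles)
    # Expect: pass_rate=0.5, avg_exact_match=0.5, n=2.0
    print(summary)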