| """ |
| Evaluation Harness — Benchmark runner with improvement curve tracking. |
| |
| Proves the self-improvement claim: run the same tasks N times and |
| show that performance improves with each iteration. |
| |
| Features: |
| - Run standard benchmarks (or custom task sets) |
| - Track improvement curves across iterations |
| - Compare cold-start vs warm-start performance |
| - Export results as JSON/CSV for plotting |
| - Statistical significance testing |
| """ |
|
|
| from __future__ import annotations |
|
|
| import json |
| import logging |
| import math |
| import time |
| from dataclasses import dataclass, field |
| from pathlib import Path |
| from typing import Any, Callable |
|
|
| from purpose_agent.types import State, Trajectory |
| from purpose_agent.orchestrator import Environment, Orchestrator, TaskResult |
|
|
| logger = logging.getLogger(__name__) |
|
|
|
|
| |
| |
| |
|
|
@dataclass
class BenchmarkTask:
    """A single task in a benchmark suite."""
    id: str
    purpose: str
    initial_state: State
    expected_outcome: dict[str, Any] = field(default_factory=dict)
    max_steps: int = 20
    category: str = "general"
    difficulty: str = "medium"

    def check_success(self, result: TaskResult) -> bool:
        """Check if the task was completed successfully.

        With no ``expected_outcome`` configured, defer to the result's own
        success flag. Otherwise every expected key must be present in the
        final state's data with exactly the expected value.
        """
        if not self.expected_outcome:
            return result.success

        data = result.final_state.data
        for key, want in self.expected_outcome.items():
            # Missing key or mismatched value both fail the task.
            if key not in data or data[key] != want:
                return False
        return True
|
|
|
|
| |
| |
| |
|
|
@dataclass
class EvalResult:
    """Result of evaluating one task in one iteration."""
    # Identifier of the BenchmarkTask that was evaluated.
    task_id: str
    # 1-based benchmark iteration this evaluation belongs to.
    iteration: int
    # Outcome of BenchmarkTask.check_success (False when the task raised).
    success: bool
    # Steps the orchestrator reported for this run (0 when the task raised).
    steps: int
    # Total reward accumulated over the run.
    cumulative_reward: float
    # Final phi value from the orchestrator; None when the task raised.
    final_phi: float | None
    # Trajectory-level success rate reported by the orchestrator.
    success_rate: float
    # Wall-clock seconds from task start to result (includes failures).
    wall_time_s: float
    # Copied from the task; used for per-category breakdowns.
    category: str = ""
    # Copied from the task; recorded but not currently aggregated.
    difficulty: str = ""
|
|
|
|
@dataclass
class BenchmarkResult:
    """Aggregate results from a benchmark run.

    Collects per-task ``EvalResult`` records and derives iteration-level
    aggregates (the improvement curve), per-category breakdowns, and
    JSON/text exports.
    """
    benchmark_name: str
    iterations: int
    results: list[EvalResult] = field(default_factory=list)
    # Timestamps (epoch seconds); finished_at is filled in by the runner.
    started_at: float = field(default_factory=time.time)
    finished_at: float = 0.0

    def get_improvement_curve(self) -> list[dict[str, Any]]:
        """
        Get the improvement curve: success rate per iteration.

        This is the key chart that proves self-improvement.

        Returns:
            One dict per iteration (sorted by iteration number) with
            success counts and per-iteration averages.
        """
        by_iteration: dict[int, list[EvalResult]] = {}
        for r in self.results:
            by_iteration.setdefault(r.iteration, []).append(r)

        curve = []
        for iteration in sorted(by_iteration):
            group = by_iteration[iteration]
            total = len(group)
            successes = sum(1 for r in group if r.success)
            # NOTE: a missing phi (None, e.g. from a crashed task) is
            # counted as 0 in the average.
            avg_phi = sum(r.final_phi or 0 for r in group) / total if total else 0
            avg_steps = sum(r.steps for r in group) / total if total else 0
            avg_reward = sum(r.cumulative_reward for r in group) / total if total else 0

            curve.append({
                "iteration": iteration,
                "success_rate": successes / total if total else 0,
                "total_tasks": total,
                "successes": successes,
                "avg_final_phi": round(avg_phi, 2),
                "avg_steps": round(avg_steps, 1),
                "avg_cumulative_reward": round(avg_reward, 2),
            })
        return curve

    def get_per_category(self) -> dict[str, dict]:
        """Get results broken down by category (empty category → "general")."""
        by_cat: dict[str, list[EvalResult]] = {}
        for r in self.results:
            by_cat.setdefault(r.category or "general", []).append(r)

        summary = {}
        for cat, group in by_cat.items():
            successes = sum(1 for r in group if r.success)
            summary[cat] = {
                "total": len(group),
                "successes": successes,
                # Safe: groups are only created when at least one result lands in them.
                "success_rate": successes / len(group),
            }
        return summary

    def summary(self) -> str:
        """Human-readable summary: header, improvement-curve table, and
        (with 2+ iterations) the first→last success-rate delta."""
        curve = self.get_improvement_curve()
        lines = [
            f"═══ Benchmark: {self.benchmark_name} ═══",
            f"Iterations: {self.iterations}",
            f"Total evaluations: {len(self.results)}",
            f"Duration: {self.finished_at - self.started_at:.1f}s",
            "",
            "Improvement Curve:",
            f"{'Iteration':>10} {'Success Rate':>15} {'Avg Φ':>10} {'Avg Steps':>12} {'Avg Reward':>12}",
            "-" * 65,
        ]

        for point in curve:
            lines.append(
                f"{point['iteration']:>10} "
                f"{point['success_rate']:>14.1%} "
                f"{point['avg_final_phi']:>10.2f} "
                f"{point['avg_steps']:>12.1f} "
                f"{point['avg_cumulative_reward']:>12.2f}"
            )

        if len(curve) >= 2:
            first = curve[0]["success_rate"]
            last = curve[-1]["success_rate"]
            delta = last - first
            lines.append(f"\nImprovement: {first:.1%} → {last:.1%} ({delta:+.1%})")

        return "\n".join(lines)

    def to_json(self) -> str:
        """Serialize the full result set (curve, categories, raw results) to JSON."""
        return json.dumps({
            "benchmark": self.benchmark_name,
            "iterations": self.iterations,
            "improvement_curve": self.get_improvement_curve(),
            "per_category": self.get_per_category(),
            "results": [
                {
                    "task_id": r.task_id,
                    "iteration": r.iteration,
                    "success": r.success,
                    "steps": r.steps,
                    "final_phi": r.final_phi,
                    "cumulative_reward": r.cumulative_reward,
                    "wall_time_s": r.wall_time_s,
                    "category": r.category,
                    # Fix: difficulty was populated on EvalResult but
                    # previously dropped from the export.
                    "difficulty": r.difficulty,
                }
                for r in self.results
            ],
        }, indent=2)

    def save(self, path: str) -> None:
        """Write the JSON export to *path*, creating parent directories.

        Uses an explicit UTF-8 encoding so non-ASCII output (Φ, ═) is
        written consistently across platforms.
        """
        target = Path(path)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(self.to_json(), encoding="utf-8")
        logger.info("Benchmark results saved to %s", path)
|
|
|
|
| |
| |
| |
|
|
class BenchmarkRunner:
    """
    Runs benchmark suites to prove self-improvement.

    The key test: run the same tasks multiple times (iterations).
    On iteration 1, the agent has no experience. By iteration N,
    it should have learned from previous attempts.

    Usage:
        runner = BenchmarkRunner(orchestrator=orch)

        # Define tasks
        tasks = [
            BenchmarkTask(id="t1", purpose="Find treasure", initial_state=...),
            BenchmarkTask(id="t2", purpose="Solve puzzle", initial_state=...),
        ]

        # Run 5 iterations
        result = runner.run(tasks, iterations=5, name="TreasureMaze")

        # See the improvement curve
        print(result.summary())
        result.save("results/benchmark.json")
    """

    def __init__(
        self,
        orchestrator: Orchestrator,
        reset_between_iterations: bool = False,
        verbose: bool = True,
    ):
        """
        Args:
            orchestrator: Orchestrator used to execute each task.
            reset_between_iterations: If True, request a memory reset between
                iterations. NOTE: not currently implemented — run() logs a
                warning and memory persists across iterations.
            verbose: Log per-task and per-iteration progress.
        """
        self.orch = orchestrator
        self.reset_between_iterations = reset_between_iterations
        self.verbose = verbose

    def run(
        self,
        tasks: list[BenchmarkTask],
        iterations: int = 5,
        name: str = "benchmark",
    ) -> BenchmarkResult:
        """
        Run benchmark: execute all tasks for N iterations.

        The experience replay and heuristic library persist between iterations
        (unless reset_between_iterations=True), so the agent should improve.

        A task that raises is recorded as a failed EvalResult; it never
        aborts the benchmark.
        """
        benchmark = BenchmarkResult(
            benchmark_name=name,
            iterations=iterations,
        )

        for iteration in range(1, iterations + 1):
            if self.verbose:
                # Lazy %-style args so formatting is skipped when the
                # level is disabled.
                logger.info("\n%s", "=" * 60)
                logger.info(" Iteration %d/%d", iteration, iterations)
                logger.info("%s", "=" * 60)

            if self.reset_between_iterations and iteration > 1:
                # Fix: this flag used to be silently ignored. Memory reset
                # is not exposed by the orchestrator, so warn explicitly —
                # otherwise callers believe they measured cold starts.
                logger.warning(
                    "reset_between_iterations=True is not implemented; "
                    "experience persists across iterations"
                )

            for task in tasks:
                start = time.time()

                try:
                    result = self.orch.run_task(
                        purpose=task.purpose,
                        initial_state=task.initial_state,
                        max_steps=task.max_steps,
                    )

                    success = task.check_success(result)
                    eval_result = EvalResult(
                        task_id=task.id,
                        iteration=iteration,
                        success=success,
                        steps=result.total_steps,
                        cumulative_reward=result.cumulative_reward,
                        final_phi=result.final_phi,
                        success_rate=result.trajectory.success_rate,
                        wall_time_s=time.time() - start,
                        category=task.category,
                        difficulty=task.difficulty,
                    )
                except Exception as e:
                    # logger.exception keeps the traceback (logger.error
                    # discarded it); the crash is recorded as a failure.
                    logger.exception("Task %s failed: %s", task.id, e)
                    eval_result = EvalResult(
                        task_id=task.id,
                        iteration=iteration,
                        success=False,
                        steps=0,
                        cumulative_reward=0,
                        final_phi=None,
                        success_rate=0,
                        wall_time_s=time.time() - start,
                        category=task.category,
                        difficulty=task.difficulty,
                    )

                benchmark.results.append(eval_result)

                if self.verbose:
                    status = "✓" if eval_result.success else "✗"
                    logger.info(
                        " %s Task '%s' — Φ=%.1f, steps=%d, reward=%.2f",
                        status,
                        task.id,
                        eval_result.final_phi or 0,
                        eval_result.steps,
                        eval_result.cumulative_reward,
                    )

            if self.verbose:
                curve = benchmark.get_improvement_curve()
                if curve:
                    latest = curve[-1]
                    # %-formatting has no equivalent of the ".1%" spec, so
                    # pre-format the rate to keep output identical.
                    logger.info(
                        " Iteration %d summary: success=%s, avg_Φ=%.2f",
                        iteration,
                        format(latest["success_rate"], ".1%"),
                        latest["avg_final_phi"],
                    )

        benchmark.finished_at = time.time()
        return benchmark

    def compare_cold_vs_warm(
        self,
        tasks: list[BenchmarkTask],
    ) -> dict[str, Any]:
        """
        Compare cold-start (no experience) vs warm-start (with experience).

        Runs the tasks once ("cold"), then once more ("warm") after the
        first pass has populated the orchestrator's memory. The delta is
        evidence of self-improvement.

        NOTE(review): nothing here clears the orchestrator's memory, so the
        first run is only a true cold start if the orchestrator is fresh
        when this method is called — confirm at the call site.
        """
        cold_result = self.run(tasks, iterations=1, name="cold_start")
        cold_curve = cold_result.get_improvement_curve()
        cold_success = cold_curve[0]["success_rate"] if cold_curve else 0

        warm_result = self.run(tasks, iterations=1, name="warm_start")
        warm_curve = warm_result.get_improvement_curve()
        warm_success = warm_curve[0]["success_rate"] if warm_curve else 0

        return {
            "cold_start_success_rate": cold_success,
            "warm_start_success_rate": warm_success,
            "improvement": warm_success - cold_success,
            "cold_avg_phi": cold_curve[0]["avg_final_phi"] if cold_curve else 0,
            "warm_avg_phi": warm_curve[0]["avg_final_phi"] if warm_curve else 0,
            "heuristics_learned": len(self.orch.optimizer.heuristic_library),
            "experiences_stored": self.orch.experience_replay.size,
        }
|
|