# purpose_agent/evaluation.py
"""
Evaluation Harness — Benchmark runner with improvement curve tracking.
Proves the self-improvement claim: run the same tasks N times and
show that performance improves with each iteration.
Features:
- Run standard benchmarks (or custom task sets)
- Track improvement curves across iterations
- Compare cold-start vs warm-start performance
- Export results as JSON/CSV for plotting
- Statistical significance testing
"""
from __future__ import annotations
import json
import logging
import math
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
from purpose_agent.types import State
from purpose_agent.orchestrator import Environment, Orchestrator, TaskResult
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Benchmark Task
# ---------------------------------------------------------------------------
@dataclass
class BenchmarkTask:
"""A single task in a benchmark suite."""
id: str
purpose: str
initial_state: State
expected_outcome: dict[str, Any] = field(default_factory=dict)
max_steps: int = 20
category: str = "general"
difficulty: str = "medium" # easy, medium, hard
def check_success(self, result: TaskResult) -> bool:
"""Check if the task was completed successfully."""
if not self.expected_outcome:
return result.success # Default: Φ > 7.0
# Custom success criteria
final_data = result.final_state.data
for key, expected in self.expected_outcome.items():
if key not in final_data:
return False
if final_data[key] != expected:
return False
return True
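# Illustrative usage of expected_outcome (hypothetical task and state values;
# the real State constructor lives in purpose_agent.types):
#
#   task = BenchmarkTask(
#       id="open-door",
#       purpose="Open the locked door",
#       initial_state=State(data={"door_open": False}),
#       expected_outcome={"door_open": True},
#       category="puzzles",
#       difficulty="easy",
#   )
#
# check_success() then requires final_state.data["door_open"] == True instead
# of falling back to the orchestrator's Φ threshold.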
# ---------------------------------------------------------------------------
# Evaluation Result
# ---------------------------------------------------------------------------
@dataclass
class EvalResult:
"""Result of evaluating one task in one iteration."""
task_id: str
iteration: int
success: bool
steps: int
cumulative_reward: float
final_phi: float | None
    success_rate: float  # action-level success rate taken from result.trajectory
wall_time_s: float
category: str = ""
difficulty: str = ""
@dataclass
class BenchmarkResult:
"""Aggregate results from a benchmark run."""
benchmark_name: str
iterations: int
results: list[EvalResult] = field(default_factory=list)
started_at: float = field(default_factory=time.time)
finished_at: float = 0.0
def get_improvement_curve(self) -> list[dict[str, Any]]:
"""
Get the improvement curve: success rate per iteration.
This is the key chart that proves self-improvement.
"""
by_iteration: dict[int, list[EvalResult]] = {}
for r in self.results:
by_iteration.setdefault(r.iteration, []).append(r)
curve = []
for iteration in sorted(by_iteration.keys()):
results = by_iteration[iteration]
successes = sum(1 for r in results if r.success)
total = len(results)
avg_phi = sum(r.final_phi or 0 for r in results) / total if total else 0
avg_steps = sum(r.steps for r in results) / total if total else 0
avg_reward = sum(r.cumulative_reward for r in results) / total if total else 0
curve.append({
"iteration": iteration,
"success_rate": successes / total if total else 0,
"total_tasks": total,
"successes": successes,
"avg_final_phi": round(avg_phi, 2),
"avg_steps": round(avg_steps, 1),
"avg_cumulative_reward": round(avg_reward, 2),
})
return curve
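    # Example entry produced by get_improvement_curve() (illustrative values):
    #   {"iteration": 1, "success_rate": 0.4, "total_tasks": 5, "successes": 2,
    #    "avg_final_phi": 6.1, "avg_steps": 14.2, "avg_cumulative_reward": 3.75}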
def get_per_category(self) -> dict[str, dict]:
"""Get results broken down by category."""
by_cat: dict[str, list[EvalResult]] = {}
for r in self.results:
by_cat.setdefault(r.category or "general", []).append(r)
summary = {}
for cat, results in by_cat.items():
successes = sum(1 for r in results if r.success)
summary[cat] = {
"total": len(results),
"successes": successes,
"success_rate": successes / len(results),
}
return summary
def summary(self) -> str:
"""Human-readable summary."""
curve = self.get_improvement_curve()
lines = [
f"═══ Benchmark: {self.benchmark_name} ═══",
f"Iterations: {self.iterations}",
f"Total evaluations: {len(self.results)}",
f"Duration: {self.finished_at - self.started_at:.1f}s",
"",
"Improvement Curve:",
f"{'Iteration':>10} {'Success Rate':>15} {'Avg Φ':>10} {'Avg Steps':>12} {'Avg Reward':>12}",
"-" * 65,
]
for point in curve:
lines.append(
f"{point['iteration']:>10} "
f"{point['success_rate']:>14.1%} "
f"{point['avg_final_phi']:>10.2f} "
f"{point['avg_steps']:>12.1f} "
f"{point['avg_cumulative_reward']:>12.2f}"
)
# Improvement delta
if len(curve) >= 2:
first = curve[0]["success_rate"]
last = curve[-1]["success_rate"]
delta = last - first
lines.append(f"\nImprovement: {first:.1%}{last:.1%} ({delta:+.1%})")
return "\n".join(lines)
def to_json(self) -> str:
return json.dumps({
"benchmark": self.benchmark_name,
"iterations": self.iterations,
"improvement_curve": self.get_improvement_curve(),
"per_category": self.get_per_category(),
"results": [
{
"task_id": r.task_id,
"iteration": r.iteration,
"success": r.success,
"steps": r.steps,
"final_phi": r.final_phi,
"cumulative_reward": r.cumulative_reward,
"wall_time_s": r.wall_time_s,
"category": r.category,
}
for r in self.results
],
}, indent=2)
def save(self, path: str) -> None:
Path(path).parent.mkdir(parents=True, exist_ok=True)
with open(path, "w") as f:
f.write(self.to_json())
logger.info(f"Benchmark results saved to {path}")
# ---------------------------------------------------------------------------
# Benchmark Runner
# ---------------------------------------------------------------------------
class BenchmarkRunner:
"""
Runs benchmark suites to prove self-improvement.
The key test: run the same tasks multiple times (iterations).
On iteration 1, the agent has no experience. By iteration N,
it should have learned from previous attempts.
Usage:
runner = BenchmarkRunner(orchestrator=orch)
# Define tasks
tasks = [
BenchmarkTask(id="t1", purpose="Find treasure", initial_state=...),
BenchmarkTask(id="t2", purpose="Solve puzzle", initial_state=...),
]
# Run 5 iterations
result = runner.run(tasks, iterations=5, name="TreasureMaze")
# See the improvement curve
print(result.summary())
result.save("results/benchmark.json")
"""
def __init__(
self,
orchestrator: Orchestrator,
reset_between_iterations: bool = False,
verbose: bool = True,
):
self.orch = orchestrator
self.reset_between_iterations = reset_between_iterations
self.verbose = verbose
def run(
self,
tasks: list[BenchmarkTask],
iterations: int = 5,
name: str = "benchmark",
) -> BenchmarkResult:
"""
Run benchmark: execute all tasks for N iterations.
The experience replay and heuristic library persist between iterations
(unless reset_between_iterations=True), so the agent should improve.
"""
benchmark = BenchmarkResult(
benchmark_name=name,
iterations=iterations,
)
for iteration in range(1, iterations + 1):
if self.verbose:
logger.info(f"\n{'='*60}")
logger.info(f" Iteration {iteration}/{iterations}")
logger.info(f"{'='*60}")
            if self.reset_between_iterations and iteration > 1:
                # Placeholder: clearing the experience replay / heuristic
                # library here would isolate within-iteration learning.
                # The reset is not implemented yet, so flag it rather than
                # silently doing nothing.
                logger.warning("reset_between_iterations=True, but memory reset is not implemented")
for task in tasks:
start = time.time()
try:
result = self.orch.run_task(
purpose=task.purpose,
initial_state=task.initial_state,
max_steps=task.max_steps,
)
success = task.check_success(result)
eval_result = EvalResult(
task_id=task.id,
iteration=iteration,
success=success,
steps=result.total_steps,
cumulative_reward=result.cumulative_reward,
final_phi=result.final_phi,
success_rate=result.trajectory.success_rate,
wall_time_s=time.time() - start,
category=task.category,
difficulty=task.difficulty,
)
except Exception as e:
logger.error(f"Task {task.id} failed: {e}")
eval_result = EvalResult(
task_id=task.id,
iteration=iteration,
success=False,
steps=0,
cumulative_reward=0,
final_phi=None,
success_rate=0,
wall_time_s=time.time() - start,
category=task.category,
difficulty=task.difficulty,
)
benchmark.results.append(eval_result)
if self.verbose:
status = "✓" if eval_result.success else "✗"
logger.info(
f" {status} Task '{task.id}' — "
f"Φ={eval_result.final_phi or 0:.1f}, "
f"steps={eval_result.steps}, "
f"reward={eval_result.cumulative_reward:.2f}"
)
# Log iteration summary
if self.verbose:
curve = benchmark.get_improvement_curve()
if curve:
latest = curve[-1]
logger.info(
f" Iteration {iteration} summary: "
f"success={latest['success_rate']:.1%}, "
f"avg_Φ={latest['avg_final_phi']:.2f}"
)
benchmark.finished_at = time.time()
return benchmark
def compare_cold_vs_warm(
self,
tasks: list[BenchmarkTask],
) -> dict[str, Any]:
"""
Compare cold-start (no experience) vs warm-start (with experience).
Runs tasks once with empty memory, then again with the learned memory.
The delta proves self-improvement.
"""
# Cold start
cold_result = self.run(tasks, iterations=1, name="cold_start")
cold_curve = cold_result.get_improvement_curve()
cold_success = cold_curve[0]["success_rate"] if cold_curve else 0
# Warm start (memory retained from cold run)
warm_result = self.run(tasks, iterations=1, name="warm_start")
warm_curve = warm_result.get_improvement_curve()
warm_success = warm_curve[0]["success_rate"] if warm_curve else 0
return {
"cold_start_success_rate": cold_success,
"warm_start_success_rate": warm_success,
"improvement": warm_success - cold_success,
"cold_avg_phi": cold_curve[0]["avg_final_phi"] if cold_curve else 0,
"warm_avg_phi": warm_curve[0]["avg_final_phi"] if warm_curve else 0,
"heuristics_learned": len(self.orch.optimizer.heuristic_library),
"experiences_stored": self.orch.experience_replay.size,
}
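# ---------------------------------------------------------------------------
# Significance Testing
# ---------------------------------------------------------------------------
# The module docstring lists "statistical significance testing" as a feature;
# the helper below is a minimal sketch of one reasonable choice: a
# two-proportion z-test on success counts, e.g. iteration 1 vs iteration N.
# The normal approximation is rough for small task sets, so treat p-values
# from a handful of tasks with caution.
def proportion_z_test(
    successes_a: int,
    n_a: int,
    successes_b: int,
    n_b: int,
) -> dict[str, float]:
    """Two-sided two-proportion z-test comparing two success rates."""
    if n_a == 0 or n_b == 0:
        return {"z": 0.0, "p_value": 1.0}
    pooled = (successes_a + successes_b) / (n_a + n_b)
    se = math.sqrt(pooled * (1 - pooled) * (1 / n_a + 1 / n_b))
    if se == 0.0:
        # Degenerate case: both samples are all-success or all-failure.
        return {"z": 0.0, "p_value": 1.0}
    z = (successes_b / n_b - successes_a / n_a) / se
    # Standard normal CDF via math.erf; p = 2 * (1 - CDF(|z|)).
    cdf = 0.5 * (1.0 + math.erf(abs(z) / math.sqrt(2.0)))
    return {"z": z, "p_value": 2.0 * (1.0 - cdf)}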