Rohan03 committed
Commit 08321e5 · verified · 1 Parent(s): 52e6e2d

V2 merge: purpose_agent/benchmark_v2.py

Files changed (1)
  1. purpose_agent/benchmark_v2.py +284 -0
purpose_agent/benchmark_v2.py ADDED
@@ -0,0 +1,284 @@
+ """
+ BenchmarkRunnerV2 — Rigorous evaluation with train/val/test splits,
+ memory ablation, shuffle control, and contamination detection.
+
+ Key difference from V1: BenchmarkRunnerV2 enforces RunMode. In eval_test
+ mode, no memory is written. This is the only mode whose numbers are trustworthy.
+ """
+ from __future__ import annotations
+
+ import json
+ import logging
+ import time
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from typing import Any
+
+ from purpose_agent.v2_types import RunMode
+ from purpose_agent.evalport import EvalCase, EvalPort, DictEvalPort, ScoreBundle
+ from purpose_agent.orchestrator import Orchestrator, TaskResult
+ from purpose_agent.types import State, Trajectory
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class V2EvalResult:
+     """Result of one evaluation case."""
+     case_id: str
+     iteration: int
+     split: str
+     bundle: ScoreBundle
+     steps: int = 0
+     wall_time_s: float = 0.0
+
+
+ @dataclass
+ class V2BenchmarkResult:
+     """Full benchmark result with per-split reporting."""
+     name: str
+     results: list[V2EvalResult] = field(default_factory=list)
+     config: dict[str, Any] = field(default_factory=dict)
+     started_at: float = field(default_factory=time.time)
+     finished_at: float = 0.0
+
+     def get_split_summary(self, split: str) -> dict[str, float]:
+         """Get aggregate metrics for a specific split."""
+         split_results = [r for r in self.results if r.split == split]
+         if not split_results:
+             return {}
+         n = len(split_results)
+         pass_rate = sum(1 for r in split_results if r.bundle.passed) / n
+         avg_steps = sum(r.steps for r in split_results) / n
+         return {
+             "n": n,
+             "pass_rate": round(pass_rate, 3),
+             "avg_steps": round(avg_steps, 1),
+         }
+
+     def get_improvement_curve(self, split: str = "test") -> list[dict]:
+         """Get per-iteration metrics for one split."""
+         by_iter: dict[int, list[V2EvalResult]] = {}
+         for r in self.results:
+             if r.split == split:
+                 by_iter.setdefault(r.iteration, []).append(r)
+
+         curve = []
+         for it in sorted(by_iter):
+             results = by_iter[it]
+             n = len(results)
+             pass_rate = sum(1 for r in results if r.bundle.passed) / n
+             curve.append({
+                 "iteration": it,
+                 "pass_rate": round(pass_rate, 3),
+                 "n": n,
+             })
+         return curve
+
+     def summary(self) -> str:
+         lines = [f"═══ Benchmark: {self.name} ═══"]
+         for split in ["train", "validation", "test"]:
+             s = self.get_split_summary(split)
+             if s:
+                 lines.append(f" {split:>12}: n={s['n']}, pass_rate={s['pass_rate']:.1%}, avg_steps={s['avg_steps']:.1f}")
+
+         curve = self.get_improvement_curve("test")
+         if len(curve) >= 2:
+             first = curve[0]["pass_rate"]
+             last = curve[-1]["pass_rate"]
+             delta = last - first
+             if abs(delta) < 0.001:
+                 lines.append(f"\n Test improvement: {first:.1%} → {last:.1%} (no significant change)")
+             else:
+                 lines.append(f"\n Test improvement: {first:.1%} → {last:.1%} ({delta:+.1%})")
+         return "\n".join(lines)
+
+     def save(self, path: str) -> None:
+         Path(path).parent.mkdir(parents=True, exist_ok=True)
+         with open(path, "w") as f:
+             json.dump({
+                 "name": self.name,
+                 "config": self.config,
+                 "splits": {
+                     s: self.get_split_summary(s) for s in ["train", "validation", "test"]
+                 },
+                 "curve": self.get_improvement_curve("test"),
+                 "n_results": len(self.results),
+             }, f, indent=2)
+
+
+ class BenchmarkRunnerV2:
+     """
+     Rigorous benchmark runner with train/val/test splits and ablation controls.
+
+     Key guarantee: eval_test cases NEVER cause memory writes.
+
+     Usage:
+         cases = [
+             EvalCase(id="t1", input_purpose="...", split="train", ...),
+             EvalCase(id="t2", input_purpose="...", split="test", ...),
+         ]
+         runner = BenchmarkRunnerV2(orchestrator=orch)
+         result = runner.run(cases, train_iterations=3, eval_iterations=1)
+         print(result.summary())
+     """
+
+     def __init__(
+         self,
+         orchestrator: Orchestrator,
+         eval_port: EvalPort | None = None,
+     ):
+         self.orch = orchestrator
+         self.eval_port = eval_port or DictEvalPort()
+
+     def run(
+         self,
+         cases: list[EvalCase],
+         train_iterations: int = 3,
+         eval_iterations: int = 1,
+         name: str = "v2_benchmark",
+     ) -> V2BenchmarkResult:
+         """
+         Run benchmark: train split with learning, test split without.
+
+         1. Train iterations: run train split cases with RunMode.LEARNING_TRAIN
+         2. Validation: run validation split with RunMode.LEARNING_VALIDATION
+         3. Test: run test split with RunMode.EVAL_TEST (no memory writes)
+         """
+         result = V2BenchmarkResult(name=name, config={
+             "train_iterations": train_iterations,
+             "eval_iterations": eval_iterations,
+         })
+
+         train_cases = [c for c in cases if c.split == "train"]
+         val_cases = [c for c in cases if c.split == "validation"]
+         test_cases = [c for c in cases if c.split == "test"]
+
+         # Phase 1: Training
+         for it in range(1, train_iterations + 1):
+             logger.info(f"Train iteration {it}/{train_iterations}")
+             for case in train_cases:
+                 ev = self._run_case(case, it, RunMode.LEARNING_TRAIN)
+                 result.results.append(ev)
+
+         # Phase 2: Validation
+         for case in val_cases:
+             ev = self._run_case(case, 1, RunMode.LEARNING_VALIDATION)
+             result.results.append(ev)
+
+         # Phase 3: Test (NO MEMORY WRITES)
+         for it in range(1, eval_iterations + 1):
+             logger.info(f"Test iteration {it}/{eval_iterations}")
+             for case in test_cases:
+                 ev = self._run_case(case, it, RunMode.EVAL_TEST)
+                 result.results.append(ev)
+
+         result.finished_at = time.time()
+         return result
+
+     def run_cold_warm(
+         self,
+         test_cases: list[EvalCase],
+         train_cases: list[EvalCase],
+         name: str = "cold_warm",
+     ) -> dict[str, Any]:
+         """Compare cold (no memory) vs warm (after training) on the same test set."""
+         # Cold: eval test cases with empty memory
+         cold_results = []
+         for case in test_cases:
+             ev = self._run_case(case, 0, RunMode.EVAL_TEST)
+             cold_results.append(ev)
+         cold_pass = sum(1 for r in cold_results if r.bundle.passed) / max(len(cold_results), 1)
+
+         # Train
+         for case in train_cases:
+             self._run_case(case, 1, RunMode.LEARNING_TRAIN)
+
+         # Warm: eval same test cases after training
+         warm_results = []
+         for case in test_cases:
+             ev = self._run_case(case, 1, RunMode.EVAL_TEST)
+             warm_results.append(ev)
+         warm_pass = sum(1 for r in warm_results if r.bundle.passed) / max(len(warm_results), 1)
+
+         delta = warm_pass - cold_pass
+         return {
+             "cold_pass_rate": round(cold_pass, 3),
+             "warm_pass_rate": round(warm_pass, 3),
+             "delta": round(delta, 3),
+             "improvement_significant": abs(delta) > 0.05,
+         }
+
+     def run_memory_ablation(
+         self,
+         test_cases: list[EvalCase],
+     ) -> dict[str, Any]:
+         """Run test cases with and without memory to measure memory contribution."""
+         # With memory
+         with_results = []
+         for case in test_cases:
+             ev = self._run_case(case, 1, RunMode.EVAL_TEST)
+             with_results.append(ev)
+         with_pass = sum(1 for r in with_results if r.bundle.passed) / max(len(with_results), 1)
+
+         # Without memory (temporarily clear)
+         saved_lib = list(self.orch.optimizer.heuristic_library)
+         self.orch.optimizer.heuristic_library = []
+         self.orch.sync_memory()
+
+         without_results = []
+         for case in test_cases:
+             ev = self._run_case(case, 1, RunMode.EVAL_TEST)
+             without_results.append(ev)
+         without_pass = sum(1 for r in without_results if r.bundle.passed) / max(len(without_results), 1)
+
+         # Restore
+         self.orch.optimizer.heuristic_library = saved_lib
+         self.orch.sync_memory()
+
+         return {
+             "with_memory_pass_rate": round(with_pass, 3),
+             "without_memory_pass_rate": round(without_pass, 3),
+             "memory_contribution": round(with_pass - without_pass, 3),
+         }
+
+     def _run_case(self, case: EvalCase, iteration: int, mode: RunMode) -> V2EvalResult:
+         """Run a single case under a specific RunMode."""
+         start = time.time()
+
+         # In eval modes: save the optimization interval and disable optimization
+         saved_optimize = self.orch.optimize_every_n_tasks
+         if mode.is_eval:
+             self.orch.optimize_every_n_tasks = 999999  # Disable optimization
+
+         try:
+             task_result = self.orch.run_task(
+                 purpose=case.input_purpose,
+                 initial_state=State(data=case.input_state),
+                 max_steps=case.max_steps,
+             )
+         except Exception as e:
+             logger.error(f"Case {case.id} failed: {e}")
+             task_result = TaskResult(
+                 trajectory=Trajectory(
+                     task_description=case.input_purpose, purpose=case.input_purpose,
+                 ),
+                 final_state=State(data={"_error": str(e)}),
+             )
+
+         # Restore
+         self.orch.optimize_every_n_tasks = saved_optimize
+
+         # Evaluate
+         bundle = self.eval_port.evaluate(
+             case, task_result.final_state.data, task_result.trajectory,
+         )
+
+         return V2EvalResult(
+             case_id=case.id,
+             iteration=iteration,
+             split=case.split,
+             bundle=bundle,
+             steps=task_result.total_steps,
+             wall_time_s=time.time() - start,
+         )
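
RunMode itself lives in purpose_agent/v2_types.py and is not part of this diff. The sketch below is a hypothetical reconstruction, inferred only from how benchmark_v2.py uses it (three members plus an is_eval check that gates optimization); the real definition in v2_types may differ.

from enum import Enum

# Hypothetical sketch of RunMode, inferred from its usage in benchmark_v2.py above.
class RunMode(Enum):
    LEARNING_TRAIN = "learning_train"            # train split: memory writes allowed
    LEARNING_VALIDATION = "learning_validation"  # validation split: used for tuning
    EVAL_TEST = "eval_test"                      # test split: strictly read-only

    @property
    def is_eval(self) -> bool:
        # _run_case checks this to disable optimization during evaluation runs
        return self is RunMode.EVAL_TEST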
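
An end-to-end usage sketch, not part of the commit: it assumes an existing Orchestrator instance named orch and that EvalCase is constructed with the fields the runner actually reads (id, input_purpose, input_state, split, max_steps); any other EvalCase fields, such as expected outputs for DictEvalPort, are omitted here.

from purpose_agent.benchmark_v2 import BenchmarkRunnerV2
from purpose_agent.evalport import EvalCase

# Hypothetical cases; only the fields read by BenchmarkRunnerV2 are shown.
cases = [
    EvalCase(id="t1", input_purpose="Summarize the report", input_state={}, split="train", max_steps=10),
    EvalCase(id="t2", input_purpose="Summarize the memo", input_state={}, split="validation", max_steps=10),
    EvalCase(id="t3", input_purpose="Summarize the brief", input_state={}, split="test", max_steps=10),
]

runner = BenchmarkRunnerV2(orchestrator=orch)  # orch: an Orchestrator built elsewhere

# Train on the train split, then score validation and test; only EVAL_TEST runs avoid memory writes.
result = runner.run(cases, train_iterations=3, eval_iterations=1, name="demo")
print(result.summary())
result.save("reports/demo.json")

# Ablation controls over the held-out cases.
test_cases = [c for c in cases if c.split == "test"]
train_cases = [c for c in cases if c.split == "train"]
print(runner.run_cold_warm(test_cases, train_cases))
print(runner.run_memory_ablation(test_cases))

As the code above shows, run_cold_warm flags the warm-minus-cold delta as significant only when it exceeds 5 points, and run_memory_ablation reports the drop in pass rate when the heuristic library is temporarily cleared.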
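
Because save() writes a plain JSON report, downstream tooling needs only the standard library to consume it. A minimal sketch, assuming a report was written to reports/demo.json as in the usage sketch above:

import json

# Load a report produced by V2BenchmarkResult.save() and print its contents.
with open("reports/demo.json") as f:
    report = json.load(f)

for split, stats in report["splits"].items():
    if stats:  # empty for splits that had no cases
        print(f"{split}: pass_rate={stats['pass_rate']:.1%}, avg_steps={stats['avg_steps']}, n={stats['n']}")

# Test-split learning curve from get_improvement_curve("test").
for point in report["curve"]:
    print(f"iteration {point['iteration']}: pass_rate={point['pass_rate']:.1%} (n={point['n']})")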