Rohan03 committed on
Commit
8bb75c0
Β·
verified Β·
1 Parent(s): 965bdfb

test: real-world regression test script

Browse files
Files changed (1) hide show
  1. tests/real_world_regression.py +379 -0
tests/real_world_regression.py ADDED
@@ -0,0 +1,379 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Real-world regression test: exercises the Purpose Agent against live LLMs
via OpenRouter.

Covers five domains with three models:
  * Coding           - fibonacci, fizzbuzz, palindrome, factorial
  * Reasoning        - logic puzzles
  * Math             - arithmetic word problems
  * Self-improvement - learning curve across repeated runs
  * Security         - immune system scored on real model outputs

Models exercised:
  * meta-llama/llama-3.3-70b-instruct (large, reliable)
  * google/gemma-4-26b-a4b-it         (medium, fast)
  * qwen/qwen3.5-9b                   (small, budget)

Usage:
    export OPENROUTER_API_KEY="sk-or-..."
    python3 tests/real_world_regression.py
"""
import json
import os
import sys
import time

# Fail fast when the OpenRouter credential is missing: every model test
# below would otherwise fail one by one with opaque auth errors.
OR_KEY = os.environ.get("OPENROUTER_API_KEY", "")
if not OR_KEY:
    print("Set OPENROUTER_API_KEY")
    sys.exit(1)

# Use the installed package (not a source checkout).
import purpose_agent as pa
from purpose_agent.llm_backend import resolve_backend
from purpose_agent.orchestrator import Environment
from purpose_agent.types import State, Action
from purpose_agent.immune import scan_memory
from purpose_agent.memory import MemoryCard, MemoryKind
from purpose_agent.breakthroughs import AdversarialHardener

# (label, backend spec) pairs that the suite iterates over.
MODELS = [
    ("llama-3.3-70b", "openrouter:meta-llama/llama-3.3-70b-instruct"),
    ("gemma-4-26b", "openrouter:google/gemma-4-26b-a4b-it"),
    ("qwen3.5-9b", "openrouter:qwen/qwen3.5-9b"),
]

# Run-wide tallies, mutated by record().
RESULTS = []
PASS = 0
FAIL = 0
45
+
46
+
47
def record(model, domain, test_name, passed, detail="", duration=0):
    """Tally one test outcome, append it to RESULTS, and echo a summary line.

    Mutates the module-level PASS/FAIL counters and the RESULTS list.
    """
    global PASS, FAIL
    if passed:
        PASS += 1
        status, icon = "PASS", "βœ“"
    else:
        FAIL += 1
        status, icon = "FAIL", "βœ—"
    entry = {
        "model": model,
        "domain": domain,
        "test": test_name,
        "status": status,
        "detail": detail,
        "duration_s": round(duration, 1),
    }
    RESULTS.append(entry)
    # Only show a timing suffix when a nonzero duration was supplied.
    suffix = f" ({duration:.1f}s)" if duration else ""
    print(f" {icon} {test_name}: {detail[:80]}" + suffix)
60
+
61
+
62
+ # ═══════════════════════════════════════════════════════════════
63
+ # CODING ENVIRONMENT
64
+ # ═══════════════════════════════════════════════════════════════
65
+
66
class CodeEnv(Environment):
    """Environment that scores LLM-submitted Python code against unit tests.

    Each test case is a dict {"input": <expression>, "expected": <str>}; the
    submitted code is exec'd into a fresh namespace and each input expression
    is eval'd there, comparing str(result) against the expected string.
    """

    def __init__(self, tests):
        self.tests = tests

    def execute(self, action, state):
        code = action.params.get("code", "")
        if not code or "def " not in code:
            from purpose_agent.robust_parser import extract_code
            # Fall back to scraping code out of the thought/delta text.
            # FIX: default to "" — if both extract_code calls return None the
            # original left `code` as None, which crashed on `code[:300]` below.
            code = (extract_code(action.thought or "")
                    or extract_code(action.expected_delta or "")
                    or "")
        data = {**state.data, "attempts": state.data.get("attempts", 0) + 1}
        passed, fails = 0, []
        for tc in self.tests:
            try:
                ns = {}
                # SECURITY NOTE: exec/eval of model-generated code is deliberate
                # in this test harness; never run it on untrusted input elsewhere.
                exec(code, ns)
                r = str(eval(tc["input"], ns))
                if r.strip() == tc["expected"].strip():
                    passed += 1
                else:
                    fails.append(f'{tc["input"]}: got {r}')
            except Exception as e:
                # Record the exception type so the failure detail stays short.
                fails.append(f'{tc["input"]}: {type(e).__name__}')
        total = len(self.tests)
        rate = passed / total if total else 0
        data.update({"pass_rate": rate, "all_passed": passed == total,
                     "failures": fails[:3], "last_code": code[:300]})
        return State(data=data,
                     summary=f"Tests: {passed}/{total}" + (" ALL PASSED" if passed == total else ""))

    def reset(self):
        """Return a fresh state with a zeroed attempt counter."""
        return State(data={"attempts": 0})

    def is_terminal(self, state):
        """The task is done once every unit test has passed."""
        return state.data.get("all_passed", False)
100
+
101
+
102
+ # ═══════════════════════════════════════════════════════════════
103
+ # REASONING/MATH ENVIRONMENT
104
+ # ═══════════════════════════════════════════════════════════════
105
+
106
class AnswerEnv(Environment):
    """Environment that checks a free-text answer for an expected substring."""

    def __init__(self, expected_answer):
        # Normalize once so every comparison is case/whitespace-insensitive.
        self.expected = expected_answer.lower().strip()

    def execute(self, action, state):
        raw = action.params.get("answer", "") or action.thought or ""
        answer = raw.lower().strip()
        correct = self.expected in answer
        verdict = "CORRECT" if correct else "WRONG"
        return State(
            data={"answer": answer, "correct": correct, "expected": self.expected},
            summary=f"{verdict}: got '{answer[:50]}', expected '{self.expected}'",
        )

    def reset(self):
        """Return an empty starting state."""
        return State(data={})

    def is_terminal(self, state):
        """The task is done once a correct answer has been recorded."""
        return state.data.get("correct", False)
123
+
124
+
125
+ # ═══════════════════════════════════════════════════════════════
126
+ # TEST CASES
127
+ # ═══════════════════════════════════════════════════════════════
128
+
129
# Coding fixtures: each entry supplies the agent's `purpose` prompt plus the
# test cases CodeEnv evaluates (eval of "input" compared to "expected" as str).
CODING_TASKS = [
    {
        "name": "fibonacci",
        "purpose": "Write a Python function fib(n) returning the nth Fibonacci number. fib(0)=0, fib(1)=1, fib(5)=5, fib(10)=55. Use submit_code action.",
        "tests": [
            {"input": "fib(0)", "expected": "0"}, {"input": "fib(1)", "expected": "1"},
            {"input": "fib(5)", "expected": "5"}, {"input": "fib(10)", "expected": "55"},
        ],
    },
    {
        "name": "fizzbuzz",
        "purpose": "Write fizzbuzz(n): 'Fizz' if n%3==0, 'Buzz' if n%5==0, 'FizzBuzz' if both, else str(n). Use submit_code action.",
        "tests": [
            {"input": "fizzbuzz(3)", "expected": "Fizz"}, {"input": "fizzbuzz(5)", "expected": "Buzz"},
            {"input": "fizzbuzz(15)", "expected": "FizzBuzz"}, {"input": "fizzbuzz(7)", "expected": "7"},
        ],
    },
    {
        "name": "is_palindrome",
        "purpose": "Write is_palindrome(s) returning True if string s is a palindrome, False otherwise. Use submit_code action.",
        "tests": [
            {"input": "is_palindrome('racecar')", "expected": "True"},
            {"input": "is_palindrome('hello')", "expected": "False"},
            # Edge case: the empty string counts as a palindrome.
            {"input": "is_palindrome('')", "expected": "True"},
        ],
    },
    {
        "name": "factorial",
        "purpose": "Write factorial(n) returning n!. factorial(0)=1, factorial(5)=120. Use submit_code action.",
        "tests": [
            {"input": "factorial(0)", "expected": "1"}, {"input": "factorial(5)", "expected": "120"},
            {"input": "factorial(10)", "expected": "3628800"},
        ],
    },
]

# Reasoning fixtures: "expected" is a substring AnswerEnv looks for
# (case-insensitively) in the model's answer.
REASONING_TASKS = [
    {
        "name": "logic_deduction",
        "purpose": "If all roses are flowers, and some flowers fade quickly, can we conclude all roses fade quickly? Answer with 'no'. Use answer action.",
        "expected": "no",
    },
    {
        "name": "sequence",
        "purpose": "What comes next: 2, 6, 18, 54, ? Answer with just the number using answer action.",
        "expected": "162",
    },
]

# Math fixtures: same substring-match contract as REASONING_TASKS; these are
# run through the same answer-checking harness.
MATH_TASKS = [
    {
        "name": "word_problem",
        "purpose": "A store sells apples for $2 each. If you buy 7 apples and pay with $20, how much change do you get? Answer with just the number using answer action.",
        "expected": "6",
    },
    {
        "name": "percentage",
        "purpose": "What is 15% of 200? Answer with just the number using answer action.",
        "expected": "30",
    },
]
190
+
191
+
192
def run_coding_test(model_name, backend, task):
    """Drive one coding task through the orchestrator and record its outcome."""
    env = CodeEnv(task["tests"])
    orch = pa.Orchestrator(
        llm=backend, environment=env,
        available_actions={"submit_code": "Submit Python code in params.code", "DONE": "Done"},
        optimize_every_n_tasks=99,
    )
    started = time.time()
    try:
        result = orch.run_task(purpose=task["purpose"], initial_state=env.reset(), max_steps=2)
        final = result.final_state.data
        rate = final.get("pass_rate", 0)
        all_pass = final.get("all_passed", False)
        detail = f"pass_rate={rate:.0%}"
        if not all_pass:
            # Include the first failure to keep the summary line short.
            detail += f" failures={final.get('failures', [])[:1]}"
        record(model_name, "coding", task["name"], all_pass, detail, time.time() - started)
    except Exception as e:
        record(model_name, "coding", task["name"], False, str(e)[:80], time.time() - started)
210
+
211
+
212
def run_reasoning_test(model_name, backend, task):
    """Drive one reasoning/math task through the orchestrator and record it."""
    env = AnswerEnv(task["expected"])
    orch = pa.Orchestrator(
        llm=backend, environment=env,
        available_actions={"answer": "Submit your answer in params.answer", "DONE": "Done"},
        optimize_every_n_tasks=99,
    )
    started = time.time()
    try:
        result = orch.run_task(purpose=task["purpose"], initial_state=env.reset(), max_steps=2)
        final = result.final_state.data
        correct = final.get("correct", False)
        answer = final.get("answer", "")[:50]
        record(model_name, "reasoning", task["name"], correct,
               f"answer='{answer}' expected='{task['expected']}'", time.time() - started)
    except Exception as e:
        record(model_name, "reasoning", task["name"], False, str(e)[:80], time.time() - started)
230
+
231
+
232
def run_learning_test(model_name, backend):
    """Test that the heuristic library grows across runs (self-improvement signal).

    Runs the same trivial coding task three times with optimization after every
    task, then asserts that the optimizer's heuristic library got larger.
    """
    tests = [{"input": "fib(5)", "expected": "5"}]
    env = CodeEnv(tests)
    orch = pa.Orchestrator(
        llm=backend, environment=env,
        available_actions={"submit_code": "Submit code in params.code", "DONE": "Done"},
        optimize_every_n_tasks=1,
    )
    # Accept almost any reward so heuristics are extracted even from weak runs.
    orch.optimizer.min_reward_threshold = 0.01

    heuristic_counts = []
    for run in range(1, 4):
        try:
            orch.run_task(purpose="Write fib(n): fib(5)=5. Use submit_code.",
                          initial_state=env.reset(), max_steps=2)
        except Exception:
            # A failed run still contributes a data point; we only measure
            # library growth. FIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit and made the loop uninterruptible.
            pass
        heuristic_counts.append(len(orch.optimizer.heuristic_library))
        time.sleep(0.3)

    grew = heuristic_counts[-1] > heuristic_counts[0]
    record(model_name, "learning", "heuristic_growth",
           grew, f"heuristics={heuristic_counts}", 0)
255
+
256
+
257
def run_security_test():
    """Exercise the immune-system hardener; needs no LLM backend."""
    hardener = AdversarialHardener()
    report = hardener.run(n_adversarial=30, n_benign=10)

    caught = report["adversarial_caught"]
    adversarial_total = report["adversarial_total"]
    record("immune_system", "security", "adversarial_catch",
           report["catch_rate"] >= 0.75,
           f"catch={report['catch_rate']:.0%} ({caught}/{adversarial_total})")

    benign_total = report["benign_total"]
    false_positives = benign_total - report["benign_passed"]
    record("immune_system", "security", "false_positive",
           report["false_positive_rate"] <= 0.15,
           f"fp={report['false_positive_rate']:.0%} ({false_positives}/{benign_total})")
267
+
268
+
269
+ # ═══════════════════════════════════════════════════════════════
270
+ # MAIN
271
+ # ═══════════════════════════════════════════════════════════════
272
+
273
def main():
    """Run the full regression suite, print a report, and save JSON results."""
    print("╔════════════════════════════════════════════════════════════╗")
    print("β•‘ REAL-WORLD REGRESSION TEST β€” Purpose Agent v2.0.0 β•‘")
    print("β•‘ Provider: OpenRouter | 3 models Γ— 10 test cases β•‘")
    print("β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•")
    print()

    # Security first: it requires no model backend.
    print("═══ SECURITY ═══")
    run_security_test()
    print()

    for model_name, model_spec in MODELS:
        print(f"═══ MODEL: {model_name} ═══")
        try:
            backend = resolve_backend(model_spec, api_key=OR_KEY)
        except Exception as e:
            print(f" βœ— Failed to create backend: {e}")
            continue

        # Coding
        print(f" [Coding]")
        for task in CODING_TASKS:
            run_coding_test(model_name, backend, task)
            time.sleep(0.3)  # gentle pacing between API calls

        # Reasoning
        print(f" [Reasoning]")
        for task in REASONING_TASKS:
            run_reasoning_test(model_name, backend, task)
            time.sleep(0.3)

        # Math (same answer-checking harness as reasoning)
        print(f" [Math]")
        for task in MATH_TASKS:
            run_reasoning_test(model_name, backend, task)
            time.sleep(0.3)

        # Self-improvement
        print(f" [Self-Improvement]")
        run_learning_test(model_name, backend)

        print()

    # ═══ REPORT ═══
    print("╔════════════════════════════════════════════════════════════╗")
    print("β•‘ RESULTS β•‘")
    print("β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•")
    print()
    total = PASS + FAIL
    print(f" PASS: {PASS} | FAIL: {FAIL} | Total: {total}")
    # FIX: guard the pass-rate division — the original raised ZeroDivisionError
    # when zero tests ran (e.g. every backend failed to initialize).
    overall_rate = PASS / total * 100 if total else 0.0
    print(f" Pass rate: {overall_rate:.1f}%")
    print()

    # Per-model summary
    by_model = {}
    for r in RESULTS:
        stats = by_model.setdefault(r["model"], {"pass": 0, "fail": 0, "total_time": 0})
        stats["pass" if r["status"] == "PASS" else "fail"] += 1
        stats["total_time"] += r["duration_s"]

    print(f" {'Model':<20} {'Pass':>6} {'Fail':>6} {'Rate':>8} {'Time':>8}")
    print(f" {'─'*50}")
    for model, stats in by_model.items():
        model_total = stats["pass"] + stats["fail"]
        rate = stats["pass"] / model_total if model_total else 0
        print(f" {model:<20} {stats['pass']:>6} {stats['fail']:>6} {rate:>7.0%} {stats['total_time']:>7.1f}s")

    # Per-domain summary
    print()
    by_domain = {}
    for r in RESULTS:
        stats = by_domain.setdefault(r["domain"], {"pass": 0, "fail": 0})
        stats["pass" if r["status"] == "PASS" else "fail"] += 1

    print(f" {'Domain':<20} {'Pass':>6} {'Fail':>6} {'Rate':>8}")
    print(f" {'─'*42}")
    for domain, stats in by_domain.items():
        domain_total = stats["pass"] + stats["fail"]
        rate = stats["pass"] / domain_total if domain_total else 0
        print(f" {domain:<20} {stats['pass']:>6} {stats['fail']:>6} {rate:>7.0%}")

    # Failures detail
    failures = [r for r in RESULTS if r["status"] == "FAIL"]
    if failures:
        print(f"\n FAILURES ({len(failures)}):")
        for f in failures:
            print(f" βœ— [{f['model']}] {f['domain']}/{f['test']}: {f['detail'][:60]}")

    # Persist machine-readable results for CI/regression diffing.
    os.makedirs("tests/results", exist_ok=True)
    with open("tests/results/real_world_regression.json", "w") as f:
        json.dump({"pass": PASS, "fail": FAIL, "results": RESULTS}, f, indent=2)
    print(f"\n Saved to tests/results/real_world_regression.json")
376
+
377
+
378
# Script entry point: only run the suite when executed directly, not on import.
if __name__ == "__main__":
    main()