Rohan03 committed on
Commit
d7dc6c8
·
verified ·
1 Parent(s): 22fb57a

fix: real-model robustness — benchmarks/validate_real.py

Browse files
Files changed (1) hide show
  1. benchmarks/validate_real.py +215 -0
benchmarks/validate_real.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Track 2: REAL MODEL validation — Groq + Llama-3.3-70B (see MODEL below).
4
+
5
+ Runs the self-improvement loop with an actual LLM, not mocks.
6
+ Proves Purpose Learning works with real inference.
7
+
8
+ Usage:
9
+ export GROQ_API_KEY="gsk_..."
10
+ python benchmarks/validate_real.py
11
+ """
12
+ import sys, os, json, time
13
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
14
+
15
+ from purpose_agent.types import State, Action
16
+ from purpose_agent.llm_backend import resolve_backend, ChatMessage
17
+ from purpose_agent.orchestrator import Environment, Orchestrator
18
+
19
# Groq API key, read once at import time; the benchmark cannot run without it.
GROQ_KEY = os.environ.get("GROQ_API_KEY", "")
if not GROQ_KEY:
    # sys.exit(<str>) writes the message to stderr and exits with status 1.
    sys.exit("Set GROQ_API_KEY to run this benchmark.")

# Backend spec string handed to resolve_backend().
MODEL = "groq:llama-3.3-70b-versatile"
25
+
26
+
27
+ # ════════════════ Coding Environment ════════════════
28
+
29
class CodeEnv(Environment):
    """Environment that scores a submitted code snippet against test cases.

    Each test case is a dict with an ``"input"`` expression, evaluated in the
    namespace produced by executing the submission, and an ``"expected"``
    value that is compared with the result as a stripped string.
    """

    def __init__(self, tests):
        """Store the list of test-case dicts used by :meth:`execute`."""
        self.tests = tests

    @staticmethod
    def _extract_code(action):
        """Return the submitted source text from *action* as a string.

        Prefers ``action.params["code"]`` (defaulting to ``action.thought``
        when the key is absent). If that text is blank or contains no
        ``"def "``, falls back to the first of ``action.expected_delta`` /
        ``action.thought`` that does contain ``"def "``.
        """
        params = action.params or {}
        code = params.get("code", action.thought or "")
        # A model may send null or a non-string for 'code'; never crash on it.
        if not isinstance(code, str):
            code = "" if code is None else str(code)

        if not code.strip() or "def " not in code:
            for field in (action.expected_delta, action.thought):
                if isinstance(field, str) and "def " in field:
                    return field
        return code

    def execute(self, action, current_state):
        """Run the submission against every test case and return the new State.

        The returned state's data is a copy of ``current_state.data`` with
        ``attempts`` incremented and the keys ``tests_passed``,
        ``tests_total``, ``pass_rate``, ``all_passed``, ``failures`` (first 3)
        and ``last_code`` (first 500 characters) updated.
        """
        code = self._extract_code(action)

        data = {**current_state.data, "attempts": current_state.data.get("attempts", 0) + 1}
        passed, fails = 0, []
        for tc in self.tests:
            try:
                # NOTE(security): exec/eval run model-generated code in this
                # process with no sandbox; only use with trusted backends.
                # A fresh namespace per test keeps test cases independent.
                ns = {}
                exec(code, ns)
                result = str(eval(tc["input"], ns))
                if result.strip() == str(tc["expected"]).strip():
                    passed += 1
                else:
                    fails.append(f'{tc["input"]}: want {tc["expected"]}, got {result}')
            except Exception as e:
                # Any failure in the submission counts as a failed test.
                fails.append(f'{tc["input"]}: {type(e).__name__}: {e}')

        total = len(self.tests)
        data.update({
            "tests_passed": passed, "tests_total": total,
            "pass_rate": passed / total if total else 0,
            "all_passed": passed == total,
            "failures": fails[:3], "last_code": code[:500],
        })
        summary = f"Tests: {passed}/{total}" + (
            " | ALL PASSED ✓" if passed == total else f" | Fails: {'; '.join(fails[:2])}"
        )
        return State(data=data, summary=summary)

    def reset(self):
        """Return the initial State, with the attempt counter at zero."""
        return State(data={"attempts": 0})

    def is_terminal(self, state):
        """Return True once the state records that every test passed."""
        return state.data.get("all_passed", False)
73
+
74
+
75
+ # ════════════════ Tasks ════════════════
76
+
77
# Benchmark tasks keyed by name. Each entry carries the natural-language
# "purpose" given to the agent and the "tests" used to score a submission:
# "input" is an expression to evaluate, "expected" its stringified result.
TASKS = {
    "fibonacci": {
        "purpose": (
            "Write a Python function called fib(n) that returns the nth Fibonacci number. "
            "fib(0)=0, fib(1)=1, fib(5)=5, fib(10)=55. "
            "Use the submit_code action with your code in the 'code' parameter."
        ),
        "tests": [
            {"input": "fib(0)", "expected": "0"},
            {"input": "fib(1)", "expected": "1"},
            {"input": "fib(5)", "expected": "5"},
            {"input": "fib(10)", "expected": "55"},
        ],
    },
    "fizzbuzz": {
        "purpose": (
            "Write a Python function called fizzbuzz(n) that returns: "
            "'Fizz' if n is divisible by 3, 'Buzz' if by 5, 'FizzBuzz' if by both, else str(n). "
            "Use the submit_code action with your code in the 'code' parameter."
        ),
        "tests": [
            {"input": "fizzbuzz(3)", "expected": "Fizz"},
            {"input": "fizzbuzz(5)", "expected": "Buzz"},
            {"input": "fizzbuzz(15)", "expected": "FizzBuzz"},
            {"input": "fizzbuzz(7)", "expected": "7"},
        ],
    },
}
105
+
106
+
107
def run_task_with_real_model(task_name: str, orch: Orchestrator, run_num: int) -> dict:
    """Run one task once on *orch* and return a metrics record.

    Args:
        task_name: Key into ``TASKS``.
        orch: Orchestrator to run; its ``environment`` attribute is replaced
            with a fresh ``CodeEnv`` for this task.
        run_num: Run index, used only for the printed line and the record.

    Returns:
        Dict with ``run``, ``phi``, ``pass_rate``, ``steps``, ``all_passed``,
        ``heuristics``, ``time_s`` and ``error`` (``None`` on success,
        otherwise ``"<ExceptionType>: <message>"``).
    """
    task = TASKS[task_name]
    env = CodeEnv(task["tests"])
    orch.environment = env

    error = None
    # perf_counter is monotonic, so elapsed time is immune to clock changes.
    start = time.perf_counter()
    try:
        result = orch.run_task(
            purpose=task["purpose"],
            initial_state=env.reset(),
            max_steps=3,
        )
        phi = result.final_phi or 0
        steps = result.total_steps
        pass_rate = result.final_state.data.get("pass_rate", 0)
        all_passed = result.final_state.data.get("all_passed", False)
    except Exception as e:
        # Best-effort benchmark: report the failure and score the run as zero,
        # but keep the reason so it is not mistaken for a genuine zero score.
        print(f" ERROR: {e}")
        error = f"{type(e).__name__}: {e}"
        phi, steps, pass_rate, all_passed = 0, 0, 0, False

    elapsed = time.perf_counter() - start
    n_heur = len(orch.optimizer.heuristic_library)

    status = "✓" if all_passed else "✗"
    print(f" Run {run_num}: {status} Φ={phi:.1f} pass={pass_rate:.0%} steps={steps} heur={n_heur} ({elapsed:.1f}s)")

    return {
        "run": run_num, "phi": round(phi, 1), "pass_rate": round(pass_rate, 2),
        "steps": steps, "all_passed": all_passed, "heuristics": n_heur,
        "time_s": round(elapsed, 1),
        "error": error,
    }
139
+
140
+
141
def main():
    """Run every task in TASKS several times on one backend and report Φ deltas.

    For each task a fresh Orchestrator is built and reused across the runs,
    then the first and last run's ``phi`` are compared. All per-run records
    are written to ``benchmarks/results/real_model_results.json`` (relative
    to the current working directory).
    """
    num_runs = 3  # runs per task; single source for the loop and the labels
    results_dir = "benchmarks/results"
    results_path = f"{results_dir}/real_model_results.json"

    print("╔══════════════════════════════════════════════════════╗")
    print("║ Track 2: REAL MODEL Validation ║")
    print(f"║ Model: {MODEL:<44} ║")
    print("╚══════════════════════════════════════════════════════╝\n")

    backend = resolve_backend(MODEL, api_key=GROQ_KEY)

    # Quick connection test
    print("Testing connection...")
    r = backend.generate(
        [ChatMessage(role="user", content="Say 'ok' and nothing else.")],
        temperature=0.1, max_tokens=500,
    )
    # str() keeps the preview safe if the backend returns a non-string.
    preview = str(r)[:50]
    print(f' Response: "{preview}"')
    print()

    results = {}

    for task_name in TASKS:
        print(f"═══ {task_name} ({num_runs} runs, learning persists) ═══")

        env = CodeEnv(TASKS[task_name]["tests"])
        orch = Orchestrator(
            llm=backend,
            environment=env,
            available_actions={
                "submit_code": "Submit Python code. Put the code in the 'code' parameter.",
                "DONE": "Signal task completion",
            },
            optimize_every_n_tasks=1,
        )
        orch.optimizer.min_reward_threshold = 0.1

        curve = []
        for run_num in range(1, num_runs + 1):
            entry = run_task_with_real_model(task_name, orch, run_num)
            curve.append(entry)
            time.sleep(1)  # Rate limit courtesy

        results[task_name] = curve

        # Report delta
        if len(curve) >= 2:
            delta = curve[-1]["phi"] - curve[0]["phi"]
            if delta > 0:
                print(f" → Δ(Φ) = {delta:+.1f} ✓ IMPROVED")
            elif delta == 0:
                print(f" → Δ(Φ) = {delta:+.1f} (no change)")
            else:
                print(f" → Δ(Φ) = {delta:+.1f} (regressed)")
        print()

    # ═══ Final Report ═══
    last_label = f"Run {num_runs} Φ"
    print("╔══════════════════════════════════════════════════════╗")
    print("║ RESULTS ║")
    print("╚══════════════════════════════════════════════════════╝")
    print(f"{'Task':<14} {'Run 1 Φ':>8} {last_label:>8} {'Delta':>8} {'Verdict'}")
    print("─" * 50)
    for task_name, curve in results.items():
        r1 = curve[0]["phi"]
        r3 = curve[-1]["phi"]
        delta = r3 - r1
        if delta > 0:
            verdict = "✓ IMPROVED"
        elif delta == 0:
            verdict = "= SAME"
        else:
            verdict = "✗ REGRESSED"
        print(f"{task_name:<14} {r1:>8.1f} {r3:>8.1f} {delta:>+8.1f} {verdict}")

    # Save
    os.makedirs(results_dir, exist_ok=True)
    with open(results_path, "w", encoding="utf-8") as f:
        json.dump({"model": MODEL, "results": results}, f, indent=2)
    print(f"\nSaved to {results_path}")


if __name__ == "__main__":
    main()