#!/usr/bin/env python3
"""
Track 2: REAL MODEL validation — Groq + Qwen3-32B.

Runs the self-improvement loop with an actual LLM, not mocks.
Proves Purpose Learning works with real inference.

Usage:
    export GROQ_API_KEY="gsk_..."
    python benchmarks/validate_real.py
"""
import sys, os, json, time

sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from purpose_agent.types import State, Action
from purpose_agent.llm_backend import resolve_backend, ChatMessage
from purpose_agent.orchestrator import Environment, Orchestrator

GROQ_KEY = os.environ.get("GROQ_API_KEY", "")
if not GROQ_KEY:
    print("Set GROQ_API_KEY to run this benchmark.")
    sys.exit(1)

MODEL = "groq:llama-3.3-70b-versatile"


# ════════════════ Coding Environment ════════════════

class CodeEnv(Environment):
    """Environment that grades LLM-submitted Python code against fixed test cases.

    State data tracks attempt count, pass/fail tallies, truncated failure
    messages, and the last submitted code; the episode is terminal once
    every test case passes.
    """

    def __init__(self, tests):
        # tests: list of {"input": <expr string>, "expected": <expected str>}
        self.tests = tests

    def execute(self, action, current_state):
        """Execute the submitted code, run all test cases, return the new State."""
        code = action.params.get("code", action.thought or "")
        # Models sometimes put the code in the wrong field — fall back to
        # expected_delta / thought if 'code' looks empty or has no function def.
        if not code.strip() or "def " not in code:
            for field in (action.expected_delta, action.thought):
                if field and "def " in field:
                    code = field
                    break

        data = {
            **current_state.data,
            "attempts": current_state.data.get("attempts", 0) + 1,
        }
        passed, fails = 0, []
        for tc in self.tests:
            try:
                ns = {}
                # SECURITY: exec/eval on model output is only acceptable in a
                # trusted benchmark sandbox — never do this with untrusted input.
                exec(code, ns)
                result = str(eval(tc["input"], ns))
                if result.strip() == str(tc["expected"]).strip():
                    passed += 1
                else:
                    fails.append(f'{tc["input"]}: want {tc["expected"]}, got {result}')
            except Exception as e:
                fails.append(f'{tc["input"]}: {type(e).__name__}: {e}')

        total = len(self.tests)
        data.update({
            "tests_passed": passed,
            "tests_total": total,
            "pass_rate": passed / total if total else 0,
            # Guard total > 0: an empty test set must not count as "solved",
            # otherwise is_terminal() would end the episode immediately.
            "all_passed": total > 0 and passed == total,
            "failures": fails[:3],          # keep state compact
            "last_code": code[:500],        # truncate for prompt budget
        })
        summary = f"Tests: {passed}/{total}" + (
            " | ALL PASSED ✓" if passed == total
            else f" | Fails: {'; '.join(fails[:2])}"
        )
        return State(data=data, summary=summary)

    def reset(self):
        """Fresh state with a zeroed attempt counter."""
        return State(data={"attempts": 0})

    def is_terminal(self, state):
        """Episode ends once every test case has passed."""
        return state.data.get("all_passed", False)


# ════════════════ Tasks ════════════════

TASKS = {
    "fibonacci": {
        "purpose": (
            "Write a Python function called fib(n) that returns the nth Fibonacci number. "
            "fib(0)=0, fib(1)=1, fib(5)=5, fib(10)=55. "
            "Use the submit_code action with your code in the 'code' parameter."
        ),
        "tests": [
            {"input": "fib(0)", "expected": "0"},
            {"input": "fib(1)", "expected": "1"},
            {"input": "fib(5)", "expected": "5"},
            {"input": "fib(10)", "expected": "55"},
        ],
    },
    "fizzbuzz": {
        "purpose": (
            "Write a Python function called fizzbuzz(n) that returns: "
            "'Fizz' if n is divisible by 3, 'Buzz' if by 5, 'FizzBuzz' if by both, else str(n). "
            "Use the submit_code action with your code in the 'code' parameter."
        ),
        "tests": [
            {"input": "fizzbuzz(3)", "expected": "Fizz"},
            {"input": "fizzbuzz(5)", "expected": "Buzz"},
            {"input": "fizzbuzz(15)", "expected": "FizzBuzz"},
            {"input": "fizzbuzz(7)", "expected": "7"},
        ],
    },
}


def run_task_with_real_model(task_name: str, orch: Orchestrator, run_num: int) -> dict:
    """Run one task and return metrics.

    Swaps the orchestrator's environment for a fresh CodeEnv so that
    test-grading state resets while the optimizer's learned heuristics persist
    across runs. Errors are caught and reported as a zeroed run rather than
    aborting the benchmark.
    """
    task = TASKS[task_name]
    env = CodeEnv(task["tests"])
    orch.environment = env
    start = time.time()

    try:
        result = orch.run_task(
            purpose=task["purpose"],
            initial_state=env.reset(),
            max_steps=3,
        )
        phi = result.final_phi or 0
        steps = result.total_steps
        pass_rate = result.final_state.data.get("pass_rate", 0)
        all_passed = result.final_state.data.get("all_passed", False)
    except Exception as e:
        print(f" ERROR: {e}")
        phi, steps, pass_rate, all_passed = 0, 0, 0, False

    elapsed = time.time() - start
    n_heur = len(orch.optimizer.heuristic_library)
    status = "✓" if all_passed else "✗"
    print(
        f" Run {run_num}: {status} Φ={phi:.1f} pass={pass_rate:.0%} "
        f"steps={steps} heur={n_heur} ({elapsed:.1f}s)"
    )
    return {
        "run": run_num,
        "phi": round(phi, 1),
        "pass_rate": round(pass_rate, 2),
        "steps": steps,
        "all_passed": all_passed,
        "heuristics": n_heur,
        "time_s": round(elapsed, 1),
    }

def main():
    """Run the real-model validation benchmark and save results to JSON.

    For each task: builds a fresh Orchestrator (learning persists across the
    3 runs of that task), records the Φ curve, prints per-task deltas and a
    final summary table, then writes everything to
    benchmarks/results/real_model_results.json.
    """
    print("╔══════════════════════════════════════════════════════╗")
    print("║ Track 2: REAL MODEL Validation ║")
    print(f"║ Model: {MODEL:<44} ║")
    print("╚══════════════════════════════════════════════════════╝\n")

    backend = resolve_backend(MODEL, api_key=GROQ_KEY)

    # Quick connection test — fail fast before burning task runs.
    print("Testing connection...")
    r = backend.generate(
        [ChatMessage(role="user", content="Say 'ok' and nothing else.")],
        temperature=0.1,
        max_tokens=500,
    )
    print(f" Response: \"{r[:50]}\"")
    print()

    results = {}
    for task_name in TASKS:
        print(f"═══ {task_name} (3 runs, learning persists) ═══")
        env = CodeEnv(TASKS[task_name]["tests"])
        orch = Orchestrator(
            llm=backend,
            environment=env,
            available_actions={
                "submit_code": "Submit Python code. Put the code in the 'code' parameter.",
                "DONE": "Signal task completion",
            },
            optimize_every_n_tasks=1,
        )
        orch.optimizer.min_reward_threshold = 0.1

        curve = []
        for run_num in range(1, 4):  # 3 runs per task
            entry = run_task_with_real_model(task_name, orch, run_num)
            curve.append(entry)
            time.sleep(1)  # Rate limit courtesy
        results[task_name] = curve

        # Report Φ delta between first and last run of this task.
        if len(curve) >= 2:
            delta = curve[-1]["phi"] - curve[0]["phi"]
            if delta > 0:
                print(f" → Δ(Φ) = {delta:+.1f} ✓ IMPROVED")
            elif delta == 0:
                print(f" → Δ(Φ) = {delta:+.1f} (no change)")
            else:
                print(f" → Δ(Φ) = {delta:+.1f} (regressed)")
        print()

    # ═══ Final Report ═══
    print("╔══════════════════════════════════════════════════════╗")
    print("║ RESULTS ║")
    print("╚══════════════════════════════════════════════════════╝")
    print(f"{'Task':<14} {'Run 1 Φ':>8} {'Run 3 Φ':>8} {'Delta':>8} {'Verdict'}")
    print("─" * 50)
    for task_name, curve in results.items():
        r1 = curve[0]["phi"]
        r3 = curve[-1]["phi"]
        delta = r3 - r1
        verdict = "✓ IMPROVED" if delta > 0 else "= SAME" if delta == 0 else "✗ REGRESSED"
        print(f"{task_name:<14} {r1:>8.1f} {r3:>8.1f} {delta:>+8.1f} {verdict}")

    # Save results for downstream analysis.
    os.makedirs("benchmarks/results", exist_ok=True)
    with open("benchmarks/results/real_model_results.json", "w", encoding="utf-8") as f:
        json.dump({"model": MODEL, "results": results}, f, indent=2)
    print("\nSaved to benchmarks/results/real_model_results.json")


if __name__ == "__main__":
    main()