Rohan03
/

purpose-agent

+#!/usr/bin/env python3
+"""
+Track 2: REAL MODEL validation — Groq + Llama-3.3-70B (see MODEL below).
+Runs the self-improvement loop with an actual LLM, not mocks.
+Proves Purpose Learning works with real inference.
+Usage:
+    export GROQ_API_KEY="gsk_..."
+    python benchmarks/validate_real.py
+"""
+import sys, os, json, time
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+from purpose_agent.types import State, Action
+from purpose_agent.llm_backend import resolve_backend, ChatMessage
+from purpose_agent.orchestrator import Environment, Orchestrator
# Credentials are read from the environment; with no key there is nothing
# to benchmark, so the script stops with exit status 1.
GROQ_KEY = os.getenv("GROQ_API_KEY", "")
if len(GROQ_KEY) == 0:
    print("Set GROQ_API_KEY to run this benchmark.")
    raise SystemExit(1)

# Backend spec passed to resolve_backend() and recorded in the results file.
MODEL = "groq:llama-3.3-70b-versatile"
# ════════════════ Coding Environment ════════════════
class CodeEnv(Environment):
    """Environment that scores a submitted Python snippet against test cases.

    Each test case is a mapping with an ``"input"`` expression, evaluated in
    the namespace produced by executing the submission, and an ``"expected"``
    value that is compared with the stringified result.

    SECURITY: submissions are model-generated and are run in-process with
    ``exec``/``eval`` -- there is no sandbox and no timeout. Run this
    benchmark only in an environment you can afford to lose.
    """

    def __init__(self, tests):
        """Store the list of ``{"input": ..., "expected": ...}`` test cases."""
        self.tests = tests

    @staticmethod
    def _as_text(value):
        """Return *value* if it is a string, otherwise ``""``.

        Action fields come from parsed model output, so they may be ``None``
        or some other JSON type; those are treated as "no code supplied".
        """
        return value if isinstance(value, str) else ""

    @staticmethod
    def _strip_fence(text):
        """Return the body of the first Markdown code fence in *text*.

        A fence is a line whose first non-blank characters are three
        backticks. If the fence is never closed, everything after the opening
        line is returned. Text without a fence is returned unchanged.
        """
        lines = text.splitlines()
        start = next(
            (i for i, line in enumerate(lines) if line.lstrip().startswith("```")),
            None,
        )
        if start is None:
            return text
        end = next(
            (
                i
                for i in range(start + 1, len(lines))
                if lines[i].lstrip().startswith("```")
            ),
            len(lines),
        )
        return "\n".join(lines[start + 1:end])

    def execute(self, action, current_state):
        """Run the submitted code against every test case.

        The code is taken from ``action.params["code"]`` (defaulting to
        ``action.thought``); if that is blank or contains no ``def``, the
        first of ``action.expected_delta`` / ``action.thought`` that contains
        a ``def`` is used instead. A surrounding Markdown fence is removed.

        Returns a new ``State`` whose ``data`` extends ``current_state.data``
        with ``attempts`` (incremented), ``tests_passed``, ``tests_total``,
        ``pass_rate``, ``all_passed``, ``failures`` (at most 3) and
        ``last_code`` (at most 500 characters), and whose ``summary`` is a
        one-line pass/fail report.
        """
        params = action.params or {}
        code = self._as_text(params.get("code", action.thought or ""))
        # Try to extract code from other fields if not usable from params.
        if not code.strip() or "def " not in code:
            for field in (action.expected_delta, action.thought):
                candidate = self._as_text(field)
                if "def " in candidate:
                    code = candidate
                    break
        # Models frequently wrap code in ```python ... ```; exec() would
        # reject the fence lines with a SyntaxError on every test.
        code = self._strip_fence(code)

        data = {
            **current_state.data,
            "attempts": current_state.data.get("attempts", 0) + 1,
        }
        passed, fails = 0, []
        for tc in self.tests:
            try:
                # Fresh namespace per test so one case cannot affect another.
                ns = {}
                exec(code, ns)
                result = str(eval(tc["input"], ns))
                if result.strip() == str(tc["expected"]).strip():
                    passed += 1
                else:
                    fails.append(
                        f'{tc["input"]}: want {tc["expected"]}, got {result}'
                    )
            except (Exception, SystemExit) as e:
                # SystemExit is caught too: a submission calling exit() must
                # count as a failed test, not terminate the benchmark.
                fails.append(f'{tc["input"]}: {type(e).__name__}: {e}')

        total = len(self.tests)
        data.update({
            "tests_passed": passed, "tests_total": total,
            "pass_rate": passed / total if total else 0,
            "all_passed": passed == total,
            "failures": fails[:3], "last_code": code[:500],
        })
        summary = f"Tests: {passed}/{total}" + (
            " | ALL PASSED ✓" if passed == total else f" | Fails: {'; '.join(fails[:2])}"
        )
        return State(data=data, summary=summary)

    def reset(self):
        """Return the initial state: zero attempts, no test results yet."""
        return State(data={"attempts": 0})

    def is_terminal(self, state):
        """Return True once a state records that every test passed."""
        return state.data.get("all_passed", False)
# ════════════════ Tasks ════════════════
# Sentence appended to every prompt telling the model how to hand in code.
_SUBMIT_HINT = "Use the submit_code action with your code in the 'code' parameter."


def _cases(*pairs):
    """Expand (expression, expected) pairs into test-case dicts."""
    return [{"input": expr, "expected": want} for expr, want in pairs]


# Task name -> {"purpose": prompt given to the orchestrator,
#               "tests": cases handed to CodeEnv}.
TASKS = {
    "fibonacci": {
        "purpose": (
            "Write a Python function called fib(n) that returns the nth Fibonacci number. "
            "fib(0)=0, fib(1)=1, fib(5)=5, fib(10)=55. "
            + _SUBMIT_HINT
        ),
        "tests": _cases(
            ("fib(0)", "0"),
            ("fib(1)", "1"),
            ("fib(5)", "5"),
            ("fib(10)", "55"),
        ),
    },
    "fizzbuzz": {
        "purpose": (
            "Write a Python function called fizzbuzz(n) that returns: "
            "'Fizz' if n is divisible by 3, 'Buzz' if by 5, 'FizzBuzz' if by both, else str(n). "
            + _SUBMIT_HINT
        ),
        "tests": _cases(
            ("fizzbuzz(3)", "Fizz"),
            ("fizzbuzz(5)", "Buzz"),
            ("fizzbuzz(15)", "FizzBuzz"),
            ("fizzbuzz(7)", "7"),
        ),
    },
}
def run_task_with_real_model(
    task_name: str, orch: Orchestrator, run_num: int, max_steps: int = 3
) -> dict:
    """Run one task once and return its metrics.

    Args:
        task_name: Key into ``TASKS``.
        orch: Orchestrator to run with; its ``environment`` attribute is
            replaced by a fresh ``CodeEnv`` for this task's tests.
        run_num: Run index, used only for the printed line and the result.
        max_steps: Step budget forwarded to ``orch.run_task``.

    Returns:
        A dict with ``run``, ``phi``, ``pass_rate``, ``steps``,
        ``all_passed``, ``heuristics`` and ``time_s``. If the run raises, the
        error is printed and phi/steps/pass_rate are reported as 0 with
        ``all_passed`` False, so one failed run does not abort the benchmark.
    """
    task = TASKS[task_name]
    env = CodeEnv(task["tests"])
    orch.environment = env

    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    start = time.perf_counter()
    try:
        result = orch.run_task(
            purpose=task["purpose"],
            initial_state=env.reset(),
            max_steps=max_steps,
        )
        phi = result.final_phi or 0
        steps = result.total_steps
        pass_rate = result.final_state.data.get("pass_rate", 0)
        all_passed = result.final_state.data.get("all_passed", False)
    except Exception as e:
        print(f"    ERROR: {e}")
        phi, steps, pass_rate, all_passed = 0, 0, 0, False
    elapsed = time.perf_counter() - start

    n_heur = len(orch.optimizer.heuristic_library)
    status = "✓" if all_passed else "✗"
    print(f"  Run {run_num}: {status} Φ={phi:.1f} pass={pass_rate:.0%} steps={steps} heur={n_heur} ({elapsed:.1f}s)")
    return {
        "run": run_num, "phi": round(phi, 1), "pass_rate": round(pass_rate, 2),
        "steps": steps, "all_passed": all_passed, "heuristics": n_heur,
        "time_s": round(elapsed, 1),
    }
def main():
    """Run every task in ``TASKS`` several times and report the change in Φ.

    For each task one orchestrator is created and reused for all runs, then
    the first and last run's Φ are compared. Results are printed as a table
    and written to ``results/real_model_results.json`` next to this script.
    """
    n_runs = 3  # runs per task; the same orchestrator is reused across them

    print("╔══════════════════════════════════════════════════════╗")
    print("║  Track 2: REAL MODEL Validation                     ║")
    print(f"║  Model: {MODEL:<44} ║")
    print("╚══════════════════════════════════════════════════════╝\n")
    backend = resolve_backend(MODEL, api_key=GROQ_KEY)

    # Quick connection test
    print("Testing connection...")
    r = backend.generate(
        [ChatMessage(role="user", content="Say 'ok' and nothing else.")],
        temperature=0.1, max_tokens=500,
    )
    print(f"  Response: \"{r[:50]}\"")
    print()

    results = {}
    for task_name in TASKS:
        print(f"═══ {task_name} ({n_runs} runs, learning persists) ═══")
        env = CodeEnv(TASKS[task_name]["tests"])
        orch = Orchestrator(
            llm=backend,
            environment=env,
            available_actions={
                "submit_code": "Submit Python code. Put the code in the 'code' parameter.",
                "DONE": "Signal task completion",
            },
            optimize_every_n_tasks=1,
        )
        orch.optimizer.min_reward_threshold = 0.1
        curve = []
        for run_num in range(1, n_runs + 1):
            entry = run_task_with_real_model(task_name, orch, run_num)
            curve.append(entry)
            time.sleep(1)  # Rate limit courtesy
        results[task_name] = curve

        # Report delta between the first and the last run.
        if len(curve) >= 2:
            delta = curve[-1]["phi"] - curve[0]["phi"]
            if delta > 0:
                print(f"  → Δ(Φ) = {delta:+.1f} ✓ IMPROVED")
            elif delta == 0:
                print(f"  → Δ(Φ) = {delta:+.1f} (no change)")
            else:
                print(f"  → Δ(Φ) = {delta:+.1f} (regressed)")
        print()

    # ═══ Final Report ═══
    print("╔══════════════════════════════════════════════════════╗")
    print("║  RESULTS                                            ║")
    print("╚══════════════════════════════════════════════════════╝")
    last_label = f"Run {n_runs} Φ"
    print(f"{'Task':<14} {'Run 1 Φ':>8} {last_label:>8} {'Delta':>8} {'Verdict'}")
    print("─" * 50)
    for task_name, curve in results.items():
        r1 = curve[0]["phi"]
        r3 = curve[-1]["phi"]
        delta = r3 - r1
        verdict = "✓ IMPROVED" if delta > 0 else "= SAME" if delta == 0 else "✗ REGRESSED"
        print(f"{task_name:<14} {r1:>8.1f} {r3:>8.1f} {delta:>+8.1f} {verdict}")

    # Save next to this script rather than relative to the current working
    # directory, so the output lands in the same place wherever the script is
    # launched from (identical to the old location when run from the repo
    # root as `python benchmarks/validate_real.py`).
    results_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "results")
    os.makedirs(results_dir, exist_ok=True)
    out_path = os.path.join(results_dir, "real_model_results.json")
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump({"model": MODEL, "results": results}, f, indent=2)
    print(f"\nSaved to {out_path}")


if __name__ == "__main__":
    main()