#!/usr/bin/env python3
"""
Track 2: REAL MODEL validation with Groq + Llama 3.3 70B.

Runs the self-improvement loop with an actual LLM, not mocks, to show that
Purpose Learning holds up under real inference.

Usage:
    export GROQ_API_KEY="gsk_..."
    python benchmarks/validate_real.py
"""
import json
import os
import sys
import time
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from purpose_agent.types import State, Action
from purpose_agent.llm_backend import resolve_backend, ChatMessage
from purpose_agent.orchestrator import Environment, Orchestrator

GROQ_KEY = os.environ.get("GROQ_API_KEY", "")
if not GROQ_KEY:
    print("Set GROQ_API_KEY to run this benchmark.")
    sys.exit(1)

MODEL = "groq:llama-3.3-70b-versatile"
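# The backend spec looks like "provider:model"; resolve_backend() is assumed
# to split on the colon and route to the matching client (not verified here).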


# ════════════════ Coding Environment ════════════════

class CodeEnv(Environment):
    def __init__(self, tests):
        self.tests = tests

    def execute(self, action, current_state):
        code = action.params.get("code", action.thought or "")
        # Fall back: if params lacked runnable code, scan expected_delta / thought for a def
        if not code.strip() or "def " not in code:
            for field in [action.expected_delta, action.thought]:
                if field and "def " in field:
                    code = field
                    break

        data = {**current_state.data, "attempts": current_state.data.get("attempts", 0) + 1}
        passed, fails = 0, []
        for tc in self.tests:
            try:
                ns = {}
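                # NOTE: exec/eval runs model-generated code in-process; fine
                # for a local benchmark, unsafe for untrusted input.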
                exec(code, ns)
                result = str(eval(tc["input"], ns))
                if result.strip() == str(tc["expected"]).strip():
                    passed += 1
                else:
                    fails.append(f'{tc["input"]}: want {tc["expected"]}, got {result}')
            except Exception as e:
                fails.append(f'{tc["input"]}: {type(e).__name__}: {e}')

        total = len(self.tests)
        data.update({
            "tests_passed": passed, "tests_total": total,
            "pass_rate": passed / total if total else 0,
            "all_passed": passed == total,
            "failures": fails[:3], "last_code": code[:500],
        })
        summary = f"Tests: {passed}/{total}" + (
            " | ALL PASSED βœ“" if passed == total else f" | Fails: {'; '.join(fails[:2])}"
        )
        return State(data=data, summary=summary)

    def reset(self):
        return State(data={"attempts": 0})

    def is_terminal(self, state):
        return state.data.get("all_passed", False)
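

# A minimal sketch (never called by the benchmark) of how CodeEnv scores a
# hand-written submission. The Action(params=...) constructor call is an
# assumption: this file only ever reads action.params / action.thought.
def _demo_code_env():
    env = CodeEnv([{"input": "fib(5)", "expected": "5"}])
    solution = (
        "def fib(n):\n"
        "    a, b = 0, 1\n"
        "    for _ in range(n):\n"
        "        a, b = b, a + b\n"
        "    return a\n"
    )
    state = env.execute(Action(params={"code": solution}), env.reset())
    print(state.summary)  # e.g. "Tests: 1/1 | ALL PASSED ✓"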


# ════════════════ Tasks ════════════════

TASKS = {
    "fibonacci": {
        "purpose": (
            "Write a Python function called fib(n) that returns the nth Fibonacci number. "
            "fib(0)=0, fib(1)=1, fib(5)=5, fib(10)=55. "
            "Use the submit_code action with your code in the 'code' parameter."
        ),
        "tests": [
            {"input": "fib(0)", "expected": "0"},
            {"input": "fib(1)", "expected": "1"},
            {"input": "fib(5)", "expected": "5"},
            {"input": "fib(10)", "expected": "55"},
        ],
    },
    "fizzbuzz": {
        "purpose": (
            "Write a Python function called fizzbuzz(n) that returns: "
            "'Fizz' if n is divisible by 3, 'Buzz' if by 5, 'FizzBuzz' if by both, else str(n). "
            "Use the submit_code action with your code in the 'code' parameter."
        ),
        "tests": [
            {"input": "fizzbuzz(3)", "expected": "Fizz"},
            {"input": "fizzbuzz(5)", "expected": "Buzz"},
            {"input": "fizzbuzz(15)", "expected": "FizzBuzz"},
            {"input": "fizzbuzz(7)", "expected": "7"},
        ],
    },
}
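
# New tasks follow the same shape; a hypothetical third entry would look like:
#
#     TASKS["reverse"] = {
#         "purpose": (
#             "Write a Python function called reverse_str(s) that returns s "
#             "reversed. Use the submit_code action with your code in the "
#             "'code' parameter."
#         ),
#         "tests": [{"input": "reverse_str('ab')", "expected": "ba"}],
#     }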


def run_task_with_real_model(task_name: str, orch: Orchestrator, run_num: int) -> dict:
    """Run one task and return metrics."""
    task = TASKS[task_name]
    env = CodeEnv(task["tests"])
    orch.environment = env

    start = time.time()
    try:
        result = orch.run_task(
            purpose=task["purpose"],
            initial_state=env.reset(),
            max_steps=3,
        )
        phi = result.final_phi or 0
        steps = result.total_steps
        pass_rate = result.final_state.data.get("pass_rate", 0)
        all_passed = result.final_state.data.get("all_passed", False)
    except Exception as e:
        print(f"    ERROR: {e}")
        phi, steps, pass_rate, all_passed = 0, 0, 0, False

    elapsed = time.time() - start
    n_heur = len(orch.optimizer.heuristic_library)

    status = "✓" if all_passed else "✗"
    print(f"  Run {run_num}: {status} Φ={phi:.1f} pass={pass_rate:.0%} steps={steps} heur={n_heur} ({elapsed:.1f}s)")

    return {
        "run": run_num, "phi": round(phi, 1), "pass_rate": round(pass_rate, 2),
        "steps": steps, "all_passed": all_passed, "heuristics": n_heur,
        "time_s": round(elapsed, 1),
    }
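
# One learning-curve entry per run, e.g. (illustrative values):
#     {"run": 1, "phi": 42.0, "pass_rate": 1.0, "steps": 2,
#      "all_passed": True, "heuristics": 3, "time_s": 4.1}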


def main():
    print("╔══════════════════════════════════════════════════════╗")
    print("β•‘  Track 2: REAL MODEL Validation                     β•‘")
    print(f"β•‘  Model: {MODEL:<44} β•‘")
    print("β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•\n")

    backend = resolve_backend(MODEL, api_key=GROQ_KEY)

    # Quick connection test
    print("Testing connection...")
    r = backend.generate(
        [ChatMessage(role="user", content="Say 'ok' and nothing else.")],
        temperature=0.1, max_tokens=500,
    )
    print(f"  Response: \"{r[:50]}\"")
    print()

    results = {}

    for task_name in TASKS:
        print(f"═══ {task_name} (3 runs, learning persists) ═══")

        env = CodeEnv(TASKS[task_name]["tests"])
        orch = Orchestrator(
            llm=backend,
            environment=env,
            available_actions={
                "submit_code": "Submit Python code. Put the code in the 'code' parameter.",
                "DONE": "Signal task completion",
            },
            optimize_every_n_tasks=1,
        )
        orch.optimizer.min_reward_threshold = 0.1

        curve = []
        for run_num in range(1, 4):  # 3 runs per task
            entry = run_task_with_real_model(task_name, orch, run_num)
            curve.append(entry)
            time.sleep(1)  # Rate limit courtesy

        results[task_name] = curve

        # Report delta
        if len(curve) >= 2:
            delta = curve[-1]["phi"] - curve[0]["phi"]
            if delta > 0:
                print(f"  β†’ Ξ”(Ξ¦) = {delta:+.1f} βœ“ IMPROVED")
            elif delta == 0:
                print(f"  β†’ Ξ”(Ξ¦) = {delta:+.1f} (no change)")
            else:
                print(f"  β†’ Ξ”(Ξ¦) = {delta:+.1f} (regressed)")
        print()

    # ═══ Final Report ═══
    print("╔══════════════════════════════════════════════════════╗")
    print("β•‘  RESULTS                                            β•‘")
    print("β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•")
    print(f"{'Task':<14} {'Run 1 Ξ¦':>8} {'Run 3 Ξ¦':>8} {'Delta':>8} {'Verdict'}")
    print("─" * 50)
    for task_name, curve in results.items():
        r1 = curve[0]["phi"]
        r3 = curve[-1]["phi"]
        delta = r3 - r1
        verdict = "✓ IMPROVED" if delta > 0 else "= SAME" if delta == 0 else "✗ REGRESSED"
        print(f"{task_name:<14} {r1:>8.1f} {r3:>8.1f} {delta:>+8.1f} {verdict}")

    # Save
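    # Illustrative shape of the saved file:
    #   {"model": "groq:llama-3.3-70b-versatile",
    #    "results": {"fibonacci": [{"run": 1, "phi": ..., ...}, ...]}}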
    os.makedirs("benchmarks/results", exist_ok=True)
    with open("benchmarks/results/real_model_results.json", "w") as f:
        json.dump({"model": MODEL, "results": results}, f, indent=2)
    print(f"\nSaved to benchmarks/results/real_model_results.json")


if __name__ == "__main__":
    main()