#!/usr/bin/env python3
"""
Track 2: REAL MODEL validation (Groq + Llama 3.3 70B).

Runs the self-improvement loop with an actual LLM, not mocks, to show
that Purpose Learning holds up under real inference.

Usage:
    export GROQ_API_KEY="gsk_..."
    python benchmarks/validate_real.py
"""
import sys, os, json, time
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from purpose_agent.types import State, Action
from purpose_agent.llm_backend import resolve_backend, ChatMessage
from purpose_agent.orchestrator import Environment, Orchestrator
GROQ_KEY = os.environ.get("GROQ_API_KEY", "")
if not GROQ_KEY:
    print("Set GROQ_API_KEY to run this benchmark.")
    sys.exit(1)
MODEL = "groq:llama-3.3-70b-versatile"
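# Backend spec; resolve_backend appears to take a "provider:model-id" string.
# Groq model IDs change over time, so update this if the model is retired.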
# ════════════════ Coding Environment ════════════════
class CodeEnv(Environment):
    def __init__(self, tests):
        self.tests = tests

    def execute(self, action, current_state):
        code = action.params.get("code", action.thought or "")
        # Try to extract code from other action fields if not in params
        if not code.strip() or "def " not in code:
            for field in [action.expected_delta, action.thought]:
                if field and "def " in field:
                    code = field
                    break
        data = {**current_state.data, "attempts": current_state.data.get("attempts", 0) + 1}
        passed, fails = 0, []
        for tc in self.tests:
            try:
                # Run the candidate code in a fresh namespace, then eval each
                # test expression against it. NOTE: exec/eval of model-written
                # code is unsandboxed; fine for a local benchmark, not for
                # untrusted input.
                ns = {}
                exec(code, ns)
                result = str(eval(tc["input"], ns))
                if result.strip() == str(tc["expected"]).strip():
                    passed += 1
                else:
                    fails.append(f'{tc["input"]}: want {tc["expected"]}, got {result}')
            except Exception as e:
                fails.append(f'{tc["input"]}: {type(e).__name__}: {e}')
        total = len(self.tests)
        data.update({
            "tests_passed": passed, "tests_total": total,
            "pass_rate": passed / total if total else 0,
            "all_passed": passed == total,
            "failures": fails[:3], "last_code": code[:500],
        })
        summary = f"Tests: {passed}/{total}" + (
            " | ALL PASSED ✓" if passed == total else f" | Fails: {'; '.join(fails[:2])}"
        )
        return State(data=data, summary=summary)

    def reset(self):
        return State(data={"attempts": 0})

    def is_terminal(self, state):
        return state.data.get("all_passed", False)
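
# Illustrative sketch of how CodeEnv scores a submission (comments only, not
# executed; assumes Action can be constructed with just a params dict):
#
#   env = CodeEnv([{"input": "fib(2)", "expected": "1"}])
#   action = Action(params={"code": "def fib(n):\n    return n if n < 2 else fib(n-1) + fib(n-2)"})
#   state = env.execute(action, env.reset())
#   state.data["pass_rate"]   # 1.0 once the single test passes
#   env.is_terminal(state)    # True when all tests pass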
# ════════════════ Tasks ════════════════
TASKS = {
    "fibonacci": {
        "purpose": (
            "Write a Python function called fib(n) that returns the nth Fibonacci number. "
            "fib(0)=0, fib(1)=1, fib(5)=5, fib(10)=55. "
            "Use the submit_code action with your code in the 'code' parameter."
        ),
        "tests": [
            {"input": "fib(0)", "expected": "0"},
            {"input": "fib(1)", "expected": "1"},
            {"input": "fib(5)", "expected": "5"},
            {"input": "fib(10)", "expected": "55"},
        ],
    },
    "fizzbuzz": {
        "purpose": (
            "Write a Python function called fizzbuzz(n) that returns: "
            "'Fizz' if n is divisible by 3, 'Buzz' if by 5, 'FizzBuzz' if by both, else str(n). "
            "Use the submit_code action with your code in the 'code' parameter."
        ),
        "tests": [
            {"input": "fizzbuzz(3)", "expected": "Fizz"},
            {"input": "fizzbuzz(5)", "expected": "Buzz"},
            {"input": "fizzbuzz(15)", "expected": "FizzBuzz"},
            {"input": "fizzbuzz(7)", "expected": "7"},
        ],
    },
}
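# Task schema: "purpose" is handed to the model verbatim as its goal; each
# test's "input" is eval'd in the namespace built from the submitted code and
# the stringified result is compared (stripped) against "expected".
# A hypothetical extra task would follow the same shape:
#
#   "reverse": {
#       "purpose": "Write a Python function reverse_str(s) that returns s "
#                  "reversed. Use the submit_code action with your code in "
#                  "the 'code' parameter.",
#       "tests": [{"input": "reverse_str('ab')", "expected": "ba"}],
#   },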
def run_task_with_real_model(task_name: str, orch: Orchestrator, run_num: int) -> dict:
    """Run one task and return metrics."""
    task = TASKS[task_name]
    env = CodeEnv(task["tests"])
    orch.environment = env
    start = time.time()
    try:
        result = orch.run_task(
            purpose=task["purpose"],
            initial_state=env.reset(),
            max_steps=3,
        )
        phi = result.final_phi or 0
        steps = result.total_steps
        pass_rate = result.final_state.data.get("pass_rate", 0)
        all_passed = result.final_state.data.get("all_passed", False)
    except Exception as e:
        print(f" ERROR: {e}")
        phi, steps, pass_rate, all_passed = 0, 0, 0, False
    elapsed = time.time() - start
    n_heur = len(orch.optimizer.heuristic_library)
    status = "✓" if all_passed else "✗"
    print(f" Run {run_num}: {status} Φ={phi:.1f} pass={pass_rate:.0%} steps={steps} heur={n_heur} ({elapsed:.1f}s)")
    return {
        "run": run_num, "phi": round(phi, 1), "pass_rate": round(pass_rate, 2),
        "steps": steps, "all_passed": all_passed, "heuristics": n_heur,
        "time_s": round(elapsed, 1),
    }
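
# Each curve entry mirrors the dict above; values below are illustrative only:
#   {"run": 1, "phi": 12.5, "pass_rate": 1.0, "steps": 2,
#    "all_passed": True, "heuristics": 1, "time_s": 3.4}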
def main():
    print("╔═════════════════════════════════════════════════════╗")
    print("║           Track 2: REAL MODEL Validation            ║")
    print(f"║ Model: {MODEL:<44} ║")
    print("╚═════════════════════════════════════════════════════╝\n")
    backend = resolve_backend(MODEL, api_key=GROQ_KEY)

    # Quick connection test
    print("Testing connection...")
    r = backend.generate(
        [ChatMessage(role="user", content="Say 'ok' and nothing else.")],
        temperature=0.1, max_tokens=500,
    )
    print(f" Response: \"{r[:50]}\"")
    print()

    results = {}
    for task_name in TASKS:
        print(f"═══ {task_name} (3 runs, learning persists) ═══")
        env = CodeEnv(TASKS[task_name]["tests"])
        orch = Orchestrator(
            llm=backend,
            environment=env,
            available_actions={
                "submit_code": "Submit Python code. Put the code in the 'code' parameter.",
                "DONE": "Signal task completion",
            },
            optimize_every_n_tasks=1,
        )
        orch.optimizer.min_reward_threshold = 0.1
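        # optimize_every_n_tasks=1 should make the optimizer distill heuristics
        # after every run, so lessons from run 1 are available by run 2; the
        # lowered min_reward_threshold (assumed to gate which experiences the
        # optimizer keeps) lets even partial passes seed the heuristic library.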
        curve = []
        for run_num in range(1, 4):  # 3 runs per task
            entry = run_task_with_real_model(task_name, orch, run_num)
            curve.append(entry)
            time.sleep(1)  # Rate-limit courtesy
        results[task_name] = curve

        # Report delta across the learning curve
        if len(curve) >= 2:
            delta = curve[-1]["phi"] - curve[0]["phi"]
            if delta > 0:
                print(f" → Δ(Φ) = {delta:+.1f} ✓ IMPROVED")
            elif delta == 0:
                print(f" → Δ(Φ) = {delta:+.1f} (no change)")
            else:
                print(f" → Δ(Φ) = {delta:+.1f} (regressed)")
        print()
    # ═══ Final Report ═══
    print("╔═════════════════════════════════════════════════════╗")
    print("║                       RESULTS                       ║")
    print("╚═════════════════════════════════════════════════════╝")
    print(f"{'Task':<14} {'Run 1 Φ':>8} {'Run 3 Φ':>8} {'Delta':>8} {'Verdict'}")
    print("─" * 50)
    for task_name, curve in results.items():
        r1 = curve[0]["phi"]
        r3 = curve[-1]["phi"]
        delta = r3 - r1
        verdict = "✓ IMPROVED" if delta > 0 else "= SAME" if delta == 0 else "✗ REGRESSED"
        print(f"{task_name:<14} {r1:>8.1f} {r3:>8.1f} {delta:>+8.1f} {verdict}")

    # Save
    os.makedirs("benchmarks/results", exist_ok=True)
    with open("benchmarks/results/real_model_results.json", "w") as f:
        json.dump({"model": MODEL, "results": results}, f, indent=2)
    print("\nSaved to benchmarks/results/real_model_results.json")


if __name__ == "__main__":
    main()