#!/usr/bin/env python3
"""
Track 2: REAL MODEL validation (Groq + Llama 3.3 70B).

Runs the self-improvement loop with an actual LLM, not mocks, to show
that Purpose Learning holds up under real inference.

Usage:
    export GROQ_API_KEY="gsk_..."
    python benchmarks/validate_real.py
"""
import sys, os, json, time
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from purpose_agent.types import State, Action
from purpose_agent.llm_backend import resolve_backend, ChatMessage
from purpose_agent.orchestrator import Environment, Orchestrator
GROQ_KEY = os.environ.get("GROQ_API_KEY", "")
if not GROQ_KEY:
    print("Set GROQ_API_KEY to run this benchmark.")
    sys.exit(1)
MODEL = "groq:llama-3.3-70b-versatile"
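# Backend spec; resolve_backend appears to take a "provider:model-id" string.
# Groq model IDs change over time, so update this if the model is retired.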
# ════════════════ Coding Environment ════════════════
class CodeEnv(Environment):
    def __init__(self, tests):
        self.tests = tests

    def execute(self, action, current_state):
        code = action.params.get("code", action.thought or "")
        # Try to extract code from other action fields if not in params
        if not code.strip() or "def " not in code:
            for field in [action.expected_delta, action.thought]:
                if field and "def " in field:
                    code = field
                    break
        data = {**current_state.data, "attempts": current_state.data.get("attempts", 0) + 1}
        passed, fails = 0, []
        for tc in self.tests:
            try:
                # Run the candidate code in a fresh namespace, then eval each
                # test expression against it. NOTE: exec/eval of model-written
                # code is unsandboxed; fine for a local benchmark, not for
                # untrusted input.
                ns = {}
                exec(code, ns)
                result = str(eval(tc["input"], ns))
                if result.strip() == str(tc["expected"]).strip():
                    passed += 1
                else:
                    fails.append(f'{tc["input"]}: want {tc["expected"]}, got {result}')
            except Exception as e:
                fails.append(f'{tc["input"]}: {type(e).__name__}: {e}')
        total = len(self.tests)
        data.update({
            "tests_passed": passed, "tests_total": total,
            "pass_rate": passed / total if total else 0,
            "all_passed": passed == total,
            "failures": fails[:3], "last_code": code[:500],
        })
        summary = f"Tests: {passed}/{total}" + (
            " | ALL PASSED ✓" if passed == total else f" | Fails: {'; '.join(fails[:2])}"
        )
        return State(data=data, summary=summary)

    def reset(self):
        return State(data={"attempts": 0})

    def is_terminal(self, state):
        return state.data.get("all_passed", False)
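
# Illustrative sketch of how CodeEnv scores a submission (comments only, not
# executed; assumes Action can be constructed with just a params dict):
#
#   env = CodeEnv([{"input": "fib(2)", "expected": "1"}])
#   action = Action(params={"code": "def fib(n):\n    return n if n < 2 else fib(n-1) + fib(n-2)"})
#   state = env.execute(action, env.reset())
#   state.data["pass_rate"]   # 1.0 once the single test passes
#   env.is_terminal(state)    # True when all tests pass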
# ════════════════ Tasks ════════════════
TASKS = {
    "fibonacci": {
        "purpose": (
            "Write a Python function called fib(n) that returns the nth Fibonacci number. "
            "fib(0)=0, fib(1)=1, fib(5)=5, fib(10)=55. "
            "Use the submit_code action with your code in the 'code' parameter."
        ),
        "tests": [
            {"input": "fib(0)", "expected": "0"},
            {"input": "fib(1)", "expected": "1"},
            {"input": "fib(5)", "expected": "5"},
            {"input": "fib(10)", "expected": "55"},
        ],
    },
    "fizzbuzz": {
        "purpose": (
            "Write a Python function called fizzbuzz(n) that returns: "
            "'Fizz' if n is divisible by 3, 'Buzz' if by 5, 'FizzBuzz' if by both, else str(n). "
            "Use the submit_code action with your code in the 'code' parameter."
        ),
        "tests": [
            {"input": "fizzbuzz(3)", "expected": "Fizz"},
            {"input": "fizzbuzz(5)", "expected": "Buzz"},
            {"input": "fizzbuzz(15)", "expected": "FizzBuzz"},
            {"input": "fizzbuzz(7)", "expected": "7"},
        ],
    },
}
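# Task schema: "purpose" is handed to the model verbatim as its goal; each
# test's "input" is eval'd in the namespace built from the submitted code and
# the stringified result is compared (stripped) against "expected".
# A hypothetical extra task would follow the same shape:
#
#   "reverse": {
#       "purpose": "Write a Python function reverse_str(s) that returns s "
#                  "reversed. Use the submit_code action with your code in "
#                  "the 'code' parameter.",
#       "tests": [{"input": "reverse_str('ab')", "expected": "ba"}],
#   },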
def run_task_with_real_model(task_name: str, orch: Orchestrator, run_num: int) -> dict:
    """Run one task and return metrics."""
    task = TASKS[task_name]
    env = CodeEnv(task["tests"])
    orch.environment = env
    start = time.time()
    try:
        result = orch.run_task(
            purpose=task["purpose"],
            initial_state=env.reset(),
            max_steps=3,
        )
        phi = result.final_phi or 0
        steps = result.total_steps
        pass_rate = result.final_state.data.get("pass_rate", 0)
        all_passed = result.final_state.data.get("all_passed", False)
    except Exception as e:
        print(f" ERROR: {e}")
        phi, steps, pass_rate, all_passed = 0, 0, 0, False
    elapsed = time.time() - start
    n_heur = len(orch.optimizer.heuristic_library)
    status = "✓" if all_passed else "✗"
    print(f" Run {run_num}: {status} Φ={phi:.1f} pass={pass_rate:.0%} steps={steps} heur={n_heur} ({elapsed:.1f}s)")
    return {
        "run": run_num, "phi": round(phi, 1), "pass_rate": round(pass_rate, 2),
        "steps": steps, "all_passed": all_passed, "heuristics": n_heur,
        "time_s": round(elapsed, 1),
    }
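
# Each curve entry mirrors the dict above; values below are illustrative only:
#   {"run": 1, "phi": 12.5, "pass_rate": 1.0, "steps": 2,
#    "all_passed": True, "heuristics": 1, "time_s": 3.4}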
def main():
    print("╔═════════════════════════════════════════════════════╗")
    print("║           Track 2: REAL MODEL Validation            ║")
    print(f"║ Model: {MODEL:<44} ║")
    print("╚═════════════════════════════════════════════════════╝\n")
    backend = resolve_backend(MODEL, api_key=GROQ_KEY)

    # Quick connection test
    print("Testing connection...")
    r = backend.generate(
        [ChatMessage(role="user", content="Say 'ok' and nothing else.")],
        temperature=0.1, max_tokens=500,
    )
    print(f" Response: \"{r[:50]}\"")
    print()

    results = {}
    for task_name in TASKS:
        print(f"═══ {task_name} (3 runs, learning persists) ═══")
        env = CodeEnv(TASKS[task_name]["tests"])
        orch = Orchestrator(
            llm=backend,
            environment=env,
            available_actions={
                "submit_code": "Submit Python code. Put the code in the 'code' parameter.",
                "DONE": "Signal task completion",
            },
            optimize_every_n_tasks=1,
        )
        orch.optimizer.min_reward_threshold = 0.1
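        # optimize_every_n_tasks=1 should make the optimizer distill heuristics
        # after every run, so lessons from run 1 are available by run 2; the
        # lowered min_reward_threshold (assumed to gate which experiences the
        # optimizer keeps) lets even partial passes seed the heuristic library.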
        curve = []
        for run_num in range(1, 4):  # 3 runs per task
            entry = run_task_with_real_model(task_name, orch, run_num)
            curve.append(entry)
            time.sleep(1)  # Rate-limit courtesy
        results[task_name] = curve

        # Report delta across the learning curve
        if len(curve) >= 2:
            delta = curve[-1]["phi"] - curve[0]["phi"]
            if delta > 0:
                print(f" → Δ(Φ) = {delta:+.1f} ✓ IMPROVED")
            elif delta == 0:
                print(f" → Δ(Φ) = {delta:+.1f} (no change)")
            else:
                print(f" → Δ(Φ) = {delta:+.1f} (regressed)")
        print()
    # ═══ Final Report ═══
    print("╔═════════════════════════════════════════════════════╗")
    print("║                       RESULTS                       ║")
    print("╚═════════════════════════════════════════════════════╝")
    print(f"{'Task':<14} {'Run 1 Φ':>8} {'Run 3 Φ':>8} {'Delta':>8} {'Verdict'}")
    print("─" * 50)
    for task_name, curve in results.items():
        r1 = curve[0]["phi"]
        r3 = curve[-1]["phi"]
        delta = r3 - r1
        verdict = "✓ IMPROVED" if delta > 0 else "= SAME" if delta == 0 else "✗ REGRESSED"
        print(f"{task_name:<14} {r1:>8.1f} {r3:>8.1f} {delta:>+8.1f} {verdict}")

    # Save
    os.makedirs("benchmarks/results", exist_ok=True)
    with open("benchmarks/results/real_model_results.json", "w") as f:
        json.dump({"model": MODEL, "results": results}, f, indent=2)
    print("\nSaved to benchmarks/results/real_model_results.json")


if __name__ == "__main__":
    main()