#!/usr/bin/env python3
"""
Track 2: REAL MODEL validation via Groq (llama-3.3-70b-versatile).
Runs the self-improvement loop with an actual LLM, not mocks.
Demonstrates that Purpose Learning works with real inference.
Usage:
export GROQ_API_KEY="gsk_..."
python benchmarks/validate_real.py
"""
import sys, os, json, time
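# Make the repo root importable so purpose_agent resolves regardless of cwd.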
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from purpose_agent.types import State, Action
from purpose_agent.llm_backend import resolve_backend, ChatMessage
from purpose_agent.orchestrator import Environment, Orchestrator
GROQ_KEY = os.environ.get("GROQ_API_KEY", "")
if not GROQ_KEY:
print("Set GROQ_API_KEY to run this benchmark.")
sys.exit(1)
MODEL = "groq:llama-3.3-70b-versatile"
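# "provider:model" spec consumed by resolve_backend() below; any Groq-hosted
# chat model id should work here.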
# ──────────────── Coding Environment ────────────────
class CodeEnv(Environment):
def __init__(self, tests):
self.tests = tests
def execute(self, action, current_state):
code = action.params.get("code", action.thought or "")
# Try to extract code from thought if not in params
if not code.strip() or "def " not in code:
for field in [action.expected_delta, action.thought]:
if field and "def " in field:
code = field
break
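        # Copy the episode state and count this submission attempt.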
data = {**current_state.data, "attempts": current_state.data.get("attempts", 0) + 1}
passed, fails = 0, []
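        # NOTE: exec/eval run model-written code in this process. Acceptable for
        # a trusted benchmark; a subprocess or container sandbox would be safer.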
for tc in self.tests:
try:
ns = {}
exec(code, ns)
result = str(eval(tc["input"], ns))
if result.strip() == str(tc["expected"]).strip():
passed += 1
else:
fails.append(f'{tc["input"]}: want {tc["expected"]}, got {result}')
except Exception as e:
fails.append(f'{tc["input"]}: {type(e).__name__}: {e}')
total = len(self.tests)
data.update({
"tests_passed": passed, "tests_total": total,
"pass_rate": passed / total if total else 0,
"all_passed": passed == total,
"failures": fails[:3], "last_code": code[:500],
})
        summary = f"Tests: {passed}/{total}" + (
            " | ALL PASSED ✓" if passed == total else f" | Fails: {'; '.join(fails[:2])}"
        )
return State(data=data, summary=summary)
def reset(self):
return State(data={"attempts": 0})
def is_terminal(self, state):
return state.data.get("all_passed", False)
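# CodeEnv needs no LLM, so it can be smoke-tested directly. A minimal sketch
# (assumes Action accepts a `params` dict; adjust to the real constructor):
#   env = CodeEnv([{"input": "add(2, 3)", "expected": "5"}])
#   state = env.execute(Action(params={"code": "def add(a, b): return a + b"}), env.reset())
#   assert env.is_terminal(state)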
# ──────────────── Tasks ────────────────
TASKS = {
"fibonacci": {
"purpose": (
"Write a Python function called fib(n) that returns the nth Fibonacci number. "
"fib(0)=0, fib(1)=1, fib(5)=5, fib(10)=55. "
"Use the submit_code action with your code in the 'code' parameter."
),
"tests": [
{"input": "fib(0)", "expected": "0"},
{"input": "fib(1)", "expected": "1"},
{"input": "fib(5)", "expected": "5"},
{"input": "fib(10)", "expected": "55"},
],
},
"fizzbuzz": {
"purpose": (
"Write a Python function called fizzbuzz(n) that returns: "
"'Fizz' if n is divisible by 3, 'Buzz' if by 5, 'FizzBuzz' if by both, else str(n). "
"Use the submit_code action with your code in the 'code' parameter."
),
"tests": [
{"input": "fizzbuzz(3)", "expected": "Fizz"},
{"input": "fizzbuzz(5)", "expected": "Buzz"},
{"input": "fizzbuzz(15)", "expected": "FizzBuzz"},
{"input": "fizzbuzz(7)", "expected": "7"},
],
},
}
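# Each test pairs an expression eval'd in the submission's namespace with the
# expected str() of its result (see CodeEnv.execute).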
def run_task_with_real_model(task_name: str, orch: Orchestrator, run_num: int) -> dict:
"""Run one task and return metrics."""
task = TASKS[task_name]
env = CodeEnv(task["tests"])
orch.environment = env
start = time.time()
try:
result = orch.run_task(
purpose=task["purpose"],
initial_state=env.reset(),
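            # Cap each run at three actions (roughly: submit, read failures, resubmit).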
max_steps=3,
)
phi = result.final_phi or 0
steps = result.total_steps
pass_rate = result.final_state.data.get("pass_rate", 0)
all_passed = result.final_state.data.get("all_passed", False)
except Exception as e:
print(f" ERROR: {e}")
phi, steps, pass_rate, all_passed = 0, 0, 0, False
elapsed = time.time() - start
n_heur = len(orch.optimizer.heuristic_library)
    status = "✓" if all_passed else "✗"
    print(f" Run {run_num}: {status} Φ={phi:.1f} pass={pass_rate:.0%} steps={steps} heur={n_heur} ({elapsed:.1f}s)")
return {
"run": run_num, "phi": round(phi, 1), "pass_rate": round(pass_rate, 2),
"steps": steps, "all_passed": all_passed, "heuristics": n_heur,
"time_s": round(elapsed, 1),
}
def main():
print("ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ")
print("β Track 2: REAL MODEL Validation β")
print(f"β Model: {MODEL:<44} β")
print("ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ\n")
backend = resolve_backend(MODEL, api_key=GROQ_KEY)
# Quick connection test
print("Testing connection...")
r = backend.generate(
[ChatMessage(role="user", content="Say 'ok' and nothing else.")],
temperature=0.1, max_tokens=500,
)
print(f" Response: \"{r[:50]}\"")
print()
results = {}
for task_name in TASKS:
print(f"βββ {task_name} (3 runs, learning persists) βββ")
env = CodeEnv(TASKS[task_name]["tests"])
orch = Orchestrator(
llm=backend,
environment=env,
available_actions={
"submit_code": "Submit Python code. Put the code in the 'code' parameter.",
"DONE": "Signal task completion",
},
optimize_every_n_tasks=1,
)
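        # Assumption: lowering min_reward_threshold lets partially-passing early
        # runs still contribute heuristics to the optimizer's library.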
orch.optimizer.min_reward_threshold = 0.1
curve = []
for run_num in range(1, 4): # 3 runs per task
entry = run_task_with_real_model(task_name, orch, run_num)
curve.append(entry)
time.sleep(1) # Rate limit courtesy
results[task_name] = curve
# Report delta
if len(curve) >= 2:
delta = curve[-1]["phi"] - curve[0]["phi"]
            if delta > 0:
                print(f" → Δ(Φ) = {delta:+.1f} ✓ IMPROVED")
            elif delta == 0:
                print(f" → Δ(Φ) = {delta:+.1f} (no change)")
            else:
                print(f" → Δ(Φ) = {delta:+.1f} (regressed)")
print()
# βββ Final Report βββ
print("ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ")
print("β RESULTS β")
print("ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ")
print(f"{'Task':<14} {'Run 1 Ξ¦':>8} {'Run 3 Ξ¦':>8} {'Delta':>8} {'Verdict'}")
print("β" * 50)
for task_name, curve in results.items():
r1 = curve[0]["phi"]
r3 = curve[-1]["phi"]
delta = r3 - r1
        verdict = "✓ IMPROVED" if delta > 0 else "= SAME" if delta == 0 else "✗ REGRESSED"
print(f"{task_name:<14} {r1:>8.1f} {r3:>8.1f} {delta:>+8.1f} {verdict}")
# Save
os.makedirs("benchmarks/results", exist_ok=True)
with open("benchmarks/results/real_model_results.json", "w") as f:
json.dump({"model": MODEL, "results": results}, f, indent=2)
print(f"\nSaved to benchmarks/results/real_model_results.json")
if __name__ == "__main__":
main()