"""
Agent Q3 [Evo] — Benchmark Runner

Domain QA evaluation across prediction markets, Solidity, agent orchestration
(LangGraph), and LoRA fine-tuning.
Scores are logged to agent-q3-trainingevo/benchmarks/ (the runner writes to
./benchmarks relative to the current working directory).
"""
import asyncio
import datetime
import json
import os

import httpx


# Chat-completions endpoint the prompts are POSTed to; override with the
# OLLAMA_URL environment variable.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://localhost:11434/v1/chat/completions")
# Model name sent in the request payload; override with the EVAL_MODEL
# environment variable.
EVAL_MODEL = os.getenv("EVAL_MODEL", "qwen3.5:4b-instruct-q4_K_M")


# Benchmark suite: one entry per domain. Each entry carries the prompt sent to
# the model and the keywords whose (case-insensitive) presence in the reply
# determines the score.
BENCHMARKS = [
    {
        "domain": "prediction_markets",
        "prompt": "Explain the difference between AMM-based and CLOB-based prediction markets.",
        "keywords": ["AMM", "CLOB", "liquidity", "orderbook"],
    },
    {
        "domain": "solidity",
        "prompt": "What is a reentrancy attack and how do you prevent it in Solidity?",
        "keywords": ["reentrancy", "CEI", "nonReentrant", "checks-effects"],
    },
    {
        "domain": "langgraph",
        "prompt": "Describe the StateGraph execution model in LangGraph.",
        "keywords": ["StateGraph", "node", "edge", "conditional", "compile"],
    },
    {
        "domain": "lora",
        "prompt": "What is the difference between LoRA rank and alpha?",
        "keywords": ["rank", "alpha", "scaling", "adapter", "weight"],
    },
]


async def run_eval(bench: dict) -> dict:
    """Send one benchmark prompt to the model and score the reply.

    The score is the fraction of ``bench["keywords"]`` that occur
    (case-insensitively, as substrings) in the model's reply.

    Args:
        bench: Benchmark entry with ``"domain"``, ``"prompt"`` and
            ``"keywords"`` keys.

    Returns:
        A dict with ``"domain"``, ``"score"`` (rounded to two decimals) and
        ``"response_len"`` (length of the reply text).

    Raises:
        httpx.HTTPStatusError: If the endpoint answers with a 4xx/5xx status.
        httpx.HTTPError: On transport failures or timeouts.
    """
    payload = {
        "model": EVAL_MODEL,
        "messages": [{"role": "user", "content": bench["prompt"]}],
    }
    async with httpx.AsyncClient(timeout=60) as client:
        r = await client.post(OLLAMA_URL, json=payload)
    # Fail on HTTP errors here; otherwise an error body only shows up as an
    # opaque KeyError on "choices" below.
    r.raise_for_status()

    content = r.json()["choices"][0]["message"]["content"]
    # The content field may be null in a chat-completions reply; score it as
    # an empty answer instead of crashing on None.lower().
    response = (content or "").lower()

    keywords = bench["keywords"]
    hits = sum(1 for kw in keywords if kw.lower() in response)
    # An entry without keywords scores 0 rather than dividing by zero.
    score = hits / len(keywords) if keywords else 0.0
    return {
        "domain": bench["domain"],
        "score": round(score, 2),
        "response_len": len(response),
    }


async def run_all():
    """Run every benchmark concurrently, save the scores and print the average.

    Results are written as JSON to ``./benchmarks/run_<local date>.json``
    (relative to the current working directory). The file is opened in write
    mode, so a second run on the same date overwrites the earlier file.

    Returns:
        The list of per-benchmark result dicts produced by ``run_eval``, in
        the order of ``BENCHMARKS``.
    """
    results = await asyncio.gather(*(run_eval(b) for b in BENCHMARKS))

    out_dir = "./benchmarks"
    out_path = f"{out_dir}/run_{datetime.date.today().isoformat()}.json"
    os.makedirs(out_dir, exist_ok=True)

    # datetime.utcnow() is deprecated since Python 3.12. Take the aware UTC
    # time and drop tzinfo so the stored timestamp keeps its existing
    # naive-UTC ISO format.
    timestamp = (
        datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat()
    )
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump({"timestamp": timestamp, "results": results}, f, indent=2)

    # Guard against an empty benchmark list instead of dividing by zero.
    avg = sum(r["score"] for r in results) / len(results) if results else 0.0
    print(f"Avg score: {avg:.2%} | Saved to {out_path}")
    return results


# Script entry point: run the whole benchmark suite once.
if __name__ == "__main__":
    asyncio.run(run_all())