consolidate: Evo domain QA benchmark runner
Browse files- evo/benchmark_runner.py +39 -0
evo/benchmark_runner.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Agent Q3 [Evo] — Benchmark Runner
|
| 3 |
+
Domain QA evaluation across prediction markets, Solidity, agent orchestration (LangGraph), and LoRA fine-tuning.
|
| 4 |
+
Scores are logged to agent-q3-trainingevo/benchmarks/.
|
| 5 |
+
"""
|
| 6 |
+
import json, os, datetime, httpx, asyncio
|
| 7 |
+
|
| 8 |
+
# Chat-completions endpoint that receives the benchmark prompts; override via
# the OLLAMA_URL environment variable.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://localhost:11434/v1/chat/completions")
# Model name sent in each request; override via the EVAL_MODEL environment variable.
EVAL_MODEL = os.getenv("EVAL_MODEL", "qwen3.5:4b-instruct-q4_K_M")

# Each entry holds one prompt plus the keywords whose presence in the reply is
# counted (case-insensitively) to produce that domain's score.
BENCHMARKS = [
    {
        "domain": "prediction_markets",
        "prompt": "Explain the difference between AMM-based and CLOB-based prediction markets.",
        "keywords": ["AMM", "CLOB", "liquidity", "orderbook"],
    },
    {
        "domain": "solidity",
        "prompt": "What is a reentrancy attack and how do you prevent it in Solidity?",
        "keywords": ["reentrancy", "CEI", "nonReentrant", "checks-effects"],
    },
    {
        "domain": "langgraph",
        "prompt": "Describe the StateGraph execution model in LangGraph.",
        "keywords": ["StateGraph", "node", "edge", "conditional", "compile"],
    },
    {
        "domain": "lora",
        "prompt": "What is the difference between LoRA rank and alpha?",
        "keywords": ["rank", "alpha", "scaling", "adapter", "weight"],
    },
]
|
| 17 |
+
|
| 18 |
+
async def run_eval(bench: dict) -> dict:
    """Send one benchmark prompt to the chat endpoint and score the reply.

    The score is the fraction of ``bench["keywords"]`` that occur as
    case-insensitive substrings of the model's reply.

    Args:
        bench: Benchmark entry with ``"domain"``, ``"prompt"`` and
            ``"keywords"`` keys.

    Returns:
        Dict with the benchmark ``"domain"``, the ``"score"`` rounded to two
        decimals, and ``"response_len"`` (character count of the reply).

    Raises:
        httpx.HTTPStatusError: If the endpoint answers with a 4xx/5xx status.
    """
    payload = {
        "model": EVAL_MODEL,
        "messages": [{"role": "user", "content": bench["prompt"]}],
    }
    async with httpx.AsyncClient(timeout=60) as client:
        r = await client.post(OLLAMA_URL, json=payload)
        # Surface HTTP failures directly instead of as a KeyError on "choices".
        r.raise_for_status()
        content = r.json()["choices"][0]["message"]["content"]

    # A null content field is treated as an empty reply rather than crashing.
    response = (content or "").lower()

    keywords = bench["keywords"]
    hits = sum(1 for kw in keywords if kw.lower() in response)
    # An entry without keywords scores 0 instead of raising ZeroDivisionError.
    score = hits / len(keywords) if keywords else 0.0
    return {
        "domain": bench["domain"],
        "score": round(score, 2),
        "response_len": len(response),
    }
|
| 27 |
+
|
| 28 |
+
async def run_all():
    """Run every benchmark concurrently, save the scores, and print the average.

    Results are written as JSON to ``./benchmarks/run_<today>.json`` (relative
    to the working directory); a second run on the same date overwrites the
    file.

    Returns:
        List of per-benchmark result dicts as produced by ``run_eval``.
    """
    results = await asyncio.gather(*[run_eval(b) for b in BENCHMARKS])

    out_path = f"./benchmarks/run_{datetime.date.today().isoformat()}.json"
    os.makedirs("./benchmarks", exist_ok=True)

    # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated and naive.
    timestamp = datetime.datetime.now(datetime.timezone.utc).isoformat()
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump({"timestamp": timestamp, "results": results}, f, indent=2)

    # Guard the average so an empty benchmark list does not divide by zero.
    avg = sum(r["score"] for r in results) / len(results) if results else 0.0
    print(f"Avg score: {avg:.2%} | Saved to {out_path}")
    return results
|
| 37 |
+
|
| 38 |
+
if __name__ == "__main__":
    # Script entry point: run the whole benchmark suite once on a fresh event loop.
    asyncio.run(run_all())
|