madDegen committed on
Commit
85b4f44
·
verified ·
1 Parent(s): ce752c2

consolidate: Evo domain QA benchmark runner

Browse files
Files changed (1) hide show
  1. evo/benchmark_runner.py +39 -0
evo/benchmark_runner.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Agent Q3 [Evo] — Benchmark Runner
3
+ Domain QA evaluation across prediction markets, Solidity, and agent orchestration.
4
+ Scores are logged to agent-q3-trainingevo/benchmarks/.
5
+ """
6
+ import json, os, datetime, httpx, asyncio
7
+
8
# Chat-completions endpoint the evaluator POSTs to; override with the
# OLLAMA_URL environment variable.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://localhost:11434/v1/chat/completions")
# Model name sent in every request; override with the EVAL_MODEL environment
# variable.
EVAL_MODEL = os.getenv("EVAL_MODEL", "qwen3.5:4b-instruct-q4_K_M")

# One entry per evaluated domain. "prompt" is sent to the model verbatim;
# "keywords" are matched case-insensitively as substrings of the reply, and
# the score is the fraction of keywords found.
BENCHMARKS = [
    {
        "domain": "prediction_markets",
        "prompt": "Explain the difference between AMM-based and CLOB-based prediction markets.",
        "keywords": ["AMM", "CLOB", "liquidity", "orderbook"],
    },
    {
        "domain": "solidity",
        "prompt": "What is a reentrancy attack and how do you prevent it in Solidity?",
        "keywords": ["reentrancy", "CEI", "nonReentrant", "checks-effects"],
    },
    {
        "domain": "langgraph",
        "prompt": "Describe the StateGraph execution model in LangGraph.",
        "keywords": ["StateGraph", "node", "edge", "conditional", "compile"],
    },
    {
        "domain": "lora",
        "prompt": "What is the difference between LoRA rank and alpha?",
        "keywords": ["rank", "alpha", "scaling", "adapter", "weight"],
    },
]
17
+
18
async def run_eval(bench: dict) -> dict:
    """Send one benchmark prompt to the model and score the reply by keyword hits.

    Args:
        bench: Benchmark entry providing the "domain", "prompt" and
            "keywords" keys.

    Returns:
        A dict with "domain" (copied from ``bench``), "score" (fraction of
        keywords found as case-insensitive substrings of the reply, rounded
        to two decimals) and "response_len" (character count of the reply).

    Raises:
        httpx.HTTPStatusError: If the endpoint answers with a 4xx/5xx status.
    """
    payload = {
        "model": EVAL_MODEL,
        "messages": [{"role": "user", "content": bench["prompt"]}],
    }
    async with httpx.AsyncClient(timeout=60) as client:
        r = await client.post(OLLAMA_URL, json=payload)
        # Surface HTTP failures directly instead of as a KeyError on
        # "choices" when the body is an error payload.
        r.raise_for_status()
        response = r.json()["choices"][0]["message"]["content"].lower()

    keywords = bench["keywords"]
    hits = sum(1 for kw in keywords if kw.lower() in response)
    # An entry with no keywords scores 0 rather than dividing by zero.
    score = hits / len(keywords) if keywords else 0.0
    return {
        "domain": bench["domain"],
        "score": round(score, 2),
        "response_len": len(response),
    }
27
+
28
async def run_all():
    """Run every benchmark concurrently, save the scores, and print the average.

    Results are written to ``./benchmarks/run_<today's ISO date>.json`` as an
    object with a UTC "timestamp" and the "results" list; the file is opened
    in write mode, so an existing file for the same date is replaced.

    Returns:
        The list of per-benchmark result dicts produced by ``run_eval``.
    """
    results = await asyncio.gather(*[run_eval(b) for b in BENCHMARKS])

    out_path = f"./benchmarks/run_{datetime.date.today().isoformat()}.json"
    os.makedirs("./benchmarks", exist_ok=True)
    # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated and
    # yields a naive value with no offset in its ISO string.
    timestamp = datetime.datetime.now(datetime.timezone.utc).isoformat()
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump({"timestamp": timestamp, "results": results}, f, indent=2)

    # Guard the average so an empty benchmark list does not divide by zero.
    avg = sum(r["score"] for r in results) / len(results) if results else 0.0
    print(f"Avg score: {avg:.2%} | Saved to {out_path}")
    return results
37
+
38
if __name__ == "__main__":
    # Script entry point: run the full benchmark suite on a fresh event loop.
    asyncio.run(run_all())