adeshboudh16 committed on
Commit ·
c8b183a
1
Parent(s): a84bdca
feat: EVAL_PRIMARY_MODEL override for graph LLM, applied before civicsetu import
Browse files- scripts/run_eval.py +35 -6
scripts/run_eval.py
CHANGED
|
@@ -8,8 +8,16 @@ CivicSetu RAGAS evaluation — single pass, no phases.
|
|
| 8 |
|
| 9 |
Usage:
|
| 10 |
uv run python scripts/run_eval.py
|
| 11 |
-
EVAL_LIMIT=5 uv run python scripts/run_eval.py
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
"""
|
| 14 |
from __future__ import annotations
|
| 15 |
|
|
@@ -32,9 +40,13 @@ ROOT = Path(__file__).parent.parent
|
|
| 32 |
DATASET_PATH = ROOT / "eval" / "golden_dataset.jsonl"
|
| 33 |
OUTPUT_PATH = ROOT / "eval_results.json"
|
| 34 |
|
| 35 |
-
PASS_THRESHOLD
|
| 36 |
-
EVAL_LIMIT
|
| 37 |
-
JUDGE_MODEL
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
|
| 40 |
# ── Dataset ───────────────────────────────────────────────────────────────────
|
|
@@ -287,9 +299,25 @@ def main() -> None:
|
|
| 287 |
if EVAL_LIMIT:
|
| 288 |
rows = rows[:EVAL_LIMIT]
|
| 289 |
|
| 290 |
-
print(f"CivicSetu RAGAS Eval β {len(rows)} queries |
|
| 291 |
|
| 292 |
# ── Step 1: collect ────────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 293 |
from civicsetu.agent.graph import get_compiled_graph
|
| 294 |
graph = get_compiled_graph()
|
| 295 |
|
|
@@ -315,6 +343,7 @@ def main() -> None:
|
|
| 315 |
report = {
|
| 316 |
"run_at": datetime.now(timezone.utc).isoformat(),
|
| 317 |
"dataset_size": len(scored),
|
|
|
|
| 318 |
"judge_model": JUDGE_MODEL,
|
| 319 |
"pass_threshold": PASS_THRESHOLD,
|
| 320 |
"overall": _group_stats(scored),
|
|
|
|
| 8 |
|
| 9 |
Usage:
|
| 10 |
uv run python scripts/run_eval.py
|
| 11 |
+
EVAL_LIMIT=5 uv run python scripts/run_eval.py # quick smoke-test
|
| 12 |
+
|
| 13 |
+
Graph LLM overrides (applied before civicsetu imports, so nodes.py picks them up):
|
| 14 |
+
EVAL_PRIMARY_MODEL=openrouter/qwen/qwen3.5-397b-a17b:free uv run python scripts/run_eval.py
|
| 15 |
+
EVAL_PRIMARY_MODEL=gemini/gemini-2.5-flash-lite uv run python scripts/run_eval.py
|
| 16 |
+
|
| 17 |
+
EVAL_FALLBACK_MODEL is optional — defaults to EVAL_PRIMARY_MODEL if not set.
|
| 18 |
+
|
| 19 |
+
Judge (RAGAS scorer) model:
|
| 20 |
+
JUDGE_MODEL=qwen3.5-397b-a17b uv run python scripts/run_eval.py
|
| 21 |
"""
|
| 22 |
from __future__ import annotations
|
| 23 |
|
|
|
|
| 40 |
DATASET_PATH = ROOT / "eval" / "golden_dataset.jsonl"
|
| 41 |
OUTPUT_PATH = ROOT / "eval_results.json"
|
| 42 |
|
| 43 |
+
PASS_THRESHOLD = float(os.getenv("PASS_THRESHOLD", "0.7"))
|
| 44 |
+
EVAL_LIMIT = int(os.getenv("EVAL_LIMIT", "0")) or None
|
| 45 |
+
JUDGE_MODEL = os.getenv("JUDGE_MODEL", "qwen3.5-122b-a10b")
|
| 46 |
+
|
| 47 |
+
# Graph LLM overrides — applied before civicsetu is imported so nodes.py reads them
|
| 48 |
+
EVAL_PRIMARY_MODEL = os.getenv("EVAL_PRIMARY_MODEL") # overrides PRIMARY_MODEL for graph
|
| 49 |
+
EVAL_FALLBACK_MODEL = os.getenv("EVAL_FALLBACK_MODEL") # overrides all fallbacks (defaults to primary)
|
| 50 |
|
| 51 |
|
| 52 |
# ── Dataset ───────────────────────────────────────────────────────────────────
|
|
|
|
| 299 |
if EVAL_LIMIT:
|
| 300 |
rows = rows[:EVAL_LIMIT]
|
| 301 |
|
| 302 |
+
print(f"CivicSetu RAGAS Eval β {len(rows)} queries | judge={JUDGE_MODEL} | threshold={PASS_THRESHOLD}")
|
| 303 |
|
| 304 |
# ── Step 1: collect ────────────────────────────────────────────────────────
|
| 305 |
+
# Apply graph LLM overrides BEFORE importing civicsetu.
|
| 306 |
+
# nodes.py builds FALLBACK_MODELS at module import time from settings,
|
| 307 |
+
# so env vars must be set before the first import of civicsetu.agent.graph.
|
| 308 |
+
if EVAL_PRIMARY_MODEL:
|
| 309 |
+
fallback = EVAL_FALLBACK_MODEL or EVAL_PRIMARY_MODEL
|
| 310 |
+
os.environ["PRIMARY_MODEL"] = EVAL_PRIMARY_MODEL
|
| 311 |
+
os.environ["FALLBACK_MODEL_1"] = fallback
|
| 312 |
+
os.environ["FALLBACK_MODEL_2"] = fallback
|
| 313 |
+
os.environ["FALLBACK_MODEL_3"] = fallback
|
| 314 |
+
print(f" Graph LLM : {EVAL_PRIMARY_MODEL} (all fallbacks β same)")
|
| 315 |
+
else:
|
| 316 |
+
from dotenv import load_dotenv
|
| 317 |
+
load_dotenv()
|
| 318 |
+
primary = os.getenv("PRIMARY_MODEL", "gemini/gemini-2.5-flash-lite")
|
| 319 |
+
print(f" Graph LLM : {primary} (from .env)")
|
| 320 |
+
|
| 321 |
from civicsetu.agent.graph import get_compiled_graph
|
| 322 |
graph = get_compiled_graph()
|
| 323 |
|
|
|
|
| 343 |
report = {
|
| 344 |
"run_at": datetime.now(timezone.utc).isoformat(),
|
| 345 |
"dataset_size": len(scored),
|
| 346 |
+
"graph_model": EVAL_PRIMARY_MODEL or os.getenv("PRIMARY_MODEL", "from-.env"),
|
| 347 |
"judge_model": JUDGE_MODEL,
|
| 348 |
"pass_threshold": PASS_THRESHOLD,
|
| 349 |
"overall": _group_stats(scored),
|