adeshboudh16 committed on
Commit
c8b183a
·
1 Parent(s): a84bdca

feat: EVAL_PRIMARY_MODEL override for graph LLM, applied before civicsetu import

Browse files
Files changed (1) hide show
  1. scripts/run_eval.py +35 -6
scripts/run_eval.py CHANGED
@@ -8,8 +8,16 @@ CivicSetu RAGAS evaluation — single pass, no phases.
8
 
9
  Usage:
10
  uv run python scripts/run_eval.py
11
- EVAL_LIMIT=5 uv run python scripts/run_eval.py # quick smoke-test
12
- JUDGE_MODEL=qwen3.5-122b-a10b uv run python scripts/run_eval.py
 
 
 
 
 
 
 
 
13
  """
14
  from __future__ import annotations
15
 
@@ -32,9 +40,13 @@ ROOT = Path(__file__).parent.parent
32
  DATASET_PATH = ROOT / "eval" / "golden_dataset.jsonl"
33
  OUTPUT_PATH = ROOT / "eval_results.json"
34
 
35
- PASS_THRESHOLD = float(os.getenv("PASS_THRESHOLD", "0.7"))
36
- EVAL_LIMIT = int(os.getenv("EVAL_LIMIT", "0")) or None
37
- JUDGE_MODEL = os.getenv("JUDGE_MODEL", "qwen3.5-122b-a10b")
 
 
 
 
38
 
39
 
40
  # ── Dataset ────────────────────────────────────────────────────────────────────
@@ -287,9 +299,25 @@ def main() -> None:
287
  if EVAL_LIMIT:
288
  rows = rows[:EVAL_LIMIT]
289
 
290
- print(f"CivicSetu RAGAS Eval β€” {len(rows)} queries | model={JUDGE_MODEL} | threshold={PASS_THRESHOLD}")
291
 
292
  # ── Step 1: collect ────────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
293
  from civicsetu.agent.graph import get_compiled_graph
294
  graph = get_compiled_graph()
295
 
@@ -315,6 +343,7 @@ def main() -> None:
315
  report = {
316
  "run_at": datetime.now(timezone.utc).isoformat(),
317
  "dataset_size": len(scored),
 
318
  "judge_model": JUDGE_MODEL,
319
  "pass_threshold": PASS_THRESHOLD,
320
  "overall": _group_stats(scored),
 
8
 
9
  Usage:
10
  uv run python scripts/run_eval.py
11
+ EVAL_LIMIT=5 uv run python scripts/run_eval.py # quick smoke-test
12
+
13
+ Graph LLM overrides (applied before civicsetu imports, so nodes.py picks them up):
14
+ EVAL_PRIMARY_MODEL=openrouter/qwen/qwen3.5-397b-a17b:free uv run python scripts/run_eval.py
15
+ EVAL_PRIMARY_MODEL=gemini/gemini-2.5-flash-lite uv run python scripts/run_eval.py
16
+
17
+ EVAL_FALLBACK_MODEL is optional — it defaults to EVAL_PRIMARY_MODEL if not set, and is only applied when EVAL_PRIMARY_MODEL is set.
18
+
19
+ Judge (RAGAS scorer) model:
20
+ JUDGE_MODEL=qwen3.5-397b-a17b uv run python scripts/run_eval.py
21
  """
22
  from __future__ import annotations
23
 
 
40
  DATASET_PATH = ROOT / "eval" / "golden_dataset.jsonl"
41
  OUTPUT_PATH = ROOT / "eval_results.json"
42
 
43
+ PASS_THRESHOLD = float(os.getenv("PASS_THRESHOLD", "0.7"))
44
+ EVAL_LIMIT = int(os.getenv("EVAL_LIMIT", "0")) or None
45
+ JUDGE_MODEL = os.getenv("JUDGE_MODEL", "qwen3.5-122b-a10b")
46
+
47
+ # Graph LLM overrides — applied before civicsetu is imported so nodes.py reads them
48
+ EVAL_PRIMARY_MODEL = os.getenv("EVAL_PRIMARY_MODEL") # overrides PRIMARY_MODEL for graph
49
+ EVAL_FALLBACK_MODEL = os.getenv("EVAL_FALLBACK_MODEL") # overrides all fallbacks (defaults to primary)
50
 
51
 
52
  # ── Dataset ────────────────────────────────────────────────────────────────────
 
299
  if EVAL_LIMIT:
300
  rows = rows[:EVAL_LIMIT]
301
 
302
+ print(f"CivicSetu RAGAS Eval β€” {len(rows)} queries | judge={JUDGE_MODEL} | threshold={PASS_THRESHOLD}")
303
 
304
  # ── Step 1: collect ────────────────────────────────────────────────────────
305
+ # Apply graph LLM overrides BEFORE importing civicsetu.
306
+ # nodes.py builds FALLBACK_MODELS at module import time from settings,
307
+ # so env vars must be set before the first import of civicsetu.agent.graph.
308
+ if EVAL_PRIMARY_MODEL:
309
+ fallback = EVAL_FALLBACK_MODEL or EVAL_PRIMARY_MODEL
310
+ os.environ["PRIMARY_MODEL"] = EVAL_PRIMARY_MODEL
311
+ os.environ["FALLBACK_MODEL_1"] = fallback
312
+ os.environ["FALLBACK_MODEL_2"] = fallback
313
+ os.environ["FALLBACK_MODEL_3"] = fallback
314
+ print(f" Graph LLM : {EVAL_PRIMARY_MODEL} (all fallbacks β†’ same)")
315
+ else:
316
+ from dotenv import load_dotenv
317
+ load_dotenv()
318
+ primary = os.getenv("PRIMARY_MODEL", "gemini/gemini-2.5-flash-lite")
319
+ print(f" Graph LLM : {primary} (from .env)")
320
+
321
  from civicsetu.agent.graph import get_compiled_graph
322
  graph = get_compiled_graph()
323
 
 
343
  report = {
344
  "run_at": datetime.now(timezone.utc).isoformat(),
345
  "dataset_size": len(scored),
346
+ "graph_model": EVAL_PRIMARY_MODEL or os.getenv("PRIMARY_MODEL", "from-.env"),
347
  "judge_model": JUDGE_MODEL,
348
  "pass_threshold": PASS_THRESHOLD,
349
  "overall": _group_stats(scored),