muthuk1 committed on
Commit f486777 · verified · 1 Parent(s): 079d32f

Fix #3: Add LLM-as-a-Judge (PASS/FAIL) + BERTScore evaluation — the two hackathon-required accuracy metrics

Files changed (1)
  1. graphrag/layers/evaluation_layer.py +392 -34
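
Before the diff, a minimal, hypothetical sketch of how the two metrics added here could be exercised once the file below is in place. It assumes the module is importable as graphrag.layers.evaluation_layer (matching the path above); the question/answer strings are invented, and no judge LLM is wired in, so compute_llm_judge falls back to its token-F1 heuristic and compute_bertscore falls back to a token-F1 proxy unless evaluate/bert-score are installed.

    # Hypothetical smoke test of the two new metrics (illustration only, not part of the commit).
    from graphrag.layers.evaluation_layer import compute_llm_judge, compute_bertscore

    judge = compute_llm_judge(
        question="Who wrote 'Hamlet'?",
        reference_answer="William Shakespeare",
        system_answer="Hamlet was written by William Shakespeare.",
        llm_fn=None,  # no judge LLM available -> heuristic F1 fallback
    )
    print(judge["verdict"], "-", judge["feedback"])

    # Without the evaluate/bert-score packages this returns the token-F1 proxy described below.
    scores = compute_bertscore(
        predictions=["Hamlet was written by William Shakespeare."],
        references=["William Shakespeare"],
    )
    print(f"mean BERTScore F1: {scores['mean_f1']:.3f}, pass rate: {scores['pass_rate']:.0%}")
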
graphrag/layers/evaluation_layer.py CHANGED
@@ -1,15 +1,20 @@
 """
-Layer 4: Evaluation Layer — RAGAS + Custom Metrics + Benchmarking
-=================================================================
-Computes faithfulness, answer relevancy, context precision/recall,
-F1, exact match, and cost efficiency metrics.
 """
 import logging
 import re
 import string
 from collections import Counter
 from dataclasses import dataclass, field
-from typing import Any, Dict, List
 
 logger = logging.getLogger(__name__)
 
@@ -55,13 +60,222 @@ def compute_token_efficiency(baseline_tokens: int, graphrag_tokens: int) -> floa
     return graphrag_tokens / baseline_tokens if baseline_tokens > 0 else 0.0
 
 
 # ── Data Structures ───────────────────────────────────────
 
 @dataclass
 class EvalSample:
-    """Single evaluation sample."""
     query: str = ""
     reference_answer: str = ""
     baseline_answer: str = ""
     graphrag_answer: str = ""
     baseline_contexts: List[str] = field(default_factory=list)
@@ -73,14 +287,27 @@ class EvalSample:
 
 @dataclass
 class EvalResult:
-    """Evaluation result for a single sample."""
     query: str = ""
     baseline_f1: float = 0.0
     graphrag_f1: float = 0.0
     baseline_em: float = 0.0
     graphrag_em: float = 0.0
     baseline_context_hit: float = 0.0
     graphrag_context_hit: float = 0.0
     baseline_faithfulness: float = 0.0
     graphrag_faithfulness: float = 0.0
     baseline_relevancy: float = 0.0
@@ -89,10 +316,14 @@ class EvalResult:
     graphrag_context_precision: float = 0.0
     baseline_context_recall: float = 0.0
     graphrag_context_recall: float = 0.0
     baseline_tokens: int = 0
     graphrag_tokens: int = 0
     baseline_cost: float = 0.0
     graphrag_cost: float = 0.0
     baseline_latency: float = 0.0
     graphrag_latency: float = 0.0
     question_type: str = ""
@@ -104,17 +335,24 @@ class EvalResult:
 class EvaluationLayer:
     """
     Layer 4: Evaluation Layer.
-    Computes all metrics and generates benchmark reports.
     """
 
     def __init__(self, eval_llm_model="gpt-4o-mini", api_key=""):
         self.eval_llm_model = eval_llm_model
         self._api_key = api_key
         self._ragas_available = False
         self.results: List[EvalResult] = []
 
     def initialize(self):
-        """Initialize RAGAS components if available."""
         try:
             from ragas import evaluate, EvaluationDataset, SingleTurnSample
             from ragas.metrics import Faithfulness, AnswerRelevancy
@@ -123,30 +361,121 @@ class EvaluationLayer:
         except ImportError:
             logger.warning("RAGAS not installed — using custom metrics only.")
 
-    def evaluate_sample(self, sample: EvalSample,
-                        baseline_tokens=0, graphrag_tokens=0,
-                        baseline_cost=0.0, graphrag_cost=0.0,
-                        baseline_latency=0.0, graphrag_latency=0.0) -> EvalResult:
-        """Evaluate a single sample with all metrics."""
         r = EvalResult(
             query=sample.query,
             question_type=sample.question_type,
             difficulty=sample.difficulty,
             baseline_f1=compute_f1(sample.baseline_answer, sample.reference_answer),
             graphrag_f1=compute_f1(sample.graphrag_answer, sample.reference_answer),
             baseline_em=compute_exact_match(sample.baseline_answer, sample.reference_answer),
             graphrag_em=compute_exact_match(sample.graphrag_answer, sample.reference_answer),
             baseline_context_hit=compute_context_hit_rate(
                 sample.baseline_contexts, sample.supporting_facts),
             graphrag_context_hit=compute_context_hit_rate(
                 sample.graphrag_contexts, sample.supporting_facts),
             baseline_tokens=baseline_tokens, graphrag_tokens=graphrag_tokens,
             baseline_cost=baseline_cost, graphrag_cost=graphrag_cost,
             baseline_latency=baseline_latency, graphrag_latency=graphrag_latency,
         )
         self.results.append(r)
         return r
 
     def evaluate_batch_ragas(self, samples: List[EvalSample], pipeline="baseline") -> Dict[str, float]:
         """Run RAGAS evaluation on a batch (requires RAGAS + OpenAI key)."""
         if not self._ragas_available:
@@ -188,6 +517,16 @@ class EvaluationLayer:
         n = len(self.results)
         avg = lambda vals: sum(vals) / len(vals) if vals else 0.0
 
         b = {
             "avg_f1": round(avg([r.baseline_f1 for r in self.results]), 4),
             "avg_em": round(avg([r.baseline_em for r in self.results]), 4),
@@ -197,6 +536,9 @@ class EvaluationLayer:
             "avg_latency_ms": round(avg([r.baseline_latency for r in self.results]), 1),
             "total_tokens": sum(r.baseline_tokens for r in self.results),
             "total_cost": round(sum(r.baseline_cost for r in self.results), 6),
         }
         g = {
             "avg_f1": round(avg([r.graphrag_f1 for r in self.results]), 4),
@@ -207,6 +549,9 @@ class EvaluationLayer:
             "avg_latency_ms": round(avg([r.graphrag_latency for r in self.results]), 1),
             "total_tokens": sum(r.graphrag_tokens for r in self.results),
             "total_cost": round(sum(r.graphrag_cost for r in self.results), 6),
         }
 
         win_rate = sum(1 for r in self.results if r.graphrag_f1 > r.baseline_f1) / n
@@ -220,9 +565,10 @@ class EvaluationLayer:
             by_type[qt]["count"] += 1
 
         return {
-            "num_samples": n, "baseline": b, "graphrag": g,
             "graphrag_f1_win_rate": round(win_rate, 4),
-            "token_ratio": round(g["total_tokens"] / max(b["total_tokens"], 1), 3),
             "by_question_type": {
                 qt: {"count": d["count"],
                      "baseline_avg_f1": round(avg(d["baseline_f1"]), 4),
@@ -232,37 +578,49 @@ class EvaluationLayer:
         }
 
     def generate_report(self) -> str:
-        """Generate a text benchmark report."""
         m = self.compute_aggregate_metrics()
         if "message" in m: return m["message"]
         lines = [
-            "=" * 60, "GRAPHRAG INFERENCE BENCHMARK REPORT", "=" * 60,
             f"\nTotal Samples Evaluated: {m['num_samples']}",
-            f"\n{'Metric':<25} {'Baseline':>12} {'GraphRAG':>12} {'Winner':>12}",
-            "-" * 65
         ]
-        b, g = m["baseline"], m["graphrag"]
-        for name, key in [("Avg F1 Score", "avg_f1"), ("Avg Exact Match", "avg_em"),
-                          ("Avg Context Hit Rate", "avg_context_hit")]:
-            bv, gv = b[key], g[key]
-            winner = "GraphRAG" if gv > bv else ("Baseline" if bv > gv else "Tie")
-            lines.append(f"{name:<25} {bv:>12.4f} {gv:>12.4f} {winner:>12}")
-
-        lines.append(f"\n{'Metric':<25} {'Baseline':>12} {'GraphRAG':>12} {'Ratio':>12}")
-        lines.append("-" * 65)
         for name, key in [("Avg Tokens/Query", "avg_tokens"), ("Avg Cost ($)", "avg_cost"),
                           ("Avg Latency (ms)", "avg_latency_ms")]:
-            bv, gv = b[key], g[key]
             ratio = gv / bv if bv > 0 else 0
-            lines.append(f"{name:<25} {bv:>12.4f} {gv:>12.4f} {ratio:>11.2f}x")
 
-        lines.append(f"\nGraphRAG F1 Win Rate: {m['graphrag_f1_win_rate']:.1%}")
-        lines.append(f"Token Ratio (G/B): {m['token_ratio']:.2f}x")
 
         if m.get("by_question_type"):
             lines.extend(["\n--- By Question Type ---",
                           f"{'Type':<20} {'Count':>6} {'Base F1':>10} {'Graph F1':>10}", "-" * 50])
             for qt, d in m["by_question_type"].items():
                 lines.append(f"{qt:<20} {d['count']:>6} {d['baseline_avg_f1']:>10.4f} {d['graphrag_avg_f1']:>10.4f}")
-        lines.append("\n" + "=" * 60)
         return "\n".join(lines)
 
 """
+Layer 4: Evaluation Layer — RAGAS + LLM-as-a-Judge + BERTScore + Custom Metrics
+================================================================================
+Computes all hackathon-required evaluation metrics:
+- LLM-as-a-Judge (PASS/FAIL grading) — Zheng et al., NeurIPS 2023
+- BERTScore (semantic similarity) — Zhang et al., ICLR 2020
+- RAGAS (faithfulness, relevancy, context precision/recall)
+- F1/EM (SQuAD/HotpotQA standard)
+- Token efficiency, cost per query, latency
 """
+import json
 import logging
 import re
 import string
 from collections import Counter
 from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional
 
 logger = logging.getLogger(__name__)
 
     return graphrag_tokens / baseline_tokens if baseline_tokens > 0 else 0.0
 
 
+# ── LLM-as-a-Judge (PASS/FAIL) ──────────────────────────
+
+LLM_JUDGE_PROMPT = """You are a strict, impartial judge evaluating the factual correctness of an AI assistant's answer to a question, given a reference answer.
+
+###Question:
+{question}
+
+###Reference Answer (Ground Truth):
+{reference_answer}
+
+###AI System Answer:
+{system_answer}
+
+###Evaluation Criteria:
+Assess whether the AI System Answer is factually correct and sufficiently complete relative to the Reference Answer. Minor wording differences are acceptable. The core facts must match.
+
+###Instructions:
+1. Write brief feedback explaining your judgment (2-3 sentences).
+2. Output a final verdict: PASS (answer is correct/complete) or FAIL (answer is wrong, hallucinated, or critically incomplete).
+3. Respond ONLY in this JSON format:
+{{"feedback": "<your reasoning>", "verdict": "PASS" or "FAIL"}}
+
+###Feedback:"""
+
+
+def compute_llm_judge(
+    question: str,
+    reference_answer: str,
+    system_answer: str,
+    llm_fn=None,
+) -> Dict[str, Any]:
+    """
+    LLM-as-a-Judge: PASS/FAIL grading with explanation.
+
+    Based on: Zheng et al., "Judging LLM-as-a-Judge" (NeurIPS 2023)
+    Best practices:
+    - Reference answer always provided (maximizes human correlation)
+    - Chain-of-thought before verdict (Explain-then-Rate)
+    - Structured JSON output
+    - Temperature = 0 for deterministic grading
+
+    Args:
+        question: The original question
+        reference_answer: The gold/ground-truth answer
+        system_answer: The answer to evaluate
+        llm_fn: Callable that takes a messages list and returns an LLMResponse
+
+    Returns:
+        {"verdict": "PASS"|"FAIL", "feedback": str, "raw_response": str}
+    """
+    if not system_answer or not system_answer.strip():
+        return {"verdict": "FAIL", "feedback": "Empty answer.", "raw_response": ""}
+
+    if not llm_fn:
+        # Heuristic fallback: use F1 overlap as a proxy
+        f1 = compute_f1(system_answer, reference_answer)
+        verdict = "PASS" if f1 >= 0.4 else "FAIL"
+        return {
+            "verdict": verdict,
+            "feedback": f"Heuristic: F1={f1:.3f} (no LLM judge available)",
+            "raw_response": "",
+        }
+
+    prompt = LLM_JUDGE_PROMPT.format(
+        question=question,
+        reference_answer=reference_answer,
+        system_answer=system_answer,
+    )
+
+    try:
+        resp = llm_fn([
+            {"role": "system", "content": "You are a strict evaluation judge. Respond only in the specified JSON format."},
+            {"role": "user", "content": prompt},
+        ])
+        raw = resp.content if hasattr(resp, "content") else str(resp)
+
+        # Parse JSON verdict
+        try:
+            data = json.loads(raw)
+            verdict = data.get("verdict", "FAIL").upper().strip()
+            if verdict not in ("PASS", "FAIL"):
+                verdict = "FAIL"
+            return {
+                "verdict": verdict,
+                "feedback": data.get("feedback", ""),
+                "raw_response": raw,
+            }
+        except json.JSONDecodeError:
+            # Fallback: regex parse
+            match = re.search(r'"verdict"\s*:\s*"(PASS|FAIL)"', raw, re.IGNORECASE)
+            if match:
+                return {
+                    "verdict": match.group(1).upper(),
+                    "feedback": raw,
+                    "raw_response": raw,
+                }
+            # Last resort: check for PASS/FAIL anywhere in response
+            if "PASS" in raw.upper():
+                return {"verdict": "PASS", "feedback": raw, "raw_response": raw}
+            return {"verdict": "FAIL", "feedback": raw, "raw_response": raw}
+    except Exception as e:
+        logger.error(f"LLM-as-Judge error: {e}")
+        return {"verdict": "FAIL", "feedback": f"Judge error: {e}", "raw_response": ""}
+
+
+# ── BERTScore ────────────────────────────────────────────
+
+def compute_bertscore(
+    predictions: List[str],
+    references: List[str],
+    model_type: str = "roberta-large",
+    rescale: bool = True,
+    lang: str = "en",
+) -> Dict[str, Any]:
+    """
+    Compute BERTScore F1 for a batch of prediction/reference pairs.
+
+    Based on: Zhang et al., "BERTScore: Evaluating Text Generation
+    with BERT" (ICLR 2020, arxiv:1904.09675)
+
+    Hackathon thresholds:
+    - BERTScore F1 rescaled >= 0.55 (bonus)
+    - BERTScore F1 raw >= 0.88 (equivalent bonus)
+
+    Args:
+        predictions: List of candidate answers
+        references: List of reference answers
+        model_type: BERTScore model (default: roberta-large)
+        rescale: Whether to rescale against baseline (recommended)
+        lang: Language code
+
+    Returns:
+        {
+            "precision": List[float], "recall": List[float], "f1": List[float],
+            "mean_f1": float, "pass_rate": float (% samples with f1 >= threshold)
+        }
+    """
+    if not predictions or not references:
+        return {"precision": [], "recall": [], "f1": [], "mean_f1": 0.0, "pass_rate": 0.0}
+
+    # Try evaluate library first (HuggingFace)
+    try:
+        from evaluate import load as eval_load
+        bertscore = eval_load("bertscore")
+        results = bertscore.compute(
+            predictions=predictions,
+            references=references,
+            model_type=model_type,
+            rescale_with_baseline=rescale,
+            lang=lang,
+        )
+        f1_scores = results["f1"]
+        threshold = 0.55 if rescale else 0.88
+        pass_rate = sum(1 for f in f1_scores if f >= threshold) / len(f1_scores) if f1_scores else 0.0
+        return {
+            "precision": results["precision"],
+            "recall": results["recall"],
+            "f1": f1_scores,
+            "mean_f1": sum(f1_scores) / len(f1_scores) if f1_scores else 0.0,
+            "pass_rate": pass_rate,
+            "threshold": threshold,
+            "rescaled": rescale,
+            "model": model_type,
+        }
+    except ImportError:
+        pass
+
+    # Try bert_score library directly
+    try:
+        from bert_score import score as bert_score_fn
+        P, R, F1 = bert_score_fn(
+            cands=predictions, refs=references,
+            model_type=model_type,
+            rescale_with_baseline=rescale,
+            lang=lang, verbose=False,
+        )
+        f1_list = F1.tolist()
+        threshold = 0.55 if rescale else 0.88
+        pass_rate = sum(1 for f in f1_list if f >= threshold) / len(f1_list) if f1_list else 0.0
+        return {
+            "precision": P.tolist(),
+            "recall": R.tolist(),
+            "f1": f1_list,
+            "mean_f1": sum(f1_list) / len(f1_list) if f1_list else 0.0,
+            "pass_rate": pass_rate,
+            "threshold": threshold,
+            "rescaled": rescale,
+            "model": model_type,
+        }
+    except ImportError:
+        pass
+
+    # Fallback: use token-level F1 as approximation
+    logger.warning("BERTScore not available. Install: pip install evaluate bert-score. Using token F1 proxy.")
+    f1_scores = [compute_f1(p, r) for p, r in zip(predictions, references)]
+    return {
+        "precision": f1_scores,
+        "recall": f1_scores,
+        "f1": f1_scores,
+        "mean_f1": sum(f1_scores) / len(f1_scores) if f1_scores else 0.0,
+        "pass_rate": sum(1 for f in f1_scores if f >= 0.5) / len(f1_scores) if f1_scores else 0.0,
+        "threshold": 0.5,
+        "rescaled": False,
+        "model": "token_f1_proxy",
+        "warning": "BERTScore not installed — using token F1 as proxy",
+    }
+
+
 # ── Data Structures ───────────────────────────────────────
 
 @dataclass
 class EvalSample:
+    """Single evaluation sample with all 3 pipelines."""
     query: str = ""
     reference_answer: str = ""
+    llm_only_answer: str = ""
     baseline_answer: str = ""
     graphrag_answer: str = ""
     baseline_contexts: List[str] = field(default_factory=list)
 
 
 @dataclass
 class EvalResult:
+    """Evaluation result for a single sample across all 3 pipelines."""
     query: str = ""
+    # F1 / EM
+    llm_only_f1: float = 0.0
     baseline_f1: float = 0.0
     graphrag_f1: float = 0.0
+    llm_only_em: float = 0.0
     baseline_em: float = 0.0
     graphrag_em: float = 0.0
+    # Context hit rate
     baseline_context_hit: float = 0.0
     graphrag_context_hit: float = 0.0
+    # LLM-as-a-Judge
+    llm_only_judge: str = ""  # "PASS" or "FAIL"
+    baseline_judge: str = ""
+    graphrag_judge: str = ""
+    # BERTScore F1
+    llm_only_bertscore: float = 0.0
+    baseline_bertscore: float = 0.0
+    graphrag_bertscore: float = 0.0
+    # RAGAS
     baseline_faithfulness: float = 0.0
     graphrag_faithfulness: float = 0.0
     baseline_relevancy: float = 0.0
316
  graphrag_context_precision: float = 0.0
317
  baseline_context_recall: float = 0.0
318
  graphrag_context_recall: float = 0.0
319
+ # Efficiency metrics
320
+ llm_only_tokens: int = 0
321
  baseline_tokens: int = 0
322
  graphrag_tokens: int = 0
323
+ llm_only_cost: float = 0.0
324
  baseline_cost: float = 0.0
325
  graphrag_cost: float = 0.0
326
+ llm_only_latency: float = 0.0
327
  baseline_latency: float = 0.0
328
  graphrag_latency: float = 0.0
329
  question_type: str = ""
 
 class EvaluationLayer:
     """
     Layer 4: Evaluation Layer.
+    Computes all hackathon-required metrics:
+    - LLM-as-a-Judge (PASS/FAIL) — target >= 90% pass rate
+    - BERTScore F1 — target >= 0.55 rescaled / >= 0.88 raw
+    - F1, EM, Context Hit Rate
+    - RAGAS (optional)
+    - Token efficiency, cost, latency
     """
 
     def __init__(self, eval_llm_model="gpt-4o-mini", api_key=""):
         self.eval_llm_model = eval_llm_model
         self._api_key = api_key
         self._ragas_available = False
+        self._bertscore_available = False
+        self._llm_judge_fn = None
         self.results: List[EvalResult] = []
 
     def initialize(self):
+        """Initialize RAGAS and BERTScore components if available."""
         try:
             from ragas import evaluate, EvaluationDataset, SingleTurnSample
             from ragas.metrics import Faithfulness, AnswerRelevancy
 
         except ImportError:
             logger.warning("RAGAS not installed — using custom metrics only.")
 
+        # Check BERTScore availability
+        try:
+            import evaluate
+            self._bertscore_available = True
+            logger.info("BERTScore available via evaluate library.")
+        except ImportError:
+            try:
+                import bert_score
+                self._bertscore_available = True
+                logger.info("BERTScore available via bert_score library.")
+            except ImportError:
+                logger.warning("BERTScore not installed. Install: pip install evaluate bert-score")
+
+        # Initialize LLM judge function
+        self._init_llm_judge()
+
+    def _init_llm_judge(self):
+        """Initialize the LLM judge function."""
+        try:
+            from openai import OpenAI
+            import os
+            key = self._api_key or os.getenv("OPENAI_API_KEY", "")
+            if key:
+                client = OpenAI(api_key=key)
+                model = self.eval_llm_model
+
+                def judge_fn(messages):
+                    resp = client.chat.completions.create(
+                        model=model, messages=messages,
+                        temperature=0, max_tokens=512,
+                        response_format={"type": "json_object"},
+                    )
+                    from .llm_layer import LLMResponse
+                    return LLMResponse(
+                        content=resp.choices[0].message.content,
+                        input_tokens=resp.usage.prompt_tokens,
+                        output_tokens=resp.usage.completion_tokens,
+                    )
+
+                self._llm_judge_fn = judge_fn
+                logger.info(f"LLM-as-Judge initialized with {model}")
+        except Exception as e:
+            logger.warning(f"LLM-as-Judge not available: {e}")
+
+    def evaluate_sample(
+        self, sample: EvalSample,
+        llm_only_tokens=0, baseline_tokens=0, graphrag_tokens=0,
+        llm_only_cost=0.0, baseline_cost=0.0, graphrag_cost=0.0,
+        llm_only_latency=0.0, baseline_latency=0.0, graphrag_latency=0.0,
+        run_judge=True, run_bertscore=False,
+    ) -> EvalResult:
+        """Evaluate a single sample with all metrics across all 3 pipelines."""
         r = EvalResult(
             query=sample.query,
             question_type=sample.question_type,
             difficulty=sample.difficulty,
+            # F1
+            llm_only_f1=compute_f1(sample.llm_only_answer, sample.reference_answer) if sample.llm_only_answer else 0.0,
             baseline_f1=compute_f1(sample.baseline_answer, sample.reference_answer),
             graphrag_f1=compute_f1(sample.graphrag_answer, sample.reference_answer),
+            # EM
+            llm_only_em=compute_exact_match(sample.llm_only_answer, sample.reference_answer) if sample.llm_only_answer else 0.0,
             baseline_em=compute_exact_match(sample.baseline_answer, sample.reference_answer),
             graphrag_em=compute_exact_match(sample.graphrag_answer, sample.reference_answer),
+            # Context hit
             baseline_context_hit=compute_context_hit_rate(
                 sample.baseline_contexts, sample.supporting_facts),
             graphrag_context_hit=compute_context_hit_rate(
                 sample.graphrag_contexts, sample.supporting_facts),
+            # Efficiency
+            llm_only_tokens=llm_only_tokens,
             baseline_tokens=baseline_tokens, graphrag_tokens=graphrag_tokens,
+            llm_only_cost=llm_only_cost,
             baseline_cost=baseline_cost, graphrag_cost=graphrag_cost,
+            llm_only_latency=llm_only_latency,
             baseline_latency=baseline_latency, graphrag_latency=graphrag_latency,
         )
+
+        # LLM-as-a-Judge
+        if run_judge:
+            for answer_attr, judge_attr in [
+                ("llm_only_answer", "llm_only_judge"),
+                ("baseline_answer", "baseline_judge"),
+                ("graphrag_answer", "graphrag_judge"),
+            ]:
+                answer = getattr(sample, answer_attr, "")
+                if answer:
+                    verdict = compute_llm_judge(
+                        sample.query, sample.reference_answer, answer, self._llm_judge_fn
+                    )
+                    setattr(r, judge_attr, verdict["verdict"])
+
         self.results.append(r)
         return r
 
+    def evaluate_bertscore_batch(
+        self, samples: List[EvalSample], pipeline: str = "graphrag"
+    ) -> Dict[str, Any]:
+        """Run BERTScore on a batch for a specific pipeline."""
+        predictions, references = [], []
+        for s in samples:
+            if pipeline == "llm_only" and s.llm_only_answer:
+                predictions.append(s.llm_only_answer)
+                references.append(s.reference_answer)
+            elif pipeline == "baseline" and s.baseline_answer:
+                predictions.append(s.baseline_answer)
+                references.append(s.reference_answer)
+            elif pipeline == "graphrag" and s.graphrag_answer:
+                predictions.append(s.graphrag_answer)
+                references.append(s.reference_answer)
+
+        if not predictions:
+            return {"f1": [], "mean_f1": 0.0, "pass_rate": 0.0}
+        return compute_bertscore(predictions, references)
+
     def evaluate_batch_ragas(self, samples: List[EvalSample], pipeline="baseline") -> Dict[str, float]:
         """Run RAGAS evaluation on a batch (requires RAGAS + OpenAI key)."""
         if not self._ragas_available:
 
         n = len(self.results)
         avg = lambda vals: sum(vals) / len(vals) if vals else 0.0
 
+        lo = {
+            "avg_f1": round(avg([r.llm_only_f1 for r in self.results]), 4),
+            "avg_em": round(avg([r.llm_only_em for r in self.results]), 4),
+            "avg_tokens": round(avg([r.llm_only_tokens for r in self.results]), 1),
+            "avg_cost": round(avg([r.llm_only_cost for r in self.results]), 6),
+            "avg_latency_ms": round(avg([r.llm_only_latency for r in self.results]), 1),
+            "judge_pass_rate": round(
+                sum(1 for r in self.results if r.llm_only_judge == "PASS") / max(
+                    sum(1 for r in self.results if r.llm_only_judge), 1), 4),
+        }
         b = {
             "avg_f1": round(avg([r.baseline_f1 for r in self.results]), 4),
             "avg_em": round(avg([r.baseline_em for r in self.results]), 4),
 
             "avg_latency_ms": round(avg([r.baseline_latency for r in self.results]), 1),
             "total_tokens": sum(r.baseline_tokens for r in self.results),
             "total_cost": round(sum(r.baseline_cost for r in self.results), 6),
+            "judge_pass_rate": round(
+                sum(1 for r in self.results if r.baseline_judge == "PASS") / max(
+                    sum(1 for r in self.results if r.baseline_judge), 1), 4),
         }
         g = {
             "avg_f1": round(avg([r.graphrag_f1 for r in self.results]), 4),
549
  "avg_latency_ms": round(avg([r.graphrag_latency for r in self.results]), 1),
550
  "total_tokens": sum(r.graphrag_tokens for r in self.results),
551
  "total_cost": round(sum(r.graphrag_cost for r in self.results), 6),
552
+ "judge_pass_rate": round(
553
+ sum(1 for r in self.results if r.graphrag_judge == "PASS") / max(
554
+ sum(1 for r in self.results if r.graphrag_judge), 1), 4),
555
  }
556
 
557
  win_rate = sum(1 for r in self.results if r.graphrag_f1 > r.baseline_f1) / n
 
565
  by_type[qt]["count"] += 1
566
 
567
  return {
568
+ "num_samples": n,
569
+ "llm_only": lo, "baseline": b, "graphrag": g,
570
  "graphrag_f1_win_rate": round(win_rate, 4),
571
+ "token_ratio": round(g.get("total_tokens", 0) / max(b.get("total_tokens", 1), 1), 3),
572
  "by_question_type": {
573
  qt: {"count": d["count"],
574
  "baseline_avg_f1": round(avg(d["baseline_f1"]), 4),
 
578
  }
579
 
580
  def generate_report(self) -> str:
581
+ """Generate a comprehensive text benchmark report."""
582
  m = self.compute_aggregate_metrics()
583
  if "message" in m: return m["message"]
584
  lines = [
585
+ "=" * 70,
586
+ "GRAPHRAG INFERENCE BENCHMARK REPORT (3-PIPELINE)",
587
+ "=" * 70,
588
  f"\nTotal Samples Evaluated: {m['num_samples']}",
589
+ f"\n{'Metric':<25} {'LLM-Only':>12} {'Basic RAG':>12} {'GraphRAG':>12} {'Winner':>12}",
590
+ "-" * 78,
591
  ]
592
+ lo, b, g = m["llm_only"], m["baseline"], m["graphrag"]
593
+
594
+ for name, key in [("Avg F1 Score", "avg_f1"), ("Avg Exact Match", "avg_em")]:
595
+ lov, bv, gv = lo.get(key, 0), b[key], g[key]
596
+ best = max(lov, bv, gv)
597
+ winner = "LLM-Only" if lov == best else ("BasicRAG" if bv == best else "GraphRAG")
598
+ lines.append(f"{name:<25} {lov:>12.4f} {bv:>12.4f} {gv:>12.4f} {winner:>12}")
599
+
600
+ # LLM-as-a-Judge pass rates
601
+ lines.append(f"\n{'LLM-Judge Pass Rate':<25} {lo.get('judge_pass_rate', 0):>11.1%} "
602
+ f"{b.get('judge_pass_rate', 0):>12.1%} {g.get('judge_pass_rate', 0):>12.1%}")
603
+
604
+ lines.append(f"\n{'Metric':<25} {'LLM-Only':>12} {'Basic RAG':>12} {'GraphRAG':>12} {'Ratio G/B':>12}")
605
+ lines.append("-" * 78)
606
  for name, key in [("Avg Tokens/Query", "avg_tokens"), ("Avg Cost ($)", "avg_cost"),
607
  ("Avg Latency (ms)", "avg_latency_ms")]:
608
+ lov, bv, gv = lo.get(key, 0), b[key], g[key]
609
  ratio = gv / bv if bv > 0 else 0
610
+ lines.append(f"{name:<25} {lov:>12.4f} {bv:>12.4f} {gv:>12.4f} {ratio:>11.2f}x")
611
+
612
+ lines.append(f"\nGraphRAG F1 Win Rate vs Basic RAG: {m['graphrag_f1_win_rate']:.1%}")
613
+ lines.append(f"Token Ratio (GraphRAG/BasicRAG): {m['token_ratio']:.2f}x")
614
 
615
+ # Bonus thresholds
616
+ gj = g.get("judge_pass_rate", 0)
617
+ lines.append(f"\n--- Hackathon Bonus Thresholds ---")
618
+ lines.append(f"LLM-Judge Pass Rate (GraphRAG): {gj:.1%} {'✅ BONUS' if gj >= 0.9 else '❌ < 90%'}")
619
 
620
  if m.get("by_question_type"):
621
  lines.extend(["\n--- By Question Type ---",
622
  f"{'Type':<20} {'Count':>6} {'Base F1':>10} {'Graph F1':>10}", "-" * 50])
623
  for qt, d in m["by_question_type"].items():
624
  lines.append(f"{qt:<20} {d['count']:>6} {d['baseline_avg_f1']:>10.4f} {d['graphrag_avg_f1']:>10.4f}")
625
+ lines.append("\n" + "=" * 70)
626
  return "\n".join(lines)