muthuk1 committed
Commit ddfbb09 · verified · 1 Parent(s): df00339

Fix #9: Update benchmark runner for 3-pipeline comparison + LLM-as-a-Judge + BERTScore evaluation

Files changed (1)
  1. graphrag/benchmark.py +91 -32
graphrag/benchmark.py CHANGED
@@ -1,32 +1,43 @@
 """
-Benchmark Runner — Runs both pipelines on HotpotQA and evaluates
-=================================================================
+Benchmark Runner — Runs all 3 pipelines on HotpotQA and evaluates
+==================================================================
+Pipeline 1: LLM-Only (no retrieval)
+Pipeline 2: Basic RAG (vector search + LLM)
+Pipeline 3: GraphRAG (TigerGraph + novelty engine)
+
+Evaluates with: F1, EM, LLM-as-a-Judge, BERTScore, Context Hit Rate
 """
 import json
 import logging
-from typing import Dict, List
+from typing import Dict, List, Optional
 from .layers.orchestration_layer import InferenceOrchestrator
-from .layers.evaluation_layer import EvaluationLayer, EvalSample
+from .layers.evaluation_layer import (
+    EvaluationLayer, EvalSample, compute_bertscore
+)
 
 logger = logging.getLogger(__name__)
 
 
 class BenchmarkRunner:
-    """Runs benchmarks on HotpotQA and generates comparison metrics."""
+    """Runs benchmarks on HotpotQA with all 3 pipelines and generates comparison metrics."""
 
     def __init__(self, orchestrator, evaluator):
         self.orchestrator = orchestrator
         self.evaluator = evaluator
         self.benchmark_results = []
+        self.eval_samples: List[EvalSample] = []
 
     def run_hotpotqa_benchmark(self, num_samples=100, split="validation",
-                               top_k=5, hops=2, progress_callback=None):
-        """Run both pipelines on HotpotQA and evaluate."""
+                               top_k=5, hops=2, progress_callback=None,
+                               run_judge=True, run_bertscore=True):
+        """Run all 3 pipelines on HotpotQA and evaluate."""
         from datasets import load_dataset
         logger.info(f"Loading HotpotQA ({split}, n={num_samples})...")
         ds = load_dataset("hotpotqa/hotpot_qa", "distractor", split=split)
 
         results = []
+        self.eval_samples = []
+
         for idx in range(min(num_samples, len(ds))):
             row = ds[idx]
             query, gold = row["question"], row["answer"]
@@ -45,40 +56,69 @@ class BenchmarkRunner:
                     sf.append(cs[si])
 
             try:
-                comp = self.orchestrator.run_comparison(query, passages, top_k, hops)
+                # Run all 3 pipelines
+                lo = self.orchestrator.run_llm_only(query)
+                b = self.orchestrator.run_baseline_rag(query, passages, top_k)
+                g = self.orchestrator.run_graphrag(query, passages, hops=hops)
 
                 sample = EvalSample(
                     query=query, reference_answer=gold,
-                    baseline_answer=comp.baseline.answer,
-                    graphrag_answer=comp.graphrag.answer,
-                    baseline_contexts=comp.baseline.contexts,
-                    graphrag_contexts=comp.graphrag.contexts,
+                    llm_only_answer=lo.answer,
+                    baseline_answer=b.answer,
+                    graphrag_answer=g.answer,
+                    baseline_contexts=b.contexts,
+                    graphrag_contexts=g.contexts,
                     question_type=qtype, difficulty=str(level),
                     supporting_facts=sf)
+                self.eval_samples.append(sample)
 
                 er = self.evaluator.evaluate_sample(
                     sample,
-                    comp.baseline.total_tokens, comp.graphrag.total_tokens,
-                    comp.baseline.cost_usd, comp.graphrag.cost_usd,
-                    comp.baseline.latency_ms, comp.graphrag.latency_ms)
+                    llm_only_tokens=lo.total_tokens,
+                    baseline_tokens=b.total_tokens,
+                    graphrag_tokens=g.total_tokens,
+                    llm_only_cost=lo.cost_usd,
+                    baseline_cost=b.cost_usd,
+                    graphrag_cost=g.cost_usd,
+                    llm_only_latency=lo.latency_ms,
+                    baseline_latency=b.latency_ms,
+                    graphrag_latency=g.latency_ms,
+                    run_judge=run_judge,
+                )
 
                 rd = {
                     "idx": idx, "query": query, "gold_answer": gold,
                     "question_type": qtype, "level": level,
-                    "baseline_answer": comp.baseline.answer,
-                    "graphrag_answer": comp.graphrag.answer,
-                    "baseline_f1": er.baseline_f1, "graphrag_f1": er.graphrag_f1,
-                    "baseline_em": er.baseline_em, "graphrag_em": er.graphrag_em,
-                    "baseline_tokens": comp.baseline.total_tokens,
-                    "graphrag_tokens": comp.graphrag.total_tokens,
-                    "baseline_cost": comp.baseline.cost_usd,
-                    "graphrag_cost": comp.graphrag.cost_usd,
-                    "baseline_latency": comp.baseline.latency_ms,
-                    "graphrag_latency": comp.graphrag.latency_ms,
+                    # Answers
+                    "llm_only_answer": lo.answer,
+                    "baseline_answer": b.answer,
+                    "graphrag_answer": g.answer,
+                    # F1 / EM
+                    "llm_only_f1": er.llm_only_f1,
+                    "baseline_f1": er.baseline_f1,
+                    "graphrag_f1": er.graphrag_f1,
+                    "llm_only_em": er.llm_only_em,
+                    "baseline_em": er.baseline_em,
+                    "graphrag_em": er.graphrag_em,
+                    # LLM-as-Judge
+                    "llm_only_judge": er.llm_only_judge,
+                    "baseline_judge": er.baseline_judge,
+                    "graphrag_judge": er.graphrag_judge,
+                    # Tokens / Cost / Latency
+                    "llm_only_tokens": lo.total_tokens,
+                    "baseline_tokens": b.total_tokens,
+                    "graphrag_tokens": g.total_tokens,
+                    "llm_only_cost": lo.cost_usd,
+                    "baseline_cost": b.cost_usd,
+                    "graphrag_cost": g.cost_usd,
+                    "llm_only_latency": lo.latency_ms,
+                    "baseline_latency": b.latency_ms,
+                    "graphrag_latency": g.latency_ms,
+                    # Context
                     "baseline_context_hit": er.baseline_context_hit,
                     "graphrag_context_hit": er.graphrag_context_hit,
-                    "entities_found": len(comp.graphrag.entities_found),
-                    "relations_traversed": len(comp.graphrag.relations_traversed),
+                    "entities_found": len(g.entities_found),
+                    "relations_traversed": len(g.relations_traversed),
                 }
                 results.append(rd)
                 self.benchmark_results.append(rd)
@@ -91,10 +131,28 @@ class BenchmarkRunner:
             except Exception as e:
                 logger.error(f"Error on query {idx}: {e}")
 
+        # Run BERTScore on full batch (more efficient than per-sample)
+        bertscore_results = {}
+        if run_bertscore and self.eval_samples:
+            logger.info("Computing BERTScore for all pipelines...")
+            for pipe in ["llm_only", "baseline", "graphrag"]:
+                try:
+                    bs = self.evaluator.evaluate_bertscore_batch(self.eval_samples, pipeline=pipe)
+                    bertscore_results[pipe] = bs
+                    logger.info(f"  {pipe}: mean_f1={bs.get('mean_f1', 0):.4f}, pass_rate={bs.get('pass_rate', 0):.1%}")
+                except Exception as e:
+                    logger.warning(f"  BERTScore for {pipe} failed: {e}")
+
         aggregate = self.evaluator.compute_aggregate_metrics()
         report = self.evaluator.generate_report()
-        return {"results": results, "aggregate": aggregate, "report": report,
-                "num_completed": len(results), "num_requested": num_samples}
+        return {
+            "results": results,
+            "aggregate": aggregate,
+            "bertscore": bertscore_results,
+            "report": report,
+            "num_completed": len(results),
+            "num_requested": num_samples,
+        }
 
     def get_results_dataframe(self):
         import pandas as pd
@@ -102,6 +160,7 @@ class BenchmarkRunner:
 
     def save_results(self, filepath):
         with open(filepath, 'w') as f:
-            json.dump({"results": self.benchmark_results,
-                       "aggregate": self.evaluator.compute_aggregate_metrics()},
-                      f, indent=2, default=str)
+            json.dump({
+                "results": self.benchmark_results,
+                "aggregate": self.evaluator.compute_aggregate_metrics(),
+            }, f, indent=2, default=str)
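For orientation, a minimal usage sketch of the updated runner follows. Only the BenchmarkRunner methods shown in this diff are taken from the source; the no-argument InferenceOrchestrator() and EvaluationLayer() constructors and the output filename are assumptions for illustration.

# Hypothetical usage sketch — orchestrator/evaluator construction is assumed;
# only BenchmarkRunner's methods appear in this diff.
from graphrag.benchmark import BenchmarkRunner
from graphrag.layers.orchestration_layer import InferenceOrchestrator
from graphrag.layers.evaluation_layer import EvaluationLayer

orchestrator = InferenceOrchestrator()   # assumed no-arg construction
evaluator = EvaluationLayer()            # assumed no-arg construction
runner = BenchmarkRunner(orchestrator, evaluator)

# Run LLM-only, basic RAG, and GraphRAG on a small HotpotQA sample,
# with LLM-as-a-Judge scoring and batch BERTScore enabled.
summary = runner.run_hotpotqa_benchmark(num_samples=10, top_k=5, hops=2,
                                        run_judge=True, run_bertscore=True)
print(summary["aggregate"])      # per-pipeline F1 / EM / judge aggregates
print(summary["bertscore"])      # per-pipeline batch BERTScore results
runner.save_results("benchmark_results.json")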