nkshirsa committed
Commit 4b8a9f2 · verified · 1 Parent(s): c98d2ae

Add phd_research_os/evaluation.py

Files changed (1)
  1. phd_research_os/evaluation.py +287 -0
phd_research_os/evaluation.py ADDED
@@ -0,0 +1,287 @@
"""
PhD Research OS — Evaluation Harness (Phase 2)
================================================
Golden dataset evaluation + regression gate.

Metrics:
- Extraction recall (% of real claims found)
- Extraction precision (% of extracted claims that are real)
- Epistemic tag accuracy (% correctly classified)
- Hallucination rate (% of claims with no source basis)
- Confidence calibration (correlation: assigned vs. human scores)
"""

import json
import os
from datetime import datetime
from pathlib import Path
from typing import Optional
from dataclasses import dataclass, field


@dataclass
class EvalMetrics:
    """Evaluation metrics for a single paper."""
    paper_id: str
    extraction_recall: float = 0.0       # % of real claims found
    extraction_precision: float = 0.0    # % of extracted claims that are real
    epistemic_accuracy: float = 0.0      # % correctly classified
    hallucination_rate: float = 0.0      # % of claims with no source basis
    confidence_correlation: float = 0.0  # Pearson r: assigned vs. human
    f1_score: float = 0.0

    def to_dict(self):
        return {
            "paper_id": self.paper_id,
            "extraction_recall": round(self.extraction_recall, 4),
            "extraction_precision": round(self.extraction_precision, 4),
            "f1_score": round(self.f1_score, 4),
            "epistemic_accuracy": round(self.epistemic_accuracy, 4),
            "hallucination_rate": round(self.hallucination_rate, 4),
            "confidence_correlation": round(self.confidence_correlation, 4),
        }


@dataclass
class RegressionResult:
    """Result of a regression gate check."""
    passed: bool
    metrics: dict
    thresholds: dict
    failures: list = field(default_factory=list)


# Regression thresholds (Phase 2 spec)
REGRESSION_THRESHOLDS = {
    "extraction_recall": 0.70,       # ≥ 70%
    "hallucination_rate_max": 0.10,  # ≤ 10%
    "epistemic_accuracy": 0.60,      # ≥ 60%
}


def load_golden_dataset(path: str = "tests/golden_dataset") -> dict:
    """
    Load the golden dataset from JSON files.

    Expected structure:
        tests/golden_dataset/
        ├── paper_1.json
        ├── paper_2.json
        └── ...

    Each file contains:
        {
            "paper_id": "...",
            "title": "...",
            "claims": [
                {
                    "text": "...",
                    "epistemic_tag": "Fact|Interpretation|...",
                    "confidence": 0.85,
                    "source_sentences": ["..."]  # ground-truth evidence
                }
            ]
        }
    """
    golden = {}
    golden_path = Path(path)

    if not golden_path.exists():
        print(f"Warning: Golden dataset path {path} does not exist")
        return golden

    for file in golden_path.glob("*.json"):
        with open(file) as f:
            data = json.load(f)
            golden[data["paper_id"]] = data

    return golden

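
# Illustrative only: a minimal golden file in the shape load_golden_dataset()
# expects. The paper id, title, and claim below are made-up placeholders, not
# part of the Phase 2 spec.
#
# tests/golden_dataset/example_paper.json:
# {
#   "paper_id": "example_paper",
#   "title": "An Example Paper",
#   "claims": [
#     {
#       "text": "Method X improves accuracy on benchmark Y.",
#       "epistemic_tag": "Fact",
#       "confidence": 0.9,
#       "source_sentences": ["We observe an accuracy gain on Y."]
#     }
#   ]
# }
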
def evaluate_extraction(golden_claims: list, extracted_claims: list,
                        similarity_threshold: float = 0.8) -> EvalMetrics:
    """
    Compare extracted claims against golden standard.

    Uses text overlap as similarity metric (can be upgraded to embedding similarity).
    """
    metrics = EvalMetrics(paper_id="")

    if not golden_claims:
        return metrics

    # Simple text overlap matching
    matched_golden = set()
    matched_extracted = set()
    correct_epistemic = 0
    hallucinated = 0

    for i, ext in enumerate(extracted_claims):
        ext_text = ext.get("text", "").lower()
        best_match = -1
        best_score = 0

        for j, gold in enumerate(golden_claims):
            gold_text = gold.get("text", "").lower()

            # Jaccard similarity on word sets
            ext_words = set(ext_text.split())
            gold_words = set(gold_text.split())

            if not ext_words or not gold_words:
                continue

            intersection = ext_words & gold_words
            union = ext_words | gold_words
            score = len(intersection) / len(union) if union else 0

            if score > best_score:
                best_score = score
                best_match = j

        if best_score >= similarity_threshold and best_match >= 0:
            matched_golden.add(best_match)
            matched_extracted.add(i)

            # Check epistemic tag
            if ext.get("epistemic_tag") == golden_claims[best_match].get("epistemic_tag"):
                correct_epistemic += 1
        elif best_score < 0.3:  # Very low match → likely hallucination
            hallucinated += 1

    # Calculate metrics
    n_golden = len(golden_claims)
    n_extracted = len(extracted_claims)
    n_matched = len(matched_golden)

    metrics.extraction_recall = n_matched / n_golden if n_golden > 0 else 0
    metrics.extraction_precision = len(matched_extracted) / n_extracted if n_extracted > 0 else 0

    if metrics.extraction_recall + metrics.extraction_precision > 0:
        metrics.f1_score = (2 * metrics.extraction_recall * metrics.extraction_precision /
                            (metrics.extraction_recall + metrics.extraction_precision))

    metrics.epistemic_accuracy = correct_epistemic / n_matched if n_matched > 0 else 0
    metrics.hallucination_rate = hallucinated / n_extracted if n_extracted > 0 else 0

    # Confidence calibration (Pearson correlation)
    if n_matched >= 3:
        assigned = []
        human = []
        for i in matched_extracted:
            ext = extracted_claims[i]
            # Find the matched golden claim
            ext_text = ext.get("text", "").lower()
            for j in matched_golden:
                gold = golden_claims[j]
                gold_text = gold.get("text", "").lower()
                ext_words = set(ext_text.split())
                gold_words = set(gold_text.split())
                union = ext_words | gold_words
                score = len(ext_words & gold_words) / len(union) if union else 0
                if score >= similarity_threshold:
                    assigned.append(float(ext.get("confidence", 0.5)))
                    human.append(float(gold.get("confidence", 0.5)))
                    break

        if len(assigned) >= 3:
            # Simple Pearson correlation
            n = len(assigned)
            mean_a = sum(assigned) / n
            mean_h = sum(human) / n

            cov = sum((a - mean_a) * (h - mean_h) for a, h in zip(assigned, human)) / n
            std_a = (sum((a - mean_a) ** 2 for a in assigned) / n) ** 0.5
            std_h = (sum((h - mean_h) ** 2 for h in human) / n) ** 0.5

            if std_a > 0 and std_h > 0:
                metrics.confidence_correlation = cov / (std_a * std_h)

    return metrics

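
# Worked example of the matching rule above (illustrative, not part of the pipeline):
#   extracted: "the model improves accuracy on benchmark y"  -> 7 distinct words
#   golden:    "the model improves accuracy on y"            -> 6 distinct words
#   intersection = 6 words, union = 7 words, Jaccard = 6/7 ≈ 0.86,
# so with the default similarity_threshold of 0.8 this pair counts as a match,
# while a best score below 0.3 would count the extracted claim as a hallucination.
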
def run_regression_gate(golden_path: str = "tests/golden_dataset",
                        pipeline_results: Optional[dict] = None) -> RegressionResult:
    """
    Regression gate: checks whether the current pipeline meets the minimum thresholds.

    Must PASS before any config/prompt change is committed.

    Thresholds (Phase 2 spec):
    - Extraction recall: ≥ 70%
    - Hallucination rate: ≤ 10%
    - Epistemic accuracy: ≥ 60%
    """
    golden = load_golden_dataset(golden_path)

    if not golden:
        return RegressionResult(
            passed=False,
            metrics={},
            thresholds=REGRESSION_THRESHOLDS,
            failures=["No golden dataset found"]
        )

    all_metrics = {}
    failures = []

    for paper_id, gold_data in golden.items():
        # Get extracted claims for this paper (from pipeline_results or DB)
        extracted = pipeline_results.get(paper_id, []) if pipeline_results else []

        metrics = evaluate_extraction(gold_data["claims"], extracted)
        metrics.paper_id = paper_id
        all_metrics[paper_id] = metrics.to_dict()

        # Check thresholds
        if metrics.extraction_recall < REGRESSION_THRESHOLDS["extraction_recall"]:
            failures.append(f"{paper_id}: recall {metrics.extraction_recall:.2%} < {REGRESSION_THRESHOLDS['extraction_recall']:.0%}")
        if metrics.hallucination_rate > REGRESSION_THRESHOLDS["hallucination_rate_max"]:
            failures.append(f"{paper_id}: hallucination {metrics.hallucination_rate:.2%} > {REGRESSION_THRESHOLDS['hallucination_rate_max']:.0%}")
        if metrics.epistemic_accuracy < REGRESSION_THRESHOLDS["epistemic_accuracy"]:
            failures.append(f"{paper_id}: epistemic accuracy {metrics.epistemic_accuracy:.2%} < {REGRESSION_THRESHOLDS['epistemic_accuracy']:.0%}")

    # Aggregate metrics
    if all_metrics:
        avg_metrics = {}
        for key in ["extraction_recall", "extraction_precision", "f1_score",
                    "epistemic_accuracy", "hallucination_rate", "confidence_correlation"]:
            values = [m[key] for m in all_metrics.values()]
            avg_metrics[key] = sum(values) / len(values)
        all_metrics["_average"] = avg_metrics

    passed = len(failures) == 0

    return RegressionResult(
        passed=passed,
        metrics=all_metrics,
        thresholds=REGRESSION_THRESHOLDS,
        failures=failures
    )


def create_golden_paper(paper_id: str, title: str, claims: list,
                        output_path: str = "tests/golden_dataset"):
    """
    Helper to create a golden dataset paper entry.

    Args:
        paper_id: Unique identifier
        title: Paper title
        claims: List of dicts with text, epistemic_tag, confidence, source_sentences
        output_path: Where to save
    """
    os.makedirs(output_path, exist_ok=True)

    data = {
        "paper_id": paper_id,
        "title": title,
        "claims": claims,
        "created_at": datetime.now().isoformat(),
        "schema_version": "1.0"
    }

    filepath = os.path.join(output_path, f"{paper_id}.json")
    with open(filepath, "w") as f:
        json.dump(data, f, indent=2)

    print(f"Golden paper saved: {filepath} ({len(claims)} claims)")