""" PhD Research OS — Evaluation Harness (Phase 2) ================================================ Golden dataset evaluation + regression gate. Metrics: - Extraction recall (% of real claims found) - Extraction precision (% of extracted claims that are real) - Epistemic tag accuracy (% correctly classified) - Hallucination rate (% of claims with no source basis) - Confidence calibration (correlation: assigned vs human scores) """ import json import os from pathlib import Path from typing import Optional from dataclasses import dataclass, field @dataclass class EvalMetrics: """Evaluation metrics for a single paper.""" paper_id: str extraction_recall: float = 0.0 # % of real claims found extraction_precision: float = 0.0 # % of extracted claims that are real epistemic_accuracy: float = 0.0 # % correctly classified hallucination_rate: float = 0.0 # % of claims with no source basis confidence_correlation: float = 0.0 # Pearson r: assigned vs human f1_score: float = 0.0 def to_dict(self): return { "paper_id": self.paper_id, "extraction_recall": round(self.extraction_recall, 4), "extraction_precision": round(self.extraction_precision, 4), "f1_score": round(self.f1_score, 4), "epistemic_accuracy": round(self.epistemic_accuracy, 4), "hallucination_rate": round(self.hallucination_rate, 4), "confidence_correlation": round(self.confidence_correlation, 4), } @dataclass class RegressionResult: """Result of regression gate check.""" passed: bool metrics: dict thresholds: dict failures: list = field(default_factory=list) # Regression thresholds (Phase 2 spec) REGRESSION_THRESHOLDS = { "extraction_recall": 0.70, # ≥ 70% "hallucination_rate_max": 0.10, # ≤ 10% "epistemic_accuracy": 0.60, # ≥ 60% } def load_golden_dataset(path: str = "tests/golden_dataset") -> dict: """ Load golden dataset from JSON files. Expected structure: tests/golden_dataset/ ├── paper_1.json ├── paper_2.json └── ... Each file contains: { "paper_id": "...", "title": "...", "claims": [ { "text": "...", "epistemic_tag": "Fact|Interpretation|...", "confidence": 0.85, "source_sentences": ["..."], # ground truth evidence } ] } """ golden = {} golden_path = Path(path) if not golden_path.exists(): print(f"Warning: Golden dataset path {path} does not exist") return golden for file in golden_path.glob("*.json"): with open(file) as f: data = json.load(f) golden[data["paper_id"]] = data return golden def evaluate_extraction(golden_claims: list, extracted_claims: list, similarity_threshold: float = 0.8) -> EvalMetrics: """ Compare extracted claims against golden standard. Uses text overlap as similarity metric (can be upgraded to embedding similarity). 
""" metrics = EvalMetrics(paper_id="") if not golden_claims: return metrics # Simple text overlap matching matched_golden = set() matched_extracted = set() correct_epistemic = 0 hallucinated = 0 for i, ext in enumerate(extracted_claims): ext_text = ext.get("text", "").lower() best_match = -1 best_score = 0 for j, gold in enumerate(golden_claims): gold_text = gold.get("text", "").lower() # Jaccard similarity on word sets ext_words = set(ext_text.split()) gold_words = set(gold_text.split()) if not ext_words or not gold_words: continue intersection = ext_words & gold_words union = ext_words | gold_words score = len(intersection) / len(union) if union else 0 if score > best_score: best_score = score best_match = j if best_score >= similarity_threshold and best_match >= 0: matched_golden.add(best_match) matched_extracted.add(i) # Check epistemic tag if ext.get("epistemic_tag") == golden_claims[best_match].get("epistemic_tag"): correct_epistemic += 1 elif best_score < 0.3: # Very low match → likely hallucination hallucinated += 1 # Calculate metrics n_golden = len(golden_claims) n_extracted = len(extracted_claims) n_matched = len(matched_golden) metrics.extraction_recall = n_matched / n_golden if n_golden > 0 else 0 metrics.extraction_precision = len(matched_extracted) / n_extracted if n_extracted > 0 else 0 if metrics.extraction_recall + metrics.extraction_precision > 0: metrics.f1_score = (2 * metrics.extraction_recall * metrics.extraction_precision / (metrics.extraction_recall + metrics.extraction_precision)) metrics.epistemic_accuracy = correct_epistemic / n_matched if n_matched > 0 else 0 metrics.hallucination_rate = hallucinated / n_extracted if n_extracted > 0 else 0 # Confidence calibration (Pearson correlation) if n_matched >= 3: assigned = [] human = [] for i in matched_extracted: ext = extracted_claims[i] # Find matched golden ext_text = ext.get("text", "").lower() for j in matched_golden: gold = golden_claims[j] gold_text = gold.get("text", "").lower() ext_words = set(ext_text.split()) gold_words = set(gold_text.split()) union = ext_words | gold_words score = len(ext_words & gold_words) / len(union) if union else 0 if score >= similarity_threshold: assigned.append(float(ext.get("confidence", 0.5))) human.append(float(gold.get("confidence", 0.5))) break if len(assigned) >= 3: # Simple Pearson correlation n = len(assigned) mean_a = sum(assigned) / n mean_h = sum(human) / n cov = sum((a - mean_a) * (h - mean_h) for a, h in zip(assigned, human)) / n std_a = (sum((a - mean_a)**2 for a in assigned) / n) ** 0.5 std_h = (sum((h - mean_h)**2 for h in human) / n) ** 0.5 if std_a > 0 and std_h > 0: metrics.confidence_correlation = cov / (std_a * std_h) return metrics def run_regression_gate(golden_path: str = "tests/golden_dataset", pipeline_results: dict = None) -> RegressionResult: """ Regression gate: checks if current pipeline meets minimum thresholds. Must PASS before any config/prompt change is committed. 
def run_regression_gate(golden_path: str = "tests/golden_dataset",
                        pipeline_results: Optional[dict] = None) -> RegressionResult:
    """
    Regression gate: checks whether the current pipeline meets minimum
    thresholds. Must PASS before any config/prompt change is committed.

    Thresholds (Phase 2 spec):
    - Extraction recall: ≥ 70%
    - Hallucination rate: ≤ 10%
    - Epistemic accuracy: ≥ 60%
    """
    golden = load_golden_dataset(golden_path)
    if not golden:
        return RegressionResult(
            passed=False,
            metrics={},
            thresholds=REGRESSION_THRESHOLDS,
            failures=["No golden dataset found"],
        )

    all_metrics = {}
    failures = []

    for paper_id, gold_data in golden.items():
        # Get extracted claims for this paper (from pipeline_results or DB)
        extracted = pipeline_results.get(paper_id, []) if pipeline_results else []
        metrics = evaluate_extraction(gold_data["claims"], extracted)
        metrics.paper_id = paper_id
        all_metrics[paper_id] = metrics.to_dict()

        # Check thresholds
        if metrics.extraction_recall < REGRESSION_THRESHOLDS["extraction_recall"]:
            failures.append(
                f"{paper_id}: recall {metrics.extraction_recall:.2%} "
                f"< {REGRESSION_THRESHOLDS['extraction_recall']:.0%}"
            )
        if metrics.hallucination_rate > REGRESSION_THRESHOLDS["hallucination_rate_max"]:
            failures.append(
                f"{paper_id}: hallucination {metrics.hallucination_rate:.2%} "
                f"> {REGRESSION_THRESHOLDS['hallucination_rate_max']:.0%}"
            )
        if metrics.epistemic_accuracy < REGRESSION_THRESHOLDS["epistemic_accuracy"]:
            failures.append(
                f"{paper_id}: epistemic accuracy {metrics.epistemic_accuracy:.2%} "
                f"< {REGRESSION_THRESHOLDS['epistemic_accuracy']:.0%}"
            )

    # Aggregate metrics across papers
    if all_metrics:
        avg_metrics = {}
        for key in ["extraction_recall", "extraction_precision", "f1_score",
                    "epistemic_accuracy", "hallucination_rate",
                    "confidence_correlation"]:
            values = [m[key] for m in all_metrics.values()]
            avg_metrics[key] = sum(values) / len(values)
        all_metrics["_average"] = avg_metrics

    passed = len(failures) == 0
    return RegressionResult(
        passed=passed,
        metrics=all_metrics,
        thresholds=REGRESSION_THRESHOLDS,
        failures=failures,
    )


def create_golden_paper(paper_id: str, title: str, claims: list,
                        output_path: str = "tests/golden_dataset"):
    """
    Helper to create a golden-dataset paper entry.

    Args:
        paper_id: Unique identifier.
        title: Paper title.
        claims: List of dicts with text, epistemic_tag, confidence,
            source_sentences.
        output_path: Directory to save into.
    """
    os.makedirs(output_path, exist_ok=True)
    data = {
        "paper_id": paper_id,
        "title": title,
        "claims": claims,
        "created_at": datetime.now().isoformat(),
        "schema_version": "1.0",
    }
    filepath = os.path.join(output_path, f"{paper_id}.json")
    with open(filepath, "w") as f:
        json.dump(data, f, indent=2)
    print(f"Golden paper saved: {filepath} ({len(claims)} claims)")
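
# --- CLI entry point (minimal sketch) ------------------------------------------
# Running the module directly exercises the regression gate against the default
# golden dataset path. A real invocation would pass the extraction pipeline's
# per-paper claim lists as pipeline_results instead of leaving it None.
if __name__ == "__main__":
    result = run_regression_gate()
    print("Regression gate:", "PASSED" if result.passed else "FAILED")
    for failure in result.failures:
        print(f"  - {failure}")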