| """ |
| PhD Research OS — Evaluation Harness (Phase 2) |
| ================================================ |
| Golden dataset evaluation + regression gate. |
| |
| Metrics: |
| - Extraction recall (% of real claims found) |
| - Extraction precision (% of extracted claims that are real) |
| - Epistemic tag accuracy (% correctly classified) |
| - Hallucination rate (% of claims with no source basis) |
| - Confidence calibration (correlation: assigned vs human scores) |
| """ |
|
|
| import json |
| import os |
| from pathlib import Path |
| from typing import Optional |
| from dataclasses import dataclass, field |
|
|
|
|
@dataclass
class EvalMetrics:
    """Evaluation metrics for a single paper."""
    paper_id: str
    extraction_recall: float = 0.0
    extraction_precision: float = 0.0
    epistemic_accuracy: float = 0.0
    hallucination_rate: float = 0.0
    confidence_correlation: float = 0.0
    f1_score: float = 0.0

    def to_dict(self):
        return {
            "paper_id": self.paper_id,
            "extraction_recall": round(self.extraction_recall, 4),
            "extraction_precision": round(self.extraction_precision, 4),
            "f1_score": round(self.f1_score, 4),
            "epistemic_accuracy": round(self.epistemic_accuracy, 4),
            "hallucination_rate": round(self.hallucination_rate, 4),
            "confidence_correlation": round(self.confidence_correlation, 4),
        }


@dataclass
class RegressionResult:
    """Result of regression gate check."""
    passed: bool
    metrics: dict
    thresholds: dict
    failures: list = field(default_factory=list)

# Minimum quality bar enforced by the regression gate (Phase 2 spec).
# "extraction_recall" and "epistemic_accuracy" are minimums;
# "hallucination_rate_max" is a maximum.
REGRESSION_THRESHOLDS = {
    "extraction_recall": 0.70,
    "hallucination_rate_max": 0.10,
    "epistemic_accuracy": 0.60,
}

def load_golden_dataset(path: str = "tests/golden_dataset") -> dict:
    """
    Load golden dataset from JSON files.

    Expected structure:
        tests/golden_dataset/
        ├── paper_1.json
        ├── paper_2.json
        └── ...

    Each file contains:
        {
            "paper_id": "...",
            "title": "...",
            "claims": [
                {
                    "text": "...",
                    "epistemic_tag": "Fact|Interpretation|...",
                    "confidence": 0.85,
                    "source_sentences": ["..."],  # ground truth evidence
                }
            ]
        }
    """
    golden = {}
    golden_path = Path(path)

    if not golden_path.exists():
        print(f"Warning: Golden dataset path {path} does not exist")
        return golden

    for file in golden_path.glob("*.json"):
        with open(file) as f:
            data = json.load(f)
        golden[data["paper_id"]] = data

    return golden
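
# Illustrative sketch (not part of the pipeline): builds one golden file in a
# temporary directory, using the structure documented above, and loads it back
# with load_golden_dataset(). All ids, claim texts, and tag values here are
# made-up placeholders.
def _golden_dataset_roundtrip_example() -> dict:
    import tempfile

    entry = {
        "paper_id": "example_paper",
        "title": "An Example Paper",
        "claims": [
            {
                "text": "Method X improves accuracy on benchmark Y.",
                "epistemic_tag": "Interpretation",
                "confidence": 0.8,
                "source_sentences": ["We observe higher accuracy on Y when using X."],
            }
        ],
    }
    with tempfile.TemporaryDirectory() as tmp:
        with open(os.path.join(tmp, "example_paper.json"), "w") as f:
            json.dump(entry, f, indent=2)
        golden = load_golden_dataset(tmp)  # -> {"example_paper": {...}}
    return golden
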
def evaluate_extraction(golden_claims: list, extracted_claims: list,
                        similarity_threshold: float = 0.8) -> EvalMetrics:
    """
    Compare extracted claims against the golden standard.

    Uses word-level Jaccard overlap as the similarity metric
    (can be upgraded to embedding similarity).
    """
    metrics = EvalMetrics(paper_id="")

    if not golden_claims:
        return metrics

    match_pairs = {}        # extracted claim index -> best-matching golden claim index
    matched_golden = set()  # golden claim indices covered by at least one extraction
    correct_epistemic = 0
    hallucinated = 0

    for i, ext in enumerate(extracted_claims):
        ext_text = ext.get("text", "").lower()
        ext_words = set(ext_text.split())
        best_match = -1
        best_score = 0.0

        for j, gold in enumerate(golden_claims):
            gold_text = gold.get("text", "").lower()
            gold_words = set(gold_text.split())

            if not ext_words or not gold_words:
                continue

            # Jaccard similarity over word sets
            intersection = ext_words & gold_words
            union = ext_words | gold_words
            score = len(intersection) / len(union) if union else 0

            if score > best_score:
                best_score = score
                best_match = j

        if best_score >= similarity_threshold and best_match >= 0:
            match_pairs[i] = best_match
            matched_golden.add(best_match)

            # Epistemic accuracy is judged only on matched claims
            if ext.get("epistemic_tag") == golden_claims[best_match].get("epistemic_tag"):
                correct_epistemic += 1
        elif best_score < 0.3:
            # Nothing in the golden set is even loosely similar -> hallucination
            hallucinated += 1

    n_golden = len(golden_claims)
    n_extracted = len(extracted_claims)
    n_matched = len(matched_golden)

    metrics.extraction_recall = n_matched / n_golden if n_golden > 0 else 0
    metrics.extraction_precision = len(match_pairs) / n_extracted if n_extracted > 0 else 0

    if metrics.extraction_recall + metrics.extraction_precision > 0:
        metrics.f1_score = (2 * metrics.extraction_recall * metrics.extraction_precision /
                            (metrics.extraction_recall + metrics.extraction_precision))

    metrics.epistemic_accuracy = correct_epistemic / len(match_pairs) if match_pairs else 0
    metrics.hallucination_rate = hallucinated / n_extracted if n_extracted > 0 else 0

    # Confidence calibration: Pearson correlation between assigned and human
    # confidence over the matched pairs (needs at least 3 points to be meaningful).
    if len(match_pairs) >= 3:
        assigned = []
        human = []
        for i, j in match_pairs.items():
            assigned.append(float(extracted_claims[i].get("confidence", 0.5)))
            human.append(float(golden_claims[j].get("confidence", 0.5)))

        n = len(assigned)
        mean_a = sum(assigned) / n
        mean_h = sum(human) / n

        cov = sum((a - mean_a) * (h - mean_h) for a, h in zip(assigned, human)) / n
        std_a = (sum((a - mean_a) ** 2 for a in assigned) / n) ** 0.5
        std_h = (sum((h - mean_h) ** 2 for h in human) / n) ** 0.5

        if std_a > 0 and std_h > 0:
            metrics.confidence_correlation = cov / (std_a * std_h)

    return metrics
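
# Illustrative sketch (not part of the pipeline): shows how the word-overlap
# matching above behaves on toy inputs. The claim texts, tags, and confidence
# values are invented purely for demonstration.
def _evaluate_extraction_example() -> EvalMetrics:
    golden = [
        {"text": "Model accuracy improves with more training data",
         "epistemic_tag": "Fact", "confidence": 0.9},
        {"text": "The authors suggest the effect generalises to other domains",
         "epistemic_tag": "Interpretation", "confidence": 0.6},
    ]
    extracted = [
        # Near-verbatim match -> counts toward recall and precision
        {"text": "Model accuracy improves with more training data",
         "epistemic_tag": "Fact", "confidence": 0.85},
        # Almost no word overlap with any golden claim -> counted as hallucinated
        {"text": "The dataset contains ten million labelled images",
         "epistemic_tag": "Fact", "confidence": 0.7},
    ]
    # Expected: recall 0.5, precision 0.5, hallucination_rate 0.5
    return evaluate_extraction(golden, extracted)
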
def run_regression_gate(golden_path: str = "tests/golden_dataset",
                        pipeline_results: Optional[dict] = None) -> RegressionResult:
    """
    Regression gate: checks whether the current pipeline meets minimum thresholds.

    Must PASS before any config/prompt change is committed.

    Thresholds (Phase 2 spec):
    - Extraction recall: ≥ 70%
    - Hallucination rate: ≤ 10%
    - Epistemic accuracy: ≥ 60%
    """
    golden = load_golden_dataset(golden_path)

    if not golden:
        return RegressionResult(
            passed=False,
            metrics={},
            thresholds=REGRESSION_THRESHOLDS,
            failures=["No golden dataset found"],
        )

    all_metrics = {}
    failures = []

    for paper_id, gold_data in golden.items():
        extracted = pipeline_results.get(paper_id, []) if pipeline_results else []

        metrics = evaluate_extraction(gold_data["claims"], extracted)
        metrics.paper_id = paper_id
        all_metrics[paper_id] = metrics.to_dict()

        # Per-paper threshold checks
        if metrics.extraction_recall < REGRESSION_THRESHOLDS["extraction_recall"]:
            failures.append(f"{paper_id}: recall {metrics.extraction_recall:.2%} < "
                            f"{REGRESSION_THRESHOLDS['extraction_recall']:.0%}")
        if metrics.hallucination_rate > REGRESSION_THRESHOLDS["hallucination_rate_max"]:
            failures.append(f"{paper_id}: hallucination {metrics.hallucination_rate:.2%} > "
                            f"{REGRESSION_THRESHOLDS['hallucination_rate_max']:.0%}")
        if metrics.epistemic_accuracy < REGRESSION_THRESHOLDS["epistemic_accuracy"]:
            failures.append(f"{paper_id}: epistemic accuracy {metrics.epistemic_accuracy:.2%} < "
                            f"{REGRESSION_THRESHOLDS['epistemic_accuracy']:.0%}")

    # Dataset-wide averages across papers
    if all_metrics:
        avg_metrics = {}
        for key in ["extraction_recall", "extraction_precision", "f1_score",
                    "epistemic_accuracy", "hallucination_rate", "confidence_correlation"]:
            values = [m[key] for m in all_metrics.values()]
            avg_metrics[key] = sum(values) / len(values)
        all_metrics["_average"] = avg_metrics

    passed = len(failures) == 0

    return RegressionResult(
        passed=passed,
        metrics=all_metrics,
        thresholds=REGRESSION_THRESHOLDS,
        failures=failures,
    )
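
# Illustrative sketch (not part of the pipeline): one way the gate could be
# wired into a CI or pre-commit check. `run_pipeline_on_golden_papers` is a
# hypothetical hook; substitute whatever produces {paper_id: [claims]} in your
# setup.
def _ci_gate_example() -> int:
    # from pipeline import run_pipeline_on_golden_papers  # hypothetical import
    # results = run_pipeline_on_golden_papers()
    results = {}  # placeholder: empty results will fail the gate

    gate = run_regression_gate(pipeline_results=results)
    if not gate.passed:
        for failure in gate.failures:
            print(f"REGRESSION: {failure}")
        return 1  # non-zero exit blocks the commit / fails the CI job
    print("Regression gate passed.")
    print(json.dumps(gate.metrics.get("_average", {}), indent=2))
    return 0
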
def create_golden_paper(paper_id: str, title: str, claims: list,
                        output_path: str = "tests/golden_dataset"):
    """
    Helper to create a golden dataset paper entry.

    Args:
        paper_id: Unique identifier
        title: Paper title
        claims: List of dicts with text, epistemic_tag, confidence, source_sentences
        output_path: Where to save
    """
    os.makedirs(output_path, exist_ok=True)

    data = {
        "paper_id": paper_id,
        "title": title,
        "claims": claims,
        "created_at": datetime.now().isoformat(),
        "schema_version": "1.0",
    }

    filepath = os.path.join(output_path, f"{paper_id}.json")
    with open(filepath, "w") as f:
        json.dump(data, f, indent=2)

    print(f"Golden paper saved: {filepath} ({len(claims)} claims)")