"""
PhD Research OS — Evaluation Harness (Phase 2)
================================================
Golden dataset evaluation + regression gate.
Metrics:
- Extraction recall (% of real claims found)
- Extraction precision (% of extracted claims that are real)
- Epistemic tag accuracy (% correctly classified)
- Hallucination rate (% of claims with no source basis)
- Confidence calibration (correlation: assigned vs human scores)
"""
import json
import os
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Optional
@dataclass
class EvalMetrics:
"""Evaluation metrics for a single paper."""
paper_id: str
extraction_recall: float = 0.0 # % of real claims found
extraction_precision: float = 0.0 # % of extracted claims that are real
epistemic_accuracy: float = 0.0 # % correctly classified
hallucination_rate: float = 0.0 # % of claims with no source basis
confidence_correlation: float = 0.0 # Pearson r: assigned vs human
f1_score: float = 0.0
def to_dict(self):
return {
"paper_id": self.paper_id,
"extraction_recall": round(self.extraction_recall, 4),
"extraction_precision": round(self.extraction_precision, 4),
"f1_score": round(self.f1_score, 4),
"epistemic_accuracy": round(self.epistemic_accuracy, 4),
"hallucination_rate": round(self.hallucination_rate, 4),
"confidence_correlation": round(self.confidence_correlation, 4),
}
@dataclass
class RegressionResult:
"""Result of regression gate check."""
passed: bool
metrics: dict
thresholds: dict
failures: list = field(default_factory=list)
# Regression thresholds (Phase 2 spec)
REGRESSION_THRESHOLDS = {
"extraction_recall": 0.70, # ≥ 70%
"hallucination_rate_max": 0.10, # ≤ 10%
"epistemic_accuracy": 0.60, # ≥ 60%
}
def load_golden_dataset(path: str = "tests/golden_dataset") -> dict:
"""
Load golden dataset from JSON files.
Expected structure:
tests/golden_dataset/
├── paper_1.json
├── paper_2.json
└── ...
Each file contains:
{
"paper_id": "...",
"title": "...",
"claims": [
{
"text": "...",
"epistemic_tag": "Fact|Interpretation|...",
"confidence": 0.85,
"source_sentences": ["..."], # ground truth evidence
}
]
}
"""
golden = {}
golden_path = Path(path)
if not golden_path.exists():
print(f"Warning: Golden dataset path {path} does not exist")
return golden
for file in golden_path.glob("*.json"):
with open(file) as f:
data = json.load(f)
golden[data["paper_id"]] = data
return golden
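# Example (a minimal sketch; it assumes golden JSON files already exist under the
# default path, which may not be true in a fresh checkout):
#
#   golden = load_golden_dataset("tests/golden_dataset")
#   for paper_id, data in golden.items():
#       print(paper_id, len(data["claims"]), "claims")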
def evaluate_extraction(golden_claims: list, extracted_claims: list,
similarity_threshold: float = 0.8) -> EvalMetrics:
"""
Compare extracted claims against golden standard.
Uses text overlap as similarity metric (can be upgraded to embedding similarity).
"""
metrics = EvalMetrics(paper_id="")
if not golden_claims:
return metrics
    # Simple text-overlap matching
    matched_golden = set()
    matched_extracted = set()
    match_pairs = {}  # extracted index -> index of its best-matching golden claim
    correct_epistemic = 0
    hallucinated = 0
for i, ext in enumerate(extracted_claims):
ext_text = ext.get("text", "").lower()
best_match = -1
best_score = 0
for j, gold in enumerate(golden_claims):
gold_text = gold.get("text", "").lower()
# Jaccard similarity on word sets
ext_words = set(ext_text.split())
gold_words = set(gold_text.split())
if not ext_words or not gold_words:
continue
intersection = ext_words & gold_words
union = ext_words | gold_words
score = len(intersection) / len(union) if union else 0
if score > best_score:
best_score = score
best_match = j
        if best_score >= similarity_threshold and best_match >= 0:
            matched_golden.add(best_match)
            matched_extracted.add(i)
            match_pairs[i] = best_match
            # Check epistemic tag against the matched golden claim
            if ext.get("epistemic_tag") == golden_claims[best_match].get("epistemic_tag"):
                correct_epistemic += 1
elif best_score < 0.3: # Very low match → likely hallucination
hallucinated += 1
# Calculate metrics
n_golden = len(golden_claims)
n_extracted = len(extracted_claims)
n_matched = len(matched_golden)
metrics.extraction_recall = n_matched / n_golden if n_golden > 0 else 0
metrics.extraction_precision = len(matched_extracted) / n_extracted if n_extracted > 0 else 0
if metrics.extraction_recall + metrics.extraction_precision > 0:
metrics.f1_score = (2 * metrics.extraction_recall * metrics.extraction_precision /
(metrics.extraction_recall + metrics.extraction_precision))
metrics.epistemic_accuracy = correct_epistemic / n_matched if n_matched > 0 else 0
metrics.hallucination_rate = hallucinated / n_extracted if n_extracted > 0 else 0
    # Confidence calibration (Pearson correlation between assigned and human scores),
    # computed over the pairs matched above instead of re-running the similarity search.
    if n_matched >= 3:
        assigned = []
        human = []
        for i, j in match_pairs.items():
            assigned.append(float(extracted_claims[i].get("confidence", 0.5)))
            human.append(float(golden_claims[j].get("confidence", 0.5)))
if len(assigned) >= 3:
# Simple Pearson correlation
n = len(assigned)
mean_a = sum(assigned) / n
mean_h = sum(human) / n
cov = sum((a - mean_a) * (h - mean_h) for a, h in zip(assigned, human)) / n
std_a = (sum((a - mean_a)**2 for a in assigned) / n) ** 0.5
std_h = (sum((h - mean_h)**2 for h in human) / n) ** 0.5
if std_a > 0 and std_h > 0:
metrics.confidence_correlation = cov / (std_a * std_h)
return metrics
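# Example (an illustrative sketch with hypothetical claims; real runs compare pipeline
# output against golden files, so the texts and tags below are made up):
#
#   golden = [{"text": "model X reduces error by 12%",
#              "epistemic_tag": "Fact", "confidence": 0.9}]
#   extracted = [{"text": "Model X reduces error by 12%",
#                 "epistemic_tag": "Fact", "confidence": 0.8}]
#   m = evaluate_extraction(golden, extracted)
#   # identical word sets -> Jaccard 1.0, so recall, precision, and F1 are all 1.0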
def run_regression_gate(golden_path: str = "tests/golden_dataset",
                        pipeline_results: Optional[dict] = None) -> RegressionResult:
"""
Regression gate: checks if current pipeline meets minimum thresholds.
Must PASS before any config/prompt change is committed.
Thresholds (Phase 2 spec):
- Extraction recall: ≥ 70%
- Hallucination rate: ≤ 10%
- Epistemic accuracy: ≥ 60%
"""
golden = load_golden_dataset(golden_path)
if not golden:
return RegressionResult(
passed=False,
metrics={},
thresholds=REGRESSION_THRESHOLDS,
failures=["No golden dataset found"]
)
all_metrics = {}
failures = []
for paper_id, gold_data in golden.items():
# Get extracted claims for this paper (from pipeline_results or DB)
extracted = pipeline_results.get(paper_id, []) if pipeline_results else []
metrics = evaluate_extraction(gold_data["claims"], extracted)
metrics.paper_id = paper_id
all_metrics[paper_id] = metrics.to_dict()
# Check thresholds
if metrics.extraction_recall < REGRESSION_THRESHOLDS["extraction_recall"]:
failures.append(f"{paper_id}: recall {metrics.extraction_recall:.2%} < {REGRESSION_THRESHOLDS['extraction_recall']:.0%}")
if metrics.hallucination_rate > REGRESSION_THRESHOLDS["hallucination_rate_max"]:
failures.append(f"{paper_id}: hallucination {metrics.hallucination_rate:.2%} > {REGRESSION_THRESHOLDS['hallucination_rate_max']:.0%}")
if metrics.epistemic_accuracy < REGRESSION_THRESHOLDS["epistemic_accuracy"]:
failures.append(f"{paper_id}: epistemic accuracy {metrics.epistemic_accuracy:.2%} < {REGRESSION_THRESHOLDS['epistemic_accuracy']:.0%}")
# Aggregate metrics
if all_metrics:
avg_metrics = {}
for key in ["extraction_recall", "extraction_precision", "f1_score",
"epistemic_accuracy", "hallucination_rate", "confidence_correlation"]:
values = [m[key] for m in all_metrics.values()]
avg_metrics[key] = sum(values) / len(values)
all_metrics["_average"] = avg_metrics
passed = len(failures) == 0
return RegressionResult(
passed=passed,
metrics=all_metrics,
thresholds=REGRESSION_THRESHOLDS,
failures=failures
)
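# Example (a sketch of wiring the gate into a pre-commit check or test; the
# `my_pipeline.extract_claims` call is a placeholder for the project's real
# extraction entry point, not an existing function in this module):
#
#   results = {pid: my_pipeline.extract_claims(pid) for pid in load_golden_dataset()}
#   gate = run_regression_gate(pipeline_results=results)
#   assert gate.passed, "Regression gate failed:\n" + "\n".join(gate.failures)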
def create_golden_paper(paper_id: str, title: str, claims: list,
output_path: str = "tests/golden_dataset"):
"""
Helper to create a golden dataset paper entry.
Args:
paper_id: Unique identifier
title: Paper title
claims: List of dicts with text, epistemic_tag, confidence, source_sentences
output_path: Where to save
"""
os.makedirs(output_path, exist_ok=True)
data = {
"paper_id": paper_id,
"title": title,
"claims": claims,
"created_at": __import__('datetime').datetime.now().isoformat(),
"schema_version": "1.0"
}
filepath = os.path.join(output_path, f"{paper_id}.json")
with open(filepath, "w") as f:
json.dump(data, f, indent=2)
print(f"Golden paper saved: {filepath} ({len(claims)} claims)")