"""
PhD Research OS — Evaluation Harness (Phase 2)
================================================
Golden dataset evaluation + regression gate.
Metrics:
- Extraction recall (% of real claims found)
- Extraction precision (% of extracted claims that are real)
- Epistemic tag accuracy (% correctly classified)
- Hallucination rate (% of claims with no source basis)
- Confidence calibration (correlation: assigned vs human scores)
"""
import json
import os
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Optional
@dataclass
class EvalMetrics:
"""Evaluation metrics for a single paper."""
paper_id: str
extraction_recall: float = 0.0 # % of real claims found
extraction_precision: float = 0.0 # % of extracted claims that are real
epistemic_accuracy: float = 0.0 # % correctly classified
hallucination_rate: float = 0.0 # % of claims with no source basis
confidence_correlation: float = 0.0 # Pearson r: assigned vs human
f1_score: float = 0.0
def to_dict(self):
return {
"paper_id": self.paper_id,
"extraction_recall": round(self.extraction_recall, 4),
"extraction_precision": round(self.extraction_precision, 4),
"f1_score": round(self.f1_score, 4),
"epistemic_accuracy": round(self.epistemic_accuracy, 4),
"hallucination_rate": round(self.hallucination_rate, 4),
"confidence_correlation": round(self.confidence_correlation, 4),
}
@dataclass
class RegressionResult:
"""Result of regression gate check."""
passed: bool
metrics: dict
thresholds: dict
failures: list = field(default_factory=list)
# Regression thresholds (Phase 2 spec)
REGRESSION_THRESHOLDS = {
"extraction_recall": 0.70, # ≥ 70%
"hallucination_rate_max": 0.10, # ≤ 10%
"epistemic_accuracy": 0.60, # ≥ 60%
}
def load_golden_dataset(path: str = "tests/golden_dataset") -> dict:
"""
Load golden dataset from JSON files.
Expected structure:
tests/golden_dataset/
├── paper_1.json
├── paper_2.json
└── ...
Each file contains:
{
"paper_id": "...",
"title": "...",
"claims": [
{
"text": "...",
"epistemic_tag": "Fact|Interpretation|...",
"confidence": 0.85,
"source_sentences": ["..."], # ground truth evidence
}
]
}
"""
golden = {}
golden_path = Path(path)
if not golden_path.exists():
print(f"Warning: Golden dataset path {path} does not exist")
return golden
for file in golden_path.glob("*.json"):
with open(file) as f:
data = json.load(f)
golden[data["paper_id"]] = data
return golden
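# Example (a minimal sketch; it assumes golden JSON files already exist under the
# default path, which may not be true in a fresh checkout):
#
#   golden = load_golden_dataset("tests/golden_dataset")
#   for paper_id, data in golden.items():
#       print(paper_id, len(data["claims"]), "claims")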
def evaluate_extraction(golden_claims: list, extracted_claims: list,
similarity_threshold: float = 0.8) -> EvalMetrics:
"""
Compare extracted claims against golden standard.
Uses text overlap as similarity metric (can be upgraded to embedding similarity).
"""
metrics = EvalMetrics(paper_id="")
if not golden_claims:
return metrics
    # Simple text-overlap matching
    matched_golden = set()
    matched_extracted = set()
    match_pairs = {}  # extracted index -> index of its best-matching golden claim
    correct_epistemic = 0
    hallucinated = 0
for i, ext in enumerate(extracted_claims):
ext_text = ext.get("text", "").lower()
best_match = -1
best_score = 0
for j, gold in enumerate(golden_claims):
gold_text = gold.get("text", "").lower()
# Jaccard similarity on word sets
ext_words = set(ext_text.split())
gold_words = set(gold_text.split())
if not ext_words or not gold_words:
continue
intersection = ext_words & gold_words
union = ext_words | gold_words
score = len(intersection) / len(union) if union else 0
if score > best_score:
best_score = score
best_match = j
        if best_score >= similarity_threshold and best_match >= 0:
            matched_golden.add(best_match)
            matched_extracted.add(i)
            match_pairs[i] = best_match
            # Check epistemic tag against the matched golden claim
            if ext.get("epistemic_tag") == golden_claims[best_match].get("epistemic_tag"):
                correct_epistemic += 1
elif best_score < 0.3: # Very low match → likely hallucination
hallucinated += 1
# Calculate metrics
n_golden = len(golden_claims)
n_extracted = len(extracted_claims)
n_matched = len(matched_golden)
metrics.extraction_recall = n_matched / n_golden if n_golden > 0 else 0
metrics.extraction_precision = len(matched_extracted) / n_extracted if n_extracted > 0 else 0
if metrics.extraction_recall + metrics.extraction_precision > 0:
metrics.f1_score = (2 * metrics.extraction_recall * metrics.extraction_precision /
(metrics.extraction_recall + metrics.extraction_precision))
metrics.epistemic_accuracy = correct_epistemic / n_matched if n_matched > 0 else 0
metrics.hallucination_rate = hallucinated / n_extracted if n_extracted > 0 else 0
    # Confidence calibration (Pearson correlation between assigned and human scores),
    # computed over the pairs matched above instead of re-running the similarity search.
    if n_matched >= 3:
        assigned = []
        human = []
        for i, j in match_pairs.items():
            assigned.append(float(extracted_claims[i].get("confidence", 0.5)))
            human.append(float(golden_claims[j].get("confidence", 0.5)))
if len(assigned) >= 3:
# Simple Pearson correlation
n = len(assigned)
mean_a = sum(assigned) / n
mean_h = sum(human) / n
cov = sum((a - mean_a) * (h - mean_h) for a, h in zip(assigned, human)) / n
std_a = (sum((a - mean_a)**2 for a in assigned) / n) ** 0.5
std_h = (sum((h - mean_h)**2 for h in human) / n) ** 0.5
if std_a > 0 and std_h > 0:
metrics.confidence_correlation = cov / (std_a * std_h)
return metrics
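# Example (an illustrative sketch with hypothetical claims; real runs compare pipeline
# output against golden files, so the texts and tags below are made up):
#
#   golden = [{"text": "model X reduces error by 12%",
#              "epistemic_tag": "Fact", "confidence": 0.9}]
#   extracted = [{"text": "Model X reduces error by 12%",
#                 "epistemic_tag": "Fact", "confidence": 0.8}]
#   m = evaluate_extraction(golden, extracted)
#   # identical word sets -> Jaccard 1.0, so recall, precision, and F1 are all 1.0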
def run_regression_gate(golden_path: str = "tests/golden_dataset",
                        pipeline_results: Optional[dict] = None) -> RegressionResult:
"""
Regression gate: checks if current pipeline meets minimum thresholds.
Must PASS before any config/prompt change is committed.
Thresholds (Phase 2 spec):
- Extraction recall: ≥ 70%
- Hallucination rate: ≤ 10%
- Epistemic accuracy: ≥ 60%
"""
golden = load_golden_dataset(golden_path)
if not golden:
return RegressionResult(
passed=False,
metrics={},
thresholds=REGRESSION_THRESHOLDS,
failures=["No golden dataset found"]
)
all_metrics = {}
failures = []
for paper_id, gold_data in golden.items():
# Get extracted claims for this paper (from pipeline_results or DB)
extracted = pipeline_results.get(paper_id, []) if pipeline_results else []
metrics = evaluate_extraction(gold_data["claims"], extracted)
metrics.paper_id = paper_id
all_metrics[paper_id] = metrics.to_dict()
# Check thresholds
if metrics.extraction_recall < REGRESSION_THRESHOLDS["extraction_recall"]:
failures.append(f"{paper_id}: recall {metrics.extraction_recall:.2%} < {REGRESSION_THRESHOLDS['extraction_recall']:.0%}")
if metrics.hallucination_rate > REGRESSION_THRESHOLDS["hallucination_rate_max"]:
failures.append(f"{paper_id}: hallucination {metrics.hallucination_rate:.2%} > {REGRESSION_THRESHOLDS['hallucination_rate_max']:.0%}")
if metrics.epistemic_accuracy < REGRESSION_THRESHOLDS["epistemic_accuracy"]:
failures.append(f"{paper_id}: epistemic accuracy {metrics.epistemic_accuracy:.2%} < {REGRESSION_THRESHOLDS['epistemic_accuracy']:.0%}")
# Aggregate metrics
if all_metrics:
avg_metrics = {}
for key in ["extraction_recall", "extraction_precision", "f1_score",
"epistemic_accuracy", "hallucination_rate", "confidence_correlation"]:
values = [m[key] for m in all_metrics.values()]
avg_metrics[key] = sum(values) / len(values)
all_metrics["_average"] = avg_metrics
passed = len(failures) == 0
return RegressionResult(
passed=passed,
metrics=all_metrics,
thresholds=REGRESSION_THRESHOLDS,
failures=failures
)
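# Example (a sketch of wiring the gate into a pre-commit check or test; the
# `my_pipeline.extract_claims` call is a placeholder for the project's real
# extraction entry point, not an existing function in this module):
#
#   results = {pid: my_pipeline.extract_claims(pid) for pid in load_golden_dataset()}
#   gate = run_regression_gate(pipeline_results=results)
#   assert gate.passed, "Regression gate failed:\n" + "\n".join(gate.failures)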
def create_golden_paper(paper_id: str, title: str, claims: list,
output_path: str = "tests/golden_dataset"):
"""
Helper to create a golden dataset paper entry.
Args:
paper_id: Unique identifier
title: Paper title
claims: List of dicts with text, epistemic_tag, confidence, source_sentences
output_path: Where to save
"""
os.makedirs(output_path, exist_ok=True)
data = {
"paper_id": paper_id,
"title": title,
"claims": claims,
"created_at": __import__('datetime').datetime.now().isoformat(),
"schema_version": "1.0"
}
filepath = os.path.join(output_path, f"{paper_id}.json")
with open(filepath, "w") as f:
json.dump(data, f, indent=2)
print(f"Golden paper saved: {filepath} ({len(claims)} claims)")