Add phd_research_os/evaluation.py
phd_research_os/evaluation.py  +287 -0
phd_research_os/evaluation.py
ADDED
@@ -0,0 +1,287 @@
"""
PhD Research OS — Evaluation Harness (Phase 2)
================================================
Golden dataset evaluation + regression gate.

Metrics:
- Extraction recall (% of real claims found)
- Extraction precision (% of extracted claims that are real)
- Epistemic tag accuracy (% correctly classified)
- Hallucination rate (% of claims with no source basis)
- Confidence calibration (correlation: assigned vs human scores)
"""

import json
import os
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Optional


@dataclass
class EvalMetrics:
    """Evaluation metrics for a single paper."""
    paper_id: str
    extraction_recall: float = 0.0       # % of real claims found
    extraction_precision: float = 0.0    # % of extracted claims that are real
    epistemic_accuracy: float = 0.0      # % correctly classified
    hallucination_rate: float = 0.0      # % of claims with no source basis
    confidence_correlation: float = 0.0  # Pearson r: assigned vs human
    f1_score: float = 0.0

    def to_dict(self):
        return {
            "paper_id": self.paper_id,
            "extraction_recall": round(self.extraction_recall, 4),
            "extraction_precision": round(self.extraction_precision, 4),
            "f1_score": round(self.f1_score, 4),
            "epistemic_accuracy": round(self.epistemic_accuracy, 4),
            "hallucination_rate": round(self.hallucination_rate, 4),
            "confidence_correlation": round(self.confidence_correlation, 4),
        }


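# Illustrative example (not part of the module API): how recall and precision combine
# into the stored F1. With recall 0.75 and precision 0.60,
# F1 = 2 * 0.75 * 0.60 / (0.75 + 0.60) ≈ 0.667. The numbers below are made up.
#
#     m = EvalMetrics(paper_id="demo", extraction_recall=0.75,
#                     extraction_precision=0.60, f1_score=0.6667)
#     m.to_dict()
#     # {"paper_id": "demo", "extraction_recall": 0.75, "extraction_precision": 0.6,
#     #  "f1_score": 0.6667, "epistemic_accuracy": 0.0, "hallucination_rate": 0.0,
#     #  "confidence_correlation": 0.0}
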
@dataclass
class RegressionResult:
    """Result of regression gate check."""
    passed: bool
    metrics: dict
    thresholds: dict
    failures: list = field(default_factory=list)


# Regression thresholds (Phase 2 spec)
REGRESSION_THRESHOLDS = {
    "extraction_recall": 0.70,       # ≥ 70%
    "hallucination_rate_max": 0.10,  # ≤ 10%
    "epistemic_accuracy": 0.60,      # ≥ 60%
}


def load_golden_dataset(path: str = "tests/golden_dataset") -> dict:
    """
    Load golden dataset from JSON files.

    Expected structure:
        tests/golden_dataset/
        ├── paper_1.json
        ├── paper_2.json
        └── ...

    Each file contains:
        {
            "paper_id": "...",
            "title": "...",
            "claims": [
                {
                    "text": "...",
                    "epistemic_tag": "Fact|Interpretation|...",
                    "confidence": 0.85,
                    "source_sentences": ["..."],  # ground truth evidence
                }
            ]
        }
    """
    golden = {}
    golden_path = Path(path)

    if not golden_path.exists():
        print(f"Warning: Golden dataset path {path} does not exist")
        return golden

    for file in golden_path.glob("*.json"):
        with open(file) as f:
            data = json.load(f)
            golden[data["paper_id"]] = data

    return golden


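# Illustrative golden entry (hypothetical content): a file such as
# tests/golden_dataset/smith2021_attention.json would be picked up by
# load_golden_dataset() and keyed by its "paper_id". The paper ID, title, and
# claim below are invented for illustration only.
#
#     {
#       "paper_id": "smith2021_attention",
#       "title": "Attention Is Not All You Need (example)",
#       "claims": [
#         {
#           "text": "Model accuracy drops 12% when attention heads are pruned.",
#           "epistemic_tag": "Fact",
#           "confidence": 0.9,
#           "source_sentences": ["Pruning all heads reduced accuracy from 84% to 72%."]
#         }
#       ]
#     }
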
def evaluate_extraction(golden_claims: list, extracted_claims: list,
                        similarity_threshold: float = 0.8) -> EvalMetrics:
    """
    Compare extracted claims against golden standard.

    Uses text overlap as similarity metric (can be upgraded to embedding similarity).
    """
    metrics = EvalMetrics(paper_id="")

    if not golden_claims:
        return metrics

    # Simple text overlap matching
    matched_golden = set()
    matched_extracted = set()
    correct_epistemic = 0
    hallucinated = 0

    for i, ext in enumerate(extracted_claims):
        ext_text = ext.get("text", "").lower()
        ext_words = set(ext_text.split())
        best_match = -1
        best_score = 0

        for j, gold in enumerate(golden_claims):
            gold_text = gold.get("text", "").lower()

            # Jaccard similarity on word sets
            gold_words = set(gold_text.split())

            if not ext_words or not gold_words:
                continue

            intersection = ext_words & gold_words
            union = ext_words | gold_words
            score = len(intersection) / len(union) if union else 0

            if score > best_score:
                best_score = score
                best_match = j

        if best_score >= similarity_threshold and best_match >= 0:
            matched_golden.add(best_match)
            matched_extracted.add(i)

            # Check epistemic tag
            if ext.get("epistemic_tag") == golden_claims[best_match].get("epistemic_tag"):
                correct_epistemic += 1
        elif best_score < 0.3:  # Very low match → likely hallucination
            hallucinated += 1

    # Calculate metrics
    n_golden = len(golden_claims)
    n_extracted = len(extracted_claims)
    n_matched_golden = len(matched_golden)
    n_matched_extracted = len(matched_extracted)

    metrics.extraction_recall = n_matched_golden / n_golden if n_golden > 0 else 0
    metrics.extraction_precision = n_matched_extracted / n_extracted if n_extracted > 0 else 0

    if metrics.extraction_recall + metrics.extraction_precision > 0:
        metrics.f1_score = (2 * metrics.extraction_recall * metrics.extraction_precision /
                            (metrics.extraction_recall + metrics.extraction_precision))

    # Accuracy is over matched *extracted* claims, so it stays in [0, 1] even when
    # several extracted claims map to the same golden claim.
    metrics.epistemic_accuracy = correct_epistemic / n_matched_extracted if n_matched_extracted > 0 else 0
    metrics.hallucination_rate = hallucinated / n_extracted if n_extracted > 0 else 0

    # Confidence calibration (Pearson correlation)
    if n_matched_extracted >= 3:
        assigned = []
        human = []
        for i in matched_extracted:
            ext = extracted_claims[i]
            # Find the matched golden claim again
            ext_text = ext.get("text", "").lower()
            ext_words = set(ext_text.split())
            for j in matched_golden:
                gold = golden_claims[j]
                gold_text = gold.get("text", "").lower()
                gold_words = set(gold_text.split())
                union = ext_words | gold_words
                score = len(ext_words & gold_words) / len(union) if union else 0
                if score >= similarity_threshold:
                    assigned.append(float(ext.get("confidence", 0.5)))
                    human.append(float(gold.get("confidence", 0.5)))
                    break

        if len(assigned) >= 3:
            # Simple Pearson correlation
            n = len(assigned)
            mean_a = sum(assigned) / n
            mean_h = sum(human) / n

            cov = sum((a - mean_a) * (h - mean_h) for a, h in zip(assigned, human)) / n
            std_a = (sum((a - mean_a) ** 2 for a in assigned) / n) ** 0.5
            std_h = (sum((h - mean_h) ** 2 for h in human) / n) ** 0.5

            if std_a > 0 and std_h > 0:
                metrics.confidence_correlation = cov / (std_a * std_h)

    return metrics


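# Worked example (illustrative, made-up sentences): with the default Jaccard
# threshold of 0.8, an extracted claim must share most of its words with a golden
# claim to count as a match; anything below 0.3 overlap counts as a hallucination.
#
#     golden = [{"text": "The drug reduced mortality by 20%", "epistemic_tag": "Fact",
#                "confidence": 0.9}]
#     extracted = [{"text": "the drug reduced mortality by 20%", "epistemic_tag": "Fact",
#                   "confidence": 0.85},
#                  {"text": "unrelated invented statement about something else",
#                   "epistemic_tag": "Interpretation", "confidence": 0.4}]
#     m = evaluate_extraction(golden, extracted)
#     # recall = 1.0 (the one golden claim was found), precision = 0.5,
#     # hallucination_rate = 0.5, epistemic_accuracy = 1.0
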
def run_regression_gate(golden_path: str = "tests/golden_dataset",
                        pipeline_results: Optional[dict] = None) -> RegressionResult:
    """
    Regression gate: checks if current pipeline meets minimum thresholds.

    Must PASS before any config/prompt change is committed.

    Thresholds (Phase 2 spec):
    - Extraction recall: ≥ 70%
    - Hallucination rate: ≤ 10%
    - Epistemic accuracy: ≥ 60%
    """
    golden = load_golden_dataset(golden_path)

    if not golden:
        return RegressionResult(
            passed=False,
            metrics={},
            thresholds=REGRESSION_THRESHOLDS,
            failures=["No golden dataset found"]
        )

    all_metrics = {}
    failures = []

    for paper_id, gold_data in golden.items():
        # Get extracted claims for this paper (from pipeline_results or DB)
        extracted = pipeline_results.get(paper_id, []) if pipeline_results else []

        metrics = evaluate_extraction(gold_data["claims"], extracted)
        metrics.paper_id = paper_id
        all_metrics[paper_id] = metrics.to_dict()

        # Check thresholds
        if metrics.extraction_recall < REGRESSION_THRESHOLDS["extraction_recall"]:
            failures.append(f"{paper_id}: recall {metrics.extraction_recall:.2%} < {REGRESSION_THRESHOLDS['extraction_recall']:.0%}")
        if metrics.hallucination_rate > REGRESSION_THRESHOLDS["hallucination_rate_max"]:
            failures.append(f"{paper_id}: hallucination {metrics.hallucination_rate:.2%} > {REGRESSION_THRESHOLDS['hallucination_rate_max']:.0%}")
        if metrics.epistemic_accuracy < REGRESSION_THRESHOLDS["epistemic_accuracy"]:
            failures.append(f"{paper_id}: epistemic accuracy {metrics.epistemic_accuracy:.2%} < {REGRESSION_THRESHOLDS['epistemic_accuracy']:.0%}")

    # Aggregate metrics
    if all_metrics:
        avg_metrics = {}
        for key in ["extraction_recall", "extraction_precision", "f1_score",
                    "epistemic_accuracy", "hallucination_rate", "confidence_correlation"]:
            values = [m[key] for m in all_metrics.values()]
            avg_metrics[key] = sum(values) / len(values)
        all_metrics["_average"] = avg_metrics

    passed = len(failures) == 0

    return RegressionResult(
        passed=passed,
        metrics=all_metrics,
        thresholds=REGRESSION_THRESHOLDS,
        failures=failures
    )


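# Usage sketch for the gate in CI or a pre-commit hook (illustrative; run_pipeline
# and golden_paper_ids are hypothetical names for your extraction step, not part of
# this module):
#
#     results = {pid: run_pipeline(pid) for pid in golden_paper_ids}
#     gate = run_regression_gate("tests/golden_dataset", pipeline_results=results)
#     if not gate.passed:
#         for failure in gate.failures:
#             print("FAIL:", failure)
#         raise SystemExit(1)  # block the commit / fail the CI job
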
def create_golden_paper(paper_id: str, title: str, claims: list,
                        output_path: str = "tests/golden_dataset"):
    """
    Helper to create a golden dataset paper entry.

    Args:
        paper_id: Unique identifier
        title: Paper title
        claims: List of dicts with text, epistemic_tag, confidence, source_sentences
        output_path: Where to save
    """
    os.makedirs(output_path, exist_ok=True)

    data = {
        "paper_id": paper_id,
        "title": title,
        "claims": claims,
        "created_at": datetime.now().isoformat(),
        "schema_version": "1.0"
    }

    filepath = os.path.join(output_path, f"{paper_id}.json")
    with open(filepath, "w") as f:
        json.dump(data, f, indent=2)

    print(f"Golden paper saved: {filepath} ({len(claims)} claims)")
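

# Optional CLI entry point — a minimal sketch, not required by the Phase 2 spec.
# Running the module directly evaluates the golden dataset without any pipeline
# results supplied (so the gate will typically report failures) and prints the
# outcome as JSON; it mainly verifies the wiring end to end.
if __name__ == "__main__":
    result = run_regression_gate()
    print(json.dumps({
        "passed": result.passed,
        "failures": result.failures,
        "metrics": result.metrics,
    }, indent=2))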