"""
PhD Research OS — Evaluation Harness (Phase 2)
================================================
Golden dataset evaluation + regression gate.

Metrics:
  - Extraction recall (% of real claims found)
  - Extraction precision (% of extracted claims that are real)
  - Epistemic tag accuracy (% correctly classified)
  - Hallucination rate (% of claims with no source basis)
  - Confidence calibration (correlation: assigned vs human scores)
"""

import json
import os
from pathlib import Path
from typing import Optional
from dataclasses import dataclass, field


@dataclass
class EvalMetrics:
    """Evaluation metrics for a single paper."""
    paper_id: str
    extraction_recall: float = 0.0     # % of real claims found
    extraction_precision: float = 0.0  # % of extracted claims that are real
    epistemic_accuracy: float = 0.0    # % correctly classified
    hallucination_rate: float = 0.0    # % of claims with no source basis
    confidence_correlation: float = 0.0  # Pearson r: assigned vs human
    f1_score: float = 0.0
    
    def to_dict(self):
        return {
            "paper_id": self.paper_id,
            "extraction_recall": round(self.extraction_recall, 4),
            "extraction_precision": round(self.extraction_precision, 4),
            "f1_score": round(self.f1_score, 4),
            "epistemic_accuracy": round(self.epistemic_accuracy, 4),
            "hallucination_rate": round(self.hallucination_rate, 4),
            "confidence_correlation": round(self.confidence_correlation, 4),
        }


@dataclass
class RegressionResult:
    """Result of regression gate check."""
    passed: bool
    metrics: dict
    thresholds: dict
    failures: list = field(default_factory=list)


# Regression thresholds (Phase 2 spec)
REGRESSION_THRESHOLDS = {
    "extraction_recall": 0.70,      # ≥ 70%
    "hallucination_rate_max": 0.10,  # ≤ 10%
    "epistemic_accuracy": 0.60,     # ≥ 60%
}


def load_golden_dataset(path: str = "tests/golden_dataset") -> dict:
    """
    Load golden dataset from JSON files.
    
    Expected structure:
    tests/golden_dataset/
    ├── paper_1.json
    ├── paper_2.json
    └── ...
    
    Each file contains:
    {
        "paper_id": "...",
        "title": "...",
        "claims": [
            {
                "text": "...",
                "epistemic_tag": "Fact|Interpretation|...",
                "confidence": 0.85,
                "source_sentences": ["..."],  # ground truth evidence
            }
        ]
    }
    """
    golden = {}
    golden_path = Path(path)
    
    if not golden_path.exists():
        print(f"Warning: Golden dataset path {path} does not exist")
        return golden
    
    for file in golden_path.glob("*.json"):
        with open(file) as f:
            data = json.load(f)
            golden[data["paper_id"]] = data
    
    return golden


def evaluate_extraction(golden_claims: list, extracted_claims: list,
                        similarity_threshold: float = 0.8) -> EvalMetrics:
    """
    Compare extracted claims against golden standard.
    
    Uses text overlap as similarity metric (can be upgraded to embedding similarity).
    """
    metrics = EvalMetrics(paper_id="")
    
    if not golden_claims:
        return metrics
    
    # Simple text overlap matching
    matched_golden = set()
    matched_extracted = set()
    correct_epistemic = 0
    hallucinated = 0
    
    for i, ext in enumerate(extracted_claims):
        ext_text = ext.get("text", "").lower()
        best_match = -1
        best_score = 0
        
        for j, gold in enumerate(golden_claims):
            gold_text = gold.get("text", "").lower()
            
            # Jaccard similarity on word sets
            ext_words = set(ext_text.split())
            gold_words = set(gold_text.split())
            
            if not ext_words or not gold_words:
                continue
            
            intersection = ext_words & gold_words
            union = ext_words | gold_words
            score = len(intersection) / len(union) if union else 0
            
            if score > best_score:
                best_score = score
                best_match = j
        
        if best_score >= similarity_threshold and best_match >= 0:
            matched_golden.add(best_match)
            matched_extracted.add(i)
            
            # Check epistemic tag
            if ext.get("epistemic_tag") == golden_claims[best_match].get("epistemic_tag"):
                correct_epistemic += 1
        elif best_score < 0.3:  # Very low match → likely hallucination
            hallucinated += 1
    
    # Calculate metrics
    n_golden = len(golden_claims)
    n_extracted = len(extracted_claims)
    n_matched = len(matched_golden)
    
    metrics.extraction_recall = n_matched / n_golden if n_golden > 0 else 0
    metrics.extraction_precision = len(matched_extracted) / n_extracted if n_extracted > 0 else 0
    
    if metrics.extraction_recall + metrics.extraction_precision > 0:
        metrics.f1_score = (2 * metrics.extraction_recall * metrics.extraction_precision / 
                           (metrics.extraction_recall + metrics.extraction_precision))
    
    metrics.epistemic_accuracy = correct_epistemic / n_matched if n_matched > 0 else 0
    metrics.hallucination_rate = hallucinated / n_extracted if n_extracted > 0 else 0
    
    # Confidence calibration (Pearson correlation)
    if n_matched >= 3:
        assigned = []
        human = []
        for i in matched_extracted:
            ext = extracted_claims[i]
            # Find matched golden
            ext_text = ext.get("text", "").lower()
            for j in matched_golden:
                gold = golden_claims[j]
                gold_text = gold.get("text", "").lower()
                ext_words = set(ext_text.split())
                gold_words = set(gold_text.split())
                union = ext_words | gold_words
                score = len(ext_words & gold_words) / len(union) if union else 0
                if score >= similarity_threshold:
                    assigned.append(float(ext.get("confidence", 0.5)))
                    human.append(float(gold.get("confidence", 0.5)))
                    break
        
        if len(assigned) >= 3:
            # Simple Pearson correlation
            n = len(assigned)
            mean_a = sum(assigned) / n
            mean_h = sum(human) / n
            
            cov = sum((a - mean_a) * (h - mean_h) for a, h in zip(assigned, human)) / n
            std_a = (sum((a - mean_a)**2 for a in assigned) / n) ** 0.5
            std_h = (sum((h - mean_h)**2 for h in human) / n) ** 0.5
            
            if std_a > 0 and std_h > 0:
                metrics.confidence_correlation = cov / (std_a * std_h)
    
    return metrics


def run_regression_gate(golden_path: str = "tests/golden_dataset",
                        pipeline_results: dict = None) -> RegressionResult:
    """
    Regression gate: checks if current pipeline meets minimum thresholds.
    
    Must PASS before any config/prompt change is committed.
    
    Thresholds (Phase 2 spec):
      - Extraction recall: ≥ 70%
      - Hallucination rate: ≤ 10%
      - Epistemic accuracy: ≥ 60%
    """
    golden = load_golden_dataset(golden_path)
    
    if not golden:
        return RegressionResult(
            passed=False,
            metrics={},
            thresholds=REGRESSION_THRESHOLDS,
            failures=["No golden dataset found"]
        )
    
    all_metrics = {}
    failures = []
    
    for paper_id, gold_data in golden.items():
        # Get extracted claims for this paper (from pipeline_results or DB)
        extracted = pipeline_results.get(paper_id, []) if pipeline_results else []
        
        metrics = evaluate_extraction(gold_data["claims"], extracted)
        metrics.paper_id = paper_id
        all_metrics[paper_id] = metrics.to_dict()
        
        # Check thresholds
        if metrics.extraction_recall < REGRESSION_THRESHOLDS["extraction_recall"]:
            failures.append(f"{paper_id}: recall {metrics.extraction_recall:.2%} < {REGRESSION_THRESHOLDS['extraction_recall']:.0%}")
        if metrics.hallucination_rate > REGRESSION_THRESHOLDS["hallucination_rate_max"]:
            failures.append(f"{paper_id}: hallucination {metrics.hallucination_rate:.2%} > {REGRESSION_THRESHOLDS['hallucination_rate_max']:.0%}")
        if metrics.epistemic_accuracy < REGRESSION_THRESHOLDS["epistemic_accuracy"]:
            failures.append(f"{paper_id}: epistemic accuracy {metrics.epistemic_accuracy:.2%} < {REGRESSION_THRESHOLDS['epistemic_accuracy']:.0%}")
    
    # Aggregate metrics
    if all_metrics:
        avg_metrics = {}
        for key in ["extraction_recall", "extraction_precision", "f1_score",
                     "epistemic_accuracy", "hallucination_rate", "confidence_correlation"]:
            values = [m[key] for m in all_metrics.values()]
            avg_metrics[key] = sum(values) / len(values)
        all_metrics["_average"] = avg_metrics
    
    passed = len(failures) == 0
    
    return RegressionResult(
        passed=passed,
        metrics=all_metrics,
        thresholds=REGRESSION_THRESHOLDS,
        failures=failures
    )


def create_golden_paper(paper_id: str, title: str, claims: list,
                        output_path: str = "tests/golden_dataset"):
    """
    Helper to create a golden dataset paper entry.
    
    Args:
        paper_id: Unique identifier
        title: Paper title
        claims: List of dicts with text, epistemic_tag, confidence, source_sentences
        output_path: Where to save
    """
    os.makedirs(output_path, exist_ok=True)
    
    data = {
        "paper_id": paper_id,
        "title": title,
        "claims": claims,
        "created_at": __import__('datetime').datetime.now().isoformat(),
        "schema_version": "1.0"
    }
    
    filepath = os.path.join(output_path, f"{paper_id}.json")
    with open(filepath, "w") as f:
        json.dump(data, f, indent=2)
    
    print(f"Golden paper saved: {filepath} ({len(claims)} claims)")