Add Epistemic Trigger Words validator (deterministic code-based epistemic classification alongside AI)

Browse files

Files changed (1) hide show

phd_research_os_v2/layer2/trigger_validator.py +311 -0

phd_research_os_v2/layer2/trigger_validator.py ADDED Viewed

	@@ -0,0 +1,311 @@

+"""
+Layer 2 + Layer 5: Epistemic Trigger Words Validator
+======================================================
+Deterministic, code-based epistemic classification using linguistic patterns.
+Runs ALONGSIDE the AI Council as a cross-check.
+If the AI says "Fact" but trigger words say "Hypothesis" → flag for human review.
+Adapted from: KGX3/iKuhn's language-game filters (arxiv:2002.03531)
+Addresses blindspots: PA-5, B-4
+Source: SYSTEM_INSPIRATIONS.md AD-3
+No ML dependencies. Pure Python. Deterministic output.
+"""
+import re
+from typing import Optional
+# ══════════════════════════════════════════════════════════════════════
+# TRIGGER WORD DICTIONARIES
+# ══════════════════════════════════════════════════════════════════════
+# Each table maps a strength tier ("strong" / "moderate" / "weak") to a list of
+# phrases. Phrases are written in lower case because compute_trigger_scores
+# lower-cases the claim text before looking for them.
+# Weights: "strong" triggers score 0.30, "moderate" score 0.15, "weak" score 0.08
+# (the numeric values live in STRENGTH_WEIGHTS).
+# Calibrated to KGX3's activation threshold θ=0.7
+# Phrases that report a measurement or observation; hits add to the "Fact" score.
+FACT_TRIGGERS = {
+    "strong": [
+        "demonstrated", "measured", "observed", "detected", "confirmed",
+        "showed that", "resulted in", "was found to be", "achieved",
+        "we report", "we found", "was determined to be", "are reported",
+        "the data show", "the results show", "statistically significant",
+        # The trailing space in "p < " / "p = " is part of the trigger.
+        "p < ", "p = ", "p-value", "with a yield of", "with an efficiency of",
+    ],
+    "moderate": [
+        "correlated with", "associated with", "consistent with the finding",
+        "reproduces", "replicated", "validated", "verified",
+        "supported by the data", "the analysis revealed",
+    ],
+    "weak": [
+        # NOTE: "found" is contained in the strong entries "we found" and
+        # "was found to be", so a claim hitting those also collects this weight.
+        "found", "obtained", "recorded", "documented", "established",
+    ],
+}
+# Phrases that signal the authors are explaining or reading meaning into a
+# result; hits add to the "Interpretation" score.
+INTERPRETATION_TRIGGERS = {
+    "strong": [
+        "suggests that", "indicates that", "implies", "may be attributed to",
+        "could be explained by", "appears to", "is likely due to",
+        "we interpret", "these findings suggest", "this result suggests",
+        "it is likely that", "is indicative of", "we attribute this to",
+        "this is consistent with", "supports the notion",
+    ],
+    "moderate": [
+        # NOTE: "consistent with" is contained in the strong entry
+        # "this is consistent with", so that phrase collects both weights.
+        "consistent with", "in line with", "supports the hypothesis",
+        "in agreement with", "pointing to", "reflecting",
+        "can be understood as", "we believe", "our interpretation",
+    ],
+    "weak": [
+        # NOTE: "presumably" is also listed in HYPOTHESIS_TRIGGERS["weak"], so it
+        # raises the Interpretation and Hypothesis scores by the same amount.
+        "presumably", "apparently", "seems to", "tends to",
+    ],
+}
+# Phrases that mark speculation or open questions; hits add to the
+# "Hypothesis" score.
+HYPOTHESIS_TRIGGERS = {
+    "strong": [
+        # NOTE(review): "may" is a very short strong trigger; check how the
+        # matcher in compute_trigger_scores treats it inside longer words.
+        "may", "might", "could potentially", "we hypothesize",
+        "it is possible that", "remains to be determined",
+        "future work should", "further investigation is needed",
+        "we speculate", "one possibility is", "a potential explanation",
+        "it is conceivable", "it remains unclear", "requires further study",
+        "we cannot rule out",
+    ],
+    "moderate": [
+        "we propose", "we envision", "it is plausible",
+        "a promising direction", "warrants further investigation",
+        "preliminary evidence suggests", "tentatively",
+    ],
+    "weak": [
+        # NOTE: "potentially" is contained in the strong entry "could potentially",
+        # and "presumably" is shared with INTERPRETATION_TRIGGERS["weak"].
+        "possibly", "potentially", "presumably", "perhaps",
+    ],
+}
+# Phrases that signal disagreement with other work; hits add to the
+# "Conflict_Hypothesis" score.
+CONFLICT_TRIGGERS = {
+    "strong": [
+        "contradicts", "in contrast to", "unlike previous",
+        "contrary to", "inconsistent with", "at odds with",
+        "disputes", "challenges the", "refutes",
+        "however, our results show", "in disagreement with",
+    ],
+    "moderate": [
+        "differs from", "diverges from", "does not support",
+        "failed to reproduce", "we were unable to replicate",
+        "the discrepancy", "while others have reported",
+    ],
+    "weak": [
+        # Generic contrast connectives. NOTE: "however" is contained in the
+        # strong entry "however, our results show", so that phrase collects both.
+        "however", "nevertheless", "on the other hand", "conversely",
+    ],
+}
+# ── Section-based priors ──────────────────────────────────────────────
+# Per-section bonus added to each category's score by compute_trigger_scores,
+# after the trigger weights are summed and before scores are capped at 1.0.
+# Keys are lower-case section names; the caller's source_section is lower-cased
+# and stripped before lookup, and a section with no entry here adds nothing.
+SECTION_PRIORS = {
+    "abstract":    {"Fact": 0.00, "Interpretation": 0.20, "Hypothesis": 0.05, "Conflict_Hypothesis": 0.00},
+    "introduction":{"Fact": 0.00, "Interpretation": 0.10, "Hypothesis": 0.05, "Conflict_Hypothesis": 0.00},
+    "methods":     {"Fact": 0.15, "Interpretation": 0.00, "Hypothesis": 0.00, "Conflict_Hypothesis": 0.00},
+    "results":     {"Fact": 0.25, "Interpretation": 0.00, "Hypothesis": 0.00, "Conflict_Hypothesis": 0.00},
+    "discussion":  {"Fact": 0.00, "Interpretation": 0.15, "Hypothesis": 0.10, "Conflict_Hypothesis": 0.00},
+    "conclusion":  {"Fact": 0.00, "Interpretation": 0.10, "Hypothesis": 0.05, "Conflict_Hypothesis": 0.00},
+    "supplement":  {"Fact": 0.20, "Interpretation": 0.00, "Hypothesis": 0.00, "Conflict_Hypothesis": 0.00},
+}
+# Strength weights: score added for each matched trigger of the given tier.
+# The tier names must match the keys used in the *_TRIGGERS tables above.
+STRENGTH_WEIGHTS = {"strong": 0.30, "moderate": 0.15, "weak": 0.08}
+def compute_trigger_scores(claim_text: str, source_section: str = None) -> dict:
+    """
+    Compute epistemic trigger scores for a claim.
+    Returns:
+        {
+            "scores": {"Fact": 0.45, "Interpretation": 0.20, ...},
+            "predicted_tag": "Fact",
+            "confidence": 0.45,
+            "matched_triggers": {"Fact": ["measured", "p < 0.01"], ...},
+            "section_prior_applied": "results",
+        }
+    """
+    text_lower = claim_text.lower()
+    categories = {
+        "Fact": FACT_TRIGGERS,
+        "Interpretation": INTERPRETATION_TRIGGERS,
+        "Hypothesis": HYPOTHESIS_TRIGGERS,
+        "Conflict_Hypothesis": CONFLICT_TRIGGERS,
+    }
+    scores = {"Fact": 0.0, "Interpretation": 0.0, "Hypothesis": 0.0, "Conflict_Hypothesis": 0.0}
+    matched = {"Fact": [], "Interpretation": [], "Hypothesis": [], "Conflict_Hypothesis": []}
+    # Score trigger matches
+    for category, triggers_dict in categories.items():
+        for strength, triggers in triggers_dict.items():
+            weight = STRENGTH_WEIGHTS[strength]
+            for trigger in triggers:
+                if trigger in text_lower:
+                    scores[category] += weight
+                    matched[category].append(trigger)
+    # Apply section priors
+    section_key = (source_section or "").lower().strip()
+    priors = SECTION_PRIORS.get(section_key, {})
+    for cat, prior in priors.items():
+        scores[cat] += prior
+    # Normalize (cap at 1.0)
+    for cat in scores:
+        scores[cat] = min(1.0, scores[cat])
+    # Determine predicted tag
+    predicted_tag = max(scores, key=scores.get)
+    confidence = scores[predicted_tag]
+    return {
+        "scores": {k: round(v, 3) for k, v in scores.items()},
+        "predicted_tag": predicted_tag,
+        "confidence": round(confidence, 3),
+        "matched_triggers": {k: v for k, v in matched.items() if v},
+        "section_prior_applied": section_key or None,
+    }
+def validate_ai_tag(claim_text: str, ai_tag: str,
+                     source_section: str = None,
+                     disagreement_threshold: float = 0.20) -> dict:
+    """
+    Cross-validate an AI-assigned epistemic tag against trigger analysis.
+    This is the core function — run this AFTER the AI Council assigns a tag,
+    and flag disagreements for human review.
+    Args:
+        claim_text: The claim text
+        ai_tag: Tag assigned by the AI Council (Fact/Interpretation/Hypothesis/Conflict_Hypothesis)
+        source_section: Paper section the claim came from
+        disagreement_threshold: Minimum score difference to flag disagreement
+    Returns:
+        {
+            "agreement": True/False,
+            "ai_tag": "Fact",
+            "trigger_tag": "Interpretation",
+            "trigger_scores": {...},
+            "disagreement_severity": "none" | "mild" | "strong",
+            "recommendation": "accept" | "review" | "override",
+            "explanation": "human-readable explanation",
+        }
+    """
+    trigger_result = compute_trigger_scores(claim_text, source_section)
+    trigger_tag = trigger_result["predicted_tag"]
+    trigger_scores = trigger_result["scores"]
+    agrees = (ai_tag == trigger_tag)
+    if agrees:
+        return {
+            "agreement": True,
+            "ai_tag": ai_tag,
+            "trigger_tag": trigger_tag,
+            "trigger_scores": trigger_scores,
+            "matched_triggers": trigger_result["matched_triggers"],
+            "disagreement_severity": "none",
+            "recommendation": "accept",
+            "explanation": f"AI and trigger analysis agree: {ai_tag}",
+        }
+    # Compute disagreement severity
+    ai_score = trigger_scores.get(ai_tag, 0.0)
+    trigger_score = trigger_scores.get(trigger_tag, 0.0)
+    score_diff = trigger_score - ai_score
+    if score_diff < disagreement_threshold:
+        severity = "mild"
+        recommendation = "accept"  # AI tag is close enough
+        explanation = (
+            f"AI says '{ai_tag}' (trigger score: {ai_score:.2f}), "
+            f"triggers lean '{trigger_tag}' (score: {trigger_score:.2f}). "
+            f"Difference is small ({score_diff:.2f}). AI tag accepted."
+        )
+    else:
+        severity = "strong"
+        recommendation = "review"
+        # Specific explanations for common disagreement patterns
+        if ai_tag == "Fact" and trigger_tag in ("Interpretation", "Hypothesis"):
+            explanation = (
+                f"⚠️ AI tagged as Fact but text contains hedging language: "
+                f"{trigger_result['matched_triggers'].get(trigger_tag, [])}. "
+                f"Consider downgrading to {trigger_tag}."
+            )
+        elif ai_tag == "Interpretation" and trigger_tag == "Fact":
+            explanation = (
+                f"AI tagged as Interpretation but text contains strong evidence language: "
+                f"{trigger_result['matched_triggers'].get('Fact', [])}. "
+                f"May warrant upgrading to Fact if in Results section."
+            )
+        elif ai_tag == "Fact" and trigger_tag == "Conflict_Hypothesis":
+            explanation = (
+                f"⚠️ AI tagged as Fact but text contains contradiction language: "
+                f"{trigger_result['matched_triggers'].get('Conflict_Hypothesis', [])}. "
+                f"This may be a conflict claim."
+            )
+        else:
+            explanation = (
+                f"AI says '{ai_tag}' (score: {ai_score:.2f}), "
+                f"triggers say '{trigger_tag}' (score: {trigger_score:.2f}). "
+                f"Matched triggers: {trigger_result['matched_triggers']}. "
+                f"Human review recommended."
+            )
+    return {
+        "agreement": False,
+        "ai_tag": ai_tag,
+        "trigger_tag": trigger_tag,
+        "trigger_scores": trigger_scores,
+        "matched_triggers": trigger_result["matched_triggers"],
+        "disagreement_severity": severity,
+        "recommendation": recommendation,
+        "explanation": explanation,
+    }
+def batch_validate(claims: list[dict]) -> dict:
+    """
+    Validate a batch of claims. Each claim dict must have:
+        - "text": str
+        - "epistemic_tag": str (AI-assigned tag)
+        - "source_section": str (optional)
+    Returns summary statistics and flagged claims.
+    """
+    results = {
+        "total": len(claims),
+        "agreements": 0,
+        "mild_disagreements": 0,
+        "strong_disagreements": 0,
+        "flagged_for_review": [],
+    }
+    for i, claim in enumerate(claims):
+        validation = validate_ai_tag(
+            claim_text=claim.get("text", ""),
+            ai_tag=claim.get("epistemic_tag", "Interpretation"),
+            source_section=claim.get("source_section"),
+        )
+        if validation["agreement"]:
+            results["agreements"] += 1
+        elif validation["disagreement_severity"] == "mild":
+            results["mild_disagreements"] += 1
+        else:
+            results["strong_disagreements"] += 1
+            results["flagged_for_review"].append({
+                "index": i,
+                "claim_text": claim.get("text", "")[:200],
+                "validation": validation,
+            })
+    results["agreement_rate"] = round(
+        results["agreements"] / max(results["total"], 1), 3
+    )
+    return results