| """ |
| Layer 2 + Layer 5: Epistemic Trigger Words Validator |
| ====================================================== |
| Deterministic, code-based epistemic classification using linguistic patterns. |
| Runs ALONGSIDE the AI Council as a cross-check. |
| |
| If the AI says "Fact" but trigger words say "Hypothesis" → flag for human review. |
| |
Adapted from: KGX3/iKuhn's language-game filters (arXiv:2002.03531)
| Addresses blindspots: PA-5, B-4 |
| Source: SYSTEM_INSPIRATIONS.md AD-3 |
| |
| No ML dependencies. Pure Python. Deterministic output. |
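
Example (illustrative; exact scores depend on the trigger lists below):

    result = validate_ai_tag(
        claim_text="These findings suggest a charge-transfer mechanism.",
        ai_tag="Fact",
        source_section="discussion",
    )
    # The hedge "these findings suggest" pulls toward Interpretation, so
    # result["recommendation"] comes back as "review".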
| """ |
|
|
| import re |
| from typing import Optional |
|
|
|
|
| |
| |
| |
| |
| |
|
|
| FACT_TRIGGERS = { |
| "strong": [ |
| "demonstrated", "measured", "observed", "detected", "confirmed", |
| "showed that", "resulted in", "was found to be", "achieved", |
| "we report", "we found", "was determined to be", "are reported", |
| "the data show", "the results show", "statistically significant", |
| "p < ", "p = ", "p-value", "with a yield of", "with an efficiency of", |
| ], |
| "moderate": [ |
| "correlated with", "associated with", "consistent with the finding", |
| "reproduces", "replicated", "validated", "verified", |
| "supported by the data", "the analysis revealed", |
| ], |
| "weak": [ |
| "found", "obtained", "recorded", "documented", "established", |
| ], |
| } |
|
|
| INTERPRETATION_TRIGGERS = { |
| "strong": [ |
| "suggests that", "indicates that", "implies", "may be attributed to", |
| "could be explained by", "appears to", "is likely due to", |
| "we interpret", "these findings suggest", "this result suggests", |
| "it is likely that", "is indicative of", "we attribute this to", |
| "this is consistent with", "supports the notion", |
| ], |
| "moderate": [ |
| "consistent with", "in line with", "supports the hypothesis", |
| "in agreement with", "pointing to", "reflecting", |
| "can be understood as", "we believe", "our interpretation", |
| ], |
| "weak": [ |
| "presumably", "apparently", "seems to", "tends to", |
| ], |
| } |
|
|
| HYPOTHESIS_TRIGGERS = { |
| "strong": [ |
| "may", "might", "could potentially", "we hypothesize", |
| "it is possible that", "remains to be determined", |
| "future work should", "further investigation is needed", |
| "we speculate", "one possibility is", "a potential explanation", |
| "it is conceivable", "it remains unclear", "requires further study", |
| "we cannot rule out", |
| ], |
| "moderate": [ |
| "we propose", "we envision", "it is plausible", |
| "a promising direction", "warrants further investigation", |
| "preliminary evidence suggests", "tentatively", |
| ], |
| "weak": [ |
| "possibly", "potentially", "presumably", "perhaps", |
| ], |
| } |
|
|
| CONFLICT_TRIGGERS = { |
| "strong": [ |
| "contradicts", "in contrast to", "unlike previous", |
| "contrary to", "inconsistent with", "at odds with", |
| "disputes", "challenges the", "refutes", |
| "however, our results show", "in disagreement with", |
| ], |
| "moderate": [ |
| "differs from", "diverges from", "does not support", |
| "failed to reproduce", "we were unable to replicate", |
| "the discrepancy", "while others have reported", |
| ], |
| "weak": [ |
| "however", "nevertheless", "on the other hand", "conversely", |
| ], |
| } |
|
|
| |
| |
| SECTION_PRIORS = { |
| "abstract": {"Fact": 0.00, "Interpretation": 0.20, "Hypothesis": 0.05, "Conflict_Hypothesis": 0.00}, |
| "introduction":{"Fact": 0.00, "Interpretation": 0.10, "Hypothesis": 0.05, "Conflict_Hypothesis": 0.00}, |
| "methods": {"Fact": 0.15, "Interpretation": 0.00, "Hypothesis": 0.00, "Conflict_Hypothesis": 0.00}, |
| "results": {"Fact": 0.25, "Interpretation": 0.00, "Hypothesis": 0.00, "Conflict_Hypothesis": 0.00}, |
| "discussion": {"Fact": 0.00, "Interpretation": 0.15, "Hypothesis": 0.10, "Conflict_Hypothesis": 0.00}, |
| "conclusion": {"Fact": 0.00, "Interpretation": 0.10, "Hypothesis": 0.05, "Conflict_Hypothesis": 0.00}, |
| "supplement": {"Fact": 0.20, "Interpretation": 0.00, "Hypothesis": 0.00, "Conflict_Hypothesis": 0.00}, |
| } |
|
|
| |
| STRENGTH_WEIGHTS = {"strong": 0.30, "moderate": 0.15, "weak": 0.08} |
|
|
|
|
| def compute_trigger_scores(claim_text: str, source_section: str = None) -> dict: |
| """ |
| Compute epistemic trigger scores for a claim. |
| |
| Returns: |
| { |
| "scores": {"Fact": 0.45, "Interpretation": 0.20, ...}, |
| "predicted_tag": "Fact", |
| "confidence": 0.45, |
| "matched_triggers": {"Fact": ["measured", "p < 0.01"], ...}, |
| "section_prior_applied": "results", |
| } |
| """ |
    text_lower = claim_text.lower()

    categories = {
        "Fact": FACT_TRIGGERS,
        "Interpretation": INTERPRETATION_TRIGGERS,
        "Hypothesis": HYPOTHESIS_TRIGGERS,
        "Conflict_Hypothesis": CONFLICT_TRIGGERS,
    }

    scores = {"Fact": 0.0, "Interpretation": 0.0, "Hypothesis": 0.0, "Conflict_Hypothesis": 0.0}
    matched = {"Fact": [], "Interpretation": [], "Hypothesis": [], "Conflict_Hypothesis": []}

    # Accumulate weighted scores for every trigger phrase found in the text.
    # Word-boundary matching keeps short triggers like "may" from firing
    # inside words such as "dismay" or "maybe".
    for category, triggers_dict in categories.items():
        for strength, triggers in triggers_dict.items():
            weight = STRENGTH_WEIGHTS[strength]
            for trigger in triggers:
                pattern = re.escape(trigger)
                if trigger[0].isalnum():
                    pattern = r"\b" + pattern
                if trigger[-1].isalnum():
                    pattern = pattern + r"\b"
                if re.search(pattern, text_lower):
                    scores[category] += weight
                    matched[category].append(trigger)

    # Nudge scores with a section prior, if the claim's section is known.
    section_key = (source_section or "").lower().strip()
    priors = SECTION_PRIORS.get(section_key, {})
    for cat, prior in priors.items():
        scores[cat] += prior

    # Clamp to [0, 1] so stacked triggers cannot exceed full confidence.
    for cat in scores:
        scores[cat] = min(1.0, scores[cat])

    # max() returns the first maximal key, so ties break toward "Fact".
    predicted_tag = max(scores, key=scores.get)
    confidence = scores[predicted_tag]
| |
| return { |
| "scores": {k: round(v, 3) for k, v in scores.items()}, |
| "predicted_tag": predicted_tag, |
| "confidence": round(confidence, 3), |
| "matched_triggers": {k: v for k, v in matched.items() if v}, |
| "section_prior_applied": section_key or None, |
| } |
|
|
|
|
| def validate_ai_tag(claim_text: str, ai_tag: str, |
| source_section: str = None, |
| disagreement_threshold: float = 0.20) -> dict: |
| """ |
| Cross-validate an AI-assigned epistemic tag against trigger analysis. |
| |
| This is the core function — run this AFTER the AI Council assigns a tag, |
| and flag disagreements for human review. |
| |
| Args: |
| claim_text: The claim text |
| ai_tag: Tag assigned by the AI Council (Fact/Interpretation/Hypothesis/Conflict_Hypothesis) |
| source_section: Paper section the claim came from |
        disagreement_threshold: Minimum score gap for a disagreement to be
            treated as strong and flagged for review
| |
| Returns: |
| { |
| "agreement": True/False, |
| "ai_tag": "Fact", |
| "trigger_tag": "Interpretation", |
| "trigger_scores": {...}, |
| "disagreement_severity": "none" | "mild" | "strong", |
| "recommendation": "accept" | "review" | "override", |
| "explanation": "human-readable explanation", |
| } |
| """ |
| trigger_result = compute_trigger_scores(claim_text, source_section) |
| trigger_tag = trigger_result["predicted_tag"] |
| trigger_scores = trigger_result["scores"] |
| |
| agrees = (ai_tag == trigger_tag) |
| |
| if agrees: |
| return { |
| "agreement": True, |
| "ai_tag": ai_tag, |
| "trigger_tag": trigger_tag, |
| "trigger_scores": trigger_scores, |
| "matched_triggers": trigger_result["matched_triggers"], |
| "disagreement_severity": "none", |
| "recommendation": "accept", |
| "explanation": f"AI and trigger analysis agree: {ai_tag}", |
| } |
| |
| |
| ai_score = trigger_scores.get(ai_tag, 0.0) |
| trigger_score = trigger_scores.get(trigger_tag, 0.0) |
| score_diff = trigger_score - ai_score |
| |
| if score_diff < disagreement_threshold: |
| severity = "mild" |
| recommendation = "accept" |
| explanation = ( |
| f"AI says '{ai_tag}' (trigger score: {ai_score:.2f}), " |
| f"triggers lean '{trigger_tag}' (score: {trigger_score:.2f}). " |
| f"Difference is small ({score_diff:.2f}). AI tag accepted." |
| ) |
    else:
        severity = "strong"
        recommendation = "review"

        # Tailor the explanation to the most consequential mismatch types.
        if ai_tag == "Fact" and trigger_tag in ("Interpretation", "Hypothesis"):
| explanation = ( |
| f"⚠️ AI tagged as Fact but text contains hedging language: " |
| f"{trigger_result['matched_triggers'].get(trigger_tag, [])}. " |
| f"Consider downgrading to {trigger_tag}." |
| ) |
| elif ai_tag == "Interpretation" and trigger_tag == "Fact": |
| explanation = ( |
| f"AI tagged as Interpretation but text contains strong evidence language: " |
| f"{trigger_result['matched_triggers'].get('Fact', [])}. " |
| f"May warrant upgrading to Fact if in Results section." |
| ) |
| elif ai_tag == "Fact" and trigger_tag == "Conflict_Hypothesis": |
| explanation = ( |
| f"⚠️ AI tagged as Fact but text contains contradiction language: " |
| f"{trigger_result['matched_triggers'].get('Conflict_Hypothesis', [])}. " |
| f"This may be a conflict claim." |
| ) |
| else: |
| explanation = ( |
| f"AI says '{ai_tag}' (score: {ai_score:.2f}), " |
| f"triggers say '{trigger_tag}' (score: {trigger_score:.2f}). " |
| f"Matched triggers: {trigger_result['matched_triggers']}. " |
| f"Human review recommended." |
| ) |
| |
| return { |
| "agreement": False, |
| "ai_tag": ai_tag, |
| "trigger_tag": trigger_tag, |
| "trigger_scores": trigger_scores, |
| "matched_triggers": trigger_result["matched_triggers"], |
| "disagreement_severity": severity, |
| "recommendation": recommendation, |
| "explanation": explanation, |
| } |
|
|
|
|
| def batch_validate(claims: list[dict]) -> dict: |
| """ |
    Validate a batch of claims. Each claim dict should provide:
        - "text": str
        - "epistemic_tag": str (AI-assigned; missing tags default to "Interpretation")
        - "source_section": str (optional)
| |
| Returns summary statistics and flagged claims. |
| """ |
| results = { |
| "total": len(claims), |
| "agreements": 0, |
| "mild_disagreements": 0, |
| "strong_disagreements": 0, |
| "flagged_for_review": [], |
| } |
| |
    for i, claim in enumerate(claims):
        validation = validate_ai_tag(
            claim_text=claim.get("text", ""),
            # Missing tags default to Interpretation.
            ai_tag=claim.get("epistemic_tag", "Interpretation"),
            source_section=claim.get("source_section"),
        )
| |
| if validation["agreement"]: |
| results["agreements"] += 1 |
| elif validation["disagreement_severity"] == "mild": |
| results["mild_disagreements"] += 1 |
| else: |
| results["strong_disagreements"] += 1 |
| results["flagged_for_review"].append({ |
| "index": i, |
| "claim_text": claim.get("text", "")[:200], |
| "validation": validation, |
| }) |
| |
| results["agreement_rate"] = round( |
| results["agreements"] / max(results["total"], 1), 3 |
| ) |
| |
| return results |
|
|
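if __name__ == "__main__":
    # Minimal smoke test. The claims below are made up for illustration and
    # exercise both the agreement path and the flagged-for-review path.
    demo_claims = [
        {"text": "The bandgap was measured to be 1.1 eV.",
         "epistemic_tag": "Fact", "source_section": "results"},
        {"text": "This may be attributed to strain relaxation.",
         "epistemic_tag": "Fact", "source_section": "discussion"},
    ]
    summary = batch_validate(demo_claims)
    print(f"Agreement rate: {summary['agreement_rate']}")
    for flagged in summary["flagged_for_review"]:
        print(flagged["validation"]["explanation"])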