"""
Layer 2 + Layer 5: Epistemic Trigger Words Validator
======================================================
Deterministic, code-based epistemic classification using linguistic patterns.
Runs ALONGSIDE the AI Council as a cross-check.
If the AI says "Fact" but trigger words say "Hypothesis" → flag for human review.
Adapted from: KGX3/iKuhn's language-game filters (arxiv:2002.03531)
Addresses blindspots: PA-5, B-4
Source: SYSTEM_INSPIRATIONS.md AD-3
No ML dependencies. Pure Python. Deterministic output.
"""
import re
from typing import Optional
# ──────────────────────────────────────────────────────────────────────
# TRIGGER WORD DICTIONARIES
# ──────────────────────────────────────────────────────────────────────
# Weights: "strong" triggers score 0.30, "moderate" score 0.15, "weak" score 0.08
# Calibrated to KGX3's activation threshold θ=0.7
FACT_TRIGGERS = {
    "strong": [
        "demonstrated", "measured", "observed", "detected", "confirmed",
        "showed that", "resulted in", "was found to be", "achieved",
        "we report", "we found", "was determined to be", "are reported",
        "the data show", "the results show", "statistically significant",
        "p < ", "p = ", "p-value", "with a yield of", "with an efficiency of",
    ],
    "moderate": [
        "correlated with", "associated with", "consistent with the finding",
        "reproduces", "replicated", "validated", "verified",
        "supported by the data", "the analysis revealed",
    ],
    "weak": [
        "found", "obtained", "recorded", "documented", "established",
    ],
}
INTERPRETATION_TRIGGERS = {
    "strong": [
        "suggests that", "indicates that", "implies", "may be attributed to",
        "could be explained by", "appears to", "is likely due to",
        "we interpret", "these findings suggest", "this result suggests",
        "it is likely that", "is indicative of", "we attribute this to",
        "this is consistent with", "supports the notion",
    ],
    "moderate": [
        "consistent with", "in line with", "supports the hypothesis",
        "in agreement with", "pointing to", "reflecting",
        "can be understood as", "we believe", "our interpretation",
    ],
    "weak": [
        "presumably", "apparently", "seems to", "tends to",
    ],
}
HYPOTHESIS_TRIGGERS = {
    "strong": [
        "may", "might", "could potentially", "we hypothesize",
        "it is possible that", "remains to be determined",
        "future work should", "further investigation is needed",
        "we speculate", "one possibility is", "a potential explanation",
        "it is conceivable", "it remains unclear", "requires further study",
        "we cannot rule out",
    ],
    "moderate": [
        "we propose", "we envision", "it is plausible",
        "a promising direction", "warrants further investigation",
        "preliminary evidence suggests", "tentatively",
    ],
    "weak": [
        "possibly", "potentially", "presumably", "perhaps",
    ],
}
CONFLICT_TRIGGERS = {
    "strong": [
        "contradicts", "in contrast to", "unlike previous",
        "contrary to", "inconsistent with", "at odds with",
        "disputes", "challenges the", "refutes",
        "however, our results show", "in disagreement with",
    ],
    "moderate": [
        "differs from", "diverges from", "does not support",
        "failed to reproduce", "we were unable to replicate",
        "the discrepancy", "while others have reported",
    ],
    "weak": [
        "however", "nevertheless", "on the other hand", "conversely",
    ],
}
# ── Section-based priors ──────────────────────────────────────────────
# These shift the baseline probability before trigger analysis
SECTION_PRIORS = {
    "abstract":     {"Fact": 0.00, "Interpretation": 0.20, "Hypothesis": 0.05, "Conflict_Hypothesis": 0.00},
    "introduction": {"Fact": 0.00, "Interpretation": 0.10, "Hypothesis": 0.05, "Conflict_Hypothesis": 0.00},
    "methods":      {"Fact": 0.15, "Interpretation": 0.00, "Hypothesis": 0.00, "Conflict_Hypothesis": 0.00},
    "results":      {"Fact": 0.25, "Interpretation": 0.00, "Hypothesis": 0.00, "Conflict_Hypothesis": 0.00},
    "discussion":   {"Fact": 0.00, "Interpretation": 0.15, "Hypothesis": 0.10, "Conflict_Hypothesis": 0.00},
    "conclusion":   {"Fact": 0.00, "Interpretation": 0.10, "Hypothesis": 0.05, "Conflict_Hypothesis": 0.00},
    "supplement":   {"Fact": 0.20, "Interpretation": 0.00, "Hypothesis": 0.00, "Conflict_Hypothesis": 0.00},
}
# Strength weights
STRENGTH_WEIGHTS = {"strong": 0.30, "moderate": 0.15, "weak": 0.08}
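# Worked example: a claim from a Results section containing one strong Fact
# trigger ("measured", +0.30) plus the "results" prior (+0.25) scores
# 0.30 + 0.25 = 0.55 for Fact, before the 1.0 cap.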
def compute_trigger_scores(claim_text: str, source_section: Optional[str] = None) -> dict:
    """
    Compute epistemic trigger scores for a claim.
    Returns:
        {
            "scores": {"Fact": 0.45, "Interpretation": 0.20, ...},
            "predicted_tag": "Fact",
            "confidence": 0.45,
            "matched_triggers": {"Fact": ["measured", "p < 0.01"], ...},
            "section_prior_applied": "results",
        }
    """
    text_lower = claim_text.lower()
    categories = {
        "Fact": FACT_TRIGGERS,
        "Interpretation": INTERPRETATION_TRIGGERS,
        "Hypothesis": HYPOTHESIS_TRIGGERS,
        "Conflict_Hypothesis": CONFLICT_TRIGGERS,
    }
    scores = {"Fact": 0.0, "Interpretation": 0.0, "Hypothesis": 0.0, "Conflict_Hypothesis": 0.0}
    matched = {"Fact": [], "Interpretation": [], "Hypothesis": [], "Conflict_Hypothesis": []}
    # Score trigger matches. Word boundaries are enforced wherever a trigger
    # starts or ends with a word character, so "may" does not match "maybe"
    # and "found" does not match "profound".
    for category, triggers_dict in categories.items():
        for strength, triggers in triggers_dict.items():
            weight = STRENGTH_WEIGHTS[strength]
            for trigger in triggers:
                pattern = re.escape(trigger)
                if trigger[0].isalnum():
                    pattern = r"\b" + pattern
                if trigger[-1].isalnum():
                    pattern = pattern + r"\b"
                if re.search(pattern, text_lower):
                    scores[category] += weight
                    matched[category].append(trigger)
    # Apply section priors
    section_key = (source_section or "").lower().strip()
    priors = SECTION_PRIORS.get(section_key, {})
    for cat, prior in priors.items():
        scores[cat] += prior
    # Cap each score at 1.0
    for cat in scores:
        scores[cat] = min(1.0, scores[cat])
    # Determine predicted tag
    predicted_tag = max(scores, key=scores.get)
    confidence = scores[predicted_tag]
    return {
        "scores": {k: round(v, 3) for k, v in scores.items()},
        "predicted_tag": predicted_tag,
        "confidence": round(confidence, 3),
        "matched_triggers": {k: v for k, v in matched.items() if v},
        "section_prior_applied": section_key or None,
    }
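# Illustrative call (values traced by hand from the weights above, not a
# recorded run):
#   compute_trigger_scores("We measured a significant increase.", "results")
#   -> {"predicted_tag": "Fact", "confidence": 0.55, ...}
#   ("measured" = 0.30, "results" prior = 0.25)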
def validate_ai_tag(claim_text: str, ai_tag: str,
                    source_section: Optional[str] = None,
                    disagreement_threshold: float = 0.20) -> dict:
"""
Cross-validate an AI-assigned epistemic tag against trigger analysis.
This is the core function β run this AFTER the AI Council assigns a tag,
and flag disagreements for human review.
Args:
claim_text: The claim text
ai_tag: Tag assigned by the AI Council (Fact/Interpretation/Hypothesis/Conflict_Hypothesis)
source_section: Paper section the claim came from
disagreement_threshold: Minimum score difference to flag disagreement
Returns:
{
"agreement": True/False,
"ai_tag": "Fact",
"trigger_tag": "Interpretation",
"trigger_scores": {...},
"disagreement_severity": "none" | "mild" | "strong",
"recommendation": "accept" | "review" | "override",
"explanation": "human-readable explanation",
}
"""
trigger_result = compute_trigger_scores(claim_text, source_section)
trigger_tag = trigger_result["predicted_tag"]
trigger_scores = trigger_result["scores"]
agrees = (ai_tag == trigger_tag)
if agrees:
return {
"agreement": True,
"ai_tag": ai_tag,
"trigger_tag": trigger_tag,
"trigger_scores": trigger_scores,
"matched_triggers": trigger_result["matched_triggers"],
"disagreement_severity": "none",
"recommendation": "accept",
"explanation": f"AI and trigger analysis agree: {ai_tag}",
}
# Compute disagreement severity
ai_score = trigger_scores.get(ai_tag, 0.0)
trigger_score = trigger_scores.get(trigger_tag, 0.0)
score_diff = trigger_score - ai_score
if score_diff < disagreement_threshold:
severity = "mild"
recommendation = "accept" # AI tag is close enough
explanation = (
f"AI says '{ai_tag}' (trigger score: {ai_score:.2f}), "
f"triggers lean '{trigger_tag}' (score: {trigger_score:.2f}). "
f"Difference is small ({score_diff:.2f}). AI tag accepted."
)
else:
severity = "strong"
recommendation = "review"
# Specific explanations for common disagreement patterns
if ai_tag == "Fact" and trigger_tag in ("Interpretation", "Hypothesis"):
explanation = (
f"β οΈ AI tagged as Fact but text contains hedging language: "
f"{trigger_result['matched_triggers'].get(trigger_tag, [])}. "
f"Consider downgrading to {trigger_tag}."
)
elif ai_tag == "Interpretation" and trigger_tag == "Fact":
explanation = (
f"AI tagged as Interpretation but text contains strong evidence language: "
f"{trigger_result['matched_triggers'].get('Fact', [])}. "
f"May warrant upgrading to Fact if in Results section."
)
elif ai_tag == "Fact" and trigger_tag == "Conflict_Hypothesis":
explanation = (
f"β οΈ AI tagged as Fact but text contains contradiction language: "
f"{trigger_result['matched_triggers'].get('Conflict_Hypothesis', [])}. "
f"This may be a conflict claim."
)
else:
explanation = (
f"AI says '{ai_tag}' (score: {ai_score:.2f}), "
f"triggers say '{trigger_tag}' (score: {trigger_score:.2f}). "
f"Matched triggers: {trigger_result['matched_triggers']}. "
f"Human review recommended."
)
return {
"agreement": False,
"ai_tag": ai_tag,
"trigger_tag": trigger_tag,
"trigger_scores": trigger_scores,
"matched_triggers": trigger_result["matched_triggers"],
"disagreement_severity": severity,
"recommendation": recommendation,
"explanation": explanation,
}
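# Illustrative disagreement (hypothetical inputs): tagging "This may reflect
# lattice strain." as Fact while the hedge "may" pushes the trigger analysis
# toward Hypothesis (0.30 vs 0.00) would return recommendation "review".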
def batch_validate(claims: list[dict]) -> dict:
    """
    Validate a batch of claims. Each claim dict must have:
        - "text": str
        - "epistemic_tag": str (AI-assigned tag)
        - "source_section": str (optional)
    Returns summary statistics and flagged claims.
    """
    results = {
        "total": len(claims),
        "agreements": 0,
        "mild_disagreements": 0,
        "strong_disagreements": 0,
        "flagged_for_review": [],
    }
    for i, claim in enumerate(claims):
        validation = validate_ai_tag(
            claim_text=claim.get("text", ""),
            ai_tag=claim.get("epistemic_tag", "Interpretation"),
            source_section=claim.get("source_section"),
        )
        if validation["agreement"]:
            results["agreements"] += 1
        elif validation["disagreement_severity"] == "mild":
            results["mild_disagreements"] += 1
        else:
            results["strong_disagreements"] += 1
            results["flagged_for_review"].append({
                "index": i,
                "claim_text": claim.get("text", "")[:200],
                "validation": validation,
            })
    results["agreement_rate"] = round(
        results["agreements"] / max(results["total"], 1), 3
    )
    return results
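

if __name__ == "__main__":
    # Minimal self-check sketch. The claim texts and AI tags below are
    # invented for illustration only; they are not from any real dataset.
    demo_claims = [
        {"text": "We measured a conversion efficiency of 42%.",
         "epistemic_tag": "Fact", "source_section": "results"},
        {"text": "This may be attributed to surface defects.",
         "epistemic_tag": "Fact", "source_section": "discussion"},
    ]
    summary = batch_validate(demo_claims)
    # Expect the first claim to agree (Fact) and the second to be flagged,
    # since "may be attributed to" / "may" are hedging triggers.
    print(f"Agreement rate: {summary['agreement_rate']}")
    for flagged in summary["flagged_for_review"]:
        print(flagged["validation"]["explanation"])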