"""Quote validator — anti-hallucination layer #7.

The schema-level ``_quotes`` field requires verbatim document quotes from the
LLM. This validator checks whether each quote appears in the ``full_text``
(in normalized form: whitespace + diacritics + case-folded). If a quote is
not found in the source, the LLM hallucinated → low confidence + risk log.

The original prototype-agentic system did not have this; only the LLM prompt
asked for citations. The LangGraph implementation adds an explicit verifier node.
"""
from __future__ import annotations
import re
import unicodedata
def _normalize(text: str) -> str:
"""Whitespace + diacritics + case folding.
NFKD decomposition splits the diacritic (e.g. Γ‘ β a + combining acute), then
we drop the combining marks β "a".
"""
if not text:
return ""
# Strip diacritics
nfkd = unicodedata.normalize("NFKD", text)
no_accent = "".join(c for c in nfkd if not unicodedata.combining(c))
# Collapse whitespace (multiple β single space)
normalized = re.sub(r"\s+", " ", no_accent.lower()).strip()
return normalized
def quote_in_source(quote: str, source_text: str, *, min_chars: int = 15) -> bool:
    """Return True when the normalized quote occurs in the normalized source.

    Args:
        quote: the LLM-provided quote
        source_text: the IngestedDocument.full_text (full document text)
        min_chars: quotes shorter than this are accepted without checking
            (e.g. numeric values like "1,200,000 USD" or "2026-02-28" where
            the LLM may apply a different format although the content is
            correct)

    Sometimes the LLM modifies number formatting (e.g. "1240160" vs
    "1 240 160 HUF"), which fails a verbatim text match. Hence the min_chars
    cutoff, and the downstream node downgrades severity to "low" instead of
    "high".
    """
    needle = _normalize(quote)
    # Empty or too-short quotes are accepted outright to avoid false positives.
    if not needle or len(needle) < min_chars:
        return True
    return needle in _normalize(source_text)
def validate_quotes(
    extracted_raw: dict,
    full_text: str,
) -> tuple[list[str], list[str]]:
    """Verify ``extracted_raw["_quotes"]`` against ``full_text``.

    Returns:
        (valid_quotes, invalid_quotes): lists of quotes that exist in the source
        and quotes that do not. Invalid quotes are unreliable (suspected
        hallucination) → downstream confidence is set to "low" and a risk is
        logged.
    """
    raw_quotes = extracted_raw.get("_quotes") or extracted_raw.get("quotes") or []
    # Anything other than a list means the LLM mangled the field; treat as empty.
    if not isinstance(raw_quotes, list):
        return [], []
    found: list[str] = []
    missing: list[str] = []
    for quote in raw_quotes:
        # Non-string entries are silently skipped.
        if isinstance(quote, str):
            bucket = found if quote_in_source(quote, full_text) else missing
            bucket.append(quote)
    return found, missing
def downgrade_confidence(extracted_raw: dict, invalid_quotes: list[str]) -> dict:
    """If invalid quotes exist, downgrade ``_confidence`` fields to "low".

    Aggressive policy: if fewer than 50% of quotes are valid, mark every
    confidence as "low". Otherwise keep the LLM's original confidence values.

    Args:
        invalid_quotes: the quotes ``validate_quotes`` could not find in the
            source. Empty → ``extracted_raw`` is returned unchanged.

    Returns:
        ``extracted_raw``, mutated in place (``_confidence`` rewritten when
        any invalid quote exists).
    """
    if not invalid_quotes:
        return extracted_raw
    confidence = extracted_raw.get("_confidence") or {}
    quotes = extracted_raw.get("_quotes") or extracted_raw.get("quotes") or []
    # BUGFIX: count only string entries (and guard against a non-list field),
    # mirroring validate_quotes(), which skips non-string items. Counting them
    # here inflated valid_count and could keep "high" confidence even when
    # fewer than half of the real quotes were found in the source.
    if isinstance(quotes, list):
        quote_count = sum(1 for q in quotes if isinstance(q, str))
    else:
        quote_count = 0
    valid_count = quote_count - len(invalid_quotes)
    valid_ratio = valid_count / max(1, valid_count + len(invalid_quotes))
    # Below 50% valid → all confidence → low
    if valid_ratio < 0.5:
        confidence = {k: "low" for k in confidence} if confidence else {"_overall": "low"}
    extracted_raw["_confidence"] = confidence
    return extracted_raw
|