paperhawk / validation / quote_validator.py
"""Quote validator — anti-hallucination layer #7.
The schema-level ``_quotes`` field requires verbatim document quotes from the
LLM. This validator checks whether each quote appears in the ``full_text``
(in normalized form: whitespace + diacritics + case-folded). If a quote is
not found in the source, the LLM hallucinated → low confidence + risk log.
The original prototype-agentic system did not have this; only the LLM prompt
asked for citations. The LangGraph implementation adds an explicit verifier node.
"""
from __future__ import annotations
import re
import unicodedata
def _normalize(text: str) -> str:
"""Whitespace + diacritics + case folding.
NFKD decomposition splits the diacritic (e.g. á → a + combining acute), then
we drop the combining marks → "a".
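
    Illustrative doctest (made-up input, not taken from the test suite):

        >>> _normalize("  Árvíz   Tűrő ")
        'arviz turo'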
"""
if not text:
return ""
# Strip diacritics
nfkd = unicodedata.normalize("NFKD", text)
no_accent = "".join(c for c in nfkd if not unicodedata.combining(c))
# Collapse whitespace (multiple → single space)
normalized = re.sub(r"\s+", " ", no_accent.lower()).strip()
return normalized
def quote_in_source(quote: str, source_text: str, *, min_chars: int = 15) -> bool:
"""Check whether the quote (normalized) appears in the source text.
Args:
quote: the LLM-provided quote
source_text: the IngestedDocument.full_text (full document text)
        min_chars: skip quotes shorter than this (default 15 chars), e.g.
            numeric values such as "1,200,000 USD" or "2026-02-28", where the
            LLM may apply a different format although the content is correct.

    The LLM sometimes rewrites number formatting (e.g. "1240160" vs
    "1 240 160 HUF"), which makes a verbatim text match fail. Hence the
    min_chars cutoff; the downstream node then downgrades the risk severity
    to "low" instead of "high".
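
    Illustrative doctest (made-up values):

        >>> quote_in_source("The delivery deadline is 2026-02-28",
        ...                 "Offer:   the delivery   DEADLINE is 2026-02-28.")
        True
        >>> quote_in_source("1,200,000 USD", "no match here")  # below min_chars
        True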
"""
q = _normalize(quote)
s = _normalize(source_text)
if not q or len(q) < min_chars:
return True # Too short → accept (avoid false positives)
return q in s
def validate_quotes(
extracted_raw: dict,
full_text: str,
) -> tuple[list[str], list[str]]:
"""Verify ``extracted_raw["_quotes"]`` against ``full_text``.
Returns:
        (valid_quotes, invalid_quotes): lists of quotes that exist in the
            source and quotes that do not. Invalid quotes are unreliable
            (suspected hallucination); downstream, confidence is set to
            "low" and a risk is logged.
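
    Illustrative doctest (made-up values):

        >>> validate_quotes(
        ...     {"_quotes": ["the deadline is 2026-02-28",
        ...                  "not in the document at all"]},
        ...     "Offer text:   The   deadline is 2026-02-28.",
        ... )
        (['the deadline is 2026-02-28'], ['not in the document at all'])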
"""
quotes = (
extracted_raw.get("_quotes")
or extracted_raw.get("quotes")
or []
)
if not isinstance(quotes, list):
return [], []
valid: list[str] = []
invalid: list[str] = []
for q in quotes:
if not isinstance(q, str):
continue
if quote_in_source(q, full_text):
valid.append(q)
else:
invalid.append(q)
return valid, invalid
def downgrade_confidence(extracted_raw: dict, invalid_quotes: list[str]) -> dict:
"""If invalid quotes exist, downgrade ``_confidence`` fields to "low".
Aggressive policy: if NO valid quote exists, mark every confidence as "low".
Otherwise keep the LLM's original confidence values (>= 50% valid).
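
    Illustrative doctest (made-up values):

        >>> downgrade_confidence(
        ...     {"_quotes": ["q1", "q2"], "_confidence": {"amount": "high"}},
        ...     ["q1", "q2"],
        ... )["_confidence"]
        {'amount': 'low'}
        >>> downgrade_confidence(
        ...     {"_quotes": ["q1", "q2"], "_confidence": {"amount": "high"}},
        ...     ["q2"],
        ... )["_confidence"]
        {'amount': 'high'}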
"""
if not invalid_quotes:
return extracted_raw
confidence = extracted_raw.get("_confidence") or {}
quote_count = len(extracted_raw.get("_quotes") or extracted_raw.get("quotes") or [])
valid_count = quote_count - len(invalid_quotes)
valid_ratio = valid_count / max(1, valid_count + len(invalid_quotes))
# Below 50% valid → all confidence → low
if valid_ratio < 0.5:
confidence = {k: "low" for k in confidence} if confidence else {"_overall": "low"}
extracted_raw["_confidence"] = confidence
return extracted_raw
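

# ---------------------------------------------------------------------------
# Minimal usage sketch. The sample extraction and document text below are
# made up for illustration; in the real pipeline they come from the LLM
# extraction node and from IngestedDocument.full_text.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample_extraction = {
        "_quotes": [
            "the delivery deadline is 2026-02-28",
            "the daily penalty is 5% of the contract value",
        ],
        "_confidence": {"deadline": "high", "penalty": "high"},
    }
    sample_text = "Contract ...   The delivery   deadline is 2026-02-28. ..."

    valid, invalid = validate_quotes(sample_extraction, sample_text)
    sample_extraction = downgrade_confidence(sample_extraction, invalid)

    print(f"valid quotes:   {len(valid)}")    # 1: the deadline quote is found
    print(f"invalid quotes: {len(invalid)}")  # 1: the penalty quote is not
    # Half the quotes are valid, so the original confidence values are kept.
    print(f"confidence:     {sample_extraction['_confidence']}")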