| """Quote validator — anti-hallucination layer #7. |
| |
| The schema-level ``_quotes`` field requires verbatim document quotes from the |
| LLM. This validator checks whether each quote appears in the ``full_text`` |
| (in normalized form: whitespace + diacritics + case-folded). If a quote is |
| not found in the source, the LLM hallucinated → low confidence + risk log. |
| |
| The original prototype-agentic system did not have this; only the LLM prompt |
| asked for citations. The LangGraph implementation adds an explicit verifier node. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import re |
| import unicodedata |
|
|
|
|
| def _normalize(text: str) -> str: |
| """Whitespace + diacritics + case folding. |
| |
| NFKD decomposition splits the diacritic (e.g. á → a + combining acute), then |
| we drop the combining marks → "a". |
| """ |
| if not text: |
| return "" |
| |
| nfkd = unicodedata.normalize("NFKD", text) |
| no_accent = "".join(c for c in nfkd if not unicodedata.combining(c)) |
| |
| normalized = re.sub(r"\s+", " ", no_accent.lower()).strip() |
| return normalized |
|
|
|
|
def quote_in_source(quote: str, source_text: str, *, min_chars: int = 15) -> bool:
    """Check whether *quote* (normalized) appears verbatim in *source_text*.

    Args:
        quote: the LLM-provided quote.
        source_text: the IngestedDocument.full_text (full document text).
        min_chars: quotes shorter than this (after normalization) are
            accepted without checking — e.g. numeric values like
            "1,200,000 USD" or "2026-02-28" where the LLM may apply a
            different format although the content is correct.

    Returns:
        True if the quote is empty/too short to check, or if its normalized
        form is a substring of the normalized source; False otherwise
        (suspected hallucination).

    Note:
        The LLM sometimes reformats numbers (e.g. "1240160" vs
        "1 240 160 HUF"), which fails a verbatim match. Hence the min_chars
        cutoff, and the downstream node downgrades severity to "low"
        instead of "high".
    """
    q = _normalize(quote)
    # Short/empty quotes short-circuit to True, so check them BEFORE paying
    # for the O(len(document)) normalization of the full source text.
    if not q or len(q) < min_chars:
        return True
    return q in _normalize(source_text)
|
|
|
|
def validate_quotes(
    extracted_raw: dict,
    full_text: str,
) -> tuple[list[str], list[str]]:
    """Verify ``extracted_raw["_quotes"]`` against ``full_text``.

    Returns:
        (valid_quotes, invalid_quotes): quotes found in the source versus
        quotes that are not. Invalid quotes are unreliable (suspected
        hallucination) → downstream confidence is set to "low" and a risk
        is logged.
    """
    # The schema field is "_quotes"; accept a bare "quotes" key as fallback.
    raw_quotes = extracted_raw.get("_quotes") or extracted_raw.get("quotes") or []
    if not isinstance(raw_quotes, list):
        return [], []

    found: list[str] = []
    missing: list[str] = []

    for candidate in raw_quotes:
        # Non-string entries (malformed LLM output) are silently skipped.
        if not isinstance(candidate, str):
            continue
        bucket = found if quote_in_source(candidate, full_text) else missing
        bucket.append(candidate)

    return found, missing
|
|
|
|
def downgrade_confidence(extracted_raw: dict, invalid_quotes: list[str]) -> dict:
    """If invalid quotes exist, downgrade ``_confidence`` fields to "low".

    Policy: when fewer than half of the (string) quotes verified against the
    source, every confidence entry is set to "low"; if no ``_confidence``
    dict exists, an ``{"_overall": "low"}`` marker is written. With >= 50%
    valid quotes the LLM's original confidence values are kept.

    Args:
        extracted_raw: the raw extraction dict (mutated in place).
        invalid_quotes: quotes flagged by ``validate_quotes``.

    Returns:
        The same ``extracted_raw`` dict, possibly with downgraded confidence.
    """
    if not invalid_quotes:
        return extracted_raw

    quotes = extracted_raw.get("_quotes") or extracted_raw.get("quotes") or []
    # Count only string entries: validate_quotes() skips non-strings, so
    # counting them here would inflate valid_count and could mask
    # hallucinations behind malformed list items.
    if isinstance(quotes, list):
        string_quote_count = sum(1 for q in quotes if isinstance(q, str))
    else:
        string_quote_count = 0
    valid_count = string_quote_count - len(invalid_quotes)
    valid_ratio = valid_count / max(1, valid_count + len(invalid_quotes))

    if valid_ratio < 0.5:
        confidence = extracted_raw.get("_confidence") or {}
        confidence = {k: "low" for k in confidence} if confidence else {"_overall": "low"}
        extracted_raw["_confidence"] = confidence

    return extracted_raw
|
|