paperhawk / validation / quote_validator.py
"""Quote validator — anti-hallucination layer #7.
The schema-level ``_quotes`` field requires verbatim document quotes from the
LLM. This validator checks whether each quote appears in the ``full_text``
(in normalized form: whitespace + diacritics + case-folded). If a quote is
not found in the source, the LLM hallucinated → low confidence + risk log.
The original prototype-agentic system did not have this; only the LLM prompt
asked for citations. The LangGraph implementation adds an explicit verifier node.
"""
from __future__ import annotations
import re
import unicodedata
def _normalize(text: str) -> str:
"""Whitespace + diacritics + case folding.
NFKD decomposition splits the diacritic (e.g. á → a + combining acute), then
we drop the combining marks → "a".
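
    Illustrative doctest (made-up input, not taken from the test suite):

        >>> _normalize("  Árvíz   Tűrő ")
        'arviz turo'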
"""
if not text:
return ""
# Strip diacritics
nfkd = unicodedata.normalize("NFKD", text)
no_accent = "".join(c for c in nfkd if not unicodedata.combining(c))
# Collapse whitespace (multiple → single space)
normalized = re.sub(r"\s+", " ", no_accent.lower()).strip()
return normalized
def quote_in_source(quote: str, source_text: str, *, min_chars: int = 15) -> bool:
"""Check whether the quote (normalized) appears in the source text.
Args:
quote: the LLM-provided quote
source_text: the IngestedDocument.full_text (full document text)
        min_chars: skip quotes shorter than this (default 15 chars), e.g.
            numeric values such as "1,200,000 USD" or "2026-02-28", where the
            LLM may apply a different format although the content is correct.

    The LLM sometimes rewrites number formatting (e.g. "1240160" vs
    "1 240 160 HUF"), which makes a verbatim text match fail. Hence the
    min_chars cutoff; the downstream node then downgrades the risk severity
    to "low" instead of "high".
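
    Illustrative doctest (made-up values):

        >>> quote_in_source("The delivery deadline is 2026-02-28",
        ...                 "Offer:   the delivery   DEADLINE is 2026-02-28.")
        True
        >>> quote_in_source("1,200,000 USD", "no match here")  # below min_chars
        True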
"""
q = _normalize(quote)
s = _normalize(source_text)
if not q or len(q) < min_chars:
return True # Too short → accept (avoid false positives)
return q in s
def validate_quotes(
extracted_raw: dict,
full_text: str,
) -> tuple[list[str], list[str]]:
"""Verify ``extracted_raw["_quotes"]`` against ``full_text``.
Returns:
        (valid_quotes, invalid_quotes): lists of quotes that exist in the
            source and quotes that do not. Invalid quotes are unreliable
            (suspected hallucination); downstream, confidence is set to
            "low" and a risk is logged.
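
    Illustrative doctest (made-up values):

        >>> validate_quotes(
        ...     {"_quotes": ["the deadline is 2026-02-28",
        ...                  "not in the document at all"]},
        ...     "Offer text:   The   deadline is 2026-02-28.",
        ... )
        (['the deadline is 2026-02-28'], ['not in the document at all'])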
"""
quotes = (
extracted_raw.get("_quotes")
or extracted_raw.get("quotes")
or []
)
if not isinstance(quotes, list):
return [], []
valid: list[str] = []
invalid: list[str] = []
for q in quotes:
if not isinstance(q, str):
continue
if quote_in_source(q, full_text):
valid.append(q)
else:
invalid.append(q)
return valid, invalid
def downgrade_confidence(extracted_raw: dict, invalid_quotes: list[str]) -> dict:
"""If invalid quotes exist, downgrade ``_confidence`` fields to "low".
Aggressive policy: if NO valid quote exists, mark every confidence as "low".
Otherwise keep the LLM's original confidence values (>= 50% valid).
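
    Illustrative doctest (made-up values):

        >>> downgrade_confidence(
        ...     {"_quotes": ["q1", "q2"], "_confidence": {"amount": "high"}},
        ...     ["q1", "q2"],
        ... )["_confidence"]
        {'amount': 'low'}
        >>> downgrade_confidence(
        ...     {"_quotes": ["q1", "q2"], "_confidence": {"amount": "high"}},
        ...     ["q2"],
        ... )["_confidence"]
        {'amount': 'high'}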
"""
if not invalid_quotes:
return extracted_raw
confidence = extracted_raw.get("_confidence") or {}
quote_count = len(extracted_raw.get("_quotes") or extracted_raw.get("quotes") or [])
valid_count = quote_count - len(invalid_quotes)
valid_ratio = valid_count / max(1, valid_count + len(invalid_quotes))
# Below 50% valid → all confidence → low
if valid_ratio < 0.5:
confidence = {k: "low" for k in confidence} if confidence else {"_overall": "low"}
extracted_raw["_confidence"] = confidence
return extracted_raw
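

# ---------------------------------------------------------------------------
# Minimal usage sketch. The sample extraction and document text below are
# made up for illustration; in the real pipeline they come from the LLM
# extraction node and from IngestedDocument.full_text.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample_extraction = {
        "_quotes": [
            "the delivery deadline is 2026-02-28",
            "the daily penalty is 5% of the contract value",
        ],
        "_confidence": {"deadline": "high", "penalty": "high"},
    }
    sample_text = "Contract ...   The delivery   deadline is 2026-02-28. ..."

    valid, invalid = validate_quotes(sample_extraction, sample_text)
    sample_extraction = downgrade_confidence(sample_extraction, invalid)

    print(f"valid quotes:   {len(valid)}")    # 1: the deadline quote is found
    print(f"invalid quotes: {len(invalid)}")  # 1: the penalty quote is not
    # Half the quotes are valid, so the original confidence values are kept.
    print(f"confidence:     {sample_extraction['_confidence']}")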