| """quote_validator_node — anti-hallucination layer #7. |
| |
| Validates the LLM-provided ``_quotes`` field against the source ``full_text``. |
| If a quote does not appear in the source (after normalization: whitespace + |
| diacritics + case folding), the LLM hallucinated → confidence is downgraded |
| to "low" and a risk is logged. |
| |
| Original prototype-agentic system did not have this check; we add it here |
| as an explicit node. |
| """ |
|
|
| from __future__ import annotations |
|
|
| from graph.states.pipeline_state import ProcessedDocument, Risk |
| from validation.quote_validator import downgrade_confidence, validate_quotes |
|
|
|
|
| async def quote_validator_node(state: dict) -> dict: |
| """Walk the documents list and validate each doc's _quotes field. |
| |
| Returns ``{"documents": [pd_updated], "risks": [risk_for_invalid]}``. |
| The merge_doc_results and merge_risks reducers upsert into the parent state. |
| |
| NB: this node runs in the parent pipeline_graph, NOT inside extract_subgraph |
| (after the Send fan-in, so we see all docs' extracted data together). |
| """ |
| documents: list[ProcessedDocument] = state.get("documents") or [] |
| if not documents: |
| return {} |
|
|
| updated_docs: list[ProcessedDocument] = [] |
| new_risks: list[Risk] = [] |
|
|
| for pd in documents: |
| if pd.extracted is None or pd.ingested is None: |
| updated_docs.append(pd) |
| continue |
|
|
| full_text = pd.ingested.full_text or "" |
| valid, invalid = validate_quotes(pd.extracted.raw, full_text) |
|
|
| if invalid: |
| |
| new_raw = downgrade_confidence(dict(pd.extracted.raw), invalid) |
| new_extracted = pd.extracted.model_copy(update={ |
| "raw": new_raw, |
| "confidence": new_raw.get("_confidence", {}), |
| }) |
| updated_docs.append(pd.model_copy(update={"extracted": new_extracted})) |
|
|
| |
| |
| |
| valid, _ = validate_quotes(pd.extracted.raw, full_text) |
| total = len(invalid) + len(valid) |
| invalid_ratio = len(invalid) / max(1, total) |
| if invalid_ratio >= 0.5: |
| new_risks.append(Risk( |
| description=( |
| f"{pd.ingested.file_name}: {len(invalid)}/{total} quote(s) not found " |
| "in the source document (suspected LLM hallucination)." |
| ), |
| severity="low", |
| rationale=( |
| "The schema-level ``_quotes`` field contains text that does not appear " |
| "in the normalized full_text. Affected fields' confidence has been " |
| "downgraded to 'low'." |
| ), |
| kind="validation", |
| affected_document=pd.ingested.file_name, |
| source_check_id="quote_validator", |
| )) |
| else: |
| updated_docs.append(pd) |
|
|
| out: dict = {} |
| if updated_docs: |
| out["documents"] = updated_docs |
| if new_risks: |
| out["risks"] = new_risks |
| return out |
|
|