paperhawk/nodes/pipeline/duplicate_detector_node.py
"""duplicate_detector_node — package-level ISA 240 duplicate detection.
Operates over all documents at once (NOT a Send fan-out) — O(n²) cross-pairing
with up to ~25 docs is well within budget; the Send overhead would dominate.
"""
from __future__ import annotations

from domain_checks import check_duplicate_invoices
from graph.states.pipeline_state import PipelineState, ProcessedDocument

async def duplicate_detector_node(state: PipelineState) -> dict:
    documents: list[ProcessedDocument] = state.get("documents") or []
    # A duplicate needs at least two documents to compare.
    if len(documents) < 2:
        return {}

    # Project each document down to the plain-dict shape the domain check expects,
    # skipping anything that never made it through ingestion.
    docs_for_check = [
        {
            "file_name": d.ingested.file_name,
            "doc_type": d.classification.doc_type if d.classification else "other",
            "extracted": d.extracted.raw if d.extracted else {},
        }
        for d in documents
        if d.ingested is not None
    ]

    risks = check_duplicate_invoices(docs_for_check)
    # Return a partial state update only when something was flagged.
    return {"risks": risks} if risks else {}