File size: 987 Bytes
7ff7119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
"""duplicate_detector_node — package-level ISA 240 duplicate detection.

Operates over all documents at once (NOT a Send fan-out) — O(n²) cross-pairing
with up to ~25 docs is well within budget; the Send overhead would dominate.
"""

from __future__ import annotations

from domain_checks import check_duplicate_invoices
from graph.states.pipeline_state import PipelineState, ProcessedDocument


async def duplicate_detector_node(state: PipelineState) -> dict:
    """Run cross-document duplicate-invoice detection and surface any risks.

    Reads the full document list from *state*, projects each ingested document
    into the plain-dict shape expected by ``check_duplicate_invoices``, and
    returns a ``{"risks": ...}`` partial state update when risks are found,
    otherwise an empty dict (no state change).
    """
    docs: list[ProcessedDocument] = state.get("documents") or []

    # A duplicate needs at least one pair — bail out early for 0 or 1 docs.
    if len(docs) < 2:
        return {}

    payload: list[dict] = []
    for doc in docs:
        # Skip documents that never made it through ingestion.
        if doc.ingested is None:
            continue
        if doc.classification:
            doc_type = doc.classification.doc_type
        else:
            doc_type = "other"
        extracted = doc.extracted.raw if doc.extracted else {}
        payload.append(
            {
                "file_name": doc.ingested.file_name,
                "doc_type": doc_type,
                "extracted": extracted,
            }
        )

    found = check_duplicate_invoices(payload)
    if found:
        return {"risks": found}
    return {}