Spaces:

lablab-ai-amd-developer-hackathon
/

paperhawk

Running

File size: 4,487 Bytes

7ff7119

"""Universal schema → flat field mapping.

The 14 domain checks read flat field names that mirror the typed schemas
(``invoice_number``, ``issuer.name``, ``line_items[].vat_rate``, ...). If
extract returns a payload following ``universal.json`` (unknown doc_type),
we flatten it first.
"""

from __future__ import annotations

from typing import Any


# Lower-case substrings that mark a party's side when found in its ``role``.
# NOTE(review): the non-English entries look like accent-stripped Hungarian
# role names — confirm with the schema owner before extending the lists.
_ISSUER_ROLE_HINTS = ("issuer", "supplier", "vendor", "seller", "kiallit", "szallit", "elado")
_CUSTOMER_ROLE_HINTS = ("customer", "buyer", "lessee", "vevo", "vasarlo", "berlo")

# Top-level keys whose presence triggers flattening.
_UNIVERSAL_INDICATORS = frozenset(
    {
        "dates",
        "amounts",
        "contract_elements",
        "document_type",
        "document_number",
    }
)


def _as_dict(value: Any) -> dict:
    """Return *value* if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _split_parties(parties: Any) -> tuple[dict | None, dict | None]:
    """Pick ``(issuer, customer)`` from a parties list; either may be ``None``.

    Parties whose ``role`` contains a known hint are assigned by role (first
    match wins). Remaining slots are filled positionally, but only from
    parties whose role matched neither hint list, so one party is never
    returned as both issuer and customer and an explicitly labelled party is
    never placed on the opposite side.
    """
    if not isinstance(parties, (list, tuple)):
        return None, None

    issuer = None
    customer = None
    unclassified = []
    for party in parties:
        if not isinstance(party, dict):
            continue
        role = party.get("role")
        # A missing or non-string role is treated as "no role information".
        role = role.lower() if isinstance(role, str) else ""
        if any(hint in role for hint in _ISSUER_ROLE_HINTS):
            if issuer is None:
                issuer = party
        elif any(hint in role for hint in _CUSTOMER_ROLE_HINTS):
            if customer is None:
                customer = party
        else:
            unclassified.append(party)

    # Ambiguous roles: first unclassified party → issuer, next → customer.
    # Identity (``is``) is used so two equal-looking dicts stay distinct.
    if issuer is None:
        issuer = next(iter(unclassified), None)
    if customer is None:
        customer = next((p for p in unclassified if p is not issuer), None)
    return issuer, customer


def flatten_universal(data: dict, doc_type: str | None = None) -> dict:
    """Universal-schema dict → flat dict with typed field names.

    Args:
        data: A dict shaped like ``universal.json`` (``document_type``,
              ``parties``, ``dates``, ``amounts``, ``line_items``,
              ``contract_elements`` ...). Non-dict input is returned as-is.
        doc_type: Optional (``invoice``, ``contract``, ...). Used as the
                  ``document_type`` of the result when ``data`` does not
                  carry a truthy ``document_type`` itself.

    Returns:
        ``data`` unchanged (same object) when it is not a dict or contains
        none of the universal indicator keys. Otherwise a new flat dict in
        which ``parties`` is split into ``issuer`` / ``customer`` and the
        nested ``dates`` / ``amounts`` / ``contract_elements`` are lifted to
        top-level keys. Entries equal to ``None``, ``[]`` or ``{}`` are
        omitted from the result.
    """
    if not isinstance(data, dict):
        return data

    # The ``parties`` key alone is NOT a sufficient indicator because typed
    # schemas (Contract/Invoice) use it as a top-level list too. Flattening is
    # triggered by any of the keys in ``_UNIVERSAL_INDICATORS``.
    if not (_UNIVERSAL_INDICATORS & set(data.keys())):
        return data

    flat: dict[str, Any] = {}

    # ----- Document-level basics -----
    flat["invoice_number"] = data.get("document_number")  # universal doc number
    flat["document_number"] = data.get("document_number")
    flat["document_type"] = data.get("document_type") or doc_type

    # ----- Dates -----
    # ``_as_dict`` guards against a malformed (non-dict) nested value.
    dates = _as_dict(data.get("dates") or {})
    flat["issue_date"] = dates.get("issue")
    flat["fulfillment_date"] = dates.get("fulfillment")
    flat["payment_due_date"] = dates.get("payment_due")
    flat["effective_date"] = dates.get("effective")
    flat["expiry_date"] = dates.get("expiry")
    flat["signature_date"] = dates.get("signature")

    # ----- Amounts -----
    amounts = _as_dict(data.get("amounts") or {})
    flat["total_net"] = amounts.get("total_net")
    flat["total_vat"] = amounts.get("total_vat")
    flat["total_gross"] = amounts.get("total_gross")
    # "USD" is used only when the ``currency`` key is absent; an explicit
    # ``None`` is kept as ``None`` and therefore dropped from the output.
    flat["currency"] = amounts.get("currency", "USD")

    # ----- Parties -----
    # Heuristic: split into issuer / customer based on role.
    flat["issuer"], flat["customer"] = _split_parties(data.get("parties"))

    # ----- Line items -----
    flat["line_items"] = data.get("line_items") or []

    # ----- Contract elements -----
    contract = _as_dict(data.get("contract_elements") or {})
    flat["contract_type"] = contract.get("contract_type")
    flat["termination_terms"] = contract.get("termination_terms")
    flat["penalty"] = contract.get("penalty")
    flat["confidentiality_clause"] = contract.get("confidentiality_clause")
    flat["governing_law"] = contract.get("governing_law")
    flat["key_clauses"] = contract.get("key_clauses") or []

    # ----- Anti-halluc fields preserved -----
    flat["_quotes"] = data.get("_quotes") or []
    flat["_confidence"] = data.get("_confidence") or {}
    flat["_source"] = data.get("_source") or {}

    # Strip None / empty values for cleaner JSON output (the domain checks use
    # ``is_empty()`` themselves, but cleaner output benefits the chat tools).
    return {k: v for k, v in flat.items() if v not in (None, [], {})}