paperhawk / schemas /flatten_universal.py
Nándorfi Vince
Initial paperhawk push to HF Space (LFS for binaries)
7ff7119
raw
history blame
4.49 kB
"""Universal schema → flat field mapping.
The 14 domain checks read flat field names that mirror the typed schemas
(``invoice_number``, ``issuer.name``, ``line_items[].vat_rate``, ...). If
extract returns a payload following ``universal.json`` (unknown doc_type),
we flatten it first.
"""
from __future__ import annotations

import unicodedata
from typing import Any
# Keys whose presence marks a payload as universal-shaped. ``parties`` is
# deliberately absent: typed schemas (Contract/Invoice) use it as a top-level
# list too, so it cannot distinguish the two shapes.
_UNIVERSAL_INDICATORS = frozenset(
    {
        "dates",
        "amounts",
        "contract_elements",
        "document_type",
        "document_number",
    }
)

# Role keywords (English + unaccented Hungarian stems), matched as substrings
# of the accent-folded, lower-cased role string.
_ISSUER_ROLE_HINTS = (
    "issuer", "supplier", "vendor", "seller", "kiallit", "szallit", "elado",
)
_CUSTOMER_ROLE_HINTS = (
    "customer", "buyer", "lessee", "vevo", "vasarlo", "berlo",
)


def _section(data: dict, key: str) -> dict:
    """Return ``data[key]`` when it is a dict, otherwise an empty dict."""
    value = data.get(key)
    return value if isinstance(value, dict) else {}


def _fold_role(role: object) -> str:
    """Lower-case *role* and strip diacritics so ``Eladó`` matches ``elado``."""
    decomposed = unicodedata.normalize("NFKD", str(role or "")).lower()
    return "".join(ch for ch in decomposed if not unicodedata.combining(ch))


def _split_parties(parties: object) -> tuple[dict | None, dict | None]:
    """Pick ``(issuer, customer)`` from a universal ``parties`` list.

    Parties are first matched by role keyword. Any slot still empty is then
    filled, in list order, from the parties that were *not* already assigned,
    so one party can never end up as both issuer and customer. Non-dict
    entries and a non-list ``parties`` value are ignored.
    """
    if not isinstance(parties, (list, tuple)):
        return None, None
    candidates = [p for p in parties if isinstance(p, dict)]

    issuer: dict | None = None
    customer: dict | None = None
    for party in candidates:
        role = _fold_role(party.get("role"))
        if any(hint in role for hint in _ISSUER_ROLE_HINTS):
            issuer = issuer or party
        elif any(hint in role for hint in _CUSTOMER_ROLE_HINTS):
            customer = customer or party

    # Positional fallback for ambiguous roles: first free party → issuer,
    # next free party → customer. Identity (``is``) is used so that two
    # distinct-but-equal party dicts are still treated as separate parties.
    unassigned = [
        p for p in candidates if p is not issuer and p is not customer
    ]
    if issuer is None and unassigned:
        issuer = unassigned.pop(0)
    if customer is None and unassigned:
        customer = unassigned.pop(0)
    return issuer, customer


def flatten_universal(data: dict, doc_type: str | None = None) -> dict:
    """Universal-schema dict → flat dict with typed field names.

    Args:
        data: A dict shaped like ``universal.json`` (``document_type``,
            ``parties``, ``dates``, ``amounts``, ``line_items``,
            ``contract_elements`` ...). Anything that is not a dict, or a
            dict carrying none of the universal indicator keys, is returned
            unchanged (same object).
        doc_type: Optional (``invoice``, ``contract``, ...). Used only as the
            fallback for ``document_type`` when the payload has none.

    Returns:
        Flat dict with field names matching the domain_checks expectations.
        Keys whose value is ``None``, ``[]`` or ``{}`` are omitted. Note that
        ``currency`` defaults to ``"USD"`` when ``amounts`` has no
        ``currency`` key, so it is present in every flattened result unless
        the payload explicitly sets it to an empty value.
    """
    if not isinstance(data, dict):
        return data
    if _UNIVERSAL_INDICATORS.isdisjoint(data):
        return data

    # Nested sections; a missing or malformed (non-dict) section is treated
    # as empty instead of raising on ``.get``.
    dates = _section(data, "dates")
    amounts = _section(data, "amounts")
    contract = _section(data, "contract_elements")
    issuer, customer = _split_parties(data.get("parties"))

    document_number = data.get("document_number")
    flat: dict[str, Any] = {
        # ----- Document-level basics -----
        # The universal document number doubles as the invoice number.
        "invoice_number": document_number,
        "document_number": document_number,
        "document_type": data.get("document_type") or doc_type,
        # ----- Dates -----
        "issue_date": dates.get("issue"),
        "fulfillment_date": dates.get("fulfillment"),
        "payment_due_date": dates.get("payment_due"),
        "effective_date": dates.get("effective"),
        "expiry_date": dates.get("expiry"),
        "signature_date": dates.get("signature"),
        # ----- Amounts -----
        "total_net": amounts.get("total_net"),
        "total_vat": amounts.get("total_vat"),
        "total_gross": amounts.get("total_gross"),
        "currency": amounts.get("currency", "USD"),
        # ----- Parties -----
        "issuer": issuer,
        "customer": customer,
        # ----- Line items -----
        "line_items": data.get("line_items") or [],
        # ----- Contract elements -----
        "contract_type": contract.get("contract_type"),
        "termination_terms": contract.get("termination_terms"),
        "penalty": contract.get("penalty"),
        "confidentiality_clause": contract.get("confidentiality_clause"),
        "governing_law": contract.get("governing_law"),
        "key_clauses": contract.get("key_clauses") or [],
        # ----- Anti-halluc fields preserved -----
        "_quotes": data.get("_quotes") or [],
        "_confidence": data.get("_confidence") or {},
        "_source": data.get("_source") or {},
    }

    # Strip None / empty values for cleaner JSON output (the domain checks use
    # ``is_empty()`` themselves, but cleaner output benefits the chat tools).
    return {k: v for k, v in flat.items() if v not in (None, [], {})}