| """Universal schema → flat field mapping. |
| |
| The 14 domain checks read flat field names that mirror the typed schemas |
| (``invoice_number``, ``issuer.name``, ``line_items[].vat_rate``, ...). If |
| extract returns a payload following ``universal.json`` (unknown doc_type), |
| we flatten it first. |
| """ |
|
|
| from __future__ import annotations |
|
|
| from typing import Any |
|
|
|
|
# Top-level keys whose presence marks a payload as following the universal
# schema; payloads with none of them are assumed to be flat already.
_UNIVERSAL_INDICATORS = frozenset(
    {
        "dates",
        "amounts",
        "contract_elements",
        "document_type",
        "document_number",
    }
)

# Lower-case, accent-free substrings looked up in a party's folded ``role``.
_ISSUER_ROLE_KEYS = (
    "issuer", "supplier", "vendor", "seller", "kiallit", "szallit", "elado",
)
_CUSTOMER_ROLE_KEYS = (
    "customer", "buyer", "lessee", "vevo", "vasarlo", "berlo",
)

# (flat field name, key inside the universal sub-dict) pairs.
_DATE_FIELDS = (
    ("issue_date", "issue"),
    ("fulfillment_date", "fulfillment"),
    ("payment_due_date", "payment_due"),
    ("effective_date", "effective"),
    ("expiry_date", "expiry"),
    ("signature_date", "signature"),
)
_AMOUNT_FIELDS = ("total_net", "total_vat", "total_gross")
_CONTRACT_FIELDS = (
    "contract_type",
    "termination_terms",
    "penalty",
    "confidentiality_clause",
    "governing_law",
)


def _as_mapping(value: Any) -> dict:
    """Return *value* when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _fold_role(value: Any) -> str:
    """Lower-case *value* and strip diacritics; non-strings fold to ``""``."""
    import unicodedata

    if not isinstance(value, str):
        return ""
    # NFKD splits accented letters into base letter + combining mark, so
    # "Kiállító" becomes "kiallito" and matches the accent-free keywords.
    decomposed = unicodedata.normalize("NFKD", value)
    return "".join(
        ch for ch in decomposed if not unicodedata.combining(ch)
    ).lower()


def _split_parties(parties: Any) -> tuple[dict | None, dict | None]:
    """Pick ``(issuer, customer)`` from a universal ``parties`` list."""
    if not isinstance(parties, (list, tuple)):
        return None, None

    candidates = [party for party in parties if isinstance(party, dict)]
    issuer = None
    customer = None
    for party in candidates:
        role = _fold_role(party.get("role"))
        if any(key in role for key in _ISSUER_ROLE_KEYS):
            if issuer is None:
                issuer = party
        elif any(key in role for key in _CUSTOMER_ROLE_KEYS):
            if customer is None:
                customer = party

    # Positional fallback: take the first party not already used for the
    # other side, so one party is never reported as both issuer and customer.
    if issuer is None:
        issuer = next((p for p in candidates if p is not customer), None)
    if customer is None:
        customer = next((p for p in candidates if p is not issuer), None)
    return issuer, customer


def flatten_universal(data: dict, doc_type: str | None = None) -> dict:
    """Universal-schema dict → flat dict with typed field names.

    Args:
        data: A dict shaped like ``universal.json`` (``document_type``,
            ``parties``, ``dates``, ``amounts``, ``line_items``,
            ``contract_elements`` ...). Sections with an unexpected type
            are treated as absent.
        doc_type: Optional (``invoice``, ``contract``, ...). Used only as
            the fallback for ``document_type`` when the payload does not
            carry one.

    Returns:
        Flat dict with field names matching the domain_checks expectations;
        entries whose value is ``None``, ``[]`` or ``{}`` are omitted.
        ``data`` itself is returned unchanged when it is not a dict or has
        none of the universal-schema indicator keys.
    """
    if not isinstance(data, dict):
        return data
    if _UNIVERSAL_INDICATORS.isdisjoint(data):
        return data

    flat: dict[str, Any] = {}

    # Identification: the document number is exposed under both names.
    number = data.get("document_number")
    flat["invoice_number"] = number
    flat["document_number"] = number
    flat["document_type"] = data.get("document_type") or doc_type

    # Dates.
    dates = _as_mapping(data.get("dates"))
    for flat_name, key in _DATE_FIELDS:
        flat[flat_name] = dates.get(key)

    # Amounts.
    amounts = _as_mapping(data.get("amounts"))
    for key in _AMOUNT_FIELDS:
        flat[key] = amounts.get(key)
    # "USD" applies only when the key is missing; an explicit null currency
    # stays None and is dropped by the final filter.
    flat["currency"] = amounts.get("currency", "USD")

    # Parties: matched by role keyword first, then by position.
    flat["issuer"], flat["customer"] = _split_parties(data.get("parties"))

    # Line items are passed through untouched.
    flat["line_items"] = data.get("line_items") or []

    # Contract elements.
    contract = _as_mapping(data.get("contract_elements"))
    for key in _CONTRACT_FIELDS:
        flat[key] = contract.get(key)
    flat["key_clauses"] = contract.get("key_clauses") or []

    # Extraction metadata.
    flat["_quotes"] = data.get("_quotes") or []
    flat["_confidence"] = data.get("_confidence") or {}
    flat["_source"] = data.get("_source") or {}

    # Drop empty placeholders; falsy-but-meaningful values such as 0, False
    # and "" are kept.
    return {
        name: value
        for name, value in flat.items()
        if value is not None and value != [] and value != {}
    }
|
|