# paperhawk/schemas/pydantic_models.py
# Nándorfi Vince: "Initial paperhawk push to HF Space (LFS for binaries)" (7ff7119)
# File size: 12.3 kB
"""Pydantic v2 mirror models for the JSON schemas.
Purpose: runtime field validation in the extract_subgraph
(``InvoiceModel.model_validate(...)``) and strongly typed downstream nodes (the
risk_subgraph receives Pydantic-typed data).
JSON schema remains the source of truth for the LLM ``with_structured_output()``
calls — the Pydantic mirror is for VALIDATION ONLY, it does not replace the
JSON schema.
The ``_quotes``, ``_confidence`` and ``_source`` keys carry a leading underscore
in the JSON; the models expose them as ``quotes``, ``confidence`` and ``source``
with matching aliases (``alias="_quotes"`` etc.) so the JSON parses cleanly.
"""
from __future__ import annotations
from pydantic import BaseModel, ConfigDict, Field
# ---------------------------------------------------------------------------
# Common sub-models
# ---------------------------------------------------------------------------
class Party(BaseModel):
    """A party (issuer, customer, contracting party)."""

    # Every attribute is an optional string that falls back to None when the
    # input does not provide it.
    name: str | None = None
    tax_id: str | None = None
    address: str | None = None
    role: str | None = None
    contact: str | None = None
class SourceRef(BaseModel):
    # A file name plus a page number; both parts are optional.
    # NOTE(review): presumably points at the document/page the data was
    # extracted from -- confirm against the producer of ``_source``.
    file_name: str | None = None
    page_number: int | None = None
# ---------------------------------------------------------------------------
# Invoice
# ---------------------------------------------------------------------------
class InvoiceItem(BaseModel):
    # One invoice line. Every attribute is optional and defaults to None.

    # Identification
    item_code: str | None = None
    description: str | None = None

    # Quantity and pricing
    quantity: float | None = None
    unit: str | None = None
    unit_price_net: float | None = None
    vat_rate: float | None = None

    # Line totals
    total_net: float | None = None
    total_vat: float | None = None
    total_gross: float | None = None
class InvoiceModel(BaseModel):
    # Validation model for an invoice document.
    #
    # populate_by_name=True: the aliased fields below can be supplied either by
    # alias ("_quotes") or by attribute name ("quotes").
    # extra="ignore": keys that are not declared here are dropped silently.
    model_config = ConfigDict(populate_by_name=True, extra="ignore")

    # Header data. Dates are declared as plain strings; this model does not
    # parse them.
    invoice_number: str | None = None
    issue_date: str | None = None
    fulfillment_date: str | None = None
    payment_due_date: str | None = None
    payment_method: str | None = None
    currency: str = "USD"

    # Parties
    issuer: Party | None = None
    customer: Party | None = None

    # Lines and document-level totals
    line_items: list[InvoiceItem] = Field(default_factory=list)
    total_net: float | None = None
    total_vat: float | None = None
    total_gross: float | None = None

    # Underscore-prefixed metadata keys of the JSON payload.
    quotes: list[str] = Field(default_factory=list, alias="_quotes")
    confidence: dict = Field(default_factory=dict, alias="_confidence")
    source: SourceRef | None = Field(default=None, alias="_source")
# ---------------------------------------------------------------------------
# Contract
# ---------------------------------------------------------------------------
class ContractPenalty(BaseModel):
    # Penalty clause of a contract: an optional amount and an optional
    # free-text condition.
    amount: float | None = None
    condition: str | None = None
class AutoRenewal(BaseModel):
    # Auto-renewal clause of a contract. ``enabled`` defaults to False when
    # the input omits it; ``condition`` is optional free text.
    enabled: bool = False
    condition: str | None = None
class KeyClause(BaseModel):
    # A named contract clause. ``name`` and ``content`` have no default, so
    # validation fails when either one is missing.
    name: str
    content: str
    # Intended values: low | medium | high. The annotation is a plain ``str``,
    # so other values are not rejected here.
    risk_level: str = "low"
class ContractModel(BaseModel):
    # Validation model for a contract document.
    #
    # populate_by_name=True: the aliased fields at the bottom accept both the
    # alias ("_quotes") and the attribute name ("quotes").
    # extra="ignore": undeclared keys are dropped silently.
    model_config = ConfigDict(populate_by_name=True, extra="ignore")

    # --- Identification -----------------------------------------------------
    contract_type: str | None = Field(
        default=None,
        description="The type of contract, e.g. 'NDA', 'service', 'works contract', "
        "'lease', 'MSA', 'rental', 'IT framework agreement'. If the title "
        "of the contract ('NON-DISCLOSURE AGREEMENT', 'LEASE AGREEMENT', etc.) "
        "or the first paragraph contains it, fill it in.",
    )
    parties: list[Party] = Field(default_factory=list)

    # --- Validity period (dates are plain strings) ---------------------------
    effective_date: str | None = Field(
        default=None,
        description="Effective date of the contract. If 'Effective date', "
        "'Vertragsbeginn', 'Hatály kezdete' appears in the text, "
        "fill in ISO 8601 (YYYY-MM-DD) format.",
    )
    expiry_date: str | None = Field(
        default=None,
        description="Expiration date of the contract. If 'Expiry date', "
        "'Vertragsende', 'Lejárat' appears, fill it in.",
    )

    # --- Monetary values -----------------------------------------------------
    total_value: float | None = None
    currency: str = "USD"
    monthly_fee: float | None = None
    monthly_fee_currency: str = "USD"

    # --- Termination ---------------------------------------------------------
    termination_terms: str | None = Field(
        default=None,
        description="Textual summary of the termination conditions. MANDATORY to "
        "fill in if the contract anywhere mentions 'Termination', "
        "'Felmondás', 'Megszűnés', 'Kündigung' — whether 30/60/90 day "
        "notice or immediate termination for material breach. ONLY null "
        "if the contract has NO termination clause whatsoever.",
    )
    termination_period_days: int | None = Field(
        default=None,
        description="Number of days for the termination notice period (e.g. 30, 60, 90). Numeric.",
    )

    # --- Individual clauses --------------------------------------------------
    penalty: ContractPenalty | None = Field(
        default=None,
        description="Penalty / liquidated damages clause if mentioned. Fill in if "
        "'Penalty', 'Liquidated damages', 'Kötbér', 'Vertragsstrafe' or a "
        "concrete amount/condition is referenced.",
    )
    confidentiality_clause: bool | None = Field(
        default=None,
        description="True if the contract contains a 'Confidentiality', 'NDA', "
        "'Titoktartás' clause as a separate section or by reference.",
    )
    governing_law: str | None = Field(
        default=None,
        description="Applicable law. MANDATORY to fill in if 'Governing law', "
        "'Applicable law', 'Anwendbares Recht', 'Irányadó jog', "
        "'Hungarian law', 'BGB' is referenced. E.g.: 'Hungarian Civil Code', "
        "'Hungarian and German BGB'.",
    )
    auto_renewal: AutoRenewal | None = Field(
        default=None,
        description="Auto-renewal clause. Fill in if 'auto-renewal', 'evergreen "
        "clause', 'automatically renewed', 'automatische Verlängerung' is mentioned.",
    )
    change_of_control: bool | None = Field(
        default=None,
        description="True if the contract contains a 'change-of-control', "
        "'change of control', 'kontroll-változás', 'termination on "
        "ownership change' clause.",
    )
    non_compete: bool | None = Field(
        default=None,
        description="True if the contract contains a 'non-compete', "
        "'versenytilalom', 'Wettbewerbsverbot' clause.",
    )
    key_clauses: list[KeyClause] = Field(default_factory=list)

    # --- Underscore-prefixed metadata keys of the JSON payload ---------------
    quotes: list[str] = Field(default_factory=list, alias="_quotes")
    confidence: dict = Field(default_factory=dict, alias="_confidence")
    source: SourceRef | None = Field(default=None, alias="_source")
# ---------------------------------------------------------------------------
# Delivery Note
# ---------------------------------------------------------------------------
class DeliveryItem(BaseModel):
    # One delivery-note line: code, description, quantity and unit.
    # It declares no price or total attributes; every attribute is optional.
    item_code: str | None = None
    description: str | None = None
    quantity: float | None = None
    unit: str | None = None
class DeliveryNoteModel(BaseModel):
    # Validation model for a delivery note.
    #
    # populate_by_name=True: aliased fields accept alias or attribute name.
    # extra="ignore": undeclared keys are dropped silently.
    model_config = ConfigDict(populate_by_name=True, extra="ignore")

    # Header data (dates are plain strings)
    document_number: str | None = None
    issue_date: str | None = None
    delivery_date: str | None = None
    purchase_order_reference: str | None = None

    # Parties
    supplier: Party | None = None
    customer: Party | None = None

    # Content
    line_items: list[DeliveryItem] = Field(default_factory=list)
    notes: str | None = None

    # Underscore-prefixed metadata keys of the JSON payload.
    quotes: list[str] = Field(default_factory=list, alias="_quotes")
    confidence: dict = Field(default_factory=dict, alias="_confidence")
    source: SourceRef | None = Field(default=None, alias="_source")
# ---------------------------------------------------------------------------
# Purchase Order
# ---------------------------------------------------------------------------
class PurchaseOrderItem(BaseModel):
    # One purchase-order line. Only net price/total attributes are declared;
    # every attribute is optional.

    # Identification
    item_code: str | None = None
    description: str | None = None

    # Quantity and net pricing
    quantity: float | None = None
    unit: str | None = None
    unit_price_net: float | None = None
    total_net: float | None = None
class PurchaseOrderModel(BaseModel):
    # Validation model for a purchase order.
    #
    # populate_by_name=True: aliased fields accept alias or attribute name.
    # extra="ignore": undeclared keys are dropped silently.
    model_config = ConfigDict(populate_by_name=True, extra="ignore")

    # Header data (dates are plain strings)
    document_number: str | None = None
    date: str | None = None
    delivery_due_date: str | None = None
    payment_due_date: str | None = None

    # Parties
    supplier: Party | None = None
    customer: Party | None = None

    # Lines and document-level totals
    line_items: list[PurchaseOrderItem] = Field(default_factory=list)
    total_net: float | None = None
    total_vat: float | None = None
    total_gross: float | None = None

    # Underscore-prefixed metadata keys of the JSON payload.
    quotes: list[str] = Field(default_factory=list, alias="_quotes")
    confidence: dict = Field(default_factory=dict, alias="_confidence")
    source: SourceRef | None = Field(default=None, alias="_source")
# ---------------------------------------------------------------------------
# Financial Report
# ---------------------------------------------------------------------------
class FinancialLineItem(BaseModel):
    # One row of a financial report. ``description`` has no default, so
    # validation fails when it is missing; both values are optional.
    description: str
    value: float | None = None
    value_prior_period: float | None = None
class FinancialReportModel(BaseModel):
    # Validation model for a financial report.
    #
    # populate_by_name=True: aliased fields accept alias or attribute name.
    # extra="ignore": undeclared keys are dropped silently.
    model_config = ConfigDict(populate_by_name=True, extra="ignore")

    # Report identification (period bounds are plain strings)
    report_type: str | None = None
    period_start: str | None = None
    period_end: str | None = None
    company_name: str | None = None
    company_tax_id: str | None = None
    currency: str = "USD"
    # One of: 'IFRS' | 'US-GAAP' | 'HU-GAAP' | 'DE-HGB' | None.
    # The annotation is a plain ``str``, so the list is not enforced here.
    accounting_standard: str | None = None

    # Detailed rows and headline figures
    line_items: list[FinancialLineItem] = Field(default_factory=list)
    revenue: float | None = None
    operating_income: float | None = None
    pretax_income: float | None = None
    tax: float | None = None
    net_income: float | None = None

    # Underscore-prefixed metadata keys of the JSON payload.
    quotes: list[str] = Field(default_factory=list, alias="_quotes")
    confidence: dict = Field(default_factory=dict, alias="_confidence")
    source: SourceRef | None = Field(default=None, alias="_source")
# ---------------------------------------------------------------------------
# Universal — optional, because flatten_universal maps to the typed schemas
# ---------------------------------------------------------------------------
class UniversalDates(BaseModel):
    # Date bundle of the universal schema. Each named date is an optional
    # plain string; nothing is parsed here.
    issue: str | None = None
    fulfillment: str | None = None
    payment_due: str | None = None
    effective: str | None = None
    expiry: str | None = None
    signature: str | None = None
    # Further dates as untyped dicts; their key layout is not validated.
    other_dates: list[dict] = Field(default_factory=list)
class UniversalAmounts(BaseModel):
    # Amount bundle of the universal schema. Totals and VAT rate are optional;
    # ``currency`` falls back to "USD" when omitted.
    total_net: float | None = None
    total_vat: float | None = None
    total_gross: float | None = None
    currency: str = "USD"
    vat_rate: float | None = None
class UniversalContractElements(BaseModel):
    # Contract-related part of the universal schema; every attribute is
    # optional.
    contract_type: str | None = None
    termination_terms: str | None = None
    # Declared as an untyped dict here, not as a nested penalty model.
    penalty: dict | None = None
    confidentiality_clause: bool | None = None
    governing_law: str | None = None
    key_clauses: list[KeyClause] = Field(default_factory=list)
class UniversalModel(BaseModel):
    # Catch-all validation model of the universal schema.
    #
    # populate_by_name=True: aliased fields accept alias or attribute name.
    # extra="ignore": undeclared keys are dropped silently.
    model_config = ConfigDict(populate_by_name=True, extra="ignore")

    # Document identification
    document_type: str | None = None
    document_language: str = "en"
    document_number: str | None = None

    # Structured content; the line items reuse the invoice line model.
    parties: list[Party] = Field(default_factory=list)
    dates: UniversalDates | None = None
    amounts: UniversalAmounts | None = None
    line_items: list[InvoiceItem] = Field(default_factory=list)
    contract_elements: UniversalContractElements | None = None
    risk_elements: list[str] = Field(default_factory=list)

    # Underscore-prefixed metadata keys of the JSON payload.
    quotes: list[str] = Field(default_factory=list, alias="_quotes")
    confidence: dict = Field(default_factory=dict, alias="_confidence")
    source: SourceRef | None = Field(default=None, alias="_source")
# ---------------------------------------------------------------------------
# Schema selection
# ---------------------------------------------------------------------------
def pydantic_for(doc_type: str) -> type[BaseModel]:
    """Look up the validation model class that belongs to ``doc_type``.

    Args:
        doc_type: Document type key such as ``"invoice"`` or ``"contract"``.

    Returns:
        The model class registered for ``doc_type``. Keys that are not
        registered (as well as ``"other"``) resolve to ``UniversalModel``.
    """
    models_by_type: dict[str, type[BaseModel]] = {
        "invoice": InvoiceModel,
        "delivery_note": DeliveryNoteModel,
        "purchase_order": PurchaseOrderModel,
        "contract": ContractModel,
        "financial_report": FinancialReportModel,
        "other": UniversalModel,
    }
    try:
        return models_by_type[doc_type]
    except KeyError:
        # Unknown document types fall back to the universal model.
        return UniversalModel