paperhawk / validation /plausibility.py
Nándorfi Vince
Initial paperhawk push to HF Space (LFS for binaries)
7ff7119
raw
history blame
3.45 kB
"""Plausibility checks — flag unusual values as info-level warnings.
Does not drop anything; only marks. Language- and country-agnostic.
"""
from __future__ import annotations
from utils.dates import parse_date_safe
from utils.numbers import coerce_number, is_null_alias
# Known VAT rates across countries, as whole percentages.
# validate_plausibility reports a line-item rate outside this set as
# "non-standard" (low severity); nothing is rejected on that basis.
KNOWN_VAT_RATES = {
    0, 5, 7, 8, 10,
    12, 13, 15, 17, 18,
    19, 20, 21, 22, 23,
    24, 25, 27,
}
def validate_plausibility(extracted: dict) -> list[dict]:
    """Flag unusual values in an extracted document as warnings.

    ``extracted`` is only read, never modified. Three groups of checks run
    in this order, and the returned list preserves that order:

    1. VAT rate of every dict entry in ``line_items``: negative (medium),
       above 50 (medium), or not in ``KNOWN_VAT_RATES`` (low).
    2. Negative ``total_net`` / ``total_vat`` / ``total_gross`` / ``amount``
       (medium).
    3. Date fields: unparseable, year before 2000, or year after 2030
       (all low). ``expiry_date`` and ``effective_date`` are exempt from the
       upper bound.

    Args:
        extracted: Mapping of field names to raw extracted values.

    Returns:
        list of {"type": "plausibility", "severity": ..., "message": ...}
    """
    # Function-scope import so this block does not depend on a file-level change.
    import math

    warnings: list[dict] = []

    def flag(severity: str, message: str) -> None:
        # Every warning produced here shares the same three-key shape.
        warnings.append({
            "type": "plausibility",
            "severity": severity,
            "message": message,
        })

    # VAT rate per line item
    for item in extracted.get("line_items") or []:
        if not isinstance(item, dict):
            continue
        vat_rate = coerce_number(item.get("vat_rate"))
        if vat_rate is None:
            continue
        # `or "?"` also covers a description that is present but None/empty,
        # which would otherwise be rendered as 'None' in the message.
        name = item.get("description") or "?"
        if vat_rate < 0:
            flag("medium", f"Negative VAT rate ({vat_rate:g}%) on line '{name}'")
        elif vat_rate > 50:
            flag("medium", f"Unusually high VAT rate ({vat_rate:g}%) on line '{name}'")
        # NaN fails both comparisons above and int(NaN) raises ValueError, so
        # it is tested first and reported instead of crashing the validator.
        # NOTE(review): int() truncates toward zero, so a fractional rate such
        # as 19.9 is matched as 19 — confirm this tolerance is intended.
        elif math.isnan(vat_rate) or int(vat_rate) not in KNOWN_VAT_RATES:
            flag("low", f"Non-standard VAT rate ({vat_rate:g}%) on line '{name}'")

    # Negative totals
    for field in ("total_net", "total_vat", "total_gross", "amount"):
        amount = coerce_number(extracted.get(field))
        if amount is not None and amount < 0:
            flag("medium", f"Negative amount: {field} = {amount:.0f}")

    # Date plausibility (skip null aliases)
    for field in (
        "issue_date", "fulfillment_date", "payment_due_date",
        "order_date", "delivery_due_date", "delivery_date",
        "effective_date", "expiry_date",
    ):
        date_str = extracted.get(field)
        # Missing, empty and non-string values are not checked at all.
        if not date_str or not isinstance(date_str, str):
            continue
        if is_null_alias(date_str):
            continue
        # Per the original author's note, parse_date_safe supports
        # YYYY-MM-DD, YYYY.MM.DD, YYYY/MM/DD, DD.MM.YYYY (HU/DE/EN dates);
        # a None result is treated here as "could not be parsed".
        dt = parse_date_safe(date_str)
        if dt is None:
            flag("low", f"Unparseable date: {field} = '{date_str}'")
        elif dt.year < 2000:
            flag("low", f"Old date: {field} = {date_str} (before 2000)")
        elif dt.year > 2030 and field not in ("expiry_date", "effective_date"):
            # Contract expiry can naturally be in the distant future
            flag("low", f"Future date: {field} = {date_str} (after 2030)")

    return warnings