paperhawk / nodes /extract /_dummy_extractor.py
Nándorfi Vince
Initial paperhawk push to HF Space (LFS for binaries)
7ff7119
"""Dummy regex-based extractor — mock for the structured LLM extraction.
This module produces a flat dict with English field names matching the
``schemas/pydantic_models.py`` typed schemas. Multilingual regex patterns
support both English-generated and HU/DE legacy sample documents
(important for the multilingual demo flows).
In Phase 9 (test data regeneration), this module will be fully rewritten to
target the new English-generated sample PDFs. For now it provides a minimal,
structurally correct stub so that downstream nodes (domain checks,
anti-hallucination filters) receive English-keyed data and the dummy-mode
pipeline runs end-to-end.
The ``_quotes`` field is populated from the matched text spans (the
quote_validator anti-hallucination layer #7 verifies that those quotes actually
appear in the source full_text).
"""
from __future__ import annotations
import re
from typing import Any
# ---------------------------------------------------------------------------
# Shared regex patterns (multilingual)
# ---------------------------------------------------------------------------
# Hungarian tax ID, printed as XXXXXXXX-X-XX (8 digits, 1 digit, 2 digits).
# Whitespace around the hyphens is tolerated; the three capture groups hold
# the three digit runs in order.
_TAX_ID_HU = re.compile(r"\b(\d{8})\s*-\s*(\d)\s*-\s*(\d{2})\b")
# US EIN, printed as XX-XXXXXXX (2 digits, hyphen, 7 digits).
# NOTE(review): not referenced by any extractor in the part of the file
# reviewed here — confirm whether it is still needed.
_TAX_ID_US = re.compile(r"\b(\d{2})\s*-\s*(\d{7})\b")
# Year-first numeric date: YYYY-MM-DD, YYYY.MM.DD or YYYY/MM/DD. The two
# separators may differ, month and day may be 1 or 2 digits, and one trailing
# dot is accepted (as in "2024.01.15."). Groups: 1=year, 2=month, 3=day.
# Only the shape is checked here, not whether the date exists.
_DATE = re.compile(r"\b(\d{4})[.\-/](\d{1,2})[.\-/](\d{1,2})\.?\b")
# Monetary amount: optional leading $/€/£, then a run of digits, whitespace,
# dots and commas (group 1), then an optional currency code (group 2),
# matched case-insensitively. Examples: "1,234.56 USD", "1 234 567 Ft",
# "$1,234". Note that ``\s`` in group 1 also matches line breaks.
# NOTE(review): not referenced by any extractor in the part of the file
# reviewed here — confirm whether it is still needed.
_MONEY = re.compile(
r"(?:[\$€£]\s*)?([\d\s.,]+)\s*(USD|EUR|HUF|GBP|CHF|Ft|JPY|CZK|PLN|RON)?\b",
re.I,
)
def _normalize_date(year: str, month: str, day: str) -> str:
return f"{int(year):04d}-{int(month):02d}-{int(day):02d}"
def _parse_money(s: str) -> float | None:
"""Parse "1 234 567" or "1,234.56" → float."""
if not s:
return None
cleaned = s.strip().replace(" ", "")
has_dot = "." in cleaned
has_comma = "," in cleaned
if has_dot and has_comma:
last_dot = cleaned.rfind(".")
last_comma = cleaned.rfind(",")
if last_dot > last_comma:
cleaned = cleaned.replace(",", "")
else:
cleaned = cleaned.replace(".", "").replace(",", ".")
elif has_comma:
last_comma = cleaned.rfind(",")
if len(cleaned) - last_comma - 1 in {1, 2}:
cleaned = cleaned[:last_comma].replace(",", "") + "." + cleaned[last_comma + 1:]
else:
cleaned = cleaned.replace(",", "")
elif has_dot:
n_dots = cleaned.count(".")
if n_dots > 1:
last_dot = cleaned.rfind(".")
cleaned = cleaned[:last_dot].replace(".", "") + "." + cleaned[last_dot + 1:]
try:
return float(cleaned)
except ValueError:
return None
# ---------------------------------------------------------------------------
# Dispatcher
# ---------------------------------------------------------------------------
def extract_dummy(full_text: str, doc_type: str, file_name: str) -> dict[str, Any]:
    """Run the extractor registered for *doc_type* over *full_text*.

    Unknown document types fall back to the universal extractor. The result
    is a flat dict with English field names; a ``_source`` entry holding the
    file name is added unless the extractor already supplied one.
    """
    handlers = {
        "invoice": _extract_invoice,
        "delivery_note": _extract_delivery_note,
        "purchase_order": _extract_purchase_order,
        "contract": _extract_contract,
        "financial_report": _extract_financial_report,
    }
    handler = handlers.get(doc_type)
    if handler is None:
        handler = _extract_universal

    result = handler(full_text, file_name)
    if "_source" not in result:
        result["_source"] = {"file_name": file_name}
    return result
# ---------------------------------------------------------------------------
# Invoice
# ---------------------------------------------------------------------------
def _extract_invoice(text: str, file_name: str) -> dict[str, Any]:
    """Extract invoice header fields from raw document text.

    Labels are matched case-insensitively in English, Hungarian and German.
    Every field is optional; only the fields whose label was found are set.

    Args:
        text: Full text of the document.
        file_name: Name of the source file (not used by this extractor).

    Returns:
        A dict that always contains ``_quotes`` (matched text spans, each cut
        to 120 characters) and ``_confidence`` (field name -> ``"high"``), and
        may contain ``invoice_number``, ``issue_date``, ``fulfillment_date``,
        ``payment_due_date`` (ISO strings), ``issuer`` / ``customer``
        (``{"name": ...}``, the issuer optionally with ``tax_id``),
        ``total_net`` / ``total_vat`` / ``total_gross`` (floats) and
        ``currency``.
    """
    out: dict[str, Any] = {
        "_quotes": [],
        "_confidence": {},
    }

    # Invoice number — multilingual (EN/HU/DE)
    m = re.search(
        r"(?:invoice\s+(?:number|no\.?|#)|sz[aá]mla\s+sz[aá]m[a]?|Rechnungsnummer)\s*[:\#]?\s*(\S+)",
        text, re.I,
    )
    if m:
        out["invoice_number"] = m.group(1).rstrip(",.;")
        out["_quotes"].append(m.group(0)[:120])
        out["_confidence"]["invoice_number"] = "high"

    # Dates: issue, fulfillment, payment due
    for label, key in [
        (r"(?:issue\s+date|date\s+issued|ki[aá]ll[ií]t[aá]s\s*d[aá]tum[a]?|Rechnungsdatum)",
         "issue_date"),
        (r"(?:fulfillment\s+date|service\s+date|teljes[ií]t[eé]s\s*d[aá]tum[a]?|Leistungsdatum)",
         "fulfillment_date"),
        (r"(?:payment\s+due|due\s+date|fizet[eé]si\s*hat[aá]rid[oő]|F[aä]lligkeitsdatum)",
         "payment_due_date"),
    ]:
        m = re.search(rf"{label}\s*[:\#]?\s*({_DATE.pattern})", text, re.I)
        if m:
            try:
                # Group indices: 0=full, 1=date, 2=year, 3=month, 4=day
                date_str = _normalize_date(m.group(2), m.group(3), m.group(4))
                out[key] = date_str
                out["_quotes"].append(m.group(0)[:120])
                out["_confidence"][key] = "high"
            except (ValueError, IndexError):
                # Best effort: an unparsable date simply leaves the field unset.
                pass

    # Issuer + Customer parties (HU/EN/DE labels). The name runs lazily up to
    # the end of the line or the next tax-id / address label.
    issuer_match = re.search(
        r"(?:issuer|seller|supplier|ki[aá]ll[ií]t[oó]|sz[aá]ll[ií]t[oó]|Aussteller)\s*[:\#]?\s*([A-Z][\w\s\.,&-]+?)(?=\n|adósz|tax|address|c[íi]m)",
        text, re.I,
    )
    if issuer_match:
        out["issuer"] = {"name": issuer_match.group(1).strip()}

    customer_match = re.search(
        r"(?:customer|buyer|client|vev[oő]|v[aá]s[aá]rl[oó]|Kunde)\s*[:\#]?\s*([A-Z][\w\s\.,&-]+?)(?=\n|adósz|tax|address|c[íi]m)",
        text, re.I,
    )
    if customer_match:
        out["customer"] = {"name": customer_match.group(1).strip()}

    # Tax IDs: only the Hungarian format is consulted here, and the first hit
    # in the document is attributed to the issuer.
    tax_ids = _TAX_ID_HU.findall(text)
    if tax_ids and out.get("issuer"):
        first = tax_ids[0]
        out["issuer"]["tax_id"] = f"{first[0]}-{first[1]}-{first[2]}"

    # Totals — multilingual (EN/HU/DE)
    # The amount must contain a digit and may only use same-line whitespace as
    # a grouping separator, so a value such as "1 234" followed by a line
    # starting with "27%" is not swallowed into one unparsable blob.
    amount = r"([.,]?\d(?:[\d.,]|[^\S\r\n])*)"
    for label, key in [
        (r"(?:total\s+net|net\s+total|nett[oó]\s*v[eé]g[oö]ssz)", "total_net"),
        (r"(?:total\s+vat|vat\s+total|[aá]fa\s*v[eé]g[oö]ssz|MwSt[\.\s]+gesamt)", "total_vat"),
        (r"(?:total\s+gross|gross\s+total|brutt[oó]\s*v[eé]g[oö]ssz|Bruttogesamtbetrag|grand\s+total)",
         "total_gross"),
    ]:
        # The amount may carry a leading $/€/£ symbol — skipped as optional prefix.
        m = re.search(rf"{label}\s*[:\#]?\s*[\$€£]?\s*{amount}", text, re.I)
        if m:
            val = _parse_money(m.group(1))
            if val is not None:
                out[key] = val
                out["_quotes"].append(m.group(0)[:120])
                out["_confidence"][key] = "high"

    # Currency detection, first hit wins. Currency symbols are not word
    # characters, so they must not be wrapped in ``\b``: "\b\$\b" can never
    # match "$1,234" or " $ 5". Only the alphabetic codes get word boundaries.
    if re.search(r"\bUSD\b|\$", text):
        out["currency"] = "USD"
    elif re.search(r"\bEUR\b|€", text):
        out["currency"] = "EUR"
    elif re.search(r"\b(?:HUF|Ft)\b", text):
        out["currency"] = "HUF"
    elif re.search(r"\bGBP\b|£", text):
        out["currency"] = "GBP"

    return out
# ---------------------------------------------------------------------------
# Delivery Note
# ---------------------------------------------------------------------------
def _extract_delivery_note(text: str, file_name: str) -> dict[str, Any]:
out: dict[str, Any] = {
"_quotes": [],
"_confidence": {},
}
m = re.search(
r"(?:delivery\s+note(?:\s+number)?|szallitolev[eé]l\s*sz[aá]m|Lieferschein)\s*[:\#]?\s*(\S+)",
text, re.I,
)
if m:
out["document_number"] = m.group(1).rstrip(",.;")
out["_quotes"].append(m.group(0)[:120])
return out
# ---------------------------------------------------------------------------
# Purchase Order
# ---------------------------------------------------------------------------
def _extract_purchase_order(text: str, file_name: str) -> dict[str, Any]:
out: dict[str, Any] = {
"_quotes": [],
"_confidence": {},
}
m = re.search(
r"(?:purchase\s+order(?:\s+number)?|po\s*[:\#]|megrendel[eé]s\s*sz[aá]m|Bestellnummer)\s*[:\#]?\s*(\S+)",
text, re.I,
)
if m:
out["document_number"] = m.group(1).rstrip(",.;")
out["_quotes"].append(m.group(0)[:120])
return out
# ---------------------------------------------------------------------------
# Contract
# ---------------------------------------------------------------------------
def _extract_contract(text: str, file_name: str) -> dict[str, Any]:
    """Extract contract metadata and clause flags from raw document text.

    Keywords and labels are matched in English, Hungarian and German.

    Args:
        text: Full text of the document.
        file_name: Name of the source file (not used by this extractor).

    Returns:
        A dict that always contains ``_quotes``, ``_confidence`` and
        ``parties`` (left empty here), and may contain ``contract_type``
        (``"NDA"``, ``"lease"``, ``"service"`` or ``"MSA"``),
        ``effective_date`` / ``expiry_date`` (ISO strings), ``governing_law``,
        ``termination_terms``, ``auto_renewal`` (``{"enabled": True}``) and the
        boolean flags ``change_of_control``, ``non_compete`` and
        ``confidentiality_clause``.
    """
    out: dict[str, Any] = {
        "_quotes": [],
        "_confidence": {},
        "parties": [],
    }

    # Contract type — keyword detection, first hit wins.
    # Short keywords need word boundaries: a plain substring test finds "nda"
    # inside "standard"/"calendar" and "lease" inside "release"/"please".
    text_lower = text.lower()
    if (
        "non-disclosure" in text_lower
        or re.search(r"\bnda\b", text_lower)
        or "titoktart" in text_lower
    ):
        out["contract_type"] = "NDA"
    elif re.search(r"\b(?:sub)?lease|\brental|l[ií]zing", text_lower):
        out["contract_type"] = "lease"
    elif "service" in text_lower or re.search(r"szolg[aá]ltat[aá]s", text_lower):
        out["contract_type"] = "service"
    elif "framework" in text_lower or "MSA" in text:
        out["contract_type"] = "MSA"

    # Effective + expiry dates
    for label, key in [
        (r"(?:effective\s+date|hat[aá]ly\s+kezdet|Vertragsbeginn)", "effective_date"),
        (r"(?:expiry\s+date|expiration|hat[aá]ly\s+v[eé]g|Vertragsende)", "expiry_date"),
    ]:
        m = re.search(rf"{label}\s*[:\#]?\s*({_DATE.pattern})", text, re.I)
        if m:
            try:
                # Group indices: 0=full, 1=date, 2=year, 3=month, 4=day
                out[key] = _normalize_date(m.group(2), m.group(3), m.group(4))
                out["_quotes"].append(m.group(0)[:120])
            except (ValueError, IndexError):
                # Best effort: an unparsable date simply leaves the field unset.
                pass

    # Governing law (multilingual); the value is cut to 120 characters.
    gov = re.search(
        r"(?:governing\s+law|applicable\s+law|ir[aá]ny[aá]d[oó]\s+jog|Anwendbares\s+Recht)\s*[:\.\,]?\s*([\w\s,]+)",
        text, re.I,
    )
    if gov:
        out["governing_law"] = gov.group(1).strip()[:120]

    # Termination clause detection: a cheap keyword probe first, then the
    # labelled clause text (20-200 characters on one line, followed by a blank
    # line or the end of the text).
    if re.search(r"(?:termination|felmond[aá]s|K[üu]ndigung)", text, re.I):
        m = re.search(
            r"(?:termination\s+(?:terms|clause)|felmond[aá]si\s+felt[eé]tel\w*|K[üu]ndigungsfrist)\s*[:\#]?\s*(.{20,200}?)(?:\n\n|$)",
            text, re.I,
        )
        if m:
            out["termination_terms"] = m.group(1).strip()
            out["_quotes"].append(m.group(0)[:200])

    # Auto-renewal
    if re.search(r"(?:auto[\s-]?renewal|automatically\s+renewed|automatikusan\s+meg[uú]jul|automatische\s+Verl[aä]ngerung)", text, re.I):
        out["auto_renewal"] = {"enabled": True}

    # Change-of-control
    if re.search(r"(?:change[\s-]?of[\s-]?control|kontrollv[aá]ltoz[aá]s|Kontrollwechsel)", text, re.I):
        out["change_of_control"] = True

    # Non-compete
    if re.search(r"(?:non[\s-]?compete|versenytilalom|Wettbewerbsverbot)", text, re.I):
        out["non_compete"] = True

    # Confidentiality (NDA implies confidentiality even without the keyword)
    if re.search(r"(?:confidentiality|non[-\s]?disclosure|\bnda\b|titoktart|Vertraulichkeit)", text, re.I):
        out["confidentiality_clause"] = True

    return out
# ---------------------------------------------------------------------------
# Financial Report
# ---------------------------------------------------------------------------
def _extract_financial_report(text: str, file_name: str) -> dict[str, Any]:
    """Extract report type, accounting standard and period from raw text.

    Args:
        text: Full text of the document.
        file_name: Name of the source file (not used by this extractor).

    Returns:
        A dict that always contains ``_quotes`` and ``_confidence`` and may
        contain ``report_type`` (``"income_statement"``, ``"balance_sheet"``
        or ``"cash_flow"``), ``accounting_standard`` (``"IFRS"``,
        ``"US-GAAP"``, ``"HU-GAAP"`` or ``"DE-HGB"``) and ``period_start`` /
        ``period_end`` (ISO strings). Matched period spans, cut to 120
        characters, are recorded in ``_quotes``.
    """
    out: dict[str, Any] = {
        "_quotes": [],
        "_confidence": {},
    }

    # Report type. Explicit statement titles are checked first; the bare word
    # "profit" is only a fallback, because balance sheets and cash-flow
    # statements routinely mention profit (e.g. "profit for the year").
    text_lower = text.lower()
    if (
        "income statement" in text_lower
        or "p&l" in text_lower
        or "profit and loss" in text_lower
        or "profit & loss" in text_lower
    ):
        out["report_type"] = "income_statement"
    # Hungarian "mérleg" is matched with or without the accent.
    elif "balance sheet" in text_lower or re.search(r"m[eé]rleg", text_lower):
        out["report_type"] = "balance_sheet"
    elif "cash flow" in text_lower:
        out["report_type"] = "cash_flow"
    elif "profit" in text_lower:
        out["report_type"] = "income_statement"

    # Accounting standard (case-sensitive markers, first hit wins)
    if "IFRS" in text:
        out["accounting_standard"] = "IFRS"
    elif "US-GAAP" in text or "US GAAP" in text:
        out["accounting_standard"] = "US-GAAP"
    elif "HU-GAAP" in text or "HÁR" in text:
        out["accounting_standard"] = "HU-GAAP"
    elif "HGB" in text:
        out["accounting_standard"] = "DE-HGB"

    # Period
    for label, key in [
        (r"(?:period\s+start|id[oő]szak\s+kezdet)", "period_start"),
        (r"(?:period\s+end|id[oő]szak\s+v[eé]g)", "period_end"),
    ]:
        m = re.search(rf"{label}\s*[:\#]?\s*({_DATE.pattern})", text, re.I)
        if m:
            try:
                # Group indices: 0=full, 1=date, 2=year, 3=month, 4=day
                out[key] = _normalize_date(m.group(2), m.group(3), m.group(4))
                # Record the evidence span, as the invoice and contract
                # extractors do for their dates.
                out["_quotes"].append(m.group(0)[:120])
            except (ValueError, IndexError):
                # Best effort: an unparsable date simply leaves the field unset.
                pass

    return out
# ---------------------------------------------------------------------------
# Universal (any other doc type)
# ---------------------------------------------------------------------------
def _extract_universal(text: str, file_name: str) -> dict[str, Any]:
    """Fallback extractor for document types without a dedicated handler.

    Returns the generic skeleton (type ``"other"``, language ``"en"``, empty
    parties / dates / amounts / line items). The first date found anywhere in
    *text*, if any, is stored as ``dates["signature"]``. *file_name* is not
    used.
    """
    skeleton: dict[str, Any] = {
        "_quotes": [],
        "_confidence": {},
        "document_type": "other",
        "document_language": "en",
        "parties": [],
        "dates": {},
        "amounts": {},
        "line_items": [],
    }

    # Use the first date in the text as a generic signature date.
    first_date = _DATE.search(text)
    if first_date is None:
        return skeleton

    try:
        year, month, day = first_date.group(1, 2, 3)
        skeleton["dates"]["signature"] = _normalize_date(year, month, day)
    except (ValueError, IndexError):
        pass
    return skeleton