"""Dummy regex-based extractor — mock for the structured LLM extraction. This module produces a flat dict with English field names matching the ``schemas/pydantic_models.py`` typed schemas. Multilingual regex patterns support both English-generated and HU/DE legacy sample documents (important for the multilingual demo flows). In Phase 9 (test data regeneration), this module will be fully rewritten to target the new English-generated sample PDFs. For now it provides a minimal, structurally-correct stub so that downstream nodes (domain checks, anti-halluc filters) receive English-keyed data and the dummy-mode pipeline runs end-to-end. The ``_quotes`` field is populated from the matched text spans (the quote_validator anti-halluc layer #7 verifies that those quotes actually appear in the source full_text). """ from __future__ import annotations import re from typing import Any # --------------------------------------------------------------------------- # Shared regex patterns (multilingual) # --------------------------------------------------------------------------- # Hungarian tax-id format: XXXXXXXX-X-XX _TAX_ID_HU = re.compile(r"\b(\d{8})\s*-\s*(\d)\s*-\s*(\d{2})\b") # US EIN: XX-XXXXXXX _TAX_ID_US = re.compile(r"\b(\d{2})\s*-\s*(\d{7})\b") # Date in any common format: YYYY-MM-DD, YYYY.MM.DD, YYYY/MM/DD _DATE = re.compile(r"\b(\d{4})[.\-/](\d{1,2})[.\-/](\d{1,2})\.?\b") # Monetary amount with currency suffix: "1,234.56 USD" or "1 234 567 Ft" or "$1,234" _MONEY = re.compile( r"(?:[\$€£]\s*)?([\d\s.,]+)\s*(USD|EUR|HUF|GBP|CHF|Ft|JPY|CZK|PLN|RON)?\b", re.I, ) def _normalize_date(year: str, month: str, day: str) -> str: return f"{int(year):04d}-{int(month):02d}-{int(day):02d}" def _parse_money(s: str) -> float | None: """Parse "1 234 567" or "1,234.56" → float.""" if not s: return None cleaned = s.strip().replace(" ", "") has_dot = "." in cleaned has_comma = "," in cleaned if has_dot and has_comma: last_dot = cleaned.rfind(".") last_comma = cleaned.rfind(",") if last_dot > last_comma: cleaned = cleaned.replace(",", "") else: cleaned = cleaned.replace(".", "").replace(",", ".") elif has_comma: last_comma = cleaned.rfind(",") if len(cleaned) - last_comma - 1 in {1, 2}: cleaned = cleaned[:last_comma].replace(",", "") + "." + cleaned[last_comma + 1:] else: cleaned = cleaned.replace(",", "") elif has_dot: n_dots = cleaned.count(".") if n_dots > 1: last_dot = cleaned.rfind(".") cleaned = cleaned[:last_dot].replace(".", "") + "." + cleaned[last_dot + 1:] try: return float(cleaned) except ValueError: return None # --------------------------------------------------------------------------- # Dispatcher # --------------------------------------------------------------------------- def extract_dummy(full_text: str, doc_type: str, file_name: str) -> dict[str, Any]: """Doc-type-specific extractor → flat dict with EN field names.""" extractors = { "invoice": _extract_invoice, "delivery_note": _extract_delivery_note, "purchase_order": _extract_purchase_order, "contract": _extract_contract, "financial_report": _extract_financial_report, } fn = extractors.get(doc_type, _extract_universal) out = fn(full_text, file_name) out.setdefault("_source", {"file_name": file_name}) return out # --------------------------------------------------------------------------- # Invoice # --------------------------------------------------------------------------- def _extract_invoice(text: str, file_name: str) -> dict[str, Any]: out: dict[str, Any] = { "_quotes": [], "_confidence": {}, } # Invoice number — multilingual (EN/HU/DE) m = re.search( r"(?:invoice\s+(?:number|no\.?|#)|sz[aá]mla\s+sz[aá]m[a]?|Rechnungsnummer)\s*[:\#]?\s*(\S+)", text, re.I, ) if m: out["invoice_number"] = m.group(1).rstrip(",.;") out["_quotes"].append(m.group(0)[:120]) out["_confidence"]["invoice_number"] = "high" # Dates: issue, fulfillment, payment due for label, key in [ (r"(?:issue\s+date|date\s+issued|ki[aá]ll[ií]t[aá]s\s*d[aá]tum[a]?|Rechnungsdatum)", "issue_date"), (r"(?:fulfillment\s+date|service\s+date|teljes[ií]t[eé]s\s*d[aá]tum[a]?|Leistungsdatum)", "fulfillment_date"), (r"(?:payment\s+due|due\s+date|fizet[eé]si\s*hat[aá]rid[oő]|F[aä]lligkeitsdatum)", "payment_due_date"), ]: m = re.search(rf"{label}\s*[:\#]?\s*({_DATE.pattern})", text, re.I) if m: try: # Group indices: 0=full, 1=date, 2=year, 3=month, 4=day date_str = _normalize_date(m.group(2), m.group(3), m.group(4)) out[key] = date_str out["_quotes"].append(m.group(0)[:120]) out["_confidence"][key] = "high" except (ValueError, IndexError): pass # Issuer + Customer parties (HU/EN labels) issuer_match = re.search( r"(?:issuer|seller|supplier|ki[aá]ll[ií]t[oó]|sz[aá]ll[ií]t[oó]|Aussteller)\s*[:\#]?\s*([A-Z][\w\s\.,&-]+?)(?=\n|adósz|tax|address|c[íi]m)", text, re.I, ) if issuer_match: out["issuer"] = {"name": issuer_match.group(1).strip()} customer_match = re.search( r"(?:customer|buyer|client|vev[oő]|v[aá]s[aá]rl[oó]|Kunde)\s*[:\#]?\s*([A-Z][\w\s\.,&-]+?)(?=\n|adósz|tax|address|c[íi]m)", text, re.I, ) if customer_match: out["customer"] = {"name": customer_match.group(1).strip()} # Tax IDs (HU format prioritized; US/EU fallback) tax_ids = _TAX_ID_HU.findall(text) if tax_ids and out.get("issuer"): first = tax_ids[0] out["issuer"]["tax_id"] = f"{first[0]}-{first[1]}-{first[2]}" # Totals — multilingual (EN/HU/DE) for label, key in [ (r"(?:total\s+net|net\s+total|nett[oó]\s*v[eé]g[oö]ssz)", "total_net"), (r"(?:total\s+vat|vat\s+total|[aá]fa\s*v[eé]g[oö]ssz|MwSt[\.\s]+gesamt)", "total_vat"), (r"(?:total\s+gross|gross\s+total|brutt[oó]\s*v[eé]g[oö]ssz|Bruttogesamtbetrag|grand\s+total)", "total_gross"), ]: # The amount may carry a leading $/€/£ symbol — capture as optional prefix. m = re.search(rf"{label}\s*[:\#]?\s*[\$€£]?\s*([\d\s.,]+)", text, re.I) if m: val = _parse_money(m.group(1)) if val is not None: out[key] = val out["_quotes"].append(m.group(0)[:120]) out["_confidence"][key] = "high" # Currency detection if re.search(r"\b(USD|\$)\b", text): out["currency"] = "USD" elif re.search(r"\b(EUR|€)\b", text): out["currency"] = "EUR" elif re.search(r"\b(HUF|Ft)\b", text): out["currency"] = "HUF" elif re.search(r"\b(GBP|£)\b", text): out["currency"] = "GBP" return out # --------------------------------------------------------------------------- # Delivery Note # --------------------------------------------------------------------------- def _extract_delivery_note(text: str, file_name: str) -> dict[str, Any]: out: dict[str, Any] = { "_quotes": [], "_confidence": {}, } m = re.search( r"(?:delivery\s+note(?:\s+number)?|szallitolev[eé]l\s*sz[aá]m|Lieferschein)\s*[:\#]?\s*(\S+)", text, re.I, ) if m: out["document_number"] = m.group(1).rstrip(",.;") out["_quotes"].append(m.group(0)[:120]) return out # --------------------------------------------------------------------------- # Purchase Order # --------------------------------------------------------------------------- def _extract_purchase_order(text: str, file_name: str) -> dict[str, Any]: out: dict[str, Any] = { "_quotes": [], "_confidence": {}, } m = re.search( r"(?:purchase\s+order(?:\s+number)?|po\s*[:\#]|megrendel[eé]s\s*sz[aá]m|Bestellnummer)\s*[:\#]?\s*(\S+)", text, re.I, ) if m: out["document_number"] = m.group(1).rstrip(",.;") out["_quotes"].append(m.group(0)[:120]) return out # --------------------------------------------------------------------------- # Contract # --------------------------------------------------------------------------- def _extract_contract(text: str, file_name: str) -> dict[str, Any]: out: dict[str, Any] = { "_quotes": [], "_confidence": {}, "parties": [], } # Contract type — keyword detection text_lower = text.lower() if "non-disclosure" in text_lower or "nda" in text_lower or "titoktart" in text_lower: out["contract_type"] = "NDA" elif "lease" in text_lower or "rental" in text_lower or "lizing" in text_lower: out["contract_type"] = "lease" elif "service" in text_lower or "szolgaltatas" in text_lower: out["contract_type"] = "service" elif "framework" in text_lower or "MSA" in text: out["contract_type"] = "MSA" # Effective + expiry dates for label, key in [ (r"(?:effective\s+date|hat[aá]ly\s+kezdet|Vertragsbeginn)", "effective_date"), (r"(?:expiry\s+date|expiration|hat[aá]ly\s+v[eé]g|Vertragsende)", "expiry_date"), ]: m = re.search(rf"{label}\s*[:\#]?\s*({_DATE.pattern})", text, re.I) if m: try: out[key] = _normalize_date(m.group(2), m.group(3), m.group(4)) out["_quotes"].append(m.group(0)[:120]) except (ValueError, IndexError): pass # Governing law (multilingual) gov = re.search( r"(?:governing\s+law|applicable\s+law|ir[aá]ny[aá]d[oó]\s+jog|Anwendbares\s+Recht)\s*[:\.\,]?\s*([\w\s,]+)", text, re.I, ) if gov: out["governing_law"] = gov.group(1).strip()[:120] # Termination clause detection if re.search(r"(?:termination|felmond[aá]s|K[üu]ndigung)", text, re.I): m = re.search( r"(?:termination\s+(?:terms|clause)|felmond[aá]si\s+felt[eé]tel\w*|K[üu]ndigungsfrist)\s*[:\#]?\s*(.{20,200}?)(?:\n\n|$)", text, re.I, ) if m: out["termination_terms"] = m.group(1).strip() out["_quotes"].append(m.group(0)[:200]) # Auto-renewal if re.search(r"(?:auto[\s-]?renewal|automatically\s+renewed|automatikusan\s+meg[uú]jul|automatische\s+Verl[aä]ngerung)", text, re.I): out["auto_renewal"] = {"enabled": True} # Change-of-control if re.search(r"(?:change[\s-]?of[\s-]?control|kontrollv[aá]ltoz[aá]s|Kontrollwechsel)", text, re.I): out["change_of_control"] = True # Non-compete if re.search(r"(?:non[\s-]?compete|versenytilalom|Wettbewerbsverbot)", text, re.I): out["non_compete"] = True # Confidentiality (NDA implies confidentiality even without the keyword) if re.search(r"(?:confidentiality|non[-\s]?disclosure|\bnda\b|titoktart|Vertraulichkeit)", text, re.I): out["confidentiality_clause"] = True return out # --------------------------------------------------------------------------- # Financial Report # --------------------------------------------------------------------------- def _extract_financial_report(text: str, file_name: str) -> dict[str, Any]: out: dict[str, Any] = { "_quotes": [], "_confidence": {}, } text_lower = text.lower() if "income statement" in text_lower or "p&l" in text_lower or "profit" in text_lower: out["report_type"] = "income_statement" elif "balance sheet" in text_lower or "merleg" in text_lower: out["report_type"] = "balance_sheet" elif "cash flow" in text_lower: out["report_type"] = "cash_flow" # Accounting standard if "IFRS" in text: out["accounting_standard"] = "IFRS" elif "US-GAAP" in text or "US GAAP" in text: out["accounting_standard"] = "US-GAAP" elif "HU-GAAP" in text or "HÁR" in text: out["accounting_standard"] = "HU-GAAP" elif "HGB" in text: out["accounting_standard"] = "DE-HGB" # Period for label, key in [ (r"(?:period\s+start|id[oő]szak\s+kezdet)", "period_start"), (r"(?:period\s+end|id[oő]szak\s+v[eé]g)", "period_end"), ]: m = re.search(rf"{label}\s*[:\#]?\s*({_DATE.pattern})", text, re.I) if m: try: out[key] = _normalize_date(m.group(2), m.group(3), m.group(4)) except (ValueError, IndexError): pass return out # --------------------------------------------------------------------------- # Universal (any other doc type) # --------------------------------------------------------------------------- def _extract_universal(text: str, file_name: str) -> dict[str, Any]: out: dict[str, Any] = { "_quotes": [], "_confidence": {}, "document_type": "other", "document_language": "en", "parties": [], "dates": {}, "amounts": {}, "line_items": [], } # Try to find any date as a generic signature m = _DATE.search(text) if m: try: out["dates"]["signature"] = _normalize_date(m.group(1), m.group(2), m.group(3)) except (ValueError, IndexError): pass return out