"""Tolerant number normalization (HU/EU/US/FR formats + 8+ currencies + null aliases).

Examples:
    * "1 234 567" (HU) → 1234567
    * "1.234,56" (EU) → 1234.56
    * "1,234.56" (US) → 1234.56
    * "190 500 Ft" → 190500
    * "$1,234" → 1234
    * "null", "n/a", "none", "-", "—" → None (LLM "missing" indicator)

Every numeric value at the input of a domain check passes through ``coerce_number``.
"""

from __future__ import annotations

import math
import re


# Lower-cased, stripped strings that mean "no data" in LLM output.
_NULL_ALIASES = {
    "null", "none", "n/a", "na", "missing",
    "-", "—", "–", "?", "",
    "nincs",  # Hungarian
    "keine",  # German
}

# Single source of truth for the recognised currency markers, so the
# suffix and prefix patterns below cannot drift apart.
_CURRENCY_ALTERNATION = r"USD|EUR|HUF|GBP|CHF|CZK|PLN|RON|JPY|Ft|€|\$|£"

# Trailing marker, e.g. "190 500 Ft" (capturing group kept for callers
# that inspect the matched marker).
_CURRENCY_PATTERN = re.compile(
    rf"\s*({_CURRENCY_ALTERNATION})\s*$",
    re.IGNORECASE,
)

# Leading marker, e.g. "$1,234" or "EUR 12".
_CURRENCY_PREFIX_PATTERN = re.compile(
    rf"^\s*({_CURRENCY_ALTERNATION})\s*",
    re.IGNORECASE,
)

_UNICODE_MINUS = "\u2212"


def is_null_alias(value: str | None) -> bool:
    """Return True if *value* is a "no data" indicator.

    ``None`` always counts as missing. A string counts as missing when,
    after stripping surrounding whitespace and lower-casing, it is one of
    ``_NULL_ALIASES``. Any non-string, non-None value is not an alias.
    """
    if value is None:
        return True
    if not isinstance(value, str):
        return False
    return value.strip().lower() in _NULL_ALIASES


def _strip_currency(text: str) -> str:
    """Remove one trailing and one leading currency marker from *text*."""
    text = _CURRENCY_PATTERN.sub("", text).strip()
    return _CURRENCY_PREFIX_PATTERN.sub("", text).strip()


def _normalize_separators(digits: str) -> str:
    """Rewrite an unsigned, whitespace-free number so ``float`` can parse it.

    Heuristics:
      * both "." and ",": whichever comes last is the decimal separator,
        the other one is a thousands separator (US vs. EU style);
      * only ",": a final group of 1-2 digits is a decimal part ("1,5"),
        otherwise every comma is a thousands separator ("1,234,567");
      * several ".": pure 3-digit grouping ("1.234.567") is thousands
        grouping; otherwise the last dot is the decimal separator;
      * a single "." is left alone and read as a decimal point.
    """
    has_dot = "." in digits
    has_comma = "," in digits

    if has_dot and has_comma:
        if digits.rfind(".") > digits.rfind(","):
            return digits.replace(",", "")  # US: 1,234.56
        return digits.replace(".", "").replace(",", ".")  # EU: 1.234,56

    if has_comma:
        head, _, tail = digits.rpartition(",")
        if len(tail) in (1, 2):
            return head.replace(",", "") + "." + tail
        return digits.replace(",", "")

    if digits.count(".") > 1:
        groups = digits.split(".")
        # A number has at most one decimal point, so "1.234.567" can only
        # be thousands grouping.
        if 1 <= len(groups[0]) <= 3 and all(len(g) == 3 for g in groups[1:]):
            return "".join(groups)
        head, _, tail = digits.rpartition(".")
        return head.replace(".", "") + "." + tail

    return digits


def coerce_number(value: object) -> float | None:
    """Tolerant numeric coercion from any-format string, int, or float.

    Accepts HU/EU/US/FR digit grouping, a leading or trailing currency
    marker, a leading sign (ASCII or U+2212) before or after the currency
    marker, and accounting-style parentheses for negative amounts.

    Returns None if:
        * value is None or a null-alias string
        * value is a bool, or any type other than str/int/float
        * value cannot be parsed as a number
        * the result would not be finite (NaN, infinity, overflow)
    """
    if value is None:
        return None

    # bool is a subclass of int; True/False are never meant as 1/0 here.
    if isinstance(value, bool):
        return None

    if isinstance(value, (int, float)):
        try:
            number = float(value)
        except OverflowError:  # int too large for a float
            return None
        return number if math.isfinite(number) else None

    if not isinstance(value, str):
        return None

    text = value.strip()
    if not text or is_null_alias(text):
        return None

    text = _strip_currency(text.replace(_UNICODE_MINUS, "-"))

    # Accounting notation: "(1,234)" means -1234.
    if text.startswith("(") and text.endswith(")"):
        text = "-" + text[1:-1].strip()

    # Pull the sign off first so a currency marker behind it
    # ("-$1,234", "($1,234)") can still be stripped.
    sign = ""
    if text[:1] in ("+", "-"):
        sign = text[0]
        text = _strip_currency(text[1:])

    # Drop every kind of whitespace used as a thousands separator
    # (space, NBSP, narrow NBSP, thin space, ...).
    text = "".join(text.split())
    if not text:
        return None

    try:
        number = float(sign + _normalize_separators(text))
    except ValueError:
        return None
    # "nan", "inf" and overflowing literals such as "1e999" are not usable.
    return number if math.isfinite(number) else None

