File size: 3,544 Bytes
7ff7119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
"""Tolerant number normalization (HU/EU/US/FR formats + 8+ currencies + null aliases).

Examples:
  * "1 234 567" (HU) → 1234567
  * "1.234,56"  (EU) → 1234.56
  * "1,234.56"  (US) → 1234.56
  * "190 500 Ft" → 190500
  * "$1,234"   → 1234
  * "null", "n/a", "none", "-", "—" → None (LLM "missing" indicator)

Every numeric value at the input of a domain check passes through ``coerce_number``.
"""

from __future__ import annotations

import math
import re

# Placeholder strings an LLM emits when it has no value to report.
# Lookups are done on the stripped, lower-cased input (see ``is_null_alias``),
# so every entry here is lower-case and carries no surrounding whitespace.
_NULL_ALIASES = {
    # Generic / English markers
    "null",
    "none",
    "n/a",
    "na",
    "missing",
    # Punctuation placeholders: hyphen, em dash, en dash, question mark, empty
    "-",
    "—",
    "–",
    "?",
    "",
    # Other languages
    "nincs",  # Hungarian
    "keine",  # German
}

# Matches one currency code or symbol at the very end of the text, together
# with the whitespace around it; matching ignores case.
_CURRENCY_PATTERN = re.compile(
    r"\s*(USD|EUR|HUF|GBP|CHF|CZK|PLN|RON|JPY|Ft|€|\$|£)\s*$",
    flags=re.IGNORECASE,
)


def is_null_alias(value: str | None) -> bool:
    """True if the value is the LLM's null indicator (no data)."""
    if value is None:
        return True
    if not isinstance(value, str):
        return False
    return value.strip().lower() in _NULL_ALIASES


def coerce_number(value) -> float | None:
    """Tolerant numeric coercion from any-format string, int, or float.

    Returns None if:
      * value is None or a null-alias string
      * value cannot be parsed as a number
    """
    if value is None:
        return None

    if isinstance(value, bool):
        # bool is an int subclass — guard so True != 1, False != 0
        return None

    if isinstance(value, (int, float)):
        return float(value)

    if not isinstance(value, str):
        return None

    s = value.strip()
    if not s or is_null_alias(s):
        return None

    # Strip currency suffix
    s = _CURRENCY_PATTERN.sub("", s).strip()
    # Strip currency prefix (e.g. "$1234")
    s = re.sub(r"^\s*([€$£]|USD|EUR|HUF|GBP|CHF|CZK|PLN|RON|JPY|Ft)\s*", "", s, flags=re.I).strip()

    # Negative parentheses: "(1234)" → "-1234"
    if s.startswith("(") and s.endswith(")"):
        s = "-" + s[1:-1]

    # Strip whitespace (HU thousands separator: "1 234 567")
    s = s.replace(" ", "").replace(" ", "").replace(" ", "")

    if not s or s in {"-", "+"}:
        return None

    # By now we have only digits, ., , and an optional leading +/-
    # Heuristic for separators:
    #   - if both . and , are present, the last one is the decimal,
    #     the others are thousands separators
    #   - if only , is present and ≤ 2 digits follow the last comma → decimal
    #     (otherwise comma is a thousands separator)
    #   - if only . is present and there are multiple . → last is decimal,
    #     the others are thousands

    has_dot = "." in s
    has_comma = "," in s

    if has_dot and has_comma:
        last_dot = s.rfind(".")
        last_comma = s.rfind(",")
        if last_dot > last_comma:
            # 1,234.56 → US: comma=thousands, dot=decimal
            s = s.replace(",", "")
        else:
            # 1.234,56 → EU: dot=thousands, comma=decimal
            s = s.replace(".", "").replace(",", ".")
    elif has_comma:
        last_comma = s.rfind(",")
        if len(s) - last_comma - 1 in {1, 2}:
            s = s[:last_comma].replace(",", "") + "." + s[last_comma + 1 :]
        else:
            s = s.replace(",", "")
    elif has_dot:
        n_dots = s.count(".")
        if n_dots > 1:
            last_dot = s.rfind(".")
            s = s[:last_dot].replace(".", "") + "." + s[last_dot + 1 :]

    try:
        return float(s)
    except ValueError:
        return None