File size: 2,982 Bytes
7ff7119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
"""intent_classifier_node — fast regex-based intent recognition.

6 intents: list / extract / search / compare / validate / chat.
LLM-independent, < 1 ms.
"""

from __future__ import annotations

import re
import unicodedata

from graph.states.chat_state import ChatState


def _strip_accents(text: str) -> str:
    """Return *text* with diacritical marks removed (á→a, ő→o, etc.).

    The input is decomposed with Unicode NFKD normalization and every
    combining character is then dropped. Characters that have no
    decomposition are passed through unchanged.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    # unicodedata.combining() returns 0 for non-combining characters.
    base_chars = filter(lambda ch: unicodedata.combining(ch) == 0, decomposed)
    return "".join(base_chars)


# Ordered (intent label, compiled pattern) rule table.
# Each pattern lists English keywords first, followed by Hungarian (HU)
# keywords spelled without diacritics: the patterns are applied to
# accent-stripped text, so "ellenőrizd" is matched via "ellenoriz".
# The classifier picks the first rule whose pattern matches, so the order of
# the entries is significant (e.g. "mennyivel" must hit "compare" before the
# broader "menny\w*" in "extract").
_INTENT_RULES: list[tuple[str, re.Pattern[str]]] = [
    (intent_label, re.compile(pattern_source, re.IGNORECASE))
    for intent_label, pattern_source in (
        (
            "compare",
            r"\b(compar\w*|differ\w*|diff|versus|\bvs\b|cheap\w*|expensiv\w*|"
            r"hasonlit\w*|elter\w*|kulonbs\w*|szembe\w*|drag\w*|olcsobb\w*|mennyivel)\b",
        ),
        (
            "validate",
            r"\b(math|error\w*|valid\w*|check|verify|cdv|tax\s*id|consist\w*|correct|"
            r"matek\w*|hib\w*|validal\w*|ellenoriz\w*|adoszam\w*|ervenyes\w*|helyes)\b",
        ),
        (
            "search",
            r"\b(search|find|where|contain\w*|penalty|liquid\w*|clause\w*|"
            r"keres\w*|talald|hol|melyik|tartalmaz\w*|szallit\w*|kotber\w*|change|klauz\w*)\b",
        ),
        (
            "list",
            r"\b("
            r"(?:what|which)\s+(?:documents?|files?|types?|kinds?|uploads?)|"
            r"how\s*many\s+(?:documents?|files?)|"
            r"list|listazd|listazz|"
            r"file\w*|document\w*|kind|"
            r"milyen|mely|hany|fajl\w*|dokumentum\w*|tipus\w*"
            r")\b",
        ),
        (
            "extract",
            r"\b(gross|net|issu\w*|amount\w*|due|date\w*|quantity|total\w*|sum\w*|"
            r"price|cost|unit\s*price|payable|"
            r"brutto\w*|netto\w*|kiallit\w*|allit\w*|bocsat\w*|fizetesi|datum\w*|"
            r"menny\w*|osszeg\w*|vegosszeg\w*|ar\b|ara\b)\b",
        ),
    )
]


def _content_to_text(content: object) -> str:
    """Flatten a message ``content`` payload into plain text.

    * A string is returned unchanged.
    * For a list/tuple of content blocks, string items and the string
      ``"text"`` values of dict items are joined with newlines; blocks
      without text are skipped.
    * Anything else falls back to ``str(content)``.

    Going through ``str()`` for a list would classify the Python repr of the
    list: dict keys/values (e.g. ``'file'``) would hit the intent regexes and
    escaped newlines (``\\n``) would glue an ``n`` onto the following word.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, (list, tuple)):
        pieces = []
        for block in content:
            if isinstance(block, str):
                pieces.append(block)
            elif isinstance(block, dict) and isinstance(block.get("text"), str):
                # presumably a {"type": "text", "text": ...} block — only the
                # text payload is relevant for keyword matching.
                pieces.append(block["text"])
        return "\n".join(pieces)
    return str(content)


async def intent_classifier_node(state: ChatState) -> dict:
    """Classify the intent of the most recent human message.

    Args:
        state: Chat state; its ``"messages"`` entry is scanned from newest to
            oldest for the first item whose ``type`` attribute is ``"human"``.

    Returns:
        A dict with ``"intent"`` (the label of the first matching rule in
        ``_INTENT_RULES``, or ``"chat"`` when nothing matches or there is no
        human message) and ``"trace"`` (a one-item list describing the
        decision).
    """
    messages = state.get("messages") or []
    last_user_text = next(
        (
            _content_to_text(m.content)
            for m in reversed(messages)
            if getattr(m, "type", None) == "human"
        ),
        "",
    )

    # Strip accents so the accent-free regexes can match diacritic forms
    # like "ellenőrizd" → "ellenorizd".
    normalized = _strip_accents(last_user_text)

    # First matching rule wins; "chat" is the fallback intent.
    intent = next(
        (label for label, pattern in _INTENT_RULES if pattern.search(normalized)),
        "chat",
    )

    return {
        "intent": intent,
        "trace": [f"intent classifier: {intent}"],
    }