# Source: paperhawk/nodes/chat/intent_classifier_node.py
# Author: Nándorfi Vince
# Commit 7ff7119 — "Initial paperhawk push to HF Space (LFS for binaries)"
"""intent_classifier_node — fast regex-based intent recognition.
6 intents: list / extract / search / compare / validate / chat.
LLM-independent, < 1 ms.
"""
from __future__ import annotations
import re
import unicodedata
from graph.states.chat_state import ChatState
def _strip_accents(text: str) -> str:
"""ASCII normalization: strip diacritics (á→a, ő→o, etc.)."""
nfkd = unicodedata.normalize("NFKD", text)
return "".join(c for c in nfkd if not unicodedata.combining(c))
# English-first regexes with multilingual (HU) fallback — runs on
# ASCII-normalized text so "ellenőrizd" matches "ellenoriz".
#
# NOTE: rule ORDER matters — the classifier takes the FIRST pattern that
# matches, so the narrower intents (compare/validate/search) precede the
# broad list/extract buckets.  All Hungarian stems are written accentless
# to line up with _strip_accents() output.  "chat" has no rule here: it
# is the fall-through default in intent_classifier_node.
_INTENT_RULES: list[tuple[str, re.Pattern[str]]] = [
    (
        "compare",
        # Cross-document comparison: "compare", "versus", "cheaper";
        # HU stems: hasonlit*, elter*, kulonbs*, drag*, olcsobb*, mennyivel.
        re.compile(
            r"\b(compar\w*|differ\w*|diff|versus|\bvs\b|cheap\w*|expensiv\w*|"
            r"hasonlit\w*|elter\w*|kulonbs\w*|szembe\w*|drag\w*|olcsobb\w*|mennyivel)\b",
            re.I,
        ),
    ),
    (
        "validate",
        # Correctness / consistency checks: math, tax id, CDV, verify;
        # HU stems: matek*, hib*, validal*, ellenoriz*, adoszam*, ervenyes*, helyes.
        re.compile(
            r"\b(math|error\w*|valid\w*|check|verify|cdv|tax\s*id|consist\w*|correct|"
            r"matek\w*|hib\w*|validal\w*|ellenoriz\w*|adoszam\w*|ervenyes\w*|helyes)\b",
            re.I,
        ),
    ),
    (
        "search",
        # Full-text lookup inside documents ("where", "contains",
        # clause/penalty terms); HU stems: keres*, talald, hol, melyik,
        # tartalmaz*, szallit*, kotber*, klauz*.
        re.compile(
            r"\b(search|find|where|contain\w*|penalty|liquid\w*|clause\w*|"
            r"keres\w*|talald|hol|melyik|tartalmaz\w*|szallit\w*|kotber\w*|change|klauz\w*)\b",
            re.I,
        ),
    ),
    (
        "list",
        # Inventory questions about the uploaded files themselves
        # ("what documents", "how many files"); HU: milyen, mely, hany,
        # fajl*, dokumentum*, tipus*.
        re.compile(
            r"\b("
            r"(?:what|which)\s+(?:documents?|files?|types?|kinds?|uploads?)|"
            r"how\s*many\s+(?:documents?|files?)|"
            r"list|listazd|listazz|"
            r"file\w*|document\w*|kind|"
            r"milyen|mely|hany|fajl\w*|dokumentum\w*|tipus\w*"
            r")\b",
            re.I,
        ),
    ),
    (
        "extract",
        # Field extraction (amounts, dates, totals, prices); HU stems:
        # brutto*, netto*, kiallit*, fizetesi, datum*, osszeg*, vegosszeg*, ar/ara.
        re.compile(
            r"\b(gross|net|issu\w*|amount\w*|due|date\w*|quantity|total\w*|sum\w*|"
            r"price|cost|unit\s*price|payable|"
            r"brutto\w*|netto\w*|kiallit\w*|allit\w*|bocsat\w*|fizetesi|datum\w*|"
            r"menny\w*|osszeg\w*|vegosszeg\w*|ar\b|ara\b)\b",
            re.I,
        ),
    ),
]
async def intent_classifier_node(state: ChatState) -> dict:
    """Label the most recent user message with one of the six intents.

    Walks the message history backwards to the latest human turn,
    accent-folds its text, and returns the first rule label whose
    pattern matches — falling back to "chat" when none do.
    """
    history = state.get("messages") or []

    # Latest human message, coerced to str when content is not already one.
    query = ""
    for msg in reversed(history):
        if getattr(msg, "type", None) == "human":
            raw = msg.content
            query = raw if isinstance(raw, str) else str(raw)
            break

    # Strip diacritics so e.g. "ellenőrizd" hits the accentless
    # "ellenoriz" stem in the rule patterns.
    folded = _strip_accents(query)

    # First rule that fires wins; "chat" is the fall-through default.
    intent = next(
        (label for label, pattern in _INTENT_RULES if pattern.search(folded)),
        "chat",
    )

    return {
        "intent": intent,
        "trace": [f"intent classifier: {intent}"],
    }