| """Vietnamese text normalization for address matching. |
| |
| Uses underthesea character normalization (NFC + character map) to fix |
| encoding issues, then applies address-specific transformations |
| (abbreviation expansion, diacritics removal, key generation). |
| """ |
|
|
| import re |
| import unicodedata |
|
|
| from underthesea.pipeline.text_normalize.character_normalize import ( |
| normalize_characters_in_text, |
| ) |
|
|
| |
# Common Vietnamese administrative-unit abbreviations mapped to their full
# forms.  Keys are matched against lowercased input (see
# expand_abbreviations).  Every value carries a trailing space so the
# expansion stays separated from whatever follows ("tp.hcm" -> "thành phố
# hcm"); dotted keys consume the period, and the space-suffixed variants
# ("tp ", "t.p ") cover the abbreviation written without a final dot.
ABBREVIATIONS = {
    "tp.": "thành phố ",   # thành phố (city)
    "tp ": "thành phố ",
    "t.p.": "thành phố ",
    "t.p ": "thành phố ",
    "p.": "phường ",       # phường (ward)
    "q.": "quận ",         # quận (urban district)
    "h.": "huyện ",        # huyện (rural district)
    "tx.": "thị xã ",      # thị xã (town)
    "t.x.": "thị xã ",
    "tt.": "thị trấn ",    # thị trấn (township)
    "t.t.": "thị trấn ",
    "x.": "xã ",           # xã (commune)
}
|
|
|
|
def remove_diacritics(text: str) -> str:
    """Strip Vietnamese diacritics from *text*.

    The input is first passed through underthesea's character
    normalization (NFC plus a character map) to repair common encoding
    problems.  It is then decomposed with NFKD so accents become separate
    combining marks, which are filtered out.  The letters đ/Đ carry no
    combining mark (the bar is part of the base glyph), so they are
    mapped to d/D explicitly.
    """
    cleaned = normalize_characters_in_text(text)
    decomposed = unicodedata.normalize("NFKD", cleaned)
    base_chars = [ch for ch in decomposed if not unicodedata.combining(ch)]
    return "".join(base_chars).replace("đ", "d").replace("Đ", "D")
|
|
|
|
def normalize_key(text: str) -> str:
    """Collapse *text* into a canonical lookup key.

    Lowercases, strips diacritics, then drops every character that is
    not an ASCII lowercase letter or digit (whitespace and punctuation
    included), so variant spellings map to the same key.
    """
    lowered = text.lower().strip()
    ascii_form = remove_diacritics(lowered)
    return re.sub(r"[^a-z0-9]", "", ascii_form)
|
|
|
|
def expand_abbreviations(text, abbreviations=None):
    """Expand common Vietnamese address abbreviations in *text*.

    Args:
        text: Raw address text; it is lowercased before matching.
        abbreviations: Optional mapping of abbreviation -> expansion.
            Defaults to the module-level ABBREVIATIONS table.

    Returns:
        The lowercased text with abbreviations expanded and runs of
        whitespace collapsed to single spaces.

    Each abbreviation must start at a word boundary: a bare str.replace
    would also fire mid-word (e.g. the "p." inside "tháp.") and corrupt
    the text.  Longer keys are tried first so "t.p." is consumed before
    "p." can match inside it.
    """
    mapping = ABBREVIATIONS if abbreviations is None else abbreviations
    result = text.lower().strip()

    for abbr, full in sorted(mapping.items(), key=lambda x: -len(x[0])):
        # (?<!\w): the abbreviation may not be preceded by a letter/digit.
        result = re.sub(r"(?<!\w)" + re.escape(abbr), full, result)

    # Expansions carry a trailing space; collapse any doubled whitespace
    # that this produces ("tp. x" -> "thành phố  x" -> "thành phố x").
    return re.sub(r"\s+", " ", result).strip()
|
|
|
|
def normalize_for_matching(text: str) -> str:
    """Run the full normalization pipeline used for fuzzy matching:
    abbreviation expansion followed by key normalization."""
    return normalize_key(expand_abbreviations(text))
|
|