# Source: QModel/app/arabic_nlp.py (uploaded by aelgendy via huggingface_hub, rev 6ab1c8e)
"""Arabic NLP — normalisation, light stemming, language detection."""
from __future__ import annotations
import re
from typing import Dict, List, Literal
# ── Normalization patterns ─────────────────────────────────────────────
_DIACRITICS = re.compile(r"[\u064B-\u0655\u0656-\u0658\u0670\u06D6-\u06ED]")
_ALEF_VARS = re.compile(r"[أإآٱ]")
_WAW_HAMZA = re.compile(r"ؤ")
_YA_HAMZA = re.compile(r"ئ")
_TA_MARBUTA = re.compile(r"ة\b")
_ALEF_MAQSURA = re.compile(r"ى")
_TATWEEL = re.compile(r"\u0640+")
_PUNC_AR = re.compile(r"[،؛؟!«»\u200c\u200d\u200f\u200e]")
_MULTI_SPACE = re.compile(r"\s{2,}")
_NON_AR_EN = re.compile(r"[^\u0600-\u06FF\u0750-\u077Fa-zA-Z0-9\s]")
# Canonical spellings restored after character-level normalization.
_SPELLING_MAP: Dict[str, str] = {
    "قران": "قرآن",
    "القران": "القرآن",
    "اللہ": "الله",
}
# Ordered (pattern, replacement) pipeline — order matters: diacritics and
# tatweel must go before letter folding, punctuation removal comes last.
_NORM_PIPELINE = (
    (_DIACRITICS, ""),
    (_TATWEEL, ""),
    (_ALEF_VARS, "ا"),
    (_WAW_HAMZA, "و"),
    (_YA_HAMZA, "ي"),
    (_TA_MARBUTA, "ه"),
    (_ALEF_MAQSURA, "ي"),
    (_PUNC_AR, " "),
)
def normalize_arabic(text: str, *, aggressive: bool = False) -> str:
    """Canonicalize Arabic text.

    Strips diacritics and tatweel, folds hamza/alef/ta-marbuta/alef-maqsura
    variants, removes Arabic punctuation, applies canonical spellings, and
    collapses whitespace. With ``aggressive=True`` every character outside
    the Arabic/Latin/digit/whitespace set is also replaced by a space.
    """
    for pattern, replacement in _NORM_PIPELINE:
        text = pattern.sub(replacement, text)
    # NOTE: the spelling map may re-introduce آ after alef folding — this
    # keeps the canonical religious spelling قرآن.
    for variant, canonical in _SPELLING_MAP.items():
        text = text.replace(variant, canonical)
    if aggressive:
        text = _NON_AR_EN.sub(" ", text)
    return _MULTI_SPACE.sub(" ", text).strip()
# ── Light stemming ─────────────────────────────────────────────────────
# Longest alternatives first: re alternation takes the first match, so
# listing "و" before "وال" would stop "وال" from ever being stripped.
# The previous pattern also ended in \b, which can never match between
# two Arabic letters (both are word characters), so prefixes were never
# removed from real words — fixed by dropping the boundary assertion.
_AR_PREFIXES = re.compile(
    r"^(وإلى|وعلى|وال|فال|بال|كال|ولل|ومن|وفي|وعن|ال|لل|و|ف|ب|ل)"
)
_AR_SUFFIXES = re.compile(
    r"(ون|ين|ان|ات|ها|هم|هن|كم|كن|نا|ني|تي|ي|ه|ك|ا|وا)$"
)
def light_stem(word: str) -> str:
    """Light stemming: strip one common Arabic prefix and one suffix.

    Returns the stemmed form, or the original *word* unchanged when
    stripping would leave fewer than two characters (guards very short
    words and words that consist only of an affix).
    """
    stem = _AR_PREFIXES.sub("", word)
    stem = _AR_SUFFIXES.sub("", stem)
    return stem if len(stem) >= 2 else word
def tokenize_ar(text: str) -> List[str]:
    """Normalize, lowercase, whitespace-split, and light-stem Arabic text."""
    cleaned = normalize_arabic(text, aggressive=True).lower()
    tokens = cleaned.split()
    return [light_stem(tok) for tok in tokens if tok]
# ── Language detection ─────────────────────────────────────────────────
_ARABIC_SCRIPT = re.compile(
    r"[\u0600-\u06FF\u0750-\u077F\uFB50-\uFDFF\uFE70-\uFEFF]"
)
# Compiled once at import time instead of re-looking up re's pattern
# cache on every detect_language() call.
_LATIN_LETTERS = re.compile(r"[a-zA-Z]")
def detect_language(text: str) -> Literal["arabic", "english", "mixed"]:
    """Classify *text* by the ratio of Arabic-script to Latin letters.

    Returns "arabic" when more than 70% of counted letters are Arabic,
    "english" when fewer than 30% are, and "mixed" otherwise. Text with
    no letters at all yields ratio 0 (guarded denominator) → "english".
    """
    arabic_count = len(_ARABIC_SCRIPT.findall(text))
    latin_count = len(_LATIN_LETTERS.findall(text))
    total = (arabic_count + latin_count) or 1  # avoid ZeroDivisionError
    ratio = arabic_count / total
    if ratio > 0.70:
        return "arabic"
    if ratio < 0.30:
        return "english"
    return "mixed"
def language_instruction(lang: str) -> str:
    """Return the reply-language directive to embed in the LLM prompt.

    Any value other than "arabic"/"mixed"/"english" falls back to the
    short strict-English instruction.
    """
    if lang == "arabic":
        return (
            "يجب أن تكون الإجابة كاملةً باللغة العربية الفصحى تماماً. "
            "لا تستخدم الإنجليزية أو أي لغة أخرى في أي جزء من الإجابة، "
            "باستثناء الاقتباسات الموجودة في صناديق الأدلة فقط. "
            "إذا كان السؤال بالعربية، أجب بالعربية حصراً."
        )
    if lang == "mixed":
        return (
            "The question mixes Arabic and English. Reply primarily in Arabic (الفصحى) "
            "but you may include English transliterations for key Islamic terms where essential. "
            "Match the dominant language of the question."
        )
    if lang == "english":
        return (
            "You MUST reply entirely in clear, formal English. "
            "Do NOT use Arabic in your explanation — only inside evidence quotation boxes. "
            "The user asked in English and expects an English answer."
        )
    return "You MUST reply entirely in clear, formal English."