# Source: QModel/app/arabic_nlp.py (uploaded by aelgendy via huggingface_hub, rev 6ab1c8e)
"""Arabic NLP — normalisation, light stemming, language detection."""
from __future__ import annotations
import re
from typing import Dict, List, Literal
# ── Normalization patterns ─────────────────────────────────────────────
_DIACRITICS = re.compile(r"[\u064B-\u0655\u0656-\u0658\u0670\u06D6-\u06ED]")
_ALEF_VARS = re.compile(r"[أإآٱ]")
_WAW_HAMZA = re.compile(r"ؤ")
_YA_HAMZA = re.compile(r"ئ")
_TA_MARBUTA = re.compile(r"ة\b")
_ALEF_MAQSURA = re.compile(r"ى")
_TATWEEL = re.compile(r"\u0640+")
_PUNC_AR = re.compile(r"[،؛؟!«»\u200c\u200d\u200f\u200e]")
_MULTI_SPACE = re.compile(r"\s{2,}")
_NON_AR_EN = re.compile(r"[^\u0600-\u06FF\u0750-\u077Fa-zA-Z0-9\s]")
# Canonical spellings restored after character-level normalization.
_SPELLING_MAP: Dict[str, str] = {
    "قران": "قرآن",
    "القران": "القرآن",
    "اللہ": "الله",
}
# Ordered (pattern, replacement) pipeline — order matters: diacritics and
# tatweel must go before letter folding, punctuation removal comes last.
_NORM_PIPELINE = (
    (_DIACRITICS, ""),
    (_TATWEEL, ""),
    (_ALEF_VARS, "ا"),
    (_WAW_HAMZA, "و"),
    (_YA_HAMZA, "ي"),
    (_TA_MARBUTA, "ه"),
    (_ALEF_MAQSURA, "ي"),
    (_PUNC_AR, " "),
)
def normalize_arabic(text: str, *, aggressive: bool = False) -> str:
    """Canonicalize Arabic text.

    Strips diacritics and tatweel, folds hamza/alef/ta-marbuta/alef-maqsura
    variants, removes Arabic punctuation, applies canonical spellings, and
    collapses whitespace. With ``aggressive=True`` every character outside
    the Arabic/Latin/digit/whitespace set is also replaced by a space.
    """
    for pattern, replacement in _NORM_PIPELINE:
        text = pattern.sub(replacement, text)
    # NOTE: the spelling map may re-introduce آ after alef folding — this
    # keeps the canonical religious spelling قرآن.
    for variant, canonical in _SPELLING_MAP.items():
        text = text.replace(variant, canonical)
    if aggressive:
        text = _NON_AR_EN.sub(" ", text)
    return _MULTI_SPACE.sub(" ", text).strip()
# ── Light stemming ─────────────────────────────────────────────────────
# Longest alternatives first: re alternation takes the first match, so
# listing "و" before "وال" would stop "وال" from ever being stripped.
# The previous pattern also ended in \b, which can never match between
# two Arabic letters (both are word characters), so prefixes were never
# removed from real words — fixed by dropping the boundary assertion.
_AR_PREFIXES = re.compile(
    r"^(وإلى|وعلى|وال|فال|بال|كال|ولل|ومن|وفي|وعن|ال|لل|و|ف|ب|ل)"
)
_AR_SUFFIXES = re.compile(
    r"(ون|ين|ان|ات|ها|هم|هن|كم|كن|نا|ني|تي|ي|ه|ك|ا|وا)$"
)
def light_stem(word: str) -> str:
    """Light stemming: strip one common Arabic prefix and one suffix.

    Returns the stemmed form, or the original *word* unchanged when
    stripping would leave fewer than two characters (guards very short
    words and words that consist only of an affix).
    """
    stem = _AR_PREFIXES.sub("", word)
    stem = _AR_SUFFIXES.sub("", stem)
    return stem if len(stem) >= 2 else word
def tokenize_ar(text: str) -> List[str]:
    """Normalize, lowercase, whitespace-split, and light-stem Arabic text."""
    cleaned = normalize_arabic(text, aggressive=True).lower()
    tokens = cleaned.split()
    return [light_stem(tok) for tok in tokens if tok]
# ── Language detection ─────────────────────────────────────────────────
_ARABIC_SCRIPT = re.compile(
    r"[\u0600-\u06FF\u0750-\u077F\uFB50-\uFDFF\uFE70-\uFEFF]"
)
# Compiled once at import time instead of re-looking up re's pattern
# cache on every detect_language() call.
_LATIN_LETTERS = re.compile(r"[a-zA-Z]")
def detect_language(text: str) -> Literal["arabic", "english", "mixed"]:
    """Classify *text* by the ratio of Arabic-script to Latin letters.

    Returns "arabic" when more than 70% of counted letters are Arabic,
    "english" when fewer than 30% are, and "mixed" otherwise. Text with
    no letters at all yields ratio 0 (guarded denominator) → "english".
    """
    arabic_count = len(_ARABIC_SCRIPT.findall(text))
    latin_count = len(_LATIN_LETTERS.findall(text))
    total = (arabic_count + latin_count) or 1  # avoid ZeroDivisionError
    ratio = arabic_count / total
    if ratio > 0.70:
        return "arabic"
    if ratio < 0.30:
        return "english"
    return "mixed"
def language_instruction(lang: str) -> str:
    """Return the reply-language directive to embed in the LLM prompt.

    Any value other than "arabic"/"mixed"/"english" falls back to the
    short strict-English instruction.
    """
    if lang == "arabic":
        return (
            "يجب أن تكون الإجابة كاملةً باللغة العربية الفصحى تماماً. "
            "لا تستخدم الإنجليزية أو أي لغة أخرى في أي جزء من الإجابة، "
            "باستثناء الاقتباسات الموجودة في صناديق الأدلة فقط. "
            "إذا كان السؤال بالعربية، أجب بالعربية حصراً."
        )
    if lang == "mixed":
        return (
            "The question mixes Arabic and English. Reply primarily in Arabic (الفصحى) "
            "but you may include English transliterations for key Islamic terms where essential. "
            "Match the dominant language of the question."
        )
    if lang == "english":
        return (
            "You MUST reply entirely in clear, formal English. "
            "Do NOT use Arabic in your explanation — only inside evidence quotation boxes. "
            "The user asked in English and expects an English answer."
        )
    return "You MUST reply entirely in clear, formal English."