| """Arabic NLP — normalisation, light stemming, language detection.""" |
|
|
| from __future__ import annotations |
|
|
| import re |
| from typing import Dict, List, Literal |
|
|
|
|
| |
# Combining marks: harakat, hamza marks, superscript alef, Quranic annotation signs.
_DIACRITICS = re.compile(r"[\u064B-\u0655\u0656-\u0658\u0670\u06D6-\u06ED]")
# Alef with hamza above/below, madda, wasla — all folded to bare alef.
_ALEF_VARS = re.compile(r"[أإآٱ]")
_WAW_HAMZA = re.compile(r"ؤ")
_YA_HAMZA = re.compile(r"ئ")
# Word-final ta marbuta only (\b after it).
_TA_MARBUTA = re.compile(r"ة\b")
_ALEF_MAQSURA = re.compile(r"ى")
# Kashida / elongation character.
_TATWEEL = re.compile(r"\u0640+")
# Arabic punctuation plus zero-width joiners and directional marks.
_PUNC_AR = re.compile(r"[،؛؟!«»\u200c\u200d\u200f\u200e]")
_MULTI_SPACE = re.compile(r"\s{2,}")
# Everything outside Arabic script, ASCII letters, digits, and whitespace.
_NON_AR_EN = re.compile(r"[^\u0600-\u06FF\u0750-\u077Fa-zA-Z0-9\s]")


# Frequent misspellings mapped to their canonical written form.
_SPELLING_MAP: Dict[str, str] = {
    "قران": "قرآن",
    "القران": "القرآن",
    "اللہ": "الله",
}




def normalize_arabic(text: str, *, aggressive: bool = False) -> str:
    """Return *text* with common Arabic orthographic variation collapsed.

    Strips diacritics and tatweel, unifies alef/hamza-carrier variants,
    maps word-final ta marbuta to ha and alef maqsura to ya, replaces
    Arabic punctuation with spaces, and canonicalises a few frequent
    spellings. With ``aggressive=True`` every character outside the
    Arabic/Latin/digit ranges is also dropped.
    """
    # Ordered pipeline of (compiled pattern, replacement) substitution passes.
    passes = (
        (_DIACRITICS, ""),
        (_TATWEEL, ""),
        (_ALEF_VARS, "ا"),
        (_WAW_HAMZA, "و"),
        (_YA_HAMZA, "ي"),
        (_TA_MARBUTA, "ه"),
        (_ALEF_MAQSURA, "ي"),
        (_PUNC_AR, " "),
    )
    for pattern, replacement in passes:
        text = pattern.sub(replacement, text)
    # Canonical spellings are applied after character folding.
    for variant, canonical in _SPELLING_MAP.items():
        text = text.replace(variant, canonical)
    if aggressive:
        text = _NON_AR_EN.sub(" ", text)
    return _MULTI_SPACE.sub(" ", text).strip()
|
|
|
|
| |
# Common attached prefixes (conjunctions, prepositions, the article),
# ordered longest-first so e.g. "وال" is preferred over bare "و".
# No trailing \b: Python's word boundary never matches between two Arabic
# word characters, so with \b attached prefixes were never stripped at all.
_AR_PREFIXES = re.compile(
    r"^(وإلى|وعلى|وال|فال|بال|كال|ولل|ومن|وفي|وعن|ال|لل|و|ف|ب|ل)"
)
# Common suffixes: plural endings and pronoun clitics.
_AR_SUFFIXES = re.compile(
    r"(ون|ين|ان|ات|ها|هم|هن|كم|كن|نا|ني|تي|ي|ه|ك|ا|وا)$"
)




def light_stem(word: str) -> str:
    """Light stemming: strip one common Arabic prefix and one suffix.

    The stripped form is returned only when at least two characters
    remain; otherwise *word* is returned unchanged, so short tokens are
    never destroyed by over-stemming.
    """
    w = _AR_PREFIXES.sub("", word)
    w = _AR_SUFFIXES.sub("", w)
    return w if len(w) >= 2 else word
|
|
|
|
def tokenize_ar(text: str) -> List[str]:
    """Split *text* into aggressively normalized, lightly stemmed tokens."""
    normalized = normalize_arabic(text, aggressive=True).lower()
    return list(map(light_stem, normalized.split()))
|
|
|
|
| |
# Arabic-script code points, including the presentation-form blocks that
# appear in legacy or shaped text.
_ARABIC_SCRIPT = re.compile(
    r"[\u0600-\u06FF\u0750-\u077F\uFB50-\uFDFF\uFE70-\uFEFF]"
)




def detect_language(text: str) -> Literal["arabic", "english", "mixed"]:
    """Classify *text* by the share of Arabic-script vs Latin letters."""
    arabic_count = len(_ARABIC_SCRIPT.findall(text))
    latin_count = len(re.findall(r"[a-zA-Z]", text))
    # Guard against division by zero when no letters of either script occur.
    total = max(arabic_count + latin_count, 1)
    share = arabic_count / total
    if share > 0.70:
        return "arabic"
    return "english" if share < 0.30 else "mixed"
|
|
|
|
def language_instruction(lang: str) -> str:
    """Return the reply-language directive to inject into an LLM prompt.

    Unknown labels fall back to a plain English-only directive.
    """
    if lang == "arabic":
        return (
            "يجب أن تكون الإجابة كاملةً باللغة العربية الفصحى تماماً. "
            "لا تستخدم الإنجليزية أو أي لغة أخرى في أي جزء من الإجابة، "
            "باستثناء الاقتباسات الموجودة في صناديق الأدلة فقط. "
            "إذا كان السؤال بالعربية، أجب بالعربية حصراً."
        )
    if lang == "mixed":
        return (
            "The question mixes Arabic and English. Reply primarily in Arabic (الفصحى) "
            "but you may include English transliterations for key Islamic terms where essential. "
            "Match the dominant language of the question."
        )
    if lang == "english":
        return (
            "You MUST reply entirely in clear, formal English. "
            "Do NOT use Arabic in your explanation — only inside evidence quotation boxes. "
            "The user asked in English and expects an English answer."
        )
    return "You MUST reply entirely in clear, formal English."
|
|