# Language-code helpers: loads the alias table shipped next to this module,
# derives lookup tables from it, and checks `_Latn` script labels against
# the languages listed there.
from __future__ import annotations

import json
from pathlib import Path

import pycountry

# Alias table stored alongside this module.
LANGUAGE_ALIASES_JSON = Path(__file__).with_name("language_aliases.json")


def _load_json_mapping(path: Path) -> dict[str, object]:
    """Parse *path* as UTF-8 JSON and return its top-level object.

    Raises:
        ValueError: if the top-level JSON value is not an object.
    """
    with path.open(encoding="utf-8") as handle:
        payload = json.load(handle)
    if isinstance(payload, dict):
        return payload
    raise ValueError(f"Expected JSON object in {path}")


def _load_language_aliases(path: Path = LANGUAGE_ALIASES_JSON) -> dict[str, tuple[str, ...]]:
    """Read the alias table at *path* as ``{canonical code: (alias, ...)}``.

    Entries whose value is not a JSON array are skipped silently, and within
    an array only non-empty strings are kept.
    """
    raw_table = _load_json_mapping(path)
    return {
        code: tuple(name for name in names if isinstance(name, str) and name)
        for code, names in raw_table.items()
        if isinstance(code, str) and isinstance(names, list)
    }


def _iso639_3_code(code: str) -> str:
    """Return the pycountry ``alpha_3`` for *code*, or *code* itself if none is found."""
    # Try the code as a two-letter identifier first, then as a three-letter one.
    record = pycountry.languages.get(alpha_2=code) or pycountry.languages.get(alpha_3=code)
    return getattr(record, "alpha_3", None) or code


def _invert_aliases(table: dict[str, tuple[str, ...]]) -> dict[str, str]:
    """Build ``{alias: canonical code}``; a repeated alias keeps its last canonical code."""
    reverse: dict[str, str] = {}
    for code, names in table.items():
        for name in names:
            reverse[name] = code
    return reverse


LANGUAGE_ALIASES = _load_language_aliases()
ALL_LANGS = list(LANGUAGE_ALIASES)

# Language codes that are left out of LATIN_ONLY_LANGS.
_NON_LATIN_LANGS = frozenset(
    {
        "ar", "fa", "ps", "sd", "ug", "ur", "ckb",
        "he", "yi",
        "ru", "uk", "be", "bg", "mk", "kk", "mn", "tt", "ky", "tg", "ba", "ce",
        "el", "hy", "ka", "am", "ti", "dv",
        "km", "lo", "my", "th", "si", "bo",
        "hi", "mr", "ne", "bn", "as", "ta", "te", "gu", "kn", "ml", "pa", "or",
        "ja", "zh", "ko",
    }
)

# Every canonical code from the alias table that is not excluded above.
LATIN_ONLY_LANGS = set(ALL_LANGS).difference(_NON_LATIN_LANGS)

# Canonical code -> pycountry alpha_3 (falls back to the code itself).
LANG_ISO2_TO_ISO3 = {code: _iso639_3_code(code) for code in ALL_LANGS}

# Alias -> canonical code.
LANG_ALIASES = _invert_aliases(LANGUAGE_ALIASES)


def canonical_lang(lang: str) -> str:
    """Map a known alias to its canonical code; return any other value unchanged."""
    try:
        return LANG_ALIASES[lang]
    except KeyError:
        return lang


def label_script_suffix(label: str) -> str | None:
    """Return the text after the last ``_`` in *label*, or None.

    None is returned when the stripped label has no underscore or when the
    part after the last underscore is empty/whitespace. A falsy *label* is
    treated as the empty string.
    """
    text = (label or "").strip()
    _, separator, tail = text.rpartition("_")
    if not separator:
        return None
    return tail.strip() or None


def is_latin_script_label(label: str) -> bool:
    """Return True when the label's suffix is exactly ``Latn``."""
    suffix = label_script_suffix(label)
    return suffix == "Latn"


def is_latin_script_compatible(lang: str, label: str) -> bool:
    """Return False when a `_Latn` label is paired with a language outside LATIN_ONLY_LANGS.

    Labels without a ``Latn`` suffix are always accepted. For ``Latn`` labels,
    *lang* is resolved through ``canonical_lang`` first; a code that is not a
    key of the alias table is not in LATIN_ONLY_LANGS and is therefore rejected.
    """
    if is_latin_script_label(label):
        return canonical_lang(lang) in LATIN_ONLY_LANGS
    return True