| from __future__ import annotations |
|
|
| import json |
| import os |
| from pathlib import Path |
|
|
| import pycountry |
|
|
| from source_config import LANGUAGE_BUCKETS |
|
|
# Per-bucket views derived from LANGUAGE_BUCKETS (bucket name -> config mapping).
# Bucket name -> the bucket's "langs" entry.
LANGUAGE_GROUPS = {
    name: bucket["langs"]
    for name, bucket in LANGUAGE_BUCKETS.items()
}
# Bucket name -> the bucket's "weight" entry, coerced to float.
LANGUAGE_GROUP_WEIGHTS = {
    name: float(bucket["weight"])
    for name, bucket in LANGUAGE_BUCKETS.items()
}
# Bucket name -> the bucket's "min_chars" entry, coerced to int.
LANGUAGE_GROUP_MIN_CHARS = {
    name: int(bucket["min_chars"])
    for name, bucket in LANGUAGE_BUCKETS.items()
}
# Names of the buckets whose config carries a truthy "latin" entry.
LATIN_GROUPS = {
    name
    for name, bucket in LANGUAGE_BUCKETS.items()
    if bucket.get("latin")
}
# "language_aliases.json" located in the same directory as this module.
LANGUAGE_ALIASES_JSON = Path(__file__).with_name("language_aliases.json")
|
|
|
|
def _load_json_mapping(path: Path) -> dict[str, object]:
    """Read the UTF-8 JSON file at ``path`` and return its top-level object.

    Raises:
        ValueError: If the decoded top-level JSON value is not a dict.
    """
    payload = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(payload, dict):
        return payload
    raise ValueError(f"Expected JSON object in {path}")
|
|
|
|
def _load_language_aliases(path: Path = LANGUAGE_ALIASES_JSON) -> dict[str, tuple[str, ...]]:
    """Load the alias table from ``path`` as ``{canonical: (alias, ...)}``.

    Entries whose key is not a string or whose value is not a list are
    skipped. Within a kept entry, only non-empty string aliases are retained,
    in their original order.
    """
    table: dict[str, tuple[str, ...]] = {}
    for key, value in _load_json_mapping(path).items():
        if isinstance(key, str) and isinstance(value, list):
            # Drop non-string and empty-string aliases; keep the rest in order.
            table[key] = tuple(
                item for item in value if isinstance(item, str) and item
            )
    return table
|
|
|
|
# Canonical code -> tuple of aliases, loaded from the default alias JSON file.
LANGUAGE_ALIASES = _load_language_aliases()
# Canonical codes, in the order they appear in LANGUAGE_ALIASES.
ALL_LANGS = list(LANGUAGE_ALIASES)
# Canonical code -> the ``alpha_3`` attribute of the pycountry record found by
# an alpha_2 lookup (or, failing that, an alpha_3 lookup). A code with no usable
# ``alpha_3`` value maps to itself.
LANG_ISO2_TO_ISO3 = {
    code: getattr(
        pycountry.languages.get(alpha_2=code) or pycountry.languages.get(alpha_3=code),
        "alpha_3",
        None,
    )
    or code
    for code in ALL_LANGS
}
# Reverse lookup: alias -> canonical code. If the same alias is listed under
# several canonical codes, the last one iterated wins.
LANG_ALIASES = {
    alt: primary
    for primary, alts in LANGUAGE_ALIASES.items()
    for alt in alts
}
# Canonical code -> tuple of its aliases.
LANG_SOURCE_CODES = {primary: tuple(alts) for primary, alts in LANGUAGE_ALIASES.items()}
# Language code -> name of the group in LANGUAGE_GROUPS that lists it.
LANG_TO_GROUP = {
    code: bucket
    for bucket, members in LANGUAGE_GROUPS.items()
    for code in members
}
|
|
|
|
def canonical_lang(lang: str) -> str:
    """Return the canonical code that ``LANG_ALIASES`` records for ``lang``.

    A code with no entry in ``LANG_ALIASES`` is returned unchanged.
    """
    if lang in LANG_ALIASES:
        return LANG_ALIASES[lang]
    return lang
|
|
|
|
def canonical_lang_family(lang: str) -> str:
    """Return the family code used for ``lang``.

    This currently delegates directly to ``canonical_lang``, so the family
    of a code is simply its canonical form.
    """
    family = canonical_lang(lang)
    return family
|
|