DerivedFunction's picture
update
0419b40
from __future__ import annotations
import json
import os
from pathlib import Path
import pycountry
from source_config import LANGUAGE_BUCKETS
# Per-group lookup tables, each projecting one field out of LANGUAGE_BUCKETS.
LANGUAGE_GROUPS = {
    name: bucket["langs"] for name, bucket in LANGUAGE_BUCKETS.items()
}
LANGUAGE_GROUP_WEIGHTS = {
    name: float(bucket["weight"]) for name, bucket in LANGUAGE_BUCKETS.items()
}
LANGUAGE_GROUP_MIN_CHARS = {
    name: int(bucket["min_chars"]) for name, bucket in LANGUAGE_BUCKETS.items()
}
# Names of the groups whose bucket config has a truthy "latin" entry.
LATIN_GROUPS = {
    name for name, bucket in LANGUAGE_BUCKETS.items() if bucket.get("latin")
}
# The alias table lives in a JSON file stored next to this module.
LANGUAGE_ALIASES_JSON = Path(__file__).with_name("language_aliases.json")
def _load_json_mapping(path: Path) -> dict[str, object]:
    """Read *path* as UTF-8 encoded JSON and return its top-level object.

    Args:
        path: Location of the JSON document to parse.

    Returns:
        The decoded top-level JSON object as a ``dict``.

    Raises:
        ValueError: If the top-level JSON value is not an object.
    """
    payload = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(payload, dict):
        return payload
    raise ValueError(f"Expected JSON object in {path}")
def _load_language_aliases(path: Path = LANGUAGE_ALIASES_JSON) -> dict[str, tuple[str, ...]]:
    """Load the alias table from *path* and normalize it.

    Entries whose key is not a string or whose value is not a list are
    skipped. Within each kept list, only non-empty string aliases survive.

    Args:
        path: JSON file to read; defaults to ``LANGUAGE_ALIASES_JSON``.

    Returns:
        A mapping from each kept key to a tuple of its string aliases.
    """
    raw_table = _load_json_mapping(path)
    return {
        code: tuple(name for name in names if isinstance(name, str) and name)
        for code, names in raw_table.items()
        if isinstance(code, str) and isinstance(names, list)
    }
# Canonical code -> tuple of aliases, loaded from the JSON file at import time.
LANGUAGE_ALIASES = _load_language_aliases()
ALL_LANGS = list(LANGUAGE_ALIASES)

# Canonical code -> pycountry ``alpha_3`` value. The code is tried as an
# alpha_2 lookup first, then as alpha_3; when neither lookup yields a truthy
# ``alpha_3`` attribute the code maps to itself.
LANG_ISO2_TO_ISO3 = {
    code: (
        getattr(
            pycountry.languages.get(alpha_2=code) or pycountry.languages.get(alpha_3=code),
            "alpha_3",
            None,
        )
        or code
    )
    for code in ALL_LANGS
}

# Reverse lookup: alias -> canonical code. If an alias is listed under
# several canonical codes, the one iterated last wins.
LANG_ALIASES = {
    alt: code
    for code, alts in LANGUAGE_ALIASES.items()
    for alt in alts
}

# Canonical code -> tuple of its aliases.
LANG_SOURCE_CODES = {
    code: tuple(alts) for code, alts in LANGUAGE_ALIASES.items()
}

# Language -> name of the group that lists it in LANGUAGE_GROUPS.
LANG_TO_GROUP = {
    member: name
    for name, members in LANGUAGE_GROUPS.items()
    for member in members
}
def canonical_lang(lang: str) -> str:
    """Return the canonical code registered for *lang* in ``LANG_ALIASES``.

    When *lang* is not a known alias it is returned unchanged.
    """
    try:
        return LANG_ALIASES[lang]
    except KeyError:
        return lang
def canonical_lang_family(lang: str) -> str:
    """Map *lang* to the family code used for display/training.

    At present this simply delegates to ``canonical_lang``, so the result
    is whatever that function returns for *lang*.
    """
    family = canonical_lang(lang)
    return family