DerivedFunction1 committed on
Commit
0419b40
·
1 Parent(s): de532b2
Files changed (4) hide show
  1. all_langs.json +0 -104
  2. language.py +36 -70
  3. language_aliases.json +104 -0
  4. tatoeba.py +2 -11
all_langs.json DELETED
@@ -1,104 +0,0 @@
1
- {
2
- "en": "eng",
3
- "es": "spa",
4
- "fr": "fra",
5
- "de": "deu",
6
- "it": "ita",
7
- "pt": "por",
8
- "nl": "nld",
9
- "vi": "vie",
10
- "tr": "tur",
11
- "la": "lat",
12
- "id": "ind",
13
- "ms": "msa",
14
- "af": "afr",
15
- "sq": "sqi",
16
- "is": "isl",
17
- "no": "nor",
18
- "sv": "swe",
19
- "da": "dan",
20
- "fi": "fin",
21
- "hu": "hun",
22
- "pl": "pol",
23
- "cs": "ces",
24
- "ro": "ron",
25
- "ru": "rus",
26
- "bg": "bul",
27
- "uk": "ukr",
28
- "sr": "srp",
29
- "be": "bel",
30
- "kk": "kaz",
31
- "mk": "mkd",
32
- "mn": "mon",
33
- "zh": "zho",
34
- "ja": "jpn",
35
- "ko": "kor",
36
- "hi": "hin",
37
- "ur": "urd",
38
- "bn": "ben",
39
- "ta": "tam",
40
- "te": "tel",
41
- "mr": "mar",
42
- "gu": "guj",
43
- "kn": "kan",
44
- "ml": "mal",
45
- "pa": "pan",
46
- "as": "asm",
47
- "or": "ori",
48
- "ar": "ara",
49
- "fa": "fas",
50
- "ps": "pus",
51
- "sd": "snd",
52
- "ug": "uig",
53
- "el": "ell",
54
- "he": "heb",
55
- "hy": "hye",
56
- "ka": "kat",
57
- "am": "amh",
58
- "km": "khm",
59
- "lo": "lao",
60
- "my": "mya",
61
- "th": "tha",
62
- "si": "sin",
63
- "bo": "bod",
64
- "dv": "div",
65
- "ti": "tir",
66
- "sw": "swa",
67
- "eu": "eus",
68
- "tl": "tgl",
69
- "ca": "cat",
70
- "gl": "glg",
71
- "oc": "oci",
72
- "br": "bre",
73
- "ga": "gle",
74
- "gd": "gla",
75
- "cy": "cym",
76
- "bs": "bos",
77
- "hr": "hrv",
78
- "sl": "slv",
79
- "sk": "slk",
80
- "et": "est",
81
- "lv": "lav",
82
- "lt": "lit",
83
- "eo": "epo",
84
- "jv": "jav",
85
- "mg": "mlg",
86
- "om": "orm",
87
- "so": "som",
88
- "su": "sun",
89
- "uz": "uzb",
90
- "ku": "kur",
91
- "ckb": "ckb",
92
- "ne": "nep",
93
- "mt": "mlt",
94
- "lb": "ltz",
95
- "rm": "roh",
96
- "tt": "tat",
97
- "ky": "kir",
98
- "tg": "tgk",
99
- "ba": "bak",
100
- "yo": "yor",
101
- "zu": "zul",
102
- "ny": "nya",
103
- "ce": "che"
104
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
language.py CHANGED
@@ -4,63 +4,50 @@ import json
4
  import os
5
  from pathlib import Path
6
 
 
 
7
  from source_config import LANGUAGE_BUCKETS
8
 
9
  LANGUAGE_GROUPS = {group: cfg["langs"] for group, cfg in LANGUAGE_BUCKETS.items()}
10
  LANGUAGE_GROUP_WEIGHTS = {group: float(cfg["weight"]) for group, cfg in LANGUAGE_BUCKETS.items()}
11
  LANGUAGE_GROUP_MIN_CHARS = {group: int(cfg["min_chars"]) for group, cfg in LANGUAGE_BUCKETS.items()}
12
  LATIN_GROUPS = {group for group, cfg in LANGUAGE_BUCKETS.items() if cfg.get("latin")}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  LANG_ALIASES = {
14
- "nn": "no",
15
- "nb": "no",
16
- "nob": "no",
17
- "nno": "no",
18
- "nor": "no",
19
- "pes": "fa",
20
- "fas": "fa",
21
- "prs": "fa",
22
- "arb": "ar",
23
- "acm": "ar",
24
- "apc": "ar",
25
- "ary": "ar",
26
- "arz": "ar",
27
- "arq": "ar",
28
- "aeb": "ar",
29
- "ajp": "ar",
30
- "azb": "az",
31
- "azj": "az",
32
- "ayr": "ay",
33
- "npi": "ne",
34
- "ory": "or",
35
- "pbt": "ps",
36
- "swh": "sw",
37
- "zsm": "ms",
38
- "yue": "zh",
39
- "wuu": "zh",
40
- "wu": "zh",
41
- "nan": "zh",
42
- "hak": "zh",
43
- "gan": "zh",
44
- "cdo": "zh",
45
- "hsn": "zh",
46
- "cjy": "zh",
47
- "mnp": "zh",
48
- "cpx": "zh",
49
- "czh": "zh",
50
- "lzh": "zh",
51
- "hbo": "he",
52
- "grc": "el",
53
  }
54
-
55
- LANGS_JSON = Path(__file__).with_name("all_langs.json")
56
-
57
- # Read from file to load the key-value pairs. JSON key order is canonical.
58
- LANG_ISO2_TO_ISO3 = {}
59
- with open(LANGS_JSON) as f:
60
- # Parse the file as a json string
61
- LANG_ISO2_TO_ISO3 = json.load(f)
62
-
63
- ALL_LANGS = list(LANG_ISO2_TO_ISO3.keys())
64
  LANG_TO_GROUP = {lang: group for group, langs in LANGUAGE_GROUPS.items() for lang in langs}
65
 
66
 
@@ -71,24 +58,3 @@ def canonical_lang(lang: str) -> str:
71
  def canonical_lang_family(lang: str) -> str:
72
  """Collapse closely related variants to the app's display/training families."""
73
  return canonical_lang(lang)
74
-
75
-
76
- def write_all_langs_json(path: str | os.PathLike[str] = LANGS_JSON) -> None:
77
- """Write the canonical ALL_LANGS list to JSON if it is missing."""
78
- path = Path(path)
79
- if path.exists():
80
- return
81
- with path.open("w", encoding="utf-8") as f:
82
- json.dump(ALL_LANGS, f, ensure_ascii=False, indent=2)
83
-
84
-
85
- def load_all_langs(path: str | os.PathLike[str] = LANGS_JSON) -> list[str]:
86
- """Load ALL_LANGS from JSON, falling back to the in-repo constant."""
87
- path = Path(path)
88
- if path.exists():
89
- with path.open(encoding="utf-8") as f:
90
- langs = json.load(f)
91
- if isinstance(langs, list) and all(isinstance(lang, str) for lang in langs):
92
- return langs
93
- write_all_langs_json(path)
94
- return ALL_LANGS[:]
 
4
  import os
5
  from pathlib import Path
6
 
7
+ import pycountry
8
+
9
  from source_config import LANGUAGE_BUCKETS
10
 
11
  LANGUAGE_GROUPS = {group: cfg["langs"] for group, cfg in LANGUAGE_BUCKETS.items()}
12
  LANGUAGE_GROUP_WEIGHTS = {group: float(cfg["weight"]) for group, cfg in LANGUAGE_BUCKETS.items()}
13
  LANGUAGE_GROUP_MIN_CHARS = {group: int(cfg["min_chars"]) for group, cfg in LANGUAGE_BUCKETS.items()}
14
  LATIN_GROUPS = {group for group, cfg in LANGUAGE_BUCKETS.items() if cfg.get("latin")}
15
+ LANGUAGE_ALIASES_JSON = Path(__file__).with_name("language_aliases.json")
16
+
17
+
18
+ def _load_json_mapping(path: Path) -> dict[str, object]:
19
+ with path.open(encoding="utf-8") as f:
20
+ data = json.load(f)
21
+ if not isinstance(data, dict):
22
+ raise ValueError(f"Expected JSON object in {path}")
23
+ return data
24
+
25
+
26
+ def _load_language_aliases(path: Path = LANGUAGE_ALIASES_JSON) -> dict[str, tuple[str, ...]]:
27
+ aliases = _load_json_mapping(path)
28
+ normalized: dict[str, tuple[str, ...]] = {}
29
+ for canonical, source_aliases in aliases.items():
30
+ if not isinstance(canonical, str) or not isinstance(source_aliases, list):
31
+ continue
32
+ normalized[canonical] = tuple(alias for alias in source_aliases if isinstance(alias, str) and alias)
33
+ return normalized
34
+
35
+
36
+ LANGUAGE_ALIASES = _load_language_aliases()
37
+ ALL_LANGS = list(LANGUAGE_ALIASES.keys())
38
+ LANG_ISO2_TO_ISO3 = {
39
+ lang: (
40
+ getattr(pycountry.languages.get(alpha_2=lang) or pycountry.languages.get(alpha_3=lang), "alpha_3", None)
41
+ or lang
42
+ )
43
+ for lang in ALL_LANGS
44
+ }
45
  LANG_ALIASES = {
46
+ alias: canonical
47
+ for canonical, aliases in LANGUAGE_ALIASES.items()
48
+ for alias in aliases
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  }
50
+ LANG_SOURCE_CODES = {canonical: tuple(aliases) for canonical, aliases in LANGUAGE_ALIASES.items()}
 
 
 
 
 
 
 
 
 
51
  LANG_TO_GROUP = {lang: group for group, langs in LANGUAGE_GROUPS.items() for lang in langs}
52
 
53
 
 
58
  def canonical_lang_family(lang: str) -> str:
59
  """Collapse closely related variants to the app's display/training families."""
60
  return canonical_lang(lang)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
language_aliases.json ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "en": ["eng"],
3
+ "es": ["spa"],
4
+ "fr": ["fra", "fre"],
5
+ "de": ["deu", "ger"],
6
+ "it": ["ita"],
7
+ "pt": ["por"],
8
+ "nl": ["nld", "dut"],
9
+ "vi": ["vie"],
10
+ "tr": ["tur"],
11
+ "la": ["lat"],
12
+ "id": ["ind"],
13
+ "ms": ["msa", "may", "zlm", "zsm"],
14
+ "af": ["afr"],
15
+ "sq": ["alb", "sqi"],
16
+ "is": ["ice", "isl"],
17
+ "no": ["nb", "nn", "nor", "nob", "nno"],
18
+ "sv": ["swe"],
19
+ "da": ["dan"],
20
+ "fi": ["fin"],
21
+ "hu": ["hun"],
22
+ "pl": ["pol"],
23
+ "cs": ["ces", "cze"],
24
+ "ro": ["ron", "rum"],
25
+ "ru": ["rus"],
26
+ "bg": ["bul"],
27
+ "uk": ["ukr"],
28
+ "sr": ["srp"],
29
+ "be": ["bel"],
30
+ "kk": ["kaz"],
31
+ "mk": ["mac", "mkd"],
32
+ "mn": ["mon", "khk"],
33
+ "zh": ["chi", "zho", "cmn", "yue", "wuu", "nan"],
34
+ "ja": ["jpn"],
35
+ "ko": ["kor"],
36
+ "hi": ["hin"],
37
+ "ur": ["urd"],
38
+ "bn": ["ben"],
39
+ "ta": ["tam"],
40
+ "te": ["tel"],
41
+ "mr": ["mar"],
42
+ "gu": ["guj"],
43
+ "kn": ["kan"],
44
+ "ml": ["mal"],
45
+ "pa": ["pan"],
46
+ "as": ["asm"],
47
+ "or": ["ori", "ory"],
48
+ "ar": ["arb", "ara"],
49
+ "fa": ["fas", "per", "pes", "prs"],
50
+ "ps": ["pus", "pbt"],
51
+ "sd": ["snd"],
52
+ "ug": ["uig"],
53
+ "el": ["ell", "gre", "grc"],
54
+ "he": ["heb", "iw", "hbo"],
55
+ "hy": ["hye", "hyw"],
56
+ "ka": ["kat"],
57
+ "am": ["amh"],
58
+ "km": ["khm"],
59
+ "lo": ["lao"],
60
+ "my": ["mya"],
61
+ "th": ["tha"],
62
+ "si": ["sin"],
63
+ "bo": ["bod"],
64
+ "dv": ["div"],
65
+ "ti": ["tir"],
66
+ "sw": ["swa", "swh"],
67
+ "eu": ["eus"],
68
+ "tl": ["fil", "tgl"],
69
+ "ca": ["cat"],
70
+ "gl": ["glg"],
71
+ "oc": ["oci"],
72
+ "br": ["bre"],
73
+ "ga": ["gle"],
74
+ "gd": ["gla"],
75
+ "cy": ["cym"],
76
+ "bs": ["bos"],
77
+ "hr": ["hrv"],
78
+ "sl": ["slv"],
79
+ "sk": ["slk"],
80
+ "et": ["est"],
81
+ "lv": ["lav", "lvs"],
82
+ "lt": ["lit"],
83
+ "eo": ["epo"],
84
+ "jv": ["jav"],
85
+ "mg": ["mlg", "plt"],
86
+ "om": ["orm"],
87
+ "so": ["som"],
88
+ "su": ["sun"],
89
+ "uz": ["uzn", "uzb"],
90
+ "ku": ["kur", "kmr"],
91
+ "ckb": [],
92
+ "ne": ["npi"],
93
+ "mt": ["mlt"],
94
+ "lb": ["ltz"],
95
+ "rm": ["roh"],
96
+ "tt": ["tat"],
97
+ "ky": ["kir"],
98
+ "tg": ["tgk"],
99
+ "ba": ["bak"],
100
+ "yo": ["yor"],
101
+ "zu": ["zul"],
102
+ "ny": ["nya"],
103
+ "ce": ["che"]
104
+ }
tatoeba.py CHANGED
@@ -14,15 +14,6 @@ from language import ALL_LANGS, LANG_ISO2_TO_ISO3, canonical_lang
14
  TATOEBA_CACHE_DIR = Path(__file__).with_name("data") / "tatoeba"
15
  TATOEBA_PARQUET_PATH = TATOEBA_CACHE_DIR / "tatoeba_text.parquet"
16
 
17
- DEFAULT_LANGUAGE_REMAPS = {
18
- "cmn": "zh",
19
- "yue": "zh",
20
- "wuu": "zh",
21
- "nan": "zh",
22
- "nob": "no",
23
- "nno": "no",
24
- }
25
-
26
 
27
  def _normalize_text_key(text: str) -> str:
28
  normalized = unicodedata.normalize("NFKC", text)
@@ -34,10 +25,10 @@ def _normalize_lang(code: str) -> str | None:
34
  code = (code or "").strip()
35
  if not code:
36
  return None
37
- code = DEFAULT_LANGUAGE_REMAPS.get(code, code)
38
  if code in ALL_LANGS:
39
  return code
40
- return canonical_lang(code)
41
 
42
 
43
  def _coerce_source_lang(lang_code: str) -> tuple[str, str]:
 
14
  TATOEBA_CACHE_DIR = Path(__file__).with_name("data") / "tatoeba"
15
  TATOEBA_PARQUET_PATH = TATOEBA_CACHE_DIR / "tatoeba_text.parquet"
16
 
 
 
 
 
 
 
 
 
 
17
 
18
  def _normalize_text_key(text: str) -> str:
19
  normalized = unicodedata.normalize("NFKC", text)
 
25
  code = (code or "").strip()
26
  if not code:
27
  return None
28
+ code = canonical_lang(code)
29
  if code in ALL_LANGS:
30
  return code
31
+ return None
32
 
33
 
34
  def _coerce_source_lang(lang_code: str) -> tuple[str, str]: