DerivedFunction1 committed on
Commit
0419b40
·
1 Parent(s): de532b2
Files changed (4) hide show
  1. all_langs.json +0 -104
  2. language.py +36 -70
  3. language_aliases.json +104 -0
  4. tatoeba.py +2 -11
all_langs.json DELETED
@@ -1,104 +0,0 @@
1
- {
2
- "en": "eng",
3
- "es": "spa",
4
- "fr": "fra",
5
- "de": "deu",
6
- "it": "ita",
7
- "pt": "por",
8
- "nl": "nld",
9
- "vi": "vie",
10
- "tr": "tur",
11
- "la": "lat",
12
- "id": "ind",
13
- "ms": "msa",
14
- "af": "afr",
15
- "sq": "sqi",
16
- "is": "isl",
17
- "no": "nor",
18
- "sv": "swe",
19
- "da": "dan",
20
- "fi": "fin",
21
- "hu": "hun",
22
- "pl": "pol",
23
- "cs": "ces",
24
- "ro": "ron",
25
- "ru": "rus",
26
- "bg": "bul",
27
- "uk": "ukr",
28
- "sr": "srp",
29
- "be": "bel",
30
- "kk": "kaz",
31
- "mk": "mkd",
32
- "mn": "mon",
33
- "zh": "zho",
34
- "ja": "jpn",
35
- "ko": "kor",
36
- "hi": "hin",
37
- "ur": "urd",
38
- "bn": "ben",
39
- "ta": "tam",
40
- "te": "tel",
41
- "mr": "mar",
42
- "gu": "guj",
43
- "kn": "kan",
44
- "ml": "mal",
45
- "pa": "pan",
46
- "as": "asm",
47
- "or": "ori",
48
- "ar": "ara",
49
- "fa": "fas",
50
- "ps": "pus",
51
- "sd": "snd",
52
- "ug": "uig",
53
- "el": "ell",
54
- "he": "heb",
55
- "hy": "hye",
56
- "ka": "kat",
57
- "am": "amh",
58
- "km": "khm",
59
- "lo": "lao",
60
- "my": "mya",
61
- "th": "tha",
62
- "si": "sin",
63
- "bo": "bod",
64
- "dv": "div",
65
- "ti": "tir",
66
- "sw": "swa",
67
- "eu": "eus",
68
- "tl": "tgl",
69
- "ca": "cat",
70
- "gl": "glg",
71
- "oc": "oci",
72
- "br": "bre",
73
- "ga": "gle",
74
- "gd": "gla",
75
- "cy": "cym",
76
- "bs": "bos",
77
- "hr": "hrv",
78
- "sl": "slv",
79
- "sk": "slk",
80
- "et": "est",
81
- "lv": "lav",
82
- "lt": "lit",
83
- "eo": "epo",
84
- "jv": "jav",
85
- "mg": "mlg",
86
- "om": "orm",
87
- "so": "som",
88
- "su": "sun",
89
- "uz": "uzb",
90
- "ku": "kur",
91
- "ckb": "ckb",
92
- "ne": "nep",
93
- "mt": "mlt",
94
- "lb": "ltz",
95
- "rm": "roh",
96
- "tt": "tat",
97
- "ky": "kir",
98
- "tg": "tgk",
99
- "ba": "bak",
100
- "yo": "yor",
101
- "zu": "zul",
102
- "ny": "nya",
103
- "ce": "che"
104
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
language.py CHANGED
@@ -4,63 +4,50 @@ import json
4
  import os
5
  from pathlib import Path
6
 
 
 
7
  from source_config import LANGUAGE_BUCKETS
8
 
9
  LANGUAGE_GROUPS = {group: cfg["langs"] for group, cfg in LANGUAGE_BUCKETS.items()}
10
  LANGUAGE_GROUP_WEIGHTS = {group: float(cfg["weight"]) for group, cfg in LANGUAGE_BUCKETS.items()}
11
  LANGUAGE_GROUP_MIN_CHARS = {group: int(cfg["min_chars"]) for group, cfg in LANGUAGE_BUCKETS.items()}
12
  LATIN_GROUPS = {group for group, cfg in LANGUAGE_BUCKETS.items() if cfg.get("latin")}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  LANG_ALIASES = {
14
- "nn": "no",
15
- "nb": "no",
16
- "nob": "no",
17
- "nno": "no",
18
- "nor": "no",
19
- "pes": "fa",
20
- "fas": "fa",
21
- "prs": "fa",
22
- "arb": "ar",
23
- "acm": "ar",
24
- "apc": "ar",
25
- "ary": "ar",
26
- "arz": "ar",
27
- "arq": "ar",
28
- "aeb": "ar",
29
- "ajp": "ar",
30
- "azb": "az",
31
- "azj": "az",
32
- "ayr": "ay",
33
- "npi": "ne",
34
- "ory": "or",
35
- "pbt": "ps",
36
- "swh": "sw",
37
- "zsm": "ms",
38
- "yue": "zh",
39
- "wuu": "zh",
40
- "wu": "zh",
41
- "nan": "zh",
42
- "hak": "zh",
43
- "gan": "zh",
44
- "cdo": "zh",
45
- "hsn": "zh",
46
- "cjy": "zh",
47
- "mnp": "zh",
48
- "cpx": "zh",
49
- "czh": "zh",
50
- "lzh": "zh",
51
- "hbo": "he",
52
- "grc": "el",
53
  }
54
-
55
- LANGS_JSON = Path(__file__).with_name("all_langs.json")
56
-
57
- # Read from file to load the key-value pairs. JSON key order is canonical.
58
- LANG_ISO2_TO_ISO3 = {}
59
- with open(LANGS_JSON) as f:
60
- # Parse the file as a json string
61
- LANG_ISO2_TO_ISO3 = json.load(f)
62
-
63
- ALL_LANGS = list(LANG_ISO2_TO_ISO3.keys())
64
  LANG_TO_GROUP = {lang: group for group, langs in LANGUAGE_GROUPS.items() for lang in langs}
65
 
66
 
@@ -71,24 +58,3 @@ def canonical_lang(lang: str) -> str:
71
  def canonical_lang_family(lang: str) -> str:
72
  """Collapse closely related variants to the app's display/training families."""
73
  return canonical_lang(lang)
74
-
75
-
76
- def write_all_langs_json(path: str | os.PathLike[str] = LANGS_JSON) -> None:
77
- """Write the canonical ALL_LANGS list to JSON if it is missing."""
78
- path = Path(path)
79
- if path.exists():
80
- return
81
- with path.open("w", encoding="utf-8") as f:
82
- json.dump(ALL_LANGS, f, ensure_ascii=False, indent=2)
83
-
84
-
85
- def load_all_langs(path: str | os.PathLike[str] = LANGS_JSON) -> list[str]:
86
- """Load ALL_LANGS from JSON, falling back to the in-repo constant."""
87
- path = Path(path)
88
- if path.exists():
89
- with path.open(encoding="utf-8") as f:
90
- langs = json.load(f)
91
- if isinstance(langs, list) and all(isinstance(lang, str) for lang in langs):
92
- return langs
93
- write_all_langs_json(path)
94
- return ALL_LANGS[:]
 
4
  import os
5
  from pathlib import Path
6
 
7
+ import pycountry
8
+
9
  from source_config import LANGUAGE_BUCKETS
10
 
11
  LANGUAGE_GROUPS = {group: cfg["langs"] for group, cfg in LANGUAGE_BUCKETS.items()}
12
  LANGUAGE_GROUP_WEIGHTS = {group: float(cfg["weight"]) for group, cfg in LANGUAGE_BUCKETS.items()}
13
  LANGUAGE_GROUP_MIN_CHARS = {group: int(cfg["min_chars"]) for group, cfg in LANGUAGE_BUCKETS.items()}
14
  LATIN_GROUPS = {group for group, cfg in LANGUAGE_BUCKETS.items() if cfg.get("latin")}
15
+ LANGUAGE_ALIASES_JSON = Path(__file__).with_name("language_aliases.json")
16
+
17
+
18
+ def _load_json_mapping(path: Path) -> dict[str, object]:
19
+ with path.open(encoding="utf-8") as f:
20
+ data = json.load(f)
21
+ if not isinstance(data, dict):
22
+ raise ValueError(f"Expected JSON object in {path}")
23
+ return data
24
+
25
+
26
+ def _load_language_aliases(path: Path = LANGUAGE_ALIASES_JSON) -> dict[str, tuple[str, ...]]:
27
+ aliases = _load_json_mapping(path)
28
+ normalized: dict[str, tuple[str, ...]] = {}
29
+ for canonical, source_aliases in aliases.items():
30
+ if not isinstance(canonical, str) or not isinstance(source_aliases, list):
31
+ continue
32
+ normalized[canonical] = tuple(alias for alias in source_aliases if isinstance(alias, str) and alias)
33
+ return normalized
34
+
35
+
36
+ LANGUAGE_ALIASES = _load_language_aliases()
37
+ ALL_LANGS = list(LANGUAGE_ALIASES.keys())
38
+ LANG_ISO2_TO_ISO3 = {
39
+ lang: (
40
+ getattr(pycountry.languages.get(alpha_2=lang) or pycountry.languages.get(alpha_3=lang), "alpha_3", None)
41
+ or lang
42
+ )
43
+ for lang in ALL_LANGS
44
+ }
45
  LANG_ALIASES = {
46
+ alias: canonical
47
+ for canonical, aliases in LANGUAGE_ALIASES.items()
48
+ for alias in aliases
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  }
50
+ LANG_SOURCE_CODES = {canonical: tuple(aliases) for canonical, aliases in LANGUAGE_ALIASES.items()}
 
 
 
 
 
 
 
 
 
51
  LANG_TO_GROUP = {lang: group for group, langs in LANGUAGE_GROUPS.items() for lang in langs}
52
 
53
 
 
58
  def canonical_lang_family(lang: str) -> str:
59
  """Collapse closely related variants to the app's display/training families."""
60
  return canonical_lang(lang)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
language_aliases.json ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "en": ["eng"],
3
+ "es": ["spa"],
4
+ "fr": ["fra", "fre"],
5
+ "de": ["deu", "ger"],
6
+ "it": ["ita"],
7
+ "pt": ["por"],
8
+ "nl": ["nld", "dut"],
9
+ "vi": ["vie"],
10
+ "tr": ["tur"],
11
+ "la": ["lat"],
12
+ "id": ["ind"],
13
+ "ms": ["msa", "may", "zlm", "zsm"],
14
+ "af": ["afr"],
15
+ "sq": ["alb", "sqi"],
16
+ "is": ["ice", "isl"],
17
+ "no": ["nb", "nn", "nor", "nob", "nno"],
18
+ "sv": ["swe"],
19
+ "da": ["dan"],
20
+ "fi": ["fin"],
21
+ "hu": ["hun"],
22
+ "pl": ["pol"],
23
+ "cs": ["ces", "cze"],
24
+ "ro": ["ron", "rum"],
25
+ "ru": ["rus"],
26
+ "bg": ["bul"],
27
+ "uk": ["ukr"],
28
+ "sr": ["srp"],
29
+ "be": ["bel"],
30
+ "kk": ["kaz"],
31
+ "mk": ["mac", "mkd"],
32
+ "mn": ["mon", "khk"],
33
+ "zh": ["chi", "zho", "cmn", "yue", "wuu", "nan"],
34
+ "ja": ["jpn"],
35
+ "ko": ["kor"],
36
+ "hi": ["hin"],
37
+ "ur": ["urd"],
38
+ "bn": ["ben"],
39
+ "ta": ["tam"],
40
+ "te": ["tel"],
41
+ "mr": ["mar"],
42
+ "gu": ["guj"],
43
+ "kn": ["kan"],
44
+ "ml": ["mal"],
45
+ "pa": ["pan"],
46
+ "as": ["asm"],
47
+ "or": ["ori", "ory"],
48
+ "ar": ["arb", "ara"],
49
+ "fa": ["fas", "per", "pes", "prs"],
50
+ "ps": ["pus", "pbt"],
51
+ "sd": ["snd"],
52
+ "ug": ["uig"],
53
+ "el": ["ell", "gre", "grc"],
54
+ "he": ["heb", "iw", "hbo"],
55
+ "hy": ["hye", "hyw"],
56
+ "ka": ["kat"],
57
+ "am": ["amh"],
58
+ "km": ["khm"],
59
+ "lo": ["lao"],
60
+ "my": ["mya"],
61
+ "th": ["tha"],
62
+ "si": ["sin"],
63
+ "bo": ["bod"],
64
+ "dv": ["div"],
65
+ "ti": ["tir"],
66
+ "sw": ["swa", "swh"],
67
+ "eu": ["eus"],
68
+ "tl": ["fil", "tgl"],
69
+ "ca": ["cat"],
70
+ "gl": ["glg"],
71
+ "oc": ["oci"],
72
+ "br": ["bre"],
73
+ "ga": ["gle"],
74
+ "gd": ["gla"],
75
+ "cy": ["cym"],
76
+ "bs": ["bos"],
77
+ "hr": ["hrv"],
78
+ "sl": ["slv"],
79
+ "sk": ["slk"],
80
+ "et": ["est"],
81
+ "lv": ["lav", "lvs"],
82
+ "lt": ["lit"],
83
+ "eo": ["epo"],
84
+ "jv": ["jav"],
85
+ "mg": ["mlg", "plt"],
86
+ "om": ["orm"],
87
+ "so": ["som"],
88
+ "su": ["sun"],
89
+ "uz": ["uzn", "uzb"],
90
+ "ku": ["kur", "kmr"],
91
+ "ckb": [],
92
+ "ne": ["npi"],
93
+ "mt": ["mlt"],
94
+ "lb": ["ltz"],
95
+ "rm": ["roh"],
96
+ "tt": ["tat"],
97
+ "ky": ["kir"],
98
+ "tg": ["tgk"],
99
+ "ba": ["bak"],
100
+ "yo": ["yor"],
101
+ "zu": ["zul"],
102
+ "ny": ["nya"],
103
+ "ce": ["che"]
104
+ }
tatoeba.py CHANGED
@@ -14,15 +14,6 @@ from language import ALL_LANGS, LANG_ISO2_TO_ISO3, canonical_lang
14
  TATOEBA_CACHE_DIR = Path(__file__).with_name("data") / "tatoeba"
15
  TATOEBA_PARQUET_PATH = TATOEBA_CACHE_DIR / "tatoeba_text.parquet"
16
 
17
- DEFAULT_LANGUAGE_REMAPS = {
18
- "cmn": "zh",
19
- "yue": "zh",
20
- "wuu": "zh",
21
- "nan": "zh",
22
- "nob": "no",
23
- "nno": "no",
24
- }
25
-
26
 
27
  def _normalize_text_key(text: str) -> str:
28
  normalized = unicodedata.normalize("NFKC", text)
@@ -34,10 +25,10 @@ def _normalize_lang(code: str) -> str | None:
34
  code = (code or "").strip()
35
  if not code:
36
  return None
37
- code = DEFAULT_LANGUAGE_REMAPS.get(code, code)
38
  if code in ALL_LANGS:
39
  return code
40
- return canonical_lang(code)
41
 
42
 
43
  def _coerce_source_lang(lang_code: str) -> tuple[str, str]:
 
14
  TATOEBA_CACHE_DIR = Path(__file__).with_name("data") / "tatoeba"
15
  TATOEBA_PARQUET_PATH = TATOEBA_CACHE_DIR / "tatoeba_text.parquet"
16
 
 
 
 
 
 
 
 
 
 
17
 
18
  def _normalize_text_key(text: str) -> str:
19
  normalized = unicodedata.normalize("NFKC", text)
 
25
  code = (code or "").strip()
26
  if not code:
27
  return None
28
+ code = canonical_lang(code)
29
  if code in ALL_LANGS:
30
  return code
31
+ return None
32
 
33
 
34
  def _coerce_source_lang(lang_code: str) -> tuple[str, str]: