Commit ·
0419b40
1
Parent(s): de532b2
update
Browse files- all_langs.json +0 -104
- language.py +36 -70
- language_aliases.json +104 -0
- tatoeba.py +2 -11
all_langs.json
DELETED
|
@@ -1,104 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"en": "eng",
|
| 3 |
-
"es": "spa",
|
| 4 |
-
"fr": "fra",
|
| 5 |
-
"de": "deu",
|
| 6 |
-
"it": "ita",
|
| 7 |
-
"pt": "por",
|
| 8 |
-
"nl": "nld",
|
| 9 |
-
"vi": "vie",
|
| 10 |
-
"tr": "tur",
|
| 11 |
-
"la": "lat",
|
| 12 |
-
"id": "ind",
|
| 13 |
-
"ms": "msa",
|
| 14 |
-
"af": "afr",
|
| 15 |
-
"sq": "sqi",
|
| 16 |
-
"is": "isl",
|
| 17 |
-
"no": "nor",
|
| 18 |
-
"sv": "swe",
|
| 19 |
-
"da": "dan",
|
| 20 |
-
"fi": "fin",
|
| 21 |
-
"hu": "hun",
|
| 22 |
-
"pl": "pol",
|
| 23 |
-
"cs": "ces",
|
| 24 |
-
"ro": "ron",
|
| 25 |
-
"ru": "rus",
|
| 26 |
-
"bg": "bul",
|
| 27 |
-
"uk": "ukr",
|
| 28 |
-
"sr": "srp",
|
| 29 |
-
"be": "bel",
|
| 30 |
-
"kk": "kaz",
|
| 31 |
-
"mk": "mkd",
|
| 32 |
-
"mn": "mon",
|
| 33 |
-
"zh": "zho",
|
| 34 |
-
"ja": "jpn",
|
| 35 |
-
"ko": "kor",
|
| 36 |
-
"hi": "hin",
|
| 37 |
-
"ur": "urd",
|
| 38 |
-
"bn": "ben",
|
| 39 |
-
"ta": "tam",
|
| 40 |
-
"te": "tel",
|
| 41 |
-
"mr": "mar",
|
| 42 |
-
"gu": "guj",
|
| 43 |
-
"kn": "kan",
|
| 44 |
-
"ml": "mal",
|
| 45 |
-
"pa": "pan",
|
| 46 |
-
"as": "asm",
|
| 47 |
-
"or": "ori",
|
| 48 |
-
"ar": "ara",
|
| 49 |
-
"fa": "fas",
|
| 50 |
-
"ps": "pus",
|
| 51 |
-
"sd": "snd",
|
| 52 |
-
"ug": "uig",
|
| 53 |
-
"el": "ell",
|
| 54 |
-
"he": "heb",
|
| 55 |
-
"hy": "hye",
|
| 56 |
-
"ka": "kat",
|
| 57 |
-
"am": "amh",
|
| 58 |
-
"km": "khm",
|
| 59 |
-
"lo": "lao",
|
| 60 |
-
"my": "mya",
|
| 61 |
-
"th": "tha",
|
| 62 |
-
"si": "sin",
|
| 63 |
-
"bo": "bod",
|
| 64 |
-
"dv": "div",
|
| 65 |
-
"ti": "tir",
|
| 66 |
-
"sw": "swa",
|
| 67 |
-
"eu": "eus",
|
| 68 |
-
"tl": "tgl",
|
| 69 |
-
"ca": "cat",
|
| 70 |
-
"gl": "glg",
|
| 71 |
-
"oc": "oci",
|
| 72 |
-
"br": "bre",
|
| 73 |
-
"ga": "gle",
|
| 74 |
-
"gd": "gla",
|
| 75 |
-
"cy": "cym",
|
| 76 |
-
"bs": "bos",
|
| 77 |
-
"hr": "hrv",
|
| 78 |
-
"sl": "slv",
|
| 79 |
-
"sk": "slk",
|
| 80 |
-
"et": "est",
|
| 81 |
-
"lv": "lav",
|
| 82 |
-
"lt": "lit",
|
| 83 |
-
"eo": "epo",
|
| 84 |
-
"jv": "jav",
|
| 85 |
-
"mg": "mlg",
|
| 86 |
-
"om": "orm",
|
| 87 |
-
"so": "som",
|
| 88 |
-
"su": "sun",
|
| 89 |
-
"uz": "uzb",
|
| 90 |
-
"ku": "kur",
|
| 91 |
-
"ckb": "ckb",
|
| 92 |
-
"ne": "nep",
|
| 93 |
-
"mt": "mlt",
|
| 94 |
-
"lb": "ltz",
|
| 95 |
-
"rm": "roh",
|
| 96 |
-
"tt": "tat",
|
| 97 |
-
"ky": "kir",
|
| 98 |
-
"tg": "tgk",
|
| 99 |
-
"ba": "bak",
|
| 100 |
-
"yo": "yor",
|
| 101 |
-
"zu": "zul",
|
| 102 |
-
"ny": "nya",
|
| 103 |
-
"ce": "che"
|
| 104 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
language.py
CHANGED
|
@@ -4,63 +4,50 @@ import json
|
|
| 4 |
import os
|
| 5 |
from pathlib import Path
|
| 6 |
|
|
|
|
|
|
|
| 7 |
from source_config import LANGUAGE_BUCKETS
|
| 8 |
|
| 9 |
LANGUAGE_GROUPS = {group: cfg["langs"] for group, cfg in LANGUAGE_BUCKETS.items()}
|
| 10 |
LANGUAGE_GROUP_WEIGHTS = {group: float(cfg["weight"]) for group, cfg in LANGUAGE_BUCKETS.items()}
|
| 11 |
LANGUAGE_GROUP_MIN_CHARS = {group: int(cfg["min_chars"]) for group, cfg in LANGUAGE_BUCKETS.items()}
|
| 12 |
LATIN_GROUPS = {group for group, cfg in LANGUAGE_BUCKETS.items() if cfg.get("latin")}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
LANG_ALIASES = {
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
"nno": "no",
|
| 18 |
-
"nor": "no",
|
| 19 |
-
"pes": "fa",
|
| 20 |
-
"fas": "fa",
|
| 21 |
-
"prs": "fa",
|
| 22 |
-
"arb": "ar",
|
| 23 |
-
"acm": "ar",
|
| 24 |
-
"apc": "ar",
|
| 25 |
-
"ary": "ar",
|
| 26 |
-
"arz": "ar",
|
| 27 |
-
"arq": "ar",
|
| 28 |
-
"aeb": "ar",
|
| 29 |
-
"ajp": "ar",
|
| 30 |
-
"azb": "az",
|
| 31 |
-
"azj": "az",
|
| 32 |
-
"ayr": "ay",
|
| 33 |
-
"npi": "ne",
|
| 34 |
-
"ory": "or",
|
| 35 |
-
"pbt": "ps",
|
| 36 |
-
"swh": "sw",
|
| 37 |
-
"zsm": "ms",
|
| 38 |
-
"yue": "zh",
|
| 39 |
-
"wuu": "zh",
|
| 40 |
-
"wu": "zh",
|
| 41 |
-
"nan": "zh",
|
| 42 |
-
"hak": "zh",
|
| 43 |
-
"gan": "zh",
|
| 44 |
-
"cdo": "zh",
|
| 45 |
-
"hsn": "zh",
|
| 46 |
-
"cjy": "zh",
|
| 47 |
-
"mnp": "zh",
|
| 48 |
-
"cpx": "zh",
|
| 49 |
-
"czh": "zh",
|
| 50 |
-
"lzh": "zh",
|
| 51 |
-
"hbo": "he",
|
| 52 |
-
"grc": "el",
|
| 53 |
}
|
| 54 |
-
|
| 55 |
-
LANGS_JSON = Path(__file__).with_name("all_langs.json")
|
| 56 |
-
|
| 57 |
-
# Read from file to load the key-value pairs. JSON key order is canonical.
|
| 58 |
-
LANG_ISO2_TO_ISO3 = {}
|
| 59 |
-
with open(LANGS_JSON) as f:
|
| 60 |
-
# Parse the file as a json string
|
| 61 |
-
LANG_ISO2_TO_ISO3 = json.load(f)
|
| 62 |
-
|
| 63 |
-
ALL_LANGS = list(LANG_ISO2_TO_ISO3.keys())
|
| 64 |
LANG_TO_GROUP = {lang: group for group, langs in LANGUAGE_GROUPS.items() for lang in langs}
|
| 65 |
|
| 66 |
|
|
@@ -71,24 +58,3 @@ def canonical_lang(lang: str) -> str:
|
|
| 71 |
def canonical_lang_family(lang: str) -> str:
|
| 72 |
"""Collapse closely related variants to the app's display/training families."""
|
| 73 |
return canonical_lang(lang)
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
def write_all_langs_json(path: str | os.PathLike[str] = LANGS_JSON) -> None:
|
| 77 |
-
"""Write the canonical ALL_LANGS list to JSON if it is missing."""
|
| 78 |
-
path = Path(path)
|
| 79 |
-
if path.exists():
|
| 80 |
-
return
|
| 81 |
-
with path.open("w", encoding="utf-8") as f:
|
| 82 |
-
json.dump(ALL_LANGS, f, ensure_ascii=False, indent=2)
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
def load_all_langs(path: str | os.PathLike[str] = LANGS_JSON) -> list[str]:
|
| 86 |
-
"""Load ALL_LANGS from JSON, falling back to the in-repo constant."""
|
| 87 |
-
path = Path(path)
|
| 88 |
-
if path.exists():
|
| 89 |
-
with path.open(encoding="utf-8") as f:
|
| 90 |
-
langs = json.load(f)
|
| 91 |
-
if isinstance(langs, list) and all(isinstance(lang, str) for lang in langs):
|
| 92 |
-
return langs
|
| 93 |
-
write_all_langs_json(path)
|
| 94 |
-
return ALL_LANGS[:]
|
|
|
|
| 4 |
import os
|
| 5 |
from pathlib import Path
|
| 6 |
|
| 7 |
+
import pycountry
|
| 8 |
+
|
| 9 |
from source_config import LANGUAGE_BUCKETS
|
| 10 |
|
| 11 |
LANGUAGE_GROUPS = {group: cfg["langs"] for group, cfg in LANGUAGE_BUCKETS.items()}
|
| 12 |
LANGUAGE_GROUP_WEIGHTS = {group: float(cfg["weight"]) for group, cfg in LANGUAGE_BUCKETS.items()}
|
| 13 |
LANGUAGE_GROUP_MIN_CHARS = {group: int(cfg["min_chars"]) for group, cfg in LANGUAGE_BUCKETS.items()}
|
| 14 |
LATIN_GROUPS = {group for group, cfg in LANGUAGE_BUCKETS.items() if cfg.get("latin")}
|
| 15 |
+
LANGUAGE_ALIASES_JSON = Path(__file__).with_name("language_aliases.json")
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def _load_json_mapping(path: Path) -> dict[str, object]:
|
| 19 |
+
with path.open(encoding="utf-8") as f:
|
| 20 |
+
data = json.load(f)
|
| 21 |
+
if not isinstance(data, dict):
|
| 22 |
+
raise ValueError(f"Expected JSON object in {path}")
|
| 23 |
+
return data
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def _load_language_aliases(path: Path = LANGUAGE_ALIASES_JSON) -> dict[str, tuple[str, ...]]:
|
| 27 |
+
aliases = _load_json_mapping(path)
|
| 28 |
+
normalized: dict[str, tuple[str, ...]] = {}
|
| 29 |
+
for canonical, source_aliases in aliases.items():
|
| 30 |
+
if not isinstance(canonical, str) or not isinstance(source_aliases, list):
|
| 31 |
+
continue
|
| 32 |
+
normalized[canonical] = tuple(alias for alias in source_aliases if isinstance(alias, str) and alias)
|
| 33 |
+
return normalized
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
LANGUAGE_ALIASES = _load_language_aliases()
|
| 37 |
+
ALL_LANGS = list(LANGUAGE_ALIASES.keys())
|
| 38 |
+
LANG_ISO2_TO_ISO3 = {
|
| 39 |
+
lang: (
|
| 40 |
+
getattr(pycountry.languages.get(alpha_2=lang) or pycountry.languages.get(alpha_3=lang), "alpha_3", None)
|
| 41 |
+
or lang
|
| 42 |
+
)
|
| 43 |
+
for lang in ALL_LANGS
|
| 44 |
+
}
|
| 45 |
LANG_ALIASES = {
|
| 46 |
+
alias: canonical
|
| 47 |
+
for canonical, aliases in LANGUAGE_ALIASES.items()
|
| 48 |
+
for alias in aliases
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
}
|
| 50 |
+
LANG_SOURCE_CODES = {canonical: tuple(aliases) for canonical, aliases in LANGUAGE_ALIASES.items()}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
LANG_TO_GROUP = {lang: group for group, langs in LANGUAGE_GROUPS.items() for lang in langs}
|
| 52 |
|
| 53 |
|
|
|
|
| 58 |
def canonical_lang_family(lang: str) -> str:
|
| 59 |
"""Collapse closely related variants to the app's display/training families."""
|
| 60 |
return canonical_lang(lang)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
language_aliases.json
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"en": ["eng"],
|
| 3 |
+
"es": ["spa"],
|
| 4 |
+
"fr": ["fra", "fre"],
|
| 5 |
+
"de": ["deu", "ger"],
|
| 6 |
+
"it": ["ita"],
|
| 7 |
+
"pt": ["por"],
|
| 8 |
+
"nl": ["nld", "dut"],
|
| 9 |
+
"vi": ["vie"],
|
| 10 |
+
"tr": ["tur"],
|
| 11 |
+
"la": ["lat"],
|
| 12 |
+
"id": ["ind"],
|
| 13 |
+
"ms": ["msa", "may", "zlm", "zsm"],
|
| 14 |
+
"af": ["afr"],
|
| 15 |
+
"sq": ["alb", "sqi"],
|
| 16 |
+
"is": ["ice", "isl"],
|
| 17 |
+
"no": ["nb", "nn", "nor", "nob", "nno"],
|
| 18 |
+
"sv": ["swe"],
|
| 19 |
+
"da": ["dan"],
|
| 20 |
+
"fi": ["fin"],
|
| 21 |
+
"hu": ["hun"],
|
| 22 |
+
"pl": ["pol"],
|
| 23 |
+
"cs": ["ces", "cze"],
|
| 24 |
+
"ro": ["ron", "rum"],
|
| 25 |
+
"ru": ["rus"],
|
| 26 |
+
"bg": ["bul"],
|
| 27 |
+
"uk": ["ukr"],
|
| 28 |
+
"sr": ["srp"],
|
| 29 |
+
"be": ["bel"],
|
| 30 |
+
"kk": ["kaz"],
|
| 31 |
+
"mk": ["mac", "mkd"],
|
| 32 |
+
"mn": ["mon", "khk"],
|
| 33 |
+
"zh": ["chi", "zho", "cmn", "yue", "wuu", "nan"],
|
| 34 |
+
"ja": ["jpn"],
|
| 35 |
+
"ko": ["kor"],
|
| 36 |
+
"hi": ["hin"],
|
| 37 |
+
"ur": ["urd"],
|
| 38 |
+
"bn": ["ben"],
|
| 39 |
+
"ta": ["tam"],
|
| 40 |
+
"te": ["tel"],
|
| 41 |
+
"mr": ["mar"],
|
| 42 |
+
"gu": ["guj"],
|
| 43 |
+
"kn": ["kan"],
|
| 44 |
+
"ml": ["mal"],
|
| 45 |
+
"pa": ["pan"],
|
| 46 |
+
"as": ["asm"],
|
| 47 |
+
"or": ["ori", "ory"],
|
| 48 |
+
"ar": ["arb", "ara"],
|
| 49 |
+
"fa": ["fas", "per", "pes", "prs"],
|
| 50 |
+
"ps": ["pus", "pbt"],
|
| 51 |
+
"sd": ["snd"],
|
| 52 |
+
"ug": ["uig"],
|
| 53 |
+
"el": ["ell", "gre", "grc"],
|
| 54 |
+
"he": ["heb", "iw", "hbo"],
|
| 55 |
+
"hy": ["hye", "hyw"],
|
| 56 |
+
"ka": ["kat"],
|
| 57 |
+
"am": ["amh"],
|
| 58 |
+
"km": ["khm"],
|
| 59 |
+
"lo": ["lao"],
|
| 60 |
+
"my": ["mya"],
|
| 61 |
+
"th": ["tha"],
|
| 62 |
+
"si": ["sin"],
|
| 63 |
+
"bo": ["bod"],
|
| 64 |
+
"dv": ["div"],
|
| 65 |
+
"ti": ["tir"],
|
| 66 |
+
"sw": ["swa", "swh"],
|
| 67 |
+
"eu": ["eus"],
|
| 68 |
+
"tl": ["fil", "tgl"],
|
| 69 |
+
"ca": ["cat"],
|
| 70 |
+
"gl": ["glg"],
|
| 71 |
+
"oc": ["oci"],
|
| 72 |
+
"br": ["bre"],
|
| 73 |
+
"ga": ["gle"],
|
| 74 |
+
"gd": ["gla"],
|
| 75 |
+
"cy": ["cym"],
|
| 76 |
+
"bs": ["bos"],
|
| 77 |
+
"hr": ["hrv"],
|
| 78 |
+
"sl": ["slv"],
|
| 79 |
+
"sk": ["slk"],
|
| 80 |
+
"et": ["est"],
|
| 81 |
+
"lv": ["lav", "lvs"],
|
| 82 |
+
"lt": ["lit"],
|
| 83 |
+
"eo": ["epo"],
|
| 84 |
+
"jv": ["jav"],
|
| 85 |
+
"mg": ["mlg", "plt"],
|
| 86 |
+
"om": ["orm"],
|
| 87 |
+
"so": ["som"],
|
| 88 |
+
"su": ["sun"],
|
| 89 |
+
"uz": ["uzn", "uzb"],
|
| 90 |
+
"ku": ["kur", "kmr"],
|
| 91 |
+
"ckb": [],
|
| 92 |
+
"ne": ["npi"],
|
| 93 |
+
"mt": ["mlt"],
|
| 94 |
+
"lb": ["ltz"],
|
| 95 |
+
"rm": ["roh"],
|
| 96 |
+
"tt": ["tat"],
|
| 97 |
+
"ky": ["kir"],
|
| 98 |
+
"tg": ["tgk"],
|
| 99 |
+
"ba": ["bak"],
|
| 100 |
+
"yo": ["yor"],
|
| 101 |
+
"zu": ["zul"],
|
| 102 |
+
"ny": ["nya"],
|
| 103 |
+
"ce": ["che"]
|
| 104 |
+
}
|
tatoeba.py
CHANGED
|
@@ -14,15 +14,6 @@ from language import ALL_LANGS, LANG_ISO2_TO_ISO3, canonical_lang
|
|
| 14 |
TATOEBA_CACHE_DIR = Path(__file__).with_name("data") / "tatoeba"
|
| 15 |
TATOEBA_PARQUET_PATH = TATOEBA_CACHE_DIR / "tatoeba_text.parquet"
|
| 16 |
|
| 17 |
-
DEFAULT_LANGUAGE_REMAPS = {
|
| 18 |
-
"cmn": "zh",
|
| 19 |
-
"yue": "zh",
|
| 20 |
-
"wuu": "zh",
|
| 21 |
-
"nan": "zh",
|
| 22 |
-
"nob": "no",
|
| 23 |
-
"nno": "no",
|
| 24 |
-
}
|
| 25 |
-
|
| 26 |
|
| 27 |
def _normalize_text_key(text: str) -> str:
|
| 28 |
normalized = unicodedata.normalize("NFKC", text)
|
|
@@ -34,10 +25,10 @@ def _normalize_lang(code: str) -> str | None:
|
|
| 34 |
code = (code or "").strip()
|
| 35 |
if not code:
|
| 36 |
return None
|
| 37 |
-
code =
|
| 38 |
if code in ALL_LANGS:
|
| 39 |
return code
|
| 40 |
-
return
|
| 41 |
|
| 42 |
|
| 43 |
def _coerce_source_lang(lang_code: str) -> tuple[str, str]:
|
|
|
|
| 14 |
TATOEBA_CACHE_DIR = Path(__file__).with_name("data") / "tatoeba"
|
| 15 |
TATOEBA_PARQUET_PATH = TATOEBA_CACHE_DIR / "tatoeba_text.parquet"
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
def _normalize_text_key(text: str) -> str:
|
| 19 |
normalized = unicodedata.normalize("NFKC", text)
|
|
|
|
| 25 |
code = (code or "").strip()
|
| 26 |
if not code:
|
| 27 |
return None
|
| 28 |
+
code = canonical_lang(code)
|
| 29 |
if code in ALL_LANGS:
|
| 30 |
return code
|
| 31 |
+
return None
|
| 32 |
|
| 33 |
|
| 34 |
def _coerce_source_lang(lang_code: str) -> tuple[str, str]:
|