| from __future__ import annotations |
| LANGUAGE_BUCKETS = { |
| |
| "English": { |
| "langs": ["en"], |
| "weight": 2.9, |
| "min_chars": 2_000, |
| "latin": True, |
| }, |
| |
| "Russian": { |
| "langs": ["ru"], |
| "weight": 1.95, |
| "min_chars": 2_000, |
| "latin": False, |
| }, |
| |
| "German": { |
| "langs": ["de"], |
| "weight": 1.9, |
| "min_chars": 2_000, |
| "latin": True, |
| }, |
| |
| "Japanese": { |
| "langs": ["ja"], |
| "weight": 1.9, |
| "min_chars": 1_200, |
| "latin": False, |
| }, |
| |
| "Chinese": { |
| "langs": ["zh"], |
| "weight": 1.9, |
| "min_chars": 1_200, |
| "latin": False, |
| }, |
| |
| "French": { |
| "langs": ["fr"], |
| "weight": 1.9, |
| "min_chars": 2_000, |
| "latin": True, |
| }, |
| |
| "Spanish": { |
| "langs": ["es"], |
| "weight": 1.9, |
| "min_chars": 2_000, |
| "latin": True, |
| }, |
| |
| "Portuguese": { |
| "langs": ["pt"], |
| "weight": 1.7, |
| "min_chars": 2_000, |
| "latin": True, |
| }, |
| |
| "Italian": { |
| "langs": ["it"], |
| "weight": 1.6, |
| "min_chars": 2_000, |
| "latin": True, |
| }, |
| |
| "Polish": { |
| "langs": ["pl"], |
| "weight": 1.55, |
| "min_chars": 2_000, |
| "latin": True, |
| }, |
| |
| "Dutch": { |
| "langs": ["nl"], |
| "weight": 1.55, |
| "min_chars": 2_000, |
| "latin": True, |
| }, |
| |
| "Turkish": { |
| "langs": ["tr"], |
| "weight": 1.45, |
| "min_chars": 2_000, |
| "latin": True, |
| }, |
| |
| "SoutheastAsianLatin": { |
| "langs": ["vi", "id", "ms", "sq", "la"], |
| "weight": 1.55, |
| "min_chars": 2_000, |
| "latin": True, |
| }, |
| "WesternLatin": { |
| "langs": ["ca", "gl", "oc"], |
| "weight": 1.2, |
| "min_chars": 1_500, |
| "latin": True, |
| }, |
| "CelticLatin": { |
| "langs": ["br", "ga", "gd", "cy"], |
| "weight": 1.3, |
| "min_chars": 1_500, |
| "latin": True, |
| }, |
| "AdriaticLatin": { |
| "langs": ["bs", "hr", "sl", "sk"], |
| "weight": 1.4, |
| "min_chars": 1_500, |
| "latin": True, |
| }, |
| "BalticLatin": { |
| "langs": ["et", "lv", "lt"], |
| "weight": 1.2, |
| "min_chars": 1_500, |
| "latin": True, |
| }, |
| |
| "CentralEuropeanLatin": { |
| "langs": ["cs", "ro", "hu"], |
| "weight": 1.3, |
| "min_chars": 2_000, |
| "latin": True, |
| }, |
| |
| "Korean": { |
| "langs": ["ko"], |
| "weight": 1.35, |
| "min_chars": 1_200, |
| "latin": False, |
| }, |
| |
| "EastSlavicCyrillic": { |
| "langs": ["uk", "be"], |
| "weight": 1.7, |
| "min_chars": 2_000, |
| "latin": False, |
| }, |
| |
| "Arabic": { |
| "langs": ["ar"], |
| "weight": 1.4, |
| "min_chars": 2_000, |
| "latin": False, |
| }, |
| "Norwegian": { |
| "langs": ["no"], |
| "weight": 1.0, |
| "min_chars": 2_000, |
| "latin": True, |
| }, |
| |
| |
| |
| "NordicCore": { |
| "langs": ["sv", "da", "is", "af", "fi"], |
| "weight": 2.1, |
| "min_chars": 2_000, |
| "latin": True, |
| }, |
| |
| "BalkanCyrillic": { |
| "langs": ["bg", "sr", "mk"], |
| "weight": 1.05, |
| "min_chars": 2_000, |
| "latin": False, |
| }, |
| |
| "ArabicOther": { |
| "langs": ["fa", "ps", "sd", "ug"], |
| "weight": 0.95, |
| "min_chars": 2_000, |
| "latin": False, |
| }, |
| |
| |
| "Hindi": { |
| "langs": ["hi"], |
| "weight": 1.0, |
| "min_chars": 2_000, |
| "latin": False, |
| }, |
| |
| "IndicOther": { |
| "langs": [ |
| "ur", |
| "bn", |
| "ta", |
| "te", |
| "mr", |
| "gu", |
| "kn", |
| "ml", |
| "pa", |
| "as", |
| "or", |
| "ne", |
| ], |
| "weight": 0.95, |
| "min_chars": 2_000, |
| "latin": False, |
| }, |
| |
| "CentralAsianCaucusCyrillic": { |
| "langs": ["kk", "mn", "tt", "ky", "tg", "ba", "ce"], |
| "weight": 1.1, |
| "min_chars": 2_000, |
| "latin": False, |
| }, |
| |
| |
| |
| "KurdishLatin": { |
| "langs": ["ku"], |
| "weight": 0.45, |
| "min_chars": 1_500, |
| "latin": True, |
| }, |
| "KurdishArabic": { |
| "langs": ["ckb"], |
| "weight": 0.45, |
| "min_chars": 2_000, |
| "latin": False, |
| }, |
| "AfricanLatin": { |
| "langs": ["sw", "tl", "eu", "yo", "zu", "ny"], |
| "weight": 1.0, |
| "min_chars": 1_500, |
| "latin": True, |
| }, |
| "PeripheralLatin": { |
| "langs": ["eo", "jv", "lb", "mg", "mt", "om", "rm", "so", "su", "uz"], |
| "weight": 1.0, |
| "min_chars": 1_500, |
| "latin": True, |
| }, |
| |
| |
| "OtherScriptsWest": { |
| "langs": ["el", "he", "hy", "ka", "am", "ti", "dv", "hbo", "grc"], |
| "weight": 1.0, |
| "min_chars": 2_000, |
| "latin": False, |
| }, |
| "OtherScriptsEast": { |
| "langs": ["km", "lo", "my", "th", "si", "bo"], |
| "weight": 1.0, |
| "min_chars": 2_000, |
| "latin": False, |
| }, |
| } |