# language-extractor-demo / source_config.py
# DerivedFunction's picture
# update
# 5d38774
from __future__ import annotations
LANGUAGE_BUCKETS = {
# ~41% of CC β€” intentionally capped to avoid crowding out other languages
"English": {
"langs": ["en"],
"weight": 2.9,
"min_chars": 2_000,
"latin": True,
},
# ~6.3% of CC β€” was badly underweighted relative to German/French
"Russian": {
"langs": ["ru"],
"weight": 1.95,
"min_chars": 2_000,
"latin": False,
},
# ~5.9% of CC
"German": {
"langs": ["de"],
"weight": 1.9,
"min_chars": 2_000,
"latin": True,
},
# ~5.7% of CC β€” bumped up from 1.7 to match its actual footprint
"Japanese": {
"langs": ["ja"],
"weight": 1.9,
"min_chars": 1_200,
"latin": False,
},
# ~5.0% of CC β€” CC likely undercounts due to Great Firewall
"Chinese": {
"langs": ["zh"],
"weight": 1.9,
"min_chars": 1_200,
"latin": False,
},
# ~4.6% of CC
"French": {
"langs": ["fr"],
"weight": 1.9,
"min_chars": 2_000,
"latin": True,
},
# ~4.6% of CC
"Spanish": {
"langs": ["es"],
"weight": 1.9,
"min_chars": 2_000,
"latin": True,
},
# ~2.5% of CC
"Portuguese": {
"langs": ["pt"],
"weight": 1.7,
"min_chars": 2_000,
"latin": True,
},
# ~2.4% of CC
"Italian": {
"langs": ["it"],
"weight": 1.6,
"min_chars": 2_000,
"latin": True,
},
# ~2.0% of CC β€” split out from CentralEuropeanLatin; rivals Italian/Portuguese
"Polish": {
"langs": ["pl"],
"weight": 1.55,
"min_chars": 2_000,
"latin": True,
},
# ~1.8% of CC β€” was significantly underweighted at 1.15
"Dutch": {
"langs": ["nl"],
"weight": 1.55,
"min_chars": 2_000,
"latin": True,
},
# ~1.2% of CC β€” split out from CentralEuropeanLatin; large internet population
"Turkish": {
"langs": ["tr"],
"weight": 1.45,
"min_chars": 2_000,
"latin": True,
},
# ind ~1.1%, vie ~1.05% of CC
"SoutheastAsianLatin": {
"langs": ["vi", "id", "ms", "sq", "la"],
"weight": 1.55,
"min_chars": 2_000,
"latin": True,
},
"WesternLatin": {
"langs": ["ca", "gl", "oc"],
"weight": 1.2,
"min_chars": 1_500,
"latin": True,
},
"CelticLatin": {
"langs": ["br", "ga", "gd", "cy"],
"weight": 1.3,
"min_chars": 1_500,
"latin": True,
},
"AdriaticLatin": {
"langs": ["bs", "hr", "sl", "sk"],
"weight": 1.4,
"min_chars": 1_500,
"latin": True,
},
"BalticLatin": {
"langs": ["et", "lv", "lt"],
"weight": 1.2,
"min_chars": 1_500,
"latin": True,
},
# ces ~1.14%, ron ~0.53%, hun ~0.52% of CC β€” smaller tier after splitting out pl/tr
"CentralEuropeanLatin": {
"langs": ["cs", "ro", "hu"],
"weight": 1.3,
"min_chars": 2_000,
"latin": True,
},
# ~0.81% of CC β€” was overweighted at 1.7
"Korean": {
"langs": ["ko"],
"weight": 1.35,
"min_chars": 1_200,
"latin": False,
},
# ukr ~0.70%, bel ~0.017% of CC
"EastSlavicCyrillic": {
"langs": ["uk", "be"],
"weight": 1.7,
"min_chars": 2_000,
"latin": False,
},
# ~0.65% of CC β€” upweighted relative to CC share given speaker population
"Arabic": {
"langs": ["ar"],
"weight": 1.4,
"min_chars": 2_000,
"latin": False,
},
"Norwegian": {
"langs": ["no"],
"weight": 1.0,
"min_chars": 2_000,
"latin": True,
},
# sv ~0.7%, dan ~0.51%, fin ~0.37%, isl ~0.04%, afr ~0.01%
# combined ~2.0% of CC β€” was drastically overweighted at 6.0
# note: Swedish Wikipedia is heavily bot-generated stubs, don't rely on article count
"NordicCore": {
"langs": ["sv", "da", "is", "af", "fi"],
"weight": 2.1,
"min_chars": 2_000,
"latin": True,
},
# bul ~0.27%, srp ~0.25%, mkd ~0.037% of CC
"BalkanCyrillic": {
"langs": ["bg", "sr", "mk"],
"weight": 1.05,
"min_chars": 2_000,
"latin": False,
},
# fas ~0.20% of CC (ignore the one anomalous crawl spike)
"ArabicOther": {
"langs": ["fa", "ps", "sd", "ug"],
"weight": 0.95,
"min_chars": 2_000,
"latin": False,
},
# ~0.22% of CC β€” genuine web underrepresentation relative to speaker count,
# but corpus is thin; 1.0 avoids oversampling a small pool
"Hindi": {
"langs": ["hi"],
"weight": 1.0,
"min_chars": 2_000,
"latin": False,
},
# combined ~0.27% of CC β€” upweighted for script diversity
"IndicOther": {
"langs": [
"ur",
"bn",
"ta",
"te",
"mr",
"gu",
"kn",
"ml",
"pa",
"as",
"or",
"ne",
],
"weight": 0.95,
"min_chars": 2_000,
"latin": False,
},
# kk ~0.038%, mn ~0.016% of CC β€” very thin corpus, weight is already a large relative boost
"CentralAsianCaucusCyrillic": {
"langs": ["kk", "mn", "tt", "ky", "tg", "ba", "ce"],
"weight": 1.1,
"min_chars": 2_000,
"latin": False,
},
# Kurdish is split by script/source:
# - ku: Wikipedia / Latin-script Kurdish
# - ckb: FineTranslations / Arabic-script Kurdish
"KurdishLatin": {
"langs": ["ku"],
"weight": 0.45,
"min_chars": 1_500,
"latin": True,
},
"KurdishArabic": {
"langs": ["ckb"],
"weight": 0.45,
"min_chars": 2_000,
"latin": False,
},
"AfricanLatin": {
"langs": ["sw", "tl", "eu", "yo", "zu", "ny"],
"weight": 1.0,
"min_chars": 1_500,
"latin": True,
},
"PeripheralLatin": {
"langs": ["eo", "jv", "lb", "mg", "mt", "om", "rm", "so", "su", "uz"],
"weight": 1.0,
"min_chars": 1_500,
"latin": True,
},
# Split the remaining non-Latin scripts into two buckets to keep
# Greco-Semitic/Caucasus-style scripts separate from Brahmic/Tibetan ones.
"OtherScriptsWest": {
"langs": ["el", "he", "hy", "ka", "am", "ti", "dv", "hbo", "grc"],
"weight": 1.0,
"min_chars": 2_000,
"latin": False,
},
"OtherScriptsEast": {
"langs": ["km", "lo", "my", "th", "si", "bo"],
"weight": 1.0,
"min_chars": 2_000,
"latin": False,
},
}