# dalat5 — src/data/generate_cyr_lat_pairs.py
# (Refined v5.3 update, commit 3cf1937, by crossroderick)
# Builds a Kazakh Cyrillic -> Latin transliteration corpus in JSONL form from
# three sources: an extracted Wikipedia dump, CC100-Kazakh, and KazParC.
import os
import json
from tqdm import tqdm
from itertools import islice
from datasets import load_dataset
# Kazakh Cyrillic character to the Kazakh Latin character mapping from 2021 onwards.
# NOTE: the mapping is intentionally lossy and non-invertible — И, Й and
# uppercase І all surface as Latin "I", and the hard/soft signs Ъ/Ь are
# dropped entirely (empty-string targets).
cyrillic_to_latin = {
    "А": "A", "а": "a",
    "Ә": "Ä", "ә": "ä",
    "Б": "B", "б": "b",
    "Д": "D", "д": "d",
    "Е": "E", "е": "e",
    "Ф": "F", "ф": "f",
    "Г": "G", "г": "g",
    "Ғ": "Ğ", "ғ": "ğ",
    "Х": "H", "х": "h", # also Һ, see below
    "Һ": "H", "һ": "h",
    "И": "I", "и": "i", # used for [и], [й]
    "І": "I", "і": "ı", # distinct from И in sound, both map to 'I' in uppercase
    "Ж": "J", "ж": "j",
    "К": "K", "к": "k",
    "Қ": "Q", "қ": "q",
    "Л": "L", "л": "l",
    "М": "M", "м": "m",
    "Н": "N", "н": "n",
    "Ң": "Ñ", "ң": "ñ",
    "О": "O", "о": "o",
    "Ө": "Ö", "ө": "ö",
    "П": "P", "п": "p",
    "Р": "R", "р": "r",
    "С": "S", "с": "s",
    "Ш": "Ş", "ш": "ş",
    "Т": "T", "т": "t",
    "У": "U", "у": "u", # basic 'u' sound, distinct from Ұ
    "Ұ": "Ū", "ұ": "ū", # back rounded, used frequently
    "Ү": "Ü", "ү": "ü", # front rounded
    "В": "V", "в": "v",
    "Ы": "Y", "ы": "y",
    "Й": "I", "й": "i", # same treatment as И
    "Ц": "Ts", "ц": "ts", # for Russian borrowings
    "Ч": "Ch", "ч": "ch",
    "Щ": "Ş", "щ": "ş", # typically simplified to 'ş'
    "Э": "E", "э": "e",
    "Ю": "Iu", "ю": "iu", # borrowed words only
    "Я": "Ia", "я": "ia",
    "Ъ": "", "ъ": "",
    "Ь": "", "ь": "",
    "З": "Z", "з": "z",
    # Additional (not in table but used in borrowings)
    "Ё": "Io", "ё": "io",
}

# Precompiled translation table. str.translate performs the whole conversion in
# a single C-level pass, instead of a Python-level dict lookup per character —
# this script pushes millions of lines through the converter, so the hoisted
# table is a free win. Empty-string values delete the character, exactly as
# the previous per-character join did.
_CYR2LAT_TABLE = str.maketrans(cyrillic_to_latin)

def convert_to_latin(text: str) -> str:
    """
    Apply the Kazakh Cyrillic -> Latin (2021 alphabet) mapping to ``text``.

    Characters without an entry in ``cyrillic_to_latin`` (Latin letters,
    digits, punctuation, whitespace) pass through unchanged; Ъ/ъ and Ь/ь
    are removed.
    """
    return text.translate(_CYR2LAT_TABLE)
# --- Output corpus file, shared by all three processing steps ---
output_path = "src/data/kazakh_latin_corpus.jsonl"

# First step: process the Wikipedia dump
print("Processing the Wikipedia dump of Kazakh articles...")

with open(output_path, 'w', encoding = "utf-8") as out_file:
    # Walk every directory the wiki extractor produced; each file holds one
    # JSON object per line with the article text under "text".
    for root, _, files in os.walk("src/data/extracted"):
        for fname in tqdm(files, desc = "Files in Wikipedia dump"):
            fpath = os.path.join(root, fname)
            with open(fpath, 'r', encoding = "utf-8") as f:
                for raw_line in f:
                    try:
                        record = json.loads(raw_line)
                        cyr_text = record["text"].strip()
                        lat_text = convert_to_latin(cyr_text).strip()
                        # Keep only non-empty pairs
                        if cyr_text and lat_text:
                            pair = {"transliteration": {"src": cyr_text, "tgt": lat_text}}
                            out_file.write(json.dumps(pair, ensure_ascii = False) + "\n")
                    except Exception as e:
                        # Best-effort: log the bad line and carry on
                        tqdm.write(f"Skipping due to: {e}")
                        continue
print("Done")
# Second step: process the "CC100-Kazakh" dataset
print("Loading 'CC100-Kazakh' dataset...")

with open(output_path, 'a', encoding = "utf-8") as out_file:
    with open("src/data/kk.txt", 'r', encoding = "utf-8") as f:
        # Take at most 2,200,000 lines from the plain-text dump
        capped = islice(f, 2_200_000)
        for raw_line in tqdm(capped, total = 2_200_000, desc = "Lines in CC100-Kazakh"):
            try:
                cyr_text = raw_line.strip()
                lat_text = convert_to_latin(cyr_text).strip()
                # Keep only non-empty pairs
                if cyr_text and lat_text:
                    pair = {"transliteration": {"src": cyr_text, "tgt": lat_text}}
                    out_file.write(json.dumps(pair, ensure_ascii = False) + "\n")
            except Exception as e:
                # Best-effort: log the bad line and carry on
                tqdm.write(f"Skipping due to: {e}")
                continue
# Third step: process the raw, Kazakh-centred part of the "KazParC" dataset
print("Loading 'KazParC' dataset...")

kazparc = load_dataset("issai/kazparc", "kazparc_raw", split = "train")

with open(output_path, 'a', encoding = "utf-8") as out_file:
    for entry in tqdm(kazparc, desc = "Entries in KazParC"):
        try:
            # Only use entries that carry a string-valued "kk" field
            kk_field = entry["kk"] if "kk" in entry else None
            if isinstance(kk_field, str):
                cyr_text = kk_field.strip()
                lat_text = convert_to_latin(cyr_text).strip()
                # Keep only non-empty pairs
                if cyr_text and lat_text:
                    pair = {"transliteration": {"src": cyr_text, "tgt": lat_text}}
                    out_file.write(json.dumps(pair, ensure_ascii = False) + "\n")
        except Exception as e:
            # Best-effort: log the bad entry and carry on
            tqdm.write(f"Skipping due to: {e}")
            continue