import os
import json
from itertools import islice

from tqdm import tqdm
from datasets import load_dataset

# Kazakh Cyrillic -> Latin mapping, following the 2021 Kazakh Latin
# alphabet for the native letters. Letters used mainly in Russian
# loanwords (Ц, Ч, Щ, Э, Ю, Я, Ё) get simple ad-hoc replacements, and
# the hard/soft signs are dropped. Note the dotted/dotless distinction:
# И/Й -> İ/i, while І -> I/ı.
cyrillic_to_latin = {
    "А": "A", "а": "a",
    "Ә": "Ä", "ә": "ä",
    "Б": "B", "б": "b",
    "Д": "D", "д": "d",
    "Е": "E", "е": "e",
    "Ф": "F", "ф": "f",
    "Г": "G", "г": "g",
    "Ғ": "Ğ", "ғ": "ğ",
    "Х": "H", "х": "h",
    "Һ": "H", "һ": "h",
    "И": "İ", "и": "i",
    "Й": "İ", "й": "i",
    "І": "I", "і": "ı",
    "Ж": "J", "ж": "j",
    "К": "K", "к": "k",
    "Қ": "Q", "қ": "q",
    "Л": "L", "л": "l",
    "М": "M", "м": "m",
    "Н": "N", "н": "n",
    "Ң": "Ñ", "ң": "ñ",
    "О": "O", "о": "o",
    "Ө": "Ö", "ө": "ö",
    "П": "P", "п": "p",
    "Р": "R", "р": "r",
    "С": "S", "с": "s",
    "Ш": "Ş", "ш": "ş",
    "Т": "T", "т": "t",
    "У": "U", "у": "u",
    "Ұ": "Ū", "ұ": "ū",
    "Ү": "Ü", "ү": "ü",
    "В": "V", "в": "v",
    "Ы": "Y", "ы": "y",
    "З": "Z", "з": "z",

    # Russian-loanword letters
    "Ц": "Ts", "ц": "ts",
    "Ч": "Ch", "ч": "ch",
    "Щ": "Ş", "щ": "ş",
    "Э": "E", "э": "e",
    "Ю": "Iu", "ю": "iu",
    "Я": "Ia", "я": "ia",
    "Ё": "Io", "ё": "io",

    # Hard and soft signs carry no sound of their own
    "Ъ": "", "ъ": "",
    "Ь": "", "ь": "",
}


def convert_to_latin(text: str) -> str:
    """
    Apply the Cyrillic -> Latin mapping for Kazakh characters,
    leaving any unmapped character (Latin letters, digits, punctuation) unchanged.
    """
    return "".join(cyrillic_to_latin.get(char, char) for char in text)
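

# Optional speed-up (a sketch, not part of the original pipeline): every
# key in cyrillic_to_latin is a single character, so the whole mapping
# can be precompiled once into a translation table and applied with
# str.translate, which is typically faster than a per-character generator
# expression on multi-million-line corpora. The names _LATIN_TABLE and
# convert_to_latin_fast are illustrative, not from the original script.
_LATIN_TABLE = str.maketrans(cyrillic_to_latin)

def convert_to_latin_fast(text: str) -> str:
    """Equivalent to convert_to_latin, using a precomputed table."""
    return text.translate(_LATIN_TABLE)

# Quick sanity check that the two variants agree:
assert convert_to_latin("Қазақстан") == convert_to_latin_fast("Қазақстан") == "Qazaqstan"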


output_path = "src/data/kazakh_latin_corpus.jsonl"
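
# Every record written below, from all three corpora, has the same shape:
#   {"transliteration": {"src": "<Cyrillic text>", "tgt": "<Latin text>"}}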

print("Processing the Wikipedia dump of Kazakh articles...")
with open(output_path, "w", encoding="utf-8") as out_file:
    for root, _, files in os.walk("src/data/extracted"):
        for fname in tqdm(files, desc="Files in Wikipedia dump"):
            with open(os.path.join(root, fname), "r", encoding="utf-8") as f:
                for line in f:
                    try:
                        data = json.loads(line)
                        cyr_text = data["text"].strip()
                        lat_text = convert_to_latin(cyr_text).strip()

                        # Keep only non-empty pairs
                        if cyr_text and lat_text:
                            obj = {
                                "transliteration": {
                                    "src": cyr_text,
                                    "tgt": lat_text
                                }
                            }
                            out_file.write(json.dumps(obj, ensure_ascii=False) + "\n")
                    except Exception as e:
                        # Malformed JSON lines or missing "text" fields are skipped
                        tqdm.write(f"Skipping due to: {e}")

print("Done")


print("Loading 'CC100-Kazakh' dataset...")
with open(output_path, "a", encoding="utf-8") as out_file:
    with open("src/data/kk.txt", "r", encoding="utf-8") as f:
        for line in tqdm(islice(f, 2_200_000), total=2_200_000, desc="Lines in CC100-Kazakh"):
            try:
                cyr_text = line.strip()
                lat_text = convert_to_latin(cyr_text).strip()

                if cyr_text and lat_text:
                    obj = {
                        "transliteration": {
                            "src": cyr_text,
                            "tgt": lat_text
                        }
                    }
                    out_file.write(json.dumps(obj, ensure_ascii=False) + "\n")
            except Exception as e:
                tqdm.write(f"Skipping due to: {e}")


print("Loading 'KazParC' dataset...")

kazparc = load_dataset("issai/kazparc", "kazparc_raw", split="train")
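
# KazParC is a parallel corpus hosted on the Hugging Face Hub; only the
# Kazakh side matters here, so entries are used when they carry a string
# under the "kk" key and all other language columns are ignored.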
with open(output_path, "a", encoding="utf-8") as out_file:
    for entry in tqdm(kazparc, desc="Entries in KazParC"):
        try:
            if "kk" in entry and isinstance(entry["kk"], str):
                cyr_text = entry["kk"].strip()
                lat_text = convert_to_latin(cyr_text).strip()

                if cyr_text and lat_text:
                    obj = {
                        "transliteration": {
                            "src": cyr_text,
                            "tgt": lat_text
                        }
                    }
                    out_file.write(json.dumps(obj, ensure_ascii=False) + "\n")
        except Exception as e:
            tqdm.write(f"Skipping due to: {e}")

print("Done")