# dalat5 — src/data/generate_cyr_lat_pairs.py
# (Refined v5.3 update, commit 3cf1937, by crossroderick)
# Builds a Kazakh Cyrillic -> Latin transliteration corpus in JSONL form from
# three sources: an extracted Wikipedia dump, CC100-Kazakh, and KazParC.
import os
import json
from tqdm import tqdm
from itertools import islice
from datasets import load_dataset
# Kazakh Cyrillic character to the Kazakh Latin character mapping from 2021 onwards.
# NOTE: the mapping is intentionally lossy and non-invertible — И, Й and
# uppercase І all surface as Latin "I", and the hard/soft signs Ъ/Ь are
# dropped entirely (empty-string targets).
cyrillic_to_latin = {
    "А": "A", "а": "a",
    "Ә": "Ä", "ә": "ä",
    "Б": "B", "б": "b",
    "Д": "D", "д": "d",
    "Е": "E", "е": "e",
    "Ф": "F", "ф": "f",
    "Г": "G", "г": "g",
    "Ғ": "Ğ", "ғ": "ğ",
    "Х": "H", "х": "h", # also Һ, see below
    "Һ": "H", "һ": "h",
    "И": "I", "и": "i", # used for [и], [й]
    "І": "I", "і": "ı", # distinct from И in sound, both map to 'I' in uppercase
    "Ж": "J", "ж": "j",
    "К": "K", "к": "k",
    "Қ": "Q", "қ": "q",
    "Л": "L", "л": "l",
    "М": "M", "м": "m",
    "Н": "N", "н": "n",
    "Ң": "Ñ", "ң": "ñ",
    "О": "O", "о": "o",
    "Ө": "Ö", "ө": "ö",
    "П": "P", "п": "p",
    "Р": "R", "р": "r",
    "С": "S", "с": "s",
    "Ш": "Ş", "ш": "ş",
    "Т": "T", "т": "t",
    "У": "U", "у": "u", # basic 'u' sound, distinct from Ұ
    "Ұ": "Ū", "ұ": "ū", # back rounded, used frequently
    "Ү": "Ü", "ү": "ü", # front rounded
    "В": "V", "в": "v",
    "Ы": "Y", "ы": "y",
    "Й": "I", "й": "i", # same treatment as И
    "Ц": "Ts", "ц": "ts", # for Russian borrowings
    "Ч": "Ch", "ч": "ch",
    "Щ": "Ş", "щ": "ş", # typically simplified to 'ş'
    "Э": "E", "э": "e",
    "Ю": "Iu", "ю": "iu", # borrowed words only
    "Я": "Ia", "я": "ia",
    "Ъ": "", "ъ": "",
    "Ь": "", "ь": "",
    "З": "Z", "з": "z",
    # Additional (not in table but used in borrowings)
    "Ё": "Io", "ё": "io",
}

# Precompiled translation table. str.translate performs the whole conversion in
# a single C-level pass, instead of a Python-level dict lookup per character —
# this script pushes millions of lines through the converter, so the hoisted
# table is a free win. Empty-string values delete the character, exactly as
# the previous per-character join did.
_CYR2LAT_TABLE = str.maketrans(cyrillic_to_latin)

def convert_to_latin(text: str) -> str:
    """
    Apply the Kazakh Cyrillic -> Latin (2021 alphabet) mapping to ``text``.

    Characters without an entry in ``cyrillic_to_latin`` (Latin letters,
    digits, punctuation, whitespace) pass through unchanged; Ъ/ъ and Ь/ь
    are removed.
    """
    return text.translate(_CYR2LAT_TABLE)
# --- Output corpus file, shared by all three processing steps ---
output_path = "src/data/kazakh_latin_corpus.jsonl"

# First step: process the Wikipedia dump
print("Processing the Wikipedia dump of Kazakh articles...")

with open(output_path, 'w', encoding = "utf-8") as out_file:
    # Walk every directory the wiki extractor produced; each file holds one
    # JSON object per line with the article text under "text".
    for root, _, files in os.walk("src/data/extracted"):
        for fname in tqdm(files, desc = "Files in Wikipedia dump"):
            fpath = os.path.join(root, fname)
            with open(fpath, 'r', encoding = "utf-8") as f:
                for raw_line in f:
                    try:
                        record = json.loads(raw_line)
                        cyr_text = record["text"].strip()
                        lat_text = convert_to_latin(cyr_text).strip()
                        # Keep only non-empty pairs
                        if cyr_text and lat_text:
                            pair = {"transliteration": {"src": cyr_text, "tgt": lat_text}}
                            out_file.write(json.dumps(pair, ensure_ascii = False) + "\n")
                    except Exception as e:
                        # Best-effort: log the bad line and carry on
                        tqdm.write(f"Skipping due to: {e}")
                        continue
print("Done")
# Second step: process the "CC100-Kazakh" dataset
print("Loading 'CC100-Kazakh' dataset...")

with open(output_path, 'a', encoding = "utf-8") as out_file:
    with open("src/data/kk.txt", 'r', encoding = "utf-8") as f:
        # Take at most 2,200,000 lines from the plain-text dump
        capped = islice(f, 2_200_000)
        for raw_line in tqdm(capped, total = 2_200_000, desc = "Lines in CC100-Kazakh"):
            try:
                cyr_text = raw_line.strip()
                lat_text = convert_to_latin(cyr_text).strip()
                # Keep only non-empty pairs
                if cyr_text and lat_text:
                    pair = {"transliteration": {"src": cyr_text, "tgt": lat_text}}
                    out_file.write(json.dumps(pair, ensure_ascii = False) + "\n")
            except Exception as e:
                # Best-effort: log the bad line and carry on
                tqdm.write(f"Skipping due to: {e}")
                continue
# Third step: process the raw, Kazakh-centred part of the "KazParC" dataset
print("Loading 'KazParC' dataset...")

kazparc = load_dataset("issai/kazparc", "kazparc_raw", split = "train")

with open(output_path, 'a', encoding = "utf-8") as out_file:
    for entry in tqdm(kazparc, desc = "Entries in KazParC"):
        try:
            # Only use entries that carry a string-valued "kk" field
            kk_field = entry["kk"] if "kk" in entry else None
            if isinstance(kk_field, str):
                cyr_text = kk_field.strip()
                lat_text = convert_to_latin(cyr_text).strip()
                # Keep only non-empty pairs
                if cyr_text and lat_text:
                    pair = {"transliteration": {"src": cyr_text, "tgt": lat_text}}
                    out_file.write(json.dumps(pair, ensure_ascii = False) + "\n")
        except Exception as e:
            # Best-effort: log the bad entry and carry on
            tqdm.write(f"Skipping due to: {e}")
            continue