from pathlib import Path
import json
import re

from transformers import PreTrainedTokenizer


# Single-character consonants of the (lowercased, expanded) Greek alphabet.
# Anything not in this set is treated as a vowel or diphthong by the
# syllabifier below.
CONSONANTS = set('βγδθκπτφχλρσμν')


def syllabify(tokens):
    """
    Given a list of Greek tokens (letters or diphthongs), return a list of
    syllables. Each syllable is a list of tokens.

    The syllabification follows these rules:
    - A syllable must have a vowel (or diphthong) as its nucleus.
    - A single consonant preceding a vowel is the onset of that syllable.
    - If there are multiple consonants between vowels, the first consonant is
      attached as coda to the preceding syllable, and the remaining ones form
      the onset of the following syllable.
    - Any trailing consonants are attached to the last syllable.
    """
    syllables = []
    i = 0
    n = len(tokens)

    while i < n:
        current = []

        # Collect any leading consonants as the onset of this syllable.
        while i < n and tokens[i] in CONSONANTS:
            current.append(tokens[i])
            i += 1

        # Only consonants were left: attach them to the previous syllable,
        # or emit them on their own if there is no previous syllable.
        if i >= n:
            if syllables:
                syllables[-1].extend(current)
            else:
                syllables.append(current)
            break

        # The nucleus: a vowel or diphthong.
        current.append(tokens[i])
        i += 1

        # Count the consonants that follow the nucleus.
        start = i
        count = 0
        while i < n and tokens[i] in CONSONANTS:
            count += 1
            i += 1

        if count == 0:
            # A vowel follows directly: the syllable ends with its nucleus.
            syllables.append(current)
        elif count == 1:
            # A single consonant becomes the onset of the next syllable.
            syllables.append(current)
            i = start
        else:
            # Two or more consonants: the first closes this syllable as its
            # coda, the rest are re-read as the onset of the next syllable.
            current.append(tokens[start])
            syllables.append(current)
            i = start + 1

    return syllables


def syllabify_joined(tokens):
    """
    Convenience function that returns syllables as joined strings instead of
    token lists.
    """
    syllable_lists = syllabify(tokens)
    return [''.join(syl) for syl in syllable_lists]
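

# A worked example of the syllabifier (hand-traced, illustrative): a single
# consonant between vowels begins the next syllable, while the first
# consonant of a cluster closes the preceding one.
#
#     >>> syllabify_joined(['ἀ', 'ν', 'έ', 'χ', 'ει'])
#     ['ἀ', 'νέ', 'χει']
#     >>> syllabify_joined(['σ', 'τ', 'έ', 'ρ', 'κ', 'σ', 'α', 'σ'])
#     ['στέρ', 'κσασ']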


if __name__ == '__main__':
    # A sample pre-tokenized line: single letters plus the diphthongs
    # 'ει', 'ού' and 'αἴ'.
    test_tokens = ['σ', 'τ', 'έ', 'ρ', 'κ', 'σ', 'α', 'σ', 'ἀ', 'ν', 'έ', 'χ', 'ει', 'θ', 'ού', 'ρ', 'ι', 'ο', 'σ', 'αἴ', 'α', 'σ']

    print("Syllabified (as lists):")
    syllable_lists = syllabify(test_tokens)
    for syl in syllable_lists:
        print(syl)

    print("\nSyllabified (joined strings):")
    print(syllabify_joined(test_tokens))
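
    # Expected joined output, traced by hand through the rules above
    # (illustrative):
    #   ['στέρ', 'κσα', 'σἀ', 'νέ', 'χει', 'θού', 'ρι', 'ο', 'σαἴ', 'ασ']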


# Map the precomposed "oxia" codepoints (e.g. U+1F71) to the visually
# identical "tonos" codepoints (e.g. U+03AC), so that every accented vowel
# has a single canonical form.
OXIA_TO_TONOS = {
    "ά": "ά",
    "έ": "έ",
    "ή": "ή",
    "ί": "ί",
    "ύ": "ύ",
    "ό": "ό",
    "ώ": "ώ",
}
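
# Note: Unicode NFC normalization performs the same oxia -> tonos collapse
# (the oxia codepoints canonically recompose to the tonos forms), so a
# sketch of an equivalent implementation would be:
#
#     import unicodedata
#     def replace_oxia_with_tonos(text):
#         return unicodedata.normalize("NFC", text)
#
# The explicit table has the advantage of making the rewritten characters
# easy to audit.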


# First elements of upsilon-diphthongs (αυ, ευ, ηυ, ου), and the upsilon
# forms (with any diacritics) that may follow them.
diphth_y = {'α', 'ε', 'η', 'ο'}
upsilon_forms = {'ὐ','ὔ','υ','ὑ','ύ','ὖ','ῦ','ὕ','ὗ','ὺ','ὒ','ὓ'}

# First elements of iota-diphthongs (αι, ει, οι, υι), and the iota forms
# that may follow them.
diphth_i = {'α', 'ε', 'ο', 'υ'}
iota_forms = {'ἰ','ί','ι','ῖ','ἴ','ἶ','ἵ','ἱ','ἷ','ὶ','ἲ','ἳ'}

# Long vowels (with any diacritics) that may carry an adscript iota.
adscr_i_first = {'α','η','ω','ἀ','ἠ','ὠ','ἁ','ἡ','ὡ','ά','ή','ώ','ὰ','ὴ','ὼ','ᾶ','ῆ','ῶ',
                 'ὤ','ὥ','ὢ','ὣ','ἄ','ἅ','ἂ','ἃ','ἤ','ἥ','ἣ','ἢ','ἦ','ἧ','ἆ','ἇ','ὧ','ὦ'}
adscr_i_second = {'ι'}


def process_word(word):
    # Expand the double consonants (ζ, ξ, ψ) into their component sounds and
    # normalize variant letter forms (final sigma, rho with rough breathing).
    expanded = []
    for char in word:
        if char == 'ζ':
            expanded.extend(['δ', 'σ'])
        elif char == 'ς':
            expanded.append('σ')
        elif char == 'ῥ':
            expanded.append('ρ')
        elif char == 'ξ':
            expanded.extend(['κ', 'σ'])
        elif char == 'ψ':
            expanded.extend(['π', 'σ'])
        else:
            expanded.append(char)

    # Merge diphthongs and adscript-iota pairs into single two-letter tokens.
    combined = []
    i = 0
    while i < len(expanded):
        a = expanded[i]
        b = expanded[i + 1] if i + 1 < len(expanded) else ''

        if a in diphth_y and b in upsilon_forms:
            combined.append(a + b)
            i += 2
        elif a in diphth_i and b in iota_forms:
            combined.append(a + b)
            i += 2
        elif a in adscr_i_first and b in adscr_i_second:
            combined.append(a + b)
            i += 2
        else:
            combined.append(a)
            i += 1

    return combined
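
# A hand-traced example of process_word (illustrative; assumes the tonos form
# of ύ, as produced by replace_oxia_with_tonos below): ζ expands to δσ, final
# sigma is normalized to σ, and εύ is merged into one diphthong token.
#
#     >>> process_word('ζεύς')
#     ['δ', 'σ', 'εύ', 'σ']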


def replace_oxia_with_tonos(text):
    return ''.join(OXIA_TO_TONOS.get(ch, ch) for ch in text)


def preprocess_greek_line(line):
    # Normalize oxia-accented vowels to their tonos equivalents.
    line = replace_oxia_with_tonos(line)

    # Extract words as maximal runs of polytonic Greek letters. The line is
    # lowercased first, so the uppercase entries in the character class are
    # redundant but harmless.
    words = re.findall(
        r"[ΆΐΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋόύώ"
        r"ἀἁἂἃἄἅἆἇἈἉἊἋἌἍἎ"
        r"ἐἑἒἓἔἕἘἙἜἝ"
        r"ἠἡἢἣἤἥἦἧἨἩἪἫἬἭἮ"
        r"ἰἱἲἳἴἵἶἷἸἹἺἻἼἽἾ"
        r"ὀὁὂὃὄὅὈὉὊὋὌὍ"
        r"ὐὑὒὓὔὕὖὗὙὛὝ"
        r"ὠὡὢὣὤὥὦὧὨὩὪὫὬὭὮὯ"
        r"ὰὲὴὶὸὺὼᾀᾁᾂᾃᾄᾅᾆᾇᾈᾉᾊᾋᾌᾍ"
        r"ᾐᾑᾒᾓᾔᾕᾖᾗᾘᾙᾚᾛᾜᾝ"
        r"ᾠᾡᾢᾣᾤᾥᾦᾧᾨᾩᾪᾫᾬᾭᾮᾯ"
        r"ᾲᾳᾴᾶᾷῂῃῄῆῇῒῖῗῢῤῥῦῧῬῲῳῴῶῷ]+",
        line.lower()
    )

    # Expand and diphthong-merge each word, then flatten into one token list.
    token_lists = [process_word(word) for word in words]
    return [token for tokens in token_lists for token in tokens]
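
# End-to-end sketch of the preprocessing (hand-traced, illustrative):
#
#     >>> preprocess_greek_line("μῆνιν ἄειδε θεά")
#     ['μ', 'ῆ', 'ν', 'ι', 'ν', 'ἄ', 'ει', 'δ', 'ε', 'θ', 'ε', 'ά']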


class GreekSyllableTokenizer(PreTrainedTokenizer):
    vocab_files_names = {"vocab_file": "vocab.json"}

    def __init__(self, vocab_file: str, **kwargs):
        # Load the syllable -> id mapping and build the reverse lookup.
        with Path(vocab_file).open(encoding="utf-8") as f:
            self.vocab = json.load(f)
        self.ids_to_tokens = {idx: tok for tok, idx in self.vocab.items()}

        # BERT-style special tokens by default; callers may override any of
        # them through kwargs.
        kwargs.setdefault("pad_token", "[PAD]")
        kwargs.setdefault("unk_token", "[UNK]")
        kwargs.setdefault("bos_token", "[CLS]")
        kwargs.setdefault("eos_token", "[SEP]")
        kwargs.setdefault("cls_token", "[CLS]")
        kwargs.setdefault("sep_token", "[SEP]")
        kwargs.setdefault("mask_token", "[MASK]")

        # Make sure every special token has an id in the vocabulary.
        for sp in [kwargs["bos_token"], kwargs["eos_token"],
                   kwargs["cls_token"], kwargs["sep_token"],
                   kwargs["unk_token"], kwargs["pad_token"], kwargs["mask_token"]]:
            if sp not in self.vocab:
                self.vocab[sp] = len(self.vocab)
                self.ids_to_tokens[self.vocab[sp]] = sp

        # The vocabulary must be in place before the base class initializes,
        # since PreTrainedTokenizer may resolve token ids during __init__.
        super().__init__(**kwargs)

    def _tokenize(self, text):
        # Preprocess into letter/diphthong tokens, then group into syllables.
        return syllabify_joined(preprocess_greek_line(text))

    def _convert_token_to_id(self, token):
        return self.vocab.get(token, self.vocab[self.unk_token])

    def _convert_id_to_token(self, idx):
        return self.ids_to_tokens.get(idx, self.unk_token)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        [CLS] tokens_0 [SEP]                  (single sequence)
        [CLS] tokens_0 [SEP] tokens_1 [SEP]   (sequence pair)
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        return ([self.cls_token_id] +
                token_ids_0 +
                [self.sep_token_id] +
                token_ids_1 +
                [self.sep_token_id])

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """
        Generate token_type_ids distinguishing the sequences when token_ids_1
        is given. Like RoBERTa, this tokenizer does not use token_type_ids,
        so every position is 0.
        """
        if token_ids_1 is None:
            return [0] * (len(token_ids_0) + 2)
        return [0] * (len(token_ids_0) + 2) + [0] * (len(token_ids_1) + 1)

    def get_special_tokens_mask(self,
                                token_ids_0,
                                token_ids_1=None,
                                already_has_special_tokens=False):
        # 1 marks a special token ([CLS]/[SEP]), 0 marks an ordinary token.
        if already_has_special_tokens:
            return [
                1 if tid in (self.cls_token_id, self.sep_token_id) else 0
                for tid in (token_ids_0 + (token_ids_1 or []))
            ]
        if token_ids_1 is None:
            return [1] + [0] * len(token_ids_0) + [1]
        return [1] + [0] * len(token_ids_0) + [1] + [0] * len(token_ids_1) + [1]
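
    # Illustrative layout (hand-traced) for token id lists a = [5, 6], b = [7]:
    #   build_inputs_with_special_tokens(a, b)      -> [CLS, 5, 6, SEP, 7, SEP]
    #   get_special_tokens_mask(a, b)               -> [1, 0, 0, 1, 0, 1]
    #   create_token_type_ids_from_sequences(a, b)  -> [0, 0, 0, 0, 0, 0]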

    def save_vocabulary(self, save_directory, filename_prefix=None):
        # Follow the usual Hugging Face naming scheme: "<prefix>-vocab.json"
        # when a prefix is given, plain "vocab.json" otherwise.
        filename = ("" if filename_prefix is None else filename_prefix + "-") + "vocab.json"
        path = Path(save_directory) / filename
        path.parent.mkdir(parents=True, exist_ok=True)
        with path.open("w", encoding="utf-8") as f:
            json.dump(
                {str(k): v for k, v in self.vocab.items()},
                f,
                ensure_ascii=False,
                indent=2
            )
        return (str(path),)

    def get_vocab(self):
        return self.vocab

    @property
    def vocab_size(self):
        return len(self.vocab)
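

def _demo_tokenizer():
    # Minimal end-to-end sketch (illustrative, not part of the tokenizer
    # itself): build a tiny vocabulary from a single line, round-trip it
    # through GreekSyllableTokenizer, and print the syllable tokens. The
    # temporary vocab.json is an assumption for the demo; a real vocabulary
    # would be built from a full corpus.
    import tempfile

    line = "μῆνιν ἄειδε θεά"
    syllables = syllabify_joined(preprocess_greek_line(line))

    with tempfile.TemporaryDirectory() as tmp:
        vocab_path = Path(tmp) / "vocab.json"
        vocab = {syl: i for i, syl in enumerate(sorted(set(syllables)))}
        vocab_path.write_text(json.dumps(vocab, ensure_ascii=False), encoding="utf-8")

        tokenizer = GreekSyllableTokenizer(str(vocab_path))
        ids = tokenizer.encode(line)
        print(tokenizer.convert_ids_to_tokens(ids))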