import json

import sentencepiece as spm
from transformers import T5Tokenizer
|
|
# Stream the transliteration pairs out of the cleaned JSONL corpus into a plain
# text file (one sentence per line), capped at 1,000,000 records, so SentencePiece
# can train on it directly.
with open("src/data/tokeniser_corpus.txt", "w", encoding = "utf-8") as f_out:
    with open("src/data/clean_corpus.jsonl", "r", encoding = "utf-8") as f_in:
        for i, line in enumerate(f_in):
            if i >= 1000000:
                break

            item = json.loads(line)
            src = item["transliteration"]["src"]
            tgt = item["transliteration"]["tgt"]

            f_out.write(src + "\n")
            f_out.write(tgt + "\n")
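
# Optional sanity check (an addition, not in the original script): count the
# lines just written so an empty or truncated corpus file is caught before
# tokeniser training starts.
with open("src/data/tokeniser_corpus.txt", "r", encoding = "utf-8") as f:
    n_lines = sum(1 for _ in f)
print(f"Wrote {n_lines} lines to src/data/tokeniser_corpus.txt")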
|
|
# Train a unigram SentencePiece model on the streamed corpus. The special-token
# IDs are fixed explicitly; pad_id/unk_id/bos_id/eos_id already reserve the
# <pad>, <unk>, <s> and </s> pieces, so they must not be repeated in
# user_defined_symbols (SentencePiece rejects duplicate symbol definitions).
spm.SentencePieceTrainer.Train(
    input = "src/data/tokeniser_corpus.txt",
    model_prefix = "src/tokeniser/dalat5_sp",
    vocab_size = 40000,
    model_type = "unigram",
    character_coverage = 1.0,
    max_sentence_length = 8384,
    pad_id = 0,
    unk_id = 1,
    bos_id = 2,
    eos_id = 3
)
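
# Quick verification sketch (an addition, not in the original script): reload the
# trained model with the raw SentencePiece API and confirm the vocabulary size
# and the reserved special-token IDs came out as configured above.
sp = spm.SentencePieceProcessor(model_file = "src/tokeniser/dalat5_sp.model")
print("vocab size:", sp.get_piece_size())
print("pad/unk/bos/eos ids:", sp.pad_id(), sp.unk_id(), sp.bos_id(), sp.eos_id())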
|
|
# Wrap the raw SentencePiece model in a T5Tokenizer so it can be reloaded later
# with from_pretrained / AutoTokenizer. Passing the .model file as vocab_file is
# the supported way to build the slow tokenizer from a single SentencePiece file.
tokenizer = T5Tokenizer(vocab_file = "src/tokeniser/dalat5_sp.model")

# Persist tokenizer_config.json, special_tokens_map.json and the vocabulary file.
tokenizer.save_pretrained("src/tokeniser/")
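
# Minimal usage sketch (the sample string below is a placeholder, not taken from
# the corpus): round-trip a sentence through the wrapped tokenizer to confirm
# encoding and decoding behave sensibly. T5Tokenizer appends </s> automatically.
sample = "a short test sentence"  # placeholder text
ids = tokenizer(sample).input_ids
print("token ids:", ids)
print("decoded:", tokenizer.decode(ids, skip_special_tokens = True))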