File size: 750 Bytes
a5e8634
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# Status banner emitted before the library imports below are executed, so it
# is the first output of the script.
print("[*] Loading libraries...")
from datasets import load_dataset
from tokenizers import ByteLevelBPETokenizer
from tqdm import tqdm

# Module-level handle on the "train" split of the "sample-10BT" configuration
# of HuggingFaceFW/fineweb-edu, requested with streaming=True.
dataset = load_dataset(
    "HuggingFaceFW/fineweb-edu",
    "sample-10BT",
    split="train",
    streaming=True,
)
def get_training_corpus(num_samples=1_000_000):
    """Yield the ``"text"`` field of up to ``num_samples`` dataset records.

    Records are read in order from the module-level ``dataset``; a tqdm bar
    labelled "Feeding data" tracks how many have been requested.

    Args:
        num_samples: Maximum number of records to yield. Defaults to
            1_000_000, the value that was previously hard-coded.

    Yields:
        The value stored under the ``"text"`` key of each record.
    """
    progress = tqdm(range(num_samples), desc="Feeding data")
    # zip() ends the loop cleanly if the dataset runs out before the limit.
    # A bare next() inside a generator would instead surface that
    # StopIteration as a RuntimeError (PEP 479). The progress iterator is
    # listed first so no extra record is pulled once the limit is reached.
    for _, record in zip(progress, dataset):
        yield record["text"]

# Build a byte-level BPE tokenizer with the constructor's default settings.
tokenizer = ByteLevelBPETokenizer()

print("[*] Training tokenizer...")

# Training options, gathered in one place and unpacked into the call below.
training_options = {
    "vocab_size": 8192,
    "min_frequency": 2,
    "show_progress": True,
    "special_tokens": ["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
}

# The corpus generator is consumed lazily by the trainer.
tokenizer.train_from_iterator(get_training_corpus(), **training_options)

# NOTE(review): per the tokenizers API the first argument is the output
# directory and the second a filename prefix — confirm against the docs.
output_dir = "."
file_prefix = "custom_llama_tokenizer"
tokenizer.save_model(output_dir, file_prefix)
print("[*] Tokenizer training complete!")