import multiprocessing
from itertools import chain

from datasets import load_dataset
from transformers import AutoTokenizer

|
class DatasetBuilder:
    """Tokenize a streamed Hugging Face dataset and pack it into fixed-length blocks."""

    def __init__(
        self,
        dataset_name,
        seq_len=8192,
        num_cpu=None,
        hf_account_repo=None,
        tokenizer="EleutherAI/gpt-neox-20b",
    ):
        self.dataset_name = dataset_name
        self.seq_len = seq_len
        self.num_cpu = num_cpu or multiprocessing.cpu_count()
        self.hf_account_repo = hf_account_repo
        self.tokenizer = tokenizer
|
    def build_dataset(self):
        tokenizer = AutoTokenizer.from_pretrained(self.tokenizer)
        # Stream the raw training split so the full corpus never has to fit in memory.
        train_dataset = load_dataset(self.dataset_name, split="train", streaming=True)
        dataset = train_dataset.shuffle()

        def tokenize_function(example):
            # Append the EOS token to each document before tokenizing the batch.
            return tokenizer([t + tokenizer.eos_token for t in example["text"]])

        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=["text"],
        )
|
        block_size = self.seq_len

        def group_texts(examples):
            # Concatenate every tokenized column in the batch into one long list.
            concatenated_examples = {
                k: list(chain(*examples[k])) for k in examples.keys()
            }
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # Drop the trailing remainder so every block is exactly block_size tokens.
            if total_length >= block_size:
                total_length = (total_length // block_size) * block_size
            # Split the concatenated tokens into consecutive block_size chunks.
            result = {
                k: [
                    t[i : i + block_size]
                    for i in range(0, total_length, block_size)
                ]
                for k, t in concatenated_examples.items()
            }
            return result

        train_tokenized_dataset = tokenized_dataset.map(
            group_texts,
            batched=True,
        )
|
        if self.hf_account_repo:
            # Push the processed dataset to the Hub. Note: push_to_hub on a
            # streaming IterableDataset requires a recent `datasets` release;
            # with older versions the stream must be materialized into a
            # Dataset first.
            train_tokenized_dataset.push_to_hub(self.hf_account_repo)

        return train_tokenized_dataset
|
|
builder = DatasetBuilder(
    dataset_name="the_pile_books3",
    seq_len=8192,
    hf_account_repo="kye/thepilebooks3-gptneox-8k",
    tokenizer="EleutherAI/gpt-neox-20b",
)

dataset = builder.build_dataset()
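
# Illustrative check (a sketch, not part of the original pipeline): since the
# split is streamed, peek at the first grouped example to confirm each block is
# exactly seq_len tokens. Assumes the tokenizer emits an "input_ids" column.
first_example = next(iter(dataset))
print(len(first_example["input_ids"]))  # expected: 8192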
|
|