| |
|
|
| import datasets |
| import os |
| import glob |
| import tqdm |
| from numpy.random import default_rng |
| from itertools import product |
|
|
# Module-level logger, routed through the datasets library's logging config.
logger = datasets.logging.get_logger(__name__)


_DESCRIPTION = """\
Pre-tokenized BabyLM HuggingFace dataset for verb perturbations.
"""
# Model whose tokenizer presumably produced the pre-tokenized data on disk
# (the path below is keyed on it) — confirm against the tokenization step.
MODEL_NAME = "Llama-3.2-3B"
# Root of the perturbed, pre-tokenized corpora, relative to this script.
_PERTURBED_DATA_PATH = f"../data/Perturbed_data/{MODEL_NAME}"
# Every perturbation variant; one BuilderConfig is generated per entry
# (crossed with _TRAIN_SETS and _RANDOM_SEEDS below).
_PERTURBATIONS = ["hop_control", "hop_tokens4", "hop_words4",
                  "reverse_control", "reverse_partial", "reverse_full",
                  "shuffle_control", "shuffle_nondeterministic",
                  "shuffle_deterministic21", "shuffle_deterministic57", "shuffle_deterministic84",
                  "shuffle_local3", "shuffle_local5", "shuffle_local10",
                  "shuffle_even_odd"]

# Seeds for the sentence-shuffling RNG in _generate_examples.
_RANDOM_SEEDS = [0]

# BabyLM training-set size tags (selects the "babylm_<size>" subdirectory).
_TRAIN_SETS = ["10M"]
# Token id inserted between sentences when concatenating.
# NOTE(review): 50256 is the GPT-2 EOS id, but MODEL_NAME is Llama-3.2-3B,
# whose tokenizer uses different special-token ids — verify this matches the
# tokenizer that produced the data files.
_EOS_TOKEN_ID = 50256
|
|
|
|
class BabyConfig(datasets.BuilderConfig):
    """BuilderConfig for one perturbed, pre-tokenized BabyLM corpus variant."""

    def __init__(self, data_dir, babylm_train_set, random_seed, **kwargs):
        """BuilderConfig for BabyLMCorpus.

        Args:
            data_dir: path to the directory holding the tokenized, perturbed
                BabyLM dataset for one perturbation variant.
            babylm_train_set: training-set size tag (e.g. "10M") used to
                select the "babylm_<size>" train subdirectory.
            random_seed: seed for the RNG that shuffles sentences when
                examples are generated.
            **kwargs: forwarded to datasets.BuilderConfig (e.g. name).
        """
        # Zero-argument super() — equivalent to the legacy two-argument form.
        super().__init__(**kwargs)
        self.data_dir = data_dir
        self.babylm_train_set = babylm_train_set
        self.random_seed = random_seed
|
|
|
|
class BabyLMCorpus(datasets.GeneratorBasedBuilder):
    """Builder for the pre-tokenized BabyLM verb-perturbation corpora.

    One config exists per (perturbation, train-set size, random seed)
    combination; examples are fixed-length sequences of space-separated
    token ids.
    """

    BUILDER_CONFIGS = [
        BabyConfig(
            name=f"babylm_{perturbation}_{train_set}_seed{random_seed}",
            data_dir=os.path.join(
                _PERTURBED_DATA_PATH, "babylm_" + perturbation),
            babylm_train_set=train_set,
            random_seed=random_seed,
        )
        # product() is already iterable; no list() materialization needed.
        for perturbation, train_set, random_seed in product(
            _PERTURBATIONS, _TRAIN_SETS, _RANDOM_SEEDS)
    ]

    def _info(self):
        """Return dataset metadata: a single string feature named "text"."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features({"text": datasets.Value("string")}),
            supervised_keys=None,
        )

    def _split_generators(self, dl_manager):
        """Map the train/validation/test splits to their on-disk subdirectories."""

        def _kwargs(subdir, split):
            # All splits live under the same perturbation dir; only the
            # subdirectory and the split tag differ.
            return {
                "data_dir": os.path.join(self.config.data_dir, subdir),
                "random_seed": self.config.random_seed,
                "split": split,
            }

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs=_kwargs(
                    "babylm_" + self.config.babylm_train_set, "train"),
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs=_kwargs("babylm_dev", "valid"),
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs=_kwargs("babylm_test_affected", "test"),
            ),
        ]

    def __chunk(self, sentences, eos_token, max_seq_len=1024):
        """Concatenate tokenized sentences (EOS-separated) into fixed chunks.

        Args:
            sentences: iterable of strings of space-separated integer token
                ids, one sentence per string.
            eos_token: token id appended after every sentence.
            max_seq_len: chunk length (default 1024, matching the original
                hard-coded value); a trailing partial chunk is dropped.

        Returns:
            List of token-id lists, each exactly max_seq_len long. Empty
            input yields an empty list.
        """
        logger.info("Loading pre-tokenized data")
        # Tokenize and concatenate in one pass; the EOS token separates
        # consecutive sentences in the flat stream.
        all_tokens = []
        for sent in tqdm.tqdm(sentences):
            all_tokens.extend(int(tok) for tok in sent.split())
            all_tokens.append(eos_token)

        logger.info("Chunking tokens into sublists of %d", max_seq_len)
        chunked_tokens = [
            all_tokens[i:i + max_seq_len]
            for i in range(0, len(all_tokens), max_seq_len)
        ]

        # Drop a trailing partial chunk. The emptiness guard fixes an
        # IndexError the original raised on chunked_tokens[-1] when the
        # input produced no tokens at all.
        if chunked_tokens and len(chunked_tokens[-1]) < max_seq_len:
            chunked_tokens.pop()

        return chunked_tokens

    def _generate_examples(self, data_dir, random_seed, split):
        """Yield (idx, {"text": ...}) examples of space-separated token ids.

        Reads every file under data_dir, shuffles sentences with a seeded
        RNG, then chunks them into fixed-length token sequences.
        """
        logger.info("Generating examples from = %s", data_dir)
        infiles = sorted(glob.glob(os.path.join(data_dir, "*")))

        all_sentences = []
        for infile in infiles:
            # Context manager closes each file; the original leaked every
            # handle it opened.
            with open(infile, encoding="utf-8") as f:
                all_sentences.extend(f.readlines())
        # Lazy %-style args instead of eager str.format in the hot path.
        logger.info("Total sentences: %d", len(all_sentences))

        # Deterministic per-config shuffle (applied to every split, as in
        # the original).
        rng = default_rng(seed=random_seed)
        rng.shuffle(all_sentences)

        tokenized_lines = self.__chunk(all_sentences, _EOS_TOKEN_ID)

        logger.info("Writing dataset as space-separated sequences of tokens")
        for idx, line in enumerate(tokenized_lines):
            yield idx, {"text": " ".join(str(tok) for tok in line) + "\n"}