claudeattempt dataset
Browse files- scripts/train.py +15 -4
scripts/train.py
CHANGED
|
@@ -11,6 +11,19 @@ from datasets import load_dataset
|
|
| 11 |
from myolmoe import MyOlmoeForCausalLM, OlmoeConfig
|
| 12 |
import os
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
def main():
|
| 15 |
# Load config - first try from local file, then from pretrained
|
| 16 |
config_path = os.path.join("myolmoe", "config.json")
|
|
@@ -33,10 +46,8 @@ def main():
|
|
| 33 |
tokenizer.pad_token = tokenizer.eos_token
|
| 34 |
|
| 35 |
# Load dataset
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
def tokenize_function(examples):
|
| 41 |
return tokenizer(
|
| 42 |
examples["text"],
|
|
|
|
| 11 |
from myolmoe import MyOlmoeForCausalLM, OlmoeConfig
|
| 12 |
import os
|
| 13 |
|
| 14 |
+
import datasets
|
| 15 |
+
|
| 16 |
+
# Disable caching of processed datasets (this does not delete existing cache files)
|
| 17 |
+
datasets.disable_caching()
|
| 18 |
+
|
| 19 |
+
# Additionally, manually delete the on-disk Hugging Face datasets cache directory
|
| 20 |
+
import shutil
|
| 21 |
+
import os
|
| 22 |
+
cache_dir = os.path.expanduser("~/.cache/huggingface/datasets")
|
| 23 |
+
if os.path.exists(cache_dir):
|
| 24 |
+
shutil.rmtree(cache_dir)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
def main():
|
| 28 |
# Load config - first try from local file, then from pretrained
|
| 29 |
config_path = os.path.join("myolmoe", "config.json")
|
|
|
|
| 46 |
tokenizer.pad_token = tokenizer.eos_token
|
| 47 |
|
| 48 |
# Load dataset
|
| 49 |
+
dataset = load_dataset("allenai/tulu-v2-sft-mixture", split="train")
|
| 50 |
+
|
|
|
|
|
|
|
| 51 |
def tokenize_function(examples):
|
| 52 |
return tokenizer(
|
| 53 |
examples["text"],
|