Charlie81 committed on
Commit
5b01886
·
1 Parent(s): d7f70e5

claudeattempt dataset

Browse files
Files changed (1) hide show
  1. scripts/train.py +15 -4
scripts/train.py CHANGED
@@ -11,6 +11,19 @@ from datasets import load_dataset
11
  from myolmoe import MyOlmoeForCausalLM, OlmoeConfig
12
  import os
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  def main():
15
  # Load config - first try from local file, then from pretrained
16
  config_path = os.path.join("myolmoe", "config.json")
@@ -33,10 +46,8 @@ def main():
33
  tokenizer.pad_token = tokenizer.eos_token
34
 
35
  # Load dataset
36
- os.environ["HF_DATASETS_CACHE"] = "/tmp/hf-datasets-cache" # or any valid path
37
- dataset = load_dataset("allenai/tulu-v2-sft-mixture", split="train", download_mode="force_redownload")
38
-
39
-
40
  def tokenize_function(examples):
41
  return tokenizer(
42
  examples["text"],
 
11
  from myolmoe import MyOlmoeForCausalLM, OlmoeConfig
12
  import os
13
 
14
+ import datasets
15
+
16
+ # Clear the cache
17
+ datasets.disable_caching()
18
+
19
+ # Or manually clear cache
20
+ import shutil
21
+ import os
22
+ cache_dir = os.path.expanduser("~/.cache/huggingface/datasets")
23
+ if os.path.exists(cache_dir):
24
+ shutil.rmtree(cache_dir)
25
+
26
+
27
  def main():
28
  # Load config - first try from local file, then from pretrained
29
  config_path = os.path.join("myolmoe", "config.json")
 
46
  tokenizer.pad_token = tokenizer.eos_token
47
 
48
  # Load dataset
49
+ dataset = load_dataset("allenai/tulu-v2-sft-mixture", split="train")
50
+
 
 
51
  def tokenize_function(examples):
52
  return tokenizer(
53
  examples["text"],