RetrO21 committed on
Commit
7048260
·
verified ·
1 Parent(s): 82e5deb

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. README.md +3 -22
  3. adapter_config.json +5 -3
  4. adapter_model.safetensors +2 -2
  5. checkpoint-10422/README.md +209 -0
  6. checkpoint-10422/adapter_config.json +43 -0
  7. checkpoint-10422/adapter_model.safetensors +3 -0
  8. checkpoint-10422/added_tokens.json +16 -0
  9. checkpoint-10422/chat_template.jinja +7 -0
  10. checkpoint-10422/merges.txt +0 -0
  11. checkpoint-10422/optimizer.pt +3 -0
  12. checkpoint-10422/rng_state.pth +3 -0
  13. checkpoint-10422/scheduler.pt +3 -0
  14. checkpoint-10422/special_tokens_map.json +31 -0
  15. checkpoint-10422/tokenizer.json +3 -0
  16. checkpoint-10422/tokenizer_config.json +143 -0
  17. checkpoint-10422/trainer_state.json +2186 -0
  18. checkpoint-10422/training_args.bin +3 -0
  19. checkpoint-10422/vocab.json +0 -0
  20. checkpoint-1737/adapter_config.json +5 -3
  21. checkpoint-1737/adapter_model.safetensors +2 -2
  22. checkpoint-1737/optimizer.pt +2 -2
  23. checkpoint-1737/rng_state.pth +1 -1
  24. checkpoint-1737/scheduler.pt +1 -1
  25. checkpoint-1737/trainer_state.json +216 -216
  26. checkpoint-1737/training_args.bin +1 -1
  27. checkpoint-3474/adapter_config.json +5 -3
  28. checkpoint-3474/adapter_model.safetensors +2 -2
  29. checkpoint-3474/optimizer.pt +2 -2
  30. checkpoint-3474/rng_state.pth +1 -1
  31. checkpoint-3474/scheduler.pt +1 -1
  32. checkpoint-3474/trainer_state.json +434 -434
  33. checkpoint-3474/training_args.bin +1 -1
  34. checkpoint-5211/adapter_config.json +5 -3
  35. checkpoint-5211/adapter_model.safetensors +2 -2
  36. checkpoint-5211/optimizer.pt +2 -2
  37. checkpoint-5211/rng_state.pth +1 -1
  38. checkpoint-5211/scheduler.pt +1 -1
  39. checkpoint-5211/trainer_state.json +652 -652
  40. checkpoint-5211/training_args.bin +1 -1
  41. checkpoint-6948/adapter_config.json +5 -3
  42. checkpoint-6948/adapter_model.safetensors +2 -2
  43. checkpoint-6948/optimizer.pt +2 -2
  44. checkpoint-6948/rng_state.pth +1 -1
  45. checkpoint-6948/scheduler.pt +1 -1
  46. checkpoint-6948/trainer_state.json +865 -865
  47. checkpoint-6948/training_args.bin +1 -1
  48. checkpoint-8685/adapter_config.json +4 -2
  49. checkpoint-8685/adapter_model.safetensors +2 -2
  50. checkpoint-8685/optimizer.pt +2 -2
.gitattributes CHANGED
@@ -39,3 +39,4 @@ checkpoint-5211/tokenizer.json filter=lfs diff=lfs merge=lfs -text
39
  checkpoint-6948/tokenizer.json filter=lfs diff=lfs merge=lfs -text
40
  checkpoint-8685/tokenizer.json filter=lfs diff=lfs merge=lfs -text
41
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
39
  checkpoint-6948/tokenizer.json filter=lfs diff=lfs merge=lfs -text
40
  checkpoint-8685/tokenizer.json filter=lfs diff=lfs merge=lfs -text
41
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
42
+ checkpoint-10422/tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,28 +1,9 @@
1
  ---
2
  base_model: Qwen/Qwen2-VL-2B-Instruct
3
  library_name: peft
4
- model_name: output
5
  tags:
6
- - adapter
7
  - lora
8
- - sft
9
- - transformers
10
- - trl
11
- license: apache-2.0
12
- pipeline_tag: text-generation
13
  ---
14
-
15
- # Model Card for output
16
-
17
- This model is a LoRA fine-tuned version of
18
- [Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct).
19
-
20
- It has been trained using the TRL SFT pipeline.
21
-
22
- ## Quick start
23
-
24
- ```python
25
- from transformers import pipeline
26
-
27
- pipe = pipeline("text-generation", model="RetrO21/agrofinetune", device="cuda")
28
- print(pipe("What is nitrogen deficiency?")[0]["generated_text"])
 
1
  ---
2
  base_model: Qwen/Qwen2-VL-2B-Instruct
3
  library_name: peft
 
4
  tags:
 
5
  - lora
6
+ - qwen2-vl
7
+ - adapter
8
+ - vision-language
 
 
9
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
adapter_config.json CHANGED
@@ -16,7 +16,7 @@
16
  "layers_pattern": null,
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
- "lora_alpha": 16,
20
  "lora_bias": false,
21
  "lora_dropout": 0.1,
22
  "megatron_config": null,
@@ -25,12 +25,14 @@
25
  "peft_type": "LORA",
26
  "peft_version": "0.18.0",
27
  "qalora_group_size": 16,
28
- "r": 8,
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
  "q_proj",
33
- "v_proj"
 
 
34
  ],
35
  "target_parameters": null,
36
  "task_type": "CAUSAL_LM",
 
16
  "layers_pattern": null,
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
+ "lora_alpha": 32,
20
  "lora_bias": false,
21
  "lora_dropout": 0.1,
22
  "megatron_config": null,
 
25
  "peft_type": "LORA",
26
  "peft_version": "0.18.0",
27
  "qalora_group_size": 16,
28
+ "r": 24,
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
  "q_proj",
33
+ "k_proj",
34
+ "v_proj",
35
+ "o_proj"
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:76b5201211b5dac5150a2b3a87809a5671a1239a76fdfafed2618f15a157a612
3
- size 4374520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a42655e5c5bf5a17388c99c67741b81d97a904a649f92d5298361717c78abaac
3
+ size 26182176
checkpoint-10422/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: ''
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2-VL-2B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.18.0
checkpoint-10422/adapter_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2-VL-2B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.1,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.0",
27
+ "qalora_group_size": 16,
28
+ "r": 8,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "k_proj",
33
+ "v_proj",
34
+ "q_proj",
35
+ "o_proj"
36
+ ],
37
+ "target_parameters": null,
38
+ "task_type": "CAUSAL_LM",
39
+ "trainable_token_indices": null,
40
+ "use_dora": false,
41
+ "use_qalora": false,
42
+ "use_rslora": false
43
+ }
checkpoint-10422/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:267663f0833a741d0dff42c5b9b564413305e7f5771f9bc4b0265a1464819af9
3
+ size 8749064
checkpoint-10422/added_tokens.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<|box_end|>": 151649,
3
+ "<|box_start|>": 151648,
4
+ "<|endoftext|>": 151643,
5
+ "<|im_end|>": 151645,
6
+ "<|im_start|>": 151644,
7
+ "<|image_pad|>": 151655,
8
+ "<|object_ref_end|>": 151647,
9
+ "<|object_ref_start|>": 151646,
10
+ "<|quad_end|>": 151651,
11
+ "<|quad_start|>": 151650,
12
+ "<|video_pad|>": 151656,
13
+ "<|vision_end|>": 151653,
14
+ "<|vision_pad|>": 151654,
15
+ "<|vision_start|>": 151652
16
+ }
checkpoint-10422/chat_template.jinja ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
2
+ You are a helpful assistant.<|im_end|>
3
+ {% endif %}<|im_start|>{{ message['role'] }}
4
+ {% if message['content'] is string %}{{ message['content'] }}<|im_end|>
5
+ {% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
6
+ {% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
7
+ {% endif %}
checkpoint-10422/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-10422/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8616e361fa88f6623812655f00db40780ff24fdbf0e3076512427426785c35f
3
+ size 17621003
checkpoint-10422/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1b0f920b0d7a4950d6a2138471d4f887a620a6c310ba5c47cd3fdb370773865
3
+ size 14645
checkpoint-10422/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be9fd8f2e5282151739ba25b300ba9565edc934ba51e045424e2e775d92c6b36
3
+ size 1465
checkpoint-10422/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
checkpoint-10422/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f33787292af226c4a4842be48a0e614d9524e25dc248e48bb1af0593de5564f9
3
+ size 11420539
checkpoint-10422/tokenizer_config.json ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "151646": {
29
+ "content": "<|object_ref_start|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "151647": {
37
+ "content": "<|object_ref_end|>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "151648": {
45
+ "content": "<|box_start|>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "151649": {
53
+ "content": "<|box_end|>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "151650": {
61
+ "content": "<|quad_start|>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "151651": {
69
+ "content": "<|quad_end|>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "151652": {
77
+ "content": "<|vision_start|>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "151653": {
85
+ "content": "<|vision_end|>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "151654": {
93
+ "content": "<|vision_pad|>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "151655": {
101
+ "content": "<|image_pad|>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "151656": {
109
+ "content": "<|video_pad|>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ }
116
+ },
117
+ "additional_special_tokens": [
118
+ "<|im_start|>",
119
+ "<|im_end|>",
120
+ "<|object_ref_start|>",
121
+ "<|object_ref_end|>",
122
+ "<|box_start|>",
123
+ "<|box_end|>",
124
+ "<|quad_start|>",
125
+ "<|quad_end|>",
126
+ "<|vision_start|>",
127
+ "<|vision_end|>",
128
+ "<|vision_pad|>",
129
+ "<|image_pad|>",
130
+ "<|video_pad|>"
131
+ ],
132
+ "bos_token": null,
133
+ "clean_up_tokenization_spaces": false,
134
+ "eos_token": "<|im_end|>",
135
+ "errors": "replace",
136
+ "extra_special_tokens": {},
137
+ "model_max_length": 32768,
138
+ "pad_token": "<|endoftext|>",
139
+ "padding_side": "right",
140
+ "split_special_tokens": false,
141
+ "tokenizer_class": "Qwen2Tokenizer",
142
+ "unk_token": null
143
+ }
checkpoint-10422/trainer_state.json ADDED
@@ -0,0 +1,2186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 10422,
3
+ "best_metric": 6.123514175415039,
4
+ "best_model_checkpoint": "./output/checkpoint-10422",
5
+ "epoch": 6.0,
6
+ "eval_steps": 500,
7
+ "global_step": 10422,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 3.852523431777954,
14
+ "epoch": 0.028785261945883708,
15
+ "grad_norm": 6.76995325088501,
16
+ "learning_rate": 4.9e-07,
17
+ "loss": 15.3204,
18
+ "mean_token_accuracy": 0.10081263825297355,
19
+ "num_tokens": 47319.0,
20
+ "step": 50
21
+ },
22
+ {
23
+ "entropy": 3.862936944961548,
24
+ "epoch": 0.057570523891767415,
25
+ "grad_norm": 6.614922046661377,
26
+ "learning_rate": 9.9e-07,
27
+ "loss": 15.6327,
28
+ "mean_token_accuracy": 0.09312776654958725,
29
+ "num_tokens": 96809.0,
30
+ "step": 100
31
+ },
32
+ {
33
+ "entropy": 3.786734127998352,
34
+ "epoch": 0.08635578583765112,
35
+ "grad_norm": 7.106786727905273,
36
+ "learning_rate": 1.49e-06,
37
+ "loss": 14.619,
38
+ "mean_token_accuracy": 0.11782759010791778,
39
+ "num_tokens": 139962.0,
40
+ "step": 150
41
+ },
42
+ {
43
+ "entropy": 3.929054970741272,
44
+ "epoch": 0.11514104778353483,
45
+ "grad_norm": 8.141215324401855,
46
+ "learning_rate": 1.99e-06,
47
+ "loss": 15.1343,
48
+ "mean_token_accuracy": 0.10053366936743259,
49
+ "num_tokens": 188029.0,
50
+ "step": 200
51
+ },
52
+ {
53
+ "entropy": 4.070261402130127,
54
+ "epoch": 0.14392630972941853,
55
+ "grad_norm": 8.59919261932373,
56
+ "learning_rate": 1.9904128350616315e-06,
57
+ "loss": 14.4973,
58
+ "mean_token_accuracy": 0.11404540404677391,
59
+ "num_tokens": 234425.0,
60
+ "step": 250
61
+ },
62
+ {
63
+ "entropy": 4.341845245361328,
64
+ "epoch": 0.17271157167530224,
65
+ "grad_norm": 9.0352783203125,
66
+ "learning_rate": 1.98063001369595e-06,
67
+ "loss": 14.0316,
68
+ "mean_token_accuracy": 0.11713312789797783,
69
+ "num_tokens": 278885.0,
70
+ "step": 300
71
+ },
72
+ {
73
+ "entropy": 4.582782163619995,
74
+ "epoch": 0.20149683362118595,
75
+ "grad_norm": 12.131658554077148,
76
+ "learning_rate": 1.970847192330268e-06,
77
+ "loss": 13.6186,
78
+ "mean_token_accuracy": 0.11667421050369739,
79
+ "num_tokens": 325491.0,
80
+ "step": 350
81
+ },
82
+ {
83
+ "entropy": 5.054569063186645,
84
+ "epoch": 0.23028209556706966,
85
+ "grad_norm": 20.180458068847656,
86
+ "learning_rate": 1.961064370964586e-06,
87
+ "loss": 13.175,
88
+ "mean_token_accuracy": 0.11196398630738258,
89
+ "num_tokens": 372913.0,
90
+ "step": 400
91
+ },
92
+ {
93
+ "entropy": 5.657191934585572,
94
+ "epoch": 0.25906735751295334,
95
+ "grad_norm": 29.541980743408203,
96
+ "learning_rate": 1.9512815495989045e-06,
97
+ "loss": 11.8662,
98
+ "mean_token_accuracy": 0.11793715238571167,
99
+ "num_tokens": 419159.0,
100
+ "step": 450
101
+ },
102
+ {
103
+ "entropy": 6.518256826400757,
104
+ "epoch": 0.28785261945883706,
105
+ "grad_norm": 38.03327941894531,
106
+ "learning_rate": 1.9414987282332225e-06,
107
+ "loss": 10.1026,
108
+ "mean_token_accuracy": 0.1128273394703865,
109
+ "num_tokens": 466994.0,
110
+ "step": 500
111
+ },
112
+ {
113
+ "entropy": 6.959106483459473,
114
+ "epoch": 0.31663788140472077,
115
+ "grad_norm": 19.534976959228516,
116
+ "learning_rate": 1.9317159068675404e-06,
117
+ "loss": 8.5709,
118
+ "mean_token_accuracy": 0.12011336177587509,
119
+ "num_tokens": 513308.0,
120
+ "step": 550
121
+ },
122
+ {
123
+ "entropy": 7.0854248046875,
124
+ "epoch": 0.3454231433506045,
125
+ "grad_norm": 9.56103801727295,
126
+ "learning_rate": 1.921933085501859e-06,
127
+ "loss": 7.9893,
128
+ "mean_token_accuracy": 0.11719893589615822,
129
+ "num_tokens": 559679.0,
130
+ "step": 600
131
+ },
132
+ {
133
+ "entropy": 7.002319450378418,
134
+ "epoch": 0.3742084052964882,
135
+ "grad_norm": 7.537400245666504,
136
+ "learning_rate": 1.9121502641361767e-06,
137
+ "loss": 7.496,
138
+ "mean_token_accuracy": 0.13140189573168753,
139
+ "num_tokens": 603140.0,
140
+ "step": 650
141
+ },
142
+ {
143
+ "entropy": 7.21703164100647,
144
+ "epoch": 0.4029936672423719,
145
+ "grad_norm": 7.521403789520264,
146
+ "learning_rate": 1.902367442770495e-06,
147
+ "loss": 7.4496,
148
+ "mean_token_accuracy": 0.12282118022441864,
149
+ "num_tokens": 650179.0,
150
+ "step": 700
151
+ },
152
+ {
153
+ "entropy": 7.166195478439331,
154
+ "epoch": 0.4317789291882556,
155
+ "grad_norm": 5.631101608276367,
156
+ "learning_rate": 1.892584621404813e-06,
157
+ "loss": 7.2209,
158
+ "mean_token_accuracy": 0.1341453194618225,
159
+ "num_tokens": 696220.0,
160
+ "step": 750
161
+ },
162
+ {
163
+ "entropy": 7.179120960235596,
164
+ "epoch": 0.4605641911341393,
165
+ "grad_norm": 4.589099407196045,
166
+ "learning_rate": 1.8828018000391312e-06,
167
+ "loss": 7.1051,
168
+ "mean_token_accuracy": 0.13415986388921738,
169
+ "num_tokens": 743909.0,
170
+ "step": 800
171
+ },
172
+ {
173
+ "entropy": 7.132651596069336,
174
+ "epoch": 0.48934945308002303,
175
+ "grad_norm": 4.283458709716797,
176
+ "learning_rate": 1.8730189786734493e-06,
177
+ "loss": 6.9812,
178
+ "mean_token_accuracy": 0.13903394356369972,
179
+ "num_tokens": 792015.0,
180
+ "step": 850
181
+ },
182
+ {
183
+ "entropy": 7.095656795501709,
184
+ "epoch": 0.5181347150259067,
185
+ "grad_norm": 5.147945404052734,
186
+ "learning_rate": 1.8632361573077675e-06,
187
+ "loss": 6.8989,
188
+ "mean_token_accuracy": 0.14551860421895982,
189
+ "num_tokens": 841802.0,
190
+ "step": 900
191
+ },
192
+ {
193
+ "entropy": 6.79542833328247,
194
+ "epoch": 0.5469199769717904,
195
+ "grad_norm": 5.6646223068237305,
196
+ "learning_rate": 1.8534533359420857e-06,
197
+ "loss": 6.5807,
198
+ "mean_token_accuracy": 0.17714839324355125,
199
+ "num_tokens": 886492.0,
200
+ "step": 950
201
+ },
202
+ {
203
+ "entropy": 6.843292989730835,
204
+ "epoch": 0.5757052389176741,
205
+ "grad_norm": 4.838613033294678,
206
+ "learning_rate": 1.8436705145764038e-06,
207
+ "loss": 6.6068,
208
+ "mean_token_accuracy": 0.17419333070516585,
209
+ "num_tokens": 932807.0,
210
+ "step": 1000
211
+ },
212
+ {
213
+ "entropy": 6.8572557926177975,
214
+ "epoch": 0.6044905008635578,
215
+ "grad_norm": 4.200118541717529,
216
+ "learning_rate": 1.833887693210722e-06,
217
+ "loss": 6.6058,
218
+ "mean_token_accuracy": 0.1721690407395363,
219
+ "num_tokens": 980541.0,
220
+ "step": 1050
221
+ },
222
+ {
223
+ "entropy": 6.629744396209717,
224
+ "epoch": 0.6332757628094415,
225
+ "grad_norm": 5.378969192504883,
226
+ "learning_rate": 1.8241048718450401e-06,
227
+ "loss": 6.3675,
228
+ "mean_token_accuracy": 0.19682477086782454,
229
+ "num_tokens": 1023882.0,
230
+ "step": 1100
231
+ },
232
+ {
233
+ "entropy": 6.537129864692688,
234
+ "epoch": 0.6620610247553252,
235
+ "grad_norm": 6.9036736488342285,
236
+ "learning_rate": 1.814322050479358e-06,
237
+ "loss": 6.2659,
238
+ "mean_token_accuracy": 0.20461874470114708,
239
+ "num_tokens": 1068300.0,
240
+ "step": 1150
241
+ },
242
+ {
243
+ "entropy": 6.6863781929016115,
244
+ "epoch": 0.690846286701209,
245
+ "grad_norm": 4.751266002655029,
246
+ "learning_rate": 1.8045392291136762e-06,
247
+ "loss": 6.4062,
248
+ "mean_token_accuracy": 0.1890461677312851,
249
+ "num_tokens": 1115425.0,
250
+ "step": 1200
251
+ },
252
+ {
253
+ "entropy": 6.71572250366211,
254
+ "epoch": 0.7196315486470927,
255
+ "grad_norm": 3.819430351257324,
256
+ "learning_rate": 1.7947564077479944e-06,
257
+ "loss": 6.4309,
258
+ "mean_token_accuracy": 0.1853047838807106,
259
+ "num_tokens": 1162319.0,
260
+ "step": 1250
261
+ },
262
+ {
263
+ "entropy": 6.632803421020508,
264
+ "epoch": 0.7484168105929764,
265
+ "grad_norm": 6.341519832611084,
266
+ "learning_rate": 1.7849735863823125e-06,
267
+ "loss": 6.3439,
268
+ "mean_token_accuracy": 0.19532681837677957,
269
+ "num_tokens": 1208230.0,
270
+ "step": 1300
271
+ },
272
+ {
273
+ "entropy": 6.5115529870986935,
274
+ "epoch": 0.7772020725388601,
275
+ "grad_norm": 6.07994270324707,
276
+ "learning_rate": 1.7751907650166307e-06,
277
+ "loss": 6.227,
278
+ "mean_token_accuracy": 0.20865949630737304,
279
+ "num_tokens": 1253074.0,
280
+ "step": 1350
281
+ },
282
+ {
283
+ "entropy": 6.603812961578369,
284
+ "epoch": 0.8059873344847438,
285
+ "grad_norm": 5.038186073303223,
286
+ "learning_rate": 1.7654079436509488e-06,
287
+ "loss": 6.3205,
288
+ "mean_token_accuracy": 0.19723004043102266,
289
+ "num_tokens": 1300179.0,
290
+ "step": 1400
291
+ },
292
+ {
293
+ "entropy": 6.614377698898315,
294
+ "epoch": 0.8347725964306275,
295
+ "grad_norm": 5.706110000610352,
296
+ "learning_rate": 1.755625122285267e-06,
297
+ "loss": 6.3415,
298
+ "mean_token_accuracy": 0.19426312118768693,
299
+ "num_tokens": 1347539.0,
300
+ "step": 1450
301
+ },
302
+ {
303
+ "entropy": 6.535960426330567,
304
+ "epoch": 0.8635578583765112,
305
+ "grad_norm": 3.8547616004943848,
306
+ "learning_rate": 1.7458423009195851e-06,
307
+ "loss": 6.2563,
308
+ "mean_token_accuracy": 0.20262083023786545,
309
+ "num_tokens": 1394157.0,
310
+ "step": 1500
311
+ },
312
+ {
313
+ "entropy": 6.635546321868897,
314
+ "epoch": 0.8923431203223949,
315
+ "grad_norm": 6.530314922332764,
316
+ "learning_rate": 1.7360594795539033e-06,
317
+ "loss": 6.3569,
318
+ "mean_token_accuracy": 0.1919993445277214,
319
+ "num_tokens": 1443892.0,
320
+ "step": 1550
321
+ },
322
+ {
323
+ "entropy": 6.463044328689575,
324
+ "epoch": 0.9211283822682786,
325
+ "grad_norm": 4.981988430023193,
326
+ "learning_rate": 1.7262766581882212e-06,
327
+ "loss": 6.1768,
328
+ "mean_token_accuracy": 0.21415023148059845,
329
+ "num_tokens": 1491050.0,
330
+ "step": 1600
331
+ },
332
+ {
333
+ "entropy": 6.614933052062988,
334
+ "epoch": 0.9499136442141624,
335
+ "grad_norm": 3.6533243656158447,
336
+ "learning_rate": 1.7164938368225394e-06,
337
+ "loss": 6.3213,
338
+ "mean_token_accuracy": 0.19727844208478929,
339
+ "num_tokens": 1540809.0,
340
+ "step": 1650
341
+ },
342
+ {
343
+ "entropy": 6.38832573890686,
344
+ "epoch": 0.9786989061600461,
345
+ "grad_norm": 5.930168151855469,
346
+ "learning_rate": 1.7067110154568575e-06,
347
+ "loss": 6.0941,
348
+ "mean_token_accuracy": 0.22453365564346314,
349
+ "num_tokens": 1585876.0,
350
+ "step": 1700
351
+ },
352
+ {
353
+ "epoch": 1.0,
354
+ "eval_entropy": 6.647906507764544,
355
+ "eval_loss": 6.368417263031006,
356
+ "eval_mean_token_accuracy": 0.1902703122334546,
357
+ "eval_model_preparation_time": 0.0046,
358
+ "eval_num_tokens": 1619719.0,
359
+ "eval_runtime": 79.1489,
360
+ "eval_samples_per_second": 5.483,
361
+ "eval_steps_per_second": 2.742,
362
+ "step": 1737
363
+ },
364
+ {
365
+ "entropy": 6.413019351959228,
366
+ "epoch": 1.0074841681059297,
367
+ "grad_norm": 4.029679298400879,
368
+ "learning_rate": 1.6969281940911757e-06,
369
+ "loss": 6.1183,
370
+ "mean_token_accuracy": 0.2218746316432953,
371
+ "num_tokens": 1632015.0,
372
+ "step": 1750
373
+ },
374
+ {
375
+ "entropy": 6.508082237243652,
376
+ "epoch": 1.0362694300518134,
377
+ "grad_norm": 4.616228103637695,
378
+ "learning_rate": 1.687145372725494e-06,
379
+ "loss": 6.2151,
380
+ "mean_token_accuracy": 0.2117026337981224,
381
+ "num_tokens": 1681154.0,
382
+ "step": 1800
383
+ },
384
+ {
385
+ "entropy": 6.401120796203613,
386
+ "epoch": 1.065054691997697,
387
+ "grad_norm": 4.693721771240234,
388
+ "learning_rate": 1.6773625513598122e-06,
389
+ "loss": 6.1039,
390
+ "mean_token_accuracy": 0.22995314985513687,
391
+ "num_tokens": 1728110.0,
392
+ "step": 1850
393
+ },
394
+ {
395
+ "entropy": 6.3726827430725095,
396
+ "epoch": 1.0938399539435808,
397
+ "grad_norm": 3.4032232761383057,
398
+ "learning_rate": 1.6675797299941304e-06,
399
+ "loss": 6.0875,
400
+ "mean_token_accuracy": 0.23210913449525833,
401
+ "num_tokens": 1775703.0,
402
+ "step": 1900
403
+ },
404
+ {
405
+ "entropy": 6.328955335617065,
406
+ "epoch": 1.1226252158894645,
407
+ "grad_norm": 5.2645440101623535,
408
+ "learning_rate": 1.6577969086284485e-06,
409
+ "loss": 6.0535,
410
+ "mean_token_accuracy": 0.23541803926229476,
411
+ "num_tokens": 1821980.0,
412
+ "step": 1950
413
+ },
414
+ {
415
+ "entropy": 6.272669095993042,
416
+ "epoch": 1.1514104778353482,
417
+ "grad_norm": 3.2737088203430176,
418
+ "learning_rate": 1.6480140872627667e-06,
419
+ "loss": 6.0091,
420
+ "mean_token_accuracy": 0.24000794380903245,
421
+ "num_tokens": 1867547.0,
422
+ "step": 2000
423
+ },
424
+ {
425
+ "entropy": 6.346148128509522,
426
+ "epoch": 1.180195739781232,
427
+ "grad_norm": 29.210887908935547,
428
+ "learning_rate": 1.6382312658970846e-06,
429
+ "loss": 6.0883,
430
+ "mean_token_accuracy": 0.22887098014354706,
431
+ "num_tokens": 1915411.0,
432
+ "step": 2050
433
+ },
434
+ {
435
+ "entropy": 6.279903531074524,
436
+ "epoch": 1.2089810017271156,
437
+ "grad_norm": 3.977229356765747,
438
+ "learning_rate": 1.6284484445314028e-06,
439
+ "loss": 6.0318,
440
+ "mean_token_accuracy": 0.23324142932891845,
441
+ "num_tokens": 1964009.0,
442
+ "step": 2100
443
+ },
444
+ {
445
+ "entropy": 6.166584692001343,
446
+ "epoch": 1.2377662636729994,
447
+ "grad_norm": 3.1655149459838867,
448
+ "learning_rate": 1.618665623165721e-06,
449
+ "loss": 5.917,
450
+ "mean_token_accuracy": 0.24853385210037232,
451
+ "num_tokens": 2008595.0,
452
+ "step": 2150
453
+ },
454
+ {
455
+ "entropy": 6.322167301177979,
456
+ "epoch": 1.266551525618883,
457
+ "grad_norm": 2.878366708755493,
458
+ "learning_rate": 1.608882801800039e-06,
459
+ "loss": 6.0756,
460
+ "mean_token_accuracy": 0.2293447071313858,
461
+ "num_tokens": 2057229.0,
462
+ "step": 2200
463
+ },
464
+ {
465
+ "entropy": 6.205327453613282,
466
+ "epoch": 1.2953367875647668,
467
+ "grad_norm": 2.5909852981567383,
468
+ "learning_rate": 1.5990999804343572e-06,
469
+ "loss": 5.962,
470
+ "mean_token_accuracy": 0.24392724603414537,
471
+ "num_tokens": 2103631.0,
472
+ "step": 2250
473
+ },
474
+ {
475
+ "entropy": 6.157236385345459,
476
+ "epoch": 1.3241220495106505,
477
+ "grad_norm": 6.303485870361328,
478
+ "learning_rate": 1.5893171590686754e-06,
479
+ "loss": 5.9162,
480
+ "mean_token_accuracy": 0.24806494176387786,
481
+ "num_tokens": 2150369.0,
482
+ "step": 2300
483
+ },
484
+ {
485
+ "entropy": 6.351038675308228,
486
+ "epoch": 1.3529073114565342,
487
+ "grad_norm": 3.6839759349823,
488
+ "learning_rate": 1.5795343377029935e-06,
489
+ "loss": 6.1091,
490
+ "mean_token_accuracy": 0.22586097091436386,
491
+ "num_tokens": 2199724.0,
492
+ "step": 2350
493
+ },
494
+ {
495
+ "entropy": 6.134297747611999,
496
+ "epoch": 1.381692573402418,
497
+ "grad_norm": 2.5310072898864746,
498
+ "learning_rate": 1.5697515163373117e-06,
499
+ "loss": 5.9017,
500
+ "mean_token_accuracy": 0.24992096066474914,
501
+ "num_tokens": 2245341.0,
502
+ "step": 2400
503
+ },
504
+ {
505
+ "entropy": 6.322179689407348,
506
+ "epoch": 1.4104778353483016,
507
+ "grad_norm": 2.834397554397583,
508
+ "learning_rate": 1.5599686949716298e-06,
509
+ "loss": 6.0913,
510
+ "mean_token_accuracy": 0.22521918207407,
511
+ "num_tokens": 2294726.0,
512
+ "step": 2450
513
+ },
514
+ {
515
+ "entropy": 6.0980473279953005,
516
+ "epoch": 1.4392630972941853,
517
+ "grad_norm": 3.1855642795562744,
518
+ "learning_rate": 1.5501858736059478e-06,
519
+ "loss": 5.8841,
520
+ "mean_token_accuracy": 0.25087269872426987,
521
+ "num_tokens": 2342251.0,
522
+ "step": 2500
523
+ },
524
+ {
525
+ "entropy": 6.088696489334106,
526
+ "epoch": 1.468048359240069,
527
+ "grad_norm": 5.114110946655273,
528
+ "learning_rate": 1.540403052240266e-06,
529
+ "loss": 5.8739,
530
+ "mean_token_accuracy": 0.2518083402514458,
531
+ "num_tokens": 2387469.0,
532
+ "step": 2550
533
+ },
534
+ {
535
+ "entropy": 6.151525087356568,
536
+ "epoch": 1.4968336211859528,
537
+ "grad_norm": 2.6623592376708984,
538
+ "learning_rate": 1.530620230874584e-06,
539
+ "loss": 5.9402,
540
+ "mean_token_accuracy": 0.24315184772014617,
541
+ "num_tokens": 2434128.0,
542
+ "step": 2600
543
+ },
544
+ {
545
+ "entropy": 6.112346210479736,
546
+ "epoch": 1.5256188831318365,
547
+ "grad_norm": 4.4492950439453125,
548
+ "learning_rate": 1.5208374095089022e-06,
549
+ "loss": 5.9062,
550
+ "mean_token_accuracy": 0.24730559319257736,
551
+ "num_tokens": 2480366.0,
552
+ "step": 2650
553
+ },
554
+ {
555
+ "entropy": 6.088384003639221,
556
+ "epoch": 1.5544041450777202,
557
+ "grad_norm": 2.631941556930542,
558
+ "learning_rate": 1.5110545881432204e-06,
559
+ "loss": 5.8806,
560
+ "mean_token_accuracy": 0.2491958048939705,
561
+ "num_tokens": 2527357.0,
562
+ "step": 2700
563
+ },
564
+ {
565
+ "entropy": 6.054274072647095,
566
+ "epoch": 1.583189407023604,
567
+ "grad_norm": 3.9610729217529297,
568
+ "learning_rate": 1.5012717667775385e-06,
569
+ "loss": 5.851,
570
+ "mean_token_accuracy": 0.25282351911067963,
571
+ "num_tokens": 2574470.0,
572
+ "step": 2750
573
+ },
574
+ {
575
+ "entropy": 6.205899753570557,
576
+ "epoch": 1.6119746689694876,
577
+ "grad_norm": 2.052320957183838,
578
+ "learning_rate": 1.4914889454118567e-06,
579
+ "loss": 6.0034,
580
+ "mean_token_accuracy": 0.23449385523796082,
581
+ "num_tokens": 2622280.0,
582
+ "step": 2800
583
+ },
584
+ {
585
+ "entropy": 6.037883262634278,
586
+ "epoch": 1.6407599309153713,
587
+ "grad_norm": 1.5475044250488281,
588
+ "learning_rate": 1.4817061240461749e-06,
589
+ "loss": 5.8417,
590
+ "mean_token_accuracy": 0.2532995194196701,
591
+ "num_tokens": 2668624.0,
592
+ "step": 2850
593
+ },
594
+ {
595
+ "entropy": 6.011255393028259,
596
+ "epoch": 1.669545192861255,
597
+ "grad_norm": 1.50232994556427,
598
+ "learning_rate": 1.471923302680493e-06,
599
+ "loss": 5.8213,
600
+ "mean_token_accuracy": 0.2553745821118355,
601
+ "num_tokens": 2714388.0,
602
+ "step": 2900
603
+ },
604
+ {
605
+ "entropy": 6.172232007980346,
606
+ "epoch": 1.6983304548071387,
607
+ "grad_norm": 1.9385855197906494,
608
+ "learning_rate": 1.462140481314811e-06,
609
+ "loss": 5.9765,
610
+ "mean_token_accuracy": 0.23626189529895783,
611
+ "num_tokens": 2761465.0,
612
+ "step": 2950
613
+ },
614
+ {
615
+ "entropy": 6.066384444236755,
616
+ "epoch": 1.7271157167530224,
617
+ "grad_norm": 2.6527063846588135,
618
+ "learning_rate": 1.452357659949129e-06,
619
+ "loss": 5.8701,
620
+ "mean_token_accuracy": 0.24866942584514617,
621
+ "num_tokens": 2808066.0,
622
+ "step": 3000
623
+ },
624
+ {
625
+ "entropy": 5.876337275505066,
626
+ "epoch": 1.7559009786989062,
627
+ "grad_norm": 2.4430501461029053,
628
+ "learning_rate": 1.4425748385834473e-06,
629
+ "loss": 5.6826,
630
+ "mean_token_accuracy": 0.27362876415252685,
631
+ "num_tokens": 2851822.0,
632
+ "step": 3050
633
+ },
634
+ {
635
+ "entropy": 6.044622054100037,
636
+ "epoch": 1.7846862406447899,
637
+ "grad_norm": 3.2790579795837402,
638
+ "learning_rate": 1.4327920172177654e-06,
639
+ "loss": 5.8551,
640
+ "mean_token_accuracy": 0.25071538865566256,
641
+ "num_tokens": 2897737.0,
642
+ "step": 3100
643
+ },
644
+ {
645
+ "entropy": 5.777814731597901,
646
+ "epoch": 1.8134715025906736,
647
+ "grad_norm": 1.7892365455627441,
648
+ "learning_rate": 1.4230091958520836e-06,
649
+ "loss": 5.5954,
650
+ "mean_token_accuracy": 0.284136081635952,
651
+ "num_tokens": 2939511.0,
652
+ "step": 3150
653
+ },
654
+ {
655
+ "entropy": 6.034259614944458,
656
+ "epoch": 1.8422567645365573,
657
+ "grad_norm": 1.7564071416854858,
658
+ "learning_rate": 1.413226374486402e-06,
659
+ "loss": 5.848,
660
+ "mean_token_accuracy": 0.25160137861967086,
661
+ "num_tokens": 2986368.0,
662
+ "step": 3200
663
+ },
664
+ {
665
+ "entropy": 6.0115156078338625,
666
+ "epoch": 1.871042026482441,
667
+ "grad_norm": 2.3167052268981934,
668
+ "learning_rate": 1.40344355312072e-06,
669
+ "loss": 5.8269,
670
+ "mean_token_accuracy": 0.25526676297187806,
671
+ "num_tokens": 3031770.0,
672
+ "step": 3250
673
+ },
674
+ {
675
+ "entropy": 6.0657948303222655,
676
+ "epoch": 1.8998272884283247,
677
+ "grad_norm": 1.765837550163269,
678
+ "learning_rate": 1.3936607317550382e-06,
679
+ "loss": 5.8765,
680
+ "mean_token_accuracy": 0.24879903554916383,
681
+ "num_tokens": 3078322.0,
682
+ "step": 3300
683
+ },
684
+ {
685
+ "entropy": 6.146444616317749,
686
+ "epoch": 1.9286125503742084,
687
+ "grad_norm": 2.933809518814087,
688
+ "learning_rate": 1.3838779103893564e-06,
689
+ "loss": 5.9625,
690
+ "mean_token_accuracy": 0.23642315745353698,
691
+ "num_tokens": 3125572.0,
692
+ "step": 3350
693
+ },
694
+ {
695
+ "entropy": 6.007315292358398,
696
+ "epoch": 1.9573978123200921,
697
+ "grad_norm": 1.7006982564926147,
698
+ "learning_rate": 1.3740950890236743e-06,
699
+ "loss": 5.8227,
700
+ "mean_token_accuracy": 0.25394665479660034,
701
+ "num_tokens": 3171974.0,
702
+ "step": 3400
703
+ },
704
+ {
705
+ "entropy": 6.091508469581604,
706
+ "epoch": 1.9861830742659758,
707
+ "grad_norm": 1.8032574653625488,
708
+ "learning_rate": 1.3643122676579925e-06,
709
+ "loss": 5.9103,
710
+ "mean_token_accuracy": 0.24359373539686202,
711
+ "num_tokens": 3219624.0,
712
+ "step": 3450
713
+ },
714
+ {
715
+ "epoch": 2.0,
716
+ "eval_entropy": 6.323629730857462,
717
+ "eval_loss": 6.1541829109191895,
718
+ "eval_mean_token_accuracy": 0.20884785385725135,
719
+ "eval_model_preparation_time": 0.0046,
720
+ "eval_num_tokens": 3239438.0,
721
+ "eval_runtime": 79.0616,
722
+ "eval_samples_per_second": 5.489,
723
+ "eval_steps_per_second": 2.745,
724
+ "step": 3474
725
+ },
726
+ {
727
+ "entropy": 5.967442779541016,
728
+ "epoch": 2.0149683362118593,
729
+ "grad_norm": 1.3615084886550903,
730
+ "learning_rate": 1.3545294462923106e-06,
731
+ "loss": 5.7883,
732
+ "mean_token_accuracy": 0.25975353181362154,
733
+ "num_tokens": 3263994.0,
734
+ "step": 3500
735
+ },
736
+ {
737
+ "entropy": 6.085220527648926,
738
+ "epoch": 2.043753598157743,
739
+ "grad_norm": 2.406777858734131,
740
+ "learning_rate": 1.3447466249266288e-06,
741
+ "loss": 5.9002,
742
+ "mean_token_accuracy": 0.24500031709671022,
743
+ "num_tokens": 3311182.0,
744
+ "step": 3550
745
+ },
746
+ {
747
+ "entropy": 6.010667142868042,
748
+ "epoch": 2.0725388601036268,
749
+ "grad_norm": 4.209227561950684,
750
+ "learning_rate": 1.334963803560947e-06,
751
+ "loss": 5.8366,
752
+ "mean_token_accuracy": 0.2527648264169693,
753
+ "num_tokens": 3358036.0,
754
+ "step": 3600
755
+ },
756
+ {
757
+ "entropy": 6.040924577713013,
758
+ "epoch": 2.1013241220495105,
759
+ "grad_norm": 3.2806403636932373,
760
+ "learning_rate": 1.325180982195265e-06,
761
+ "loss": 5.8649,
762
+ "mean_token_accuracy": 0.24962294459342957,
763
+ "num_tokens": 3404058.0,
764
+ "step": 3650
765
+ },
766
+ {
767
+ "entropy": 6.023610129356384,
768
+ "epoch": 2.130109383995394,
769
+ "grad_norm": 1.1922718286514282,
770
+ "learning_rate": 1.3153981608295833e-06,
771
+ "loss": 5.8519,
772
+ "mean_token_accuracy": 0.25007107347249985,
773
+ "num_tokens": 3449834.0,
774
+ "step": 3700
775
+ },
776
+ {
777
+ "entropy": 5.89481824874878,
778
+ "epoch": 2.158894645941278,
779
+ "grad_norm": 1.8002029657363892,
780
+ "learning_rate": 1.3056153394639014e-06,
781
+ "loss": 5.7206,
782
+ "mean_token_accuracy": 0.26725934326648715,
783
+ "num_tokens": 3494740.0,
784
+ "step": 3750
785
+ },
786
+ {
787
+ "entropy": 6.008778114318847,
788
+ "epoch": 2.1876799078871616,
789
+ "grad_norm": 2.3413538932800293,
790
+ "learning_rate": 1.2958325180982196e-06,
791
+ "loss": 5.8343,
792
+ "mean_token_accuracy": 0.2532099911570549,
793
+ "num_tokens": 3541342.0,
794
+ "step": 3800
795
+ },
796
+ {
797
+ "entropy": 6.085102453231811,
798
+ "epoch": 2.2164651698330453,
799
+ "grad_norm": 1.7294431924819946,
800
+ "learning_rate": 1.2860496967325375e-06,
801
+ "loss": 5.9104,
802
+ "mean_token_accuracy": 0.2436734887957573,
803
+ "num_tokens": 3588995.0,
804
+ "step": 3850
805
+ },
806
+ {
807
+ "entropy": 5.947706546783447,
808
+ "epoch": 2.245250431778929,
809
+ "grad_norm": 1.6259620189666748,
810
+ "learning_rate": 1.2762668753668557e-06,
811
+ "loss": 5.78,
812
+ "mean_token_accuracy": 0.2602892768383026,
813
+ "num_tokens": 3634252.0,
814
+ "step": 3900
815
+ },
816
+ {
817
+ "entropy": 5.989762544631958,
818
+ "epoch": 2.2740356937248127,
819
+ "grad_norm": 1.664301872253418,
820
+ "learning_rate": 1.2664840540011738e-06,
821
+ "loss": 5.8189,
822
+ "mean_token_accuracy": 0.2546903318166733,
823
+ "num_tokens": 3681197.0,
824
+ "step": 3950
825
+ },
826
+ {
827
+ "entropy": 6.187751932144165,
828
+ "epoch": 2.3028209556706964,
829
+ "grad_norm": 3.428220748901367,
830
+ "learning_rate": 1.256701232635492e-06,
831
+ "loss": 6.0137,
832
+ "mean_token_accuracy": 0.23033296406269074,
833
+ "num_tokens": 3729955.0,
834
+ "step": 4000
835
+ },
836
+ {
837
+ "entropy": 6.038392038345337,
838
+ "epoch": 2.33160621761658,
839
+ "grad_norm": 2.1140899658203125,
840
+ "learning_rate": 1.2469184112698101e-06,
841
+ "loss": 5.8655,
842
+ "mean_token_accuracy": 0.24881428897380828,
843
+ "num_tokens": 3777043.0,
844
+ "step": 4050
845
+ },
846
+ {
847
+ "entropy": 6.071309795379639,
848
+ "epoch": 2.360391479562464,
849
+ "grad_norm": 1.344217300415039,
850
+ "learning_rate": 1.2371355899041283e-06,
851
+ "loss": 5.8991,
852
+ "mean_token_accuracy": 0.2440922862291336,
853
+ "num_tokens": 3824067.0,
854
+ "step": 4100
855
+ },
856
+ {
857
+ "entropy": 6.129210476875305,
858
+ "epoch": 2.3891767415083476,
859
+ "grad_norm": 1.578134536743164,
860
+ "learning_rate": 1.2273527685384464e-06,
861
+ "loss": 5.9573,
862
+ "mean_token_accuracy": 0.23632006645202636,
863
+ "num_tokens": 3872769.0,
864
+ "step": 4150
865
+ },
866
+ {
867
+ "entropy": 6.0412983751297,
868
+ "epoch": 2.4179620034542313,
869
+ "grad_norm": 1.5530976057052612,
870
+ "learning_rate": 1.2175699471727646e-06,
871
+ "loss": 5.8701,
872
+ "mean_token_accuracy": 0.24784765332937242,
873
+ "num_tokens": 3919379.0,
874
+ "step": 4200
875
+ },
876
+ {
877
+ "entropy": 6.002105917930603,
878
+ "epoch": 2.446747265400115,
879
+ "grad_norm": 1.6028035879135132,
880
+ "learning_rate": 1.2077871258070827e-06,
881
+ "loss": 5.8313,
882
+ "mean_token_accuracy": 0.25332365930080414,
883
+ "num_tokens": 3965593.0,
884
+ "step": 4250
885
+ },
886
+ {
887
+ "entropy": 6.06869218826294,
888
+ "epoch": 2.4755325273459987,
889
+ "grad_norm": 1.5630944967269897,
890
+ "learning_rate": 1.1980043044414007e-06,
891
+ "loss": 5.8973,
892
+ "mean_token_accuracy": 0.24471112668514253,
893
+ "num_tokens": 4012300.0,
894
+ "step": 4300
895
+ },
896
+ {
897
+ "entropy": 6.019678201675415,
898
+ "epoch": 2.5043177892918824,
899
+ "grad_norm": 1.9821183681488037,
900
+ "learning_rate": 1.1882214830757188e-06,
901
+ "loss": 5.8526,
902
+ "mean_token_accuracy": 0.24976039975881575,
903
+ "num_tokens": 4059323.0,
904
+ "step": 4350
905
+ },
906
+ {
907
+ "entropy": 6.157129697799682,
908
+ "epoch": 2.533103051237766,
909
+ "grad_norm": 3.1856675148010254,
910
+ "learning_rate": 1.178438661710037e-06,
911
+ "loss": 5.9868,
912
+ "mean_token_accuracy": 0.23358212381601334,
913
+ "num_tokens": 4107616.0,
914
+ "step": 4400
915
+ },
916
+ {
917
+ "entropy": 5.9199522733688354,
918
+ "epoch": 2.56188831318365,
919
+ "grad_norm": 2.0129523277282715,
920
+ "learning_rate": 1.1686558403443551e-06,
921
+ "loss": 5.7537,
922
+ "mean_token_accuracy": 0.26219907581806184,
923
+ "num_tokens": 4152400.0,
924
+ "step": 4450
925
+ },
926
+ {
927
+ "entropy": 6.141581220626831,
928
+ "epoch": 2.5906735751295336,
929
+ "grad_norm": 1.4197176694869995,
930
+ "learning_rate": 1.1588730189786733e-06,
931
+ "loss": 5.9746,
932
+ "mean_token_accuracy": 0.23547348588705064,
933
+ "num_tokens": 4200994.0,
934
+ "step": 4500
935
+ },
936
+ {
937
+ "entropy": 6.024065284729004,
938
+ "epoch": 2.6194588370754173,
939
+ "grad_norm": 2.5414512157440186,
940
+ "learning_rate": 1.1490901976129917e-06,
941
+ "loss": 5.8542,
942
+ "mean_token_accuracy": 0.24906692177057266,
943
+ "num_tokens": 4247548.0,
944
+ "step": 4550
945
+ },
946
+ {
947
+ "entropy": 6.087933650016785,
948
+ "epoch": 2.648244099021301,
949
+ "grad_norm": 1.2823543548583984,
950
+ "learning_rate": 1.1393073762473098e-06,
951
+ "loss": 5.9213,
952
+ "mean_token_accuracy": 0.24062541306018828,
953
+ "num_tokens": 4295250.0,
954
+ "step": 4600
955
+ },
956
+ {
957
+ "entropy": 6.043813619613648,
958
+ "epoch": 2.6770293609671847,
959
+ "grad_norm": 1.046730637550354,
960
+ "learning_rate": 1.129524554881628e-06,
961
+ "loss": 5.8741,
962
+ "mean_token_accuracy": 0.24775829553604126,
963
+ "num_tokens": 4341599.0,
964
+ "step": 4650
965
+ },
966
+ {
967
+ "entropy": 6.004058070182801,
968
+ "epoch": 2.7058146229130684,
969
+ "grad_norm": 1.243298053741455,
970
+ "learning_rate": 1.1197417335159461e-06,
971
+ "loss": 5.8366,
972
+ "mean_token_accuracy": 0.25220848590135575,
973
+ "num_tokens": 4388673.0,
974
+ "step": 4700
975
+ },
976
+ {
977
+ "entropy": 5.835509791374206,
978
+ "epoch": 2.734599884858952,
979
+ "grad_norm": 2.773327350616455,
980
+ "learning_rate": 1.109958912150264e-06,
981
+ "loss": 5.6767,
982
+ "mean_token_accuracy": 0.2719315069913864,
983
+ "num_tokens": 4432959.0,
984
+ "step": 4750
985
+ },
986
+ {
987
+ "entropy": 6.021662483215332,
988
+ "epoch": 2.763385146804836,
989
+ "grad_norm": 24.627521514892578,
990
+ "learning_rate": 1.1001760907845822e-06,
991
+ "loss": 5.8559,
992
+ "mean_token_accuracy": 0.24981790155172348,
993
+ "num_tokens": 4479190.0,
994
+ "step": 4800
995
+ },
996
+ {
997
+ "entropy": 5.997534699440003,
998
+ "epoch": 2.7921704087507195,
999
+ "grad_norm": 0.936356246471405,
1000
+ "learning_rate": 1.0903932694189004e-06,
1001
+ "loss": 5.8337,
1002
+ "mean_token_accuracy": 0.2518752273917198,
1003
+ "num_tokens": 4525674.0,
1004
+ "step": 4850
1005
+ },
1006
+ {
1007
+ "entropy": 5.853120732307434,
1008
+ "epoch": 2.8209556706966032,
1009
+ "grad_norm": 1.5253357887268066,
1010
+ "learning_rate": 1.0806104480532185e-06,
1011
+ "loss": 5.6906,
1012
+ "mean_token_accuracy": 0.2703215056657791,
1013
+ "num_tokens": 4570379.0,
1014
+ "step": 4900
1015
+ },
1016
+ {
1017
+ "entropy": 6.003798789978028,
1018
+ "epoch": 2.849740932642487,
1019
+ "grad_norm": 7.387447834014893,
1020
+ "learning_rate": 1.0708276266875367e-06,
1021
+ "loss": 5.8363,
1022
+ "mean_token_accuracy": 0.2520116460323334,
1023
+ "num_tokens": 4617184.0,
1024
+ "step": 4950
1025
+ },
1026
+ {
1027
+ "entropy": 6.044828844070435,
1028
+ "epoch": 2.8785261945883707,
1029
+ "grad_norm": 1.7473825216293335,
1030
+ "learning_rate": 1.0610448053218548e-06,
1031
+ "loss": 5.8824,
1032
+ "mean_token_accuracy": 0.2459094214439392,
1033
+ "num_tokens": 4664180.0,
1034
+ "step": 5000
1035
+ },
1036
+ {
1037
+ "entropy": 5.870430383682251,
1038
+ "epoch": 2.9073114565342544,
1039
+ "grad_norm": 0.9340764880180359,
1040
+ "learning_rate": 1.051261983956173e-06,
1041
+ "loss": 5.7101,
1042
+ "mean_token_accuracy": 0.2679078412055969,
1043
+ "num_tokens": 4708377.0,
1044
+ "step": 5050
1045
+ },
1046
+ {
1047
+ "entropy": 5.880399878025055,
1048
+ "epoch": 2.936096718480138,
1049
+ "grad_norm": 1.3693302869796753,
1050
+ "learning_rate": 1.0414791625904911e-06,
1051
+ "loss": 5.72,
1052
+ "mean_token_accuracy": 0.2662310737371445,
1053
+ "num_tokens": 4753587.0,
1054
+ "step": 5100
1055
+ },
1056
+ {
1057
+ "entropy": 6.051638517379761,
1058
+ "epoch": 2.964881980426022,
1059
+ "grad_norm": 1.886895775794983,
1060
+ "learning_rate": 1.0316963412248093e-06,
1061
+ "loss": 5.8882,
1062
+ "mean_token_accuracy": 0.2440450206398964,
1063
+ "num_tokens": 4800508.0,
1064
+ "step": 5150
1065
+ },
1066
+ {
1067
+ "entropy": 6.135327701568603,
1068
+ "epoch": 2.9936672423719055,
1069
+ "grad_norm": 1.1313307285308838,
1070
+ "learning_rate": 1.0219135198591272e-06,
1071
+ "loss": 5.9694,
1072
+ "mean_token_accuracy": 0.23429417878389358,
1073
+ "num_tokens": 4849415.0,
1074
+ "step": 5200
1075
+ },
1076
+ {
1077
+ "epoch": 3.0,
1078
+ "eval_entropy": 6.290762787041027,
1079
+ "eval_loss": 6.133134365081787,
1080
+ "eval_mean_token_accuracy": 0.21000784566874878,
1081
+ "eval_model_preparation_time": 0.0046,
1082
+ "eval_num_tokens": 4859157.0,
1083
+ "eval_runtime": 79.4078,
1084
+ "eval_samples_per_second": 5.465,
1085
+ "eval_steps_per_second": 2.733,
1086
+ "step": 5211
1087
+ },
1088
+ {
1089
+ "entropy": 5.866470074653625,
1090
+ "epoch": 3.0224525043177892,
1091
+ "grad_norm": 1.7904499769210815,
1092
+ "learning_rate": 1.0121306984934454e-06,
1093
+ "loss": 5.7064,
1094
+ "mean_token_accuracy": 0.26748142033815386,
1095
+ "num_tokens": 4893297.0,
1096
+ "step": 5250
1097
+ },
1098
+ {
1099
+ "entropy": 6.0223666858673095,
1100
+ "epoch": 3.051237766263673,
1101
+ "grad_norm": 1.4165620803833008,
1102
+ "learning_rate": 1.0023478771277635e-06,
1103
+ "loss": 5.855,
1104
+ "mean_token_accuracy": 0.2492792472243309,
1105
+ "num_tokens": 4940190.0,
1106
+ "step": 5300
1107
+ },
1108
+ {
1109
+ "entropy": 5.958261919021607,
1110
+ "epoch": 3.0800230282095566,
1111
+ "grad_norm": 1.7037155628204346,
1112
+ "learning_rate": 9.925650557620817e-07,
1113
+ "loss": 5.7943,
1114
+ "mean_token_accuracy": 0.2563889327645302,
1115
+ "num_tokens": 4986555.0,
1116
+ "step": 5350
1117
+ },
1118
+ {
1119
+ "entropy": 5.975116381645202,
1120
+ "epoch": 3.1088082901554404,
1121
+ "grad_norm": 1.5214799642562866,
1122
+ "learning_rate": 9.827822343963998e-07,
1123
+ "loss": 5.8129,
1124
+ "mean_token_accuracy": 0.2538496914505959,
1125
+ "num_tokens": 5033343.0,
1126
+ "step": 5400
1127
+ },
1128
+ {
1129
+ "entropy": 6.052438821792602,
1130
+ "epoch": 3.137593552101324,
1131
+ "grad_norm": 1.4624167680740356,
1132
+ "learning_rate": 9.72999413030718e-07,
1133
+ "loss": 5.8895,
1134
+ "mean_token_accuracy": 0.24580927312374115,
1135
+ "num_tokens": 5080498.0,
1136
+ "step": 5450
1137
+ },
1138
+ {
1139
+ "entropy": 5.986911368370056,
1140
+ "epoch": 3.166378814047208,
1141
+ "grad_norm": 0.6274769306182861,
1142
+ "learning_rate": 9.632165916650362e-07,
1143
+ "loss": 5.8291,
1144
+ "mean_token_accuracy": 0.2532012587785721,
1145
+ "num_tokens": 5126524.0,
1146
+ "step": 5500
1147
+ },
1148
+ {
1149
+ "entropy": 6.156058435440063,
1150
+ "epoch": 3.1951640759930915,
1151
+ "grad_norm": 1.5531014204025269,
1152
+ "learning_rate": 9.534337702993543e-07,
1153
+ "loss": 5.9887,
1154
+ "mean_token_accuracy": 0.23208704799413682,
1155
+ "num_tokens": 5176187.0,
1156
+ "step": 5550
1157
+ },
1158
+ {
1159
+ "entropy": 6.023375058174134,
1160
+ "epoch": 3.223949337938975,
1161
+ "grad_norm": 1.4891023635864258,
1162
+ "learning_rate": 9.436509489336725e-07,
1163
+ "loss": 5.8608,
1164
+ "mean_token_accuracy": 0.249232979118824,
1165
+ "num_tokens": 5223101.0,
1166
+ "step": 5600
1167
+ },
1168
+ {
1169
+ "entropy": 6.018003768920899,
1170
+ "epoch": 3.252734599884859,
1171
+ "grad_norm": 2.151552438735962,
1172
+ "learning_rate": 9.338681275679906e-07,
1173
+ "loss": 5.857,
1174
+ "mean_token_accuracy": 0.24891259402036667,
1175
+ "num_tokens": 5270200.0,
1176
+ "step": 5650
1177
+ },
1178
+ {
1179
+ "entropy": 6.016556148529053,
1180
+ "epoch": 3.2815198618307426,
1181
+ "grad_norm": 1.4065567255020142,
1182
+ "learning_rate": 9.240853062023088e-07,
1183
+ "loss": 5.8526,
1184
+ "mean_token_accuracy": 0.24942608833312988,
1185
+ "num_tokens": 5315872.0,
1186
+ "step": 5700
1187
+ },
1188
+ {
1189
+ "entropy": 6.045969610214233,
1190
+ "epoch": 3.3103051237766263,
1191
+ "grad_norm": 1.8933031558990479,
1192
+ "learning_rate": 9.143024848366268e-07,
1193
+ "loss": 5.8824,
1194
+ "mean_token_accuracy": 0.24579768538475036,
1195
+ "num_tokens": 5362860.0,
1196
+ "step": 5750
1197
+ },
1198
+ {
1199
+ "entropy": 5.900783424377441,
1200
+ "epoch": 3.33909038572251,
1201
+ "grad_norm": 2.198502540588379,
1202
+ "learning_rate": 9.04519663470945e-07,
1203
+ "loss": 5.7435,
1204
+ "mean_token_accuracy": 0.2635406255722046,
1205
+ "num_tokens": 5407854.0,
1206
+ "step": 5800
1207
+ },
1208
+ {
1209
+ "entropy": 5.739368691444397,
1210
+ "epoch": 3.3678756476683938,
1211
+ "grad_norm": 1.9872454404830933,
1212
+ "learning_rate": 8.947368421052631e-07,
1213
+ "loss": 5.581,
1214
+ "mean_token_accuracy": 0.28485828697681426,
1215
+ "num_tokens": 5450830.0,
1216
+ "step": 5850
1217
+ },
1218
+ {
1219
+ "entropy": 5.963817882537842,
1220
+ "epoch": 3.3966609096142775,
1221
+ "grad_norm": 1.0523409843444824,
1222
+ "learning_rate": 8.849540207395813e-07,
1223
+ "loss": 5.8033,
1224
+ "mean_token_accuracy": 0.25546928733587265,
1225
+ "num_tokens": 5497011.0,
1226
+ "step": 5900
1227
+ },
1228
+ {
1229
+ "entropy": 6.027821063995361,
1230
+ "epoch": 3.425446171560161,
1231
+ "grad_norm": 1.2537726163864136,
1232
+ "learning_rate": 8.751711993738994e-07,
1233
+ "loss": 5.8644,
1234
+ "mean_token_accuracy": 0.2480815091729164,
1235
+ "num_tokens": 5542914.0,
1236
+ "step": 5950
1237
+ },
1238
+ {
1239
+ "entropy": 6.06340226650238,
1240
+ "epoch": 3.454231433506045,
1241
+ "grad_norm": 1.3840627670288086,
1242
+ "learning_rate": 8.653883780082175e-07,
1243
+ "loss": 5.9024,
1244
+ "mean_token_accuracy": 0.24260428220033645,
1245
+ "num_tokens": 5591031.0,
1246
+ "step": 6000
1247
+ },
1248
+ {
1249
+ "entropy": 6.016174025535584,
1250
+ "epoch": 3.4830166954519286,
1251
+ "grad_norm": 1.4330769777297974,
1252
+ "learning_rate": 8.556055566425356e-07,
1253
+ "loss": 5.8533,
1254
+ "mean_token_accuracy": 0.2491714572906494,
1255
+ "num_tokens": 5638071.0,
1256
+ "step": 6050
1257
+ },
1258
+ {
1259
+ "entropy": 5.934744844436645,
1260
+ "epoch": 3.5118019573978123,
1261
+ "grad_norm": 1.0845732688903809,
1262
+ "learning_rate": 8.458227352768538e-07,
1263
+ "loss": 5.7747,
1264
+ "mean_token_accuracy": 0.2592592638731003,
1265
+ "num_tokens": 5684251.0,
1266
+ "step": 6100
1267
+ },
1268
+ {
1269
+ "entropy": 6.080287184715271,
1270
+ "epoch": 3.540587219343696,
1271
+ "grad_norm": 1.0975452661514282,
1272
+ "learning_rate": 8.360399139111719e-07,
1273
+ "loss": 5.9162,
1274
+ "mean_token_accuracy": 0.24146371990442275,
1275
+ "num_tokens": 5732144.0,
1276
+ "step": 6150
1277
+ },
1278
+ {
1279
+ "entropy": 5.930004096031189,
1280
+ "epoch": 3.5693724812895797,
1281
+ "grad_norm": 0.6231066584587097,
1282
+ "learning_rate": 8.2625709254549e-07,
1283
+ "loss": 5.7693,
1284
+ "mean_token_accuracy": 0.2606990364193916,
1285
+ "num_tokens": 5777711.0,
1286
+ "step": 6200
1287
+ },
1288
+ {
1289
+ "entropy": 5.9475119972229,
1290
+ "epoch": 3.5981577432354634,
1291
+ "grad_norm": 1.2649016380310059,
1292
+ "learning_rate": 8.164742711798082e-07,
1293
+ "loss": 5.7891,
1294
+ "mean_token_accuracy": 0.2565712609887123,
1295
+ "num_tokens": 5823662.0,
1296
+ "step": 6250
1297
+ },
1298
+ {
1299
+ "entropy": 6.142667779922485,
1300
+ "epoch": 3.626943005181347,
1301
+ "grad_norm": 2.129287004470825,
1302
+ "learning_rate": 8.066914498141264e-07,
1303
+ "loss": 5.9787,
1304
+ "mean_token_accuracy": 0.23326011776924133,
1305
+ "num_tokens": 5872695.0,
1306
+ "step": 6300
1307
+ },
1308
+ {
1309
+ "entropy": 5.959765286445617,
1310
+ "epoch": 3.655728267127231,
1311
+ "grad_norm": 1.467274785041809,
1312
+ "learning_rate": 7.969086284484446e-07,
1313
+ "loss": 5.7999,
1314
+ "mean_token_accuracy": 0.2563468313217163,
1315
+ "num_tokens": 5919280.0,
1316
+ "step": 6350
1317
+ },
1318
+ {
1319
+ "entropy": 6.116938819885254,
1320
+ "epoch": 3.6845135290731146,
1321
+ "grad_norm": 2.5099565982818604,
1322
+ "learning_rate": 7.871258070827627e-07,
1323
+ "loss": 5.951,
1324
+ "mean_token_accuracy": 0.23610050201416016,
1325
+ "num_tokens": 5967702.0,
1326
+ "step": 6400
1327
+ },
1328
+ {
1329
+ "entropy": 5.916635317802429,
1330
+ "epoch": 3.7132987910189983,
1331
+ "grad_norm": 1.8498667478561401,
1332
+ "learning_rate": 7.773429857170808e-07,
1333
+ "loss": 5.7564,
1334
+ "mean_token_accuracy": 0.26092158019542694,
1335
+ "num_tokens": 6012476.0,
1336
+ "step": 6450
1337
+ },
1338
+ {
1339
+ "entropy": 6.0112196683883665,
1340
+ "epoch": 3.742084052964882,
1341
+ "grad_norm": 1.475481390953064,
1342
+ "learning_rate": 7.675601643513989e-07,
1343
+ "loss": 5.8518,
1344
+ "mean_token_accuracy": 0.2494723927974701,
1345
+ "num_tokens": 6059915.0,
1346
+ "step": 6500
1347
+ },
1348
+ {
1349
+ "entropy": 6.143899393081665,
1350
+ "epoch": 3.7708693149107657,
1351
+ "grad_norm": 1.4096436500549316,
1352
+ "learning_rate": 7.577773429857171e-07,
1353
+ "loss": 5.9778,
1354
+ "mean_token_accuracy": 0.23250365376472473,
1355
+ "num_tokens": 6109703.0,
1356
+ "step": 6550
1357
+ },
1358
+ {
1359
+ "entropy": 5.922900657653809,
1360
+ "epoch": 3.7996545768566494,
1361
+ "grad_norm": 1.2578452825546265,
1362
+ "learning_rate": 7.479945216200352e-07,
1363
+ "loss": 5.7599,
1364
+ "mean_token_accuracy": 0.26120502591133116,
1365
+ "num_tokens": 6155107.0,
1366
+ "step": 6600
1367
+ },
1368
+ {
1369
+ "entropy": 5.908917541503906,
1370
+ "epoch": 3.828439838802533,
1371
+ "grad_norm": 2.6332685947418213,
1372
+ "learning_rate": 7.382117002543533e-07,
1373
+ "loss": 5.7503,
1374
+ "mean_token_accuracy": 0.262827065885067,
1375
+ "num_tokens": 6201565.0,
1376
+ "step": 6650
1377
+ },
1378
+ {
1379
+ "entropy": 6.0733087491989135,
1380
+ "epoch": 3.857225100748417,
1381
+ "grad_norm": 1.0848442316055298,
1382
+ "learning_rate": 7.284288788886714e-07,
1383
+ "loss": 5.9098,
1384
+ "mean_token_accuracy": 0.24171414226293564,
1385
+ "num_tokens": 6249569.0,
1386
+ "step": 6700
1387
+ },
1388
+ {
1389
+ "entropy": 6.002431573867798,
1390
+ "epoch": 3.8860103626943006,
1391
+ "grad_norm": 1.2640091180801392,
1392
+ "learning_rate": 7.186460575229896e-07,
1393
+ "loss": 5.84,
1394
+ "mean_token_accuracy": 0.25067923456430435,
1395
+ "num_tokens": 6296537.0,
1396
+ "step": 6750
1397
+ },
1398
+ {
1399
+ "entropy": 5.919499011039734,
1400
+ "epoch": 3.9147956246401843,
1401
+ "grad_norm": 1.4981272220611572,
1402
+ "learning_rate": 7.088632361573077e-07,
1403
+ "loss": 5.7591,
1404
+ "mean_token_accuracy": 0.259833604991436,
1405
+ "num_tokens": 6341373.0,
1406
+ "step": 6800
1407
+ },
1408
+ {
1409
+ "entropy": 6.17221610546112,
1410
+ "epoch": 3.943580886586068,
1411
+ "grad_norm": 1.38907790184021,
1412
+ "learning_rate": 6.990804147916259e-07,
1413
+ "loss": 6.0084,
1414
+ "mean_token_accuracy": 0.22883434295654298,
1415
+ "num_tokens": 6391079.0,
1416
+ "step": 6850
1417
+ },
1418
+ {
1419
+ "entropy": 5.79143741607666,
1420
+ "epoch": 3.9723661485319517,
1421
+ "grad_norm": 6.057252883911133,
1422
+ "learning_rate": 6.892975934259439e-07,
1423
+ "loss": 5.6354,
1424
+ "mean_token_accuracy": 0.27610290706157686,
1425
+ "num_tokens": 6434857.0,
1426
+ "step": 6900
1427
+ },
1428
+ {
1429
+ "epoch": 4.0,
1430
+ "eval_entropy": 6.279375676185854,
1431
+ "eval_loss": 6.126572132110596,
1432
+ "eval_mean_token_accuracy": 0.21069503338655568,
1433
+ "eval_model_preparation_time": 0.0046,
1434
+ "eval_num_tokens": 6478876.0,
1435
+ "eval_runtime": 79.2163,
1436
+ "eval_samples_per_second": 5.479,
1437
+ "eval_steps_per_second": 2.739,
1438
+ "step": 6948
1439
+ },
1440
+ {
1441
+ "entropy": 5.995638113021851,
1442
+ "epoch": 4.001151410477835,
1443
+ "grad_norm": 1.2027766704559326,
1444
+ "learning_rate": 6.795147720602622e-07,
1445
+ "loss": 5.8326,
1446
+ "mean_token_accuracy": 0.2509008884429932,
1447
+ "num_tokens": 6481003.0,
1448
+ "step": 6950
1449
+ },
1450
+ {
1451
+ "entropy": 5.7975999546051025,
1452
+ "epoch": 4.029936672423719,
1453
+ "grad_norm": 0.9509938359260559,
1454
+ "learning_rate": 6.697319506945803e-07,
1455
+ "loss": 5.64,
1456
+ "mean_token_accuracy": 0.27601367354393,
1457
+ "num_tokens": 6524470.0,
1458
+ "step": 7000
1459
+ },
1460
+ {
1461
+ "entropy": 6.142339401245117,
1462
+ "epoch": 4.058721934369602,
1463
+ "grad_norm": 1.7856882810592651,
1464
+ "learning_rate": 6.599491293288985e-07,
1465
+ "loss": 5.9798,
1466
+ "mean_token_accuracy": 0.23329689502716064,
1467
+ "num_tokens": 6572979.0,
1468
+ "step": 7050
1469
+ },
1470
+ {
1471
+ "entropy": 5.908348722457886,
1472
+ "epoch": 4.087507196315486,
1473
+ "grad_norm": 2.712480306625366,
1474
+ "learning_rate": 6.501663079632165e-07,
1475
+ "loss": 5.7511,
1476
+ "mean_token_accuracy": 0.26176382452249525,
1477
+ "num_tokens": 6617673.0,
1478
+ "step": 7100
1479
+ },
1480
+ {
1481
+ "entropy": 6.018770694732666,
1482
+ "epoch": 4.11629245826137,
1483
+ "grad_norm": 0.8292718529701233,
1484
+ "learning_rate": 6.403834865975347e-07,
1485
+ "loss": 5.8573,
1486
+ "mean_token_accuracy": 0.248506840467453,
1487
+ "num_tokens": 6664415.0,
1488
+ "step": 7150
1489
+ },
1490
+ {
1491
+ "entropy": 5.904297027587891,
1492
+ "epoch": 4.1450777202072535,
1493
+ "grad_norm": 0.6378379464149475,
1494
+ "learning_rate": 6.306006652318528e-07,
1495
+ "loss": 5.7486,
1496
+ "mean_token_accuracy": 0.26151282787323,
1497
+ "num_tokens": 6709826.0,
1498
+ "step": 7200
1499
+ },
1500
+ {
1501
+ "entropy": 5.967884268760681,
1502
+ "epoch": 4.173862982153137,
1503
+ "grad_norm": 1.2303566932678223,
1504
+ "learning_rate": 6.20817843866171e-07,
1505
+ "loss": 5.8065,
1506
+ "mean_token_accuracy": 0.2549690026044846,
1507
+ "num_tokens": 6756346.0,
1508
+ "step": 7250
1509
+ },
1510
+ {
1511
+ "entropy": 5.8746095514297485,
1512
+ "epoch": 4.202648244099021,
1513
+ "grad_norm": 1.3768641948699951,
1514
+ "learning_rate": 6.110350225004892e-07,
1515
+ "loss": 5.7161,
1516
+ "mean_token_accuracy": 0.2668989074230194,
1517
+ "num_tokens": 6801531.0,
1518
+ "step": 7300
1519
+ },
1520
+ {
1521
+ "entropy": 6.134030771255493,
1522
+ "epoch": 4.231433506044905,
1523
+ "grad_norm": 0.7624185085296631,
1524
+ "learning_rate": 6.012522011348072e-07,
1525
+ "loss": 5.9734,
1526
+ "mean_token_accuracy": 0.2334815075993538,
1527
+ "num_tokens": 6850251.0,
1528
+ "step": 7350
1529
+ },
1530
+ {
1531
+ "entropy": 6.01287202835083,
1532
+ "epoch": 4.260218767990788,
1533
+ "grad_norm": 1.4895133972167969,
1534
+ "learning_rate": 5.914693797691254e-07,
1535
+ "loss": 5.8533,
1536
+ "mean_token_accuracy": 0.24807787895202638,
1537
+ "num_tokens": 6896816.0,
1538
+ "step": 7400
1539
+ },
1540
+ {
1541
+ "entropy": 6.02873848438263,
1542
+ "epoch": 4.289004029936672,
1543
+ "grad_norm": 1.5440418720245361,
1544
+ "learning_rate": 5.816865584034435e-07,
1545
+ "loss": 5.8669,
1546
+ "mean_token_accuracy": 0.2474558174610138,
1547
+ "num_tokens": 6943702.0,
1548
+ "step": 7450
1549
+ },
1550
+ {
1551
+ "entropy": 6.010694708824158,
1552
+ "epoch": 4.317789291882556,
1553
+ "grad_norm": 2.088428258895874,
1554
+ "learning_rate": 5.719037370377617e-07,
1555
+ "loss": 5.8494,
1556
+ "mean_token_accuracy": 0.25013200104236605,
1557
+ "num_tokens": 6990895.0,
1558
+ "step": 7500
1559
+ },
1560
+ {
1561
+ "entropy": 6.087985677719116,
1562
+ "epoch": 4.3465745538284395,
1563
+ "grad_norm": 1.199644684791565,
1564
+ "learning_rate": 5.621209156720797e-07,
1565
+ "loss": 5.9273,
1566
+ "mean_token_accuracy": 0.23846659421920777,
1567
+ "num_tokens": 7039497.0,
1568
+ "step": 7550
1569
+ },
1570
+ {
1571
+ "entropy": 5.9547646045684814,
1572
+ "epoch": 4.375359815774323,
1573
+ "grad_norm": 1.8854912519454956,
1574
+ "learning_rate": 5.52338094306398e-07,
1575
+ "loss": 5.7951,
1576
+ "mean_token_accuracy": 0.25700003176927566,
1577
+ "num_tokens": 7084988.0,
1578
+ "step": 7600
1579
+ },
1580
+ {
1581
+ "entropy": 6.008062582015992,
1582
+ "epoch": 4.404145077720207,
1583
+ "grad_norm": 1.376185655593872,
1584
+ "learning_rate": 5.425552729407161e-07,
1585
+ "loss": 5.8469,
1586
+ "mean_token_accuracy": 0.24962662607431413,
1587
+ "num_tokens": 7131690.0,
1588
+ "step": 7650
1589
+ },
1590
+ {
1591
+ "entropy": 6.083430061340332,
1592
+ "epoch": 4.432930339666091,
1593
+ "grad_norm": 1.5763053894042969,
1594
+ "learning_rate": 5.327724515750343e-07,
1595
+ "loss": 5.9229,
1596
+ "mean_token_accuracy": 0.24034427106380463,
1597
+ "num_tokens": 7179874.0,
1598
+ "step": 7700
1599
+ },
1600
+ {
1601
+ "entropy": 5.94250883102417,
1602
+ "epoch": 4.461715601611974,
1603
+ "grad_norm": 1.1155059337615967,
1604
+ "learning_rate": 5.229896302093524e-07,
1605
+ "loss": 5.7841,
1606
+ "mean_token_accuracy": 0.25782077729701997,
1607
+ "num_tokens": 7225342.0,
1608
+ "step": 7750
1609
+ },
1610
+ {
1611
+ "entropy": 5.856548733711243,
1612
+ "epoch": 4.490500863557858,
1613
+ "grad_norm": 1.1634149551391602,
1614
+ "learning_rate": 5.132068088436705e-07,
1615
+ "loss": 5.6972,
1616
+ "mean_token_accuracy": 0.26918380439281464,
1617
+ "num_tokens": 7270136.0,
1618
+ "step": 7800
1619
+ },
1620
+ {
1621
+ "entropy": 5.866941246986389,
1622
+ "epoch": 4.519286125503742,
1623
+ "grad_norm": 1.6508464813232422,
1624
+ "learning_rate": 5.034239874779886e-07,
1625
+ "loss": 5.712,
1626
+ "mean_token_accuracy": 0.26652718901634215,
1627
+ "num_tokens": 7315462.0,
1628
+ "step": 7850
1629
+ },
1630
+ {
1631
+ "entropy": 6.119233846664429,
1632
+ "epoch": 4.5480713874496255,
1633
+ "grad_norm": 1.0165655612945557,
1634
+ "learning_rate": 4.936411661123068e-07,
1635
+ "loss": 5.9555,
1636
+ "mean_token_accuracy": 0.2348495191335678,
1637
+ "num_tokens": 7364611.0,
1638
+ "step": 7900
1639
+ },
1640
+ {
1641
+ "entropy": 6.090625686645508,
1642
+ "epoch": 4.576856649395509,
1643
+ "grad_norm": 0.7952129244804382,
1644
+ "learning_rate": 4.838583447466249e-07,
1645
+ "loss": 5.9278,
1646
+ "mean_token_accuracy": 0.23983514040708542,
1647
+ "num_tokens": 7412920.0,
1648
+ "step": 7950
1649
+ },
1650
+ {
1651
+ "entropy": 5.94470666885376,
1652
+ "epoch": 4.605641911341393,
1653
+ "grad_norm": 2.286240577697754,
1654
+ "learning_rate": 4.7407552338094304e-07,
1655
+ "loss": 5.7865,
1656
+ "mean_token_accuracy": 0.25735649168491365,
1657
+ "num_tokens": 7459505.0,
1658
+ "step": 8000
1659
+ },
1660
+ {
1661
+ "entropy": 6.177862458229065,
1662
+ "epoch": 4.634427173287277,
1663
+ "grad_norm": 2.1775429248809814,
1664
+ "learning_rate": 4.6429270201526114e-07,
1665
+ "loss": 6.012,
1666
+ "mean_token_accuracy": 0.22966912269592285,
1667
+ "num_tokens": 7508834.0,
1668
+ "step": 8050
1669
+ },
1670
+ {
1671
+ "entropy": 5.994407043457032,
1672
+ "epoch": 4.66321243523316,
1673
+ "grad_norm": 0.8207571506500244,
1674
+ "learning_rate": 4.545098806495793e-07,
1675
+ "loss": 5.8345,
1676
+ "mean_token_accuracy": 0.25109571874141695,
1677
+ "num_tokens": 7555981.0,
1678
+ "step": 8100
1679
+ },
1680
+ {
1681
+ "entropy": 5.867677879333496,
1682
+ "epoch": 4.691997697179044,
1683
+ "grad_norm": 2.5868327617645264,
1684
+ "learning_rate": 4.4472705928389745e-07,
1685
+ "loss": 5.7072,
1686
+ "mean_token_accuracy": 0.2670168370008469,
1687
+ "num_tokens": 7600529.0,
1688
+ "step": 8150
1689
+ },
1690
+ {
1691
+ "entropy": 5.971000475883484,
1692
+ "epoch": 4.720782959124928,
1693
+ "grad_norm": 1.0981251001358032,
1694
+ "learning_rate": 4.349442379182156e-07,
1695
+ "loss": 5.8118,
1696
+ "mean_token_accuracy": 0.25474150747060775,
1697
+ "num_tokens": 7646065.0,
1698
+ "step": 8200
1699
+ },
1700
+ {
1701
+ "entropy": 5.975438833236694,
1702
+ "epoch": 4.7495682210708114,
1703
+ "grad_norm": 1.0710279941558838,
1704
+ "learning_rate": 4.2516141655253376e-07,
1705
+ "loss": 5.8154,
1706
+ "mean_token_accuracy": 0.2538798648118973,
1707
+ "num_tokens": 7692863.0,
1708
+ "step": 8250
1709
+ },
1710
+ {
1711
+ "entropy": 6.000182151794434,
1712
+ "epoch": 4.778353483016695,
1713
+ "grad_norm": 0.7617666125297546,
1714
+ "learning_rate": 4.1537859518685186e-07,
1715
+ "loss": 5.8402,
1716
+ "mean_token_accuracy": 0.2507925814390182,
1717
+ "num_tokens": 7738799.0,
1718
+ "step": 8300
1719
+ },
1720
+ {
1721
+ "entropy": 5.834854488372803,
1722
+ "epoch": 4.807138744962579,
1723
+ "grad_norm": 2.2277169227600098,
1724
+ "learning_rate": 4.0559577382117e-07,
1725
+ "loss": 5.6785,
1726
+ "mean_token_accuracy": 0.2707058879733086,
1727
+ "num_tokens": 7782955.0,
1728
+ "step": 8350
1729
+ },
1730
+ {
1731
+ "entropy": 6.1768684530258176,
1732
+ "epoch": 4.835924006908463,
1733
+ "grad_norm": 0.7781999707221985,
1734
+ "learning_rate": 3.958129524554881e-07,
1735
+ "loss": 6.0105,
1736
+ "mean_token_accuracy": 0.22812994629144667,
1737
+ "num_tokens": 7832647.0,
1738
+ "step": 8400
1739
+ },
1740
+ {
1741
+ "entropy": 5.988465652465821,
1742
+ "epoch": 4.864709268854346,
1743
+ "grad_norm": 0.4961145222187042,
1744
+ "learning_rate": 3.860301310898063e-07,
1745
+ "loss": 5.8263,
1746
+ "mean_token_accuracy": 0.2520375117659569,
1747
+ "num_tokens": 7879636.0,
1748
+ "step": 8450
1749
+ },
1750
+ {
1751
+ "entropy": 6.066406717300415,
1752
+ "epoch": 4.89349453080023,
1753
+ "grad_norm": 1.1657921075820923,
1754
+ "learning_rate": 3.762473097241244e-07,
1755
+ "loss": 5.9054,
1756
+ "mean_token_accuracy": 0.2423809215426445,
1757
+ "num_tokens": 7927041.0,
1758
+ "step": 8500
1759
+ },
1760
+ {
1761
+ "entropy": 6.052256097793579,
1762
+ "epoch": 4.922279792746114,
1763
+ "grad_norm": 1.5335379838943481,
1764
+ "learning_rate": 3.664644883584426e-07,
1765
+ "loss": 5.8925,
1766
+ "mean_token_accuracy": 0.24372650146484376,
1767
+ "num_tokens": 7974219.0,
1768
+ "step": 8550
1769
+ },
1770
+ {
1771
+ "entropy": 5.889537000656128,
1772
+ "epoch": 4.951065054691997,
1773
+ "grad_norm": 1.3163872957229614,
1774
+ "learning_rate": 3.566816669927607e-07,
1775
+ "loss": 5.7296,
1776
+ "mean_token_accuracy": 0.26416114032268523,
1777
+ "num_tokens": 8018646.0,
1778
+ "step": 8600
1779
+ },
1780
+ {
1781
+ "entropy": 6.079032945632934,
1782
+ "epoch": 4.979850316637881,
1783
+ "grad_norm": 2.2353949546813965,
1784
+ "learning_rate": 3.4689884562707883e-07,
1785
+ "loss": 5.9181,
1786
+ "mean_token_accuracy": 0.2414929136633873,
1787
+ "num_tokens": 8066512.0,
1788
+ "step": 8650
1789
+ },
1790
+ {
1791
+ "epoch": 5.0,
1792
+ "eval_entropy": 6.274539154246106,
1793
+ "eval_loss": 6.1247029304504395,
1794
+ "eval_mean_token_accuracy": 0.21092273377328424,
1795
+ "eval_model_preparation_time": 0.0046,
1796
+ "eval_num_tokens": 8098595.0,
1797
+ "eval_runtime": 79.4734,
1798
+ "eval_samples_per_second": 5.461,
1799
+ "eval_steps_per_second": 2.73,
1800
+ "step": 8685
1801
+ },
1802
+ {
1803
+ "entropy": 5.957367534637451,
1804
+ "epoch": 5.008635578583765,
1805
+ "grad_norm": 0.680518388748169,
1806
+ "learning_rate": 3.37116024261397e-07,
1807
+ "loss": 5.7978,
1808
+ "mean_token_accuracy": 0.2565177664160728,
1809
+ "num_tokens": 8111610.0,
1810
+ "step": 8700
1811
+ },
1812
+ {
1813
+ "entropy": 5.974089093208313,
1814
+ "epoch": 5.037420840529649,
1815
+ "grad_norm": 2.036747932434082,
1816
+ "learning_rate": 3.273332028957151e-07,
1817
+ "loss": 5.8159,
1818
+ "mean_token_accuracy": 0.2531487289071083,
1819
+ "num_tokens": 8157841.0,
1820
+ "step": 8750
1821
+ },
1822
+ {
1823
+ "entropy": 5.812384562492371,
1824
+ "epoch": 5.066206102475532,
1825
+ "grad_norm": 1.5146092176437378,
1826
+ "learning_rate": 3.175503815300333e-07,
1827
+ "loss": 5.6566,
1828
+ "mean_token_accuracy": 0.27390810728073123,
1829
+ "num_tokens": 8201355.0,
1830
+ "step": 8800
1831
+ },
1832
+ {
1833
+ "entropy": 5.971576690673828,
1834
+ "epoch": 5.094991364421416,
1835
+ "grad_norm": 0.9713916778564453,
1836
+ "learning_rate": 3.077675601643514e-07,
1837
+ "loss": 5.8136,
1838
+ "mean_token_accuracy": 0.25402570873498914,
1839
+ "num_tokens": 8248348.0,
1840
+ "step": 8850
1841
+ },
1842
+ {
1843
+ "entropy": 5.940347299575806,
1844
+ "epoch": 5.1237766263673,
1845
+ "grad_norm": 1.1493933200836182,
1846
+ "learning_rate": 2.9798473879866954e-07,
1847
+ "loss": 5.7845,
1848
+ "mean_token_accuracy": 0.2574089586734772,
1849
+ "num_tokens": 8294009.0,
1850
+ "step": 8900
1851
+ },
1852
+ {
1853
+ "entropy": 5.9449573802948,
1854
+ "epoch": 5.152561888313183,
1855
+ "grad_norm": 1.647032618522644,
1856
+ "learning_rate": 2.8820191743298764e-07,
1857
+ "loss": 5.7871,
1858
+ "mean_token_accuracy": 0.25562890857458115,
1859
+ "num_tokens": 8340510.0,
1860
+ "step": 8950
1861
+ },
1862
+ {
1863
+ "entropy": 6.133461399078369,
1864
+ "epoch": 5.181347150259067,
1865
+ "grad_norm": 0.8065502047538757,
1866
+ "learning_rate": 2.784190960673058e-07,
1867
+ "loss": 5.969,
1868
+ "mean_token_accuracy": 0.23392125099897385,
1869
+ "num_tokens": 8390033.0,
1870
+ "step": 9000
1871
+ },
1872
+ {
1873
+ "entropy": 6.074811162948609,
1874
+ "epoch": 5.210132412204951,
1875
+ "grad_norm": 0.833552360534668,
1876
+ "learning_rate": 2.686362747016239e-07,
1877
+ "loss": 5.9127,
1878
+ "mean_token_accuracy": 0.241038781106472,
1879
+ "num_tokens": 8436914.0,
1880
+ "step": 9050
1881
+ },
1882
+ {
1883
+ "entropy": 5.771420574188232,
1884
+ "epoch": 5.2389176741508345,
1885
+ "grad_norm": 2.6249544620513916,
1886
+ "learning_rate": 2.5885345333594205e-07,
1887
+ "loss": 5.6153,
1888
+ "mean_token_accuracy": 0.27925612688064577,
1889
+ "num_tokens": 8481570.0,
1890
+ "step": 9100
1891
+ },
1892
+ {
1893
+ "entropy": 5.9740171718597415,
1894
+ "epoch": 5.267702936096718,
1895
+ "grad_norm": 1.484552025794983,
1896
+ "learning_rate": 2.490706319702602e-07,
1897
+ "loss": 5.8145,
1898
+ "mean_token_accuracy": 0.25337011635303497,
1899
+ "num_tokens": 8527697.0,
1900
+ "step": 9150
1901
+ },
1902
+ {
1903
+ "entropy": 6.084105367660523,
1904
+ "epoch": 5.296488198042602,
1905
+ "grad_norm": 1.5230190753936768,
1906
+ "learning_rate": 2.3928781060457836e-07,
1907
+ "loss": 5.9269,
1908
+ "mean_token_accuracy": 0.23908233702182768,
1909
+ "num_tokens": 8575933.0,
1910
+ "step": 9200
1911
+ },
1912
+ {
1913
+ "entropy": 5.878108925819397,
1914
+ "epoch": 5.325273459988486,
1915
+ "grad_norm": 1.7913310527801514,
1916
+ "learning_rate": 2.2950498923889649e-07,
1917
+ "loss": 5.7208,
1918
+ "mean_token_accuracy": 0.26579217702150343,
1919
+ "num_tokens": 8620214.0,
1920
+ "step": 9250
1921
+ },
1922
+ {
1923
+ "entropy": 6.043487319946289,
1924
+ "epoch": 5.354058721934369,
1925
+ "grad_norm": 1.173954963684082,
1926
+ "learning_rate": 2.1972216787321461e-07,
1927
+ "loss": 5.8806,
1928
+ "mean_token_accuracy": 0.24593402802944184,
1929
+ "num_tokens": 8668532.0,
1930
+ "step": 9300
1931
+ },
1932
+ {
1933
+ "entropy": 5.817543797492981,
1934
+ "epoch": 5.382843983880253,
1935
+ "grad_norm": 1.084602952003479,
1936
+ "learning_rate": 2.0993934650753277e-07,
1937
+ "loss": 5.6593,
1938
+ "mean_token_accuracy": 0.2739328667521477,
1939
+ "num_tokens": 8712689.0,
1940
+ "step": 9350
1941
+ },
1942
+ {
1943
+ "entropy": 6.036909718513488,
1944
+ "epoch": 5.411629245826137,
1945
+ "grad_norm": 1.4778636693954468,
1946
+ "learning_rate": 2.0015652514185092e-07,
1947
+ "loss": 5.8748,
1948
+ "mean_token_accuracy": 0.24605893224477768,
1949
+ "num_tokens": 8760376.0,
1950
+ "step": 9400
1951
+ },
1952
+ {
1953
+ "entropy": 6.050856218338013,
1954
+ "epoch": 5.4404145077720205,
1955
+ "grad_norm": 0.9816691875457764,
1956
+ "learning_rate": 1.9037370377616905e-07,
1957
+ "loss": 5.8885,
1958
+ "mean_token_accuracy": 0.2441025686264038,
1959
+ "num_tokens": 8808966.0,
1960
+ "step": 9450
1961
+ },
1962
+ {
1963
+ "entropy": 6.072160882949829,
1964
+ "epoch": 5.469199769717904,
1965
+ "grad_norm": 1.9699606895446777,
1966
+ "learning_rate": 1.8059088241048718e-07,
1967
+ "loss": 5.9096,
1968
+ "mean_token_accuracy": 0.24258128613233565,
1969
+ "num_tokens": 8856411.0,
1970
+ "step": 9500
1971
+ },
1972
+ {
1973
+ "entropy": 6.086222591400147,
1974
+ "epoch": 5.497985031663788,
1975
+ "grad_norm": 0.9021607637405396,
1976
+ "learning_rate": 1.708080610448053e-07,
1977
+ "loss": 5.9233,
1978
+ "mean_token_accuracy": 0.24022864073514938,
1979
+ "num_tokens": 8904412.0,
1980
+ "step": 9550
1981
+ },
1982
+ {
1983
+ "entropy": 5.965502681732178,
1984
+ "epoch": 5.526770293609672,
1985
+ "grad_norm": 1.0994197130203247,
1986
+ "learning_rate": 1.6102523967912346e-07,
1987
+ "loss": 5.8112,
1988
+ "mean_token_accuracy": 0.2534236097335815,
1989
+ "num_tokens": 8950118.0,
1990
+ "step": 9600
1991
+ },
1992
+ {
1993
+ "entropy": 5.795955166816712,
1994
+ "epoch": 5.555555555555555,
1995
+ "grad_norm": 1.9056462049484253,
1996
+ "learning_rate": 1.5124241831344158e-07,
1997
+ "loss": 5.6409,
1998
+ "mean_token_accuracy": 0.2760310146212578,
1999
+ "num_tokens": 8993094.0,
2000
+ "step": 9650
2001
+ },
2002
+ {
2003
+ "entropy": 5.992545394897461,
2004
+ "epoch": 5.584340817501439,
2005
+ "grad_norm": 1.1243247985839844,
2006
+ "learning_rate": 1.414595969477597e-07,
2007
+ "loss": 5.8337,
2008
+ "mean_token_accuracy": 0.25096620470285413,
2009
+ "num_tokens": 9039172.0,
2010
+ "step": 9700
2011
+ },
2012
+ {
2013
+ "entropy": 6.062644476890564,
2014
+ "epoch": 5.613126079447323,
2015
+ "grad_norm": 0.7900448441505432,
2016
+ "learning_rate": 1.3167677558207786e-07,
2017
+ "loss": 5.9029,
2018
+ "mean_token_accuracy": 0.2425614431500435,
2019
+ "num_tokens": 9087856.0,
2020
+ "step": 9750
2021
+ },
2022
+ {
2023
+ "entropy": 6.036457490921021,
2024
+ "epoch": 5.6419113413932065,
2025
+ "grad_norm": 17.477712631225586,
2026
+ "learning_rate": 1.2189395421639602e-07,
2027
+ "loss": 5.8757,
2028
+ "mean_token_accuracy": 0.24608026653528214,
2029
+ "num_tokens": 9134247.0,
2030
+ "step": 9800
2031
+ },
2032
+ {
2033
+ "entropy": 5.996092481613159,
2034
+ "epoch": 5.67069660333909,
2035
+ "grad_norm": 1.4077341556549072,
2036
+ "learning_rate": 1.1211113285071413e-07,
2037
+ "loss": 5.8395,
2038
+ "mean_token_accuracy": 0.250471707880497,
2039
+ "num_tokens": 9181486.0,
2040
+ "step": 9850
2041
+ },
2042
+ {
2043
+ "entropy": 6.087848567962647,
2044
+ "epoch": 5.699481865284974,
2045
+ "grad_norm": 1.3837510347366333,
2046
+ "learning_rate": 1.0232831148503227e-07,
2047
+ "loss": 5.9276,
2048
+ "mean_token_accuracy": 0.24019749820232392,
2049
+ "num_tokens": 9229355.0,
2050
+ "step": 9900
2051
+ },
2052
+ {
2053
+ "entropy": 6.053327779769898,
2054
+ "epoch": 5.728267127230858,
2055
+ "grad_norm": 1.0415576696395874,
2056
+ "learning_rate": 9.254549011935043e-08,
2057
+ "loss": 5.8926,
2058
+ "mean_token_accuracy": 0.24374084115028383,
2059
+ "num_tokens": 9276282.0,
2060
+ "step": 9950
2061
+ },
2062
+ {
2063
+ "entropy": 5.945247740745544,
2064
+ "epoch": 5.757052389176741,
2065
+ "grad_norm": 1.4692878723144531,
2066
+ "learning_rate": 8.276266875366855e-08,
2067
+ "loss": 5.79,
2068
+ "mean_token_accuracy": 0.2562396174669266,
2069
+ "num_tokens": 9322173.0,
2070
+ "step": 10000
2071
+ },
2072
+ {
2073
+ "entropy": 6.043243775367737,
2074
+ "epoch": 5.785837651122625,
2075
+ "grad_norm": 1.2986756563186646,
2076
+ "learning_rate": 7.29798473879867e-08,
2077
+ "loss": 5.8821,
2078
+ "mean_token_accuracy": 0.24506330251693725,
2079
+ "num_tokens": 9369917.0,
2080
+ "step": 10050
2081
+ },
2082
+ {
2083
+ "entropy": 6.070325479507447,
2084
+ "epoch": 5.814622913068509,
2085
+ "grad_norm": 2.0517385005950928,
2086
+ "learning_rate": 6.319702602230482e-08,
2087
+ "loss": 5.9093,
2088
+ "mean_token_accuracy": 0.24180745720863342,
2089
+ "num_tokens": 9417145.0,
2090
+ "step": 10100
2091
+ },
2092
+ {
2093
+ "entropy": 5.813769164085389,
2094
+ "epoch": 5.8434081750143925,
2095
+ "grad_norm": 1.4036266803741455,
2096
+ "learning_rate": 5.341420465662297e-08,
2097
+ "loss": 5.6553,
2098
+ "mean_token_accuracy": 0.27383983492851255,
2099
+ "num_tokens": 9462113.0,
2100
+ "step": 10150
2101
+ },
2102
+ {
2103
+ "entropy": 5.958855032920837,
2104
+ "epoch": 5.872193436960276,
2105
+ "grad_norm": 2.063570499420166,
2106
+ "learning_rate": 4.363138329094111e-08,
2107
+ "loss": 5.8008,
2108
+ "mean_token_accuracy": 0.2549652716517448,
2109
+ "num_tokens": 9507567.0,
2110
+ "step": 10200
2111
+ },
2112
+ {
2113
+ "entropy": 5.989646158218384,
2114
+ "epoch": 5.90097869890616,
2115
+ "grad_norm": 0.590006411075592,
2116
+ "learning_rate": 3.384856192525924e-08,
2117
+ "loss": 5.8287,
2118
+ "mean_token_accuracy": 0.2527298724651337,
2119
+ "num_tokens": 9554123.0,
2120
+ "step": 10250
2121
+ },
2122
+ {
2123
+ "entropy": 6.049757356643677,
2124
+ "epoch": 5.929763960852044,
2125
+ "grad_norm": 1.297501802444458,
2126
+ "learning_rate": 2.4065740559577383e-08,
2127
+ "loss": 5.8895,
2128
+ "mean_token_accuracy": 0.24449419289827345,
2129
+ "num_tokens": 9600615.0,
2130
+ "step": 10300
2131
+ },
2132
+ {
2133
+ "entropy": 6.052498106956482,
2134
+ "epoch": 5.958549222797927,
2135
+ "grad_norm": 1.1880056858062744,
2136
+ "learning_rate": 1.4282919193895518e-08,
2137
+ "loss": 5.8922,
2138
+ "mean_token_accuracy": 0.24352416545152664,
2139
+ "num_tokens": 9648757.0,
2140
+ "step": 10350
2141
+ },
2142
+ {
2143
+ "entropy": 6.125533571243286,
2144
+ "epoch": 5.987334484743811,
2145
+ "grad_norm": 1.0961848497390747,
2146
+ "learning_rate": 4.500097828213657e-09,
2147
+ "loss": 5.961,
2148
+ "mean_token_accuracy": 0.23481910437345505,
2149
+ "num_tokens": 9696666.0,
2150
+ "step": 10400
2151
+ },
2152
+ {
2153
+ "epoch": 6.0,
2154
+ "eval_entropy": 6.273049378724692,
2155
+ "eval_loss": 6.123514175415039,
2156
+ "eval_mean_token_accuracy": 0.21104606045281282,
2157
+ "eval_model_preparation_time": 0.0046,
2158
+ "eval_num_tokens": 9718314.0,
2159
+ "eval_runtime": 79.5368,
2160
+ "eval_samples_per_second": 5.457,
2161
+ "eval_steps_per_second": 2.728,
2162
+ "step": 10422
2163
+ }
2164
+ ],
2165
+ "logging_steps": 50,
2166
+ "max_steps": 10422,
2167
+ "num_input_tokens_seen": 0,
2168
+ "num_train_epochs": 6,
2169
+ "save_steps": 500,
2170
+ "stateful_callbacks": {
2171
+ "TrainerControl": {
2172
+ "args": {
2173
+ "should_epoch_stop": false,
2174
+ "should_evaluate": false,
2175
+ "should_log": false,
2176
+ "should_save": true,
2177
+ "should_training_stop": true
2178
+ },
2179
+ "attributes": {}
2180
+ }
2181
+ },
2182
+ "total_flos": 1.3590670746786202e+17,
2183
+ "train_batch_size": 2,
2184
+ "trial_name": null,
2185
+ "trial_params": null
2186
+ }
checkpoint-10422/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1adcf89e2ce7be508f245e87af0fd5d93d0d2e7562c07049b800c8d5a6b1822e
3
+ size 6225
checkpoint-10422/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1737/adapter_config.json CHANGED
@@ -16,7 +16,7 @@
16
  "layers_pattern": null,
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
- "lora_alpha": 16,
20
  "lora_bias": false,
21
  "lora_dropout": 0.1,
22
  "megatron_config": null,
@@ -25,12 +25,14 @@
25
  "peft_type": "LORA",
26
  "peft_version": "0.18.0",
27
  "qalora_group_size": 16,
28
- "r": 8,
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
  "q_proj",
33
- "v_proj"
 
 
34
  ],
35
  "target_parameters": null,
36
  "task_type": "CAUSAL_LM",
 
16
  "layers_pattern": null,
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
+ "lora_alpha": 32,
20
  "lora_bias": false,
21
  "lora_dropout": 0.1,
22
  "megatron_config": null,
 
25
  "peft_type": "LORA",
26
  "peft_version": "0.18.0",
27
  "qalora_group_size": 16,
28
+ "r": 24,
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
  "q_proj",
33
+ "k_proj",
34
+ "v_proj",
35
+ "o_proj"
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
checkpoint-1737/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b5f4b9708eccf0370f9aaa1466d17c487ab3a9e4e84732d5cd39bbd229aedd5c
3
- size 4374520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca36c29cabd2e8ea449e6eadcd7f7db9042e00cae52ef5b042c56b58c200775a
3
+ size 26182176
checkpoint-1737/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:84ee821de3d805218a80046b08a325803a2434e306b554e094f68548e53fbe41
3
- size 8783179
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8fab12e7cc07b84cfe33ab9be36e25b4dfa882f0ac9e6725dfb7608859ec3a87
3
+ size 52486155
checkpoint-1737/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4e816ab59bde4778d4f30814a9146abbd7044e1640b72b0be4234c4aa55b98f1
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac7868bb5d10a59d1042ca17d4fc89dc5beddcdf6df99c035480579667b84b19
3
  size 14645
checkpoint-1737/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f9121f4d6a6f445ab467d2762de7c0b86cf7fef9179d9273d56797386ca47712
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c17d5ce4845692098064761cc4c713c4686c6a262dcb4177eea65f272ed234c
3
  size 1465
checkpoint-1737/trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "best_global_step": 1737,
3
- "best_metric": 6.15173864364624,
4
  "best_model_checkpoint": "./output/checkpoint-1737",
5
  "epoch": 1.0,
6
  "eval_steps": 500,
@@ -10,362 +10,362 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 3.864118957519531,
14
  "epoch": 0.028785261945883708,
15
- "grad_norm": 2.7545533180236816,
16
- "learning_rate": 9.800000000000001e-06,
17
- "loss": 15.2997,
18
- "mean_token_accuracy": 0.10086015284061432,
19
- "num_tokens": 47319.0,
20
  "step": 50
21
  },
22
  {
23
- "entropy": 4.047076859474182,
24
  "epoch": 0.057570523891767415,
25
- "grad_norm": 5.0328264236450195,
26
- "learning_rate": 1.98e-05,
27
- "loss": 15.3264,
28
- "mean_token_accuracy": 0.09582207053899765,
29
- "num_tokens": 96809.0,
30
  "step": 100
31
  },
32
  {
33
- "entropy": 4.7578076648712155,
34
  "epoch": 0.08635578583765112,
35
- "grad_norm": 38.50589370727539,
36
- "learning_rate": 1.988584740827024e-05,
37
- "loss": 13.0056,
38
- "mean_token_accuracy": 0.126854517608881,
39
- "num_tokens": 139962.0,
40
  "step": 150
41
  },
42
  {
43
- "entropy": 6.80673882484436,
44
  "epoch": 0.11514104778353483,
45
- "grad_norm": 12.030129432678223,
46
- "learning_rate": 1.97693651718113e-05,
47
- "loss": 9.2822,
48
- "mean_token_accuracy": 0.11084575355052947,
49
- "num_tokens": 188029.0,
50
  "step": 200
51
  },
52
  {
53
- "entropy": 7.177925786972046,
54
  "epoch": 0.14392630972941853,
55
- "grad_norm": 4.852536201477051,
56
- "learning_rate": 1.965288293535236e-05,
57
- "loss": 7.6333,
58
- "mean_token_accuracy": 0.12398939326405525,
59
- "num_tokens": 234425.0,
60
  "step": 250
61
  },
62
  {
63
- "entropy": 7.080496473312378,
64
  "epoch": 0.17271157167530224,
65
- "grad_norm": 4.10841178894043,
66
- "learning_rate": 1.9536400698893422e-05,
67
- "loss": 7.1632,
68
- "mean_token_accuracy": 0.13563686355948448,
69
- "num_tokens": 278885.0,
70
  "step": 300
71
  },
72
  {
73
- "entropy": 6.931579580307007,
74
  "epoch": 0.20149683362118595,
75
- "grad_norm": 14.636048316955566,
76
- "learning_rate": 1.941991846243448e-05,
77
- "loss": 6.8213,
78
- "mean_token_accuracy": 0.16459846690297128,
79
- "num_tokens": 325491.0,
80
  "step": 350
81
  },
82
  {
83
- "entropy": 6.853660764694214,
84
  "epoch": 0.23028209556706966,
85
- "grad_norm": 5.966708183288574,
86
- "learning_rate": 1.930343622597554e-05,
87
- "loss": 6.6625,
88
- "mean_token_accuracy": 0.17670693069696428,
89
- "num_tokens": 372913.0,
90
  "step": 400
91
  },
92
  {
93
- "entropy": 6.684267387390137,
94
  "epoch": 0.25906735751295334,
95
- "grad_norm": 4.031010627746582,
96
- "learning_rate": 1.91869539895166e-05,
97
- "loss": 6.4505,
98
- "mean_token_accuracy": 0.1943434515595436,
99
- "num_tokens": 419159.0,
100
  "step": 450
101
  },
102
  {
103
- "entropy": 6.679989137649536,
104
  "epoch": 0.28785261945883706,
105
- "grad_norm": 6.251070022583008,
106
- "learning_rate": 1.907047175305766e-05,
107
- "loss": 6.4314,
108
- "mean_token_accuracy": 0.19514557600021362,
109
- "num_tokens": 466994.0,
110
  "step": 500
111
  },
112
  {
113
- "entropy": 6.477229623794556,
114
  "epoch": 0.31663788140472077,
115
- "grad_norm": 3.8656675815582275,
116
- "learning_rate": 1.895398951659872e-05,
117
- "loss": 6.2139,
118
- "mean_token_accuracy": 0.21764743447303772,
119
- "num_tokens": 513308.0,
120
  "step": 550
121
  },
122
  {
123
- "entropy": 6.408129243850708,
124
  "epoch": 0.3454231433506045,
125
- "grad_norm": 8.688581466674805,
126
- "learning_rate": 1.883750728013978e-05,
127
- "loss": 6.1224,
128
- "mean_token_accuracy": 0.23438037544488907,
129
- "num_tokens": 559679.0,
130
  "step": 600
131
  },
132
  {
133
- "entropy": 6.128518767356873,
134
  "epoch": 0.3742084052964882,
135
- "grad_norm": 5.419503688812256,
136
- "learning_rate": 1.872102504368084e-05,
137
- "loss": 5.8692,
138
- "mean_token_accuracy": 0.26634690463542937,
139
- "num_tokens": 603140.0,
140
  "step": 650
141
  },
142
  {
143
- "entropy": 6.322700729370117,
144
  "epoch": 0.4029936672423719,
145
- "grad_norm": 2.2213082313537598,
146
- "learning_rate": 1.86045428072219e-05,
147
- "loss": 6.0717,
148
- "mean_token_accuracy": 0.24038562417030335,
149
- "num_tokens": 650179.0,
150
  "step": 700
151
  },
152
  {
153
- "entropy": 6.236415157318115,
154
  "epoch": 0.4317789291882556,
155
- "grad_norm": 4.804980278015137,
156
- "learning_rate": 1.848806057076296e-05,
157
- "loss": 5.9986,
158
- "mean_token_accuracy": 0.24596781462430953,
159
- "num_tokens": 696220.0,
160
  "step": 750
161
  },
162
  {
163
- "entropy": 6.269758443832398,
164
  "epoch": 0.4605641911341393,
165
- "grad_norm": 2.2888853549957275,
166
- "learning_rate": 1.837157833430402e-05,
167
- "loss": 6.0385,
168
- "mean_token_accuracy": 0.24074893474578857,
169
- "num_tokens": 743909.0,
170
  "step": 800
171
  },
172
  {
173
- "entropy": 6.270364007949829,
174
  "epoch": 0.48934945308002303,
175
- "grad_norm": 3.0903279781341553,
176
- "learning_rate": 1.825509609784508e-05,
177
- "loss": 6.0481,
178
- "mean_token_accuracy": 0.23740622967481614,
179
- "num_tokens": 792015.0,
180
  "step": 850
181
  },
182
  {
183
- "entropy": 6.3037636184692385,
184
  "epoch": 0.5181347150259067,
185
- "grad_norm": 3.969320058822632,
186
- "learning_rate": 1.813861386138614e-05,
187
- "loss": 6.0855,
188
- "mean_token_accuracy": 0.2309597587585449,
189
- "num_tokens": 841802.0,
190
  "step": 900
191
  },
192
  {
193
- "entropy": 6.038041458129883,
194
  "epoch": 0.5469199769717904,
195
- "grad_norm": 2.2712185382843018,
196
- "learning_rate": 1.80221316249272e-05,
197
- "loss": 5.8285,
198
- "mean_token_accuracy": 0.26099125802516937,
199
- "num_tokens": 886492.0,
200
  "step": 950
201
  },
202
  {
203
- "entropy": 6.142958383560181,
204
  "epoch": 0.5757052389176741,
205
- "grad_norm": 1.2311755418777466,
206
- "learning_rate": 1.790564938846826e-05,
207
- "loss": 5.9357,
208
- "mean_token_accuracy": 0.24810438305139543,
209
- "num_tokens": 932807.0,
210
  "step": 1000
211
  },
212
  {
213
- "entropy": 6.199834351539612,
214
  "epoch": 0.6044905008635578,
215
- "grad_norm": 2.2788379192352295,
216
- "learning_rate": 1.7789167152009318e-05,
217
- "loss": 5.9964,
218
- "mean_token_accuracy": 0.23942562609910964,
219
- "num_tokens": 980541.0,
220
  "step": 1050
221
  },
222
  {
223
- "entropy": 5.961639919281006,
224
  "epoch": 0.6332757628094415,
225
- "grad_norm": 1.9077532291412354,
226
- "learning_rate": 1.767268491555038e-05,
227
- "loss": 5.7664,
228
- "mean_token_accuracy": 0.26718012750148773,
229
- "num_tokens": 1023882.0,
230
  "step": 1100
231
  },
232
  {
233
- "entropy": 5.889280087947846,
234
  "epoch": 0.6620610247553252,
235
- "grad_norm": 2.4254891872406006,
236
- "learning_rate": 1.7556202679091442e-05,
237
- "loss": 5.6952,
238
- "mean_token_accuracy": 0.27529804170131683,
239
- "num_tokens": 1068300.0,
240
  "step": 1150
241
  },
242
  {
243
- "entropy": 6.085640063285828,
244
  "epoch": 0.690846286701209,
245
- "grad_norm": 2.35312557220459,
246
- "learning_rate": 1.74397204426325e-05,
247
- "loss": 5.8898,
248
- "mean_token_accuracy": 0.25166562348604204,
249
- "num_tokens": 1115425.0,
250
  "step": 1200
251
  },
252
  {
253
- "entropy": 6.146574058532715,
254
  "epoch": 0.7196315486470927,
255
- "grad_norm": 1.7730146646499634,
256
- "learning_rate": 1.732323820617356e-05,
257
- "loss": 5.9519,
258
- "mean_token_accuracy": 0.24276195973157882,
259
- "num_tokens": 1162319.0,
260
  "step": 1250
261
  },
262
  {
263
- "entropy": 6.079372715950012,
264
  "epoch": 0.7484168105929764,
265
- "grad_norm": 1.7070863246917725,
266
- "learning_rate": 1.720675596971462e-05,
267
- "loss": 5.8922,
268
- "mean_token_accuracy": 0.24961524546146394,
269
- "num_tokens": 1208230.0,
270
  "step": 1300
271
  },
272
  {
273
- "entropy": 5.9683656406402585,
274
  "epoch": 0.7772020725388601,
275
- "grad_norm": 1.8790594339370728,
276
- "learning_rate": 1.709027373325568e-05,
277
- "loss": 5.7827,
278
- "mean_token_accuracy": 0.2632122594118118,
279
- "num_tokens": 1253074.0,
280
  "step": 1350
281
  },
282
  {
283
- "entropy": 6.107076721191406,
284
  "epoch": 0.8059873344847438,
285
- "grad_norm": 1.1745644807815552,
286
- "learning_rate": 1.6973791496796742e-05,
287
- "loss": 5.9211,
288
- "mean_token_accuracy": 0.24564073830842972,
289
- "num_tokens": 1300179.0,
290
  "step": 1400
291
  },
292
  {
293
- "entropy": 6.141328382492065,
294
  "epoch": 0.8347725964306275,
295
- "grad_norm": 1.0346958637237549,
296
- "learning_rate": 1.68573092603378e-05,
297
- "loss": 5.9584,
298
- "mean_token_accuracy": 0.23997059136629104,
299
- "num_tokens": 1347539.0,
300
  "step": 1450
301
  },
302
  {
303
- "entropy": 6.070010099411011,
304
  "epoch": 0.8635578583765112,
305
- "grad_norm": 1.6541163921356201,
306
- "learning_rate": 1.674082702387886e-05,
307
- "loss": 5.889,
308
- "mean_token_accuracy": 0.24875166177749633,
309
- "num_tokens": 1394157.0,
310
  "step": 1500
311
  },
312
  {
313
- "entropy": 6.207450666427612,
314
  "epoch": 0.8923431203223949,
315
- "grad_norm": 0.9742990732192993,
316
- "learning_rate": 1.662434478741992e-05,
317
- "loss": 6.0217,
318
- "mean_token_accuracy": 0.23067249596118927,
319
- "num_tokens": 1443892.0,
320
  "step": 1550
321
  },
322
  {
323
- "entropy": 6.026197805404663,
324
  "epoch": 0.9211283822682786,
325
- "grad_norm": 1.4229531288146973,
326
- "learning_rate": 1.650786255096098e-05,
327
- "loss": 5.8455,
328
- "mean_token_accuracy": 0.2537291014194489,
329
- "num_tokens": 1491050.0,
330
  "step": 1600
331
  },
332
  {
333
- "entropy": 6.210526428222656,
334
  "epoch": 0.9499136442141624,
335
- "grad_norm": 1.3555018901824951,
336
- "learning_rate": 1.6391380314502038e-05,
337
- "loss": 6.0279,
338
- "mean_token_accuracy": 0.2308420208096504,
339
- "num_tokens": 1540809.0,
340
  "step": 1650
341
  },
342
  {
343
- "entropy": 5.9872834014892575,
344
  "epoch": 0.9786989061600461,
345
- "grad_norm": 0.9893498420715332,
346
- "learning_rate": 1.62748980780431e-05,
347
- "loss": 5.8137,
348
- "mean_token_accuracy": 0.2566875320672989,
349
- "num_tokens": 1585876.0,
350
  "step": 1700
351
  },
352
  {
353
  "epoch": 1.0,
354
- "eval_entropy": 6.322207130045386,
355
- "eval_loss": 6.15173864364624,
356
- "eval_mean_token_accuracy": 0.21116007946877985,
357
- "eval_model_preparation_time": 0.0036,
358
- "eval_num_tokens": 1619719.0,
359
- "eval_runtime": 76.1297,
360
- "eval_samples_per_second": 5.701,
361
- "eval_steps_per_second": 2.85,
362
  "step": 1737
363
  }
364
  ],
365
  "logging_steps": 50,
366
- "max_steps": 8685,
367
  "num_input_tokens_seen": 0,
368
- "num_train_epochs": 5,
369
  "save_steps": 500,
370
  "stateful_callbacks": {
371
  "TrainerControl": {
@@ -379,7 +379,7 @@
379
  "attributes": {}
380
  }
381
  },
382
- "total_flos": 2.265889302609408e+16,
383
  "train_batch_size": 2,
384
  "trial_name": null,
385
  "trial_params": null
 
1
  {
2
  "best_global_step": 1737,
3
+ "best_metric": 5.861395835876465,
4
  "best_model_checkpoint": "./output/checkpoint-1737",
5
  "epoch": 1.0,
6
  "eval_steps": 500,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 3.6583470726013183,
14
  "epoch": 0.028785261945883708,
15
+ "grad_norm": 3.3817152976989746,
16
+ "learning_rate": 4.9e-07,
17
+ "loss": 13.8754,
18
+ "mean_token_accuracy": 0.15036460414528846,
19
+ "num_tokens": 53093.0,
20
  "step": 50
21
  },
22
  {
23
+ "entropy": 3.669608063697815,
24
  "epoch": 0.057570523891767415,
25
+ "grad_norm": 3.2541544437408447,
26
+ "learning_rate": 9.9e-07,
27
+ "loss": 14.2282,
28
+ "mean_token_accuracy": 0.14137721598148345,
29
+ "num_tokens": 108334.0,
30
  "step": 100
31
  },
32
  {
33
+ "entropy": 3.569736371040344,
34
  "epoch": 0.08635578583765112,
35
+ "grad_norm": 3.6797454357147217,
36
+ "learning_rate": 1.49e-06,
37
+ "loss": 13.0735,
38
+ "mean_token_accuracy": 0.17473630651831626,
39
+ "num_tokens": 157491.0,
40
  "step": 150
41
  },
42
  {
43
+ "entropy": 3.7253233194351196,
44
  "epoch": 0.11514104778353483,
45
+ "grad_norm": 4.297911643981934,
46
+ "learning_rate": 1.99e-06,
47
+ "loss": 13.7392,
48
+ "mean_token_accuracy": 0.1473099772632122,
49
+ "num_tokens": 211394.0,
50
  "step": 200
51
  },
52
  {
53
+ "entropy": 3.8280500602722167,
54
  "epoch": 0.14392630972941853,
55
+ "grad_norm": 4.405268669128418,
56
+ "learning_rate": 1.9854771784232364e-06,
57
+ "loss": 13.0797,
58
+ "mean_token_accuracy": 0.16704789966344832,
59
+ "num_tokens": 263685.0,
60
  "step": 250
61
  },
62
  {
63
+ "entropy": 4.066333084106446,
64
  "epoch": 0.17271157167530224,
65
+ "grad_norm": 4.757556438446045,
66
+ "learning_rate": 1.9706579727326615e-06,
67
+ "loss": 12.6321,
68
+ "mean_token_accuracy": 0.1691790708899498,
69
+ "num_tokens": 314059.0,
70
  "step": 300
71
  },
72
  {
73
+ "entropy": 4.257266030311585,
74
  "epoch": 0.20149683362118595,
75
+ "grad_norm": 6.406249523162842,
76
+ "learning_rate": 1.955838767042086e-06,
77
+ "loss": 12.2253,
78
+ "mean_token_accuracy": 0.17223650276660918,
79
+ "num_tokens": 367038.0,
80
  "step": 350
81
  },
82
  {
83
+ "entropy": 4.694105777740479,
84
  "epoch": 0.23028209556706966,
85
+ "grad_norm": 12.57987117767334,
86
+ "learning_rate": 1.9410195613515113e-06,
87
+ "loss": 11.9714,
88
+ "mean_token_accuracy": 0.15997304677963256,
89
+ "num_tokens": 420327.0,
90
  "step": 400
91
  },
92
  {
93
+ "entropy": 5.205010280609131,
94
  "epoch": 0.25906735751295334,
95
+ "grad_norm": 15.570313453674316,
96
+ "learning_rate": 1.9262003556609364e-06,
97
+ "loss": 10.8173,
98
+ "mean_token_accuracy": 0.16447648257017136,
99
+ "num_tokens": 472429.0,
100
  "step": 450
101
  },
102
  {
103
+ "entropy": 5.917805089950561,
104
  "epoch": 0.28785261945883706,
105
+ "grad_norm": 23.61503791809082,
106
+ "learning_rate": 1.9113811499703615e-06,
107
+ "loss": 9.3196,
108
+ "mean_token_accuracy": 0.16179455041885377,
109
+ "num_tokens": 526315.0,
110
  "step": 500
111
  },
112
  {
113
+ "entropy": 6.380368332862854,
114
  "epoch": 0.31663788140472077,
115
+ "grad_norm": 13.846810340881348,
116
+ "learning_rate": 1.8965619442797864e-06,
117
+ "loss": 7.9636,
118
+ "mean_token_accuracy": 0.16881170988082886,
119
+ "num_tokens": 578511.0,
120
  "step": 550
121
  },
122
  {
123
+ "entropy": 6.507339992523193,
124
  "epoch": 0.3454231433506045,
125
+ "grad_norm": 4.569090366363525,
126
+ "learning_rate": 1.8817427385892115e-06,
127
+ "loss": 7.4171,
128
+ "mean_token_accuracy": 0.16941152423620223,
129
+ "num_tokens": 630937.0,
130
  "step": 600
131
  },
132
  {
133
+ "entropy": 6.392864561080932,
134
  "epoch": 0.3742084052964882,
135
+ "grad_norm": 4.594696521759033,
136
+ "learning_rate": 1.8669235328986366e-06,
137
+ "loss": 6.9389,
138
+ "mean_token_accuracy": 0.1844496901333332,
139
+ "num_tokens": 680501.0,
140
  "step": 650
141
  },
142
  {
143
+ "entropy": 6.6726202869415285,
144
  "epoch": 0.4029936672423719,
145
+ "grad_norm": 4.768734931945801,
146
+ "learning_rate": 1.8521043272080617e-06,
147
+ "loss": 6.9818,
148
+ "mean_token_accuracy": 0.16990411713719367,
149
+ "num_tokens": 733231.0,
150
  "step": 700
151
  },
152
  {
153
+ "entropy": 6.592793455123902,
154
  "epoch": 0.4317789291882556,
155
+ "grad_norm": 3.253056764602661,
156
+ "learning_rate": 1.8372851215174864e-06,
157
+ "loss": 6.7105,
158
+ "mean_token_accuracy": 0.18250102579593658,
159
+ "num_tokens": 785373.0,
160
  "step": 750
161
  },
162
  {
163
+ "entropy": 6.683582029342651,
164
  "epoch": 0.4605641911341393,
165
+ "grad_norm": 2.1871063709259033,
166
+ "learning_rate": 1.8224659158269115e-06,
167
+ "loss": 6.6685,
168
+ "mean_token_accuracy": 0.17129646152257919,
169
+ "num_tokens": 838646.0,
170
  "step": 800
171
  },
172
  {
173
+ "entropy": 6.636875295639038,
174
  "epoch": 0.48934945308002303,
175
+ "grad_norm": 3.2284677028656006,
176
+ "learning_rate": 1.8076467101363366e-06,
177
+ "loss": 6.53,
178
+ "mean_token_accuracy": 0.18053789794445038,
179
+ "num_tokens": 892380.0,
180
  "step": 850
181
  },
182
  {
183
+ "entropy": 6.610673260688782,
184
  "epoch": 0.5181347150259067,
185
+ "grad_norm": 2.2088730335235596,
186
+ "learning_rate": 1.7928275044457617e-06,
187
+ "loss": 6.4429,
188
+ "mean_token_accuracy": 0.18492739230394364,
189
+ "num_tokens": 947971.0,
190
  "step": 900
191
  },
192
  {
193
+ "entropy": 6.242899022102356,
194
  "epoch": 0.5469199769717904,
195
+ "grad_norm": 2.3000030517578125,
196
+ "learning_rate": 1.7780082987551866e-06,
197
+ "loss": 6.047,
198
+ "mean_token_accuracy": 0.2291259828209877,
199
+ "num_tokens": 998810.0,
200
  "step": 950
201
  },
202
  {
203
+ "entropy": 6.311488924026489,
204
  "epoch": 0.5757052389176741,
205
+ "grad_norm": 2.1333675384521484,
206
+ "learning_rate": 1.7631890930646115e-06,
207
+ "loss": 6.0919,
208
+ "mean_token_accuracy": 0.22644571751356124,
209
+ "num_tokens": 1050860.0,
210
  "step": 1000
211
  },
212
  {
213
+ "entropy": 6.3254336166381835,
214
  "epoch": 0.6044905008635578,
215
+ "grad_norm": 2.0400779247283936,
216
+ "learning_rate": 1.7483698873740366e-06,
217
+ "loss": 6.094,
218
+ "mean_token_accuracy": 0.2222653564810753,
219
+ "num_tokens": 1104304.0,
220
  "step": 1050
221
  },
222
  {
223
+ "entropy": 6.046922063827514,
224
  "epoch": 0.6332757628094415,
225
+ "grad_norm": 2.8049051761627197,
226
+ "learning_rate": 1.7335506816834617e-06,
227
+ "loss": 5.8011,
228
+ "mean_token_accuracy": 0.25127078920602797,
229
+ "num_tokens": 1153605.0,
230
  "step": 1100
231
  },
232
  {
233
+ "entropy": 5.943600912094116,
234
  "epoch": 0.6620610247553252,
235
+ "grad_norm": 4.063963890075684,
236
+ "learning_rate": 1.7187314759928866e-06,
237
+ "loss": 5.6855,
238
+ "mean_token_accuracy": 0.26265266716480257,
239
+ "num_tokens": 1204328.0,
240
  "step": 1150
241
  },
242
  {
243
+ "entropy": 6.12883231639862,
244
  "epoch": 0.690846286701209,
245
+ "grad_norm": 3.9440460205078125,
246
+ "learning_rate": 1.7039122703023117e-06,
247
+ "loss": 5.8578,
248
+ "mean_token_accuracy": 0.24439335912466048,
249
+ "num_tokens": 1257415.0,
250
  "step": 1200
251
  },
252
  {
253
+ "entropy": 6.164987115859986,
254
  "epoch": 0.7196315486470927,
255
+ "grad_norm": 3.20070481300354,
256
+ "learning_rate": 1.6890930646117368e-06,
257
+ "loss": 5.8876,
258
+ "mean_token_accuracy": 0.24275501281023026,
259
+ "num_tokens": 1310049.0,
260
  "step": 1250
261
  },
262
  {
263
+ "entropy": 6.080997190475464,
264
  "epoch": 0.7484168105929764,
265
+ "grad_norm": 2.8067362308502197,
266
+ "learning_rate": 1.6742738589211617e-06,
267
+ "loss": 5.8058,
268
+ "mean_token_accuracy": 0.25242207854986193,
269
+ "num_tokens": 1361794.0,
270
  "step": 1300
271
  },
272
  {
273
+ "entropy": 5.940848155021667,
274
  "epoch": 0.7772020725388601,
275
+ "grad_norm": 2.6375925540924072,
276
+ "learning_rate": 1.6594546532305868e-06,
277
+ "loss": 5.6718,
278
+ "mean_token_accuracy": 0.2665082859992981,
279
+ "num_tokens": 1412773.0,
280
  "step": 1350
281
  },
282
  {
283
+ "entropy": 6.071129274368286,
284
  "epoch": 0.8059873344847438,
285
+ "grad_norm": 3.951350212097168,
286
+ "learning_rate": 1.6446354475400117e-06,
287
+ "loss": 5.8012,
288
+ "mean_token_accuracy": 0.25434976994991304,
289
+ "num_tokens": 1465620.0,
290
  "step": 1400
291
  },
292
  {
293
+ "entropy": 6.069429359436035,
294
  "epoch": 0.8347725964306275,
295
+ "grad_norm": 3.580608606338501,
296
+ "learning_rate": 1.6298162418494368e-06,
297
+ "loss": 5.8027,
298
+ "mean_token_accuracy": 0.25208072274923327,
299
+ "num_tokens": 1518899.0,
300
  "step": 1450
301
  },
302
  {
303
+ "entropy": 6.005315380096436,
304
  "epoch": 0.8635578583765112,
305
+ "grad_norm": 3.9580376148223877,
306
+ "learning_rate": 1.614997036158862e-06,
307
+ "loss": 5.7364,
308
+ "mean_token_accuracy": 0.25940640360116957,
309
+ "num_tokens": 1571304.0,
310
  "step": 1500
311
  },
312
  {
313
+ "entropy": 6.0786464881896975,
314
  "epoch": 0.8923431203223949,
315
+ "grad_norm": 4.55721378326416,
316
+ "learning_rate": 1.6001778304682868e-06,
317
+ "loss": 5.8092,
318
+ "mean_token_accuracy": 0.2496869170665741,
319
+ "num_tokens": 1627369.0,
320
  "step": 1550
321
  },
322
  {
323
+ "entropy": 5.939382014274597,
324
  "epoch": 0.9211283822682786,
325
+ "grad_norm": 2.330057144165039,
326
+ "learning_rate": 1.5853586247777117e-06,
327
+ "loss": 5.6604,
328
+ "mean_token_accuracy": 0.2686630353331566,
329
+ "num_tokens": 1680401.0,
330
  "step": 1600
331
  },
332
  {
333
+ "entropy": 6.121775646209716,
334
  "epoch": 0.9499136442141624,
335
+ "grad_norm": 2.9881200790405273,
336
+ "learning_rate": 1.5705394190871368e-06,
337
+ "loss": 5.8388,
338
+ "mean_token_accuracy": 0.2503683388233185,
339
+ "num_tokens": 1735745.0,
340
  "step": 1650
341
  },
342
  {
343
+ "entropy": 5.840040788650513,
344
  "epoch": 0.9786989061600461,
345
+ "grad_norm": 3.798994779586792,
346
+ "learning_rate": 1.555720213396562e-06,
347
+ "loss": 5.5635,
348
+ "mean_token_accuracy": 0.278279125392437,
349
+ "num_tokens": 1786896.0,
350
  "step": 1700
351
  },
352
  {
353
  "epoch": 1.0,
354
+ "eval_entropy": 6.139133475343203,
355
+ "eval_loss": 5.861395835876465,
356
+ "eval_mean_token_accuracy": 0.2402858340657801,
357
+ "eval_model_preparation_time": 0.0047,
358
+ "eval_num_tokens": 1825107.0,
359
+ "eval_runtime": 79.3994,
360
+ "eval_samples_per_second": 5.466,
361
+ "eval_steps_per_second": 2.733,
362
  "step": 1737
363
  }
364
  ],
365
  "logging_steps": 50,
366
+ "max_steps": 6948,
367
  "num_input_tokens_seen": 0,
368
+ "num_train_epochs": 4,
369
  "save_steps": 500,
370
  "stateful_callbacks": {
371
  "TrainerControl": {
 
379
  "attributes": {}
380
  }
381
  },
382
+ "total_flos": 2.5090142668416e+16,
383
  "train_batch_size": 2,
384
  "trial_name": null,
385
  "trial_params": null
checkpoint-1737/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:130d33149272782bd60306263c371036419926142b8999aad7806359168f8484
3
  size 6225
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8db5c304963110404ebb6947b83ba95bd9b8aad1f9b8b578cc33c46d601e13dc
3
  size 6225
checkpoint-3474/adapter_config.json CHANGED
@@ -16,7 +16,7 @@
16
  "layers_pattern": null,
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
- "lora_alpha": 16,
20
  "lora_bias": false,
21
  "lora_dropout": 0.1,
22
  "megatron_config": null,
@@ -25,12 +25,14 @@
25
  "peft_type": "LORA",
26
  "peft_version": "0.18.0",
27
  "qalora_group_size": 16,
28
- "r": 8,
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
  "q_proj",
33
- "v_proj"
 
 
34
  ],
35
  "target_parameters": null,
36
  "task_type": "CAUSAL_LM",
 
16
  "layers_pattern": null,
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
+ "lora_alpha": 32,
20
  "lora_bias": false,
21
  "lora_dropout": 0.1,
22
  "megatron_config": null,
 
25
  "peft_type": "LORA",
26
  "peft_version": "0.18.0",
27
  "qalora_group_size": 16,
28
+ "r": 24,
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
  "q_proj",
33
+ "k_proj",
34
+ "v_proj",
35
+ "o_proj"
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
checkpoint-3474/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b7979fe4ab41b842e564542d82ca738faea1a24cfcb2e3003501296353e2a240
3
- size 4374520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a64c44cbe26eb26de9c868554476ac772a1101223d4511df741d375932e915d3
3
+ size 26182176
checkpoint-3474/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:571f08123879a8157590252a0cd0abe24c345fd53c5c7a3b55bb8b256658f9c0
3
- size 8783179
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:678993601594a7d04e501306f05a8d5de7ef3edaadbed87bc8a64e6f10f97582
3
+ size 52486155
checkpoint-3474/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5f6c201154e30349ea924dac640f38cc7626e879caf89ba0aa995630585e3ea5
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:388ebf23a81b449689f35e6de23bc7bbc9587bef795c318be18b9ce6620ad7a4
3
  size 14645
checkpoint-3474/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ecacb7697ae73257f39077a0e981cf0773317c0d0186dca0c24e0700ca53ab36
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d0eb619e824095911c3281fa938e4204802f0a5951fcaf56996a5bc063db576
3
  size 1465
checkpoint-3474/trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "best_global_step": 3474,
3
- "best_metric": 6.12472677230835,
4
  "best_model_checkpoint": "./output/checkpoint-3474",
5
  "epoch": 2.0,
6
  "eval_steps": 500,
@@ -10,724 +10,724 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 3.864118957519531,
14
  "epoch": 0.028785261945883708,
15
- "grad_norm": 2.7545533180236816,
16
- "learning_rate": 9.800000000000001e-06,
17
- "loss": 15.2997,
18
- "mean_token_accuracy": 0.10086015284061432,
19
- "num_tokens": 47319.0,
20
  "step": 50
21
  },
22
  {
23
- "entropy": 4.047076859474182,
24
  "epoch": 0.057570523891767415,
25
- "grad_norm": 5.0328264236450195,
26
- "learning_rate": 1.98e-05,
27
- "loss": 15.3264,
28
- "mean_token_accuracy": 0.09582207053899765,
29
- "num_tokens": 96809.0,
30
  "step": 100
31
  },
32
  {
33
- "entropy": 4.7578076648712155,
34
  "epoch": 0.08635578583765112,
35
- "grad_norm": 38.50589370727539,
36
- "learning_rate": 1.988584740827024e-05,
37
- "loss": 13.0056,
38
- "mean_token_accuracy": 0.126854517608881,
39
- "num_tokens": 139962.0,
40
  "step": 150
41
  },
42
  {
43
- "entropy": 6.80673882484436,
44
  "epoch": 0.11514104778353483,
45
- "grad_norm": 12.030129432678223,
46
- "learning_rate": 1.97693651718113e-05,
47
- "loss": 9.2822,
48
- "mean_token_accuracy": 0.11084575355052947,
49
- "num_tokens": 188029.0,
50
  "step": 200
51
  },
52
  {
53
- "entropy": 7.177925786972046,
54
  "epoch": 0.14392630972941853,
55
- "grad_norm": 4.852536201477051,
56
- "learning_rate": 1.965288293535236e-05,
57
- "loss": 7.6333,
58
- "mean_token_accuracy": 0.12398939326405525,
59
- "num_tokens": 234425.0,
60
  "step": 250
61
  },
62
  {
63
- "entropy": 7.080496473312378,
64
  "epoch": 0.17271157167530224,
65
- "grad_norm": 4.10841178894043,
66
- "learning_rate": 1.9536400698893422e-05,
67
- "loss": 7.1632,
68
- "mean_token_accuracy": 0.13563686355948448,
69
- "num_tokens": 278885.0,
70
  "step": 300
71
  },
72
  {
73
- "entropy": 6.931579580307007,
74
  "epoch": 0.20149683362118595,
75
- "grad_norm": 14.636048316955566,
76
- "learning_rate": 1.941991846243448e-05,
77
- "loss": 6.8213,
78
- "mean_token_accuracy": 0.16459846690297128,
79
- "num_tokens": 325491.0,
80
  "step": 350
81
  },
82
  {
83
- "entropy": 6.853660764694214,
84
  "epoch": 0.23028209556706966,
85
- "grad_norm": 5.966708183288574,
86
- "learning_rate": 1.930343622597554e-05,
87
- "loss": 6.6625,
88
- "mean_token_accuracy": 0.17670693069696428,
89
- "num_tokens": 372913.0,
90
  "step": 400
91
  },
92
  {
93
- "entropy": 6.684267387390137,
94
  "epoch": 0.25906735751295334,
95
- "grad_norm": 4.031010627746582,
96
- "learning_rate": 1.91869539895166e-05,
97
- "loss": 6.4505,
98
- "mean_token_accuracy": 0.1943434515595436,
99
- "num_tokens": 419159.0,
100
  "step": 450
101
  },
102
  {
103
- "entropy": 6.679989137649536,
104
  "epoch": 0.28785261945883706,
105
- "grad_norm": 6.251070022583008,
106
- "learning_rate": 1.907047175305766e-05,
107
- "loss": 6.4314,
108
- "mean_token_accuracy": 0.19514557600021362,
109
- "num_tokens": 466994.0,
110
  "step": 500
111
  },
112
  {
113
- "entropy": 6.477229623794556,
114
  "epoch": 0.31663788140472077,
115
- "grad_norm": 3.8656675815582275,
116
- "learning_rate": 1.895398951659872e-05,
117
- "loss": 6.2139,
118
- "mean_token_accuracy": 0.21764743447303772,
119
- "num_tokens": 513308.0,
120
  "step": 550
121
  },
122
  {
123
- "entropy": 6.408129243850708,
124
  "epoch": 0.3454231433506045,
125
- "grad_norm": 8.688581466674805,
126
- "learning_rate": 1.883750728013978e-05,
127
- "loss": 6.1224,
128
- "mean_token_accuracy": 0.23438037544488907,
129
- "num_tokens": 559679.0,
130
  "step": 600
131
  },
132
  {
133
- "entropy": 6.128518767356873,
134
  "epoch": 0.3742084052964882,
135
- "grad_norm": 5.419503688812256,
136
- "learning_rate": 1.872102504368084e-05,
137
- "loss": 5.8692,
138
- "mean_token_accuracy": 0.26634690463542937,
139
- "num_tokens": 603140.0,
140
  "step": 650
141
  },
142
  {
143
- "entropy": 6.322700729370117,
144
  "epoch": 0.4029936672423719,
145
- "grad_norm": 2.2213082313537598,
146
- "learning_rate": 1.86045428072219e-05,
147
- "loss": 6.0717,
148
- "mean_token_accuracy": 0.24038562417030335,
149
- "num_tokens": 650179.0,
150
  "step": 700
151
  },
152
  {
153
- "entropy": 6.236415157318115,
154
  "epoch": 0.4317789291882556,
155
- "grad_norm": 4.804980278015137,
156
- "learning_rate": 1.848806057076296e-05,
157
- "loss": 5.9986,
158
- "mean_token_accuracy": 0.24596781462430953,
159
- "num_tokens": 696220.0,
160
  "step": 750
161
  },
162
  {
163
- "entropy": 6.269758443832398,
164
  "epoch": 0.4605641911341393,
165
- "grad_norm": 2.2888853549957275,
166
- "learning_rate": 1.837157833430402e-05,
167
- "loss": 6.0385,
168
- "mean_token_accuracy": 0.24074893474578857,
169
- "num_tokens": 743909.0,
170
  "step": 800
171
  },
172
  {
173
- "entropy": 6.270364007949829,
174
  "epoch": 0.48934945308002303,
175
- "grad_norm": 3.0903279781341553,
176
- "learning_rate": 1.825509609784508e-05,
177
- "loss": 6.0481,
178
- "mean_token_accuracy": 0.23740622967481614,
179
- "num_tokens": 792015.0,
180
  "step": 850
181
  },
182
  {
183
- "entropy": 6.3037636184692385,
184
  "epoch": 0.5181347150259067,
185
- "grad_norm": 3.969320058822632,
186
- "learning_rate": 1.813861386138614e-05,
187
- "loss": 6.0855,
188
- "mean_token_accuracy": 0.2309597587585449,
189
- "num_tokens": 841802.0,
190
  "step": 900
191
  },
192
  {
193
- "entropy": 6.038041458129883,
194
  "epoch": 0.5469199769717904,
195
- "grad_norm": 2.2712185382843018,
196
- "learning_rate": 1.80221316249272e-05,
197
- "loss": 5.8285,
198
- "mean_token_accuracy": 0.26099125802516937,
199
- "num_tokens": 886492.0,
200
  "step": 950
201
  },
202
  {
203
- "entropy": 6.142958383560181,
204
  "epoch": 0.5757052389176741,
205
- "grad_norm": 1.2311755418777466,
206
- "learning_rate": 1.790564938846826e-05,
207
- "loss": 5.9357,
208
- "mean_token_accuracy": 0.24810438305139543,
209
- "num_tokens": 932807.0,
210
  "step": 1000
211
  },
212
  {
213
- "entropy": 6.199834351539612,
214
  "epoch": 0.6044905008635578,
215
- "grad_norm": 2.2788379192352295,
216
- "learning_rate": 1.7789167152009318e-05,
217
- "loss": 5.9964,
218
- "mean_token_accuracy": 0.23942562609910964,
219
- "num_tokens": 980541.0,
220
  "step": 1050
221
  },
222
  {
223
- "entropy": 5.961639919281006,
224
  "epoch": 0.6332757628094415,
225
- "grad_norm": 1.9077532291412354,
226
- "learning_rate": 1.767268491555038e-05,
227
- "loss": 5.7664,
228
- "mean_token_accuracy": 0.26718012750148773,
229
- "num_tokens": 1023882.0,
230
  "step": 1100
231
  },
232
  {
233
- "entropy": 5.889280087947846,
234
  "epoch": 0.6620610247553252,
235
- "grad_norm": 2.4254891872406006,
236
- "learning_rate": 1.7556202679091442e-05,
237
- "loss": 5.6952,
238
- "mean_token_accuracy": 0.27529804170131683,
239
- "num_tokens": 1068300.0,
240
  "step": 1150
241
  },
242
  {
243
- "entropy": 6.085640063285828,
244
  "epoch": 0.690846286701209,
245
- "grad_norm": 2.35312557220459,
246
- "learning_rate": 1.74397204426325e-05,
247
- "loss": 5.8898,
248
- "mean_token_accuracy": 0.25166562348604204,
249
- "num_tokens": 1115425.0,
250
  "step": 1200
251
  },
252
  {
253
- "entropy": 6.146574058532715,
254
  "epoch": 0.7196315486470927,
255
- "grad_norm": 1.7730146646499634,
256
- "learning_rate": 1.732323820617356e-05,
257
- "loss": 5.9519,
258
- "mean_token_accuracy": 0.24276195973157882,
259
- "num_tokens": 1162319.0,
260
  "step": 1250
261
  },
262
  {
263
- "entropy": 6.079372715950012,
264
  "epoch": 0.7484168105929764,
265
- "grad_norm": 1.7070863246917725,
266
- "learning_rate": 1.720675596971462e-05,
267
- "loss": 5.8922,
268
- "mean_token_accuracy": 0.24961524546146394,
269
- "num_tokens": 1208230.0,
270
  "step": 1300
271
  },
272
  {
273
- "entropy": 5.9683656406402585,
274
  "epoch": 0.7772020725388601,
275
- "grad_norm": 1.8790594339370728,
276
- "learning_rate": 1.709027373325568e-05,
277
- "loss": 5.7827,
278
- "mean_token_accuracy": 0.2632122594118118,
279
- "num_tokens": 1253074.0,
280
  "step": 1350
281
  },
282
  {
283
- "entropy": 6.107076721191406,
284
  "epoch": 0.8059873344847438,
285
- "grad_norm": 1.1745644807815552,
286
- "learning_rate": 1.6973791496796742e-05,
287
- "loss": 5.9211,
288
- "mean_token_accuracy": 0.24564073830842972,
289
- "num_tokens": 1300179.0,
290
  "step": 1400
291
  },
292
  {
293
- "entropy": 6.141328382492065,
294
  "epoch": 0.8347725964306275,
295
- "grad_norm": 1.0346958637237549,
296
- "learning_rate": 1.68573092603378e-05,
297
- "loss": 5.9584,
298
- "mean_token_accuracy": 0.23997059136629104,
299
- "num_tokens": 1347539.0,
300
  "step": 1450
301
  },
302
  {
303
- "entropy": 6.070010099411011,
304
  "epoch": 0.8635578583765112,
305
- "grad_norm": 1.6541163921356201,
306
- "learning_rate": 1.674082702387886e-05,
307
- "loss": 5.889,
308
- "mean_token_accuracy": 0.24875166177749633,
309
- "num_tokens": 1394157.0,
310
  "step": 1500
311
  },
312
  {
313
- "entropy": 6.207450666427612,
314
  "epoch": 0.8923431203223949,
315
- "grad_norm": 0.9742990732192993,
316
- "learning_rate": 1.662434478741992e-05,
317
- "loss": 6.0217,
318
- "mean_token_accuracy": 0.23067249596118927,
319
- "num_tokens": 1443892.0,
320
  "step": 1550
321
  },
322
  {
323
- "entropy": 6.026197805404663,
324
  "epoch": 0.9211283822682786,
325
- "grad_norm": 1.4229531288146973,
326
- "learning_rate": 1.650786255096098e-05,
327
- "loss": 5.8455,
328
- "mean_token_accuracy": 0.2537291014194489,
329
- "num_tokens": 1491050.0,
330
  "step": 1600
331
  },
332
  {
333
- "entropy": 6.210526428222656,
334
  "epoch": 0.9499136442141624,
335
- "grad_norm": 1.3555018901824951,
336
- "learning_rate": 1.6391380314502038e-05,
337
- "loss": 6.0279,
338
- "mean_token_accuracy": 0.2308420208096504,
339
- "num_tokens": 1540809.0,
340
  "step": 1650
341
  },
342
  {
343
- "entropy": 5.9872834014892575,
344
  "epoch": 0.9786989061600461,
345
- "grad_norm": 0.9893498420715332,
346
- "learning_rate": 1.62748980780431e-05,
347
- "loss": 5.8137,
348
- "mean_token_accuracy": 0.2566875320672989,
349
- "num_tokens": 1585876.0,
350
  "step": 1700
351
  },
352
  {
353
  "epoch": 1.0,
354
- "eval_entropy": 6.322207130045386,
355
- "eval_loss": 6.15173864364624,
356
- "eval_mean_token_accuracy": 0.21116007946877985,
357
- "eval_model_preparation_time": 0.0036,
358
- "eval_num_tokens": 1619719.0,
359
- "eval_runtime": 76.1297,
360
- "eval_samples_per_second": 5.701,
361
- "eval_steps_per_second": 2.85,
362
  "step": 1737
363
  },
364
  {
365
- "entropy": 6.038531675338745,
366
  "epoch": 1.0074841681059297,
367
- "grad_norm": 0.8715208172798157,
368
- "learning_rate": 1.615841584158416e-05,
369
- "loss": 5.8628,
370
- "mean_token_accuracy": 0.2510762655735016,
371
- "num_tokens": 1632015.0,
372
  "step": 1750
373
  },
374
  {
375
- "entropy": 6.164030771255494,
376
  "epoch": 1.0362694300518134,
377
- "grad_norm": 0.7344900965690613,
378
- "learning_rate": 1.604193360512522e-05,
379
- "loss": 5.9856,
380
- "mean_token_accuracy": 0.2351543301343918,
381
- "num_tokens": 1681154.0,
382
  "step": 1800
383
  },
384
  {
385
- "entropy": 6.0731862354278565,
386
  "epoch": 1.065054691997697,
387
- "grad_norm": 1.0801328420639038,
388
- "learning_rate": 1.592545136866628e-05,
389
- "loss": 5.8976,
390
- "mean_token_accuracy": 0.24701615989208223,
391
- "num_tokens": 1728110.0,
392
  "step": 1850
393
  },
394
  {
395
- "entropy": 6.079212121963501,
396
  "epoch": 1.0938399539435808,
397
- "grad_norm": 0.7876909375190735,
398
- "learning_rate": 1.5808969132207338e-05,
399
- "loss": 5.9056,
400
- "mean_token_accuracy": 0.24457543224096298,
401
- "num_tokens": 1775703.0,
402
  "step": 1900
403
  },
404
  {
405
- "entropy": 6.062467746734619,
406
  "epoch": 1.1226252158894645,
407
- "grad_norm": 0.5999078750610352,
408
- "learning_rate": 1.56924868957484e-05,
409
- "loss": 5.8899,
410
- "mean_token_accuracy": 0.2469428673386574,
411
- "num_tokens": 1821980.0,
412
  "step": 1950
413
  },
414
  {
415
- "entropy": 6.031774473190308,
416
  "epoch": 1.1514104778353482,
417
- "grad_norm": 1.6313235759735107,
418
- "learning_rate": 1.557600465928946e-05,
419
- "loss": 5.8593,
420
- "mean_token_accuracy": 0.250918984413147,
421
- "num_tokens": 1867547.0,
422
  "step": 2000
423
  },
424
  {
425
- "entropy": 6.122789564132691,
426
  "epoch": 1.180195739781232,
427
- "grad_norm": 2.562373161315918,
428
- "learning_rate": 1.545952242283052e-05,
429
- "loss": 5.9502,
430
- "mean_token_accuracy": 0.23938885867595672,
431
- "num_tokens": 1915411.0,
432
  "step": 2050
433
  },
434
  {
435
- "entropy": 6.067130417823791,
436
  "epoch": 1.2089810017271156,
437
- "grad_norm": 0.9762872457504272,
438
- "learning_rate": 1.534304018637158e-05,
439
- "loss": 5.8956,
440
- "mean_token_accuracy": 0.2454381173849106,
441
- "num_tokens": 1964009.0,
442
  "step": 2100
443
  },
444
  {
445
- "entropy": 5.9613511180877685,
446
  "epoch": 1.2377662636729994,
447
- "grad_norm": 0.8701547384262085,
448
- "learning_rate": 1.5226557949912639e-05,
449
- "loss": 5.7907,
450
- "mean_token_accuracy": 0.25976367652416227,
451
- "num_tokens": 2008595.0,
452
  "step": 2150
453
  },
454
  {
455
- "entropy": 6.13505428314209,
456
  "epoch": 1.266551525618883,
457
- "grad_norm": 0.8511647582054138,
458
- "learning_rate": 1.51100757134537e-05,
459
- "loss": 5.9619,
460
- "mean_token_accuracy": 0.23760781466960906,
461
- "num_tokens": 2057229.0,
462
  "step": 2200
463
  },
464
  {
465
- "entropy": 6.025254983901977,
466
  "epoch": 1.2953367875647668,
467
- "grad_norm": 0.7627406120300293,
468
- "learning_rate": 1.4993593476994758e-05,
469
- "loss": 5.8546,
470
- "mean_token_accuracy": 0.2508662334084511,
471
- "num_tokens": 2103631.0,
472
  "step": 2250
473
  },
474
  {
475
- "entropy": 5.981974196434021,
476
  "epoch": 1.3241220495106505,
477
- "grad_norm": 1.6922173500061035,
478
- "learning_rate": 1.4877111240535819e-05,
479
- "loss": 5.8119,
480
- "mean_token_accuracy": 0.256170334815979,
481
- "num_tokens": 2150369.0,
482
  "step": 2300
483
  },
484
  {
485
- "entropy": 6.19903904914856,
486
  "epoch": 1.3529073114565342,
487
- "grad_norm": 0.40436601638793945,
488
- "learning_rate": 1.4760629004076878e-05,
489
- "loss": 6.0244,
490
- "mean_token_accuracy": 0.22900927513837815,
491
- "num_tokens": 2199724.0,
492
  "step": 2350
493
  },
494
  {
495
- "entropy": 5.986697297096253,
496
  "epoch": 1.381692573402418,
497
- "grad_norm": 0.8481882214546204,
498
- "learning_rate": 1.464414676761794e-05,
499
- "loss": 5.8195,
500
- "mean_token_accuracy": 0.2552035376429558,
501
- "num_tokens": 2245341.0,
502
  "step": 2400
503
  },
504
  {
505
- "entropy": 6.1886044692993165,
506
  "epoch": 1.4104778353483016,
507
- "grad_norm": 0.7911505103111267,
508
- "learning_rate": 1.4527664531159e-05,
509
- "loss": 6.0148,
510
- "mean_token_accuracy": 0.23026730984449387,
511
- "num_tokens": 2294726.0,
512
  "step": 2450
513
  },
514
  {
515
- "entropy": 5.974867792129516,
516
  "epoch": 1.4392630972941853,
517
- "grad_norm": 1.640499234199524,
518
- "learning_rate": 1.441118229470006e-05,
519
- "loss": 5.8111,
520
- "mean_token_accuracy": 0.2554209426045418,
521
- "num_tokens": 2342251.0,
522
  "step": 2500
523
  },
524
  {
525
- "entropy": 5.967635660171509,
526
  "epoch": 1.468048359240069,
527
- "grad_norm": 0.8022929430007935,
528
- "learning_rate": 1.429470005824112e-05,
529
- "loss": 5.8015,
530
- "mean_token_accuracy": 0.2569852137565613,
531
- "num_tokens": 2387469.0,
532
  "step": 2550
533
  },
534
  {
535
- "entropy": 6.047262029647827,
536
  "epoch": 1.4968336211859528,
537
- "grad_norm": 0.9270678758621216,
538
- "learning_rate": 1.417821782178218e-05,
539
- "loss": 5.8782,
540
- "mean_token_accuracy": 0.2467849862575531,
541
- "num_tokens": 2434128.0,
542
  "step": 2600
543
  },
544
  {
545
- "entropy": 6.00601068019867,
546
  "epoch": 1.5256188831318365,
547
- "grad_norm": 1.5378597974777222,
548
- "learning_rate": 1.406173558532324e-05,
549
- "loss": 5.839,
550
- "mean_token_accuracy": 0.25216978013515473,
551
- "num_tokens": 2480366.0,
552
  "step": 2650
553
  },
554
  {
555
- "entropy": 5.988714299201965,
556
  "epoch": 1.5544041450777202,
557
- "grad_norm": 0.819143533706665,
558
- "learning_rate": 1.3945253348864299e-05,
559
- "loss": 5.82,
560
- "mean_token_accuracy": 0.254311783015728,
561
- "num_tokens": 2527357.0,
562
  "step": 2700
563
  },
564
  {
565
- "entropy": 5.960293846130371,
566
  "epoch": 1.583189407023604,
567
- "grad_norm": 0.8920449614524841,
568
- "learning_rate": 1.382877111240536e-05,
569
- "loss": 5.7946,
570
- "mean_token_accuracy": 0.25750755161046984,
571
- "num_tokens": 2574470.0,
572
  "step": 2750
573
  },
574
  {
575
- "entropy": 6.1214879322052,
576
  "epoch": 1.6119746689694876,
577
- "grad_norm": 0.5333890914916992,
578
- "learning_rate": 1.371228887594642e-05,
579
- "loss": 5.9513,
580
- "mean_token_accuracy": 0.2377367687225342,
581
- "num_tokens": 2622280.0,
582
  "step": 2800
583
  },
584
  {
585
- "entropy": 5.951769871711731,
586
  "epoch": 1.6407599309153713,
587
- "grad_norm": 0.5994665026664734,
588
- "learning_rate": 1.3595806639487479e-05,
589
- "loss": 5.7861,
590
- "mean_token_accuracy": 0.25854207515716554,
591
- "num_tokens": 2668624.0,
592
  "step": 2850
593
  },
594
  {
595
- "entropy": 5.927765312194825,
596
  "epoch": 1.669545192861255,
597
- "grad_norm": 0.4460087716579437,
598
- "learning_rate": 1.347932440302854e-05,
599
- "loss": 5.7661,
600
- "mean_token_accuracy": 0.25973255425691605,
601
- "num_tokens": 2714388.0,
602
  "step": 2900
603
  },
604
  {
605
- "entropy": 6.097678365707398,
606
  "epoch": 1.6983304548071387,
607
- "grad_norm": 0.7125752568244934,
608
- "learning_rate": 1.3362842166569598e-05,
609
- "loss": 5.9284,
610
- "mean_token_accuracy": 0.23995368272066117,
611
- "num_tokens": 2761465.0,
612
  "step": 2950
613
  },
614
  {
615
- "entropy": 5.986212658882141,
616
  "epoch": 1.7271157167530224,
617
- "grad_norm": 1.5405049324035645,
618
- "learning_rate": 1.3246359930110659e-05,
619
- "loss": 5.8194,
620
- "mean_token_accuracy": 0.25333445996046067,
621
- "num_tokens": 2808066.0,
622
  "step": 3000
623
  },
624
  {
625
- "entropy": 5.7968806195259095,
626
  "epoch": 1.7559009786989062,
627
- "grad_norm": 0.4532749652862549,
628
- "learning_rate": 1.312987769365172e-05,
629
- "loss": 5.6344,
630
- "mean_token_accuracy": 0.2782411390542984,
631
- "num_tokens": 2851822.0,
632
  "step": 3050
633
  },
634
  {
635
- "entropy": 5.973708114624023,
636
  "epoch": 1.7846862406447899,
637
- "grad_norm": 1.4795438051223755,
638
- "learning_rate": 1.3013395457192778e-05,
639
- "loss": 5.8104,
640
- "mean_token_accuracy": 0.25441971331834795,
641
- "num_tokens": 2897737.0,
642
  "step": 3100
643
  },
644
  {
645
- "entropy": 5.70733567237854,
646
  "epoch": 1.8134715025906736,
647
- "grad_norm": 0.6216577887535095,
648
- "learning_rate": 1.2896913220733839e-05,
649
- "loss": 5.5523,
650
- "mean_token_accuracy": 0.28787180870771406,
651
- "num_tokens": 2939511.0,
652
  "step": 3150
653
  },
654
  {
655
- "entropy": 5.96826630115509,
656
  "epoch": 1.8422567645365573,
657
- "grad_norm": 0.9246350526809692,
658
- "learning_rate": 1.2780430984274898e-05,
659
- "loss": 5.8057,
660
- "mean_token_accuracy": 0.25464902341365814,
661
- "num_tokens": 2986368.0,
662
  "step": 3200
663
  },
664
  {
665
- "entropy": 5.950662693977356,
666
  "epoch": 1.871042026482441,
667
- "grad_norm": 0.8141199946403503,
668
- "learning_rate": 1.266394874781596e-05,
669
- "loss": 5.7886,
670
- "mean_token_accuracy": 0.25830793648958206,
671
- "num_tokens": 3031770.0,
672
  "step": 3250
673
  },
674
  {
675
- "entropy": 6.00512773513794,
676
  "epoch": 1.8998272884283247,
677
- "grad_norm": 0.4913998246192932,
678
- "learning_rate": 1.2547466511357018e-05,
679
- "loss": 5.838,
680
- "mean_token_accuracy": 0.2512077575922012,
681
- "num_tokens": 3078322.0,
682
  "step": 3300
683
  },
684
  {
685
- "entropy": 6.090880632400513,
686
  "epoch": 1.9286125503742084,
687
- "grad_norm": 0.9893012046813965,
688
- "learning_rate": 1.243098427489808e-05,
689
- "loss": 5.9264,
690
- "mean_token_accuracy": 0.2391783133149147,
691
- "num_tokens": 3125572.0,
692
  "step": 3350
693
  },
694
  {
695
- "entropy": 5.949693293571472,
696
  "epoch": 1.9573978123200921,
697
- "grad_norm": 0.5794200301170349,
698
- "learning_rate": 1.231450203843914e-05,
699
- "loss": 5.7861,
700
- "mean_token_accuracy": 0.2568664598464966,
701
- "num_tokens": 3171974.0,
702
  "step": 3400
703
  },
704
  {
705
- "entropy": 6.03591317653656,
706
  "epoch": 1.9861830742659758,
707
- "grad_norm": 0.8525373339653015,
708
- "learning_rate": 1.21980198019802e-05,
709
- "loss": 5.8741,
710
- "mean_token_accuracy": 0.24642003327608109,
711
- "num_tokens": 3219624.0,
712
  "step": 3450
713
  },
714
  {
715
  "epoch": 2.0,
716
- "eval_entropy": 6.272298685416648,
717
- "eval_loss": 6.12472677230835,
718
- "eval_mean_token_accuracy": 0.21168697409091458,
719
- "eval_model_preparation_time": 0.0036,
720
- "eval_num_tokens": 3239438.0,
721
- "eval_runtime": 76.2536,
722
- "eval_samples_per_second": 5.692,
723
- "eval_steps_per_second": 2.846,
724
  "step": 3474
725
  }
726
  ],
727
  "logging_steps": 50,
728
- "max_steps": 8685,
729
  "num_input_tokens_seen": 0,
730
- "num_train_epochs": 5,
731
  "save_steps": 500,
732
  "stateful_callbacks": {
733
  "TrainerControl": {
@@ -741,7 +741,7 @@
741
  "attributes": {}
742
  }
743
  },
744
- "total_flos": 4.529454004325376e+16,
745
  "train_batch_size": 2,
746
  "trial_name": null,
747
  "trial_params": null
 
1
  {
2
  "best_global_step": 3474,
3
+ "best_metric": 5.656307220458984,
4
  "best_model_checkpoint": "./output/checkpoint-3474",
5
  "epoch": 2.0,
6
  "eval_steps": 500,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 3.6583470726013183,
14
  "epoch": 0.028785261945883708,
15
+ "grad_norm": 3.3817152976989746,
16
+ "learning_rate": 4.9e-07,
17
+ "loss": 13.8754,
18
+ "mean_token_accuracy": 0.15036460414528846,
19
+ "num_tokens": 53093.0,
20
  "step": 50
21
  },
22
  {
23
+ "entropy": 3.669608063697815,
24
  "epoch": 0.057570523891767415,
25
+ "grad_norm": 3.2541544437408447,
26
+ "learning_rate": 9.9e-07,
27
+ "loss": 14.2282,
28
+ "mean_token_accuracy": 0.14137721598148345,
29
+ "num_tokens": 108334.0,
30
  "step": 100
31
  },
32
  {
33
+ "entropy": 3.569736371040344,
34
  "epoch": 0.08635578583765112,
35
+ "grad_norm": 3.6797454357147217,
36
+ "learning_rate": 1.49e-06,
37
+ "loss": 13.0735,
38
+ "mean_token_accuracy": 0.17473630651831626,
39
+ "num_tokens": 157491.0,
40
  "step": 150
41
  },
42
  {
43
+ "entropy": 3.7253233194351196,
44
  "epoch": 0.11514104778353483,
45
+ "grad_norm": 4.297911643981934,
46
+ "learning_rate": 1.99e-06,
47
+ "loss": 13.7392,
48
+ "mean_token_accuracy": 0.1473099772632122,
49
+ "num_tokens": 211394.0,
50
  "step": 200
51
  },
52
  {
53
+ "entropy": 3.8280500602722167,
54
  "epoch": 0.14392630972941853,
55
+ "grad_norm": 4.405268669128418,
56
+ "learning_rate": 1.9854771784232364e-06,
57
+ "loss": 13.0797,
58
+ "mean_token_accuracy": 0.16704789966344832,
59
+ "num_tokens": 263685.0,
60
  "step": 250
61
  },
62
  {
63
+ "entropy": 4.066333084106446,
64
  "epoch": 0.17271157167530224,
65
+ "grad_norm": 4.757556438446045,
66
+ "learning_rate": 1.9706579727326615e-06,
67
+ "loss": 12.6321,
68
+ "mean_token_accuracy": 0.1691790708899498,
69
+ "num_tokens": 314059.0,
70
  "step": 300
71
  },
72
  {
73
+ "entropy": 4.257266030311585,
74
  "epoch": 0.20149683362118595,
75
+ "grad_norm": 6.406249523162842,
76
+ "learning_rate": 1.955838767042086e-06,
77
+ "loss": 12.2253,
78
+ "mean_token_accuracy": 0.17223650276660918,
79
+ "num_tokens": 367038.0,
80
  "step": 350
81
  },
82
  {
83
+ "entropy": 4.694105777740479,
84
  "epoch": 0.23028209556706966,
85
+ "grad_norm": 12.57987117767334,
86
+ "learning_rate": 1.9410195613515113e-06,
87
+ "loss": 11.9714,
88
+ "mean_token_accuracy": 0.15997304677963256,
89
+ "num_tokens": 420327.0,
90
  "step": 400
91
  },
92
  {
93
+ "entropy": 5.205010280609131,
94
  "epoch": 0.25906735751295334,
95
+ "grad_norm": 15.570313453674316,
96
+ "learning_rate": 1.9262003556609364e-06,
97
+ "loss": 10.8173,
98
+ "mean_token_accuracy": 0.16447648257017136,
99
+ "num_tokens": 472429.0,
100
  "step": 450
101
  },
102
  {
103
+ "entropy": 5.917805089950561,
104
  "epoch": 0.28785261945883706,
105
+ "grad_norm": 23.61503791809082,
106
+ "learning_rate": 1.9113811499703615e-06,
107
+ "loss": 9.3196,
108
+ "mean_token_accuracy": 0.16179455041885377,
109
+ "num_tokens": 526315.0,
110
  "step": 500
111
  },
112
  {
113
+ "entropy": 6.380368332862854,
114
  "epoch": 0.31663788140472077,
115
+ "grad_norm": 13.846810340881348,
116
+ "learning_rate": 1.8965619442797864e-06,
117
+ "loss": 7.9636,
118
+ "mean_token_accuracy": 0.16881170988082886,
119
+ "num_tokens": 578511.0,
120
  "step": 550
121
  },
122
  {
123
+ "entropy": 6.507339992523193,
124
  "epoch": 0.3454231433506045,
125
+ "grad_norm": 4.569090366363525,
126
+ "learning_rate": 1.8817427385892115e-06,
127
+ "loss": 7.4171,
128
+ "mean_token_accuracy": 0.16941152423620223,
129
+ "num_tokens": 630937.0,
130
  "step": 600
131
  },
132
  {
133
+ "entropy": 6.392864561080932,
134
  "epoch": 0.3742084052964882,
135
+ "grad_norm": 4.594696521759033,
136
+ "learning_rate": 1.8669235328986366e-06,
137
+ "loss": 6.9389,
138
+ "mean_token_accuracy": 0.1844496901333332,
139
+ "num_tokens": 680501.0,
140
  "step": 650
141
  },
142
  {
143
+ "entropy": 6.6726202869415285,
144
  "epoch": 0.4029936672423719,
145
+ "grad_norm": 4.768734931945801,
146
+ "learning_rate": 1.8521043272080617e-06,
147
+ "loss": 6.9818,
148
+ "mean_token_accuracy": 0.16990411713719367,
149
+ "num_tokens": 733231.0,
150
  "step": 700
151
  },
152
  {
153
+ "entropy": 6.592793455123902,
154
  "epoch": 0.4317789291882556,
155
+ "grad_norm": 3.253056764602661,
156
+ "learning_rate": 1.8372851215174864e-06,
157
+ "loss": 6.7105,
158
+ "mean_token_accuracy": 0.18250102579593658,
159
+ "num_tokens": 785373.0,
160
  "step": 750
161
  },
162
  {
163
+ "entropy": 6.683582029342651,
164
  "epoch": 0.4605641911341393,
165
+ "grad_norm": 2.1871063709259033,
166
+ "learning_rate": 1.8224659158269115e-06,
167
+ "loss": 6.6685,
168
+ "mean_token_accuracy": 0.17129646152257919,
169
+ "num_tokens": 838646.0,
170
  "step": 800
171
  },
172
  {
173
+ "entropy": 6.636875295639038,
174
  "epoch": 0.48934945308002303,
175
+ "grad_norm": 3.2284677028656006,
176
+ "learning_rate": 1.8076467101363366e-06,
177
+ "loss": 6.53,
178
+ "mean_token_accuracy": 0.18053789794445038,
179
+ "num_tokens": 892380.0,
180
  "step": 850
181
  },
182
  {
183
+ "entropy": 6.610673260688782,
184
  "epoch": 0.5181347150259067,
185
+ "grad_norm": 2.2088730335235596,
186
+ "learning_rate": 1.7928275044457617e-06,
187
+ "loss": 6.4429,
188
+ "mean_token_accuracy": 0.18492739230394364,
189
+ "num_tokens": 947971.0,
190
  "step": 900
191
  },
192
  {
193
+ "entropy": 6.242899022102356,
194
  "epoch": 0.5469199769717904,
195
+ "grad_norm": 2.3000030517578125,
196
+ "learning_rate": 1.7780082987551866e-06,
197
+ "loss": 6.047,
198
+ "mean_token_accuracy": 0.2291259828209877,
199
+ "num_tokens": 998810.0,
200
  "step": 950
201
  },
202
  {
203
+ "entropy": 6.311488924026489,
204
  "epoch": 0.5757052389176741,
205
+ "grad_norm": 2.1333675384521484,
206
+ "learning_rate": 1.7631890930646115e-06,
207
+ "loss": 6.0919,
208
+ "mean_token_accuracy": 0.22644571751356124,
209
+ "num_tokens": 1050860.0,
210
  "step": 1000
211
  },
212
  {
213
+ "entropy": 6.3254336166381835,
214
  "epoch": 0.6044905008635578,
215
+ "grad_norm": 2.0400779247283936,
216
+ "learning_rate": 1.7483698873740366e-06,
217
+ "loss": 6.094,
218
+ "mean_token_accuracy": 0.2222653564810753,
219
+ "num_tokens": 1104304.0,
220
  "step": 1050
221
  },
222
  {
223
+ "entropy": 6.046922063827514,
224
  "epoch": 0.6332757628094415,
225
+ "grad_norm": 2.8049051761627197,
226
+ "learning_rate": 1.7335506816834617e-06,
227
+ "loss": 5.8011,
228
+ "mean_token_accuracy": 0.25127078920602797,
229
+ "num_tokens": 1153605.0,
230
  "step": 1100
231
  },
232
  {
233
+ "entropy": 5.943600912094116,
234
  "epoch": 0.6620610247553252,
235
+ "grad_norm": 4.063963890075684,
236
+ "learning_rate": 1.7187314759928866e-06,
237
+ "loss": 5.6855,
238
+ "mean_token_accuracy": 0.26265266716480257,
239
+ "num_tokens": 1204328.0,
240
  "step": 1150
241
  },
242
  {
243
+ "entropy": 6.12883231639862,
244
  "epoch": 0.690846286701209,
245
+ "grad_norm": 3.9440460205078125,
246
+ "learning_rate": 1.7039122703023117e-06,
247
+ "loss": 5.8578,
248
+ "mean_token_accuracy": 0.24439335912466048,
249
+ "num_tokens": 1257415.0,
250
  "step": 1200
251
  },
252
  {
253
+ "entropy": 6.164987115859986,
254
  "epoch": 0.7196315486470927,
255
+ "grad_norm": 3.20070481300354,
256
+ "learning_rate": 1.6890930646117368e-06,
257
+ "loss": 5.8876,
258
+ "mean_token_accuracy": 0.24275501281023026,
259
+ "num_tokens": 1310049.0,
260
  "step": 1250
261
  },
262
  {
263
+ "entropy": 6.080997190475464,
264
  "epoch": 0.7484168105929764,
265
+ "grad_norm": 2.8067362308502197,
266
+ "learning_rate": 1.6742738589211617e-06,
267
+ "loss": 5.8058,
268
+ "mean_token_accuracy": 0.25242207854986193,
269
+ "num_tokens": 1361794.0,
270
  "step": 1300
271
  },
272
  {
273
+ "entropy": 5.940848155021667,
274
  "epoch": 0.7772020725388601,
275
+ "grad_norm": 2.6375925540924072,
276
+ "learning_rate": 1.6594546532305868e-06,
277
+ "loss": 5.6718,
278
+ "mean_token_accuracy": 0.2665082859992981,
279
+ "num_tokens": 1412773.0,
280
  "step": 1350
281
  },
282
  {
283
+ "entropy": 6.071129274368286,
284
  "epoch": 0.8059873344847438,
285
+ "grad_norm": 3.951350212097168,
286
+ "learning_rate": 1.6446354475400117e-06,
287
+ "loss": 5.8012,
288
+ "mean_token_accuracy": 0.25434976994991304,
289
+ "num_tokens": 1465620.0,
290
  "step": 1400
291
  },
292
  {
293
+ "entropy": 6.069429359436035,
294
  "epoch": 0.8347725964306275,
295
+ "grad_norm": 3.580608606338501,
296
+ "learning_rate": 1.6298162418494368e-06,
297
+ "loss": 5.8027,
298
+ "mean_token_accuracy": 0.25208072274923327,
299
+ "num_tokens": 1518899.0,
300
  "step": 1450
301
  },
302
  {
303
+ "entropy": 6.005315380096436,
304
  "epoch": 0.8635578583765112,
305
+ "grad_norm": 3.9580376148223877,
306
+ "learning_rate": 1.614997036158862e-06,
307
+ "loss": 5.7364,
308
+ "mean_token_accuracy": 0.25940640360116957,
309
+ "num_tokens": 1571304.0,
310
  "step": 1500
311
  },
312
  {
313
+ "entropy": 6.0786464881896975,
314
  "epoch": 0.8923431203223949,
315
+ "grad_norm": 4.55721378326416,
316
+ "learning_rate": 1.6001778304682868e-06,
317
+ "loss": 5.8092,
318
+ "mean_token_accuracy": 0.2496869170665741,
319
+ "num_tokens": 1627369.0,
320
  "step": 1550
321
  },
322
  {
323
+ "entropy": 5.939382014274597,
324
  "epoch": 0.9211283822682786,
325
+ "grad_norm": 2.330057144165039,
326
+ "learning_rate": 1.5853586247777117e-06,
327
+ "loss": 5.6604,
328
+ "mean_token_accuracy": 0.2686630353331566,
329
+ "num_tokens": 1680401.0,
330
  "step": 1600
331
  },
332
  {
333
+ "entropy": 6.121775646209716,
334
  "epoch": 0.9499136442141624,
335
+ "grad_norm": 2.9881200790405273,
336
+ "learning_rate": 1.5705394190871368e-06,
337
+ "loss": 5.8388,
338
+ "mean_token_accuracy": 0.2503683388233185,
339
+ "num_tokens": 1735745.0,
340
  "step": 1650
341
  },
342
  {
343
+ "entropy": 5.840040788650513,
344
  "epoch": 0.9786989061600461,
345
+ "grad_norm": 3.798994779586792,
346
+ "learning_rate": 1.555720213396562e-06,
347
+ "loss": 5.5635,
348
+ "mean_token_accuracy": 0.278279125392437,
349
+ "num_tokens": 1786896.0,
350
  "step": 1700
351
  },
352
  {
353
  "epoch": 1.0,
354
+ "eval_entropy": 6.139133475343203,
355
+ "eval_loss": 5.861395835876465,
356
+ "eval_mean_token_accuracy": 0.2402858340657801,
357
+ "eval_model_preparation_time": 0.0047,
358
+ "eval_num_tokens": 1825107.0,
359
+ "eval_runtime": 79.3994,
360
+ "eval_samples_per_second": 5.466,
361
+ "eval_steps_per_second": 2.733,
362
  "step": 1737
363
  },
364
  {
365
+ "entropy": 5.8970259666442875,
366
  "epoch": 1.0074841681059297,
367
+ "grad_norm": 2.6411802768707275,
368
+ "learning_rate": 1.540901007705987e-06,
369
+ "loss": 5.614,
370
+ "mean_token_accuracy": 0.273006406724453,
371
+ "num_tokens": 1838864.0,
372
  "step": 1750
373
  },
374
  {
375
+ "entropy": 6.0111794090271,
376
  "epoch": 1.0362694300518134,
377
+ "grad_norm": 3.6491827964782715,
378
+ "learning_rate": 1.526081802015412e-06,
379
+ "loss": 5.7323,
380
+ "mean_token_accuracy": 0.26104256987571717,
381
+ "num_tokens": 1893816.0,
382
  "step": 1800
383
  },
384
  {
385
+ "entropy": 5.902219276428223,
386
  "epoch": 1.065054691997697,
387
+ "grad_norm": 2.593249559402466,
388
+ "learning_rate": 1.5112625963248368e-06,
389
+ "loss": 5.6187,
390
+ "mean_token_accuracy": 0.2746362566947937,
391
+ "num_tokens": 1946532.0,
392
  "step": 1850
393
  },
394
  {
395
+ "entropy": 5.874705944061279,
396
  "epoch": 1.0938399539435808,
397
+ "grad_norm": 2.554327964782715,
398
+ "learning_rate": 1.496443390634262e-06,
399
+ "loss": 5.6021,
400
+ "mean_token_accuracy": 0.2795292744040489,
401
+ "num_tokens": 2000184.0,
402
  "step": 1900
403
  },
404
  {
405
+ "entropy": 5.850096368789673,
406
  "epoch": 1.1226252158894645,
407
+ "grad_norm": 3.6060993671417236,
408
+ "learning_rate": 1.481624184943687e-06,
409
+ "loss": 5.576,
410
+ "mean_token_accuracy": 0.28532547056674956,
411
+ "num_tokens": 2052250.0,
412
  "step": 1950
413
  },
414
  {
415
+ "entropy": 5.802229671478272,
416
  "epoch": 1.1514104778353482,
417
+ "grad_norm": 3.0913314819335938,
418
+ "learning_rate": 1.466804979253112e-06,
419
+ "loss": 5.53,
420
+ "mean_token_accuracy": 0.2916027933359146,
421
+ "num_tokens": 2103531.0,
422
  "step": 2000
423
  },
424
  {
425
+ "entropy": 5.875646467208862,
426
  "epoch": 1.180195739781232,
427
+ "grad_norm": 4.777045726776123,
428
+ "learning_rate": 1.451985773562537e-06,
429
+ "loss": 5.6146,
430
+ "mean_token_accuracy": 0.28063644528388976,
431
+ "num_tokens": 2157098.0,
432
  "step": 2050
433
  },
434
  {
435
+ "entropy": 5.786596937179565,
436
  "epoch": 1.2089810017271156,
437
+ "grad_norm": 4.207762718200684,
438
+ "learning_rate": 1.437166567871962e-06,
439
+ "loss": 5.5417,
440
+ "mean_token_accuracy": 0.2870470091700554,
441
+ "num_tokens": 2211827.0,
442
  "step": 2100
443
  },
444
  {
445
+ "entropy": 5.672234449386597,
446
  "epoch": 1.2377662636729994,
447
+ "grad_norm": 2.2771811485290527,
448
+ "learning_rate": 1.422347362181387e-06,
449
+ "loss": 5.4285,
450
+ "mean_token_accuracy": 0.30194485366344453,
451
+ "num_tokens": 2262174.0,
452
  "step": 2150
453
  },
454
  {
455
+ "entropy": 5.862573285102844,
456
  "epoch": 1.266551525618883,
457
+ "grad_norm": 3.3273422718048096,
458
+ "learning_rate": 1.4075281564908121e-06,
459
+ "loss": 5.6169,
460
+ "mean_token_accuracy": 0.278145115673542,
461
+ "num_tokens": 2316440.0,
462
  "step": 2200
463
  },
464
  {
465
+ "entropy": 5.734760231971741,
466
  "epoch": 1.2953367875647668,
467
+ "grad_norm": 3.7049715518951416,
468
+ "learning_rate": 1.392708950800237e-06,
469
+ "loss": 5.493,
470
+ "mean_token_accuracy": 0.2941485676169395,
471
+ "num_tokens": 2368468.0,
472
  "step": 2250
473
  },
474
  {
475
+ "entropy": 5.665819988250733,
476
  "epoch": 1.3241220495106505,
477
+ "grad_norm": 3.572636604309082,
478
+ "learning_rate": 1.3778897451096621e-06,
479
+ "loss": 5.4352,
480
+ "mean_token_accuracy": 0.3003745040297508,
481
+ "num_tokens": 2421180.0,
482
  "step": 2300
483
  },
484
  {
485
+ "entropy": 5.890115032196045,
486
  "epoch": 1.3529073114565342,
487
+ "grad_norm": 2.738203525543213,
488
+ "learning_rate": 1.3630705394190872e-06,
489
+ "loss": 5.6555,
490
+ "mean_token_accuracy": 0.2737997192144394,
491
+ "num_tokens": 2476255.0,
492
  "step": 2350
493
  },
494
  {
495
+ "entropy": 5.66056040763855,
496
  "epoch": 1.381692573402418,
497
+ "grad_norm": 3.1416995525360107,
498
+ "learning_rate": 1.3482513337285121e-06,
499
+ "loss": 5.4302,
500
+ "mean_token_accuracy": 0.3000989046692848,
501
+ "num_tokens": 2527674.0,
502
  "step": 2400
503
  },
504
  {
505
+ "entropy": 5.861240615844727,
506
  "epoch": 1.4104778353483016,
507
+ "grad_norm": 2.7569284439086914,
508
+ "learning_rate": 1.333432128037937e-06,
509
+ "loss": 5.6304,
510
+ "mean_token_accuracy": 0.27707513481378554,
511
+ "num_tokens": 2582909.0,
512
  "step": 2450
513
  },
514
  {
515
+ "entropy": 5.627686910629272,
516
  "epoch": 1.4392630972941853,
517
+ "grad_norm": 1.7750262022018433,
518
+ "learning_rate": 1.3186129223473621e-06,
519
+ "loss": 5.4058,
520
+ "mean_token_accuracy": 0.3019809901714325,
521
+ "num_tokens": 2636579.0,
522
  "step": 2500
523
  },
524
  {
525
+ "entropy": 5.607026796340943,
526
  "epoch": 1.468048359240069,
527
+ "grad_norm": 3.1005160808563232,
528
+ "learning_rate": 1.3037937166567872e-06,
529
+ "loss": 5.3836,
530
+ "mean_token_accuracy": 0.30584611505270004,
531
+ "num_tokens": 2687698.0,
532
  "step": 2550
533
  },
534
  {
535
+ "entropy": 5.6909641885757445,
536
  "epoch": 1.4968336211859528,
537
+ "grad_norm": 1.6848654747009277,
538
+ "learning_rate": 1.2889745109662123e-06,
539
+ "loss": 5.4653,
540
+ "mean_token_accuracy": 0.296178964972496,
541
+ "num_tokens": 2740214.0,
542
  "step": 2600
543
  },
544
  {
545
+ "entropy": 5.619450302124023,
546
  "epoch": 1.5256188831318365,
547
+ "grad_norm": 2.469539165496826,
548
+ "learning_rate": 1.274155305275637e-06,
549
+ "loss": 5.4022,
550
+ "mean_token_accuracy": 0.3039679077267647,
551
+ "num_tokens": 2792574.0,
552
  "step": 2650
553
  },
554
  {
555
+ "entropy": 5.61073097705841,
556
  "epoch": 1.5544041450777202,
557
+ "grad_norm": 2.367810010910034,
558
+ "learning_rate": 1.259336099585062e-06,
559
+ "loss": 5.3956,
560
+ "mean_token_accuracy": 0.3051413372159004,
561
+ "num_tokens": 2845597.0,
562
  "step": 2700
563
  },
564
  {
565
+ "entropy": 5.5791136837005615,
566
  "epoch": 1.583189407023604,
567
+ "grad_norm": 2.3874764442443848,
568
+ "learning_rate": 1.2445168938944872e-06,
569
+ "loss": 5.3676,
570
+ "mean_token_accuracy": 0.3068238252401352,
571
+ "num_tokens": 2898683.0,
572
  "step": 2750
573
  },
574
  {
575
+ "entropy": 5.735381307601929,
576
  "epoch": 1.6119746689694876,
577
+ "grad_norm": 2.2097349166870117,
578
+ "learning_rate": 1.2296976882039123e-06,
579
+ "loss": 5.5239,
580
+ "mean_token_accuracy": 0.28974882304668426,
581
+ "num_tokens": 2952290.0,
582
  "step": 2800
583
  },
584
  {
585
+ "entropy": 5.55252691745758,
586
  "epoch": 1.6407599309153713,
587
+ "grad_norm": 1.694831132888794,
588
+ "learning_rate": 1.2148784825133372e-06,
589
+ "loss": 5.351,
590
+ "mean_token_accuracy": 0.3091904193162918,
591
+ "num_tokens": 3004556.0,
592
  "step": 2850
593
  },
594
  {
595
+ "entropy": 5.508773093223572,
596
  "epoch": 1.669545192861255,
597
+ "grad_norm": 1.8229279518127441,
598
+ "learning_rate": 1.200059276822762e-06,
599
+ "loss": 5.3164,
600
+ "mean_token_accuracy": 0.31158645361661913,
601
+ "num_tokens": 3056448.0,
602
  "step": 2900
603
  },
604
  {
605
+ "entropy": 5.676794271469117,
606
  "epoch": 1.6983304548071387,
607
+ "grad_norm": 1.7196234464645386,
608
+ "learning_rate": 1.1852400711321872e-06,
609
+ "loss": 5.4776,
610
+ "mean_token_accuracy": 0.2929128894209862,
611
+ "num_tokens": 3109539.0,
612
  "step": 2950
613
  },
614
  {
615
+ "entropy": 5.551529383659362,
616
  "epoch": 1.7271157167530224,
617
+ "grad_norm": 3.117525577545166,
618
+ "learning_rate": 1.1704208654416123e-06,
619
+ "loss": 5.3561,
620
+ "mean_token_accuracy": 0.30634030640125276,
621
+ "num_tokens": 3162421.0,
622
  "step": 3000
623
  },
624
  {
625
+ "entropy": 5.379635264873505,
626
  "epoch": 1.7559009786989062,
627
+ "grad_norm": 1.876755714416504,
628
+ "learning_rate": 1.1556016597510372e-06,
629
+ "loss": 5.1868,
630
+ "mean_token_accuracy": 0.32913618892431257,
631
+ "num_tokens": 3212079.0,
632
  "step": 3050
633
  },
634
  {
635
+ "entropy": 5.538804936408996,
636
  "epoch": 1.7846862406447899,
637
+ "grad_norm": 1.8670976161956787,
638
+ "learning_rate": 1.1407824540604623e-06,
639
+ "loss": 5.3494,
640
+ "mean_token_accuracy": 0.30661171555519107,
641
+ "num_tokens": 3264089.0,
642
  "step": 3100
643
  },
644
  {
645
+ "entropy": 5.258263626098633,
646
  "epoch": 1.8134715025906736,
647
+ "grad_norm": 2.748718023300171,
648
+ "learning_rate": 1.1259632483698874e-06,
649
+ "loss": 5.08,
650
+ "mean_token_accuracy": 0.3413010013103485,
651
+ "num_tokens": 3311881.0,
652
  "step": 3150
653
  },
654
  {
655
+ "entropy": 5.54539008140564,
656
  "epoch": 1.8422567645365573,
657
+ "grad_norm": 1.8556406497955322,
658
+ "learning_rate": 1.1111440426793123e-06,
659
+ "loss": 5.3614,
660
+ "mean_token_accuracy": 0.30550685405731204,
661
+ "num_tokens": 3364861.0,
662
  "step": 3200
663
  },
664
  {
665
+ "entropy": 5.5433073282241825,
666
  "epoch": 1.871042026482441,
667
+ "grad_norm": 1.8386749029159546,
668
+ "learning_rate": 1.0963248369887374e-06,
669
+ "loss": 5.3543,
670
+ "mean_token_accuracy": 0.30875524014234546,
671
+ "num_tokens": 3415911.0,
672
  "step": 3250
673
  },
674
  {
675
+ "entropy": 5.5769769477844235,
676
  "epoch": 1.8998272884283247,
677
+ "grad_norm": 1.922486662864685,
678
+ "learning_rate": 1.0815056312981623e-06,
679
+ "loss": 5.3834,
680
+ "mean_token_accuracy": 0.3035113242268562,
681
+ "num_tokens": 3468338.0,
682
  "step": 3300
683
  },
684
  {
685
+ "entropy": 5.640013842582703,
686
  "epoch": 1.9286125503742084,
687
+ "grad_norm": 2.179500102996826,
688
+ "learning_rate": 1.0666864256075874e-06,
689
+ "loss": 5.4574,
690
+ "mean_token_accuracy": 0.2947095710039139,
691
+ "num_tokens": 3521693.0,
692
  "step": 3350
693
  },
694
  {
695
+ "entropy": 5.506910061836242,
696
  "epoch": 1.9573978123200921,
697
+ "grad_norm": 1.4014379978179932,
698
+ "learning_rate": 1.0518672199170125e-06,
699
+ "loss": 5.3234,
700
+ "mean_token_accuracy": 0.3096472260355949,
701
+ "num_tokens": 3574206.0,
702
  "step": 3400
703
  },
704
  {
705
+ "entropy": 5.607311015129089,
706
  "epoch": 1.9861830742659758,
707
+ "grad_norm": 1.41231107711792,
708
+ "learning_rate": 1.0370480142264374e-06,
709
+ "loss": 5.4226,
710
+ "mean_token_accuracy": 0.2979922544956207,
711
+ "num_tokens": 3627807.0,
712
  "step": 3450
713
  },
714
  {
715
  "epoch": 2.0,
716
+ "eval_entropy": 5.831721861790951,
717
+ "eval_loss": 5.656307220458984,
718
+ "eval_mean_token_accuracy": 0.2641724460685308,
719
+ "eval_model_preparation_time": 0.0047,
720
+ "eval_num_tokens": 3650214.0,
721
+ "eval_runtime": 79.7324,
722
+ "eval_samples_per_second": 5.443,
723
+ "eval_steps_per_second": 2.722,
724
  "step": 3474
725
  }
726
  ],
727
  "logging_steps": 50,
728
+ "max_steps": 6948,
729
  "num_input_tokens_seen": 0,
730
+ "num_train_epochs": 4,
731
  "save_steps": 500,
732
  "stateful_callbacks": {
733
  "TrainerControl": {
 
741
  "attributes": {}
742
  }
743
  },
744
+ "total_flos": 5.014260864635904e+16,
745
  "train_batch_size": 2,
746
  "trial_name": null,
747
  "trial_params": null
checkpoint-3474/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:130d33149272782bd60306263c371036419926142b8999aad7806359168f8484
3
  size 6225
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8db5c304963110404ebb6947b83ba95bd9b8aad1f9b8b578cc33c46d601e13dc
3
  size 6225
checkpoint-5211/adapter_config.json CHANGED
@@ -16,7 +16,7 @@
16
  "layers_pattern": null,
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
- "lora_alpha": 16,
20
  "lora_bias": false,
21
  "lora_dropout": 0.1,
22
  "megatron_config": null,
@@ -25,12 +25,14 @@
25
  "peft_type": "LORA",
26
  "peft_version": "0.18.0",
27
  "qalora_group_size": 16,
28
- "r": 8,
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
  "q_proj",
33
- "v_proj"
 
 
34
  ],
35
  "target_parameters": null,
36
  "task_type": "CAUSAL_LM",
 
16
  "layers_pattern": null,
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
+ "lora_alpha": 32,
20
  "lora_bias": false,
21
  "lora_dropout": 0.1,
22
  "megatron_config": null,
 
25
  "peft_type": "LORA",
26
  "peft_version": "0.18.0",
27
  "qalora_group_size": 16,
28
+ "r": 24,
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
  "q_proj",
33
+ "k_proj",
34
+ "v_proj",
35
+ "o_proj"
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
checkpoint-5211/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0e6a7b22d63fd8741b839353cbaab150c0bd5f07d663ad8884bd3b4af58a9cce
3
- size 4374520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96bed2a64089d15ba0d03e873c6ba43e222e9615622cb08853696f1bb3f72ed3
3
+ size 26182176
checkpoint-5211/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d7235486f7f068a0b9991bde7ca0b6a16106923b1cca53549a5bb621f15d218
3
- size 8783179
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc97f69c9bd94b7be821d35593073dc08cf44ccce0203ce520c9a25dfcbc93d7
3
+ size 52486155
checkpoint-5211/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:43cbafcbad7a00736ad4867a9fc18293a08b0b3d13acacb84d30cd8449539e81
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4bfc3867136ea1392d43912e26c993ff7e9d2c829e3cc938d41df7399c31116c
3
  size 14645
checkpoint-5211/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c82e157712778db9a1270de44d6dd5d35b469dbf5b63767059cabfb507d50c8a
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a76316bf35b4ab1e089615992ceff4951bb9d24d95bfa6731e79f937bd9a30c
3
  size 1465
checkpoint-5211/trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "best_global_step": 5211,
3
- "best_metric": 6.0980024337768555,
4
  "best_model_checkpoint": "./output/checkpoint-5211",
5
  "epoch": 3.0,
6
  "eval_steps": 500,
@@ -10,1086 +10,1086 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 3.864118957519531,
14
  "epoch": 0.028785261945883708,
15
- "grad_norm": 2.7545533180236816,
16
- "learning_rate": 9.800000000000001e-06,
17
- "loss": 15.2997,
18
- "mean_token_accuracy": 0.10086015284061432,
19
- "num_tokens": 47319.0,
20
  "step": 50
21
  },
22
  {
23
- "entropy": 4.047076859474182,
24
  "epoch": 0.057570523891767415,
25
- "grad_norm": 5.0328264236450195,
26
- "learning_rate": 1.98e-05,
27
- "loss": 15.3264,
28
- "mean_token_accuracy": 0.09582207053899765,
29
- "num_tokens": 96809.0,
30
  "step": 100
31
  },
32
  {
33
- "entropy": 4.7578076648712155,
34
  "epoch": 0.08635578583765112,
35
- "grad_norm": 38.50589370727539,
36
- "learning_rate": 1.988584740827024e-05,
37
- "loss": 13.0056,
38
- "mean_token_accuracy": 0.126854517608881,
39
- "num_tokens": 139962.0,
40
  "step": 150
41
  },
42
  {
43
- "entropy": 6.80673882484436,
44
  "epoch": 0.11514104778353483,
45
- "grad_norm": 12.030129432678223,
46
- "learning_rate": 1.97693651718113e-05,
47
- "loss": 9.2822,
48
- "mean_token_accuracy": 0.11084575355052947,
49
- "num_tokens": 188029.0,
50
  "step": 200
51
  },
52
  {
53
- "entropy": 7.177925786972046,
54
  "epoch": 0.14392630972941853,
55
- "grad_norm": 4.852536201477051,
56
- "learning_rate": 1.965288293535236e-05,
57
- "loss": 7.6333,
58
- "mean_token_accuracy": 0.12398939326405525,
59
- "num_tokens": 234425.0,
60
  "step": 250
61
  },
62
  {
63
- "entropy": 7.080496473312378,
64
  "epoch": 0.17271157167530224,
65
- "grad_norm": 4.10841178894043,
66
- "learning_rate": 1.9536400698893422e-05,
67
- "loss": 7.1632,
68
- "mean_token_accuracy": 0.13563686355948448,
69
- "num_tokens": 278885.0,
70
  "step": 300
71
  },
72
  {
73
- "entropy": 6.931579580307007,
74
  "epoch": 0.20149683362118595,
75
- "grad_norm": 14.636048316955566,
76
- "learning_rate": 1.941991846243448e-05,
77
- "loss": 6.8213,
78
- "mean_token_accuracy": 0.16459846690297128,
79
- "num_tokens": 325491.0,
80
  "step": 350
81
  },
82
  {
83
- "entropy": 6.853660764694214,
84
  "epoch": 0.23028209556706966,
85
- "grad_norm": 5.966708183288574,
86
- "learning_rate": 1.930343622597554e-05,
87
- "loss": 6.6625,
88
- "mean_token_accuracy": 0.17670693069696428,
89
- "num_tokens": 372913.0,
90
  "step": 400
91
  },
92
  {
93
- "entropy": 6.684267387390137,
94
  "epoch": 0.25906735751295334,
95
- "grad_norm": 4.031010627746582,
96
- "learning_rate": 1.91869539895166e-05,
97
- "loss": 6.4505,
98
- "mean_token_accuracy": 0.1943434515595436,
99
- "num_tokens": 419159.0,
100
  "step": 450
101
  },
102
  {
103
- "entropy": 6.679989137649536,
104
  "epoch": 0.28785261945883706,
105
- "grad_norm": 6.251070022583008,
106
- "learning_rate": 1.907047175305766e-05,
107
- "loss": 6.4314,
108
- "mean_token_accuracy": 0.19514557600021362,
109
- "num_tokens": 466994.0,
110
  "step": 500
111
  },
112
  {
113
- "entropy": 6.477229623794556,
114
  "epoch": 0.31663788140472077,
115
- "grad_norm": 3.8656675815582275,
116
- "learning_rate": 1.895398951659872e-05,
117
- "loss": 6.2139,
118
- "mean_token_accuracy": 0.21764743447303772,
119
- "num_tokens": 513308.0,
120
  "step": 550
121
  },
122
  {
123
- "entropy": 6.408129243850708,
124
  "epoch": 0.3454231433506045,
125
- "grad_norm": 8.688581466674805,
126
- "learning_rate": 1.883750728013978e-05,
127
- "loss": 6.1224,
128
- "mean_token_accuracy": 0.23438037544488907,
129
- "num_tokens": 559679.0,
130
  "step": 600
131
  },
132
  {
133
- "entropy": 6.128518767356873,
134
  "epoch": 0.3742084052964882,
135
- "grad_norm": 5.419503688812256,
136
- "learning_rate": 1.872102504368084e-05,
137
- "loss": 5.8692,
138
- "mean_token_accuracy": 0.26634690463542937,
139
- "num_tokens": 603140.0,
140
  "step": 650
141
  },
142
  {
143
- "entropy": 6.322700729370117,
144
  "epoch": 0.4029936672423719,
145
- "grad_norm": 2.2213082313537598,
146
- "learning_rate": 1.86045428072219e-05,
147
- "loss": 6.0717,
148
- "mean_token_accuracy": 0.24038562417030335,
149
- "num_tokens": 650179.0,
150
  "step": 700
151
  },
152
  {
153
- "entropy": 6.236415157318115,
154
  "epoch": 0.4317789291882556,
155
- "grad_norm": 4.804980278015137,
156
- "learning_rate": 1.848806057076296e-05,
157
- "loss": 5.9986,
158
- "mean_token_accuracy": 0.24596781462430953,
159
- "num_tokens": 696220.0,
160
  "step": 750
161
  },
162
  {
163
- "entropy": 6.269758443832398,
164
  "epoch": 0.4605641911341393,
165
- "grad_norm": 2.2888853549957275,
166
- "learning_rate": 1.837157833430402e-05,
167
- "loss": 6.0385,
168
- "mean_token_accuracy": 0.24074893474578857,
169
- "num_tokens": 743909.0,
170
  "step": 800
171
  },
172
  {
173
- "entropy": 6.270364007949829,
174
  "epoch": 0.48934945308002303,
175
- "grad_norm": 3.0903279781341553,
176
- "learning_rate": 1.825509609784508e-05,
177
- "loss": 6.0481,
178
- "mean_token_accuracy": 0.23740622967481614,
179
- "num_tokens": 792015.0,
180
  "step": 850
181
  },
182
  {
183
- "entropy": 6.3037636184692385,
184
  "epoch": 0.5181347150259067,
185
- "grad_norm": 3.969320058822632,
186
- "learning_rate": 1.813861386138614e-05,
187
- "loss": 6.0855,
188
- "mean_token_accuracy": 0.2309597587585449,
189
- "num_tokens": 841802.0,
190
  "step": 900
191
  },
192
  {
193
- "entropy": 6.038041458129883,
194
  "epoch": 0.5469199769717904,
195
- "grad_norm": 2.2712185382843018,
196
- "learning_rate": 1.80221316249272e-05,
197
- "loss": 5.8285,
198
- "mean_token_accuracy": 0.26099125802516937,
199
- "num_tokens": 886492.0,
200
  "step": 950
201
  },
202
  {
203
- "entropy": 6.142958383560181,
204
  "epoch": 0.5757052389176741,
205
- "grad_norm": 1.2311755418777466,
206
- "learning_rate": 1.790564938846826e-05,
207
- "loss": 5.9357,
208
- "mean_token_accuracy": 0.24810438305139543,
209
- "num_tokens": 932807.0,
210
  "step": 1000
211
  },
212
  {
213
- "entropy": 6.199834351539612,
214
  "epoch": 0.6044905008635578,
215
- "grad_norm": 2.2788379192352295,
216
- "learning_rate": 1.7789167152009318e-05,
217
- "loss": 5.9964,
218
- "mean_token_accuracy": 0.23942562609910964,
219
- "num_tokens": 980541.0,
220
  "step": 1050
221
  },
222
  {
223
- "entropy": 5.961639919281006,
224
  "epoch": 0.6332757628094415,
225
- "grad_norm": 1.9077532291412354,
226
- "learning_rate": 1.767268491555038e-05,
227
- "loss": 5.7664,
228
- "mean_token_accuracy": 0.26718012750148773,
229
- "num_tokens": 1023882.0,
230
  "step": 1100
231
  },
232
  {
233
- "entropy": 5.889280087947846,
234
  "epoch": 0.6620610247553252,
235
- "grad_norm": 2.4254891872406006,
236
- "learning_rate": 1.7556202679091442e-05,
237
- "loss": 5.6952,
238
- "mean_token_accuracy": 0.27529804170131683,
239
- "num_tokens": 1068300.0,
240
  "step": 1150
241
  },
242
  {
243
- "entropy": 6.085640063285828,
244
  "epoch": 0.690846286701209,
245
- "grad_norm": 2.35312557220459,
246
- "learning_rate": 1.74397204426325e-05,
247
- "loss": 5.8898,
248
- "mean_token_accuracy": 0.25166562348604204,
249
- "num_tokens": 1115425.0,
250
  "step": 1200
251
  },
252
  {
253
- "entropy": 6.146574058532715,
254
  "epoch": 0.7196315486470927,
255
- "grad_norm": 1.7730146646499634,
256
- "learning_rate": 1.732323820617356e-05,
257
- "loss": 5.9519,
258
- "mean_token_accuracy": 0.24276195973157882,
259
- "num_tokens": 1162319.0,
260
  "step": 1250
261
  },
262
  {
263
- "entropy": 6.079372715950012,
264
  "epoch": 0.7484168105929764,
265
- "grad_norm": 1.7070863246917725,
266
- "learning_rate": 1.720675596971462e-05,
267
- "loss": 5.8922,
268
- "mean_token_accuracy": 0.24961524546146394,
269
- "num_tokens": 1208230.0,
270
  "step": 1300
271
  },
272
  {
273
- "entropy": 5.9683656406402585,
274
  "epoch": 0.7772020725388601,
275
- "grad_norm": 1.8790594339370728,
276
- "learning_rate": 1.709027373325568e-05,
277
- "loss": 5.7827,
278
- "mean_token_accuracy": 0.2632122594118118,
279
- "num_tokens": 1253074.0,
280
  "step": 1350
281
  },
282
  {
283
- "entropy": 6.107076721191406,
284
  "epoch": 0.8059873344847438,
285
- "grad_norm": 1.1745644807815552,
286
- "learning_rate": 1.6973791496796742e-05,
287
- "loss": 5.9211,
288
- "mean_token_accuracy": 0.24564073830842972,
289
- "num_tokens": 1300179.0,
290
  "step": 1400
291
  },
292
  {
293
- "entropy": 6.141328382492065,
294
  "epoch": 0.8347725964306275,
295
- "grad_norm": 1.0346958637237549,
296
- "learning_rate": 1.68573092603378e-05,
297
- "loss": 5.9584,
298
- "mean_token_accuracy": 0.23997059136629104,
299
- "num_tokens": 1347539.0,
300
  "step": 1450
301
  },
302
  {
303
- "entropy": 6.070010099411011,
304
  "epoch": 0.8635578583765112,
305
- "grad_norm": 1.6541163921356201,
306
- "learning_rate": 1.674082702387886e-05,
307
- "loss": 5.889,
308
- "mean_token_accuracy": 0.24875166177749633,
309
- "num_tokens": 1394157.0,
310
  "step": 1500
311
  },
312
  {
313
- "entropy": 6.207450666427612,
314
  "epoch": 0.8923431203223949,
315
- "grad_norm": 0.9742990732192993,
316
- "learning_rate": 1.662434478741992e-05,
317
- "loss": 6.0217,
318
- "mean_token_accuracy": 0.23067249596118927,
319
- "num_tokens": 1443892.0,
320
  "step": 1550
321
  },
322
  {
323
- "entropy": 6.026197805404663,
324
  "epoch": 0.9211283822682786,
325
- "grad_norm": 1.4229531288146973,
326
- "learning_rate": 1.650786255096098e-05,
327
- "loss": 5.8455,
328
- "mean_token_accuracy": 0.2537291014194489,
329
- "num_tokens": 1491050.0,
330
  "step": 1600
331
  },
332
  {
333
- "entropy": 6.210526428222656,
334
  "epoch": 0.9499136442141624,
335
- "grad_norm": 1.3555018901824951,
336
- "learning_rate": 1.6391380314502038e-05,
337
- "loss": 6.0279,
338
- "mean_token_accuracy": 0.2308420208096504,
339
- "num_tokens": 1540809.0,
340
  "step": 1650
341
  },
342
  {
343
- "entropy": 5.9872834014892575,
344
  "epoch": 0.9786989061600461,
345
- "grad_norm": 0.9893498420715332,
346
- "learning_rate": 1.62748980780431e-05,
347
- "loss": 5.8137,
348
- "mean_token_accuracy": 0.2566875320672989,
349
- "num_tokens": 1585876.0,
350
  "step": 1700
351
  },
352
  {
353
  "epoch": 1.0,
354
- "eval_entropy": 6.322207130045386,
355
- "eval_loss": 6.15173864364624,
356
- "eval_mean_token_accuracy": 0.21116007946877985,
357
- "eval_model_preparation_time": 0.0036,
358
- "eval_num_tokens": 1619719.0,
359
- "eval_runtime": 76.1297,
360
- "eval_samples_per_second": 5.701,
361
- "eval_steps_per_second": 2.85,
362
  "step": 1737
363
  },
364
  {
365
- "entropy": 6.038531675338745,
366
  "epoch": 1.0074841681059297,
367
- "grad_norm": 0.8715208172798157,
368
- "learning_rate": 1.615841584158416e-05,
369
- "loss": 5.8628,
370
- "mean_token_accuracy": 0.2510762655735016,
371
- "num_tokens": 1632015.0,
372
  "step": 1750
373
  },
374
  {
375
- "entropy": 6.164030771255494,
376
  "epoch": 1.0362694300518134,
377
- "grad_norm": 0.7344900965690613,
378
- "learning_rate": 1.604193360512522e-05,
379
- "loss": 5.9856,
380
- "mean_token_accuracy": 0.2351543301343918,
381
- "num_tokens": 1681154.0,
382
  "step": 1800
383
  },
384
  {
385
- "entropy": 6.0731862354278565,
386
  "epoch": 1.065054691997697,
387
- "grad_norm": 1.0801328420639038,
388
- "learning_rate": 1.592545136866628e-05,
389
- "loss": 5.8976,
390
- "mean_token_accuracy": 0.24701615989208223,
391
- "num_tokens": 1728110.0,
392
  "step": 1850
393
  },
394
  {
395
- "entropy": 6.079212121963501,
396
  "epoch": 1.0938399539435808,
397
- "grad_norm": 0.7876909375190735,
398
- "learning_rate": 1.5808969132207338e-05,
399
- "loss": 5.9056,
400
- "mean_token_accuracy": 0.24457543224096298,
401
- "num_tokens": 1775703.0,
402
  "step": 1900
403
  },
404
  {
405
- "entropy": 6.062467746734619,
406
  "epoch": 1.1226252158894645,
407
- "grad_norm": 0.5999078750610352,
408
- "learning_rate": 1.56924868957484e-05,
409
- "loss": 5.8899,
410
- "mean_token_accuracy": 0.2469428673386574,
411
- "num_tokens": 1821980.0,
412
  "step": 1950
413
  },
414
  {
415
- "entropy": 6.031774473190308,
416
  "epoch": 1.1514104778353482,
417
- "grad_norm": 1.6313235759735107,
418
- "learning_rate": 1.557600465928946e-05,
419
- "loss": 5.8593,
420
- "mean_token_accuracy": 0.250918984413147,
421
- "num_tokens": 1867547.0,
422
  "step": 2000
423
  },
424
  {
425
- "entropy": 6.122789564132691,
426
  "epoch": 1.180195739781232,
427
- "grad_norm": 2.562373161315918,
428
- "learning_rate": 1.545952242283052e-05,
429
- "loss": 5.9502,
430
- "mean_token_accuracy": 0.23938885867595672,
431
- "num_tokens": 1915411.0,
432
  "step": 2050
433
  },
434
  {
435
- "entropy": 6.067130417823791,
436
  "epoch": 1.2089810017271156,
437
- "grad_norm": 0.9762872457504272,
438
- "learning_rate": 1.534304018637158e-05,
439
- "loss": 5.8956,
440
- "mean_token_accuracy": 0.2454381173849106,
441
- "num_tokens": 1964009.0,
442
  "step": 2100
443
  },
444
  {
445
- "entropy": 5.9613511180877685,
446
  "epoch": 1.2377662636729994,
447
- "grad_norm": 0.8701547384262085,
448
- "learning_rate": 1.5226557949912639e-05,
449
- "loss": 5.7907,
450
- "mean_token_accuracy": 0.25976367652416227,
451
- "num_tokens": 2008595.0,
452
  "step": 2150
453
  },
454
  {
455
- "entropy": 6.13505428314209,
456
  "epoch": 1.266551525618883,
457
- "grad_norm": 0.8511647582054138,
458
- "learning_rate": 1.51100757134537e-05,
459
- "loss": 5.9619,
460
- "mean_token_accuracy": 0.23760781466960906,
461
- "num_tokens": 2057229.0,
462
  "step": 2200
463
  },
464
  {
465
- "entropy": 6.025254983901977,
466
  "epoch": 1.2953367875647668,
467
- "grad_norm": 0.7627406120300293,
468
- "learning_rate": 1.4993593476994758e-05,
469
- "loss": 5.8546,
470
- "mean_token_accuracy": 0.2508662334084511,
471
- "num_tokens": 2103631.0,
472
  "step": 2250
473
  },
474
  {
475
- "entropy": 5.981974196434021,
476
  "epoch": 1.3241220495106505,
477
- "grad_norm": 1.6922173500061035,
478
- "learning_rate": 1.4877111240535819e-05,
479
- "loss": 5.8119,
480
- "mean_token_accuracy": 0.256170334815979,
481
- "num_tokens": 2150369.0,
482
  "step": 2300
483
  },
484
  {
485
- "entropy": 6.19903904914856,
486
  "epoch": 1.3529073114565342,
487
- "grad_norm": 0.40436601638793945,
488
- "learning_rate": 1.4760629004076878e-05,
489
- "loss": 6.0244,
490
- "mean_token_accuracy": 0.22900927513837815,
491
- "num_tokens": 2199724.0,
492
  "step": 2350
493
  },
494
  {
495
- "entropy": 5.986697297096253,
496
  "epoch": 1.381692573402418,
497
- "grad_norm": 0.8481882214546204,
498
- "learning_rate": 1.464414676761794e-05,
499
- "loss": 5.8195,
500
- "mean_token_accuracy": 0.2552035376429558,
501
- "num_tokens": 2245341.0,
502
  "step": 2400
503
  },
504
  {
505
- "entropy": 6.1886044692993165,
506
  "epoch": 1.4104778353483016,
507
- "grad_norm": 0.7911505103111267,
508
- "learning_rate": 1.4527664531159e-05,
509
- "loss": 6.0148,
510
- "mean_token_accuracy": 0.23026730984449387,
511
- "num_tokens": 2294726.0,
512
  "step": 2450
513
  },
514
  {
515
- "entropy": 5.974867792129516,
516
  "epoch": 1.4392630972941853,
517
- "grad_norm": 1.640499234199524,
518
- "learning_rate": 1.441118229470006e-05,
519
- "loss": 5.8111,
520
- "mean_token_accuracy": 0.2554209426045418,
521
- "num_tokens": 2342251.0,
522
  "step": 2500
523
  },
524
  {
525
- "entropy": 5.967635660171509,
526
  "epoch": 1.468048359240069,
527
- "grad_norm": 0.8022929430007935,
528
- "learning_rate": 1.429470005824112e-05,
529
- "loss": 5.8015,
530
- "mean_token_accuracy": 0.2569852137565613,
531
- "num_tokens": 2387469.0,
532
  "step": 2550
533
  },
534
  {
535
- "entropy": 6.047262029647827,
536
  "epoch": 1.4968336211859528,
537
- "grad_norm": 0.9270678758621216,
538
- "learning_rate": 1.417821782178218e-05,
539
- "loss": 5.8782,
540
- "mean_token_accuracy": 0.2467849862575531,
541
- "num_tokens": 2434128.0,
542
  "step": 2600
543
  },
544
  {
545
- "entropy": 6.00601068019867,
546
  "epoch": 1.5256188831318365,
547
- "grad_norm": 1.5378597974777222,
548
- "learning_rate": 1.406173558532324e-05,
549
- "loss": 5.839,
550
- "mean_token_accuracy": 0.25216978013515473,
551
- "num_tokens": 2480366.0,
552
  "step": 2650
553
  },
554
  {
555
- "entropy": 5.988714299201965,
556
  "epoch": 1.5544041450777202,
557
- "grad_norm": 0.819143533706665,
558
- "learning_rate": 1.3945253348864299e-05,
559
- "loss": 5.82,
560
- "mean_token_accuracy": 0.254311783015728,
561
- "num_tokens": 2527357.0,
562
  "step": 2700
563
  },
564
  {
565
- "entropy": 5.960293846130371,
566
  "epoch": 1.583189407023604,
567
- "grad_norm": 0.8920449614524841,
568
- "learning_rate": 1.382877111240536e-05,
569
- "loss": 5.7946,
570
- "mean_token_accuracy": 0.25750755161046984,
571
- "num_tokens": 2574470.0,
572
  "step": 2750
573
  },
574
  {
575
- "entropy": 6.1214879322052,
576
  "epoch": 1.6119746689694876,
577
- "grad_norm": 0.5333890914916992,
578
- "learning_rate": 1.371228887594642e-05,
579
- "loss": 5.9513,
580
- "mean_token_accuracy": 0.2377367687225342,
581
- "num_tokens": 2622280.0,
582
  "step": 2800
583
  },
584
  {
585
- "entropy": 5.951769871711731,
586
  "epoch": 1.6407599309153713,
587
- "grad_norm": 0.5994665026664734,
588
- "learning_rate": 1.3595806639487479e-05,
589
- "loss": 5.7861,
590
- "mean_token_accuracy": 0.25854207515716554,
591
- "num_tokens": 2668624.0,
592
  "step": 2850
593
  },
594
  {
595
- "entropy": 5.927765312194825,
596
  "epoch": 1.669545192861255,
597
- "grad_norm": 0.4460087716579437,
598
- "learning_rate": 1.347932440302854e-05,
599
- "loss": 5.7661,
600
- "mean_token_accuracy": 0.25973255425691605,
601
- "num_tokens": 2714388.0,
602
  "step": 2900
603
  },
604
  {
605
- "entropy": 6.097678365707398,
606
  "epoch": 1.6983304548071387,
607
- "grad_norm": 0.7125752568244934,
608
- "learning_rate": 1.3362842166569598e-05,
609
- "loss": 5.9284,
610
- "mean_token_accuracy": 0.23995368272066117,
611
- "num_tokens": 2761465.0,
612
  "step": 2950
613
  },
614
  {
615
- "entropy": 5.986212658882141,
616
  "epoch": 1.7271157167530224,
617
- "grad_norm": 1.5405049324035645,
618
- "learning_rate": 1.3246359930110659e-05,
619
- "loss": 5.8194,
620
- "mean_token_accuracy": 0.25333445996046067,
621
- "num_tokens": 2808066.0,
622
  "step": 3000
623
  },
624
  {
625
- "entropy": 5.7968806195259095,
626
  "epoch": 1.7559009786989062,
627
- "grad_norm": 0.4532749652862549,
628
- "learning_rate": 1.312987769365172e-05,
629
- "loss": 5.6344,
630
- "mean_token_accuracy": 0.2782411390542984,
631
- "num_tokens": 2851822.0,
632
  "step": 3050
633
  },
634
  {
635
- "entropy": 5.973708114624023,
636
  "epoch": 1.7846862406447899,
637
- "grad_norm": 1.4795438051223755,
638
- "learning_rate": 1.3013395457192778e-05,
639
- "loss": 5.8104,
640
- "mean_token_accuracy": 0.25441971331834795,
641
- "num_tokens": 2897737.0,
642
  "step": 3100
643
  },
644
  {
645
- "entropy": 5.70733567237854,
646
  "epoch": 1.8134715025906736,
647
- "grad_norm": 0.6216577887535095,
648
- "learning_rate": 1.2896913220733839e-05,
649
- "loss": 5.5523,
650
- "mean_token_accuracy": 0.28787180870771406,
651
- "num_tokens": 2939511.0,
652
  "step": 3150
653
  },
654
  {
655
- "entropy": 5.96826630115509,
656
  "epoch": 1.8422567645365573,
657
- "grad_norm": 0.9246350526809692,
658
- "learning_rate": 1.2780430984274898e-05,
659
- "loss": 5.8057,
660
- "mean_token_accuracy": 0.25464902341365814,
661
- "num_tokens": 2986368.0,
662
  "step": 3200
663
  },
664
  {
665
- "entropy": 5.950662693977356,
666
  "epoch": 1.871042026482441,
667
- "grad_norm": 0.8141199946403503,
668
- "learning_rate": 1.266394874781596e-05,
669
- "loss": 5.7886,
670
- "mean_token_accuracy": 0.25830793648958206,
671
- "num_tokens": 3031770.0,
672
  "step": 3250
673
  },
674
  {
675
- "entropy": 6.00512773513794,
676
  "epoch": 1.8998272884283247,
677
- "grad_norm": 0.4913998246192932,
678
- "learning_rate": 1.2547466511357018e-05,
679
- "loss": 5.838,
680
- "mean_token_accuracy": 0.2512077575922012,
681
- "num_tokens": 3078322.0,
682
  "step": 3300
683
  },
684
  {
685
- "entropy": 6.090880632400513,
686
  "epoch": 1.9286125503742084,
687
- "grad_norm": 0.9893012046813965,
688
- "learning_rate": 1.243098427489808e-05,
689
- "loss": 5.9264,
690
- "mean_token_accuracy": 0.2391783133149147,
691
- "num_tokens": 3125572.0,
692
  "step": 3350
693
  },
694
  {
695
- "entropy": 5.949693293571472,
696
  "epoch": 1.9573978123200921,
697
- "grad_norm": 0.5794200301170349,
698
- "learning_rate": 1.231450203843914e-05,
699
- "loss": 5.7861,
700
- "mean_token_accuracy": 0.2568664598464966,
701
- "num_tokens": 3171974.0,
702
  "step": 3400
703
  },
704
  {
705
- "entropy": 6.03591317653656,
706
  "epoch": 1.9861830742659758,
707
- "grad_norm": 0.8525373339653015,
708
- "learning_rate": 1.21980198019802e-05,
709
- "loss": 5.8741,
710
- "mean_token_accuracy": 0.24642003327608109,
711
- "num_tokens": 3219624.0,
712
  "step": 3450
713
  },
714
  {
715
  "epoch": 2.0,
716
- "eval_entropy": 6.272298685416648,
717
- "eval_loss": 6.12472677230835,
718
- "eval_mean_token_accuracy": 0.21168697409091458,
719
- "eval_model_preparation_time": 0.0036,
720
- "eval_num_tokens": 3239438.0,
721
- "eval_runtime": 76.2536,
722
- "eval_samples_per_second": 5.692,
723
- "eval_steps_per_second": 2.846,
724
  "step": 3474
725
  },
726
  {
727
- "entropy": 5.914763498306274,
728
  "epoch": 2.0149683362118593,
729
- "grad_norm": 0.5479806661605835,
730
- "learning_rate": 1.208153756552126e-05,
731
- "loss": 5.7559,
732
- "mean_token_accuracy": 0.2624077323079109,
733
- "num_tokens": 3263994.0,
734
  "step": 3500
735
  },
736
  {
737
- "entropy": 6.033470869064331,
738
  "epoch": 2.043753598157743,
739
- "grad_norm": 1.7186369895935059,
740
- "learning_rate": 1.1965055329062319e-05,
741
- "loss": 5.8677,
742
- "mean_token_accuracy": 0.24745646148920059,
743
- "num_tokens": 3311182.0,
744
  "step": 3550
745
  },
746
  {
747
- "entropy": 5.962404427528381,
748
  "epoch": 2.0725388601036268,
749
- "grad_norm": 0.9068580269813538,
750
- "learning_rate": 1.184857309260338e-05,
751
- "loss": 5.8038,
752
- "mean_token_accuracy": 0.25500513821840287,
753
- "num_tokens": 3358036.0,
754
  "step": 3600
755
  },
756
  {
757
- "entropy": 5.995727968215943,
758
  "epoch": 2.1013241220495105,
759
- "grad_norm": 2.044490337371826,
760
- "learning_rate": 1.1732090856144438e-05,
761
- "loss": 5.8333,
762
- "mean_token_accuracy": 0.2514388278126717,
763
- "num_tokens": 3404058.0,
764
  "step": 3650
765
  },
766
  {
767
- "entropy": 5.981345901489258,
768
  "epoch": 2.130109383995394,
769
- "grad_norm": 0.5262818336486816,
770
- "learning_rate": 1.1615608619685499e-05,
771
- "loss": 5.8205,
772
- "mean_token_accuracy": 0.2523340278863907,
773
- "num_tokens": 3449834.0,
774
  "step": 3700
775
  },
776
  {
777
- "entropy": 5.848710675239563,
778
  "epoch": 2.158894645941278,
779
- "grad_norm": 0.726718544960022,
780
- "learning_rate": 1.149912638322656e-05,
781
- "loss": 5.6891,
782
- "mean_token_accuracy": 0.2697497832775116,
783
- "num_tokens": 3494740.0,
784
  "step": 3750
785
  },
786
  {
787
- "entropy": 5.964878315925598,
788
  "epoch": 2.1876799078871616,
789
- "grad_norm": 0.6147393584251404,
790
- "learning_rate": 1.1382644146767618e-05,
791
- "loss": 5.8029,
792
- "mean_token_accuracy": 0.2553535890579224,
793
- "num_tokens": 3541342.0,
794
  "step": 3800
795
  },
796
  {
797
- "entropy": 6.045858116149902,
798
  "epoch": 2.2164651698330453,
799
- "grad_norm": 0.8283621072769165,
800
- "learning_rate": 1.1266161910308679e-05,
801
- "loss": 5.8802,
802
- "mean_token_accuracy": 0.24544916599988936,
803
- "num_tokens": 3588995.0,
804
  "step": 3850
805
  },
806
  {
807
- "entropy": 5.909895505905151,
808
  "epoch": 2.245250431778929,
809
- "grad_norm": 0.9912867546081543,
810
- "learning_rate": 1.1149679673849738e-05,
811
- "loss": 5.7481,
812
- "mean_token_accuracy": 0.2620398569107056,
813
- "num_tokens": 3634252.0,
814
  "step": 3900
815
  },
816
  {
817
- "entropy": 5.9534005498886104,
818
  "epoch": 2.2740356937248127,
819
- "grad_norm": 1.2012401819229126,
820
- "learning_rate": 1.1033197437390799e-05,
821
- "loss": 5.788,
822
- "mean_token_accuracy": 0.25642816990613937,
823
- "num_tokens": 3681197.0,
824
  "step": 3950
825
  },
826
  {
827
- "entropy": 6.155718851089477,
828
  "epoch": 2.3028209556706964,
829
- "grad_norm": 1.4272509813308716,
830
- "learning_rate": 1.0916715200931857e-05,
831
- "loss": 5.9842,
832
- "mean_token_accuracy": 0.23176315426826477,
833
- "num_tokens": 3729955.0,
834
  "step": 4000
835
  },
836
  {
837
- "entropy": 6.004842009544372,
838
  "epoch": 2.33160621761658,
839
- "grad_norm": 1.1919596195220947,
840
- "learning_rate": 1.0800232964472918e-05,
841
- "loss": 5.8332,
842
- "mean_token_accuracy": 0.25039500594139097,
843
- "num_tokens": 3777043.0,
844
  "step": 4050
845
  },
846
  {
847
- "entropy": 6.045269584655761,
848
  "epoch": 2.360391479562464,
849
- "grad_norm": 0.6200748085975647,
850
- "learning_rate": 1.068375072801398e-05,
851
- "loss": 5.8641,
852
- "mean_token_accuracy": 0.2466951721906662,
853
- "num_tokens": 3824067.0,
854
  "step": 4100
855
  },
856
  {
857
- "entropy": 6.105137758255005,
858
  "epoch": 2.3891767415083476,
859
- "grad_norm": 1.0185531377792358,
860
- "learning_rate": 1.0567268491555038e-05,
861
- "loss": 5.9181,
862
- "mean_token_accuracy": 0.24000227689743042,
863
- "num_tokens": 3872769.0,
864
  "step": 4150
865
  },
866
  {
867
- "entropy": 6.013391451835632,
868
  "epoch": 2.4179620034542313,
869
- "grad_norm": 0.6188511848449707,
870
- "learning_rate": 1.04507862550961e-05,
871
- "loss": 5.8286,
872
- "mean_token_accuracy": 0.25189226895570754,
873
- "num_tokens": 3919379.0,
874
  "step": 4200
875
  },
876
  {
877
- "entropy": 5.972923498153687,
878
  "epoch": 2.446747265400115,
879
- "grad_norm": 0.7165982127189636,
880
- "learning_rate": 1.0334304018637157e-05,
881
- "loss": 5.7908,
882
- "mean_token_accuracy": 0.2567197346687317,
883
- "num_tokens": 3965593.0,
884
  "step": 4250
885
  },
886
  {
887
- "entropy": 6.0378124713897705,
888
  "epoch": 2.4755325273459987,
889
- "grad_norm": 0.5278330445289612,
890
- "learning_rate": 1.021782178217822e-05,
891
- "loss": 5.8559,
892
- "mean_token_accuracy": 0.2484271454811096,
893
- "num_tokens": 4012300.0,
894
  "step": 4300
895
  },
896
  {
897
- "entropy": 5.984496111869812,
898
  "epoch": 2.5043177892918824,
899
- "grad_norm": 0.8995006680488586,
900
- "learning_rate": 1.0101339545719278e-05,
901
- "loss": 5.8092,
902
- "mean_token_accuracy": 0.253717774450779,
903
- "num_tokens": 4059323.0,
904
  "step": 4350
905
  },
906
  {
907
- "entropy": 6.124767150878906,
908
  "epoch": 2.533103051237766,
909
- "grad_norm": 1.3810409307479858,
910
- "learning_rate": 9.984857309260339e-06,
911
- "loss": 5.9468,
912
- "mean_token_accuracy": 0.23715158700942993,
913
- "num_tokens": 4107616.0,
914
  "step": 4400
915
  },
916
  {
917
- "entropy": 5.8810745000839235,
918
  "epoch": 2.56188831318365,
919
- "grad_norm": 0.8794332146644592,
920
- "learning_rate": 9.868375072801398e-06,
921
- "loss": 5.7089,
922
- "mean_token_accuracy": 0.2662400561571121,
923
- "num_tokens": 4152400.0,
924
  "step": 4450
925
  },
926
  {
927
- "entropy": 6.108017959594727,
928
  "epoch": 2.5906735751295336,
929
- "grad_norm": 0.5132983922958374,
930
- "learning_rate": 9.751892836342458e-06,
931
- "loss": 5.9346,
932
- "mean_token_accuracy": 0.23871887892484664,
933
- "num_tokens": 4200994.0,
934
  "step": 4500
935
  },
936
  {
937
- "entropy": 5.985005149841308,
938
  "epoch": 2.6194588370754173,
939
- "grad_norm": 0.6561470031738281,
940
- "learning_rate": 9.635410599883519e-06,
941
- "loss": 5.8111,
942
- "mean_token_accuracy": 0.25315980523824694,
943
- "num_tokens": 4247548.0,
944
  "step": 4550
945
  },
946
  {
947
- "entropy": 6.050709452629089,
948
  "epoch": 2.648244099021301,
949
- "grad_norm": 0.8790570497512817,
950
- "learning_rate": 9.51892836342458e-06,
951
- "loss": 5.8789,
952
- "mean_token_accuracy": 0.2440834751725197,
953
- "num_tokens": 4295250.0,
954
  "step": 4600
955
  },
956
  {
957
- "entropy": 6.007251596450805,
958
  "epoch": 2.6770293609671847,
959
- "grad_norm": 0.6728562116622925,
960
- "learning_rate": 9.402446126965639e-06,
961
- "loss": 5.8338,
962
- "mean_token_accuracy": 0.2509264424443245,
963
- "num_tokens": 4341599.0,
964
  "step": 4650
965
  },
966
  {
967
- "entropy": 5.966628184318543,
968
  "epoch": 2.7058146229130684,
969
- "grad_norm": 0.5815795063972473,
970
- "learning_rate": 9.285963890506699e-06,
971
- "loss": 5.7961,
972
- "mean_token_accuracy": 0.2559360232949257,
973
- "num_tokens": 4388673.0,
974
  "step": 4700
975
  },
976
  {
977
- "entropy": 5.7972593069076535,
978
  "epoch": 2.734599884858952,
979
- "grad_norm": 1.0610334873199463,
980
- "learning_rate": 9.169481654047758e-06,
981
- "loss": 5.6318,
982
- "mean_token_accuracy": 0.27574603259563446,
983
- "num_tokens": 4432959.0,
984
  "step": 4750
985
  },
986
  {
987
- "entropy": 5.984181261062622,
988
  "epoch": 2.763385146804836,
989
- "grad_norm": 2.1847357749938965,
990
- "learning_rate": 9.052999417588819e-06,
991
- "loss": 5.8153,
992
- "mean_token_accuracy": 0.2533784031867981,
993
- "num_tokens": 4479190.0,
994
  "step": 4800
995
  },
996
  {
997
- "entropy": 5.959725599288941,
998
  "epoch": 2.7921704087507195,
999
- "grad_norm": 0.5671709179878235,
1000
- "learning_rate": 8.936517181129878e-06,
1001
- "loss": 5.7912,
1002
- "mean_token_accuracy": 0.2556650054454803,
1003
- "num_tokens": 4525674.0,
1004
  "step": 4850
1005
  },
1006
  {
1007
- "entropy": 5.814929313659668,
1008
  "epoch": 2.8209556706966032,
1009
- "grad_norm": 0.9447108507156372,
1010
- "learning_rate": 8.820034944670938e-06,
1011
- "loss": 5.6478,
1012
- "mean_token_accuracy": 0.27417868226766584,
1013
- "num_tokens": 4570379.0,
1014
  "step": 4900
1015
  },
1016
  {
1017
- "entropy": 5.96754421710968,
1018
  "epoch": 2.849740932642487,
1019
- "grad_norm": 2.009676218032837,
1020
- "learning_rate": 8.703552708211999e-06,
1021
- "loss": 5.795,
1022
- "mean_token_accuracy": 0.2556305864453316,
1023
- "num_tokens": 4617184.0,
1024
  "step": 4950
1025
  },
1026
  {
1027
- "entropy": 6.008112049102783,
1028
  "epoch": 2.8785261945883707,
1029
- "grad_norm": 1.1977978944778442,
1030
- "learning_rate": 8.587070471753058e-06,
1031
- "loss": 5.8416,
1032
- "mean_token_accuracy": 0.2494604030251503,
1033
- "num_tokens": 4664180.0,
1034
  "step": 5000
1035
  },
1036
  {
1037
- "entropy": 5.832320966720581,
1038
  "epoch": 2.9073114565342544,
1039
- "grad_norm": 0.4845636785030365,
1040
- "learning_rate": 8.470588235294118e-06,
1041
- "loss": 5.6672,
1042
- "mean_token_accuracy": 0.27187123566865923,
1043
- "num_tokens": 4708377.0,
1044
  "step": 5050
1045
  },
1046
  {
1047
- "entropy": 5.84138514995575,
1048
  "epoch": 2.936096718480138,
1049
- "grad_norm": 0.8487229943275452,
1050
- "learning_rate": 8.354105998835179e-06,
1051
- "loss": 5.6769,
1052
- "mean_token_accuracy": 0.26995211571455,
1053
- "num_tokens": 4753587.0,
1054
  "step": 5100
1055
  },
1056
  {
1057
- "entropy": 6.016681690216064,
1058
  "epoch": 2.964881980426022,
1059
- "grad_norm": 0.9554332494735718,
1060
- "learning_rate": 8.237623762376238e-06,
1061
- "loss": 5.8479,
1062
- "mean_token_accuracy": 0.24785644590854644,
1063
- "num_tokens": 4800508.0,
1064
  "step": 5150
1065
  },
1066
  {
1067
- "entropy": 6.103472499847412,
1068
  "epoch": 2.9936672423719055,
1069
- "grad_norm": 0.6602863669395447,
1070
- "learning_rate": 8.121141525917298e-06,
1071
- "loss": 5.9305,
1072
- "mean_token_accuracy": 0.23794592499732972,
1073
- "num_tokens": 4849415.0,
1074
  "step": 5200
1075
  },
1076
  {
1077
  "epoch": 3.0,
1078
- "eval_entropy": 6.254081044878278,
1079
- "eval_loss": 6.0980024337768555,
1080
- "eval_mean_token_accuracy": 0.21401402258103894,
1081
- "eval_model_preparation_time": 0.0036,
1082
- "eval_num_tokens": 4859157.0,
1083
- "eval_runtime": 75.9443,
1084
- "eval_samples_per_second": 5.715,
1085
- "eval_steps_per_second": 2.857,
1086
  "step": 5211
1087
  }
1088
  ],
1089
  "logging_steps": 50,
1090
- "max_steps": 8685,
1091
  "num_input_tokens_seen": 0,
1092
- "num_train_epochs": 5,
1093
  "save_steps": 500,
1094
  "stateful_callbacks": {
1095
  "TrainerControl": {
@@ -1103,7 +1103,7 @@
1103
  "attributes": {}
1104
  }
1105
  },
1106
- "total_flos": 6.795785692717056e+16,
1107
  "train_batch_size": 2,
1108
  "trial_name": null,
1109
  "trial_params": null
 
1
  {
2
  "best_global_step": 5211,
3
+ "best_metric": 5.628758430480957,
4
  "best_model_checkpoint": "./output/checkpoint-5211",
5
  "epoch": 3.0,
6
  "eval_steps": 500,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 3.6583470726013183,
14
  "epoch": 0.028785261945883708,
15
+ "grad_norm": 3.3817152976989746,
16
+ "learning_rate": 4.9e-07,
17
+ "loss": 13.8754,
18
+ "mean_token_accuracy": 0.15036460414528846,
19
+ "num_tokens": 53093.0,
20
  "step": 50
21
  },
22
  {
23
+ "entropy": 3.669608063697815,
24
  "epoch": 0.057570523891767415,
25
+ "grad_norm": 3.2541544437408447,
26
+ "learning_rate": 9.9e-07,
27
+ "loss": 14.2282,
28
+ "mean_token_accuracy": 0.14137721598148345,
29
+ "num_tokens": 108334.0,
30
  "step": 100
31
  },
32
  {
33
+ "entropy": 3.569736371040344,
34
  "epoch": 0.08635578583765112,
35
+ "grad_norm": 3.6797454357147217,
36
+ "learning_rate": 1.49e-06,
37
+ "loss": 13.0735,
38
+ "mean_token_accuracy": 0.17473630651831626,
39
+ "num_tokens": 157491.0,
40
  "step": 150
41
  },
42
  {
43
+ "entropy": 3.7253233194351196,
44
  "epoch": 0.11514104778353483,
45
+ "grad_norm": 4.297911643981934,
46
+ "learning_rate": 1.99e-06,
47
+ "loss": 13.7392,
48
+ "mean_token_accuracy": 0.1473099772632122,
49
+ "num_tokens": 211394.0,
50
  "step": 200
51
  },
52
  {
53
+ "entropy": 3.8280500602722167,
54
  "epoch": 0.14392630972941853,
55
+ "grad_norm": 4.405268669128418,
56
+ "learning_rate": 1.9854771784232364e-06,
57
+ "loss": 13.0797,
58
+ "mean_token_accuracy": 0.16704789966344832,
59
+ "num_tokens": 263685.0,
60
  "step": 250
61
  },
62
  {
63
+ "entropy": 4.066333084106446,
64
  "epoch": 0.17271157167530224,
65
+ "grad_norm": 4.757556438446045,
66
+ "learning_rate": 1.9706579727326615e-06,
67
+ "loss": 12.6321,
68
+ "mean_token_accuracy": 0.1691790708899498,
69
+ "num_tokens": 314059.0,
70
  "step": 300
71
  },
72
  {
73
+ "entropy": 4.257266030311585,
74
  "epoch": 0.20149683362118595,
75
+ "grad_norm": 6.406249523162842,
76
+ "learning_rate": 1.955838767042086e-06,
77
+ "loss": 12.2253,
78
+ "mean_token_accuracy": 0.17223650276660918,
79
+ "num_tokens": 367038.0,
80
  "step": 350
81
  },
82
  {
83
+ "entropy": 4.694105777740479,
84
  "epoch": 0.23028209556706966,
85
+ "grad_norm": 12.57987117767334,
86
+ "learning_rate": 1.9410195613515113e-06,
87
+ "loss": 11.9714,
88
+ "mean_token_accuracy": 0.15997304677963256,
89
+ "num_tokens": 420327.0,
90
  "step": 400
91
  },
92
  {
93
+ "entropy": 5.205010280609131,
94
  "epoch": 0.25906735751295334,
95
+ "grad_norm": 15.570313453674316,
96
+ "learning_rate": 1.9262003556609364e-06,
97
+ "loss": 10.8173,
98
+ "mean_token_accuracy": 0.16447648257017136,
99
+ "num_tokens": 472429.0,
100
  "step": 450
101
  },
102
  {
103
+ "entropy": 5.917805089950561,
104
  "epoch": 0.28785261945883706,
105
+ "grad_norm": 23.61503791809082,
106
+ "learning_rate": 1.9113811499703615e-06,
107
+ "loss": 9.3196,
108
+ "mean_token_accuracy": 0.16179455041885377,
109
+ "num_tokens": 526315.0,
110
  "step": 500
111
  },
112
  {
113
+ "entropy": 6.380368332862854,
114
  "epoch": 0.31663788140472077,
115
+ "grad_norm": 13.846810340881348,
116
+ "learning_rate": 1.8965619442797864e-06,
117
+ "loss": 7.9636,
118
+ "mean_token_accuracy": 0.16881170988082886,
119
+ "num_tokens": 578511.0,
120
  "step": 550
121
  },
122
  {
123
+ "entropy": 6.507339992523193,
124
  "epoch": 0.3454231433506045,
125
+ "grad_norm": 4.569090366363525,
126
+ "learning_rate": 1.8817427385892115e-06,
127
+ "loss": 7.4171,
128
+ "mean_token_accuracy": 0.16941152423620223,
129
+ "num_tokens": 630937.0,
130
  "step": 600
131
  },
132
  {
133
+ "entropy": 6.392864561080932,
134
  "epoch": 0.3742084052964882,
135
+ "grad_norm": 4.594696521759033,
136
+ "learning_rate": 1.8669235328986366e-06,
137
+ "loss": 6.9389,
138
+ "mean_token_accuracy": 0.1844496901333332,
139
+ "num_tokens": 680501.0,
140
  "step": 650
141
  },
142
  {
143
+ "entropy": 6.6726202869415285,
144
  "epoch": 0.4029936672423719,
145
+ "grad_norm": 4.768734931945801,
146
+ "learning_rate": 1.8521043272080617e-06,
147
+ "loss": 6.9818,
148
+ "mean_token_accuracy": 0.16990411713719367,
149
+ "num_tokens": 733231.0,
150
  "step": 700
151
  },
152
  {
153
+ "entropy": 6.592793455123902,
154
  "epoch": 0.4317789291882556,
155
+ "grad_norm": 3.253056764602661,
156
+ "learning_rate": 1.8372851215174864e-06,
157
+ "loss": 6.7105,
158
+ "mean_token_accuracy": 0.18250102579593658,
159
+ "num_tokens": 785373.0,
160
  "step": 750
161
  },
162
  {
163
+ "entropy": 6.683582029342651,
164
  "epoch": 0.4605641911341393,
165
+ "grad_norm": 2.1871063709259033,
166
+ "learning_rate": 1.8224659158269115e-06,
167
+ "loss": 6.6685,
168
+ "mean_token_accuracy": 0.17129646152257919,
169
+ "num_tokens": 838646.0,
170
  "step": 800
171
  },
172
  {
173
+ "entropy": 6.636875295639038,
174
  "epoch": 0.48934945308002303,
175
+ "grad_norm": 3.2284677028656006,
176
+ "learning_rate": 1.8076467101363366e-06,
177
+ "loss": 6.53,
178
+ "mean_token_accuracy": 0.18053789794445038,
179
+ "num_tokens": 892380.0,
180
  "step": 850
181
  },
182
  {
183
+ "entropy": 6.610673260688782,
184
  "epoch": 0.5181347150259067,
185
+ "grad_norm": 2.2088730335235596,
186
+ "learning_rate": 1.7928275044457617e-06,
187
+ "loss": 6.4429,
188
+ "mean_token_accuracy": 0.18492739230394364,
189
+ "num_tokens": 947971.0,
190
  "step": 900
191
  },
192
  {
193
+ "entropy": 6.242899022102356,
194
  "epoch": 0.5469199769717904,
195
+ "grad_norm": 2.3000030517578125,
196
+ "learning_rate": 1.7780082987551866e-06,
197
+ "loss": 6.047,
198
+ "mean_token_accuracy": 0.2291259828209877,
199
+ "num_tokens": 998810.0,
200
  "step": 950
201
  },
202
  {
203
+ "entropy": 6.311488924026489,
204
  "epoch": 0.5757052389176741,
205
+ "grad_norm": 2.1333675384521484,
206
+ "learning_rate": 1.7631890930646115e-06,
207
+ "loss": 6.0919,
208
+ "mean_token_accuracy": 0.22644571751356124,
209
+ "num_tokens": 1050860.0,
210
  "step": 1000
211
  },
212
  {
213
+ "entropy": 6.3254336166381835,
214
  "epoch": 0.6044905008635578,
215
+ "grad_norm": 2.0400779247283936,
216
+ "learning_rate": 1.7483698873740366e-06,
217
+ "loss": 6.094,
218
+ "mean_token_accuracy": 0.2222653564810753,
219
+ "num_tokens": 1104304.0,
220
  "step": 1050
221
  },
222
  {
223
+ "entropy": 6.046922063827514,
224
  "epoch": 0.6332757628094415,
225
+ "grad_norm": 2.8049051761627197,
226
+ "learning_rate": 1.7335506816834617e-06,
227
+ "loss": 5.8011,
228
+ "mean_token_accuracy": 0.25127078920602797,
229
+ "num_tokens": 1153605.0,
230
  "step": 1100
231
  },
232
  {
233
+ "entropy": 5.943600912094116,
234
  "epoch": 0.6620610247553252,
235
+ "grad_norm": 4.063963890075684,
236
+ "learning_rate": 1.7187314759928866e-06,
237
+ "loss": 5.6855,
238
+ "mean_token_accuracy": 0.26265266716480257,
239
+ "num_tokens": 1204328.0,
240
  "step": 1150
241
  },
242
  {
243
+ "entropy": 6.12883231639862,
244
  "epoch": 0.690846286701209,
245
+ "grad_norm": 3.9440460205078125,
246
+ "learning_rate": 1.7039122703023117e-06,
247
+ "loss": 5.8578,
248
+ "mean_token_accuracy": 0.24439335912466048,
249
+ "num_tokens": 1257415.0,
250
  "step": 1200
251
  },
252
  {
253
+ "entropy": 6.164987115859986,
254
  "epoch": 0.7196315486470927,
255
+ "grad_norm": 3.20070481300354,
256
+ "learning_rate": 1.6890930646117368e-06,
257
+ "loss": 5.8876,
258
+ "mean_token_accuracy": 0.24275501281023026,
259
+ "num_tokens": 1310049.0,
260
  "step": 1250
261
  },
262
  {
263
+ "entropy": 6.080997190475464,
264
  "epoch": 0.7484168105929764,
265
+ "grad_norm": 2.8067362308502197,
266
+ "learning_rate": 1.6742738589211617e-06,
267
+ "loss": 5.8058,
268
+ "mean_token_accuracy": 0.25242207854986193,
269
+ "num_tokens": 1361794.0,
270
  "step": 1300
271
  },
272
  {
273
+ "entropy": 5.940848155021667,
274
  "epoch": 0.7772020725388601,
275
+ "grad_norm": 2.6375925540924072,
276
+ "learning_rate": 1.6594546532305868e-06,
277
+ "loss": 5.6718,
278
+ "mean_token_accuracy": 0.2665082859992981,
279
+ "num_tokens": 1412773.0,
280
  "step": 1350
281
  },
282
  {
283
+ "entropy": 6.071129274368286,
284
  "epoch": 0.8059873344847438,
285
+ "grad_norm": 3.951350212097168,
286
+ "learning_rate": 1.6446354475400117e-06,
287
+ "loss": 5.8012,
288
+ "mean_token_accuracy": 0.25434976994991304,
289
+ "num_tokens": 1465620.0,
290
  "step": 1400
291
  },
292
  {
293
+ "entropy": 6.069429359436035,
294
  "epoch": 0.8347725964306275,
295
+ "grad_norm": 3.580608606338501,
296
+ "learning_rate": 1.6298162418494368e-06,
297
+ "loss": 5.8027,
298
+ "mean_token_accuracy": 0.25208072274923327,
299
+ "num_tokens": 1518899.0,
300
  "step": 1450
301
  },
302
  {
303
+ "entropy": 6.005315380096436,
304
  "epoch": 0.8635578583765112,
305
+ "grad_norm": 3.9580376148223877,
306
+ "learning_rate": 1.614997036158862e-06,
307
+ "loss": 5.7364,
308
+ "mean_token_accuracy": 0.25940640360116957,
309
+ "num_tokens": 1571304.0,
310
  "step": 1500
311
  },
312
  {
313
+ "entropy": 6.0786464881896975,
314
  "epoch": 0.8923431203223949,
315
+ "grad_norm": 4.55721378326416,
316
+ "learning_rate": 1.6001778304682868e-06,
317
+ "loss": 5.8092,
318
+ "mean_token_accuracy": 0.2496869170665741,
319
+ "num_tokens": 1627369.0,
320
  "step": 1550
321
  },
322
  {
323
+ "entropy": 5.939382014274597,
324
  "epoch": 0.9211283822682786,
325
+ "grad_norm": 2.330057144165039,
326
+ "learning_rate": 1.5853586247777117e-06,
327
+ "loss": 5.6604,
328
+ "mean_token_accuracy": 0.2686630353331566,
329
+ "num_tokens": 1680401.0,
330
  "step": 1600
331
  },
332
  {
333
+ "entropy": 6.121775646209716,
334
  "epoch": 0.9499136442141624,
335
+ "grad_norm": 2.9881200790405273,
336
+ "learning_rate": 1.5705394190871368e-06,
337
+ "loss": 5.8388,
338
+ "mean_token_accuracy": 0.2503683388233185,
339
+ "num_tokens": 1735745.0,
340
  "step": 1650
341
  },
342
  {
343
+ "entropy": 5.840040788650513,
344
  "epoch": 0.9786989061600461,
345
+ "grad_norm": 3.798994779586792,
346
+ "learning_rate": 1.555720213396562e-06,
347
+ "loss": 5.5635,
348
+ "mean_token_accuracy": 0.278279125392437,
349
+ "num_tokens": 1786896.0,
350
  "step": 1700
351
  },
352
  {
353
  "epoch": 1.0,
354
+ "eval_entropy": 6.139133475343203,
355
+ "eval_loss": 5.861395835876465,
356
+ "eval_mean_token_accuracy": 0.2402858340657801,
357
+ "eval_model_preparation_time": 0.0047,
358
+ "eval_num_tokens": 1825107.0,
359
+ "eval_runtime": 79.3994,
360
+ "eval_samples_per_second": 5.466,
361
+ "eval_steps_per_second": 2.733,
362
  "step": 1737
363
  },
364
  {
365
+ "entropy": 5.8970259666442875,
366
  "epoch": 1.0074841681059297,
367
+ "grad_norm": 2.6411802768707275,
368
+ "learning_rate": 1.540901007705987e-06,
369
+ "loss": 5.614,
370
+ "mean_token_accuracy": 0.273006406724453,
371
+ "num_tokens": 1838864.0,
372
  "step": 1750
373
  },
374
  {
375
+ "entropy": 6.0111794090271,
376
  "epoch": 1.0362694300518134,
377
+ "grad_norm": 3.6491827964782715,
378
+ "learning_rate": 1.526081802015412e-06,
379
+ "loss": 5.7323,
380
+ "mean_token_accuracy": 0.26104256987571717,
381
+ "num_tokens": 1893816.0,
382
  "step": 1800
383
  },
384
  {
385
+ "entropy": 5.902219276428223,
386
  "epoch": 1.065054691997697,
387
+ "grad_norm": 2.593249559402466,
388
+ "learning_rate": 1.5112625963248368e-06,
389
+ "loss": 5.6187,
390
+ "mean_token_accuracy": 0.2746362566947937,
391
+ "num_tokens": 1946532.0,
392
  "step": 1850
393
  },
394
  {
395
+ "entropy": 5.874705944061279,
396
  "epoch": 1.0938399539435808,
397
+ "grad_norm": 2.554327964782715,
398
+ "learning_rate": 1.496443390634262e-06,
399
+ "loss": 5.6021,
400
+ "mean_token_accuracy": 0.2795292744040489,
401
+ "num_tokens": 2000184.0,
402
  "step": 1900
403
  },
404
  {
405
+ "entropy": 5.850096368789673,
406
  "epoch": 1.1226252158894645,
407
+ "grad_norm": 3.6060993671417236,
408
+ "learning_rate": 1.481624184943687e-06,
409
+ "loss": 5.576,
410
+ "mean_token_accuracy": 0.28532547056674956,
411
+ "num_tokens": 2052250.0,
412
  "step": 1950
413
  },
414
  {
415
+ "entropy": 5.802229671478272,
416
  "epoch": 1.1514104778353482,
417
+ "grad_norm": 3.0913314819335938,
418
+ "learning_rate": 1.466804979253112e-06,
419
+ "loss": 5.53,
420
+ "mean_token_accuracy": 0.2916027933359146,
421
+ "num_tokens": 2103531.0,
422
  "step": 2000
423
  },
424
  {
425
+ "entropy": 5.875646467208862,
426
  "epoch": 1.180195739781232,
427
+ "grad_norm": 4.777045726776123,
428
+ "learning_rate": 1.451985773562537e-06,
429
+ "loss": 5.6146,
430
+ "mean_token_accuracy": 0.28063644528388976,
431
+ "num_tokens": 2157098.0,
432
  "step": 2050
433
  },
434
  {
435
+ "entropy": 5.786596937179565,
436
  "epoch": 1.2089810017271156,
437
+ "grad_norm": 4.207762718200684,
438
+ "learning_rate": 1.437166567871962e-06,
439
+ "loss": 5.5417,
440
+ "mean_token_accuracy": 0.2870470091700554,
441
+ "num_tokens": 2211827.0,
442
  "step": 2100
443
  },
444
  {
445
+ "entropy": 5.672234449386597,
446
  "epoch": 1.2377662636729994,
447
+ "grad_norm": 2.2771811485290527,
448
+ "learning_rate": 1.422347362181387e-06,
449
+ "loss": 5.4285,
450
+ "mean_token_accuracy": 0.30194485366344453,
451
+ "num_tokens": 2262174.0,
452
  "step": 2150
453
  },
454
  {
455
+ "entropy": 5.862573285102844,
456
  "epoch": 1.266551525618883,
457
+ "grad_norm": 3.3273422718048096,
458
+ "learning_rate": 1.4075281564908121e-06,
459
+ "loss": 5.6169,
460
+ "mean_token_accuracy": 0.278145115673542,
461
+ "num_tokens": 2316440.0,
462
  "step": 2200
463
  },
464
  {
465
+ "entropy": 5.734760231971741,
466
  "epoch": 1.2953367875647668,
467
+ "grad_norm": 3.7049715518951416,
468
+ "learning_rate": 1.392708950800237e-06,
469
+ "loss": 5.493,
470
+ "mean_token_accuracy": 0.2941485676169395,
471
+ "num_tokens": 2368468.0,
472
  "step": 2250
473
  },
474
  {
475
+ "entropy": 5.665819988250733,
476
  "epoch": 1.3241220495106505,
477
+ "grad_norm": 3.572636604309082,
478
+ "learning_rate": 1.3778897451096621e-06,
479
+ "loss": 5.4352,
480
+ "mean_token_accuracy": 0.3003745040297508,
481
+ "num_tokens": 2421180.0,
482
  "step": 2300
483
  },
484
  {
485
+ "entropy": 5.890115032196045,
486
  "epoch": 1.3529073114565342,
487
+ "grad_norm": 2.738203525543213,
488
+ "learning_rate": 1.3630705394190872e-06,
489
+ "loss": 5.6555,
490
+ "mean_token_accuracy": 0.2737997192144394,
491
+ "num_tokens": 2476255.0,
492
  "step": 2350
493
  },
494
  {
495
+ "entropy": 5.66056040763855,
496
  "epoch": 1.381692573402418,
497
+ "grad_norm": 3.1416995525360107,
498
+ "learning_rate": 1.3482513337285121e-06,
499
+ "loss": 5.4302,
500
+ "mean_token_accuracy": 0.3000989046692848,
501
+ "num_tokens": 2527674.0,
502
  "step": 2400
503
  },
504
  {
505
+ "entropy": 5.861240615844727,
506
  "epoch": 1.4104778353483016,
507
+ "grad_norm": 2.7569284439086914,
508
+ "learning_rate": 1.333432128037937e-06,
509
+ "loss": 5.6304,
510
+ "mean_token_accuracy": 0.27707513481378554,
511
+ "num_tokens": 2582909.0,
512
  "step": 2450
513
  },
514
  {
515
+ "entropy": 5.627686910629272,
516
  "epoch": 1.4392630972941853,
517
+ "grad_norm": 1.7750262022018433,
518
+ "learning_rate": 1.3186129223473621e-06,
519
+ "loss": 5.4058,
520
+ "mean_token_accuracy": 0.3019809901714325,
521
+ "num_tokens": 2636579.0,
522
  "step": 2500
523
  },
524
  {
525
+ "entropy": 5.607026796340943,
526
  "epoch": 1.468048359240069,
527
+ "grad_norm": 3.1005160808563232,
528
+ "learning_rate": 1.3037937166567872e-06,
529
+ "loss": 5.3836,
530
+ "mean_token_accuracy": 0.30584611505270004,
531
+ "num_tokens": 2687698.0,
532
  "step": 2550
533
  },
534
  {
535
+ "entropy": 5.6909641885757445,
536
  "epoch": 1.4968336211859528,
537
+ "grad_norm": 1.6848654747009277,
538
+ "learning_rate": 1.2889745109662123e-06,
539
+ "loss": 5.4653,
540
+ "mean_token_accuracy": 0.296178964972496,
541
+ "num_tokens": 2740214.0,
542
  "step": 2600
543
  },
544
  {
545
+ "entropy": 5.619450302124023,
546
  "epoch": 1.5256188831318365,
547
+ "grad_norm": 2.469539165496826,
548
+ "learning_rate": 1.274155305275637e-06,
549
+ "loss": 5.4022,
550
+ "mean_token_accuracy": 0.3039679077267647,
551
+ "num_tokens": 2792574.0,
552
  "step": 2650
553
  },
554
  {
555
+ "entropy": 5.61073097705841,
556
  "epoch": 1.5544041450777202,
557
+ "grad_norm": 2.367810010910034,
558
+ "learning_rate": 1.259336099585062e-06,
559
+ "loss": 5.3956,
560
+ "mean_token_accuracy": 0.3051413372159004,
561
+ "num_tokens": 2845597.0,
562
  "step": 2700
563
  },
564
  {
565
+ "entropy": 5.5791136837005615,
566
  "epoch": 1.583189407023604,
567
+ "grad_norm": 2.3874764442443848,
568
+ "learning_rate": 1.2445168938944872e-06,
569
+ "loss": 5.3676,
570
+ "mean_token_accuracy": 0.3068238252401352,
571
+ "num_tokens": 2898683.0,
572
  "step": 2750
573
  },
574
  {
575
+ "entropy": 5.735381307601929,
576
  "epoch": 1.6119746689694876,
577
+ "grad_norm": 2.2097349166870117,
578
+ "learning_rate": 1.2296976882039123e-06,
579
+ "loss": 5.5239,
580
+ "mean_token_accuracy": 0.28974882304668426,
581
+ "num_tokens": 2952290.0,
582
  "step": 2800
583
  },
584
  {
585
+ "entropy": 5.55252691745758,
586
  "epoch": 1.6407599309153713,
587
+ "grad_norm": 1.694831132888794,
588
+ "learning_rate": 1.2148784825133372e-06,
589
+ "loss": 5.351,
590
+ "mean_token_accuracy": 0.3091904193162918,
591
+ "num_tokens": 3004556.0,
592
  "step": 2850
593
  },
594
  {
595
+ "entropy": 5.508773093223572,
596
  "epoch": 1.669545192861255,
597
+ "grad_norm": 1.8229279518127441,
598
+ "learning_rate": 1.200059276822762e-06,
599
+ "loss": 5.3164,
600
+ "mean_token_accuracy": 0.31158645361661913,
601
+ "num_tokens": 3056448.0,
602
  "step": 2900
603
  },
604
  {
605
+ "entropy": 5.676794271469117,
606
  "epoch": 1.6983304548071387,
607
+ "grad_norm": 1.7196234464645386,
608
+ "learning_rate": 1.1852400711321872e-06,
609
+ "loss": 5.4776,
610
+ "mean_token_accuracy": 0.2929128894209862,
611
+ "num_tokens": 3109539.0,
612
  "step": 2950
613
  },
614
  {
615
+ "entropy": 5.551529383659362,
616
  "epoch": 1.7271157167530224,
617
+ "grad_norm": 3.117525577545166,
618
+ "learning_rate": 1.1704208654416123e-06,
619
+ "loss": 5.3561,
620
+ "mean_token_accuracy": 0.30634030640125276,
621
+ "num_tokens": 3162421.0,
622
  "step": 3000
623
  },
624
  {
625
+ "entropy": 5.379635264873505,
626
  "epoch": 1.7559009786989062,
627
+ "grad_norm": 1.876755714416504,
628
+ "learning_rate": 1.1556016597510372e-06,
629
+ "loss": 5.1868,
630
+ "mean_token_accuracy": 0.32913618892431257,
631
+ "num_tokens": 3212079.0,
632
  "step": 3050
633
  },
634
  {
635
+ "entropy": 5.538804936408996,
636
  "epoch": 1.7846862406447899,
637
+ "grad_norm": 1.8670976161956787,
638
+ "learning_rate": 1.1407824540604623e-06,
639
+ "loss": 5.3494,
640
+ "mean_token_accuracy": 0.30661171555519107,
641
+ "num_tokens": 3264089.0,
642
  "step": 3100
643
  },
644
  {
645
+ "entropy": 5.258263626098633,
646
  "epoch": 1.8134715025906736,
647
+ "grad_norm": 2.748718023300171,
648
+ "learning_rate": 1.1259632483698874e-06,
649
+ "loss": 5.08,
650
+ "mean_token_accuracy": 0.3413010013103485,
651
+ "num_tokens": 3311881.0,
652
  "step": 3150
653
  },
654
  {
655
+ "entropy": 5.54539008140564,
656
  "epoch": 1.8422567645365573,
657
+ "grad_norm": 1.8556406497955322,
658
+ "learning_rate": 1.1111440426793123e-06,
659
+ "loss": 5.3614,
660
+ "mean_token_accuracy": 0.30550685405731204,
661
+ "num_tokens": 3364861.0,
662
  "step": 3200
663
  },
664
  {
665
+ "entropy": 5.5433073282241825,
666
  "epoch": 1.871042026482441,
667
+ "grad_norm": 1.8386749029159546,
668
+ "learning_rate": 1.0963248369887374e-06,
669
+ "loss": 5.3543,
670
+ "mean_token_accuracy": 0.30875524014234546,
671
+ "num_tokens": 3415911.0,
672
  "step": 3250
673
  },
674
  {
675
+ "entropy": 5.5769769477844235,
676
  "epoch": 1.8998272884283247,
677
+ "grad_norm": 1.922486662864685,
678
+ "learning_rate": 1.0815056312981623e-06,
679
+ "loss": 5.3834,
680
+ "mean_token_accuracy": 0.3035113242268562,
681
+ "num_tokens": 3468338.0,
682
  "step": 3300
683
  },
684
  {
685
+ "entropy": 5.640013842582703,
686
  "epoch": 1.9286125503742084,
687
+ "grad_norm": 2.179500102996826,
688
+ "learning_rate": 1.0666864256075874e-06,
689
+ "loss": 5.4574,
690
+ "mean_token_accuracy": 0.2947095710039139,
691
+ "num_tokens": 3521693.0,
692
  "step": 3350
693
  },
694
  {
695
+ "entropy": 5.506910061836242,
696
  "epoch": 1.9573978123200921,
697
+ "grad_norm": 1.4014379978179932,
698
+ "learning_rate": 1.0518672199170125e-06,
699
+ "loss": 5.3234,
700
+ "mean_token_accuracy": 0.3096472260355949,
701
+ "num_tokens": 3574206.0,
702
  "step": 3400
703
  },
704
  {
705
+ "entropy": 5.607311015129089,
706
  "epoch": 1.9861830742659758,
707
+ "grad_norm": 1.41231107711792,
708
+ "learning_rate": 1.0370480142264374e-06,
709
+ "loss": 5.4226,
710
+ "mean_token_accuracy": 0.2979922544956207,
711
+ "num_tokens": 3627807.0,
712
  "step": 3450
713
  },
714
  {
715
  "epoch": 2.0,
716
+ "eval_entropy": 5.831721861790951,
717
+ "eval_loss": 5.656307220458984,
718
+ "eval_mean_token_accuracy": 0.2641724460685308,
719
+ "eval_model_preparation_time": 0.0047,
720
+ "eval_num_tokens": 3650214.0,
721
+ "eval_runtime": 79.7324,
722
+ "eval_samples_per_second": 5.443,
723
+ "eval_steps_per_second": 2.722,
724
  "step": 3474
725
  },
726
  {
727
+ "entropy": 5.477711625099182,
728
  "epoch": 2.0149683362118593,
729
+ "grad_norm": 3.0133137702941895,
730
+ "learning_rate": 1.0222288085358623e-06,
731
+ "loss": 5.2957,
732
+ "mean_token_accuracy": 0.31543311327695844,
733
+ "num_tokens": 3677883.0,
734
  "step": 3500
735
  },
736
  {
737
+ "entropy": 5.599187393188476,
738
  "epoch": 2.043753598157743,
739
+ "grad_norm": 1.885867714881897,
740
+ "learning_rate": 1.0074096028452874e-06,
741
+ "loss": 5.4142,
742
+ "mean_token_accuracy": 0.3004470923542976,
743
+ "num_tokens": 3730991.0,
744
  "step": 3550
745
  },
746
  {
747
+ "entropy": 5.526448183059692,
748
  "epoch": 2.0725388601036268,
749
+ "grad_norm": 4.50788688659668,
750
+ "learning_rate": 9.925903971547125e-07,
751
+ "loss": 5.3517,
752
+ "mean_token_accuracy": 0.3069574165344238,
753
+ "num_tokens": 3783795.0,
754
  "step": 3600
755
  },
756
  {
757
+ "entropy": 5.560557870864868,
758
  "epoch": 2.1013241220495105,
759
+ "grad_norm": 1.927862524986267,
760
+ "learning_rate": 9.777711914641374e-07,
761
+ "loss": 5.3815,
762
+ "mean_token_accuracy": 0.3045575937628746,
763
+ "num_tokens": 3835526.0,
764
  "step": 3650
765
  },
766
  {
767
+ "entropy": 5.528058257102966,
768
  "epoch": 2.130109383995394,
769
+ "grad_norm": 2.164687156677246,
770
+ "learning_rate": 9.629519857735625e-07,
771
+ "loss": 5.3501,
772
+ "mean_token_accuracy": 0.3071546205878258,
773
+ "num_tokens": 3887175.0,
774
  "step": 3700
775
  },
776
  {
777
+ "entropy": 5.397617678642273,
778
  "epoch": 2.158894645941278,
779
+ "grad_norm": 2.3098385334014893,
780
+ "learning_rate": 9.481327800829875e-07,
781
+ "loss": 5.2244,
782
+ "mean_token_accuracy": 0.3226669803261757,
783
+ "num_tokens": 3938003.0,
784
  "step": 3750
785
  },
786
  {
787
+ "entropy": 5.529960298538208,
788
  "epoch": 2.1876799078871616,
789
+ "grad_norm": 1.8144755363464355,
790
+ "learning_rate": 9.333135743924125e-07,
791
+ "loss": 5.3572,
792
+ "mean_token_accuracy": 0.306032218337059,
793
+ "num_tokens": 3990451.0,
794
  "step": 3800
795
  },
796
  {
797
+ "entropy": 5.597109637260437,
798
  "epoch": 2.2164651698330453,
799
+ "grad_norm": 2.7306935787200928,
800
+ "learning_rate": 9.184943687018375e-07,
801
+ "loss": 5.4162,
802
+ "mean_token_accuracy": 0.2985941395163536,
803
+ "num_tokens": 4044048.0,
804
  "step": 3850
805
  },
806
  {
807
+ "entropy": 5.448684883117676,
808
  "epoch": 2.245250431778929,
809
+ "grad_norm": 1.8199880123138428,
810
+ "learning_rate": 9.036751630112626e-07,
811
+ "loss": 5.2775,
812
+ "mean_token_accuracy": 0.31548845052719116,
813
+ "num_tokens": 4095276.0,
814
  "step": 3900
815
  },
816
  {
817
+ "entropy": 5.5008597612380985,
818
  "epoch": 2.2740356937248127,
819
+ "grad_norm": 1.755323052406311,
820
+ "learning_rate": 8.888559573206875e-07,
821
+ "loss": 5.3274,
822
+ "mean_token_accuracy": 0.309090721309185,
823
+ "num_tokens": 4148172.0,
824
  "step": 3950
825
  },
826
  {
827
+ "entropy": 5.7040300464630125,
828
  "epoch": 2.3028209556706964,
829
+ "grad_norm": 2.3154356479644775,
830
+ "learning_rate": 8.740367516301126e-07,
831
+ "loss": 5.5239,
832
+ "mean_token_accuracy": 0.28589318484067916,
833
+ "num_tokens": 4202733.0,
834
  "step": 4000
835
  },
836
  {
837
+ "entropy": 5.549855670928955,
838
  "epoch": 2.33160621761658,
839
+ "grad_norm": 1.9549669027328491,
840
+ "learning_rate": 8.592175459395375e-07,
841
+ "loss": 5.3755,
842
+ "mean_token_accuracy": 0.3029727828502655,
843
+ "num_tokens": 4255738.0,
844
  "step": 4050
845
  },
846
  {
847
+ "entropy": 5.579690465927124,
848
  "epoch": 2.360391479562464,
849
+ "grad_norm": 1.7018866539001465,
850
+ "learning_rate": 8.443983402489626e-07,
851
+ "loss": 5.4036,
852
+ "mean_token_accuracy": 0.3001995691657066,
853
+ "num_tokens": 4308638.0,
854
  "step": 4100
855
  },
856
  {
857
+ "entropy": 5.646504878997803,
858
  "epoch": 2.3891767415083476,
859
+ "grad_norm": 1.4139262437820435,
860
+ "learning_rate": 8.295791345583877e-07,
861
+ "loss": 5.4733,
862
+ "mean_token_accuracy": 0.2912476986646652,
863
+ "num_tokens": 4363170.0,
864
  "step": 4150
865
  },
866
  {
867
+ "entropy": 5.554990992546082,
868
  "epoch": 2.4179620034542313,
869
+ "grad_norm": 1.6886577606201172,
870
+ "learning_rate": 8.147599288678126e-07,
871
+ "loss": 5.3842,
872
+ "mean_token_accuracy": 0.302762059867382,
873
+ "num_tokens": 4415607.0,
874
  "step": 4200
875
  },
876
  {
877
+ "entropy": 5.513420124053955,
878
  "epoch": 2.446747265400115,
879
+ "grad_norm": 1.3537819385528564,
880
+ "learning_rate": 7.999407231772377e-07,
881
+ "loss": 5.3408,
882
+ "mean_token_accuracy": 0.30764526218175886,
883
+ "num_tokens": 4467608.0,
884
  "step": 4250
885
  },
886
  {
887
+ "entropy": 5.561378569602966,
888
  "epoch": 2.4755325273459987,
889
+ "grad_norm": 1.8514106273651123,
890
+ "learning_rate": 7.851215174866627e-07,
891
+ "loss": 5.3891,
892
+ "mean_token_accuracy": 0.301382859647274,
893
+ "num_tokens": 4520299.0,
894
  "step": 4300
895
  },
896
  {
897
+ "entropy": 5.536689953804016,
898
  "epoch": 2.5043177892918824,
899
+ "grad_norm": 2.1830835342407227,
900
+ "learning_rate": 7.703023117960877e-07,
901
+ "loss": 5.3672,
902
+ "mean_token_accuracy": 0.3047756373882294,
903
+ "num_tokens": 4573065.0,
904
  "step": 4350
905
  },
906
  {
907
+ "entropy": 5.69776873588562,
908
  "epoch": 2.533103051237766,
909
+ "grad_norm": 1.999536156654358,
910
+ "learning_rate": 7.554831061055127e-07,
911
+ "loss": 5.5236,
912
+ "mean_token_accuracy": 0.2868007507920265,
913
+ "num_tokens": 4626807.0,
914
  "step": 4400
915
  },
916
  {
917
+ "entropy": 5.3977436876297,
918
  "epoch": 2.56188831318365,
919
+ "grad_norm": 1.9608020782470703,
920
+ "learning_rate": 7.406639004149378e-07,
921
+ "loss": 5.2335,
922
+ "mean_token_accuracy": 0.3199601462483406,
923
+ "num_tokens": 4677663.0,
924
  "step": 4450
925
  },
926
  {
927
+ "entropy": 5.6681678771972654,
928
  "epoch": 2.5906735751295336,
929
+ "grad_norm": 1.829047441482544,
930
+ "learning_rate": 7.258446947243627e-07,
931
+ "loss": 5.491,
932
+ "mean_token_accuracy": 0.2894612854719162,
933
+ "num_tokens": 4731830.0,
934
  "step": 4500
935
  },
936
  {
937
+ "entropy": 5.49174174785614,
938
  "epoch": 2.6194588370754173,
939
+ "grad_norm": 1.3158719539642334,
940
+ "learning_rate": 7.110254890337878e-07,
941
+ "loss": 5.3225,
942
+ "mean_token_accuracy": 0.3084965732693672,
943
+ "num_tokens": 4784694.0,
944
  "step": 4550
945
  },
946
  {
947
+ "entropy": 5.573234438896179,
948
  "epoch": 2.648244099021301,
949
+ "grad_norm": 1.562915325164795,
950
+ "learning_rate": 6.962062833432127e-07,
951
+ "loss": 5.4028,
952
+ "mean_token_accuracy": 0.2989520016312599,
953
+ "num_tokens": 4838534.0,
954
  "step": 4600
955
  },
956
  {
957
+ "entropy": 5.550469598770142,
958
  "epoch": 2.6770293609671847,
959
+ "grad_norm": 2.114727735519409,
960
+ "learning_rate": 6.813870776526378e-07,
961
+ "loss": 5.3804,
962
+ "mean_token_accuracy": 0.30373542964458466,
963
+ "num_tokens": 4890611.0,
964
  "step": 4650
965
  },
966
  {
967
+ "entropy": 5.523049550056458,
968
  "epoch": 2.7058146229130684,
969
+ "grad_norm": 2.5036823749542236,
970
+ "learning_rate": 6.665678719620628e-07,
971
+ "loss": 5.3542,
972
+ "mean_token_accuracy": 0.30681024432182313,
973
+ "num_tokens": 4943571.0,
974
  "step": 4700
975
  },
976
  {
977
+ "entropy": 5.323453049659729,
978
  "epoch": 2.734599884858952,
979
+ "grad_norm": 1.8069168329238892,
980
+ "learning_rate": 6.517486662714878e-07,
981
+ "loss": 5.1583,
982
+ "mean_token_accuracy": 0.32906652927398683,
983
+ "num_tokens": 4993871.0,
984
  "step": 4750
985
  },
986
  {
987
+ "entropy": 5.504038324356079,
988
  "epoch": 2.763385146804836,
989
+ "grad_norm": 4.750283718109131,
990
+ "learning_rate": 6.369294605809128e-07,
991
+ "loss": 5.3366,
992
+ "mean_token_accuracy": 0.3087608867883682,
993
+ "num_tokens": 5046187.0,
994
  "step": 4800
995
  },
996
  {
997
+ "entropy": 5.487624549865723,
998
  "epoch": 2.7921704087507195,
999
+ "grad_norm": 1.4186172485351562,
1000
+ "learning_rate": 6.221102548903379e-07,
1001
+ "loss": 5.3237,
1002
+ "mean_token_accuracy": 0.3088638699054718,
1003
+ "num_tokens": 5098644.0,
1004
  "step": 4850
1005
  },
1006
  {
1007
+ "entropy": 5.346905107498169,
1008
  "epoch": 2.8209556706966032,
1009
+ "grad_norm": 1.5670177936553955,
1010
+ "learning_rate": 6.072910491997628e-07,
1011
+ "loss": 5.1849,
1012
+ "mean_token_accuracy": 0.3265886321663857,
1013
+ "num_tokens": 5149345.0,
1014
  "step": 4900
1015
  },
1016
  {
1017
+ "entropy": 5.510410032272339,
1018
  "epoch": 2.849740932642487,
1019
+ "grad_norm": 7.489855766296387,
1020
+ "learning_rate": 5.924718435091879e-07,
1021
+ "loss": 5.3424,
1022
+ "mean_token_accuracy": 0.30768151730299,
1023
+ "num_tokens": 5202028.0,
1024
  "step": 4950
1025
  },
1026
  {
1027
+ "entropy": 5.525181493759155,
1028
  "epoch": 2.8785261945883707,
1029
+ "grad_norm": 1.8829196691513062,
1030
+ "learning_rate": 5.776526378186128e-07,
1031
+ "loss": 5.3654,
1032
+ "mean_token_accuracy": 0.30342737555503846,
1033
+ "num_tokens": 5255082.0,
1034
  "step": 5000
1035
  },
1036
  {
1037
+ "entropy": 5.374098634719848,
1038
  "epoch": 2.9073114565342544,
1039
+ "grad_norm": 1.3901060819625854,
1040
+ "learning_rate": 5.628334321280379e-07,
1041
+ "loss": 5.2103,
1042
+ "mean_token_accuracy": 0.3233291879296303,
1043
+ "num_tokens": 5305042.0,
1044
  "step": 5050
1045
  },
1046
  {
1047
+ "entropy": 5.374619431495667,
1048
  "epoch": 2.936096718480138,
1049
+ "grad_norm": 1.6586560010910034,
1050
+ "learning_rate": 5.48014226437463e-07,
1051
+ "loss": 5.2125,
1052
+ "mean_token_accuracy": 0.322759662270546,
1053
+ "num_tokens": 5356310.0,
1054
  "step": 5100
1055
  },
1056
  {
1057
+ "entropy": 5.527479724884033,
1058
  "epoch": 2.964881980426022,
1059
+ "grad_norm": 1.6678485870361328,
1060
+ "learning_rate": 5.331950207468879e-07,
1061
+ "loss": 5.3627,
1062
+ "mean_token_accuracy": 0.30430852621793747,
1063
+ "num_tokens": 5409283.0,
1064
  "step": 5150
1065
  },
1066
  {
1067
+ "entropy": 5.6171248292922975,
1068
  "epoch": 2.9936672423719055,
1069
+ "grad_norm": 1.50790274143219,
1070
+ "learning_rate": 5.18375815056313e-07,
1071
+ "loss": 5.4484,
1072
+ "mean_token_accuracy": 0.29375598043203355,
1073
+ "num_tokens": 5464332.0,
1074
  "step": 5200
1075
  },
1076
  {
1077
  "epoch": 3.0,
1078
+ "eval_entropy": 5.78779639186947,
1079
+ "eval_loss": 5.628758430480957,
1080
+ "eval_mean_token_accuracy": 0.2653660801698535,
1081
+ "eval_model_preparation_time": 0.0047,
1082
+ "eval_num_tokens": 5475321.0,
1083
+ "eval_runtime": 80.3676,
1084
+ "eval_samples_per_second": 5.4,
1085
+ "eval_steps_per_second": 2.7,
1086
  "step": 5211
1087
  }
1088
  ],
1089
  "logging_steps": 50,
1090
+ "max_steps": 6948,
1091
  "num_input_tokens_seen": 0,
1092
+ "num_train_epochs": 4,
1093
  "save_steps": 500,
1094
  "stateful_callbacks": {
1095
  "TrainerControl": {
 
1103
  "attributes": {}
1104
  }
1105
  },
1106
+ "total_flos": 7.520890606086144e+16,
1107
  "train_batch_size": 2,
1108
  "trial_name": null,
1109
  "trial_params": null
checkpoint-5211/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:130d33149272782bd60306263c371036419926142b8999aad7806359168f8484
3
  size 6225
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8db5c304963110404ebb6947b83ba95bd9b8aad1f9b8b578cc33c46d601e13dc
3
  size 6225
checkpoint-6948/adapter_config.json CHANGED
@@ -16,7 +16,7 @@
16
  "layers_pattern": null,
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
- "lora_alpha": 16,
20
  "lora_bias": false,
21
  "lora_dropout": 0.1,
22
  "megatron_config": null,
@@ -25,12 +25,14 @@
25
  "peft_type": "LORA",
26
  "peft_version": "0.18.0",
27
  "qalora_group_size": 16,
28
- "r": 8,
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
  "q_proj",
33
- "v_proj"
 
 
34
  ],
35
  "target_parameters": null,
36
  "task_type": "CAUSAL_LM",
 
16
  "layers_pattern": null,
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
+ "lora_alpha": 32,
20
  "lora_bias": false,
21
  "lora_dropout": 0.1,
22
  "megatron_config": null,
 
25
  "peft_type": "LORA",
26
  "peft_version": "0.18.0",
27
  "qalora_group_size": 16,
28
+ "r": 24,
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
  "q_proj",
33
+ "k_proj",
34
+ "v_proj",
35
+ "o_proj"
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
checkpoint-6948/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d98bdffcaf94c61a7d4f2d6e4effa1765874d8fc8f97c30cd420626b72320c3b
3
- size 4374520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a42655e5c5bf5a17388c99c67741b81d97a904a649f92d5298361717c78abaac
3
+ size 26182176
checkpoint-6948/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:00bc515ace55234a59210394255dbfc391f61f9f697c5ca151b3d2dde3e16426
3
- size 8783179
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f4dc67fd123c4a9f8eb45bc8894cccfeeb5a7766daf44f4ca97786db172fd5f
3
+ size 52486155
checkpoint-6948/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6d29771f79d36c2441adbe71a52a34256493ea9dc339b022adf52f9bd8969a78
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de23a91d8efb3b92e132a49e237b78926ed9acaded7b594b358633abace10591
3
  size 14645
checkpoint-6948/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:78dfb562bc2784af18bf3113adae6543329adb31bab3face56cd71f730439d6d
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ced24601208e373e591e4ce71c0d860f568ef5205374f58c5db9ee9e78232103
3
  size 1465
checkpoint-6948/trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "best_global_step": 6948,
3
- "best_metric": 6.0967888832092285,
4
  "best_model_checkpoint": "./output/checkpoint-6948",
5
  "epoch": 4.0,
6
  "eval_steps": 500,
@@ -10,1438 +10,1438 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 3.864118957519531,
14
  "epoch": 0.028785261945883708,
15
- "grad_norm": 2.7545533180236816,
16
- "learning_rate": 9.800000000000001e-06,
17
- "loss": 15.2997,
18
- "mean_token_accuracy": 0.10086015284061432,
19
- "num_tokens": 47319.0,
20
  "step": 50
21
  },
22
  {
23
- "entropy": 4.047076859474182,
24
  "epoch": 0.057570523891767415,
25
- "grad_norm": 5.0328264236450195,
26
- "learning_rate": 1.98e-05,
27
- "loss": 15.3264,
28
- "mean_token_accuracy": 0.09582207053899765,
29
- "num_tokens": 96809.0,
30
  "step": 100
31
  },
32
  {
33
- "entropy": 4.7578076648712155,
34
  "epoch": 0.08635578583765112,
35
- "grad_norm": 38.50589370727539,
36
- "learning_rate": 1.988584740827024e-05,
37
- "loss": 13.0056,
38
- "mean_token_accuracy": 0.126854517608881,
39
- "num_tokens": 139962.0,
40
  "step": 150
41
  },
42
  {
43
- "entropy": 6.80673882484436,
44
  "epoch": 0.11514104778353483,
45
- "grad_norm": 12.030129432678223,
46
- "learning_rate": 1.97693651718113e-05,
47
- "loss": 9.2822,
48
- "mean_token_accuracy": 0.11084575355052947,
49
- "num_tokens": 188029.0,
50
  "step": 200
51
  },
52
  {
53
- "entropy": 7.177925786972046,
54
  "epoch": 0.14392630972941853,
55
- "grad_norm": 4.852536201477051,
56
- "learning_rate": 1.965288293535236e-05,
57
- "loss": 7.6333,
58
- "mean_token_accuracy": 0.12398939326405525,
59
- "num_tokens": 234425.0,
60
  "step": 250
61
  },
62
  {
63
- "entropy": 7.080496473312378,
64
  "epoch": 0.17271157167530224,
65
- "grad_norm": 4.10841178894043,
66
- "learning_rate": 1.9536400698893422e-05,
67
- "loss": 7.1632,
68
- "mean_token_accuracy": 0.13563686355948448,
69
- "num_tokens": 278885.0,
70
  "step": 300
71
  },
72
  {
73
- "entropy": 6.931579580307007,
74
  "epoch": 0.20149683362118595,
75
- "grad_norm": 14.636048316955566,
76
- "learning_rate": 1.941991846243448e-05,
77
- "loss": 6.8213,
78
- "mean_token_accuracy": 0.16459846690297128,
79
- "num_tokens": 325491.0,
80
  "step": 350
81
  },
82
  {
83
- "entropy": 6.853660764694214,
84
  "epoch": 0.23028209556706966,
85
- "grad_norm": 5.966708183288574,
86
- "learning_rate": 1.930343622597554e-05,
87
- "loss": 6.6625,
88
- "mean_token_accuracy": 0.17670693069696428,
89
- "num_tokens": 372913.0,
90
  "step": 400
91
  },
92
  {
93
- "entropy": 6.684267387390137,
94
  "epoch": 0.25906735751295334,
95
- "grad_norm": 4.031010627746582,
96
- "learning_rate": 1.91869539895166e-05,
97
- "loss": 6.4505,
98
- "mean_token_accuracy": 0.1943434515595436,
99
- "num_tokens": 419159.0,
100
  "step": 450
101
  },
102
  {
103
- "entropy": 6.679989137649536,
104
  "epoch": 0.28785261945883706,
105
- "grad_norm": 6.251070022583008,
106
- "learning_rate": 1.907047175305766e-05,
107
- "loss": 6.4314,
108
- "mean_token_accuracy": 0.19514557600021362,
109
- "num_tokens": 466994.0,
110
  "step": 500
111
  },
112
  {
113
- "entropy": 6.477229623794556,
114
  "epoch": 0.31663788140472077,
115
- "grad_norm": 3.8656675815582275,
116
- "learning_rate": 1.895398951659872e-05,
117
- "loss": 6.2139,
118
- "mean_token_accuracy": 0.21764743447303772,
119
- "num_tokens": 513308.0,
120
  "step": 550
121
  },
122
  {
123
- "entropy": 6.408129243850708,
124
  "epoch": 0.3454231433506045,
125
- "grad_norm": 8.688581466674805,
126
- "learning_rate": 1.883750728013978e-05,
127
- "loss": 6.1224,
128
- "mean_token_accuracy": 0.23438037544488907,
129
- "num_tokens": 559679.0,
130
  "step": 600
131
  },
132
  {
133
- "entropy": 6.128518767356873,
134
  "epoch": 0.3742084052964882,
135
- "grad_norm": 5.419503688812256,
136
- "learning_rate": 1.872102504368084e-05,
137
- "loss": 5.8692,
138
- "mean_token_accuracy": 0.26634690463542937,
139
- "num_tokens": 603140.0,
140
  "step": 650
141
  },
142
  {
143
- "entropy": 6.322700729370117,
144
  "epoch": 0.4029936672423719,
145
- "grad_norm": 2.2213082313537598,
146
- "learning_rate": 1.86045428072219e-05,
147
- "loss": 6.0717,
148
- "mean_token_accuracy": 0.24038562417030335,
149
- "num_tokens": 650179.0,
150
  "step": 700
151
  },
152
  {
153
- "entropy": 6.236415157318115,
154
  "epoch": 0.4317789291882556,
155
- "grad_norm": 4.804980278015137,
156
- "learning_rate": 1.848806057076296e-05,
157
- "loss": 5.9986,
158
- "mean_token_accuracy": 0.24596781462430953,
159
- "num_tokens": 696220.0,
160
  "step": 750
161
  },
162
  {
163
- "entropy": 6.269758443832398,
164
  "epoch": 0.4605641911341393,
165
- "grad_norm": 2.2888853549957275,
166
- "learning_rate": 1.837157833430402e-05,
167
- "loss": 6.0385,
168
- "mean_token_accuracy": 0.24074893474578857,
169
- "num_tokens": 743909.0,
170
  "step": 800
171
  },
172
  {
173
- "entropy": 6.270364007949829,
174
  "epoch": 0.48934945308002303,
175
- "grad_norm": 3.0903279781341553,
176
- "learning_rate": 1.825509609784508e-05,
177
- "loss": 6.0481,
178
- "mean_token_accuracy": 0.23740622967481614,
179
- "num_tokens": 792015.0,
180
  "step": 850
181
  },
182
  {
183
- "entropy": 6.3037636184692385,
184
  "epoch": 0.5181347150259067,
185
- "grad_norm": 3.969320058822632,
186
- "learning_rate": 1.813861386138614e-05,
187
- "loss": 6.0855,
188
- "mean_token_accuracy": 0.2309597587585449,
189
- "num_tokens": 841802.0,
190
  "step": 900
191
  },
192
  {
193
- "entropy": 6.038041458129883,
194
  "epoch": 0.5469199769717904,
195
- "grad_norm": 2.2712185382843018,
196
- "learning_rate": 1.80221316249272e-05,
197
- "loss": 5.8285,
198
- "mean_token_accuracy": 0.26099125802516937,
199
- "num_tokens": 886492.0,
200
  "step": 950
201
  },
202
  {
203
- "entropy": 6.142958383560181,
204
  "epoch": 0.5757052389176741,
205
- "grad_norm": 1.2311755418777466,
206
- "learning_rate": 1.790564938846826e-05,
207
- "loss": 5.9357,
208
- "mean_token_accuracy": 0.24810438305139543,
209
- "num_tokens": 932807.0,
210
  "step": 1000
211
  },
212
  {
213
- "entropy": 6.199834351539612,
214
  "epoch": 0.6044905008635578,
215
- "grad_norm": 2.2788379192352295,
216
- "learning_rate": 1.7789167152009318e-05,
217
- "loss": 5.9964,
218
- "mean_token_accuracy": 0.23942562609910964,
219
- "num_tokens": 980541.0,
220
  "step": 1050
221
  },
222
  {
223
- "entropy": 5.961639919281006,
224
  "epoch": 0.6332757628094415,
225
- "grad_norm": 1.9077532291412354,
226
- "learning_rate": 1.767268491555038e-05,
227
- "loss": 5.7664,
228
- "mean_token_accuracy": 0.26718012750148773,
229
- "num_tokens": 1023882.0,
230
  "step": 1100
231
  },
232
  {
233
- "entropy": 5.889280087947846,
234
  "epoch": 0.6620610247553252,
235
- "grad_norm": 2.4254891872406006,
236
- "learning_rate": 1.7556202679091442e-05,
237
- "loss": 5.6952,
238
- "mean_token_accuracy": 0.27529804170131683,
239
- "num_tokens": 1068300.0,
240
  "step": 1150
241
  },
242
  {
243
- "entropy": 6.085640063285828,
244
  "epoch": 0.690846286701209,
245
- "grad_norm": 2.35312557220459,
246
- "learning_rate": 1.74397204426325e-05,
247
- "loss": 5.8898,
248
- "mean_token_accuracy": 0.25166562348604204,
249
- "num_tokens": 1115425.0,
250
  "step": 1200
251
  },
252
  {
253
- "entropy": 6.146574058532715,
254
  "epoch": 0.7196315486470927,
255
- "grad_norm": 1.7730146646499634,
256
- "learning_rate": 1.732323820617356e-05,
257
- "loss": 5.9519,
258
- "mean_token_accuracy": 0.24276195973157882,
259
- "num_tokens": 1162319.0,
260
  "step": 1250
261
  },
262
  {
263
- "entropy": 6.079372715950012,
264
  "epoch": 0.7484168105929764,
265
- "grad_norm": 1.7070863246917725,
266
- "learning_rate": 1.720675596971462e-05,
267
- "loss": 5.8922,
268
- "mean_token_accuracy": 0.24961524546146394,
269
- "num_tokens": 1208230.0,
270
  "step": 1300
271
  },
272
  {
273
- "entropy": 5.9683656406402585,
274
  "epoch": 0.7772020725388601,
275
- "grad_norm": 1.8790594339370728,
276
- "learning_rate": 1.709027373325568e-05,
277
- "loss": 5.7827,
278
- "mean_token_accuracy": 0.2632122594118118,
279
- "num_tokens": 1253074.0,
280
  "step": 1350
281
  },
282
  {
283
- "entropy": 6.107076721191406,
284
  "epoch": 0.8059873344847438,
285
- "grad_norm": 1.1745644807815552,
286
- "learning_rate": 1.6973791496796742e-05,
287
- "loss": 5.9211,
288
- "mean_token_accuracy": 0.24564073830842972,
289
- "num_tokens": 1300179.0,
290
  "step": 1400
291
  },
292
  {
293
- "entropy": 6.141328382492065,
294
  "epoch": 0.8347725964306275,
295
- "grad_norm": 1.0346958637237549,
296
- "learning_rate": 1.68573092603378e-05,
297
- "loss": 5.9584,
298
- "mean_token_accuracy": 0.23997059136629104,
299
- "num_tokens": 1347539.0,
300
  "step": 1450
301
  },
302
  {
303
- "entropy": 6.070010099411011,
304
  "epoch": 0.8635578583765112,
305
- "grad_norm": 1.6541163921356201,
306
- "learning_rate": 1.674082702387886e-05,
307
- "loss": 5.889,
308
- "mean_token_accuracy": 0.24875166177749633,
309
- "num_tokens": 1394157.0,
310
  "step": 1500
311
  },
312
  {
313
- "entropy": 6.207450666427612,
314
  "epoch": 0.8923431203223949,
315
- "grad_norm": 0.9742990732192993,
316
- "learning_rate": 1.662434478741992e-05,
317
- "loss": 6.0217,
318
- "mean_token_accuracy": 0.23067249596118927,
319
- "num_tokens": 1443892.0,
320
  "step": 1550
321
  },
322
  {
323
- "entropy": 6.026197805404663,
324
  "epoch": 0.9211283822682786,
325
- "grad_norm": 1.4229531288146973,
326
- "learning_rate": 1.650786255096098e-05,
327
- "loss": 5.8455,
328
- "mean_token_accuracy": 0.2537291014194489,
329
- "num_tokens": 1491050.0,
330
  "step": 1600
331
  },
332
  {
333
- "entropy": 6.210526428222656,
334
  "epoch": 0.9499136442141624,
335
- "grad_norm": 1.3555018901824951,
336
- "learning_rate": 1.6391380314502038e-05,
337
- "loss": 6.0279,
338
- "mean_token_accuracy": 0.2308420208096504,
339
- "num_tokens": 1540809.0,
340
  "step": 1650
341
  },
342
  {
343
- "entropy": 5.9872834014892575,
344
  "epoch": 0.9786989061600461,
345
- "grad_norm": 0.9893498420715332,
346
- "learning_rate": 1.62748980780431e-05,
347
- "loss": 5.8137,
348
- "mean_token_accuracy": 0.2566875320672989,
349
- "num_tokens": 1585876.0,
350
  "step": 1700
351
  },
352
  {
353
  "epoch": 1.0,
354
- "eval_entropy": 6.322207130045386,
355
- "eval_loss": 6.15173864364624,
356
- "eval_mean_token_accuracy": 0.21116007946877985,
357
- "eval_model_preparation_time": 0.0036,
358
- "eval_num_tokens": 1619719.0,
359
- "eval_runtime": 76.1297,
360
- "eval_samples_per_second": 5.701,
361
- "eval_steps_per_second": 2.85,
362
  "step": 1737
363
  },
364
  {
365
- "entropy": 6.038531675338745,
366
  "epoch": 1.0074841681059297,
367
- "grad_norm": 0.8715208172798157,
368
- "learning_rate": 1.615841584158416e-05,
369
- "loss": 5.8628,
370
- "mean_token_accuracy": 0.2510762655735016,
371
- "num_tokens": 1632015.0,
372
  "step": 1750
373
  },
374
  {
375
- "entropy": 6.164030771255494,
376
  "epoch": 1.0362694300518134,
377
- "grad_norm": 0.7344900965690613,
378
- "learning_rate": 1.604193360512522e-05,
379
- "loss": 5.9856,
380
- "mean_token_accuracy": 0.2351543301343918,
381
- "num_tokens": 1681154.0,
382
  "step": 1800
383
  },
384
  {
385
- "entropy": 6.0731862354278565,
386
  "epoch": 1.065054691997697,
387
- "grad_norm": 1.0801328420639038,
388
- "learning_rate": 1.592545136866628e-05,
389
- "loss": 5.8976,
390
- "mean_token_accuracy": 0.24701615989208223,
391
- "num_tokens": 1728110.0,
392
  "step": 1850
393
  },
394
  {
395
- "entropy": 6.079212121963501,
396
  "epoch": 1.0938399539435808,
397
- "grad_norm": 0.7876909375190735,
398
- "learning_rate": 1.5808969132207338e-05,
399
- "loss": 5.9056,
400
- "mean_token_accuracy": 0.24457543224096298,
401
- "num_tokens": 1775703.0,
402
  "step": 1900
403
  },
404
  {
405
- "entropy": 6.062467746734619,
406
  "epoch": 1.1226252158894645,
407
- "grad_norm": 0.5999078750610352,
408
- "learning_rate": 1.56924868957484e-05,
409
- "loss": 5.8899,
410
- "mean_token_accuracy": 0.2469428673386574,
411
- "num_tokens": 1821980.0,
412
  "step": 1950
413
  },
414
  {
415
- "entropy": 6.031774473190308,
416
  "epoch": 1.1514104778353482,
417
- "grad_norm": 1.6313235759735107,
418
- "learning_rate": 1.557600465928946e-05,
419
- "loss": 5.8593,
420
- "mean_token_accuracy": 0.250918984413147,
421
- "num_tokens": 1867547.0,
422
  "step": 2000
423
  },
424
  {
425
- "entropy": 6.122789564132691,
426
  "epoch": 1.180195739781232,
427
- "grad_norm": 2.562373161315918,
428
- "learning_rate": 1.545952242283052e-05,
429
- "loss": 5.9502,
430
- "mean_token_accuracy": 0.23938885867595672,
431
- "num_tokens": 1915411.0,
432
  "step": 2050
433
  },
434
  {
435
- "entropy": 6.067130417823791,
436
  "epoch": 1.2089810017271156,
437
- "grad_norm": 0.9762872457504272,
438
- "learning_rate": 1.534304018637158e-05,
439
- "loss": 5.8956,
440
- "mean_token_accuracy": 0.2454381173849106,
441
- "num_tokens": 1964009.0,
442
  "step": 2100
443
  },
444
  {
445
- "entropy": 5.9613511180877685,
446
  "epoch": 1.2377662636729994,
447
- "grad_norm": 0.8701547384262085,
448
- "learning_rate": 1.5226557949912639e-05,
449
- "loss": 5.7907,
450
- "mean_token_accuracy": 0.25976367652416227,
451
- "num_tokens": 2008595.0,
452
  "step": 2150
453
  },
454
  {
455
- "entropy": 6.13505428314209,
456
  "epoch": 1.266551525618883,
457
- "grad_norm": 0.8511647582054138,
458
- "learning_rate": 1.51100757134537e-05,
459
- "loss": 5.9619,
460
- "mean_token_accuracy": 0.23760781466960906,
461
- "num_tokens": 2057229.0,
462
  "step": 2200
463
  },
464
  {
465
- "entropy": 6.025254983901977,
466
  "epoch": 1.2953367875647668,
467
- "grad_norm": 0.7627406120300293,
468
- "learning_rate": 1.4993593476994758e-05,
469
- "loss": 5.8546,
470
- "mean_token_accuracy": 0.2508662334084511,
471
- "num_tokens": 2103631.0,
472
  "step": 2250
473
  },
474
  {
475
- "entropy": 5.981974196434021,
476
  "epoch": 1.3241220495106505,
477
- "grad_norm": 1.6922173500061035,
478
- "learning_rate": 1.4877111240535819e-05,
479
- "loss": 5.8119,
480
- "mean_token_accuracy": 0.256170334815979,
481
- "num_tokens": 2150369.0,
482
  "step": 2300
483
  },
484
  {
485
- "entropy": 6.19903904914856,
486
  "epoch": 1.3529073114565342,
487
- "grad_norm": 0.40436601638793945,
488
- "learning_rate": 1.4760629004076878e-05,
489
- "loss": 6.0244,
490
- "mean_token_accuracy": 0.22900927513837815,
491
- "num_tokens": 2199724.0,
492
  "step": 2350
493
  },
494
  {
495
- "entropy": 5.986697297096253,
496
  "epoch": 1.381692573402418,
497
- "grad_norm": 0.8481882214546204,
498
- "learning_rate": 1.464414676761794e-05,
499
- "loss": 5.8195,
500
- "mean_token_accuracy": 0.2552035376429558,
501
- "num_tokens": 2245341.0,
502
  "step": 2400
503
  },
504
  {
505
- "entropy": 6.1886044692993165,
506
  "epoch": 1.4104778353483016,
507
- "grad_norm": 0.7911505103111267,
508
- "learning_rate": 1.4527664531159e-05,
509
- "loss": 6.0148,
510
- "mean_token_accuracy": 0.23026730984449387,
511
- "num_tokens": 2294726.0,
512
  "step": 2450
513
  },
514
  {
515
- "entropy": 5.974867792129516,
516
  "epoch": 1.4392630972941853,
517
- "grad_norm": 1.640499234199524,
518
- "learning_rate": 1.441118229470006e-05,
519
- "loss": 5.8111,
520
- "mean_token_accuracy": 0.2554209426045418,
521
- "num_tokens": 2342251.0,
522
  "step": 2500
523
  },
524
  {
525
- "entropy": 5.967635660171509,
526
  "epoch": 1.468048359240069,
527
- "grad_norm": 0.8022929430007935,
528
- "learning_rate": 1.429470005824112e-05,
529
- "loss": 5.8015,
530
- "mean_token_accuracy": 0.2569852137565613,
531
- "num_tokens": 2387469.0,
532
  "step": 2550
533
  },
534
  {
535
- "entropy": 6.047262029647827,
536
  "epoch": 1.4968336211859528,
537
- "grad_norm": 0.9270678758621216,
538
- "learning_rate": 1.417821782178218e-05,
539
- "loss": 5.8782,
540
- "mean_token_accuracy": 0.2467849862575531,
541
- "num_tokens": 2434128.0,
542
  "step": 2600
543
  },
544
  {
545
- "entropy": 6.00601068019867,
546
  "epoch": 1.5256188831318365,
547
- "grad_norm": 1.5378597974777222,
548
- "learning_rate": 1.406173558532324e-05,
549
- "loss": 5.839,
550
- "mean_token_accuracy": 0.25216978013515473,
551
- "num_tokens": 2480366.0,
552
  "step": 2650
553
  },
554
  {
555
- "entropy": 5.988714299201965,
556
  "epoch": 1.5544041450777202,
557
- "grad_norm": 0.819143533706665,
558
- "learning_rate": 1.3945253348864299e-05,
559
- "loss": 5.82,
560
- "mean_token_accuracy": 0.254311783015728,
561
- "num_tokens": 2527357.0,
562
  "step": 2700
563
  },
564
  {
565
- "entropy": 5.960293846130371,
566
  "epoch": 1.583189407023604,
567
- "grad_norm": 0.8920449614524841,
568
- "learning_rate": 1.382877111240536e-05,
569
- "loss": 5.7946,
570
- "mean_token_accuracy": 0.25750755161046984,
571
- "num_tokens": 2574470.0,
572
  "step": 2750
573
  },
574
  {
575
- "entropy": 6.1214879322052,
576
  "epoch": 1.6119746689694876,
577
- "grad_norm": 0.5333890914916992,
578
- "learning_rate": 1.371228887594642e-05,
579
- "loss": 5.9513,
580
- "mean_token_accuracy": 0.2377367687225342,
581
- "num_tokens": 2622280.0,
582
  "step": 2800
583
  },
584
  {
585
- "entropy": 5.951769871711731,
586
  "epoch": 1.6407599309153713,
587
- "grad_norm": 0.5994665026664734,
588
- "learning_rate": 1.3595806639487479e-05,
589
- "loss": 5.7861,
590
- "mean_token_accuracy": 0.25854207515716554,
591
- "num_tokens": 2668624.0,
592
  "step": 2850
593
  },
594
  {
595
- "entropy": 5.927765312194825,
596
  "epoch": 1.669545192861255,
597
- "grad_norm": 0.4460087716579437,
598
- "learning_rate": 1.347932440302854e-05,
599
- "loss": 5.7661,
600
- "mean_token_accuracy": 0.25973255425691605,
601
- "num_tokens": 2714388.0,
602
  "step": 2900
603
  },
604
  {
605
- "entropy": 6.097678365707398,
606
  "epoch": 1.6983304548071387,
607
- "grad_norm": 0.7125752568244934,
608
- "learning_rate": 1.3362842166569598e-05,
609
- "loss": 5.9284,
610
- "mean_token_accuracy": 0.23995368272066117,
611
- "num_tokens": 2761465.0,
612
  "step": 2950
613
  },
614
  {
615
- "entropy": 5.986212658882141,
616
  "epoch": 1.7271157167530224,
617
- "grad_norm": 1.5405049324035645,
618
- "learning_rate": 1.3246359930110659e-05,
619
- "loss": 5.8194,
620
- "mean_token_accuracy": 0.25333445996046067,
621
- "num_tokens": 2808066.0,
622
  "step": 3000
623
  },
624
  {
625
- "entropy": 5.7968806195259095,
626
  "epoch": 1.7559009786989062,
627
- "grad_norm": 0.4532749652862549,
628
- "learning_rate": 1.312987769365172e-05,
629
- "loss": 5.6344,
630
- "mean_token_accuracy": 0.2782411390542984,
631
- "num_tokens": 2851822.0,
632
  "step": 3050
633
  },
634
  {
635
- "entropy": 5.973708114624023,
636
  "epoch": 1.7846862406447899,
637
- "grad_norm": 1.4795438051223755,
638
- "learning_rate": 1.3013395457192778e-05,
639
- "loss": 5.8104,
640
- "mean_token_accuracy": 0.25441971331834795,
641
- "num_tokens": 2897737.0,
642
  "step": 3100
643
  },
644
  {
645
- "entropy": 5.70733567237854,
646
  "epoch": 1.8134715025906736,
647
- "grad_norm": 0.6216577887535095,
648
- "learning_rate": 1.2896913220733839e-05,
649
- "loss": 5.5523,
650
- "mean_token_accuracy": 0.28787180870771406,
651
- "num_tokens": 2939511.0,
652
  "step": 3150
653
  },
654
  {
655
- "entropy": 5.96826630115509,
656
  "epoch": 1.8422567645365573,
657
- "grad_norm": 0.9246350526809692,
658
- "learning_rate": 1.2780430984274898e-05,
659
- "loss": 5.8057,
660
- "mean_token_accuracy": 0.25464902341365814,
661
- "num_tokens": 2986368.0,
662
  "step": 3200
663
  },
664
  {
665
- "entropy": 5.950662693977356,
666
  "epoch": 1.871042026482441,
667
- "grad_norm": 0.8141199946403503,
668
- "learning_rate": 1.266394874781596e-05,
669
- "loss": 5.7886,
670
- "mean_token_accuracy": 0.25830793648958206,
671
- "num_tokens": 3031770.0,
672
  "step": 3250
673
  },
674
  {
675
- "entropy": 6.00512773513794,
676
  "epoch": 1.8998272884283247,
677
- "grad_norm": 0.4913998246192932,
678
- "learning_rate": 1.2547466511357018e-05,
679
- "loss": 5.838,
680
- "mean_token_accuracy": 0.2512077575922012,
681
- "num_tokens": 3078322.0,
682
  "step": 3300
683
  },
684
  {
685
- "entropy": 6.090880632400513,
686
  "epoch": 1.9286125503742084,
687
- "grad_norm": 0.9893012046813965,
688
- "learning_rate": 1.243098427489808e-05,
689
- "loss": 5.9264,
690
- "mean_token_accuracy": 0.2391783133149147,
691
- "num_tokens": 3125572.0,
692
  "step": 3350
693
  },
694
  {
695
- "entropy": 5.949693293571472,
696
  "epoch": 1.9573978123200921,
697
- "grad_norm": 0.5794200301170349,
698
- "learning_rate": 1.231450203843914e-05,
699
- "loss": 5.7861,
700
- "mean_token_accuracy": 0.2568664598464966,
701
- "num_tokens": 3171974.0,
702
  "step": 3400
703
  },
704
  {
705
- "entropy": 6.03591317653656,
706
  "epoch": 1.9861830742659758,
707
- "grad_norm": 0.8525373339653015,
708
- "learning_rate": 1.21980198019802e-05,
709
- "loss": 5.8741,
710
- "mean_token_accuracy": 0.24642003327608109,
711
- "num_tokens": 3219624.0,
712
  "step": 3450
713
  },
714
  {
715
  "epoch": 2.0,
716
- "eval_entropy": 6.272298685416648,
717
- "eval_loss": 6.12472677230835,
718
- "eval_mean_token_accuracy": 0.21168697409091458,
719
- "eval_model_preparation_time": 0.0036,
720
- "eval_num_tokens": 3239438.0,
721
- "eval_runtime": 76.2536,
722
- "eval_samples_per_second": 5.692,
723
- "eval_steps_per_second": 2.846,
724
  "step": 3474
725
  },
726
  {
727
- "entropy": 5.914763498306274,
728
  "epoch": 2.0149683362118593,
729
- "grad_norm": 0.5479806661605835,
730
- "learning_rate": 1.208153756552126e-05,
731
- "loss": 5.7559,
732
- "mean_token_accuracy": 0.2624077323079109,
733
- "num_tokens": 3263994.0,
734
  "step": 3500
735
  },
736
  {
737
- "entropy": 6.033470869064331,
738
  "epoch": 2.043753598157743,
739
- "grad_norm": 1.7186369895935059,
740
- "learning_rate": 1.1965055329062319e-05,
741
- "loss": 5.8677,
742
- "mean_token_accuracy": 0.24745646148920059,
743
- "num_tokens": 3311182.0,
744
  "step": 3550
745
  },
746
  {
747
- "entropy": 5.962404427528381,
748
  "epoch": 2.0725388601036268,
749
- "grad_norm": 0.9068580269813538,
750
- "learning_rate": 1.184857309260338e-05,
751
- "loss": 5.8038,
752
- "mean_token_accuracy": 0.25500513821840287,
753
- "num_tokens": 3358036.0,
754
  "step": 3600
755
  },
756
  {
757
- "entropy": 5.995727968215943,
758
  "epoch": 2.1013241220495105,
759
- "grad_norm": 2.044490337371826,
760
- "learning_rate": 1.1732090856144438e-05,
761
- "loss": 5.8333,
762
- "mean_token_accuracy": 0.2514388278126717,
763
- "num_tokens": 3404058.0,
764
  "step": 3650
765
  },
766
  {
767
- "entropy": 5.981345901489258,
768
  "epoch": 2.130109383995394,
769
- "grad_norm": 0.5262818336486816,
770
- "learning_rate": 1.1615608619685499e-05,
771
- "loss": 5.8205,
772
- "mean_token_accuracy": 0.2523340278863907,
773
- "num_tokens": 3449834.0,
774
  "step": 3700
775
  },
776
  {
777
- "entropy": 5.848710675239563,
778
  "epoch": 2.158894645941278,
779
- "grad_norm": 0.726718544960022,
780
- "learning_rate": 1.149912638322656e-05,
781
- "loss": 5.6891,
782
- "mean_token_accuracy": 0.2697497832775116,
783
- "num_tokens": 3494740.0,
784
  "step": 3750
785
  },
786
  {
787
- "entropy": 5.964878315925598,
788
  "epoch": 2.1876799078871616,
789
- "grad_norm": 0.6147393584251404,
790
- "learning_rate": 1.1382644146767618e-05,
791
- "loss": 5.8029,
792
- "mean_token_accuracy": 0.2553535890579224,
793
- "num_tokens": 3541342.0,
794
  "step": 3800
795
  },
796
  {
797
- "entropy": 6.045858116149902,
798
  "epoch": 2.2164651698330453,
799
- "grad_norm": 0.8283621072769165,
800
- "learning_rate": 1.1266161910308679e-05,
801
- "loss": 5.8802,
802
- "mean_token_accuracy": 0.24544916599988936,
803
- "num_tokens": 3588995.0,
804
  "step": 3850
805
  },
806
  {
807
- "entropy": 5.909895505905151,
808
  "epoch": 2.245250431778929,
809
- "grad_norm": 0.9912867546081543,
810
- "learning_rate": 1.1149679673849738e-05,
811
- "loss": 5.7481,
812
- "mean_token_accuracy": 0.2620398569107056,
813
- "num_tokens": 3634252.0,
814
  "step": 3900
815
  },
816
  {
817
- "entropy": 5.9534005498886104,
818
  "epoch": 2.2740356937248127,
819
- "grad_norm": 1.2012401819229126,
820
- "learning_rate": 1.1033197437390799e-05,
821
- "loss": 5.788,
822
- "mean_token_accuracy": 0.25642816990613937,
823
- "num_tokens": 3681197.0,
824
  "step": 3950
825
  },
826
  {
827
- "entropy": 6.155718851089477,
828
  "epoch": 2.3028209556706964,
829
- "grad_norm": 1.4272509813308716,
830
- "learning_rate": 1.0916715200931857e-05,
831
- "loss": 5.9842,
832
- "mean_token_accuracy": 0.23176315426826477,
833
- "num_tokens": 3729955.0,
834
  "step": 4000
835
  },
836
  {
837
- "entropy": 6.004842009544372,
838
  "epoch": 2.33160621761658,
839
- "grad_norm": 1.1919596195220947,
840
- "learning_rate": 1.0800232964472918e-05,
841
- "loss": 5.8332,
842
- "mean_token_accuracy": 0.25039500594139097,
843
- "num_tokens": 3777043.0,
844
  "step": 4050
845
  },
846
  {
847
- "entropy": 6.045269584655761,
848
  "epoch": 2.360391479562464,
849
- "grad_norm": 0.6200748085975647,
850
- "learning_rate": 1.068375072801398e-05,
851
- "loss": 5.8641,
852
- "mean_token_accuracy": 0.2466951721906662,
853
- "num_tokens": 3824067.0,
854
  "step": 4100
855
  },
856
  {
857
- "entropy": 6.105137758255005,
858
  "epoch": 2.3891767415083476,
859
- "grad_norm": 1.0185531377792358,
860
- "learning_rate": 1.0567268491555038e-05,
861
- "loss": 5.9181,
862
- "mean_token_accuracy": 0.24000227689743042,
863
- "num_tokens": 3872769.0,
864
  "step": 4150
865
  },
866
  {
867
- "entropy": 6.013391451835632,
868
  "epoch": 2.4179620034542313,
869
- "grad_norm": 0.6188511848449707,
870
- "learning_rate": 1.04507862550961e-05,
871
- "loss": 5.8286,
872
- "mean_token_accuracy": 0.25189226895570754,
873
- "num_tokens": 3919379.0,
874
  "step": 4200
875
  },
876
  {
877
- "entropy": 5.972923498153687,
878
  "epoch": 2.446747265400115,
879
- "grad_norm": 0.7165982127189636,
880
- "learning_rate": 1.0334304018637157e-05,
881
- "loss": 5.7908,
882
- "mean_token_accuracy": 0.2567197346687317,
883
- "num_tokens": 3965593.0,
884
  "step": 4250
885
  },
886
  {
887
- "entropy": 6.0378124713897705,
888
  "epoch": 2.4755325273459987,
889
- "grad_norm": 0.5278330445289612,
890
- "learning_rate": 1.021782178217822e-05,
891
- "loss": 5.8559,
892
- "mean_token_accuracy": 0.2484271454811096,
893
- "num_tokens": 4012300.0,
894
  "step": 4300
895
  },
896
  {
897
- "entropy": 5.984496111869812,
898
  "epoch": 2.5043177892918824,
899
- "grad_norm": 0.8995006680488586,
900
- "learning_rate": 1.0101339545719278e-05,
901
- "loss": 5.8092,
902
- "mean_token_accuracy": 0.253717774450779,
903
- "num_tokens": 4059323.0,
904
  "step": 4350
905
  },
906
  {
907
- "entropy": 6.124767150878906,
908
  "epoch": 2.533103051237766,
909
- "grad_norm": 1.3810409307479858,
910
- "learning_rate": 9.984857309260339e-06,
911
- "loss": 5.9468,
912
- "mean_token_accuracy": 0.23715158700942993,
913
- "num_tokens": 4107616.0,
914
  "step": 4400
915
  },
916
  {
917
- "entropy": 5.8810745000839235,
918
  "epoch": 2.56188831318365,
919
- "grad_norm": 0.8794332146644592,
920
- "learning_rate": 9.868375072801398e-06,
921
- "loss": 5.7089,
922
- "mean_token_accuracy": 0.2662400561571121,
923
- "num_tokens": 4152400.0,
924
  "step": 4450
925
  },
926
  {
927
- "entropy": 6.108017959594727,
928
  "epoch": 2.5906735751295336,
929
- "grad_norm": 0.5132983922958374,
930
- "learning_rate": 9.751892836342458e-06,
931
- "loss": 5.9346,
932
- "mean_token_accuracy": 0.23871887892484664,
933
- "num_tokens": 4200994.0,
934
  "step": 4500
935
  },
936
  {
937
- "entropy": 5.985005149841308,
938
  "epoch": 2.6194588370754173,
939
- "grad_norm": 0.6561470031738281,
940
- "learning_rate": 9.635410599883519e-06,
941
- "loss": 5.8111,
942
- "mean_token_accuracy": 0.25315980523824694,
943
- "num_tokens": 4247548.0,
944
  "step": 4550
945
  },
946
  {
947
- "entropy": 6.050709452629089,
948
  "epoch": 2.648244099021301,
949
- "grad_norm": 0.8790570497512817,
950
- "learning_rate": 9.51892836342458e-06,
951
- "loss": 5.8789,
952
- "mean_token_accuracy": 0.2440834751725197,
953
- "num_tokens": 4295250.0,
954
  "step": 4600
955
  },
956
  {
957
- "entropy": 6.007251596450805,
958
  "epoch": 2.6770293609671847,
959
- "grad_norm": 0.6728562116622925,
960
- "learning_rate": 9.402446126965639e-06,
961
- "loss": 5.8338,
962
- "mean_token_accuracy": 0.2509264424443245,
963
- "num_tokens": 4341599.0,
964
  "step": 4650
965
  },
966
  {
967
- "entropy": 5.966628184318543,
968
  "epoch": 2.7058146229130684,
969
- "grad_norm": 0.5815795063972473,
970
- "learning_rate": 9.285963890506699e-06,
971
- "loss": 5.7961,
972
- "mean_token_accuracy": 0.2559360232949257,
973
- "num_tokens": 4388673.0,
974
  "step": 4700
975
  },
976
  {
977
- "entropy": 5.7972593069076535,
978
  "epoch": 2.734599884858952,
979
- "grad_norm": 1.0610334873199463,
980
- "learning_rate": 9.169481654047758e-06,
981
- "loss": 5.6318,
982
- "mean_token_accuracy": 0.27574603259563446,
983
- "num_tokens": 4432959.0,
984
  "step": 4750
985
  },
986
  {
987
- "entropy": 5.984181261062622,
988
  "epoch": 2.763385146804836,
989
- "grad_norm": 2.1847357749938965,
990
- "learning_rate": 9.052999417588819e-06,
991
- "loss": 5.8153,
992
- "mean_token_accuracy": 0.2533784031867981,
993
- "num_tokens": 4479190.0,
994
  "step": 4800
995
  },
996
  {
997
- "entropy": 5.959725599288941,
998
  "epoch": 2.7921704087507195,
999
- "grad_norm": 0.5671709179878235,
1000
- "learning_rate": 8.936517181129878e-06,
1001
- "loss": 5.7912,
1002
- "mean_token_accuracy": 0.2556650054454803,
1003
- "num_tokens": 4525674.0,
1004
  "step": 4850
1005
  },
1006
  {
1007
- "entropy": 5.814929313659668,
1008
  "epoch": 2.8209556706966032,
1009
- "grad_norm": 0.9447108507156372,
1010
- "learning_rate": 8.820034944670938e-06,
1011
- "loss": 5.6478,
1012
- "mean_token_accuracy": 0.27417868226766584,
1013
- "num_tokens": 4570379.0,
1014
  "step": 4900
1015
  },
1016
  {
1017
- "entropy": 5.96754421710968,
1018
  "epoch": 2.849740932642487,
1019
- "grad_norm": 2.009676218032837,
1020
- "learning_rate": 8.703552708211999e-06,
1021
- "loss": 5.795,
1022
- "mean_token_accuracy": 0.2556305864453316,
1023
- "num_tokens": 4617184.0,
1024
  "step": 4950
1025
  },
1026
  {
1027
- "entropy": 6.008112049102783,
1028
  "epoch": 2.8785261945883707,
1029
- "grad_norm": 1.1977978944778442,
1030
- "learning_rate": 8.587070471753058e-06,
1031
- "loss": 5.8416,
1032
- "mean_token_accuracy": 0.2494604030251503,
1033
- "num_tokens": 4664180.0,
1034
  "step": 5000
1035
  },
1036
  {
1037
- "entropy": 5.832320966720581,
1038
  "epoch": 2.9073114565342544,
1039
- "grad_norm": 0.4845636785030365,
1040
- "learning_rate": 8.470588235294118e-06,
1041
- "loss": 5.6672,
1042
- "mean_token_accuracy": 0.27187123566865923,
1043
- "num_tokens": 4708377.0,
1044
  "step": 5050
1045
  },
1046
  {
1047
- "entropy": 5.84138514995575,
1048
  "epoch": 2.936096718480138,
1049
- "grad_norm": 0.8487229943275452,
1050
- "learning_rate": 8.354105998835179e-06,
1051
- "loss": 5.6769,
1052
- "mean_token_accuracy": 0.26995211571455,
1053
- "num_tokens": 4753587.0,
1054
  "step": 5100
1055
  },
1056
  {
1057
- "entropy": 6.016681690216064,
1058
  "epoch": 2.964881980426022,
1059
- "grad_norm": 0.9554332494735718,
1060
- "learning_rate": 8.237623762376238e-06,
1061
- "loss": 5.8479,
1062
- "mean_token_accuracy": 0.24785644590854644,
1063
- "num_tokens": 4800508.0,
1064
  "step": 5150
1065
  },
1066
  {
1067
- "entropy": 6.103472499847412,
1068
  "epoch": 2.9936672423719055,
1069
- "grad_norm": 0.6602863669395447,
1070
- "learning_rate": 8.121141525917298e-06,
1071
- "loss": 5.9305,
1072
- "mean_token_accuracy": 0.23794592499732972,
1073
- "num_tokens": 4849415.0,
1074
  "step": 5200
1075
  },
1076
  {
1077
  "epoch": 3.0,
1078
- "eval_entropy": 6.254081044878278,
1079
- "eval_loss": 6.0980024337768555,
1080
- "eval_mean_token_accuracy": 0.21401402258103894,
1081
- "eval_model_preparation_time": 0.0036,
1082
- "eval_num_tokens": 4859157.0,
1083
- "eval_runtime": 75.9443,
1084
- "eval_samples_per_second": 5.715,
1085
- "eval_steps_per_second": 2.857,
1086
  "step": 5211
1087
  },
1088
  {
1089
- "entropy": 5.829766502380371,
1090
  "epoch": 3.0224525043177892,
1091
- "grad_norm": 0.5663251280784607,
1092
- "learning_rate": 8.004659289458359e-06,
1093
- "loss": 5.6641,
1094
- "mean_token_accuracy": 0.27141522347927094,
1095
- "num_tokens": 4893297.0,
1096
  "step": 5250
1097
  },
1098
  {
1099
- "entropy": 5.987464437484741,
1100
  "epoch": 3.051237766263673,
1101
- "grad_norm": 0.7494759559631348,
1102
- "learning_rate": 7.888177052999418e-06,
1103
- "loss": 5.8163,
1104
- "mean_token_accuracy": 0.2527216270565987,
1105
- "num_tokens": 4940190.0,
1106
  "step": 5300
1107
  },
1108
  {
1109
- "entropy": 5.922745175361634,
1110
  "epoch": 3.0800230282095566,
1111
- "grad_norm": 1.0836946964263916,
1112
- "learning_rate": 7.771694816540478e-06,
1113
- "loss": 5.7536,
1114
- "mean_token_accuracy": 0.26056944400072096,
1115
- "num_tokens": 4986555.0,
1116
  "step": 5350
1117
  },
1118
  {
1119
- "entropy": 5.940353560447693,
1120
  "epoch": 3.1088082901554404,
1121
- "grad_norm": 0.9733691811561584,
1122
- "learning_rate": 7.655212580081537e-06,
1123
- "loss": 5.7726,
1124
- "mean_token_accuracy": 0.25769122928380966,
1125
- "num_tokens": 5033343.0,
1126
  "step": 5400
1127
  },
1128
  {
1129
- "entropy": 6.020898208618164,
1130
  "epoch": 3.137593552101324,
1131
- "grad_norm": 0.7032522559165955,
1132
- "learning_rate": 7.538730343622598e-06,
1133
- "loss": 5.8525,
1134
- "mean_token_accuracy": 0.2485825625061989,
1135
- "num_tokens": 5080498.0,
1136
  "step": 5450
1137
  },
1138
  {
1139
- "entropy": 5.953487596511841,
1140
  "epoch": 3.166378814047208,
1141
- "grad_norm": 0.34246090054512024,
1142
- "learning_rate": 7.422248107163658e-06,
1143
- "loss": 5.7907,
1144
- "mean_token_accuracy": 0.25604957044124604,
1145
- "num_tokens": 5126524.0,
1146
  "step": 5500
1147
  },
1148
  {
1149
- "entropy": 6.124787425994873,
1150
  "epoch": 3.1951640759930915,
1151
- "grad_norm": 0.8139322400093079,
1152
- "learning_rate": 7.305765870704718e-06,
1153
- "loss": 5.9528,
1154
- "mean_token_accuracy": 0.23523027300834656,
1155
- "num_tokens": 5176187.0,
1156
  "step": 5550
1157
  },
1158
  {
1159
- "entropy": 5.990889682769775,
1160
  "epoch": 3.223949337938975,
1161
- "grad_norm": 0.47230103611946106,
1162
- "learning_rate": 7.189283634245778e-06,
1163
- "loss": 5.8239,
1164
- "mean_token_accuracy": 0.2525310072302818,
1165
- "num_tokens": 5223101.0,
1166
  "step": 5600
1167
  },
1168
  {
1169
- "entropy": 5.9849296569824215,
1170
  "epoch": 3.252734599884859,
1171
- "grad_norm": 1.0184181928634644,
1172
- "learning_rate": 7.072801397786838e-06,
1173
- "loss": 5.8195,
1174
- "mean_token_accuracy": 0.25241400361061095,
1175
- "num_tokens": 5270200.0,
1176
  "step": 5650
1177
  },
1178
  {
1179
- "entropy": 5.9844825649261475,
1180
  "epoch": 3.2815198618307426,
1181
- "grad_norm": 0.8858366012573242,
1182
- "learning_rate": 6.956319161327898e-06,
1183
- "loss": 5.8149,
1184
- "mean_token_accuracy": 0.2527842208743095,
1185
- "num_tokens": 5315872.0,
1186
  "step": 5700
1187
  },
1188
  {
1189
- "entropy": 6.0137806224823,
1190
  "epoch": 3.3103051237766263,
1191
- "grad_norm": 1.8156790733337402,
1192
- "learning_rate": 6.839836924868957e-06,
1193
- "loss": 5.8448,
1194
- "mean_token_accuracy": 0.24918658077716827,
1195
- "num_tokens": 5362860.0,
1196
  "step": 5750
1197
  },
1198
  {
1199
- "entropy": 5.8676600885391235,
1200
  "epoch": 3.33909038572251,
1201
- "grad_norm": 0.5497516393661499,
1202
- "learning_rate": 6.723354688410018e-06,
1203
- "loss": 5.7041,
1204
- "mean_token_accuracy": 0.2672875428199768,
1205
- "num_tokens": 5407854.0,
1206
  "step": 5800
1207
  },
1208
  {
1209
- "entropy": 5.702701902389526,
1210
  "epoch": 3.3678756476683938,
1211
- "grad_norm": 1.8813326358795166,
1212
- "learning_rate": 6.606872451951079e-06,
1213
- "loss": 5.5403,
1214
- "mean_token_accuracy": 0.2881160417199135,
1215
- "num_tokens": 5450830.0,
1216
  "step": 5850
1217
  },
1218
  {
1219
- "entropy": 5.931481714248657,
1220
  "epoch": 3.3966609096142775,
1221
- "grad_norm": 0.664723813533783,
1222
- "learning_rate": 6.490390215492138e-06,
1223
- "loss": 5.7643,
1224
- "mean_token_accuracy": 0.2589978861808777,
1225
- "num_tokens": 5497011.0,
1226
  "step": 5900
1227
  },
1228
  {
1229
- "entropy": 5.998486938476563,
1230
  "epoch": 3.425446171560161,
1231
- "grad_norm": 0.8670396208763123,
1232
- "learning_rate": 6.373907979033198e-06,
1233
- "loss": 5.8296,
1234
- "mean_token_accuracy": 0.2514311093091965,
1235
- "num_tokens": 5542914.0,
1236
  "step": 5950
1237
  },
1238
  {
1239
- "entropy": 6.03099499464035,
1240
  "epoch": 3.454231433506045,
1241
- "grad_norm": 0.6441876292228699,
1242
- "learning_rate": 6.257425742574258e-06,
1243
- "loss": 5.865,
1244
- "mean_token_accuracy": 0.2457648393511772,
1245
- "num_tokens": 5591031.0,
1246
  "step": 6000
1247
  },
1248
  {
1249
- "entropy": 5.982430481910706,
1250
  "epoch": 3.4830166954519286,
1251
- "grad_norm": 0.8924009799957275,
1252
- "learning_rate": 6.140943506115318e-06,
1253
- "loss": 5.8162,
1254
- "mean_token_accuracy": 0.2524935993552208,
1255
- "num_tokens": 5638071.0,
1256
  "step": 6050
1257
  },
1258
  {
1259
- "entropy": 5.9037020778656,
1260
  "epoch": 3.5118019573978123,
1261
- "grad_norm": 0.6898691654205322,
1262
- "learning_rate": 6.024461269656377e-06,
1263
- "loss": 5.7369,
1264
- "mean_token_accuracy": 0.2629904666543007,
1265
- "num_tokens": 5684251.0,
1266
  "step": 6100
1267
  },
1268
  {
1269
- "entropy": 6.049537987709045,
1270
  "epoch": 3.540587219343696,
1271
- "grad_norm": 0.7176857590675354,
1272
- "learning_rate": 5.907979033197437e-06,
1273
- "loss": 5.8796,
1274
- "mean_token_accuracy": 0.24504777789115906,
1275
- "num_tokens": 5732144.0,
1276
  "step": 6150
1277
  },
1278
  {
1279
- "entropy": 5.896630597114563,
1280
  "epoch": 3.5693724812895797,
1281
- "grad_norm": 0.2607983350753784,
1282
- "learning_rate": 5.7914967967384986e-06,
1283
- "loss": 5.7322,
1284
- "mean_token_accuracy": 0.2642005959153175,
1285
- "num_tokens": 5777711.0,
1286
  "step": 6200
1287
  },
1288
  {
1289
- "entropy": 5.914587182998657,
1290
  "epoch": 3.5981577432354634,
1291
- "grad_norm": 1.745258092880249,
1292
- "learning_rate": 5.675014560279558e-06,
1293
- "loss": 5.75,
1294
- "mean_token_accuracy": 0.26022892773151396,
1295
- "num_tokens": 5823662.0,
1296
  "step": 6250
1297
  },
1298
  {
1299
- "entropy": 6.1133457374572755,
1300
  "epoch": 3.626943005181347,
1301
- "grad_norm": 1.2074909210205078,
1302
- "learning_rate": 5.558532323820618e-06,
1303
- "loss": 5.9456,
1304
- "mean_token_accuracy": 0.2363065341114998,
1305
- "num_tokens": 5872695.0,
1306
  "step": 6300
1307
  },
1308
  {
1309
- "entropy": 5.927231726646423,
1310
  "epoch": 3.655728267127231,
1311
- "grad_norm": 0.4929303824901581,
1312
- "learning_rate": 5.442050087361678e-06,
1313
- "loss": 5.7627,
1314
- "mean_token_accuracy": 0.2595584252476692,
1315
- "num_tokens": 5919280.0,
1316
  "step": 6350
1317
  },
1318
  {
1319
- "entropy": 6.085461645126343,
1320
  "epoch": 3.6845135290731146,
1321
- "grad_norm": 0.6310611367225647,
1322
- "learning_rate": 5.325567850902738e-06,
1323
- "loss": 5.9152,
1324
- "mean_token_accuracy": 0.23941247612237931,
1325
- "num_tokens": 5967702.0,
1326
  "step": 6400
1327
  },
1328
  {
1329
- "entropy": 5.884929132461548,
1330
  "epoch": 3.7132987910189983,
1331
- "grad_norm": 1.1140141487121582,
1332
- "learning_rate": 5.209085614443797e-06,
1333
- "loss": 5.7183,
1334
- "mean_token_accuracy": 0.26445025473833084,
1335
- "num_tokens": 6012476.0,
1336
  "step": 6450
1337
  },
1338
  {
1339
- "entropy": 5.980639338493347,
1340
  "epoch": 3.742084052964882,
1341
- "grad_norm": 0.8079864978790283,
1342
- "learning_rate": 5.092603377984858e-06,
1343
- "loss": 5.8145,
1344
- "mean_token_accuracy": 0.2531423449516296,
1345
- "num_tokens": 6059915.0,
1346
  "step": 6500
1347
  },
1348
  {
1349
- "entropy": 6.113835816383362,
1350
  "epoch": 3.7708693149107657,
1351
- "grad_norm": 1.0994234085083008,
1352
- "learning_rate": 4.976121141525918e-06,
1353
- "loss": 5.9438,
1354
- "mean_token_accuracy": 0.2360466265678406,
1355
- "num_tokens": 6109703.0,
1356
  "step": 6550
1357
  },
1358
  {
1359
- "entropy": 5.891526069641113,
1360
  "epoch": 3.7996545768566494,
1361
- "grad_norm": 0.6459522843360901,
1362
- "learning_rate": 4.859638905066978e-06,
1363
- "loss": 5.7234,
1364
- "mean_token_accuracy": 0.264646929204464,
1365
- "num_tokens": 6155107.0,
1366
  "step": 6600
1367
  },
1368
  {
1369
- "entropy": 5.878629055023193,
1370
  "epoch": 3.828439838802533,
1371
- "grad_norm": 2.0741031169891357,
1372
- "learning_rate": 4.743156668608038e-06,
1373
- "loss": 5.7132,
1374
- "mean_token_accuracy": 0.26615093410015106,
1375
- "num_tokens": 6201565.0,
1376
  "step": 6650
1377
  },
1378
  {
1379
- "entropy": 6.0427888488769534,
1380
  "epoch": 3.857225100748417,
1381
- "grad_norm": 0.6295380592346191,
1382
- "learning_rate": 4.626674432149098e-06,
1383
- "loss": 5.8755,
1384
- "mean_token_accuracy": 0.24501585960388184,
1385
- "num_tokens": 6249569.0,
1386
  "step": 6700
1387
  },
1388
  {
1389
- "entropy": 5.9715061330795285,
1390
  "epoch": 3.8860103626943006,
1391
- "grad_norm": 0.5263471007347107,
1392
- "learning_rate": 4.5101921956901576e-06,
1393
- "loss": 5.8028,
1394
- "mean_token_accuracy": 0.25363644570112226,
1395
- "num_tokens": 6296537.0,
1396
  "step": 6750
1397
  },
1398
  {
1399
- "entropy": 5.886189122200012,
1400
  "epoch": 3.9147956246401843,
1401
- "grad_norm": 0.8449739217758179,
1402
- "learning_rate": 4.393709959231217e-06,
1403
- "loss": 5.7206,
1404
- "mean_token_accuracy": 0.26399643808603285,
1405
- "num_tokens": 6341373.0,
1406
  "step": 6800
1407
  },
1408
  {
1409
- "entropy": 6.144029655456543,
1410
  "epoch": 3.943580886586068,
1411
- "grad_norm": 0.9823312163352966,
1412
- "learning_rate": 4.277227722772277e-06,
1413
- "loss": 5.9724,
1414
- "mean_token_accuracy": 0.23244859367609025,
1415
- "num_tokens": 6391079.0,
1416
  "step": 6850
1417
  },
1418
  {
1419
- "entropy": 5.7569769096374515,
1420
  "epoch": 3.9723661485319517,
1421
- "grad_norm": 1.98943030834198,
1422
- "learning_rate": 4.160745486313338e-06,
1423
- "loss": 5.5963,
1424
- "mean_token_accuracy": 0.2798686361312866,
1425
- "num_tokens": 6434857.0,
1426
  "step": 6900
1427
  },
1428
  {
1429
  "epoch": 4.0,
1430
- "eval_entropy": 6.247500973363077,
1431
- "eval_loss": 6.0967888832092285,
1432
- "eval_mean_token_accuracy": 0.2140663956304849,
1433
- "eval_model_preparation_time": 0.0036,
1434
- "eval_num_tokens": 6478876.0,
1435
- "eval_runtime": 76.4612,
1436
- "eval_samples_per_second": 5.676,
1437
- "eval_steps_per_second": 2.838,
1438
  "step": 6948
1439
  }
1440
  ],
1441
  "logging_steps": 50,
1442
- "max_steps": 8685,
1443
  "num_input_tokens_seen": 0,
1444
- "num_train_epochs": 5,
1445
  "save_steps": 500,
1446
  "stateful_callbacks": {
1447
  "TrainerControl": {
@@ -1450,12 +1450,12 @@
1450
  "should_evaluate": false,
1451
  "should_log": false,
1452
  "should_save": true,
1453
- "should_training_stop": false
1454
  },
1455
  "attributes": {}
1456
  }
1457
  },
1458
- "total_flos": 9.054713052854784e+16,
1459
  "train_batch_size": 2,
1460
  "trial_name": null,
1461
  "trial_params": null
 
1
  {
2
  "best_global_step": 6948,
3
+ "best_metric": 5.622366428375244,
4
  "best_model_checkpoint": "./output/checkpoint-6948",
5
  "epoch": 4.0,
6
  "eval_steps": 500,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 3.6583470726013183,
14
  "epoch": 0.028785261945883708,
15
+ "grad_norm": 3.3817152976989746,
16
+ "learning_rate": 4.9e-07,
17
+ "loss": 13.8754,
18
+ "mean_token_accuracy": 0.15036460414528846,
19
+ "num_tokens": 53093.0,
20
  "step": 50
21
  },
22
  {
23
+ "entropy": 3.669608063697815,
24
  "epoch": 0.057570523891767415,
25
+ "grad_norm": 3.2541544437408447,
26
+ "learning_rate": 9.9e-07,
27
+ "loss": 14.2282,
28
+ "mean_token_accuracy": 0.14137721598148345,
29
+ "num_tokens": 108334.0,
30
  "step": 100
31
  },
32
  {
33
+ "entropy": 3.569736371040344,
34
  "epoch": 0.08635578583765112,
35
+ "grad_norm": 3.6797454357147217,
36
+ "learning_rate": 1.49e-06,
37
+ "loss": 13.0735,
38
+ "mean_token_accuracy": 0.17473630651831626,
39
+ "num_tokens": 157491.0,
40
  "step": 150
41
  },
42
  {
43
+ "entropy": 3.7253233194351196,
44
  "epoch": 0.11514104778353483,
45
+ "grad_norm": 4.297911643981934,
46
+ "learning_rate": 1.99e-06,
47
+ "loss": 13.7392,
48
+ "mean_token_accuracy": 0.1473099772632122,
49
+ "num_tokens": 211394.0,
50
  "step": 200
51
  },
52
  {
53
+ "entropy": 3.8280500602722167,
54
  "epoch": 0.14392630972941853,
55
+ "grad_norm": 4.405268669128418,
56
+ "learning_rate": 1.9854771784232364e-06,
57
+ "loss": 13.0797,
58
+ "mean_token_accuracy": 0.16704789966344832,
59
+ "num_tokens": 263685.0,
60
  "step": 250
61
  },
62
  {
63
+ "entropy": 4.066333084106446,
64
  "epoch": 0.17271157167530224,
65
+ "grad_norm": 4.757556438446045,
66
+ "learning_rate": 1.9706579727326615e-06,
67
+ "loss": 12.6321,
68
+ "mean_token_accuracy": 0.1691790708899498,
69
+ "num_tokens": 314059.0,
70
  "step": 300
71
  },
72
  {
73
+ "entropy": 4.257266030311585,
74
  "epoch": 0.20149683362118595,
75
+ "grad_norm": 6.406249523162842,
76
+ "learning_rate": 1.955838767042086e-06,
77
+ "loss": 12.2253,
78
+ "mean_token_accuracy": 0.17223650276660918,
79
+ "num_tokens": 367038.0,
80
  "step": 350
81
  },
82
  {
83
+ "entropy": 4.694105777740479,
84
  "epoch": 0.23028209556706966,
85
+ "grad_norm": 12.57987117767334,
86
+ "learning_rate": 1.9410195613515113e-06,
87
+ "loss": 11.9714,
88
+ "mean_token_accuracy": 0.15997304677963256,
89
+ "num_tokens": 420327.0,
90
  "step": 400
91
  },
92
  {
93
+ "entropy": 5.205010280609131,
94
  "epoch": 0.25906735751295334,
95
+ "grad_norm": 15.570313453674316,
96
+ "learning_rate": 1.9262003556609364e-06,
97
+ "loss": 10.8173,
98
+ "mean_token_accuracy": 0.16447648257017136,
99
+ "num_tokens": 472429.0,
100
  "step": 450
101
  },
102
  {
103
+ "entropy": 5.917805089950561,
104
  "epoch": 0.28785261945883706,
105
+ "grad_norm": 23.61503791809082,
106
+ "learning_rate": 1.9113811499703615e-06,
107
+ "loss": 9.3196,
108
+ "mean_token_accuracy": 0.16179455041885377,
109
+ "num_tokens": 526315.0,
110
  "step": 500
111
  },
112
  {
113
+ "entropy": 6.380368332862854,
114
  "epoch": 0.31663788140472077,
115
+ "grad_norm": 13.846810340881348,
116
+ "learning_rate": 1.8965619442797864e-06,
117
+ "loss": 7.9636,
118
+ "mean_token_accuracy": 0.16881170988082886,
119
+ "num_tokens": 578511.0,
120
  "step": 550
121
  },
122
  {
123
+ "entropy": 6.507339992523193,
124
  "epoch": 0.3454231433506045,
125
+ "grad_norm": 4.569090366363525,
126
+ "learning_rate": 1.8817427385892115e-06,
127
+ "loss": 7.4171,
128
+ "mean_token_accuracy": 0.16941152423620223,
129
+ "num_tokens": 630937.0,
130
  "step": 600
131
  },
132
  {
133
+ "entropy": 6.392864561080932,
134
  "epoch": 0.3742084052964882,
135
+ "grad_norm": 4.594696521759033,
136
+ "learning_rate": 1.8669235328986366e-06,
137
+ "loss": 6.9389,
138
+ "mean_token_accuracy": 0.1844496901333332,
139
+ "num_tokens": 680501.0,
140
  "step": 650
141
  },
142
  {
143
+ "entropy": 6.6726202869415285,
144
  "epoch": 0.4029936672423719,
145
+ "grad_norm": 4.768734931945801,
146
+ "learning_rate": 1.8521043272080617e-06,
147
+ "loss": 6.9818,
148
+ "mean_token_accuracy": 0.16990411713719367,
149
+ "num_tokens": 733231.0,
150
  "step": 700
151
  },
152
  {
153
+ "entropy": 6.592793455123902,
154
  "epoch": 0.4317789291882556,
155
+ "grad_norm": 3.253056764602661,
156
+ "learning_rate": 1.8372851215174864e-06,
157
+ "loss": 6.7105,
158
+ "mean_token_accuracy": 0.18250102579593658,
159
+ "num_tokens": 785373.0,
160
  "step": 750
161
  },
162
  {
163
+ "entropy": 6.683582029342651,
164
  "epoch": 0.4605641911341393,
165
+ "grad_norm": 2.1871063709259033,
166
+ "learning_rate": 1.8224659158269115e-06,
167
+ "loss": 6.6685,
168
+ "mean_token_accuracy": 0.17129646152257919,
169
+ "num_tokens": 838646.0,
170
  "step": 800
171
  },
172
  {
173
+ "entropy": 6.636875295639038,
174
  "epoch": 0.48934945308002303,
175
+ "grad_norm": 3.2284677028656006,
176
+ "learning_rate": 1.8076467101363366e-06,
177
+ "loss": 6.53,
178
+ "mean_token_accuracy": 0.18053789794445038,
179
+ "num_tokens": 892380.0,
180
  "step": 850
181
  },
182
  {
183
+ "entropy": 6.610673260688782,
184
  "epoch": 0.5181347150259067,
185
+ "grad_norm": 2.2088730335235596,
186
+ "learning_rate": 1.7928275044457617e-06,
187
+ "loss": 6.4429,
188
+ "mean_token_accuracy": 0.18492739230394364,
189
+ "num_tokens": 947971.0,
190
  "step": 900
191
  },
192
  {
193
+ "entropy": 6.242899022102356,
194
  "epoch": 0.5469199769717904,
195
+ "grad_norm": 2.3000030517578125,
196
+ "learning_rate": 1.7780082987551866e-06,
197
+ "loss": 6.047,
198
+ "mean_token_accuracy": 0.2291259828209877,
199
+ "num_tokens": 998810.0,
200
  "step": 950
201
  },
202
  {
203
+ "entropy": 6.311488924026489,
204
  "epoch": 0.5757052389176741,
205
+ "grad_norm": 2.1333675384521484,
206
+ "learning_rate": 1.7631890930646115e-06,
207
+ "loss": 6.0919,
208
+ "mean_token_accuracy": 0.22644571751356124,
209
+ "num_tokens": 1050860.0,
210
  "step": 1000
211
  },
212
  {
213
+ "entropy": 6.3254336166381835,
214
  "epoch": 0.6044905008635578,
215
+ "grad_norm": 2.0400779247283936,
216
+ "learning_rate": 1.7483698873740366e-06,
217
+ "loss": 6.094,
218
+ "mean_token_accuracy": 0.2222653564810753,
219
+ "num_tokens": 1104304.0,
220
  "step": 1050
221
  },
222
  {
223
+ "entropy": 6.046922063827514,
224
  "epoch": 0.6332757628094415,
225
+ "grad_norm": 2.8049051761627197,
226
+ "learning_rate": 1.7335506816834617e-06,
227
+ "loss": 5.8011,
228
+ "mean_token_accuracy": 0.25127078920602797,
229
+ "num_tokens": 1153605.0,
230
  "step": 1100
231
  },
232
  {
233
+ "entropy": 5.943600912094116,
234
  "epoch": 0.6620610247553252,
235
+ "grad_norm": 4.063963890075684,
236
+ "learning_rate": 1.7187314759928866e-06,
237
+ "loss": 5.6855,
238
+ "mean_token_accuracy": 0.26265266716480257,
239
+ "num_tokens": 1204328.0,
240
  "step": 1150
241
  },
242
  {
243
+ "entropy": 6.12883231639862,
244
  "epoch": 0.690846286701209,
245
+ "grad_norm": 3.9440460205078125,
246
+ "learning_rate": 1.7039122703023117e-06,
247
+ "loss": 5.8578,
248
+ "mean_token_accuracy": 0.24439335912466048,
249
+ "num_tokens": 1257415.0,
250
  "step": 1200
251
  },
252
  {
253
+ "entropy": 6.164987115859986,
254
  "epoch": 0.7196315486470927,
255
+ "grad_norm": 3.20070481300354,
256
+ "learning_rate": 1.6890930646117368e-06,
257
+ "loss": 5.8876,
258
+ "mean_token_accuracy": 0.24275501281023026,
259
+ "num_tokens": 1310049.0,
260
  "step": 1250
261
  },
262
  {
263
+ "entropy": 6.080997190475464,
264
  "epoch": 0.7484168105929764,
265
+ "grad_norm": 2.8067362308502197,
266
+ "learning_rate": 1.6742738589211617e-06,
267
+ "loss": 5.8058,
268
+ "mean_token_accuracy": 0.25242207854986193,
269
+ "num_tokens": 1361794.0,
270
  "step": 1300
271
  },
272
  {
273
+ "entropy": 5.940848155021667,
274
  "epoch": 0.7772020725388601,
275
+ "grad_norm": 2.6375925540924072,
276
+ "learning_rate": 1.6594546532305868e-06,
277
+ "loss": 5.6718,
278
+ "mean_token_accuracy": 0.2665082859992981,
279
+ "num_tokens": 1412773.0,
280
  "step": 1350
281
  },
282
  {
283
+ "entropy": 6.071129274368286,
284
  "epoch": 0.8059873344847438,
285
+ "grad_norm": 3.951350212097168,
286
+ "learning_rate": 1.6446354475400117e-06,
287
+ "loss": 5.8012,
288
+ "mean_token_accuracy": 0.25434976994991304,
289
+ "num_tokens": 1465620.0,
290
  "step": 1400
291
  },
292
  {
293
+ "entropy": 6.069429359436035,
294
  "epoch": 0.8347725964306275,
295
+ "grad_norm": 3.580608606338501,
296
+ "learning_rate": 1.6298162418494368e-06,
297
+ "loss": 5.8027,
298
+ "mean_token_accuracy": 0.25208072274923327,
299
+ "num_tokens": 1518899.0,
300
  "step": 1450
301
  },
302
  {
303
+ "entropy": 6.005315380096436,
304
  "epoch": 0.8635578583765112,
305
+ "grad_norm": 3.9580376148223877,
306
+ "learning_rate": 1.614997036158862e-06,
307
+ "loss": 5.7364,
308
+ "mean_token_accuracy": 0.25940640360116957,
309
+ "num_tokens": 1571304.0,
310
  "step": 1500
311
  },
312
  {
313
+ "entropy": 6.0786464881896975,
314
  "epoch": 0.8923431203223949,
315
+ "grad_norm": 4.55721378326416,
316
+ "learning_rate": 1.6001778304682868e-06,
317
+ "loss": 5.8092,
318
+ "mean_token_accuracy": 0.2496869170665741,
319
+ "num_tokens": 1627369.0,
320
  "step": 1550
321
  },
322
  {
323
+ "entropy": 5.939382014274597,
324
  "epoch": 0.9211283822682786,
325
+ "grad_norm": 2.330057144165039,
326
+ "learning_rate": 1.5853586247777117e-06,
327
+ "loss": 5.6604,
328
+ "mean_token_accuracy": 0.2686630353331566,
329
+ "num_tokens": 1680401.0,
330
  "step": 1600
331
  },
332
  {
333
+ "entropy": 6.121775646209716,
334
  "epoch": 0.9499136442141624,
335
+ "grad_norm": 2.9881200790405273,
336
+ "learning_rate": 1.5705394190871368e-06,
337
+ "loss": 5.8388,
338
+ "mean_token_accuracy": 0.2503683388233185,
339
+ "num_tokens": 1735745.0,
340
  "step": 1650
341
  },
342
  {
343
+ "entropy": 5.840040788650513,
344
  "epoch": 0.9786989061600461,
345
+ "grad_norm": 3.798994779586792,
346
+ "learning_rate": 1.555720213396562e-06,
347
+ "loss": 5.5635,
348
+ "mean_token_accuracy": 0.278279125392437,
349
+ "num_tokens": 1786896.0,
350
  "step": 1700
351
  },
352
  {
353
  "epoch": 1.0,
354
+ "eval_entropy": 6.139133475343203,
355
+ "eval_loss": 5.861395835876465,
356
+ "eval_mean_token_accuracy": 0.2402858340657801,
357
+ "eval_model_preparation_time": 0.0047,
358
+ "eval_num_tokens": 1825107.0,
359
+ "eval_runtime": 79.3994,
360
+ "eval_samples_per_second": 5.466,
361
+ "eval_steps_per_second": 2.733,
362
  "step": 1737
363
  },
364
  {
365
+ "entropy": 5.8970259666442875,
366
  "epoch": 1.0074841681059297,
367
+ "grad_norm": 2.6411802768707275,
368
+ "learning_rate": 1.540901007705987e-06,
369
+ "loss": 5.614,
370
+ "mean_token_accuracy": 0.273006406724453,
371
+ "num_tokens": 1838864.0,
372
  "step": 1750
373
  },
374
  {
375
+ "entropy": 6.0111794090271,
376
  "epoch": 1.0362694300518134,
377
+ "grad_norm": 3.6491827964782715,
378
+ "learning_rate": 1.526081802015412e-06,
379
+ "loss": 5.7323,
380
+ "mean_token_accuracy": 0.26104256987571717,
381
+ "num_tokens": 1893816.0,
382
  "step": 1800
383
  },
384
  {
385
+ "entropy": 5.902219276428223,
386
  "epoch": 1.065054691997697,
387
+ "grad_norm": 2.593249559402466,
388
+ "learning_rate": 1.5112625963248368e-06,
389
+ "loss": 5.6187,
390
+ "mean_token_accuracy": 0.2746362566947937,
391
+ "num_tokens": 1946532.0,
392
  "step": 1850
393
  },
394
  {
395
+ "entropy": 5.874705944061279,
396
  "epoch": 1.0938399539435808,
397
+ "grad_norm": 2.554327964782715,
398
+ "learning_rate": 1.496443390634262e-06,
399
+ "loss": 5.6021,
400
+ "mean_token_accuracy": 0.2795292744040489,
401
+ "num_tokens": 2000184.0,
402
  "step": 1900
403
  },
404
  {
405
+ "entropy": 5.850096368789673,
406
  "epoch": 1.1226252158894645,
407
+ "grad_norm": 3.6060993671417236,
408
+ "learning_rate": 1.481624184943687e-06,
409
+ "loss": 5.576,
410
+ "mean_token_accuracy": 0.28532547056674956,
411
+ "num_tokens": 2052250.0,
412
  "step": 1950
413
  },
414
  {
415
+ "entropy": 5.802229671478272,
416
  "epoch": 1.1514104778353482,
417
+ "grad_norm": 3.0913314819335938,
418
+ "learning_rate": 1.466804979253112e-06,
419
+ "loss": 5.53,
420
+ "mean_token_accuracy": 0.2916027933359146,
421
+ "num_tokens": 2103531.0,
422
  "step": 2000
423
  },
424
  {
425
+ "entropy": 5.875646467208862,
426
  "epoch": 1.180195739781232,
427
+ "grad_norm": 4.777045726776123,
428
+ "learning_rate": 1.451985773562537e-06,
429
+ "loss": 5.6146,
430
+ "mean_token_accuracy": 0.28063644528388976,
431
+ "num_tokens": 2157098.0,
432
  "step": 2050
433
  },
434
  {
435
+ "entropy": 5.786596937179565,
436
  "epoch": 1.2089810017271156,
437
+ "grad_norm": 4.207762718200684,
438
+ "learning_rate": 1.437166567871962e-06,
439
+ "loss": 5.5417,
440
+ "mean_token_accuracy": 0.2870470091700554,
441
+ "num_tokens": 2211827.0,
442
  "step": 2100
443
  },
444
  {
445
+ "entropy": 5.672234449386597,
446
  "epoch": 1.2377662636729994,
447
+ "grad_norm": 2.2771811485290527,
448
+ "learning_rate": 1.422347362181387e-06,
449
+ "loss": 5.4285,
450
+ "mean_token_accuracy": 0.30194485366344453,
451
+ "num_tokens": 2262174.0,
452
  "step": 2150
453
  },
454
  {
455
+ "entropy": 5.862573285102844,
456
  "epoch": 1.266551525618883,
457
+ "grad_norm": 3.3273422718048096,
458
+ "learning_rate": 1.4075281564908121e-06,
459
+ "loss": 5.6169,
460
+ "mean_token_accuracy": 0.278145115673542,
461
+ "num_tokens": 2316440.0,
462
  "step": 2200
463
  },
464
  {
465
+ "entropy": 5.734760231971741,
466
  "epoch": 1.2953367875647668,
467
+ "grad_norm": 3.7049715518951416,
468
+ "learning_rate": 1.392708950800237e-06,
469
+ "loss": 5.493,
470
+ "mean_token_accuracy": 0.2941485676169395,
471
+ "num_tokens": 2368468.0,
472
  "step": 2250
473
  },
474
  {
475
+ "entropy": 5.665819988250733,
476
  "epoch": 1.3241220495106505,
477
+ "grad_norm": 3.572636604309082,
478
+ "learning_rate": 1.3778897451096621e-06,
479
+ "loss": 5.4352,
480
+ "mean_token_accuracy": 0.3003745040297508,
481
+ "num_tokens": 2421180.0,
482
  "step": 2300
483
  },
484
  {
485
+ "entropy": 5.890115032196045,
486
  "epoch": 1.3529073114565342,
487
+ "grad_norm": 2.738203525543213,
488
+ "learning_rate": 1.3630705394190872e-06,
489
+ "loss": 5.6555,
490
+ "mean_token_accuracy": 0.2737997192144394,
491
+ "num_tokens": 2476255.0,
492
  "step": 2350
493
  },
494
  {
495
+ "entropy": 5.66056040763855,
496
  "epoch": 1.381692573402418,
497
+ "grad_norm": 3.1416995525360107,
498
+ "learning_rate": 1.3482513337285121e-06,
499
+ "loss": 5.4302,
500
+ "mean_token_accuracy": 0.3000989046692848,
501
+ "num_tokens": 2527674.0,
502
  "step": 2400
503
  },
504
  {
505
+ "entropy": 5.861240615844727,
506
  "epoch": 1.4104778353483016,
507
+ "grad_norm": 2.7569284439086914,
508
+ "learning_rate": 1.333432128037937e-06,
509
+ "loss": 5.6304,
510
+ "mean_token_accuracy": 0.27707513481378554,
511
+ "num_tokens": 2582909.0,
512
  "step": 2450
513
  },
514
  {
515
+ "entropy": 5.627686910629272,
516
  "epoch": 1.4392630972941853,
517
+ "grad_norm": 1.7750262022018433,
518
+ "learning_rate": 1.3186129223473621e-06,
519
+ "loss": 5.4058,
520
+ "mean_token_accuracy": 0.3019809901714325,
521
+ "num_tokens": 2636579.0,
522
  "step": 2500
523
  },
524
  {
525
+ "entropy": 5.607026796340943,
526
  "epoch": 1.468048359240069,
527
+ "grad_norm": 3.1005160808563232,
528
+ "learning_rate": 1.3037937166567872e-06,
529
+ "loss": 5.3836,
530
+ "mean_token_accuracy": 0.30584611505270004,
531
+ "num_tokens": 2687698.0,
532
  "step": 2550
533
  },
534
  {
535
+ "entropy": 5.6909641885757445,
536
  "epoch": 1.4968336211859528,
537
+ "grad_norm": 1.6848654747009277,
538
+ "learning_rate": 1.2889745109662123e-06,
539
+ "loss": 5.4653,
540
+ "mean_token_accuracy": 0.296178964972496,
541
+ "num_tokens": 2740214.0,
542
  "step": 2600
543
  },
544
  {
545
+ "entropy": 5.619450302124023,
546
  "epoch": 1.5256188831318365,
547
+ "grad_norm": 2.469539165496826,
548
+ "learning_rate": 1.274155305275637e-06,
549
+ "loss": 5.4022,
550
+ "mean_token_accuracy": 0.3039679077267647,
551
+ "num_tokens": 2792574.0,
552
  "step": 2650
553
  },
554
  {
555
+ "entropy": 5.61073097705841,
556
  "epoch": 1.5544041450777202,
557
+ "grad_norm": 2.367810010910034,
558
+ "learning_rate": 1.259336099585062e-06,
559
+ "loss": 5.3956,
560
+ "mean_token_accuracy": 0.3051413372159004,
561
+ "num_tokens": 2845597.0,
562
  "step": 2700
563
  },
564
  {
565
+ "entropy": 5.5791136837005615,
566
  "epoch": 1.583189407023604,
567
+ "grad_norm": 2.3874764442443848,
568
+ "learning_rate": 1.2445168938944872e-06,
569
+ "loss": 5.3676,
570
+ "mean_token_accuracy": 0.3068238252401352,
571
+ "num_tokens": 2898683.0,
572
  "step": 2750
573
  },
574
  {
575
+ "entropy": 5.735381307601929,
576
  "epoch": 1.6119746689694876,
577
+ "grad_norm": 2.2097349166870117,
578
+ "learning_rate": 1.2296976882039123e-06,
579
+ "loss": 5.5239,
580
+ "mean_token_accuracy": 0.28974882304668426,
581
+ "num_tokens": 2952290.0,
582
  "step": 2800
583
  },
584
  {
585
+ "entropy": 5.55252691745758,
586
  "epoch": 1.6407599309153713,
587
+ "grad_norm": 1.694831132888794,
588
+ "learning_rate": 1.2148784825133372e-06,
589
+ "loss": 5.351,
590
+ "mean_token_accuracy": 0.3091904193162918,
591
+ "num_tokens": 3004556.0,
592
  "step": 2850
593
  },
594
  {
595
+ "entropy": 5.508773093223572,
596
  "epoch": 1.669545192861255,
597
+ "grad_norm": 1.8229279518127441,
598
+ "learning_rate": 1.200059276822762e-06,
599
+ "loss": 5.3164,
600
+ "mean_token_accuracy": 0.31158645361661913,
601
+ "num_tokens": 3056448.0,
602
  "step": 2900
603
  },
604
  {
605
+ "entropy": 5.676794271469117,
606
  "epoch": 1.6983304548071387,
607
+ "grad_norm": 1.7196234464645386,
608
+ "learning_rate": 1.1852400711321872e-06,
609
+ "loss": 5.4776,
610
+ "mean_token_accuracy": 0.2929128894209862,
611
+ "num_tokens": 3109539.0,
612
  "step": 2950
613
  },
614
  {
615
+ "entropy": 5.551529383659362,
616
  "epoch": 1.7271157167530224,
617
+ "grad_norm": 3.117525577545166,
618
+ "learning_rate": 1.1704208654416123e-06,
619
+ "loss": 5.3561,
620
+ "mean_token_accuracy": 0.30634030640125276,
621
+ "num_tokens": 3162421.0,
622
  "step": 3000
623
  },
624
  {
625
+ "entropy": 5.379635264873505,
626
  "epoch": 1.7559009786989062,
627
+ "grad_norm": 1.876755714416504,
628
+ "learning_rate": 1.1556016597510372e-06,
629
+ "loss": 5.1868,
630
+ "mean_token_accuracy": 0.32913618892431257,
631
+ "num_tokens": 3212079.0,
632
  "step": 3050
633
  },
634
  {
635
+ "entropy": 5.538804936408996,
636
  "epoch": 1.7846862406447899,
637
+ "grad_norm": 1.8670976161956787,
638
+ "learning_rate": 1.1407824540604623e-06,
639
+ "loss": 5.3494,
640
+ "mean_token_accuracy": 0.30661171555519107,
641
+ "num_tokens": 3264089.0,
642
  "step": 3100
643
  },
644
  {
645
+ "entropy": 5.258263626098633,
646
  "epoch": 1.8134715025906736,
647
+ "grad_norm": 2.748718023300171,
648
+ "learning_rate": 1.1259632483698874e-06,
649
+ "loss": 5.08,
650
+ "mean_token_accuracy": 0.3413010013103485,
651
+ "num_tokens": 3311881.0,
652
  "step": 3150
653
  },
654
  {
655
+ "entropy": 5.54539008140564,
656
  "epoch": 1.8422567645365573,
657
+ "grad_norm": 1.8556406497955322,
658
+ "learning_rate": 1.1111440426793123e-06,
659
+ "loss": 5.3614,
660
+ "mean_token_accuracy": 0.30550685405731204,
661
+ "num_tokens": 3364861.0,
662
  "step": 3200
663
  },
664
  {
665
+ "entropy": 5.5433073282241825,
666
  "epoch": 1.871042026482441,
667
+ "grad_norm": 1.8386749029159546,
668
+ "learning_rate": 1.0963248369887374e-06,
669
+ "loss": 5.3543,
670
+ "mean_token_accuracy": 0.30875524014234546,
671
+ "num_tokens": 3415911.0,
672
  "step": 3250
673
  },
674
  {
675
+ "entropy": 5.5769769477844235,
676
  "epoch": 1.8998272884283247,
677
+ "grad_norm": 1.922486662864685,
678
+ "learning_rate": 1.0815056312981623e-06,
679
+ "loss": 5.3834,
680
+ "mean_token_accuracy": 0.3035113242268562,
681
+ "num_tokens": 3468338.0,
682
  "step": 3300
683
  },
684
  {
685
+ "entropy": 5.640013842582703,
686
  "epoch": 1.9286125503742084,
687
+ "grad_norm": 2.179500102996826,
688
+ "learning_rate": 1.0666864256075874e-06,
689
+ "loss": 5.4574,
690
+ "mean_token_accuracy": 0.2947095710039139,
691
+ "num_tokens": 3521693.0,
692
  "step": 3350
693
  },
694
  {
695
+ "entropy": 5.506910061836242,
696
  "epoch": 1.9573978123200921,
697
+ "grad_norm": 1.4014379978179932,
698
+ "learning_rate": 1.0518672199170125e-06,
699
+ "loss": 5.3234,
700
+ "mean_token_accuracy": 0.3096472260355949,
701
+ "num_tokens": 3574206.0,
702
  "step": 3400
703
  },
704
  {
705
+ "entropy": 5.607311015129089,
706
  "epoch": 1.9861830742659758,
707
+ "grad_norm": 1.41231107711792,
708
+ "learning_rate": 1.0370480142264374e-06,
709
+ "loss": 5.4226,
710
+ "mean_token_accuracy": 0.2979922544956207,
711
+ "num_tokens": 3627807.0,
712
  "step": 3450
713
  },
714
  {
715
  "epoch": 2.0,
716
+ "eval_entropy": 5.831721861790951,
717
+ "eval_loss": 5.656307220458984,
718
+ "eval_mean_token_accuracy": 0.2641724460685308,
719
+ "eval_model_preparation_time": 0.0047,
720
+ "eval_num_tokens": 3650214.0,
721
+ "eval_runtime": 79.7324,
722
+ "eval_samples_per_second": 5.443,
723
+ "eval_steps_per_second": 2.722,
724
  "step": 3474
725
  },
726
  {
727
+ "entropy": 5.477711625099182,
728
  "epoch": 2.0149683362118593,
729
+ "grad_norm": 3.0133137702941895,
730
+ "learning_rate": 1.0222288085358623e-06,
731
+ "loss": 5.2957,
732
+ "mean_token_accuracy": 0.31543311327695844,
733
+ "num_tokens": 3677883.0,
734
  "step": 3500
735
  },
736
  {
737
+ "entropy": 5.599187393188476,
738
  "epoch": 2.043753598157743,
739
+ "grad_norm": 1.885867714881897,
740
+ "learning_rate": 1.0074096028452874e-06,
741
+ "loss": 5.4142,
742
+ "mean_token_accuracy": 0.3004470923542976,
743
+ "num_tokens": 3730991.0,
744
  "step": 3550
745
  },
746
  {
747
+ "entropy": 5.526448183059692,
748
  "epoch": 2.0725388601036268,
749
+ "grad_norm": 4.50788688659668,
750
+ "learning_rate": 9.925903971547125e-07,
751
+ "loss": 5.3517,
752
+ "mean_token_accuracy": 0.3069574165344238,
753
+ "num_tokens": 3783795.0,
754
  "step": 3600
755
  },
756
  {
757
+ "entropy": 5.560557870864868,
758
  "epoch": 2.1013241220495105,
759
+ "grad_norm": 1.927862524986267,
760
+ "learning_rate": 9.777711914641374e-07,
761
+ "loss": 5.3815,
762
+ "mean_token_accuracy": 0.3045575937628746,
763
+ "num_tokens": 3835526.0,
764
  "step": 3650
765
  },
766
  {
767
+ "entropy": 5.528058257102966,
768
  "epoch": 2.130109383995394,
769
+ "grad_norm": 2.164687156677246,
770
+ "learning_rate": 9.629519857735625e-07,
771
+ "loss": 5.3501,
772
+ "mean_token_accuracy": 0.3071546205878258,
773
+ "num_tokens": 3887175.0,
774
  "step": 3700
775
  },
776
  {
777
+ "entropy": 5.397617678642273,
778
  "epoch": 2.158894645941278,
779
+ "grad_norm": 2.3098385334014893,
780
+ "learning_rate": 9.481327800829875e-07,
781
+ "loss": 5.2244,
782
+ "mean_token_accuracy": 0.3226669803261757,
783
+ "num_tokens": 3938003.0,
784
  "step": 3750
785
  },
786
  {
787
+ "entropy": 5.529960298538208,
788
  "epoch": 2.1876799078871616,
789
+ "grad_norm": 1.8144755363464355,
790
+ "learning_rate": 9.333135743924125e-07,
791
+ "loss": 5.3572,
792
+ "mean_token_accuracy": 0.306032218337059,
793
+ "num_tokens": 3990451.0,
794
  "step": 3800
795
  },
796
  {
797
+ "entropy": 5.597109637260437,
798
  "epoch": 2.2164651698330453,
799
+ "grad_norm": 2.7306935787200928,
800
+ "learning_rate": 9.184943687018375e-07,
801
+ "loss": 5.4162,
802
+ "mean_token_accuracy": 0.2985941395163536,
803
+ "num_tokens": 4044048.0,
804
  "step": 3850
805
  },
806
  {
807
+ "entropy": 5.448684883117676,
808
  "epoch": 2.245250431778929,
809
+ "grad_norm": 1.8199880123138428,
810
+ "learning_rate": 9.036751630112626e-07,
811
+ "loss": 5.2775,
812
+ "mean_token_accuracy": 0.31548845052719116,
813
+ "num_tokens": 4095276.0,
814
  "step": 3900
815
  },
816
  {
817
+ "entropy": 5.5008597612380985,
818
  "epoch": 2.2740356937248127,
819
+ "grad_norm": 1.755323052406311,
820
+ "learning_rate": 8.888559573206875e-07,
821
+ "loss": 5.3274,
822
+ "mean_token_accuracy": 0.309090721309185,
823
+ "num_tokens": 4148172.0,
824
  "step": 3950
825
  },
826
  {
827
+ "entropy": 5.7040300464630125,
828
  "epoch": 2.3028209556706964,
829
+ "grad_norm": 2.3154356479644775,
830
+ "learning_rate": 8.740367516301126e-07,
831
+ "loss": 5.5239,
832
+ "mean_token_accuracy": 0.28589318484067916,
833
+ "num_tokens": 4202733.0,
834
  "step": 4000
835
  },
836
  {
837
+ "entropy": 5.549855670928955,
838
  "epoch": 2.33160621761658,
839
+ "grad_norm": 1.9549669027328491,
840
+ "learning_rate": 8.592175459395375e-07,
841
+ "loss": 5.3755,
842
+ "mean_token_accuracy": 0.3029727828502655,
843
+ "num_tokens": 4255738.0,
844
  "step": 4050
845
  },
846
  {
847
+ "entropy": 5.579690465927124,
848
  "epoch": 2.360391479562464,
849
+ "grad_norm": 1.7018866539001465,
850
+ "learning_rate": 8.443983402489626e-07,
851
+ "loss": 5.4036,
852
+ "mean_token_accuracy": 0.3001995691657066,
853
+ "num_tokens": 4308638.0,
854
  "step": 4100
855
  },
856
  {
857
+ "entropy": 5.646504878997803,
858
  "epoch": 2.3891767415083476,
859
+ "grad_norm": 1.4139262437820435,
860
+ "learning_rate": 8.295791345583877e-07,
861
+ "loss": 5.4733,
862
+ "mean_token_accuracy": 0.2912476986646652,
863
+ "num_tokens": 4363170.0,
864
  "step": 4150
865
  },
866
  {
867
+ "entropy": 5.554990992546082,
868
  "epoch": 2.4179620034542313,
869
+ "grad_norm": 1.6886577606201172,
870
+ "learning_rate": 8.147599288678126e-07,
871
+ "loss": 5.3842,
872
+ "mean_token_accuracy": 0.302762059867382,
873
+ "num_tokens": 4415607.0,
874
  "step": 4200
875
  },
876
  {
877
+ "entropy": 5.513420124053955,
878
  "epoch": 2.446747265400115,
879
+ "grad_norm": 1.3537819385528564,
880
+ "learning_rate": 7.999407231772377e-07,
881
+ "loss": 5.3408,
882
+ "mean_token_accuracy": 0.30764526218175886,
883
+ "num_tokens": 4467608.0,
884
  "step": 4250
885
  },
886
  {
887
+ "entropy": 5.561378569602966,
888
  "epoch": 2.4755325273459987,
889
+ "grad_norm": 1.8514106273651123,
890
+ "learning_rate": 7.851215174866627e-07,
891
+ "loss": 5.3891,
892
+ "mean_token_accuracy": 0.301382859647274,
893
+ "num_tokens": 4520299.0,
894
  "step": 4300
895
  },
896
  {
897
+ "entropy": 5.536689953804016,
898
  "epoch": 2.5043177892918824,
899
+ "grad_norm": 2.1830835342407227,
900
+ "learning_rate": 7.703023117960877e-07,
901
+ "loss": 5.3672,
902
+ "mean_token_accuracy": 0.3047756373882294,
903
+ "num_tokens": 4573065.0,
904
  "step": 4350
905
  },
906
  {
907
+ "entropy": 5.69776873588562,
908
  "epoch": 2.533103051237766,
909
+ "grad_norm": 1.999536156654358,
910
+ "learning_rate": 7.554831061055127e-07,
911
+ "loss": 5.5236,
912
+ "mean_token_accuracy": 0.2868007507920265,
913
+ "num_tokens": 4626807.0,
914
  "step": 4400
915
  },
916
  {
917
+ "entropy": 5.3977436876297,
918
  "epoch": 2.56188831318365,
919
+ "grad_norm": 1.9608020782470703,
920
+ "learning_rate": 7.406639004149378e-07,
921
+ "loss": 5.2335,
922
+ "mean_token_accuracy": 0.3199601462483406,
923
+ "num_tokens": 4677663.0,
924
  "step": 4450
925
  },
926
  {
927
+ "entropy": 5.6681678771972654,
928
  "epoch": 2.5906735751295336,
929
+ "grad_norm": 1.829047441482544,
930
+ "learning_rate": 7.258446947243627e-07,
931
+ "loss": 5.491,
932
+ "mean_token_accuracy": 0.2894612854719162,
933
+ "num_tokens": 4731830.0,
934
  "step": 4500
935
  },
936
  {
937
+ "entropy": 5.49174174785614,
938
  "epoch": 2.6194588370754173,
939
+ "grad_norm": 1.3158719539642334,
940
+ "learning_rate": 7.110254890337878e-07,
941
+ "loss": 5.3225,
942
+ "mean_token_accuracy": 0.3084965732693672,
943
+ "num_tokens": 4784694.0,
944
  "step": 4550
945
  },
946
  {
947
+ "entropy": 5.573234438896179,
948
  "epoch": 2.648244099021301,
949
+ "grad_norm": 1.562915325164795,
950
+ "learning_rate": 6.962062833432127e-07,
951
+ "loss": 5.4028,
952
+ "mean_token_accuracy": 0.2989520016312599,
953
+ "num_tokens": 4838534.0,
954
  "step": 4600
955
  },
956
  {
957
+ "entropy": 5.550469598770142,
958
  "epoch": 2.6770293609671847,
959
+ "grad_norm": 2.114727735519409,
960
+ "learning_rate": 6.813870776526378e-07,
961
+ "loss": 5.3804,
962
+ "mean_token_accuracy": 0.30373542964458466,
963
+ "num_tokens": 4890611.0,
964
  "step": 4650
965
  },
966
  {
967
+ "entropy": 5.523049550056458,
968
  "epoch": 2.7058146229130684,
969
+ "grad_norm": 2.5036823749542236,
970
+ "learning_rate": 6.665678719620628e-07,
971
+ "loss": 5.3542,
972
+ "mean_token_accuracy": 0.30681024432182313,
973
+ "num_tokens": 4943571.0,
974
  "step": 4700
975
  },
976
  {
977
+ "entropy": 5.323453049659729,
978
  "epoch": 2.734599884858952,
979
+ "grad_norm": 1.8069168329238892,
980
+ "learning_rate": 6.517486662714878e-07,
981
+ "loss": 5.1583,
982
+ "mean_token_accuracy": 0.32906652927398683,
983
+ "num_tokens": 4993871.0,
984
  "step": 4750
985
  },
986
  {
987
+ "entropy": 5.504038324356079,
988
  "epoch": 2.763385146804836,
989
+ "grad_norm": 4.750283718109131,
990
+ "learning_rate": 6.369294605809128e-07,
991
+ "loss": 5.3366,
992
+ "mean_token_accuracy": 0.3087608867883682,
993
+ "num_tokens": 5046187.0,
994
  "step": 4800
995
  },
996
  {
997
+ "entropy": 5.487624549865723,
998
  "epoch": 2.7921704087507195,
999
+ "grad_norm": 1.4186172485351562,
1000
+ "learning_rate": 6.221102548903379e-07,
1001
+ "loss": 5.3237,
1002
+ "mean_token_accuracy": 0.3088638699054718,
1003
+ "num_tokens": 5098644.0,
1004
  "step": 4850
1005
  },
1006
  {
1007
+ "entropy": 5.346905107498169,
1008
  "epoch": 2.8209556706966032,
1009
+ "grad_norm": 1.5670177936553955,
1010
+ "learning_rate": 6.072910491997628e-07,
1011
+ "loss": 5.1849,
1012
+ "mean_token_accuracy": 0.3265886321663857,
1013
+ "num_tokens": 5149345.0,
1014
  "step": 4900
1015
  },
1016
  {
1017
+ "entropy": 5.510410032272339,
1018
  "epoch": 2.849740932642487,
1019
+ "grad_norm": 7.489855766296387,
1020
+ "learning_rate": 5.924718435091879e-07,
1021
+ "loss": 5.3424,
1022
+ "mean_token_accuracy": 0.30768151730299,
1023
+ "num_tokens": 5202028.0,
1024
  "step": 4950
1025
  },
1026
  {
1027
+ "entropy": 5.525181493759155,
1028
  "epoch": 2.8785261945883707,
1029
+ "grad_norm": 1.8829196691513062,
1030
+ "learning_rate": 5.776526378186128e-07,
1031
+ "loss": 5.3654,
1032
+ "mean_token_accuracy": 0.30342737555503846,
1033
+ "num_tokens": 5255082.0,
1034
  "step": 5000
1035
  },
1036
  {
1037
+ "entropy": 5.374098634719848,
1038
  "epoch": 2.9073114565342544,
1039
+ "grad_norm": 1.3901060819625854,
1040
+ "learning_rate": 5.628334321280379e-07,
1041
+ "loss": 5.2103,
1042
+ "mean_token_accuracy": 0.3233291879296303,
1043
+ "num_tokens": 5305042.0,
1044
  "step": 5050
1045
  },
1046
  {
1047
+ "entropy": 5.374619431495667,
1048
  "epoch": 2.936096718480138,
1049
+ "grad_norm": 1.6586560010910034,
1050
+ "learning_rate": 5.48014226437463e-07,
1051
+ "loss": 5.2125,
1052
+ "mean_token_accuracy": 0.322759662270546,
1053
+ "num_tokens": 5356310.0,
1054
  "step": 5100
1055
  },
1056
  {
1057
+ "entropy": 5.527479724884033,
1058
  "epoch": 2.964881980426022,
1059
+ "grad_norm": 1.6678485870361328,
1060
+ "learning_rate": 5.331950207468879e-07,
1061
+ "loss": 5.3627,
1062
+ "mean_token_accuracy": 0.30430852621793747,
1063
+ "num_tokens": 5409283.0,
1064
  "step": 5150
1065
  },
1066
  {
1067
+ "entropy": 5.6171248292922975,
1068
  "epoch": 2.9936672423719055,
1069
+ "grad_norm": 1.50790274143219,
1070
+ "learning_rate": 5.18375815056313e-07,
1071
+ "loss": 5.4484,
1072
+ "mean_token_accuracy": 0.29375598043203355,
1073
+ "num_tokens": 5464332.0,
1074
  "step": 5200
1075
  },
1076
  {
1077
  "epoch": 3.0,
1078
+ "eval_entropy": 5.78779639186947,
1079
+ "eval_loss": 5.628758430480957,
1080
+ "eval_mean_token_accuracy": 0.2653660801698535,
1081
+ "eval_model_preparation_time": 0.0047,
1082
+ "eval_num_tokens": 5475321.0,
1083
+ "eval_runtime": 80.3676,
1084
+ "eval_samples_per_second": 5.4,
1085
+ "eval_steps_per_second": 2.7,
1086
  "step": 5211
1087
  },
1088
  {
1089
+ "entropy": 5.323350539207459,
1090
  "epoch": 3.0224525043177892,
1091
+ "grad_norm": 2.033228635787964,
1092
+ "learning_rate": 5.03556609365738e-07,
1093
+ "loss": 5.1623,
1094
+ "mean_token_accuracy": 0.32844111531972886,
1095
+ "num_tokens": 5514450.0,
1096
  "step": 5250
1097
  },
1098
  {
1099
+ "entropy": 5.509175033569336,
1100
  "epoch": 3.051237766263673,
1101
+ "grad_norm": 1.4281281232833862,
1102
+ "learning_rate": 4.88737403675163e-07,
1103
+ "loss": 5.3403,
1104
+ "mean_token_accuracy": 0.30768867909908293,
1105
+ "num_tokens": 5567345.0,
1106
  "step": 5300
1107
  },
1108
  {
1109
+ "entropy": 5.4536163854599,
1110
  "epoch": 3.0800230282095566,
1111
+ "grad_norm": 2.0320699214935303,
1112
+ "learning_rate": 4.73918197984588e-07,
1113
+ "loss": 5.2898,
1114
+ "mean_token_accuracy": 0.31407355904579165,
1115
+ "num_tokens": 5619654.0,
1116
  "step": 5350
1117
  },
1118
  {
1119
+ "entropy": 5.487306084632873,
1120
  "epoch": 3.1088082901554404,
1121
+ "grad_norm": 1.2829618453979492,
1122
+ "learning_rate": 4.59098992294013e-07,
1123
+ "loss": 5.3204,
1124
+ "mean_token_accuracy": 0.30913869380950926,
1125
+ "num_tokens": 5672269.0,
1126
  "step": 5400
1127
  },
1128
  {
1129
+ "entropy": 5.569495844841003,
1130
  "epoch": 3.137593552101324,
1131
+ "grad_norm": 2.231628656387329,
1132
+ "learning_rate": 4.44279786603438e-07,
1133
+ "loss": 5.4045,
1134
+ "mean_token_accuracy": 0.30076681196689603,
1135
+ "num_tokens": 5725059.0,
1136
  "step": 5450
1137
  },
1138
  {
1139
+ "entropy": 5.499957413673401,
1140
  "epoch": 3.166378814047208,
1141
+ "grad_norm": 1.549865484237671,
1142
+ "learning_rate": 4.2946058091286305e-07,
1143
+ "loss": 5.3415,
1144
+ "mean_token_accuracy": 0.30755339056253433,
1145
+ "num_tokens": 5776784.0,
1146
  "step": 5500
1147
  },
1148
  {
1149
+ "entropy": 5.664071002006531,
1150
  "epoch": 3.1951640759930915,
1151
+ "grad_norm": 1.2153443098068237,
1152
+ "learning_rate": 4.146413752222881e-07,
1153
+ "loss": 5.4948,
1154
+ "mean_token_accuracy": 0.28785294711589815,
1155
+ "num_tokens": 5832296.0,
1156
  "step": 5550
1157
  },
1158
  {
1159
+ "entropy": 5.516234860420227,
1160
  "epoch": 3.223949337938975,
1161
+ "grad_norm": 1.0542709827423096,
1162
+ "learning_rate": 3.998221695317131e-07,
1163
+ "loss": 5.3465,
1164
+ "mean_token_accuracy": 0.3083792108297348,
1165
+ "num_tokens": 5885122.0,
1166
  "step": 5600
1167
  },
1168
  {
1169
+ "entropy": 5.500826091766357,
1170
  "epoch": 3.252734599884859,
1171
+ "grad_norm": 2.2477681636810303,
1172
+ "learning_rate": 3.850029638411381e-07,
1173
+ "loss": 5.3385,
1174
+ "mean_token_accuracy": 0.30737883657217024,
1175
+ "num_tokens": 5938386.0,
1176
  "step": 5650
1177
  },
1178
  {
1179
+ "entropy": 5.517533864974975,
1180
  "epoch": 3.2815198618307426,
1181
+ "grad_norm": 1.03904128074646,
1182
+ "learning_rate": 3.7018375815056315e-07,
1183
+ "loss": 5.3533,
1184
+ "mean_token_accuracy": 0.3064529225230217,
1185
+ "num_tokens": 5989784.0,
1186
  "step": 5700
1187
  },
1188
  {
1189
+ "entropy": 5.543709697723389,
1190
  "epoch": 3.3103051237766263,
1191
+ "grad_norm": 1.562757134437561,
1192
+ "learning_rate": 3.5536455245998815e-07,
1193
+ "loss": 5.3766,
1194
+ "mean_token_accuracy": 0.3036728450655937,
1195
+ "num_tokens": 6042646.0,
1196
  "step": 5750
1197
  },
1198
  {
1199
+ "entropy": 5.389412899017334,
1200
  "epoch": 3.33909038572251,
1201
+ "grad_norm": 2.2124178409576416,
1202
+ "learning_rate": 3.4054534676941315e-07,
1203
+ "loss": 5.2287,
1204
+ "mean_token_accuracy": 0.32173423111438754,
1205
+ "num_tokens": 6093550.0,
1206
  "step": 5800
1207
  },
1208
  {
1209
+ "entropy": 5.236968355178833,
1210
  "epoch": 3.3678756476683938,
1211
+ "grad_norm": 2.146965503692627,
1212
+ "learning_rate": 3.2572614107883814e-07,
1213
+ "loss": 5.0793,
1214
+ "mean_token_accuracy": 0.3410212889313698,
1215
+ "num_tokens": 6142299.0,
1216
  "step": 5850
1217
  },
1218
  {
1219
+ "entropy": 5.459367966651916,
1220
  "epoch": 3.3966609096142775,
1221
+ "grad_norm": 1.0992231369018555,
1222
+ "learning_rate": 3.109069353882632e-07,
1223
+ "loss": 5.2978,
1224
+ "mean_token_accuracy": 0.31258249312639236,
1225
+ "num_tokens": 6194315.0,
1226
  "step": 5900
1227
  },
1228
  {
1229
+ "entropy": 5.526850900650024,
1230
  "epoch": 3.425446171560161,
1231
+ "grad_norm": 2.137270212173462,
1232
+ "learning_rate": 2.960877296976882e-07,
1233
+ "loss": 5.3598,
1234
+ "mean_token_accuracy": 0.3052875977754593,
1235
+ "num_tokens": 6246032.0,
1236
  "step": 5950
1237
  },
1238
  {
1239
+ "entropy": 5.573816101551056,
1240
  "epoch": 3.454231433506045,
1241
+ "grad_norm": 1.5624985694885254,
1242
+ "learning_rate": 2.812685240071132e-07,
1243
+ "loss": 5.4081,
1244
+ "mean_token_accuracy": 0.2992635017633438,
1245
+ "num_tokens": 6300018.0,
1246
  "step": 6000
1247
  },
1248
  {
1249
+ "entropy": 5.514087476730347,
1250
  "epoch": 3.4830166954519286,
1251
+ "grad_norm": 1.2660338878631592,
1252
+ "learning_rate": 2.664493183165382e-07,
1253
+ "loss": 5.3472,
1254
+ "mean_token_accuracy": 0.3070674228668213,
1255
+ "num_tokens": 6352988.0,
1256
  "step": 6050
1257
  },
1258
  {
1259
+ "entropy": 5.430188207626343,
1260
  "epoch": 3.5118019573978123,
1261
+ "grad_norm": 1.2666460275650024,
1262
+ "learning_rate": 2.5163011262596324e-07,
1263
+ "loss": 5.2645,
1264
+ "mean_token_accuracy": 0.31776045858860014,
1265
+ "num_tokens": 6405116.0,
1266
  "step": 6100
1267
  },
1268
  {
1269
+ "entropy": 5.5897090005874634,
1270
  "epoch": 3.540587219343696,
1271
+ "grad_norm": 1.275363802909851,
1272
+ "learning_rate": 2.3681090693538824e-07,
1273
+ "loss": 5.4265,
1274
+ "mean_token_accuracy": 0.297469447851181,
1275
+ "num_tokens": 6458789.0,
1276
  "step": 6150
1277
  },
1278
  {
1279
+ "entropy": 5.422791337966919,
1280
  "epoch": 3.5693724812895797,
1281
+ "grad_norm": 2.2392683029174805,
1282
+ "learning_rate": 2.2199170124481327e-07,
1283
+ "loss": 5.2608,
1284
+ "mean_token_accuracy": 0.3180572906136513,
1285
+ "num_tokens": 6510168.0,
1286
  "step": 6200
1287
  },
1288
  {
1289
+ "entropy": 5.408909387588501,
1290
  "epoch": 3.5981577432354634,
1291
+ "grad_norm": 2.821279525756836,
1292
+ "learning_rate": 2.071724955542383e-07,
1293
+ "loss": 5.2455,
1294
+ "mean_token_accuracy": 0.316647432744503,
1295
+ "num_tokens": 6562528.0,
1296
  "step": 6250
1297
  },
1298
  {
1299
+ "entropy": 5.657666215896606,
1300
  "epoch": 3.626943005181347,
1301
+ "grad_norm": 3.261878490447998,
1302
+ "learning_rate": 1.9235328986366332e-07,
1303
+ "loss": 5.4941,
1304
+ "mean_token_accuracy": 0.28845800429582596,
1305
+ "num_tokens": 6617308.0,
1306
  "step": 6300
1307
  },
1308
  {
1309
+ "entropy": 5.446933870315552,
1310
  "epoch": 3.655728267127231,
1311
+ "grad_norm": 1.1171406507492065,
1312
+ "learning_rate": 1.7753408417308832e-07,
1313
+ "loss": 5.2848,
1314
+ "mean_token_accuracy": 0.31402444154024123,
1315
+ "num_tokens": 6669969.0,
1316
  "step": 6350
1317
  },
1318
  {
1319
+ "entropy": 5.605754513740539,
1320
  "epoch": 3.6845135290731146,
1321
+ "grad_norm": 2.066650152206421,
1322
+ "learning_rate": 1.6271487848251334e-07,
1323
+ "loss": 5.4447,
1324
+ "mean_token_accuracy": 0.2945487481355667,
1325
+ "num_tokens": 6724425.0,
1326
  "step": 6400
1327
  },
1328
  {
1329
+ "entropy": 5.39195601940155,
1330
  "epoch": 3.7132987910189983,
1331
+ "grad_norm": 1.6908842325210571,
1332
+ "learning_rate": 1.4789567279193834e-07,
1333
+ "loss": 5.2298,
1334
+ "mean_token_accuracy": 0.3206364804506302,
1335
+ "num_tokens": 6775236.0,
1336
  "step": 6450
1337
  },
1338
  {
1339
+ "entropy": 5.514347395896912,
1340
  "epoch": 3.742084052964882,
1341
+ "grad_norm": 1.166090726852417,
1342
+ "learning_rate": 1.3307646710136337e-07,
1343
+ "loss": 5.3517,
1344
+ "mean_token_accuracy": 0.30615471601486205,
1345
+ "num_tokens": 6828545.0,
1346
  "step": 6500
1347
  },
1348
  {
1349
+ "entropy": 5.6728374910354615,
1350
  "epoch": 3.7708693149107657,
1351
+ "grad_norm": 2.3615996837615967,
1352
+ "learning_rate": 1.1825726141078837e-07,
1353
+ "loss": 5.5058,
1354
+ "mean_token_accuracy": 0.28638383001089096,
1355
+ "num_tokens": 6884005.0,
1356
  "step": 6550
1357
  },
1358
  {
1359
+ "entropy": 5.4262278175354,
1360
  "epoch": 3.7996545768566494,
1361
+ "grad_norm": 1.7658995389938354,
1362
+ "learning_rate": 1.0343805572021339e-07,
1363
+ "loss": 5.2617,
1364
+ "mean_token_accuracy": 0.31743784427642824,
1365
+ "num_tokens": 6935209.0,
1366
  "step": 6600
1367
  },
1368
  {
1369
+ "entropy": 5.436288638114929,
1370
  "epoch": 3.828439838802533,
1371
+ "grad_norm": 3.455641269683838,
1372
+ "learning_rate": 8.861885002963842e-08,
1373
+ "loss": 5.2706,
1374
+ "mean_token_accuracy": 0.31677050977945326,
1375
+ "num_tokens": 6987396.0,
1376
  "step": 6650
1377
  },
1378
  {
1379
+ "entropy": 5.586358890533448,
1380
  "epoch": 3.857225100748417,
1381
+ "grad_norm": 1.981423020362854,
1382
+ "learning_rate": 7.379964433906343e-08,
1383
+ "loss": 5.4191,
1384
+ "mean_token_accuracy": 0.2982942935824394,
1385
+ "num_tokens": 7041132.0,
1386
  "step": 6700
1387
  },
1388
  {
1389
+ "entropy": 5.494750590324402,
1390
  "epoch": 3.8860103626943006,
1391
+ "grad_norm": 1.7962652444839478,
1392
+ "learning_rate": 5.8980438648488434e-08,
1393
+ "loss": 5.3306,
1394
+ "mean_token_accuracy": 0.3082431614398956,
1395
+ "num_tokens": 7094059.0,
1396
  "step": 6750
1397
  },
1398
  {
1399
+ "entropy": 5.393875141143798,
1400
  "epoch": 3.9147956246401843,
1401
+ "grad_norm": 1.8328484296798706,
1402
+ "learning_rate": 4.416123295791346e-08,
1403
+ "loss": 5.2351,
1404
+ "mean_token_accuracy": 0.3187332367897034,
1405
+ "num_tokens": 7144964.0,
1406
  "step": 6800
1407
  },
1408
  {
1409
+ "entropy": 5.660646886825561,
1410
  "epoch": 3.943580886586068,
1411
+ "grad_norm": 0.8133105039596558,
1412
+ "learning_rate": 2.934202726733847e-08,
1413
+ "loss": 5.4946,
1414
+ "mean_token_accuracy": 0.2876924830675125,
1415
+ "num_tokens": 7200805.0,
1416
  "step": 6850
1417
  },
1418
  {
1419
+ "entropy": 5.239456839561463,
1420
  "epoch": 3.9723661485319517,
1421
+ "grad_norm": 7.838026523590088,
1422
+ "learning_rate": 1.4522821576763486e-08,
1423
+ "loss": 5.0866,
1424
+ "mean_token_accuracy": 0.33811178654432295,
1425
+ "num_tokens": 7250918.0,
1426
  "step": 6900
1427
  },
1428
  {
1429
  "epoch": 4.0,
1430
+ "eval_entropy": 5.780879339314826,
1431
+ "eval_loss": 5.622366428375244,
1432
+ "eval_mean_token_accuracy": 0.26563407995733795,
1433
+ "eval_model_preparation_time": 0.0047,
1434
+ "eval_num_tokens": 7300428.0,
1435
+ "eval_runtime": 80.4424,
1436
+ "eval_samples_per_second": 5.395,
1437
+ "eval_steps_per_second": 2.698,
1438
  "step": 6948
1439
  }
1440
  ],
1441
  "logging_steps": 50,
1442
+ "max_steps": 6948,
1443
  "num_input_tokens_seen": 0,
1444
+ "num_train_epochs": 4,
1445
  "save_steps": 500,
1446
  "stateful_callbacks": {
1447
  "TrainerControl": {
 
1450
  "should_evaluate": false,
1451
  "should_log": false,
1452
  "should_save": true,
1453
+ "should_training_stop": true
1454
  },
1455
  "attributes": {}
1456
  }
1457
  },
1458
+ "total_flos": 1.0021019691282432e+17,
1459
  "train_batch_size": 2,
1460
  "trial_name": null,
1461
  "trial_params": null
checkpoint-6948/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:130d33149272782bd60306263c371036419926142b8999aad7806359168f8484
3
  size 6225
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8db5c304963110404ebb6947b83ba95bd9b8aad1f9b8b578cc33c46d601e13dc
3
  size 6225
checkpoint-8685/adapter_config.json CHANGED
@@ -16,7 +16,7 @@
16
  "layers_pattern": null,
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
- "lora_alpha": 16,
20
  "lora_bias": false,
21
  "lora_dropout": 0.1,
22
  "megatron_config": null,
@@ -29,8 +29,10 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
 
32
  "q_proj",
33
- "v_proj"
34
  ],
35
  "target_parameters": null,
36
  "task_type": "CAUSAL_LM",
 
16
  "layers_pattern": null,
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
+ "lora_alpha": 32,
20
  "lora_bias": false,
21
  "lora_dropout": 0.1,
22
  "megatron_config": null,
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
+ "k_proj",
33
+ "v_proj",
34
  "q_proj",
35
+ "o_proj"
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
checkpoint-8685/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:76b5201211b5dac5150a2b3a87809a5671a1239a76fdfafed2618f15a157a612
3
- size 4374520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2381d61542c1032294bdfd8d93b87c507ec0307a2bd423dfa1c90ac19f153434
3
+ size 8749064
checkpoint-8685/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:01d27e4d4e2843533494998d1773c9af8797769c8e1900f756f6eb1a61546355
3
- size 8783179
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36cfb0e2c01a3583f649b7157010998f7cfe60c81f2d8dd9f8a236e6ac0ea717
3
+ size 17621003