ddore14 commited on
Commit
6113166
·
verified ·
1 Parent(s): d32c9ea

Upload 6 files

Browse files
config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "MPNetForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "dtype": "float16",
8
+ "eos_token_id": 2,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-05,
15
+ "max_position_embeddings": 514,
16
+ "model_type": "mpnet",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 1,
20
+ "relative_attention_num_buckets": 32,
21
+ "tie_word_embeddings": true,
22
+ "transformers_version": "5.0.0",
23
+ "use_cache": false,
24
+ "vocab_size": 30527
25
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:345e644416dff53ce516d961eeb96dea467ada38a7172d8dbefd5f62144db8bb
3
+ size 266011004
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<s>",
4
+ "cls_token": "<s>",
5
+ "do_lower_case": true,
6
+ "eos_token": "</s>",
7
+ "is_local": true,
8
+ "mask_token": "<mask>",
9
+ "model_max_length": 512,
10
+ "pad_token": "<pad>",
11
+ "sep_token": "</s>",
12
+ "strip_accents": null,
13
+ "tokenize_chinese_chars": true,
14
+ "tokenizer_class": "MPNetTokenizer",
15
+ "unk_token": "[UNK]"
16
+ }
trainer_state.json ADDED
@@ -0,0 +1,1432 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 16.515276630883566,
6
+ "eval_steps": 2000,
7
+ "global_step": 120000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0,
14
+ "eval_accuracy": 1.2790688050781904e-05,
15
+ "eval_loss": 115.5,
16
+ "eval_runtime": 200.9168,
17
+ "eval_samples_per_second": 8229.121,
18
+ "eval_steps_per_second": 16.076,
19
+ "step": 0
20
+ },
21
+ {
22
+ "epoch": 0.13762730525736305,
23
+ "grad_norm": 26.74563980102539,
24
+ "learning_rate": 2.9639999999999997e-05,
25
+ "loss": 141.26440625,
26
+ "step": 1000
27
+ },
28
+ {
29
+ "epoch": 0.2752546105147261,
30
+ "grad_norm": 22.885421752929688,
31
+ "learning_rate": 5.964e-05,
32
+ "loss": 59.12350390625,
33
+ "step": 2000
34
+ },
35
+ {
36
+ "epoch": 0.2752546105147261,
37
+ "eval_accuracy": 0.676248915563685,
38
+ "eval_loss": 12.453125,
39
+ "eval_runtime": 168.1994,
40
+ "eval_samples_per_second": 9829.817,
41
+ "eval_steps_per_second": 19.203,
42
+ "step": 2000
43
+ },
44
+ {
45
+ "epoch": 0.41288191577208916,
46
+ "grad_norm": 21.04855728149414,
47
+ "learning_rate": 8.957999999999998e-05,
48
+ "loss": 50.17753125,
49
+ "step": 3000
50
+ },
51
+ {
52
+ "epoch": 0.5505092210294522,
53
+ "grad_norm": 20.75350570678711,
54
+ "learning_rate": 0.00011957999999999999,
55
+ "loss": 46.33467578125,
56
+ "step": 4000
57
+ },
58
+ {
59
+ "epoch": 0.5505092210294522,
60
+ "eval_accuracy": 0.7075093226563655,
61
+ "eval_loss": 10.6796875,
62
+ "eval_runtime": 167.9148,
63
+ "eval_samples_per_second": 9846.476,
64
+ "eval_steps_per_second": 19.236,
65
+ "step": 4000
66
+ },
67
+ {
68
+ "epoch": 0.6881365262868153,
69
+ "grad_norm": 19.514423370361328,
70
+ "learning_rate": 0.00014948999999999998,
71
+ "loss": 44.1445078125,
72
+ "step": 5000
73
+ },
74
+ {
75
+ "epoch": 0.8257638315441783,
76
+ "grad_norm": 18.586366653442383,
77
+ "learning_rate": 0.00017949,
78
+ "loss": 42.90046875,
79
+ "step": 6000
80
+ },
81
+ {
82
+ "epoch": 0.8257638315441783,
83
+ "eval_accuracy": 0.7171687154928088,
84
+ "eval_loss": 10.140625,
85
+ "eval_runtime": 169.3136,
86
+ "eval_samples_per_second": 9765.126,
87
+ "eval_steps_per_second": 19.077,
88
+ "step": 6000
89
+ },
90
+ {
91
+ "epoch": 0.9633911368015414,
92
+ "grad_norm": 17.66230010986328,
93
+ "learning_rate": 0.00020946,
94
+ "loss": 41.96175390625,
95
+ "step": 7000
96
+ },
97
+ {
98
+ "epoch": 1.1010184420589044,
99
+ "grad_norm": 19.431285858154297,
100
+ "learning_rate": 0.0002394,
101
+ "loss": 41.42766015625,
102
+ "step": 8000
103
+ },
104
+ {
105
+ "epoch": 1.1010184420589044,
106
+ "eval_accuracy": 0.7211219507641113,
107
+ "eval_loss": 9.921875,
108
+ "eval_runtime": 170.601,
109
+ "eval_samples_per_second": 9691.437,
110
+ "eval_steps_per_second": 18.933,
111
+ "step": 8000
112
+ },
113
+ {
114
+ "epoch": 1.2386457473162675,
115
+ "grad_norm": 19.449247360229492,
116
+ "learning_rate": 0.00026933999999999997,
117
+ "loss": 41.1454921875,
118
+ "step": 9000
119
+ },
120
+ {
121
+ "epoch": 1.3762730525736306,
122
+ "grad_norm": 21.419654846191406,
123
+ "learning_rate": 0.00029934,
124
+ "loss": 40.98086328125,
125
+ "step": 10000
126
+ },
127
+ {
128
+ "epoch": 1.3762730525736306,
129
+ "eval_accuracy": 0.7225334877311377,
130
+ "eval_loss": 9.8359375,
131
+ "eval_runtime": 167.923,
132
+ "eval_samples_per_second": 9845.998,
133
+ "eval_steps_per_second": 19.235,
134
+ "step": 10000
135
+ },
136
+ {
137
+ "epoch": 1.5139003578309937,
138
+ "grad_norm": 17.391115188598633,
139
+ "learning_rate": 0.0003,
140
+ "loss": 40.79280078125,
141
+ "step": 11000
142
+ },
143
+ {
144
+ "epoch": 1.6515276630883569,
145
+ "grad_norm": 29.895591735839844,
146
+ "learning_rate": 0.0003,
147
+ "loss": 40.4991640625,
148
+ "step": 12000
149
+ },
150
+ {
151
+ "epoch": 1.6515276630883569,
152
+ "eval_accuracy": 0.7232843609541934,
153
+ "eval_loss": 9.78125,
154
+ "eval_runtime": 170.7512,
155
+ "eval_samples_per_second": 9682.91,
156
+ "eval_steps_per_second": 18.916,
157
+ "step": 12000
158
+ },
159
+ {
160
+ "epoch": 1.7891549683457197,
161
+ "grad_norm": 16.52672576904297,
162
+ "learning_rate": 0.0003,
163
+ "loss": 40.5724921875,
164
+ "step": 13000
165
+ },
166
+ {
167
+ "epoch": 1.9267822736030829,
168
+ "grad_norm": 21.801998138427734,
169
+ "learning_rate": 0.0003,
170
+ "loss": 40.29026953125,
171
+ "step": 14000
172
+ },
173
+ {
174
+ "epoch": 1.9267822736030829,
175
+ "eval_accuracy": 0.7247579731972368,
176
+ "eval_loss": 9.7109375,
177
+ "eval_runtime": 169.376,
178
+ "eval_samples_per_second": 9761.53,
179
+ "eval_steps_per_second": 19.07,
180
+ "step": 14000
181
+ },
182
+ {
183
+ "epoch": 2.0644095788604457,
184
+ "grad_norm": 16.79710578918457,
185
+ "learning_rate": 0.0003,
186
+ "loss": 40.192109375,
187
+ "step": 15000
188
+ },
189
+ {
190
+ "epoch": 2.202036884117809,
191
+ "grad_norm": 19.166486740112305,
192
+ "learning_rate": 0.0003,
193
+ "loss": 39.6695,
194
+ "step": 16000
195
+ },
196
+ {
197
+ "epoch": 2.202036884117809,
198
+ "eval_accuracy": 0.726963905479327,
199
+ "eval_loss": 9.6171875,
200
+ "eval_runtime": 170.9384,
201
+ "eval_samples_per_second": 9672.308,
202
+ "eval_steps_per_second": 18.896,
203
+ "step": 16000
204
+ },
205
+ {
206
+ "epoch": 2.339664189375172,
207
+ "grad_norm": 15.058335304260254,
208
+ "learning_rate": 0.0003,
209
+ "loss": 39.4829609375,
210
+ "step": 17000
211
+ },
212
+ {
213
+ "epoch": 2.477291494632535,
214
+ "grad_norm": 15.027190208435059,
215
+ "learning_rate": 0.0003,
216
+ "loss": 39.3190859375,
217
+ "step": 18000
218
+ },
219
+ {
220
+ "epoch": 2.477291494632535,
221
+ "eval_accuracy": 0.7287604009112041,
222
+ "eval_loss": 9.546875,
223
+ "eval_runtime": 170.8414,
224
+ "eval_samples_per_second": 9677.8,
225
+ "eval_steps_per_second": 18.906,
226
+ "step": 18000
227
+ },
228
+ {
229
+ "epoch": 2.614918799889898,
230
+ "grad_norm": 17.88246726989746,
231
+ "learning_rate": 0.0003,
232
+ "loss": 39.1439296875,
233
+ "step": 19000
234
+ },
235
+ {
236
+ "epoch": 2.7525461051472613,
237
+ "grad_norm": 63.608863830566406,
238
+ "learning_rate": 0.0003,
239
+ "loss": 39.0614765625,
240
+ "step": 20000
241
+ },
242
+ {
243
+ "epoch": 2.7525461051472613,
244
+ "eval_accuracy": 0.7304020938141926,
245
+ "eval_loss": 9.46875,
246
+ "eval_runtime": 169.4479,
247
+ "eval_samples_per_second": 9757.391,
248
+ "eval_steps_per_second": 19.062,
249
+ "step": 20000
250
+ },
251
+ {
252
+ "epoch": 2.8901734104046244,
253
+ "grad_norm": 16.970745086669922,
254
+ "learning_rate": 0.0003,
255
+ "loss": 38.933671875,
256
+ "step": 21000
257
+ },
258
+ {
259
+ "epoch": 3.0278007156619875,
260
+ "grad_norm": 19.159212112426758,
261
+ "learning_rate": 0.0003,
262
+ "loss": 39.2828828125,
263
+ "step": 22000
264
+ },
265
+ {
266
+ "epoch": 3.0278007156619875,
267
+ "eval_accuracy": 0.7285464859640399,
268
+ "eval_loss": 9.5390625,
269
+ "eval_runtime": 168.2153,
270
+ "eval_samples_per_second": 9828.886,
271
+ "eval_steps_per_second": 19.202,
272
+ "step": 22000
273
+ },
274
+ {
275
+ "epoch": 3.1654280209193506,
276
+ "grad_norm": 28.30343246459961,
277
+ "learning_rate": 0.0003,
278
+ "loss": 39.1001796875,
279
+ "step": 23000
280
+ },
281
+ {
282
+ "epoch": 3.3030553261767133,
283
+ "grad_norm": 14.642151832580566,
284
+ "learning_rate": 0.0003,
285
+ "loss": 38.792859375,
286
+ "step": 24000
287
+ },
288
+ {
289
+ "epoch": 3.3030553261767133,
290
+ "eval_accuracy": 0.7310024378583697,
291
+ "eval_loss": 9.421875,
292
+ "eval_runtime": 168.799,
293
+ "eval_samples_per_second": 9794.899,
294
+ "eval_steps_per_second": 19.135,
295
+ "step": 24000
296
+ },
297
+ {
298
+ "epoch": 3.4406826314340764,
299
+ "grad_norm": 32.9803352355957,
300
+ "learning_rate": 0.0003,
301
+ "loss": 38.72437890625,
302
+ "step": 25000
303
+ },
304
+ {
305
+ "epoch": 3.5783099366914395,
306
+ "grad_norm": 15.326594352722168,
307
+ "learning_rate": 0.0003,
308
+ "loss": 38.7340703125,
309
+ "step": 26000
310
+ },
311
+ {
312
+ "epoch": 3.5783099366914395,
313
+ "eval_accuracy": 0.7314286407398777,
314
+ "eval_loss": 9.390625,
315
+ "eval_runtime": 168.4702,
316
+ "eval_samples_per_second": 9814.015,
317
+ "eval_steps_per_second": 19.173,
318
+ "step": 26000
319
+ },
320
+ {
321
+ "epoch": 3.7159372419488026,
322
+ "grad_norm": 17.456626892089844,
323
+ "learning_rate": 0.0003,
324
+ "loss": 38.50905078125,
325
+ "step": 27000
326
+ },
327
+ {
328
+ "epoch": 3.8535645472061657,
329
+ "grad_norm": 19.74641990661621,
330
+ "learning_rate": 0.0003,
331
+ "loss": 38.4493125,
332
+ "step": 28000
333
+ },
334
+ {
335
+ "epoch": 3.8535645472061657,
336
+ "eval_accuracy": 0.7326063700018453,
337
+ "eval_loss": 9.3359375,
338
+ "eval_runtime": 167.8973,
339
+ "eval_samples_per_second": 9847.504,
340
+ "eval_steps_per_second": 19.238,
341
+ "step": 28000
342
+ },
343
+ {
344
+ "epoch": 3.991191852463529,
345
+ "grad_norm": 16.116609573364258,
346
+ "learning_rate": 0.0003,
347
+ "loss": 38.4295859375,
348
+ "step": 29000
349
+ },
350
+ {
351
+ "epoch": 4.1288191577208915,
352
+ "grad_norm": 14.791277885437012,
353
+ "learning_rate": 0.0003,
354
+ "loss": 38.2604921875,
355
+ "step": 30000
356
+ },
357
+ {
358
+ "epoch": 4.1288191577208915,
359
+ "eval_accuracy": 0.73262638229311,
360
+ "eval_loss": 9.34375,
361
+ "eval_runtime": 169.3505,
362
+ "eval_samples_per_second": 9763.003,
363
+ "eval_steps_per_second": 19.073,
364
+ "step": 30000
365
+ },
366
+ {
367
+ "epoch": 4.266446462978255,
368
+ "grad_norm": 14.701274871826172,
369
+ "learning_rate": 0.0003,
370
+ "loss": 38.178046875,
371
+ "step": 31000
372
+ },
373
+ {
374
+ "epoch": 4.404073768235618,
375
+ "grad_norm": 32.28364181518555,
376
+ "learning_rate": 0.0003,
377
+ "loss": 38.118671875,
378
+ "step": 32000
379
+ },
380
+ {
381
+ "epoch": 4.404073768235618,
382
+ "eval_accuracy": 0.7330061560700385,
383
+ "eval_loss": 9.328125,
384
+ "eval_runtime": 169.1911,
385
+ "eval_samples_per_second": 9772.196,
386
+ "eval_steps_per_second": 19.091,
387
+ "step": 32000
388
+ },
389
+ {
390
+ "epoch": 4.541701073492981,
391
+ "grad_norm": 14.65267562866211,
392
+ "learning_rate": 0.0003,
393
+ "loss": 38.04671484375,
394
+ "step": 33000
395
+ },
396
+ {
397
+ "epoch": 4.679328378750344,
398
+ "grad_norm": 15.814043045043945,
399
+ "learning_rate": 0.0003,
400
+ "loss": 38.03919140625,
401
+ "step": 34000
402
+ },
403
+ {
404
+ "epoch": 4.679328378750344,
405
+ "eval_accuracy": 0.7350299683164979,
406
+ "eval_loss": 9.234375,
407
+ "eval_runtime": 170.3006,
408
+ "eval_samples_per_second": 9708.532,
409
+ "eval_steps_per_second": 18.966,
410
+ "step": 34000
411
+ },
412
+ {
413
+ "epoch": 4.8169556840077075,
414
+ "grad_norm": 22.082040786743164,
415
+ "learning_rate": 0.0003,
416
+ "loss": 38.00453125,
417
+ "step": 35000
418
+ },
419
+ {
420
+ "epoch": 4.95458298926507,
421
+ "grad_norm": 20.287931442260742,
422
+ "learning_rate": 0.0003,
423
+ "loss": 37.977375,
424
+ "step": 36000
425
+ },
426
+ {
427
+ "epoch": 4.95458298926507,
428
+ "eval_accuracy": 0.7339120258458323,
429
+ "eval_loss": 9.28125,
430
+ "eval_runtime": 171.801,
431
+ "eval_samples_per_second": 9623.744,
432
+ "eval_steps_per_second": 18.801,
433
+ "step": 36000
434
+ },
435
+ {
436
+ "epoch": 5.092210294522434,
437
+ "grad_norm": 30.58030128479004,
438
+ "learning_rate": 0.0003,
439
+ "loss": 37.9320703125,
440
+ "step": 37000
441
+ },
442
+ {
443
+ "epoch": 5.229837599779796,
444
+ "grad_norm": 19.570669174194336,
445
+ "learning_rate": 0.0003,
446
+ "loss": 37.9665234375,
447
+ "step": 38000
448
+ },
449
+ {
450
+ "epoch": 5.229837599779796,
451
+ "eval_accuracy": 0.7340658881695064,
452
+ "eval_loss": 9.28125,
453
+ "eval_runtime": 167.7847,
454
+ "eval_samples_per_second": 9854.11,
455
+ "eval_steps_per_second": 19.251,
456
+ "step": 38000
457
+ },
458
+ {
459
+ "epoch": 5.367464905037159,
460
+ "grad_norm": 17.29003143310547,
461
+ "learning_rate": 0.0003,
462
+ "loss": 37.9463984375,
463
+ "step": 39000
464
+ },
465
+ {
466
+ "epoch": 5.505092210294523,
467
+ "grad_norm": 16.568086624145508,
468
+ "learning_rate": 0.0003,
469
+ "loss": 37.7375625,
470
+ "step": 40000
471
+ },
472
+ {
473
+ "epoch": 5.505092210294523,
474
+ "eval_accuracy": 0.735108947509515,
475
+ "eval_loss": 9.2265625,
476
+ "eval_runtime": 170.3335,
477
+ "eval_samples_per_second": 9706.659,
478
+ "eval_steps_per_second": 18.963,
479
+ "step": 40000
480
+ },
481
+ {
482
+ "epoch": 5.642719515551885,
483
+ "grad_norm": 110.49578094482422,
484
+ "learning_rate": 0.0003,
485
+ "loss": 37.6945,
486
+ "step": 41000
487
+ },
488
+ {
489
+ "epoch": 5.780346820809249,
490
+ "grad_norm": 24.624027252197266,
491
+ "learning_rate": 0.0003,
492
+ "loss": 37.72793359375,
493
+ "step": 42000
494
+ },
495
+ {
496
+ "epoch": 5.780346820809249,
497
+ "eval_accuracy": 0.7357611562920721,
498
+ "eval_loss": 9.203125,
499
+ "eval_runtime": 168.0213,
500
+ "eval_samples_per_second": 9840.236,
501
+ "eval_steps_per_second": 19.224,
502
+ "step": 42000
503
+ },
504
+ {
505
+ "epoch": 5.917974126066611,
506
+ "grad_norm": 17.721742630004883,
507
+ "learning_rate": 0.0003,
508
+ "loss": 37.635171875,
509
+ "step": 43000
510
+ },
511
+ {
512
+ "epoch": 6.055601431323975,
513
+ "grad_norm": 14.108975410461426,
514
+ "learning_rate": 0.0003,
515
+ "loss": 37.572765625,
516
+ "step": 44000
517
+ },
518
+ {
519
+ "epoch": 6.055601431323975,
520
+ "eval_accuracy": 0.7358946142236276,
521
+ "eval_loss": 9.1953125,
522
+ "eval_runtime": 168.7352,
523
+ "eval_samples_per_second": 9798.601,
524
+ "eval_steps_per_second": 19.142,
525
+ "step": 44000
526
+ },
527
+ {
528
+ "epoch": 6.193228736581338,
529
+ "grad_norm": 22.010786056518555,
530
+ "learning_rate": 0.0003,
531
+ "loss": 37.4784453125,
532
+ "step": 45000
533
+ },
534
+ {
535
+ "epoch": 6.330856041838701,
536
+ "grad_norm": 28.03326988220215,
537
+ "learning_rate": 0.0003,
538
+ "loss": 37.5036328125,
539
+ "step": 46000
540
+ },
541
+ {
542
+ "epoch": 6.330856041838701,
543
+ "eval_accuracy": 0.7368965623776919,
544
+ "eval_loss": 9.171875,
545
+ "eval_runtime": 166.8883,
546
+ "eval_samples_per_second": 9907.04,
547
+ "eval_steps_per_second": 19.354,
548
+ "step": 46000
549
+ },
550
+ {
551
+ "epoch": 6.468483347096064,
552
+ "grad_norm": 17.36493492126465,
553
+ "learning_rate": 0.0003,
554
+ "loss": 37.4494453125,
555
+ "step": 47000
556
+ },
557
+ {
558
+ "epoch": 6.6061106523534265,
559
+ "grad_norm": 19.51688003540039,
560
+ "learning_rate": 0.0003,
561
+ "loss": 37.435234375,
562
+ "step": 48000
563
+ },
564
+ {
565
+ "epoch": 6.6061106523534265,
566
+ "eval_accuracy": 0.7361287527698055,
567
+ "eval_loss": 9.1796875,
568
+ "eval_runtime": 169.5516,
569
+ "eval_samples_per_second": 9751.421,
570
+ "eval_steps_per_second": 19.05,
571
+ "step": 48000
572
+ },
573
+ {
574
+ "epoch": 6.74373795761079,
575
+ "grad_norm": 15.679317474365234,
576
+ "learning_rate": 0.0003,
577
+ "loss": 37.3981484375,
578
+ "step": 49000
579
+ },
580
+ {
581
+ "epoch": 6.881365262868153,
582
+ "grad_norm": 42.613555908203125,
583
+ "learning_rate": 0.0003,
584
+ "loss": 37.51515625,
585
+ "step": 50000
586
+ },
587
+ {
588
+ "epoch": 6.881365262868153,
589
+ "eval_accuracy": 0.7378156733015306,
590
+ "eval_loss": 9.125,
591
+ "eval_runtime": 170.8004,
592
+ "eval_samples_per_second": 9680.126,
593
+ "eval_steps_per_second": 18.911,
594
+ "step": 50000
595
+ },
596
+ {
597
+ "epoch": 7.018992568125516,
598
+ "grad_norm": 15.565272331237793,
599
+ "learning_rate": 0.0003,
600
+ "loss": 37.4844375,
601
+ "step": 51000
602
+ },
603
+ {
604
+ "epoch": 7.156619873382879,
605
+ "grad_norm": 15.575469017028809,
606
+ "learning_rate": 0.0003,
607
+ "loss": 37.2694375,
608
+ "step": 52000
609
+ },
610
+ {
611
+ "epoch": 7.156619873382879,
612
+ "eval_accuracy": 0.7383814207101249,
613
+ "eval_loss": 9.09375,
614
+ "eval_runtime": 167.1627,
615
+ "eval_samples_per_second": 9890.776,
616
+ "eval_steps_per_second": 19.322,
617
+ "step": 52000
618
+ },
619
+ {
620
+ "epoch": 7.2942471786402425,
621
+ "grad_norm": 15.146500587463379,
622
+ "learning_rate": 0.0003,
623
+ "loss": 37.243421875,
624
+ "step": 53000
625
+ },
626
+ {
627
+ "epoch": 7.431874483897605,
628
+ "grad_norm": 17.28059959411621,
629
+ "learning_rate": 0.0003,
630
+ "loss": 37.23215625,
631
+ "step": 54000
632
+ },
633
+ {
634
+ "epoch": 7.431874483897605,
635
+ "eval_accuracy": 0.7379147365824262,
636
+ "eval_loss": 9.1015625,
637
+ "eval_runtime": 169.7822,
638
+ "eval_samples_per_second": 9738.179,
639
+ "eval_steps_per_second": 19.024,
640
+ "step": 54000
641
+ },
642
+ {
643
+ "epoch": 7.569501789154968,
644
+ "grad_norm": 29.069555282592773,
645
+ "learning_rate": 0.0003,
646
+ "loss": 37.22365625,
647
+ "step": 55000
648
+ },
649
+ {
650
+ "epoch": 7.707129094412331,
651
+ "grad_norm": 26.9329833984375,
652
+ "learning_rate": 0.0003,
653
+ "loss": 37.22718359375,
654
+ "step": 56000
655
+ },
656
+ {
657
+ "epoch": 7.707129094412331,
658
+ "eval_accuracy": 0.7364887765974263,
659
+ "eval_loss": 9.171875,
660
+ "eval_runtime": 170.0511,
661
+ "eval_samples_per_second": 9722.778,
662
+ "eval_steps_per_second": 18.994,
663
+ "step": 56000
664
+ },
665
+ {
666
+ "epoch": 7.844756399669695,
667
+ "grad_norm": 97.25556182861328,
668
+ "learning_rate": 0.0003,
669
+ "loss": 40.3630625,
670
+ "step": 57000
671
+ },
672
+ {
673
+ "epoch": 7.982383704927058,
674
+ "grad_norm": 19.714794158935547,
675
+ "learning_rate": 0.0003,
676
+ "loss": 37.60580859375,
677
+ "step": 58000
678
+ },
679
+ {
680
+ "epoch": 7.982383704927058,
681
+ "eval_accuracy": 0.7352168040424968,
682
+ "eval_loss": 9.234375,
683
+ "eval_runtime": 169.0419,
684
+ "eval_samples_per_second": 9780.822,
685
+ "eval_steps_per_second": 19.108,
686
+ "step": 58000
687
+ },
688
+ {
689
+ "epoch": 8.12001101018442,
690
+ "grad_norm": 14.792304039001465,
691
+ "learning_rate": 0.0003,
692
+ "loss": 37.54733984375,
693
+ "step": 59000
694
+ },
695
+ {
696
+ "epoch": 8.257638315441783,
697
+ "grad_norm": 15.469705581665039,
698
+ "learning_rate": 0.0003,
699
+ "loss": 37.5459453125,
700
+ "step": 60000
701
+ },
702
+ {
703
+ "epoch": 8.257638315441783,
704
+ "eval_accuracy": 0.7371664310551599,
705
+ "eval_loss": 9.1328125,
706
+ "eval_runtime": 166.3796,
707
+ "eval_samples_per_second": 9937.33,
708
+ "eval_steps_per_second": 19.413,
709
+ "step": 60000
710
+ },
711
+ {
712
+ "epoch": 8.395265620699147,
713
+ "grad_norm": 15.161516189575195,
714
+ "learning_rate": 0.0003,
715
+ "loss": 37.47540625,
716
+ "step": 61000
717
+ },
718
+ {
719
+ "epoch": 8.53289292595651,
720
+ "grad_norm": 14.802396774291992,
721
+ "learning_rate": 0.0003,
722
+ "loss": 37.30961328125,
723
+ "step": 62000
724
+ },
725
+ {
726
+ "epoch": 8.53289292595651,
727
+ "eval_accuracy": 0.7381438951141913,
728
+ "eval_loss": 9.09375,
729
+ "eval_runtime": 168.1235,
730
+ "eval_samples_per_second": 9834.251,
731
+ "eval_steps_per_second": 19.212,
732
+ "step": 62000
733
+ },
734
+ {
735
+ "epoch": 8.670520231213873,
736
+ "grad_norm": 20.38652229309082,
737
+ "learning_rate": 0.0003,
738
+ "loss": 37.20555859375,
739
+ "step": 63000
740
+ },
741
+ {
742
+ "epoch": 8.808147536471235,
743
+ "grad_norm": 15.676960945129395,
744
+ "learning_rate": 0.0003,
745
+ "loss": 37.14574609375,
746
+ "step": 64000
747
+ },
748
+ {
749
+ "epoch": 8.808147536471235,
750
+ "eval_accuracy": 0.7382565867319608,
751
+ "eval_loss": 9.1015625,
752
+ "eval_runtime": 171.8616,
753
+ "eval_samples_per_second": 9620.35,
754
+ "eval_steps_per_second": 18.794,
755
+ "step": 64000
756
+ },
757
+ {
758
+ "epoch": 8.9457748417286,
759
+ "grad_norm": 16.668174743652344,
760
+ "learning_rate": 0.0003,
761
+ "loss": 37.125671875,
762
+ "step": 65000
763
+ },
764
+ {
765
+ "epoch": 9.083402146985962,
766
+ "grad_norm": 16.503215789794922,
767
+ "learning_rate": 0.0003,
768
+ "loss": 37.01801171875,
769
+ "step": 66000
770
+ },
771
+ {
772
+ "epoch": 9.083402146985962,
773
+ "eval_accuracy": 0.7393124677323514,
774
+ "eval_loss": 9.046875,
775
+ "eval_runtime": 168.277,
776
+ "eval_samples_per_second": 9825.282,
777
+ "eval_steps_per_second": 19.195,
778
+ "step": 66000
779
+ },
780
+ {
781
+ "epoch": 9.221029452243325,
782
+ "grad_norm": 19.62793731689453,
783
+ "learning_rate": 0.0003,
784
+ "loss": 36.9910703125,
785
+ "step": 67000
786
+ },
787
+ {
788
+ "epoch": 9.358656757500688,
789
+ "grad_norm": 18.604116439819336,
790
+ "learning_rate": 0.0003,
791
+ "loss": 36.9631328125,
792
+ "step": 68000
793
+ },
794
+ {
795
+ "epoch": 9.358656757500688,
796
+ "eval_accuracy": 0.7395416473430355,
797
+ "eval_loss": 9.03125,
798
+ "eval_runtime": 166.5578,
799
+ "eval_samples_per_second": 9926.697,
800
+ "eval_steps_per_second": 19.393,
801
+ "step": 68000
802
+ },
803
+ {
804
+ "epoch": 9.49628406275805,
805
+ "grad_norm": 20.98838233947754,
806
+ "learning_rate": 0.0003,
807
+ "loss": 36.963703125,
808
+ "step": 69000
809
+ },
810
+ {
811
+ "epoch": 9.633911368015415,
812
+ "grad_norm": 17.784839630126953,
813
+ "learning_rate": 0.0003,
814
+ "loss": 36.96887109375,
815
+ "step": 70000
816
+ },
817
+ {
818
+ "epoch": 9.633911368015415,
819
+ "eval_accuracy": 0.7390765595619299,
820
+ "eval_loss": 9.0625,
821
+ "eval_runtime": 166.5724,
822
+ "eval_samples_per_second": 9925.827,
823
+ "eval_steps_per_second": 19.391,
824
+ "step": 70000
825
+ },
826
+ {
827
+ "epoch": 9.771538673272778,
828
+ "grad_norm": 27.863100051879883,
829
+ "learning_rate": 0.0003,
830
+ "loss": 37.0048671875,
831
+ "step": 71000
832
+ },
833
+ {
834
+ "epoch": 9.90916597853014,
835
+ "grad_norm": 16.188459396362305,
836
+ "learning_rate": 0.0003,
837
+ "loss": 36.9179296875,
838
+ "step": 72000
839
+ },
840
+ {
841
+ "epoch": 9.90916597853014,
842
+ "eval_accuracy": 0.7394581492526459,
843
+ "eval_loss": 9.03125,
844
+ "eval_runtime": 170.082,
845
+ "eval_samples_per_second": 9721.013,
846
+ "eval_steps_per_second": 18.991,
847
+ "step": 72000
848
+ },
849
+ {
850
+ "epoch": 10.046793283787503,
851
+ "grad_norm": 14.628756523132324,
852
+ "learning_rate": 0.0003,
853
+ "loss": 36.888484375,
854
+ "step": 73000
855
+ },
856
+ {
857
+ "epoch": 10.184420589044867,
858
+ "grad_norm": 14.934839248657227,
859
+ "learning_rate": 0.0003,
860
+ "loss": 36.7855625,
861
+ "step": 74000
862
+ },
863
+ {
864
+ "epoch": 10.184420589044867,
865
+ "eval_accuracy": 0.7404586215139355,
866
+ "eval_loss": 8.9921875,
867
+ "eval_runtime": 168.2373,
868
+ "eval_samples_per_second": 9827.6,
869
+ "eval_steps_per_second": 19.199,
870
+ "step": 74000
871
+ },
872
+ {
873
+ "epoch": 10.32204789430223,
874
+ "grad_norm": 15.220124244689941,
875
+ "learning_rate": 0.0003,
876
+ "loss": 36.9548515625,
877
+ "step": 75000
878
+ },
879
+ {
880
+ "epoch": 10.459675199559593,
881
+ "grad_norm": 15.933542251586914,
882
+ "learning_rate": 0.0003,
883
+ "loss": 36.85228125,
884
+ "step": 76000
885
+ },
886
+ {
887
+ "epoch": 10.459675199559593,
888
+ "eval_accuracy": 0.7398413045299678,
889
+ "eval_loss": 9.0234375,
890
+ "eval_runtime": 169.5447,
891
+ "eval_samples_per_second": 9751.82,
892
+ "eval_steps_per_second": 19.051,
893
+ "step": 76000
894
+ },
895
+ {
896
+ "epoch": 10.597302504816955,
897
+ "grad_norm": 15.21678638458252,
898
+ "learning_rate": 0.0003,
899
+ "loss": 36.8666171875,
900
+ "step": 77000
901
+ },
902
+ {
903
+ "epoch": 10.734929810074318,
904
+ "grad_norm": 16.906696319580078,
905
+ "learning_rate": 0.0003,
906
+ "loss": 36.78946875,
907
+ "step": 78000
908
+ },
909
+ {
910
+ "epoch": 10.734929810074318,
911
+ "eval_accuracy": 0.7396672771453636,
912
+ "eval_loss": 9.015625,
913
+ "eval_runtime": 168.0605,
914
+ "eval_samples_per_second": 9837.938,
915
+ "eval_steps_per_second": 19.219,
916
+ "step": 78000
917
+ },
918
+ {
919
+ "epoch": 10.872557115331682,
920
+ "grad_norm": 15.474593162536621,
921
+ "learning_rate": 0.0003,
922
+ "loss": 36.7689296875,
923
+ "step": 79000
924
+ },
925
+ {
926
+ "epoch": 11.010184420589045,
927
+ "grad_norm": 14.83968448638916,
928
+ "learning_rate": 0.0003,
929
+ "loss": 36.761296875,
930
+ "step": 80000
931
+ },
932
+ {
933
+ "epoch": 11.010184420589045,
934
+ "eval_accuracy": 0.7401702144442798,
935
+ "eval_loss": 9.0,
936
+ "eval_runtime": 167.1731,
937
+ "eval_samples_per_second": 9890.16,
938
+ "eval_steps_per_second": 19.321,
939
+ "step": 80000
940
+ },
941
+ {
942
+ "epoch": 11.147811725846408,
943
+ "grad_norm": 16.945621490478516,
944
+ "learning_rate": 0.0003,
945
+ "loss": 36.7302734375,
946
+ "step": 81000
947
+ },
948
+ {
949
+ "epoch": 11.28543903110377,
950
+ "grad_norm": 20.112407684326172,
951
+ "learning_rate": 0.0003,
952
+ "loss": 36.74803125,
953
+ "step": 82000
954
+ },
955
+ {
956
+ "epoch": 11.28543903110377,
957
+ "eval_accuracy": 0.7397283390276018,
958
+ "eval_loss": 9.046875,
959
+ "eval_runtime": 167.1017,
960
+ "eval_samples_per_second": 9894.387,
961
+ "eval_steps_per_second": 19.33,
962
+ "step": 82000
963
+ },
964
+ {
965
+ "epoch": 11.423066336361135,
966
+ "grad_norm": 15.264861106872559,
967
+ "learning_rate": 0.0003,
968
+ "loss": 36.7436875,
969
+ "step": 83000
970
+ },
971
+ {
972
+ "epoch": 11.560693641618498,
973
+ "grad_norm": 35.09538269042969,
974
+ "learning_rate": 0.0003,
975
+ "loss": 36.68840625,
976
+ "step": 84000
977
+ },
978
+ {
979
+ "epoch": 11.560693641618498,
980
+ "eval_accuracy": 0.7408698045825953,
981
+ "eval_loss": 8.96875,
982
+ "eval_runtime": 168.7302,
983
+ "eval_samples_per_second": 9798.89,
984
+ "eval_steps_per_second": 19.143,
985
+ "step": 84000
986
+ },
987
+ {
988
+ "epoch": 11.69832094687586,
989
+ "grad_norm": 19.97286605834961,
990
+ "learning_rate": 0.0003,
991
+ "loss": 36.69001171875,
992
+ "step": 85000
993
+ },
994
+ {
995
+ "epoch": 11.835948252133223,
996
+ "grad_norm": 17.521934509277344,
997
+ "learning_rate": 0.0003,
998
+ "loss": 36.67117578125,
999
+ "step": 86000
1000
+ },
1001
+ {
1002
+ "epoch": 11.835948252133223,
1003
+ "eval_accuracy": 0.7407812017839025,
1004
+ "eval_loss": 8.9765625,
1005
+ "eval_runtime": 169.2449,
1006
+ "eval_samples_per_second": 9769.092,
1007
+ "eval_steps_per_second": 19.085,
1008
+ "step": 86000
1009
+ },
1010
+ {
1011
+ "epoch": 11.973575557390586,
1012
+ "grad_norm": 15.003190040588379,
1013
+ "learning_rate": 0.0003,
1014
+ "loss": 36.64848046875,
1015
+ "step": 87000
1016
+ },
1017
+ {
1018
+ "epoch": 12.11120286264795,
1019
+ "grad_norm": 20.016794204711914,
1020
+ "learning_rate": 0.0003,
1021
+ "loss": 36.72609765625,
1022
+ "step": 88000
1023
+ },
1024
+ {
1025
+ "epoch": 12.11120286264795,
1026
+ "eval_accuracy": 0.7402567288458476,
1027
+ "eval_loss": 9.0,
1028
+ "eval_runtime": 168.1699,
1029
+ "eval_samples_per_second": 9831.539,
1030
+ "eval_steps_per_second": 19.207,
1031
+ "step": 88000
1032
+ },
1033
+ {
1034
+ "epoch": 12.248830167905313,
1035
+ "grad_norm": 16.020404815673828,
1036
+ "learning_rate": 0.0003,
1037
+ "loss": 36.8169140625,
1038
+ "step": 89000
1039
+ },
1040
+ {
1041
+ "epoch": 12.386457473162675,
1042
+ "grad_norm": 16.005430221557617,
1043
+ "learning_rate": 0.0003,
1044
+ "loss": 36.652015625,
1045
+ "step": 90000
1046
+ },
1047
+ {
1048
+ "epoch": 12.386457473162675,
1049
+ "eval_accuracy": 0.740178315482875,
1050
+ "eval_loss": 9.0,
1051
+ "eval_runtime": 169.3304,
1052
+ "eval_samples_per_second": 9764.16,
1053
+ "eval_steps_per_second": 19.075,
1054
+ "step": 90000
1055
+ },
1056
+ {
1057
+ "epoch": 12.524084778420038,
1058
+ "grad_norm": 14.788511276245117,
1059
+ "learning_rate": 0.0003,
1060
+ "loss": 36.70846875,
1061
+ "step": 91000
1062
+ },
1063
+ {
1064
+ "epoch": 12.661712083677402,
1065
+ "grad_norm": 15.171088218688965,
1066
+ "learning_rate": 0.0003,
1067
+ "loss": 36.9015078125,
1068
+ "step": 92000
1069
+ },
1070
+ {
1071
+ "epoch": 12.661712083677402,
1072
+ "eval_accuracy": 0.7409616192832641,
1073
+ "eval_loss": 8.9765625,
1074
+ "eval_runtime": 167.4567,
1075
+ "eval_samples_per_second": 9873.414,
1076
+ "eval_steps_per_second": 19.289,
1077
+ "step": 92000
1078
+ },
1079
+ {
1080
+ "epoch": 12.799339388934765,
1081
+ "grad_norm": 15.66518497467041,
1082
+ "learning_rate": 0.0003,
1083
+ "loss": 36.66136328125,
1084
+ "step": 93000
1085
+ },
1086
+ {
1087
+ "epoch": 12.936966694192128,
1088
+ "grad_norm": 17.391189575195312,
1089
+ "learning_rate": 0.0003,
1090
+ "loss": 36.62637109375,
1091
+ "step": 94000
1092
+ },
1093
+ {
1094
+ "epoch": 12.936966694192128,
1095
+ "eval_accuracy": 0.7408841425444469,
1096
+ "eval_loss": 8.96875,
1097
+ "eval_runtime": 168.9204,
1098
+ "eval_samples_per_second": 9787.86,
1099
+ "eval_steps_per_second": 19.121,
1100
+ "step": 94000
1101
+ },
1102
+ {
1103
+ "epoch": 13.07459399944949,
1104
+ "grad_norm": 14.819647789001465,
1105
+ "learning_rate": 0.0003,
1106
+ "loss": 36.554375,
1107
+ "step": 95000
1108
+ },
1109
+ {
1110
+ "epoch": 13.212221304706853,
1111
+ "grad_norm": 15.322741508483887,
1112
+ "learning_rate": 0.0003,
1113
+ "loss": 36.5329375,
1114
+ "step": 96000
1115
+ },
1116
+ {
1117
+ "epoch": 13.212221304706853,
1118
+ "eval_accuracy": 0.7418048186655257,
1119
+ "eval_loss": 8.9296875,
1120
+ "eval_runtime": 168.0129,
1121
+ "eval_samples_per_second": 9840.727,
1122
+ "eval_steps_per_second": 19.225,
1123
+ "step": 96000
1124
+ },
1125
+ {
1126
+ "epoch": 13.349848609964218,
1127
+ "grad_norm": 15.646768569946289,
1128
+ "learning_rate": 0.0003,
1129
+ "loss": 36.5148125,
1130
+ "step": 97000
1131
+ },
1132
+ {
1133
+ "epoch": 13.48747591522158,
1134
+ "grad_norm": 22.885791778564453,
1135
+ "learning_rate": 0.0003,
1136
+ "loss": 36.550625,
1137
+ "step": 98000
1138
+ },
1139
+ {
1140
+ "epoch": 13.48747591522158,
1141
+ "eval_accuracy": 0.7403524509758528,
1142
+ "eval_loss": 9.0078125,
1143
+ "eval_runtime": 169.9032,
1144
+ "eval_samples_per_second": 9731.239,
1145
+ "eval_steps_per_second": 19.011,
1146
+ "step": 98000
1147
+ },
1148
+ {
1149
+ "epoch": 13.625103220478943,
1150
+ "grad_norm": 23.617691040039062,
1151
+ "learning_rate": 0.0003,
1152
+ "loss": 36.64653515625,
1153
+ "step": 99000
1154
+ },
1155
+ {
1156
+ "epoch": 13.762730525736306,
1157
+ "grad_norm": 16.434221267700195,
1158
+ "learning_rate": 0.0003,
1159
+ "loss": 36.5250390625,
1160
+ "step": 100000
1161
+ },
1162
+ {
1163
+ "epoch": 13.762730525736306,
1164
+ "eval_accuracy": 0.741298813475031,
1165
+ "eval_loss": 8.9453125,
1166
+ "eval_runtime": 170.5631,
1167
+ "eval_samples_per_second": 9693.591,
1168
+ "eval_steps_per_second": 18.937,
1169
+ "step": 100000
1170
+ },
1171
+ {
1172
+ "epoch": 13.90035783099367,
1173
+ "grad_norm": 15.552685737609863,
1174
+ "learning_rate": 0.0003,
1175
+ "loss": 36.50655078125,
1176
+ "step": 101000
1177
+ },
1178
+ {
1179
+ "epoch": 14.037985136251033,
1180
+ "grad_norm": 15.682257652282715,
1181
+ "learning_rate": 0.0003,
1182
+ "loss": 36.52517578125,
1183
+ "step": 102000
1184
+ },
1185
+ {
1186
+ "epoch": 14.037985136251033,
1187
+ "eval_accuracy": 0.7417503019794978,
1188
+ "eval_loss": 8.9296875,
1189
+ "eval_runtime": 167.2274,
1190
+ "eval_samples_per_second": 9886.95,
1191
+ "eval_steps_per_second": 19.315,
1192
+ "step": 102000
1193
+ },
1194
+ {
1195
+ "epoch": 14.175612441508395,
1196
+ "grad_norm": 16.582626342773438,
1197
+ "learning_rate": 0.0003,
1198
+ "loss": 36.4439453125,
1199
+ "step": 103000
1200
+ },
1201
+ {
1202
+ "epoch": 14.313239746765758,
1203
+ "grad_norm": 15.228682518005371,
1204
+ "learning_rate": 0.0003,
1205
+ "loss": 36.7357578125,
1206
+ "step": 104000
1207
+ },
1208
+ {
1209
+ "epoch": 14.313239746765758,
1210
+ "eval_accuracy": 0.7409231190118309,
1211
+ "eval_loss": 8.984375,
1212
+ "eval_runtime": 168.495,
1213
+ "eval_samples_per_second": 9812.569,
1214
+ "eval_steps_per_second": 19.17,
1215
+ "step": 104000
1216
+ },
1217
+ {
1218
+ "epoch": 14.45086705202312,
1219
+ "grad_norm": 18.01194190979004,
1220
+ "learning_rate": 0.0003,
1221
+ "loss": 36.506953125,
1222
+ "step": 105000
1223
+ },
1224
+ {
1225
+ "epoch": 14.588494357280485,
1226
+ "grad_norm": 16.423683166503906,
1227
+ "learning_rate": 0.0003,
1228
+ "loss": 36.4571796875,
1229
+ "step": 106000
1230
+ },
1231
+ {
1232
+ "epoch": 14.588494357280485,
1233
+ "eval_accuracy": 0.7415011536669796,
1234
+ "eval_loss": 8.9453125,
1235
+ "eval_runtime": 170.7059,
1236
+ "eval_samples_per_second": 9685.482,
1237
+ "eval_steps_per_second": 18.921,
1238
+ "step": 106000
1239
+ },
1240
+ {
1241
+ "epoch": 14.726121662537848,
1242
+ "grad_norm": 15.935735702514648,
1243
+ "learning_rate": 0.0003,
1244
+ "loss": 36.4399453125,
1245
+ "step": 107000
1246
+ },
1247
+ {
1248
+ "epoch": 14.86374896779521,
1249
+ "grad_norm": 34.532562255859375,
1250
+ "learning_rate": 0.0003,
1251
+ "loss": 36.496796875,
1252
+ "step": 108000
1253
+ },
1254
+ {
1255
+ "epoch": 14.86374896779521,
1256
+ "eval_accuracy": 0.740452263067793,
1257
+ "eval_loss": 9.0,
1258
+ "eval_runtime": 169.5593,
1259
+ "eval_samples_per_second": 9750.98,
1260
+ "eval_steps_per_second": 19.049,
1261
+ "step": 108000
1262
+ },
1263
+ {
1264
+ "epoch": 15.001376273052573,
1265
+ "grad_norm": 17.5130615234375,
1266
+ "learning_rate": 0.0003,
1267
+ "loss": 36.44953125,
1268
+ "step": 109000
1269
+ },
1270
+ {
1271
+ "epoch": 15.139003578309937,
1272
+ "grad_norm": 17.350488662719727,
1273
+ "learning_rate": 0.0003,
1274
+ "loss": 36.5754375,
1275
+ "step": 110000
1276
+ },
1277
+ {
1278
+ "epoch": 15.139003578309937,
1279
+ "eval_accuracy": 0.7407140399047041,
1280
+ "eval_loss": 8.984375,
1281
+ "eval_runtime": 169.3269,
1282
+ "eval_samples_per_second": 9764.36,
1283
+ "eval_steps_per_second": 19.076,
1284
+ "step": 110000
1285
+ },
1286
+ {
1287
+ "epoch": 15.2766308835673,
1288
+ "grad_norm": 16.463069915771484,
1289
+ "learning_rate": 0.0003,
1290
+ "loss": 36.57256640625,
1291
+ "step": 111000
1292
+ },
1293
+ {
1294
+ "epoch": 15.414258188824663,
1295
+ "grad_norm": 15.29598331451416,
1296
+ "learning_rate": 0.0003,
1297
+ "loss": 36.57964453125,
1298
+ "step": 112000
1299
+ },
1300
+ {
1301
+ "epoch": 15.414258188824663,
1302
+ "eval_accuracy": 0.7415344808420624,
1303
+ "eval_loss": 8.9375,
1304
+ "eval_runtime": 169.2422,
1305
+ "eval_samples_per_second": 9769.248,
1306
+ "eval_steps_per_second": 19.085,
1307
+ "step": 112000
1308
+ },
1309
+ {
1310
+ "epoch": 15.551885494082025,
1311
+ "grad_norm": 26.094396591186523,
1312
+ "learning_rate": 0.0003,
1313
+ "loss": 36.45933203125,
1314
+ "step": 113000
1315
+ },
1316
+ {
1317
+ "epoch": 15.689512799339388,
1318
+ "grad_norm": 15.42809009552002,
1319
+ "learning_rate": 0.0003,
1320
+ "loss": 36.4115546875,
1321
+ "step": 114000
1322
+ },
1323
+ {
1324
+ "epoch": 15.689512799339388,
1325
+ "eval_accuracy": 0.7421097237159107,
1326
+ "eval_loss": 8.921875,
1327
+ "eval_runtime": 168.9547,
1328
+ "eval_samples_per_second": 9785.869,
1329
+ "eval_steps_per_second": 19.118,
1330
+ "step": 114000
1331
+ },
1332
+ {
1333
+ "epoch": 15.827140104596753,
1334
+ "grad_norm": 18.432769775390625,
1335
+ "learning_rate": 0.0003,
1336
+ "loss": 36.36869921875,
1337
+ "step": 115000
1338
+ },
1339
+ {
1340
+ "epoch": 15.964767409854115,
1341
+ "grad_norm": 16.479217529296875,
1342
+ "learning_rate": 0.0003,
1343
+ "loss": 36.33540625,
1344
+ "step": 116000
1345
+ },
1346
+ {
1347
+ "epoch": 15.964767409854115,
1348
+ "eval_accuracy": 0.742688496326886,
1349
+ "eval_loss": 8.9140625,
1350
+ "eval_runtime": 171.1574,
1351
+ "eval_samples_per_second": 9659.935,
1352
+ "eval_steps_per_second": 18.872,
1353
+ "step": 116000
1354
+ },
1355
+ {
1356
+ "epoch": 16.10239471511148,
1357
+ "grad_norm": 16.778072357177734,
1358
+ "learning_rate": 0.0003,
1359
+ "loss": 36.3029140625,
1360
+ "step": 117000
1361
+ },
1362
+ {
1363
+ "epoch": 16.24002202036884,
1364
+ "grad_norm": 17.59860610961914,
1365
+ "learning_rate": 0.0003,
1366
+ "loss": 36.37992578125,
1367
+ "step": 118000
1368
+ },
1369
+ {
1370
+ "epoch": 16.24002202036884,
1371
+ "eval_accuracy": 0.7424896095870896,
1372
+ "eval_loss": 8.9140625,
1373
+ "eval_runtime": 169.4524,
1374
+ "eval_samples_per_second": 9757.132,
1375
+ "eval_steps_per_second": 19.061,
1376
+ "step": 118000
1377
+ },
1378
+ {
1379
+ "epoch": 16.377649325626205,
1380
+ "grad_norm": 19.460819244384766,
1381
+ "learning_rate": 0.0003,
1382
+ "loss": 36.3964765625,
1383
+ "step": 119000
1384
+ },
1385
+ {
1386
+ "epoch": 16.515276630883566,
1387
+ "grad_norm": 47.38739776611328,
1388
+ "learning_rate": 0.0003,
1389
+ "loss": 36.399671875,
1390
+ "step": 120000
1391
+ },
1392
+ {
1393
+ "epoch": 16.515276630883566,
1394
+ "eval_accuracy": 0.7421772826656658,
1395
+ "eval_loss": 8.90625,
1396
+ "eval_runtime": 168.2033,
1397
+ "eval_samples_per_second": 9829.585,
1398
+ "eval_steps_per_second": 19.203,
1399
+ "step": 120000
1400
+ },
1401
+ {
1402
+ "epoch": 16.515276630883566,
1403
+ "step": 120000,
1404
+ "total_flos": 1.617130434771026e+19,
1405
+ "train_loss": 38.9186564453125,
1406
+ "train_runtime": 44793.9304,
1407
+ "train_samples_per_second": 5486.458,
1408
+ "train_steps_per_second": 2.679
1409
+ }
1410
+ ],
1411
+ "logging_steps": 1000,
1412
+ "max_steps": 120000,
1413
+ "num_input_tokens_seen": 0,
1414
+ "num_train_epochs": 17,
1415
+ "save_steps": 10000,
1416
+ "stateful_callbacks": {
1417
+ "TrainerControl": {
1418
+ "args": {
1419
+ "should_epoch_stop": false,
1420
+ "should_evaluate": false,
1421
+ "should_log": false,
1422
+ "should_save": true,
1423
+ "should_training_stop": true
1424
+ },
1425
+ "attributes": {}
1426
+ }
1427
+ },
1428
+ "total_flos": 1.617130434771026e+19,
1429
+ "train_batch_size": 64,
1430
+ "trial_name": null,
1431
+ "trial_params": null
1432
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a649deb01156bb8da3ab1696f1e8bd669abf9eb2061c66e5c7389bb738bbb9f0
3
+ size 7377