Attila1011 commited on
Commit
69d7824
·
verified ·
1 Parent(s): ed40e86

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -45,3 +45,4 @@ checkpoints-v5.5/checkpoint-24576/eval_state.json filter=lfs diff=lfs merge=lfs
45
  checkpoints-v5.6/checkpoint-4096/eval_state.json filter=lfs diff=lfs merge=lfs -text
46
  checkpoints-v5.6/checkpoint-14336/eval_state.json filter=lfs diff=lfs merge=lfs -text
47
  checkpoints-v5.7/checkpoint-11264/eval_state.json filter=lfs diff=lfs merge=lfs -text
 
 
45
  checkpoints-v5.6/checkpoint-4096/eval_state.json filter=lfs diff=lfs merge=lfs -text
46
  checkpoints-v5.6/checkpoint-14336/eval_state.json filter=lfs diff=lfs merge=lfs -text
47
  checkpoints-v5.7/checkpoint-11264/eval_state.json filter=lfs diff=lfs merge=lfs -text
48
+ checkpoints-v5.8/checkpoint-10240/eval_state.json filter=lfs diff=lfs merge=lfs -text
checkpoints-v5.8/checkpoint-10240/ema.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e999a0151447deb2f736e37a5ea04fe7417f37792d807827cbeac852e7cad9f3
3
+ size 54599592
checkpoints-v5.8/checkpoint-10240/eval_state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab41d03aa2541e58672163e22db14aa05837338133c1c6f19ca09d6190f94f1c
3
+ size 57900436
checkpoints-v5.8/checkpoint-10240/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a262ca2a251444e555cdcec380059f77bbd36743b8a6ee6e4649d0ce1ec5431
3
+ size 54599624
checkpoints-v5.8/checkpoint-10240/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0bd7903cd37f51e5c8188658d72987fe1e143785181620bb0c348e69ae0c332
3
+ size 76550347
checkpoints-v5.8/checkpoint-10240/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1fc9a7ad80ccf79ec63508fe00c62cf70deaf89caf3ca329a532d3f0b214151c
3
+ size 14645
checkpoints-v5.8/checkpoint-10240/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2edeacbafdc9b000fdfcc7b5e26dfdbd488cc1c079462aca0d777eefd7a4f8d4
3
+ size 1383
checkpoints-v5.8/checkpoint-10240/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0bc2da087d3bab042e131d5d0e0bdf1c5605ed7c07cd742ecbf80250f6aa7442
3
+ size 1465
checkpoints-v5.8/checkpoint-10240/trainer_state.json ADDED
@@ -0,0 +1,414 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.44131273299286744,
6
+ "eval_steps": 1024,
7
+ "global_step": 10240,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.04413127329928675,
14
+ "grad_norm": 2.8057446479797363,
15
+ "learning_rate": 9.990234375e-05,
16
+ "loss": 19.480113983154297,
17
+ "step": 1024
18
+ },
19
+ {
20
+ "epoch": 0.04413127329928675,
21
+ "eval_bleu": 0.15342690122492753,
22
+ "eval_ce_clean_loss": 2.2290129910654097,
23
+ "eval_ce_pred_loss": 4.926800312009702,
24
+ "eval_flow_cos_loss": 0.4851009784095577,
25
+ "eval_flow_mse_loss": 1.397121523743245,
26
+ "eval_loss": 11.836170995667546,
27
+ "flow/cos_sim": 0.5148990369681865,
28
+ "flow/improvement_ratio": 0.9246285279422427,
29
+ "flow/mag_ratio_mean": 0.013031711470264235,
30
+ "flow/mag_ratio_std": 0.006753856040187863,
31
+ "step": 1024
32
+ },
33
+ {
34
+ "epoch": 0.04413127329928675,
35
+ "eval_bleu": 0.15342690122492753,
36
+ "eval_ce_clean_loss": 2.2290129910654097,
37
+ "eval_ce_pred_loss": 4.926800312009702,
38
+ "eval_flow_cos_loss": 0.4851009784095577,
39
+ "eval_flow_mse_loss": 1.397121523743245,
40
+ "eval_loss": 11.836170995667546,
41
+ "eval_runtime": 207.3627,
42
+ "eval_samples_per_second": 144.674,
43
+ "eval_steps_per_second": 2.262,
44
+ "flow/cos_sim": 0.5148990369681865,
45
+ "flow/improvement_ratio": 0.9246285279422427,
46
+ "flow/mag_ratio_mean": 0.013031711470264235,
47
+ "flow/mag_ratio_std": 0.006753856040187863,
48
+ "step": 1024
49
+ },
50
+ {
51
+ "epoch": 0.0882625465985735,
52
+ "grad_norm": 3.7640140056610107,
53
+ "learning_rate": 9.947705025097448e-05,
54
+ "loss": 9.050005912780762,
55
+ "step": 2048
56
+ },
57
+ {
58
+ "epoch": 0.0882625465985735,
59
+ "eval_bleu": 0.3591161016860251,
60
+ "eval_ce_clean_loss": 0.5261886228503448,
61
+ "eval_ce_pred_loss": 3.2509129626918702,
62
+ "eval_flow_cos_loss": 0.444934548663178,
63
+ "eval_flow_mse_loss": 1.3912561704863364,
64
+ "eval_loss": 7.600675975335941,
65
+ "flow/cos_sim": 0.5550654290327385,
66
+ "flow/improvement_ratio": 0.9916605481715091,
67
+ "flow/mag_ratio_mean": 0.4833370310538359,
68
+ "flow/mag_ratio_std": 0.13753852689825397,
69
+ "step": 2048
70
+ },
71
+ {
72
+ "epoch": 0.0882625465985735,
73
+ "eval_bleu": 0.3591161016860251,
74
+ "eval_ce_clean_loss": 0.5261886228503448,
75
+ "eval_ce_pred_loss": 3.2509129626918702,
76
+ "eval_flow_cos_loss": 0.444934548663178,
77
+ "eval_flow_mse_loss": 1.3912561704863364,
78
+ "eval_loss": 7.600675975335941,
79
+ "eval_runtime": 202.3015,
80
+ "eval_samples_per_second": 148.293,
81
+ "eval_steps_per_second": 2.318,
82
+ "flow/cos_sim": 0.5550654290327385,
83
+ "flow/improvement_ratio": 0.9916605481715091,
84
+ "flow/mag_ratio_mean": 0.4833370310538359,
85
+ "flow/mag_ratio_std": 0.13753852689825397,
86
+ "step": 2048
87
+ },
88
+ {
89
+ "epoch": 0.13239381989786023,
90
+ "grad_norm": 2.177398681640625,
91
+ "learning_rate": 9.7915094488941e-05,
92
+ "loss": 7.1016950607299805,
93
+ "step": 3072
94
+ },
95
+ {
96
+ "epoch": 0.13239381989786023,
97
+ "eval_bleu": 0.4229786990281486,
98
+ "eval_ce_clean_loss": 0.19955731540727717,
99
+ "eval_ce_pred_loss": 2.75210454392789,
100
+ "eval_flow_cos_loss": 0.3368307453737076,
101
+ "eval_flow_mse_loss": 1.455592979246111,
102
+ "eval_loss": 6.595311283811069,
103
+ "flow/cos_sim": 0.663169250559451,
104
+ "flow/improvement_ratio": 0.9943748110138786,
105
+ "flow/mag_ratio_mean": 0.6215583990886013,
106
+ "flow/mag_ratio_std": 0.13196718303570107,
107
+ "step": 3072
108
+ },
109
+ {
110
+ "epoch": 0.13239381989786023,
111
+ "eval_bleu": 0.4229786990281486,
112
+ "eval_ce_clean_loss": 0.19955731540727717,
113
+ "eval_ce_pred_loss": 2.75210454392789,
114
+ "eval_flow_cos_loss": 0.3368307453737076,
115
+ "eval_flow_mse_loss": 1.455592979246111,
116
+ "eval_loss": 6.595311283811069,
117
+ "eval_runtime": 203.4697,
118
+ "eval_samples_per_second": 147.442,
119
+ "eval_steps_per_second": 2.305,
120
+ "flow/cos_sim": 0.663169250559451,
121
+ "flow/improvement_ratio": 0.9943748110138786,
122
+ "flow/mag_ratio_mean": 0.6215583990886013,
123
+ "flow/mag_ratio_std": 0.13196718303570107,
124
+ "step": 3072
125
+ },
126
+ {
127
+ "epoch": 0.176525093197147,
128
+ "grad_norm": 1.7163344621658325,
129
+ "learning_rate": 9.534693146185996e-05,
130
+ "loss": 6.385639667510986,
131
+ "step": 4096
132
+ },
133
+ {
134
+ "epoch": 0.176525093197147,
135
+ "eval_bleu": 0.4513643743621062,
136
+ "eval_ce_clean_loss": 0.10498536713341915,
137
+ "eval_ce_pred_loss": 2.5130718211883676,
138
+ "eval_flow_cos_loss": 0.2594938945414415,
139
+ "eval_flow_mse_loss": 1.4369017711834613,
140
+ "eval_loss": 6.09481920819801,
141
+ "flow/cos_sim": 0.7405061316388503,
142
+ "flow/improvement_ratio": 0.9943584953543982,
143
+ "flow/mag_ratio_mean": 0.7165890200051672,
144
+ "flow/mag_ratio_std": 0.12281975911052496,
145
+ "step": 4096
146
+ },
147
+ {
148
+ "epoch": 0.176525093197147,
149
+ "eval_bleu": 0.4513643743621062,
150
+ "eval_ce_clean_loss": 0.10498536713341915,
151
+ "eval_ce_pred_loss": 2.5130718211883676,
152
+ "eval_flow_cos_loss": 0.2594938945414415,
153
+ "eval_flow_mse_loss": 1.4369017711834613,
154
+ "eval_loss": 6.09481920819801,
155
+ "eval_runtime": 204.2952,
156
+ "eval_samples_per_second": 146.846,
157
+ "eval_steps_per_second": 2.296,
158
+ "flow/cos_sim": 0.7405061316388503,
159
+ "flow/improvement_ratio": 0.9943584953543982,
160
+ "flow/mag_ratio_mean": 0.7165890200051672,
161
+ "flow/mag_ratio_std": 0.12281975911052496,
162
+ "step": 4096
163
+ },
164
+ {
165
+ "epoch": 0.22065636649643372,
166
+ "grad_norm": 1.8768128156661987,
167
+ "learning_rate": 9.18264920723673e-05,
168
+ "loss": 6.064126491546631,
169
+ "step": 5120
170
+ },
171
+ {
172
+ "epoch": 0.22065636649643372,
173
+ "eval_bleu": 0.4614809350663718,
174
+ "eval_ce_clean_loss": 0.0667420147832777,
175
+ "eval_ce_pred_loss": 2.3907921583667746,
176
+ "eval_flow_cos_loss": 0.21185589446696138,
177
+ "eval_flow_mse_loss": 1.4259319119870282,
178
+ "eval_loss": 5.844792096599587,
179
+ "flow/cos_sim": 0.7881441263756009,
180
+ "flow/improvement_ratio": 0.9946798176399426,
181
+ "flow/mag_ratio_mean": 0.7737196202217135,
182
+ "flow/mag_ratio_std": 0.12071893823299326,
183
+ "step": 5120
184
+ },
185
+ {
186
+ "epoch": 0.22065636649643372,
187
+ "eval_bleu": 0.4614809350663718,
188
+ "eval_ce_clean_loss": 0.0667420147832777,
189
+ "eval_ce_pred_loss": 2.3907921583667746,
190
+ "eval_flow_cos_loss": 0.21185589446696138,
191
+ "eval_flow_mse_loss": 1.4259319119870282,
192
+ "eval_loss": 5.844792096599587,
193
+ "eval_runtime": 205.8254,
194
+ "eval_samples_per_second": 145.755,
195
+ "eval_steps_per_second": 2.279,
196
+ "flow/cos_sim": 0.7881441263756009,
197
+ "flow/improvement_ratio": 0.9946798176399426,
198
+ "flow/mag_ratio_mean": 0.7737196202217135,
199
+ "flow/mag_ratio_std": 0.12071893823299326,
200
+ "step": 5120
201
+ },
202
+ {
203
+ "epoch": 0.26478763979572045,
204
+ "grad_norm": 2.758634328842163,
205
+ "learning_rate": 8.74324003722993e-05,
206
+ "loss": 5.889730930328369,
207
+ "step": 6144
208
+ },
209
+ {
210
+ "epoch": 0.26478763979572045,
211
+ "eval_bleu": 0.4845742868272886,
212
+ "eval_ce_clean_loss": 0.04714409711121369,
213
+ "eval_ce_pred_loss": 2.2815945768661337,
214
+ "eval_flow_cos_loss": 0.18578982953705006,
215
+ "eval_flow_mse_loss": 1.4454534775666845,
216
+ "eval_loss": 5.6841636202228605,
217
+ "flow/cos_sim": 0.8142101938790604,
218
+ "flow/improvement_ratio": 0.9944394338868066,
219
+ "flow/mag_ratio_mean": 0.8032813853800678,
220
+ "flow/mag_ratio_std": 0.12197709971590083,
221
+ "step": 6144
222
+ },
223
+ {
224
+ "epoch": 0.26478763979572045,
225
+ "eval_bleu": 0.4845742868272886,
226
+ "eval_ce_clean_loss": 0.04714409711121369,
227
+ "eval_ce_pred_loss": 2.2815945768661337,
228
+ "eval_flow_cos_loss": 0.18578982953705006,
229
+ "eval_flow_mse_loss": 1.4454534775666845,
230
+ "eval_loss": 5.6841636202228605,
231
+ "eval_runtime": 206.3829,
232
+ "eval_samples_per_second": 145.361,
233
+ "eval_steps_per_second": 2.272,
234
+ "flow/cos_sim": 0.8142101938790604,
235
+ "flow/improvement_ratio": 0.9944394338868066,
236
+ "flow/mag_ratio_mean": 0.8032813853800678,
237
+ "flow/mag_ratio_std": 0.12197709971590083,
238
+ "step": 6144
239
+ },
240
+ {
241
+ "epoch": 0.30891891309500724,
242
+ "grad_norm": 5.409186363220215,
243
+ "learning_rate": 8.22483558761947e-05,
244
+ "loss": 5.709588527679443,
245
+ "step": 7168
246
+ },
247
+ {
248
+ "epoch": 0.30891891309500724,
249
+ "eval_bleu": 0.4929805768795461,
250
+ "eval_ce_clean_loss": 0.03396684316191465,
251
+ "eval_ce_pred_loss": 2.2168381203974743,
252
+ "eval_flow_cos_loss": 0.16666441371064705,
253
+ "eval_flow_mse_loss": 1.4314661895288334,
254
+ "eval_loss": 5.548089429005377,
255
+ "flow/cos_sim": 0.8333356131368609,
256
+ "flow/improvement_ratio": 0.9955934001438653,
257
+ "flow/mag_ratio_mean": 0.8248435496521402,
258
+ "flow/mag_ratio_std": 0.12081558916614508,
259
+ "step": 7168
260
+ },
261
+ {
262
+ "epoch": 0.30891891309500724,
263
+ "eval_bleu": 0.4929805768795461,
264
+ "eval_ce_clean_loss": 0.03396684316191465,
265
+ "eval_ce_pred_loss": 2.2168381203974743,
266
+ "eval_flow_cos_loss": 0.16666441371064705,
267
+ "eval_flow_mse_loss": 1.4314661895288334,
268
+ "eval_loss": 5.548089429005377,
269
+ "eval_runtime": 204.7605,
270
+ "eval_samples_per_second": 146.513,
271
+ "eval_steps_per_second": 2.29,
272
+ "flow/cos_sim": 0.8333356131368609,
273
+ "flow/improvement_ratio": 0.9955934001438653,
274
+ "flow/mag_ratio_mean": 0.8248435496521402,
275
+ "flow/mag_ratio_std": 0.12081558916614508,
276
+ "step": 7168
277
+ },
278
+ {
279
+ "epoch": 0.353050186394294,
280
+ "grad_norm": 3.4768965244293213,
281
+ "learning_rate": 7.639311770076283e-05,
282
+ "loss": 5.6106743812561035,
283
+ "step": 8192
284
+ },
285
+ {
286
+ "epoch": 0.353050186394294,
287
+ "eval_bleu": 0.5000895464260023,
288
+ "eval_ce_clean_loss": 0.026072347614127817,
289
+ "eval_ce_pred_loss": 2.172668118212523,
290
+ "eval_flow_cos_loss": 0.15404865479291374,
291
+ "eval_flow_mse_loss": 1.433172311101641,
292
+ "eval_loss": 5.4733451587050705,
293
+ "flow/cos_sim": 0.8459513680512971,
294
+ "flow/improvement_ratio": 0.9947214378222727,
295
+ "flow/mag_ratio_mean": 0.8395651439105524,
296
+ "flow/mag_ratio_std": 0.12124867371912958,
297
+ "step": 8192
298
+ },
299
+ {
300
+ "epoch": 0.353050186394294,
301
+ "eval_bleu": 0.5000895464260023,
302
+ "eval_ce_clean_loss": 0.026072347614127817,
303
+ "eval_ce_pred_loss": 2.172668118212523,
304
+ "eval_flow_cos_loss": 0.15404865479291374,
305
+ "eval_flow_mse_loss": 1.433172311101641,
306
+ "eval_loss": 5.4733451587050705,
307
+ "eval_runtime": 205.5563,
308
+ "eval_samples_per_second": 145.945,
309
+ "eval_steps_per_second": 2.282,
310
+ "flow/cos_sim": 0.8459513680512971,
311
+ "flow/improvement_ratio": 0.9947214378222727,
312
+ "flow/mag_ratio_mean": 0.8395651439105524,
313
+ "flow/mag_ratio_std": 0.12124867371912958,
314
+ "step": 8192
315
+ },
316
+ {
317
+ "epoch": 0.3971814596935807,
318
+ "grad_norm": 2.03994083404541,
319
+ "learning_rate": 6.997821756319211e-05,
320
+ "loss": 5.528759956359863,
321
+ "step": 9216
322
+ },
323
+ {
324
+ "epoch": 0.3971814596935807,
325
+ "eval_bleu": 0.5061985839044346,
326
+ "eval_ce_clean_loss": 0.020625252342189173,
327
+ "eval_ce_pred_loss": 2.0967261576449183,
328
+ "eval_flow_cos_loss": 0.1426173726092778,
329
+ "eval_flow_mse_loss": 1.4152808268187143,
330
+ "eval_loss": 5.324290069689883,
331
+ "flow/cos_sim": 0.8573826374783953,
332
+ "flow/improvement_ratio": 0.9952330511770269,
333
+ "flow/mag_ratio_mean": 0.8550254363240972,
334
+ "flow/mag_ratio_std": 0.12249722452496668,
335
+ "step": 9216
336
+ },
337
+ {
338
+ "epoch": 0.3971814596935807,
339
+ "eval_bleu": 0.5061985839044346,
340
+ "eval_ce_clean_loss": 0.020625252342189173,
341
+ "eval_ce_pred_loss": 2.0967261576449183,
342
+ "eval_flow_cos_loss": 0.1426173726092778,
343
+ "eval_flow_mse_loss": 1.4152808268187143,
344
+ "eval_loss": 5.324290069689883,
345
+ "eval_runtime": 204.3393,
346
+ "eval_samples_per_second": 146.815,
347
+ "eval_steps_per_second": 2.295,
348
+ "flow/cos_sim": 0.8573826374783953,
349
+ "flow/improvement_ratio": 0.9952330511770269,
350
+ "flow/mag_ratio_mean": 0.8550254363240972,
351
+ "flow/mag_ratio_std": 0.12249722452496668,
352
+ "step": 9216
353
+ },
354
+ {
355
+ "epoch": 0.44131273299286744,
356
+ "grad_norm": 3.8528435230255127,
357
+ "learning_rate": 6.315061173955019e-05,
358
+ "loss": 5.434952259063721,
359
+ "step": 10240
360
+ },
361
+ {
362
+ "epoch": 0.44131273299286744,
363
+ "eval_bleu": 0.509362067123535,
364
+ "eval_ce_clean_loss": 0.016885403623141206,
365
+ "eval_ce_pred_loss": 2.091463715282839,
366
+ "eval_flow_cos_loss": 0.13576125987430118,
367
+ "eval_flow_mse_loss": 1.4116799185779303,
368
+ "eval_loss": 5.305541175260727,
369
+ "flow/cos_sim": 0.864238769896249,
370
+ "flow/improvement_ratio": 0.9957862627277496,
371
+ "flow/mag_ratio_mean": 0.8612959158700159,
372
+ "flow/mag_ratio_std": 0.12316946316756673,
373
+ "step": 10240
374
+ },
375
+ {
376
+ "epoch": 0.44131273299286744,
377
+ "eval_bleu": 0.509362067123535,
378
+ "eval_ce_clean_loss": 0.016885403623141206,
379
+ "eval_ce_pred_loss": 2.091463715282839,
380
+ "eval_flow_cos_loss": 0.13576125987430118,
381
+ "eval_flow_mse_loss": 1.4116799185779303,
382
+ "eval_loss": 5.305541175260727,
383
+ "eval_runtime": 204.1554,
384
+ "eval_samples_per_second": 146.947,
385
+ "eval_steps_per_second": 2.297,
386
+ "flow/cos_sim": 0.864238769896249,
387
+ "flow/improvement_ratio": 0.9957862627277496,
388
+ "flow/mag_ratio_mean": 0.8612959158700159,
389
+ "flow/mag_ratio_std": 0.12316946316756673,
390
+ "step": 10240
391
+ }
392
+ ],
393
+ "logging_steps": 1024,
394
+ "max_steps": 23204,
395
+ "num_input_tokens_seen": 0,
396
+ "num_train_epochs": 1,
397
+ "save_steps": 1024,
398
+ "stateful_callbacks": {
399
+ "TrainerControl": {
400
+ "args": {
401
+ "should_epoch_stop": false,
402
+ "should_evaluate": false,
403
+ "should_log": false,
404
+ "should_save": true,
405
+ "should_training_stop": false
406
+ },
407
+ "attributes": {}
408
+ }
409
+ },
410
+ "total_flos": 0.0,
411
+ "train_batch_size": 64,
412
+ "trial_name": null,
413
+ "trial_params": null
414
+ }
checkpoints-v5.8/checkpoint-10240/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8469bbc81a2ba0be2f5b44007faafd15c75615abe30f4f4e56171816d31caa5b
3
+ size 5137