Attila1011 commited on
Commit
b7c586c
·
verified ·
1 Parent(s): b3faac2

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -48,3 +48,4 @@ checkpoints-v5.7/checkpoint-11264/eval_state.json filter=lfs diff=lfs merge=lfs
48
  checkpoints-v5.8/checkpoint-10240/eval_state.json filter=lfs diff=lfs merge=lfs -text
49
  checkpoints-v4.5/checkpoint-7168/eval_state.json filter=lfs diff=lfs merge=lfs -text
50
  checkpoints-v4.5/checkpoint-15360/eval_state.json filter=lfs diff=lfs merge=lfs -text
 
 
48
  checkpoints-v5.8/checkpoint-10240/eval_state.json filter=lfs diff=lfs merge=lfs -text
49
  checkpoints-v4.5/checkpoint-7168/eval_state.json filter=lfs diff=lfs merge=lfs -text
50
  checkpoints-v4.5/checkpoint-15360/eval_state.json filter=lfs diff=lfs merge=lfs -text
51
+ checkpoints-v5.9/checkpoint-11264/eval_state.json filter=lfs diff=lfs merge=lfs -text
checkpoints-v5.9/checkpoint-11264/ema.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d50c375b4adfdb6701b14ccf091d1af64e58a8bf6bc3b23d0b0f0572c48d2c88
3
+ size 54599592
checkpoints-v5.9/checkpoint-11264/eval_state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2a5c0a3ba0a45dd9a72c8140e6ddaf369fd02a427295653585d7c8b8422b3de
3
+ size 59623194
checkpoints-v5.9/checkpoint-11264/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b6d9dd468e268146a81f58330f565165ba042cb8cd627c90c26564638bcaf8f
3
+ size 54599624
checkpoints-v5.9/checkpoint-11264/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3a6467fedb0537962ac0470155c14409beedb357936bfeb0262ddbfa5a31e88
3
+ size 76550347
checkpoints-v5.9/checkpoint-11264/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3445e2029753be8bf32790ad2df6092dc63f2028e7603a9903f650b4f19cbab4
3
+ size 14645
checkpoints-v5.9/checkpoint-11264/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1763c6e6bb9d17ce1be6965092741353bef140a65f63acbf2d5abb2b85825bac
3
+ size 1383
checkpoints-v5.9/checkpoint-11264/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19ad511dc53ec97f3489767f5cff37a551a58fb728bc82956d6ea6fdb5514518
3
+ size 1465
checkpoints-v5.9/checkpoint-11264/trainer_state.json ADDED
@@ -0,0 +1,430 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.4854440062921542,
6
+ "eval_steps": 1024,
7
+ "global_step": 11264,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.04413127329928675,
14
+ "grad_norm": 15.239693641662598,
15
+ "learning_rate": 9.990234375e-05,
16
+ "loss": 13.825879096984863,
17
+ "step": 1024
18
+ },
19
+ {
20
+ "epoch": 0.04413127329928675,
21
+ "eval_bleu": 0.03856809374192324,
22
+ "eval_ce_clean_loss": 4.292109883924537,
23
+ "eval_ce_pred_loss": 7.204727572418734,
24
+ "eval_flow_mse_loss": 0.6720071626878751,
25
+ "eval_loss": 10.00742630257027,
26
+ "flow/cos_sim": 0.6058795800341218,
27
+ "flow/improvement_ratio": 0.9756911718514937,
28
+ "flow/mag_ratio_mean": 0.5525405872096893,
29
+ "flow/mag_ratio_std": 0.1515131940814986,
30
+ "step": 1024
31
+ },
32
+ {
33
+ "epoch": 0.04413127329928675,
34
+ "eval_bleu": 0.03856809374192324,
35
+ "eval_ce_clean_loss": 4.292109883924537,
36
+ "eval_ce_pred_loss": 7.204727572418734,
37
+ "eval_flow_mse_loss": 0.6720071626878751,
38
+ "eval_loss": 10.00742630257027,
39
+ "eval_runtime": 196.3774,
40
+ "eval_samples_per_second": 152.767,
41
+ "eval_steps_per_second": 2.388,
42
+ "flow/cos_sim": 0.6058795800341218,
43
+ "flow/improvement_ratio": 0.9756911718514937,
44
+ "flow/mag_ratio_mean": 0.5525405872096893,
45
+ "flow/mag_ratio_std": 0.1515131940814986,
46
+ "step": 1024
47
+ },
48
+ {
49
+ "epoch": 0.0882625465985735,
50
+ "grad_norm": 14.527959823608398,
51
+ "learning_rate": 9.947705025097448e-05,
52
+ "loss": 8.282852172851562,
53
+ "step": 2048
54
+ },
55
+ {
56
+ "epoch": 0.0882625465985735,
57
+ "eval_bleu": 0.04835653057106903,
58
+ "eval_ce_clean_loss": 2.0321871253218986,
59
+ "eval_ce_pred_loss": 6.679203874266732,
60
+ "eval_flow_mse_loss": 0.34213011389348047,
61
+ "eval_loss": 7.049759871924102,
62
+ "flow/cos_sim": 0.8224498067837535,
63
+ "flow/improvement_ratio": 0.9910931860460147,
64
+ "flow/mag_ratio_mean": 0.7947272139825801,
65
+ "flow/mag_ratio_std": 0.1784994681951588,
66
+ "step": 2048
67
+ },
68
+ {
69
+ "epoch": 0.0882625465985735,
70
+ "eval_bleu": 0.04835653057106903,
71
+ "eval_ce_clean_loss": 2.0321871253218986,
72
+ "eval_ce_pred_loss": 6.679203874266732,
73
+ "eval_flow_mse_loss": 0.34213011389348047,
74
+ "eval_loss": 7.049759871924102,
75
+ "eval_runtime": 190.6373,
76
+ "eval_samples_per_second": 157.367,
77
+ "eval_steps_per_second": 2.46,
78
+ "flow/cos_sim": 0.8224498067837535,
79
+ "flow/improvement_ratio": 0.9910931860460147,
80
+ "flow/mag_ratio_mean": 0.7947272139825801,
81
+ "flow/mag_ratio_std": 0.1784994681951588,
82
+ "step": 2048
83
+ },
84
+ {
85
+ "epoch": 0.13239381989786023,
86
+ "grad_norm": 18.57774543762207,
87
+ "learning_rate": 9.791711775587888e-05,
88
+ "loss": 6.476203441619873,
89
+ "step": 3072
90
+ },
91
+ {
92
+ "epoch": 0.13239381989786023,
93
+ "eval_bleu": 0.046949984747013486,
94
+ "eval_ce_clean_loss": 0.9904567614547225,
95
+ "eval_ce_pred_loss": 6.729919616601615,
96
+ "eval_flow_mse_loss": 0.2399635909081522,
97
+ "eval_loss": 5.941364000601046,
98
+ "flow/cos_sim": 0.8910990282416598,
99
+ "flow/improvement_ratio": 0.9930302517246336,
100
+ "flow/mag_ratio_mean": 0.879739438038645,
101
+ "flow/mag_ratio_std": 0.15084654318371307,
102
+ "step": 3072
103
+ },
104
+ {
105
+ "epoch": 0.13239381989786023,
106
+ "eval_bleu": 0.046949984747013486,
107
+ "eval_ce_clean_loss": 0.9904567614547225,
108
+ "eval_ce_pred_loss": 6.729919616601615,
109
+ "eval_flow_mse_loss": 0.2399635909081522,
110
+ "eval_loss": 5.941364000601046,
111
+ "eval_runtime": 193.1899,
112
+ "eval_samples_per_second": 155.288,
113
+ "eval_steps_per_second": 2.428,
114
+ "flow/cos_sim": 0.8910990282416598,
115
+ "flow/improvement_ratio": 0.9930302517246336,
116
+ "flow/mag_ratio_mean": 0.879739438038645,
117
+ "flow/mag_ratio_std": 0.15084654318371307,
118
+ "step": 3072
119
+ },
120
+ {
121
+ "epoch": 0.176525093197147,
122
+ "grad_norm": 17.850786209106445,
123
+ "learning_rate": 9.534991440649608e-05,
124
+ "loss": 5.697450637817383,
125
+ "step": 4096
126
+ },
127
+ {
128
+ "epoch": 0.176525093197147,
129
+ "eval_bleu": 0.0394065103208499,
130
+ "eval_ce_clean_loss": 0.47789636076386294,
131
+ "eval_ce_pred_loss": 6.793904790491946,
132
+ "eval_flow_mse_loss": 0.19059975353131162,
133
+ "eval_loss": 5.424229394144087,
134
+ "flow/cos_sim": 0.9323588246221481,
135
+ "flow/improvement_ratio": 0.9941833016714816,
136
+ "flow/mag_ratio_mean": 0.9235222680228097,
137
+ "flow/mag_ratio_std": 0.12112796954762961,
138
+ "step": 4096
139
+ },
140
+ {
141
+ "epoch": 0.176525093197147,
142
+ "eval_bleu": 0.0394065103208499,
143
+ "eval_ce_clean_loss": 0.47789636076386294,
144
+ "eval_ce_pred_loss": 6.793904790491946,
145
+ "eval_flow_mse_loss": 0.19059975353131162,
146
+ "eval_loss": 5.424229394144087,
147
+ "eval_runtime": 194.4713,
148
+ "eval_samples_per_second": 154.264,
149
+ "eval_steps_per_second": 2.412,
150
+ "flow/cos_sim": 0.9323588246221481,
151
+ "flow/improvement_ratio": 0.9941833016714816,
152
+ "flow/mag_ratio_mean": 0.9235222680228097,
153
+ "flow/mag_ratio_std": 0.12112796954762961,
154
+ "step": 4096
155
+ },
156
+ {
157
+ "epoch": 0.22065636649643372,
158
+ "grad_norm": 16.39472007751465,
159
+ "learning_rate": 9.183037205346935e-05,
160
+ "loss": 5.361335277557373,
161
+ "step": 5120
162
+ },
163
+ {
164
+ "epoch": 0.22065636649643372,
165
+ "eval_bleu": 0.03285729536649342,
166
+ "eval_ce_clean_loss": 0.24329622429825348,
167
+ "eval_ce_pred_loss": 6.871835609997259,
168
+ "eval_flow_mse_loss": 0.1694319029606736,
169
+ "eval_loss": 5.223012985196958,
170
+ "flow/cos_sim": 0.9513821737852686,
171
+ "flow/improvement_ratio": 0.9946333313547472,
172
+ "flow/mag_ratio_mean": 0.9428434033892048,
173
+ "flow/mag_ratio_std": 0.09642866442897427,
174
+ "step": 5120
175
+ },
176
+ {
177
+ "epoch": 0.22065636649643372,
178
+ "eval_bleu": 0.03285729536649342,
179
+ "eval_ce_clean_loss": 0.24329622429825348,
180
+ "eval_ce_pred_loss": 6.871835609997259,
181
+ "eval_flow_mse_loss": 0.1694319029606736,
182
+ "eval_loss": 5.223012985196958,
183
+ "eval_runtime": 194.2427,
184
+ "eval_samples_per_second": 154.446,
185
+ "eval_steps_per_second": 2.415,
186
+ "flow/cos_sim": 0.9513821737852686,
187
+ "flow/improvement_ratio": 0.9946333313547472,
188
+ "flow/mag_ratio_mean": 0.9428434033892048,
189
+ "flow/mag_ratio_std": 0.09642866442897427,
190
+ "step": 5120
191
+ },
192
+ {
193
+ "epoch": 0.26478763979572045,
194
+ "grad_norm": 13.939663887023926,
195
+ "learning_rate": 8.74324003722993e-05,
196
+ "loss": 5.212263107299805,
197
+ "step": 6144
198
+ },
199
+ {
200
+ "epoch": 0.26478763979572045,
201
+ "eval_bleu": 0.027256566689703093,
202
+ "eval_ce_clean_loss": 0.1259780960646011,
203
+ "eval_ce_pred_loss": 6.923184791353465,
204
+ "eval_flow_mse_loss": 0.15653546359429735,
205
+ "eval_loss": 5.128742840244318,
206
+ "flow/cos_sim": 0.9604966702745922,
207
+ "flow/improvement_ratio": 0.9943097961991072,
208
+ "flow/mag_ratio_mean": 0.9571583648480332,
209
+ "flow/mag_ratio_std": 0.08279879235509616,
210
+ "step": 6144
211
+ },
212
+ {
213
+ "epoch": 0.26478763979572045,
214
+ "eval_bleu": 0.027256566689703093,
215
+ "eval_ce_clean_loss": 0.1259780960646011,
216
+ "eval_ce_pred_loss": 6.923184791353465,
217
+ "eval_flow_mse_loss": 0.15653546359429735,
218
+ "eval_loss": 5.128742840244318,
219
+ "eval_runtime": 193.2759,
220
+ "eval_samples_per_second": 155.219,
221
+ "eval_steps_per_second": 2.427,
222
+ "flow/cos_sim": 0.9604966702745922,
223
+ "flow/improvement_ratio": 0.9943097961991072,
224
+ "flow/mag_ratio_mean": 0.9571583648480332,
225
+ "flow/mag_ratio_std": 0.08279879235509616,
226
+ "step": 6144
227
+ },
228
+ {
229
+ "epoch": 0.30891891309500724,
230
+ "grad_norm": 16.255971908569336,
231
+ "learning_rate": 8.225917891725653e-05,
232
+ "loss": 5.156039714813232,
233
+ "step": 7168
234
+ },
235
+ {
236
+ "epoch": 0.30891891309500724,
237
+ "eval_bleu": 0.022155517814971083,
238
+ "eval_ce_clean_loss": 0.07260342659567719,
239
+ "eval_ce_pred_loss": 6.978886835102333,
240
+ "eval_flow_mse_loss": 0.14691995797571597,
241
+ "eval_loss": 5.104744104942533,
242
+ "flow/cos_sim": 0.9672264456748962,
243
+ "flow/improvement_ratio": 0.9955292942681546,
244
+ "flow/mag_ratio_mean": 0.9630893364644,
245
+ "flow/mag_ratio_std": 0.07540169008759293,
246
+ "step": 7168
247
+ },
248
+ {
249
+ "epoch": 0.30891891309500724,
250
+ "eval_bleu": 0.022155517814971083,
251
+ "eval_ce_clean_loss": 0.07260342659567719,
252
+ "eval_ce_pred_loss": 6.978886835102333,
253
+ "eval_flow_mse_loss": 0.14691995797571597,
254
+ "eval_loss": 5.104744104942533,
255
+ "eval_runtime": 195.9924,
256
+ "eval_samples_per_second": 153.067,
257
+ "eval_steps_per_second": 2.393,
258
+ "flow/cos_sim": 0.9672264456748962,
259
+ "flow/improvement_ratio": 0.9955292942681546,
260
+ "flow/mag_ratio_mean": 0.9630893364644,
261
+ "flow/mag_ratio_std": 0.07540169008759293,
262
+ "step": 7168
263
+ },
264
+ {
265
+ "epoch": 0.353050186394294,
266
+ "grad_norm": 8.994356155395508,
267
+ "learning_rate": 7.639913242399507e-05,
268
+ "loss": 5.134494781494141,
269
+ "step": 8192
270
+ },
271
+ {
272
+ "epoch": 0.353050186394294,
273
+ "eval_bleu": 0.019220227732395335,
274
+ "eval_ce_clean_loss": 0.04499363649819197,
275
+ "eval_ce_pred_loss": 7.006692920920691,
276
+ "eval_flow_mse_loss": 0.14029116780836698,
277
+ "eval_loss": 5.089969773282374,
278
+ "flow/cos_sim": 0.9713161822829419,
279
+ "flow/improvement_ratio": 0.9945664123685629,
280
+ "flow/mag_ratio_mean": 0.9694278030507346,
281
+ "flow/mag_ratio_std": 0.06895820532780467,
282
+ "step": 8192
283
+ },
284
+ {
285
+ "epoch": 0.353050186394294,
286
+ "eval_bleu": 0.019220227732395335,
287
+ "eval_ce_clean_loss": 0.04499363649819197,
288
+ "eval_ce_pred_loss": 7.006692920920691,
289
+ "eval_flow_mse_loss": 0.14029116780836698,
290
+ "eval_loss": 5.089969773282374,
291
+ "eval_runtime": 194.1667,
292
+ "eval_samples_per_second": 154.506,
293
+ "eval_steps_per_second": 2.415,
294
+ "flow/cos_sim": 0.9713161822829419,
295
+ "flow/improvement_ratio": 0.9945664123685629,
296
+ "flow/mag_ratio_mean": 0.9694278030507346,
297
+ "flow/mag_ratio_std": 0.06895820532780467,
298
+ "step": 8192
299
+ },
300
+ {
301
+ "epoch": 0.3971814596935807,
302
+ "grad_norm": 9.113734245300293,
303
+ "learning_rate": 6.998470950469718e-05,
304
+ "loss": 5.114821910858154,
305
+ "step": 9216
306
+ },
307
+ {
308
+ "epoch": 0.3971814596935807,
309
+ "eval_bleu": 0.01583632669902971,
310
+ "eval_ce_clean_loss": 0.03044994743163588,
311
+ "eval_ce_pred_loss": 7.017744996654454,
312
+ "eval_flow_mse_loss": 0.1349480300824017,
313
+ "eval_loss": 5.077819404317371,
314
+ "flow/cos_sim": 0.9751363802057847,
315
+ "flow/improvement_ratio": 0.9953071037843537,
316
+ "flow/mag_ratio_mean": 0.969198439929531,
317
+ "flow/mag_ratio_std": 0.06459524200510369,
318
+ "step": 9216
319
+ },
320
+ {
321
+ "epoch": 0.3971814596935807,
322
+ "eval_bleu": 0.01583632669902971,
323
+ "eval_ce_clean_loss": 0.03044994743163588,
324
+ "eval_ce_pred_loss": 7.017744996654454,
325
+ "eval_flow_mse_loss": 0.1349480300824017,
326
+ "eval_loss": 5.077819404317371,
327
+ "eval_runtime": 192.9017,
328
+ "eval_samples_per_second": 155.52,
329
+ "eval_steps_per_second": 2.431,
330
+ "flow/cos_sim": 0.9751363802057847,
331
+ "flow/improvement_ratio": 0.9953071037843537,
332
+ "flow/mag_ratio_mean": 0.969198439929531,
333
+ "flow/mag_ratio_std": 0.06459524200510369,
334
+ "step": 9216
335
+ },
336
+ {
337
+ "epoch": 0.44131273299286744,
338
+ "grad_norm": 7.316830158233643,
339
+ "learning_rate": 6.315061173955019e-05,
340
+ "loss": 5.1045637130737305,
341
+ "step": 10240
342
+ },
343
+ {
344
+ "epoch": 0.44131273299286744,
345
+ "eval_bleu": 0.01353516095055143,
346
+ "eval_ce_clean_loss": 0.021886831322220215,
347
+ "eval_ce_pred_loss": 7.025449516930814,
348
+ "eval_flow_mse_loss": 0.1312034219694036,
349
+ "eval_loss": 5.070904863922835,
350
+ "flow/cos_sim": 0.9770022788281634,
351
+ "flow/improvement_ratio": 0.9958924522786252,
352
+ "flow/mag_ratio_mean": 0.974764855050329,
353
+ "flow/mag_ratio_std": 0.06171431206563897,
354
+ "step": 10240
355
+ },
356
+ {
357
+ "epoch": 0.44131273299286744,
358
+ "eval_bleu": 0.01353516095055143,
359
+ "eval_ce_clean_loss": 0.021886831322220215,
360
+ "eval_ce_pred_loss": 7.025449516930814,
361
+ "eval_flow_mse_loss": 0.1312034219694036,
362
+ "eval_loss": 5.070904863922835,
363
+ "eval_runtime": 194.3566,
364
+ "eval_samples_per_second": 154.355,
365
+ "eval_steps_per_second": 2.413,
366
+ "flow/cos_sim": 0.9770022788281634,
367
+ "flow/improvement_ratio": 0.9958924522786252,
368
+ "flow/mag_ratio_mean": 0.974764855050329,
369
+ "flow/mag_ratio_std": 0.06171431206563897,
370
+ "step": 10240
371
+ },
372
+ {
373
+ "epoch": 0.4854440062921542,
374
+ "grad_norm": 8.874271392822266,
375
+ "learning_rate": 5.604738390528452e-05,
376
+ "loss": 5.102542400360107,
377
+ "step": 11264
378
+ },
379
+ {
380
+ "epoch": 0.4854440062921542,
381
+ "eval_bleu": 0.012540996821431605,
382
+ "eval_ce_clean_loss": 0.016188185467426456,
383
+ "eval_ce_pred_loss": 7.027885409560539,
384
+ "eval_flow_mse_loss": 0.12859490318402553,
385
+ "eval_loss": 5.064302796239792,
386
+ "flow/cos_sim": 0.9777249371065005,
387
+ "flow/improvement_ratio": 0.9943972062200371,
388
+ "flow/mag_ratio_mean": 0.9753310384272512,
389
+ "flow/mag_ratio_std": 0.05981211423842129,
390
+ "step": 11264
391
+ },
392
+ {
393
+ "epoch": 0.4854440062921542,
394
+ "eval_bleu": 0.012540996821431605,
395
+ "eval_ce_clean_loss": 0.016188185467426456,
396
+ "eval_ce_pred_loss": 7.027885409560539,
397
+ "eval_flow_mse_loss": 0.12859490318402553,
398
+ "eval_loss": 5.064302796239792,
399
+ "eval_runtime": 193.4331,
400
+ "eval_samples_per_second": 155.092,
401
+ "eval_steps_per_second": 2.425,
402
+ "flow/cos_sim": 0.9777249371065005,
403
+ "flow/improvement_ratio": 0.9943972062200371,
404
+ "flow/mag_ratio_mean": 0.9753310384272512,
405
+ "flow/mag_ratio_std": 0.05981211423842129,
406
+ "step": 11264
407
+ }
408
+ ],
409
+ "logging_steps": 1024,
410
+ "max_steps": 23204,
411
+ "num_input_tokens_seen": 0,
412
+ "num_train_epochs": 1,
413
+ "save_steps": 1024,
414
+ "stateful_callbacks": {
415
+ "TrainerControl": {
416
+ "args": {
417
+ "should_epoch_stop": false,
418
+ "should_evaluate": false,
419
+ "should_log": false,
420
+ "should_save": true,
421
+ "should_training_stop": false
422
+ },
423
+ "attributes": {}
424
+ }
425
+ },
426
+ "total_flos": 0.0,
427
+ "train_batch_size": 64,
428
+ "trial_name": null,
429
+ "trial_params": null
430
+ }
checkpoints-v5.9/checkpoint-11264/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8469bbc81a2ba0be2f5b44007faafd15c75615abe30f4f4e56171816d31caa5b
3
+ size 5137