Attila1011 commited on
Commit
3f654ac
·
verified ·
1 Parent(s): 6251938

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -63,3 +63,4 @@ checkpoints-v5.13/checkpoint-10240/eval_state.json filter=lfs diff=lfs merge=lfs
63
  checkpoints-v5.13-b/checkpoint-10240/eval_state.json filter=lfs diff=lfs merge=lfs -text
64
  checkpoints-v4.6+/checkpoint-13312/eval_state.json filter=lfs diff=lfs merge=lfs -text
65
  checkpoints-v4.6+/checkpoint-21504/eval_state.json filter=lfs diff=lfs merge=lfs -text
 
 
63
  checkpoints-v5.13-b/checkpoint-10240/eval_state.json filter=lfs diff=lfs merge=lfs -text
64
  checkpoints-v4.6+/checkpoint-13312/eval_state.json filter=lfs diff=lfs merge=lfs -text
65
  checkpoints-v4.6+/checkpoint-21504/eval_state.json filter=lfs diff=lfs merge=lfs -text
66
+ checkpoints-v5.13-c/checkpoint-11264/eval_state.json filter=lfs diff=lfs merge=lfs -text
checkpoints-v5.13-c/checkpoint-11264/ema.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1564b6d4f63286e8acd1dbfd5fa20cb1b509e064a1e5b274796da0667851918
3
+ size 54599376
checkpoints-v5.13-c/checkpoint-11264/eval_state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec55c41e08740816ca781d286a8218ba7e4eb6abc62abe0616616de4bc214c66
3
+ size 56142817
checkpoints-v5.13-c/checkpoint-11264/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f50acab56735a8798bce69ef09b870381291802102711f85e36f72b1422b88d4
3
+ size 54599408
checkpoints-v5.13-c/checkpoint-11264/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94c1957c5ec8c566f7ab98aacdc1ad546885ea44bde850fa889c1c22911bf0aa
3
+ size 76550347
checkpoints-v5.13-c/checkpoint-11264/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3445e2029753be8bf32790ad2df6092dc63f2028e7603a9903f650b4f19cbab4
3
+ size 14645
checkpoints-v5.13-c/checkpoint-11264/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:519e45a628186bdbcb09e971ade5f402fb12e109ab075b9f69e0e6257f05f429
3
+ size 1383
checkpoints-v5.13-c/checkpoint-11264/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21c700f8c7da85aa9d4c7cb7c4f2fe1f5cc1460165fd5b9ff6c072c94729a07e
3
+ size 1465
checkpoints-v5.13-c/checkpoint-11264/trainer_state.json ADDED
@@ -0,0 +1,430 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.4854440062921542,
6
+ "eval_steps": 1024,
7
+ "global_step": 11264,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.04413127329928675,
14
+ "grad_norm": 1.8746798038482666,
15
+ "learning_rate": 9.990234375e-05,
16
+ "loss": 11.335476875305176,
17
+ "step": 1024
18
+ },
19
+ {
20
+ "epoch": 0.04413127329928675,
21
+ "eval_bleu": 0.009675771979359812,
22
+ "eval_ce_clean_loss": 5.557726876313753,
23
+ "eval_ce_pred_loss": 6.627119795345803,
24
+ "eval_flow_mse_loss": 0.5579553266832316,
25
+ "eval_loss": 8.41214029926227,
26
+ "flow/cos_sim": 0.7140552083820676,
27
+ "flow/improvement_ratio": 0.9909712693838677,
28
+ "flow/mag_ratio_mean": 0.6811908379292437,
29
+ "flow/mag_ratio_std": 0.12295716503726394,
30
+ "step": 1024
31
+ },
32
+ {
33
+ "epoch": 0.04413127329928675,
34
+ "eval_bleu": 0.009675771979359812,
35
+ "eval_ce_clean_loss": 5.557726876313753,
36
+ "eval_ce_pred_loss": 6.627119795345803,
37
+ "eval_flow_mse_loss": 0.5579553266832316,
38
+ "eval_loss": 8.41214029926227,
39
+ "eval_runtime": 209.168,
40
+ "eval_samples_per_second": 143.425,
41
+ "eval_steps_per_second": 2.242,
42
+ "flow/cos_sim": 0.7140552083820676,
43
+ "flow/improvement_ratio": 0.9909712693838677,
44
+ "flow/mag_ratio_mean": 0.6811908379292437,
45
+ "flow/mag_ratio_std": 0.12295716503726394,
46
+ "step": 1024
47
+ },
48
+ {
49
+ "epoch": 0.0882625465985735,
50
+ "grad_norm": 1.7705103158950806,
51
+ "learning_rate": 9.9476028157316e-05,
52
+ "loss": 7.429768085479736,
53
+ "step": 2048
54
+ },
55
+ {
56
+ "epoch": 0.0882625465985735,
57
+ "eval_bleu": 0.06525691509427865,
58
+ "eval_ce_clean_loss": 2.860388283028023,
59
+ "eval_ce_pred_loss": 5.240253638611165,
60
+ "eval_flow_mse_loss": 0.45478164507890306,
61
+ "eval_loss": 6.6618063353272134,
62
+ "flow/cos_sim": 0.8016271380219124,
63
+ "flow/improvement_ratio": 0.9943794359022112,
64
+ "flow/mag_ratio_mean": 0.7808856908192259,
65
+ "flow/mag_ratio_std": 0.13155144340257402,
66
+ "step": 2048
67
+ },
68
+ {
69
+ "epoch": 0.0882625465985735,
70
+ "eval_bleu": 0.06525691509427865,
71
+ "eval_ce_clean_loss": 2.860388283028023,
72
+ "eval_ce_pred_loss": 5.240253638611165,
73
+ "eval_flow_mse_loss": 0.45478164507890306,
74
+ "eval_loss": 6.6618063353272134,
75
+ "eval_runtime": 203.4463,
76
+ "eval_samples_per_second": 147.459,
77
+ "eval_steps_per_second": 2.305,
78
+ "flow/cos_sim": 0.8016271380219124,
79
+ "flow/improvement_ratio": 0.9943794359022112,
80
+ "flow/mag_ratio_mean": 0.7808856908192259,
81
+ "flow/mag_ratio_std": 0.13155144340257402,
82
+ "step": 2048
83
+ },
84
+ {
85
+ "epoch": 0.13239381989786023,
86
+ "grad_norm": 5.890896797180176,
87
+ "learning_rate": 9.791307026072513e-05,
88
+ "loss": 6.48097038269043,
89
+ "step": 3072
90
+ },
91
+ {
92
+ "epoch": 0.13239381989786023,
93
+ "eval_bleu": 0.1386991635405654,
94
+ "eval_ce_clean_loss": 1.702302875040945,
95
+ "eval_ce_pred_loss": 4.602994439952663,
96
+ "eval_flow_mse_loss": 0.4992917452666805,
97
+ "eval_loss": 6.134915745334585,
98
+ "flow/cos_sim": 0.7965366973805783,
99
+ "flow/improvement_ratio": 0.9947982580422847,
100
+ "flow/mag_ratio_mean": 0.7845185148690555,
101
+ "flow/mag_ratio_std": 0.125204219508654,
102
+ "step": 3072
103
+ },
104
+ {
105
+ "epoch": 0.13239381989786023,
106
+ "eval_bleu": 0.1386991635405654,
107
+ "eval_ce_clean_loss": 1.702302875040945,
108
+ "eval_ce_pred_loss": 4.602994439952663,
109
+ "eval_flow_mse_loss": 0.4992917452666805,
110
+ "eval_loss": 6.134915745334585,
111
+ "eval_runtime": 203.864,
112
+ "eval_samples_per_second": 147.157,
113
+ "eval_steps_per_second": 2.301,
114
+ "flow/cos_sim": 0.7965366973805783,
115
+ "flow/improvement_ratio": 0.9947982580422847,
116
+ "flow/mag_ratio_mean": 0.7845185148690555,
117
+ "flow/mag_ratio_std": 0.125204219508654,
118
+ "step": 3072
119
+ },
120
+ {
121
+ "epoch": 0.176525093197147,
122
+ "grad_norm": 4.509634494781494,
123
+ "learning_rate": 9.53439476074686e-05,
124
+ "loss": 6.0418806076049805,
125
+ "step": 4096
126
+ },
127
+ {
128
+ "epoch": 0.176525093197147,
129
+ "eval_bleu": 0.19298153591252967,
130
+ "eval_ce_clean_loss": 1.0902007042980397,
131
+ "eval_ce_pred_loss": 4.217226792500218,
132
+ "eval_flow_mse_loss": 0.5135862580748763,
133
+ "eval_loss": 5.779789580973481,
134
+ "flow/cos_sim": 0.8044289251380383,
135
+ "flow/improvement_ratio": 0.9947967400937192,
136
+ "flow/mag_ratio_mean": 0.7930461671560812,
137
+ "flow/mag_ratio_std": 0.10969439625485873,
138
+ "step": 4096
139
+ },
140
+ {
141
+ "epoch": 0.176525093197147,
142
+ "eval_bleu": 0.19298153591252967,
143
+ "eval_ce_clean_loss": 1.0902007042980397,
144
+ "eval_ce_pred_loss": 4.217226792500218,
145
+ "eval_flow_mse_loss": 0.5135862580748763,
146
+ "eval_loss": 5.779789580973481,
147
+ "eval_runtime": 207.3299,
148
+ "eval_samples_per_second": 144.697,
149
+ "eval_steps_per_second": 2.262,
150
+ "flow/cos_sim": 0.8044289251380383,
151
+ "flow/improvement_ratio": 0.9947967400937192,
152
+ "flow/mag_ratio_mean": 0.7930461671560812,
153
+ "flow/mag_ratio_std": 0.10969439625485873,
154
+ "step": 4096
155
+ },
156
+ {
157
+ "epoch": 0.22065636649643372,
158
+ "grad_norm": 2.1950623989105225,
159
+ "learning_rate": 9.182261125213742e-05,
160
+ "loss": 5.785842418670654,
161
+ "step": 5120
162
+ },
163
+ {
164
+ "epoch": 0.22065636649643372,
165
+ "eval_bleu": 0.22607963706845424,
166
+ "eval_ce_clean_loss": 0.7473960584923148,
167
+ "eval_ce_pred_loss": 4.035209667962243,
168
+ "eval_flow_mse_loss": 0.5162926226028247,
169
+ "eval_loss": 5.599035447086099,
170
+ "flow/cos_sim": 0.8121183847567675,
171
+ "flow/improvement_ratio": 0.9950191848821985,
172
+ "flow/mag_ratio_mean": 0.8036270247085262,
173
+ "flow/mag_ratio_std": 0.09918517022054078,
174
+ "step": 5120
175
+ },
176
+ {
177
+ "epoch": 0.22065636649643372,
178
+ "eval_bleu": 0.22607963706845424,
179
+ "eval_ce_clean_loss": 0.7473960584923148,
180
+ "eval_ce_pred_loss": 4.035209667962243,
181
+ "eval_flow_mse_loss": 0.5162926226028247,
182
+ "eval_loss": 5.599035447086099,
183
+ "eval_runtime": 208.4843,
184
+ "eval_samples_per_second": 143.896,
185
+ "eval_steps_per_second": 2.25,
186
+ "flow/cos_sim": 0.8121183847567675,
187
+ "flow/improvement_ratio": 0.9950191848821985,
188
+ "flow/mag_ratio_mean": 0.8036270247085262,
189
+ "flow/mag_ratio_std": 0.09918517022054078,
190
+ "step": 5120
191
+ },
192
+ {
193
+ "epoch": 0.26478763979572045,
194
+ "grad_norm": 2.527547597885132,
195
+ "learning_rate": 8.742770483354739e-05,
196
+ "loss": 5.640994548797607,
197
+ "step": 6144
198
+ },
199
+ {
200
+ "epoch": 0.26478763979572045,
201
+ "eval_bleu": 0.25024694158657995,
202
+ "eval_ce_clean_loss": 0.5482296061032871,
203
+ "eval_ce_pred_loss": 3.863517429782892,
204
+ "eval_flow_mse_loss": 0.528831982917623,
205
+ "eval_loss": 5.460977965072274,
206
+ "flow/cos_sim": 0.8133240915310662,
207
+ "flow/improvement_ratio": 0.9949406057532663,
208
+ "flow/mag_ratio_mean": 0.8068851480351836,
209
+ "flow/mag_ratio_std": 0.09499853814461592,
210
+ "step": 6144
211
+ },
212
+ {
213
+ "epoch": 0.26478763979572045,
214
+ "eval_bleu": 0.25024694158657995,
215
+ "eval_ce_clean_loss": 0.5482296061032871,
216
+ "eval_ce_pred_loss": 3.863517429782892,
217
+ "eval_flow_mse_loss": 0.528831982917623,
218
+ "eval_loss": 5.460977965072274,
219
+ "eval_runtime": 207.4649,
220
+ "eval_samples_per_second": 144.603,
221
+ "eval_steps_per_second": 2.261,
222
+ "flow/cos_sim": 0.8133240915310662,
223
+ "flow/improvement_ratio": 0.9949406057532663,
224
+ "flow/mag_ratio_mean": 0.8068851480351836,
225
+ "flow/mag_ratio_std": 0.09499853814461592,
226
+ "step": 6144
227
+ },
228
+ {
229
+ "epoch": 0.30891891309500724,
230
+ "grad_norm": 5.2498579025268555,
231
+ "learning_rate": 8.224294338515429e-05,
232
+ "loss": 5.5075578689575195,
233
+ "step": 7168
234
+ },
235
+ {
236
+ "epoch": 0.30891891309500724,
237
+ "eval_bleu": 0.261643210772413,
238
+ "eval_ce_clean_loss": 0.41339854340055093,
239
+ "eval_ce_pred_loss": 3.799610239102134,
240
+ "eval_flow_mse_loss": 0.5193964989581851,
241
+ "eval_loss": 5.366067711478357,
242
+ "flow/cos_sim": 0.8201883253512352,
243
+ "flow/improvement_ratio": 0.9960052086346185,
244
+ "flow/mag_ratio_mean": 0.8124456131127852,
245
+ "flow/mag_ratio_std": 0.09041677221560529,
246
+ "step": 7168
247
+ },
248
+ {
249
+ "epoch": 0.30891891309500724,
250
+ "eval_bleu": 0.261643210772413,
251
+ "eval_ce_clean_loss": 0.41339854340055093,
252
+ "eval_ce_pred_loss": 3.799610239102134,
253
+ "eval_flow_mse_loss": 0.5193964989581851,
254
+ "eval_loss": 5.366067711478357,
255
+ "eval_runtime": 209.2917,
256
+ "eval_samples_per_second": 143.341,
257
+ "eval_steps_per_second": 2.241,
258
+ "flow/cos_sim": 0.8201883253512352,
259
+ "flow/improvement_ratio": 0.9960052086346185,
260
+ "flow/mag_ratio_mean": 0.8124456131127852,
261
+ "flow/mag_ratio_std": 0.09041677221560529,
262
+ "step": 7168
263
+ },
264
+ {
265
+ "epoch": 0.353050186394294,
266
+ "grad_norm": 2.9191665649414062,
267
+ "learning_rate": 7.638710244802891e-05,
268
+ "loss": 5.437148571014404,
269
+ "step": 8192
270
+ },
271
+ {
272
+ "epoch": 0.353050186394294,
273
+ "eval_bleu": 0.28415719827555364,
274
+ "eval_ce_clean_loss": 0.3232654255590459,
275
+ "eval_ce_pred_loss": 3.6486374203330163,
276
+ "eval_flow_mse_loss": 0.5425256899933317,
277
+ "eval_loss": 5.282679809944462,
278
+ "flow/cos_sim": 0.8185702726276699,
279
+ "flow/improvement_ratio": 0.9952687219516047,
280
+ "flow/mag_ratio_mean": 0.8090884212746041,
281
+ "flow/mag_ratio_std": 0.08737777136981106,
282
+ "step": 8192
283
+ },
284
+ {
285
+ "epoch": 0.353050186394294,
286
+ "eval_bleu": 0.28415719827555364,
287
+ "eval_ce_clean_loss": 0.3232654255590459,
288
+ "eval_ce_pred_loss": 3.6486374203330163,
289
+ "eval_flow_mse_loss": 0.5425256899933317,
290
+ "eval_loss": 5.282679809944462,
291
+ "eval_runtime": 208.6577,
292
+ "eval_samples_per_second": 143.776,
293
+ "eval_steps_per_second": 2.248,
294
+ "flow/cos_sim": 0.8185702726276699,
295
+ "flow/improvement_ratio": 0.9952687219516047,
296
+ "flow/mag_ratio_mean": 0.8090884212746041,
297
+ "flow/mag_ratio_std": 0.08737777136981106,
298
+ "step": 8192
299
+ },
300
+ {
301
+ "epoch": 0.3971814596935807,
302
+ "grad_norm": 2.471464157104492,
303
+ "learning_rate": 6.997172522088177e-05,
304
+ "loss": 5.372776985168457,
305
+ "step": 9216
306
+ },
307
+ {
308
+ "epoch": 0.3971814596935807,
309
+ "eval_bleu": 0.29918716664058487,
310
+ "eval_ce_clean_loss": 0.2656148538342925,
311
+ "eval_ce_pred_loss": 3.554174758732192,
312
+ "eval_flow_mse_loss": 0.550302877227889,
313
+ "eval_loss": 5.210395683866066,
314
+ "flow/cos_sim": 0.8200526506916038,
315
+ "flow/improvement_ratio": 0.9958431662272796,
316
+ "flow/mag_ratio_mean": 0.8059873981262321,
317
+ "flow/mag_ratio_std": 0.08501417428127993,
318
+ "step": 9216
319
+ },
320
+ {
321
+ "epoch": 0.3971814596935807,
322
+ "eval_bleu": 0.29918716664058487,
323
+ "eval_ce_clean_loss": 0.2656148538342925,
324
+ "eval_ce_pred_loss": 3.554174758732192,
325
+ "eval_flow_mse_loss": 0.550302877227889,
326
+ "eval_loss": 5.210395683866066,
327
+ "eval_runtime": 208.1178,
328
+ "eval_samples_per_second": 144.149,
329
+ "eval_steps_per_second": 2.254,
330
+ "flow/cos_sim": 0.8200526506916038,
331
+ "flow/improvement_ratio": 0.9958431662272796,
332
+ "flow/mag_ratio_mean": 0.8059873981262321,
333
+ "flow/mag_ratio_std": 0.08501417428127993,
334
+ "step": 9216
335
+ },
336
+ {
337
+ "epoch": 0.44131273299286744,
338
+ "grad_norm": 1.976932168006897,
339
+ "learning_rate": 6.314377890922702e-05,
340
+ "loss": 5.303058624267578,
341
+ "step": 10240
342
+ },
343
+ {
344
+ "epoch": 0.44131273299286744,
345
+ "eval_bleu": 0.30946760464565337,
346
+ "eval_ce_clean_loss": 0.21070461149917227,
347
+ "eval_ce_pred_loss": 3.506126376357414,
348
+ "eval_flow_mse_loss": 0.548877762578952,
349
+ "eval_loss": 5.156973743235379,
350
+ "flow/cos_sim": 0.8226377320950473,
351
+ "flow/improvement_ratio": 0.9961782912455642,
352
+ "flow/mag_ratio_mean": 0.8068670463968696,
353
+ "flow/mag_ratio_std": 0.08314187864441352,
354
+ "step": 10240
355
+ },
356
+ {
357
+ "epoch": 0.44131273299286744,
358
+ "eval_bleu": 0.30946760464565337,
359
+ "eval_ce_clean_loss": 0.21070461149917227,
360
+ "eval_ce_pred_loss": 3.506126376357414,
361
+ "eval_flow_mse_loss": 0.548877762578952,
362
+ "eval_loss": 5.156973743235379,
363
+ "eval_runtime": 208.8429,
364
+ "eval_samples_per_second": 143.649,
365
+ "eval_steps_per_second": 2.246,
366
+ "flow/cos_sim": 0.8226377320950473,
367
+ "flow/improvement_ratio": 0.9961782912455642,
368
+ "flow/mag_ratio_mean": 0.8068670463968696,
369
+ "flow/mag_ratio_std": 0.08314187864441352,
370
+ "step": 10240
371
+ },
372
+ {
373
+ "epoch": 0.4854440062921542,
374
+ "grad_norm": 3.2680165767669678,
375
+ "learning_rate": 5.603332356428589e-05,
376
+ "loss": 5.236223220825195,
377
+ "step": 11264
378
+ },
379
+ {
380
+ "epoch": 0.4854440062921542,
381
+ "eval_bleu": 0.312954579012866,
382
+ "eval_ce_clean_loss": 0.18226672127556953,
383
+ "eval_ce_pred_loss": 3.534468324962201,
384
+ "eval_flow_mse_loss": 0.5367274030185203,
385
+ "eval_loss": 5.148295871230331,
386
+ "flow/cos_sim": 0.8278299589146937,
387
+ "flow/improvement_ratio": 0.994940139464478,
388
+ "flow/mag_ratio_mean": 0.8156169755880767,
389
+ "flow/mag_ratio_std": 0.08319824889524659,
390
+ "step": 11264
391
+ },
392
+ {
393
+ "epoch": 0.4854440062921542,
394
+ "eval_bleu": 0.312954579012866,
395
+ "eval_ce_clean_loss": 0.18226672127556953,
396
+ "eval_ce_pred_loss": 3.534468324962201,
397
+ "eval_flow_mse_loss": 0.5367274030185203,
398
+ "eval_loss": 5.148295871230331,
399
+ "eval_runtime": 207.6767,
400
+ "eval_samples_per_second": 144.455,
401
+ "eval_steps_per_second": 2.258,
402
+ "flow/cos_sim": 0.8278299589146937,
403
+ "flow/improvement_ratio": 0.994940139464478,
404
+ "flow/mag_ratio_mean": 0.8156169755880767,
405
+ "flow/mag_ratio_std": 0.08319824889524659,
406
+ "step": 11264
407
+ }
408
+ ],
409
+ "logging_steps": 1024,
410
+ "max_steps": 23204,
411
+ "num_input_tokens_seen": 0,
412
+ "num_train_epochs": 1,
413
+ "save_steps": 1024,
414
+ "stateful_callbacks": {
415
+ "TrainerControl": {
416
+ "args": {
417
+ "should_epoch_stop": false,
418
+ "should_evaluate": false,
419
+ "should_log": false,
420
+ "should_save": true,
421
+ "should_training_stop": false
422
+ },
423
+ "attributes": {}
424
+ }
425
+ },
426
+ "total_flos": 0.0,
427
+ "train_batch_size": 64,
428
+ "trial_name": null,
429
+ "trial_params": null
430
+ }
checkpoints-v5.13-c/checkpoint-11264/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8469bbc81a2ba0be2f5b44007faafd15c75615abe30f4f4e56171816d31caa5b
3
+ size 5137