Attila1011 committed on
Commit
04d0114
·
verified ·
1 Parent(s): 8092dda

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -43,3 +43,4 @@ checkpoints-v5.5/checkpoint-16384/eval_state.json filter=lfs diff=lfs merge=lfs
43
  checkpoints-v4.4+/checkpoint-7168/eval_state.json filter=lfs diff=lfs merge=lfs -text
44
  checkpoints-v5.5/checkpoint-24576/eval_state.json filter=lfs diff=lfs merge=lfs -text
45
  checkpoints-v5.6/checkpoint-4096/eval_state.json filter=lfs diff=lfs merge=lfs -text
 
 
43
  checkpoints-v4.4+/checkpoint-7168/eval_state.json filter=lfs diff=lfs merge=lfs -text
44
  checkpoints-v5.5/checkpoint-24576/eval_state.json filter=lfs diff=lfs merge=lfs -text
45
  checkpoints-v5.6/checkpoint-4096/eval_state.json filter=lfs diff=lfs merge=lfs -text
46
+ checkpoints-v5.6/checkpoint-14336/eval_state.json filter=lfs diff=lfs merge=lfs -text
checkpoints-v5.6/checkpoint-14336/ema.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f06b4cb6ab59aa4e5cfc27af41908925b25c9a1919b4fffb002fccd47fb83dd5
3
+ size 54599592
checkpoints-v5.6/checkpoint-14336/eval_state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:faec1ef62aef8e8e3974a59295f05c1d11d0e69ead203ad058247a8ada06311a
3
+ size 58408752
checkpoints-v5.6/checkpoint-14336/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d658416497177d36a2a33f05ef6c531a5d65993c70c4a9d3138885f6e36463f9
3
+ size 54599624
checkpoints-v5.6/checkpoint-14336/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f9fc4bfb123840f05ffc8b78cccd5d95190ec220ea328196eefeced9dbf54af
3
+ size 76550347
checkpoints-v5.6/checkpoint-14336/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c84b0b910e410334974b64277a31060a45a30a6af56f606c7714596a8a3a85d
3
+ size 14645
checkpoints-v5.6/checkpoint-14336/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79701bbdc0c49714e0e085b5c67881fa167d2bf0d010e2195ebdf057057d72dd
3
+ size 1383
checkpoints-v5.6/checkpoint-14336/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fea187e43031df6b1b3d8be1593743f072013fecb64ec1a061b4e69f4994c94
3
+ size 1465
checkpoints-v5.6/checkpoint-14336/trainer_state.json ADDED
@@ -0,0 +1,566 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.6178378261900145,
6
+ "eval_steps": 1024,
7
+ "global_step": 14336,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.04413127329928675,
14
+ "grad_norm": 2.024094581604004,
15
+ "learning_rate": 9.990234375e-05,
16
+ "loss": 13.011528968811035,
17
+ "step": 1024
18
+ },
19
+ {
20
+ "epoch": 0.04413127329928675,
21
+ "eval_bleu": 0.13973491511585304,
22
+ "eval_ce_clean_loss": 2.321801658886582,
23
+ "eval_ce_pred_loss": 5.398845981687371,
24
+ "eval_flow_cos_loss": 0.4359625861970092,
25
+ "eval_flow_mse_loss": 1.261753735003441,
26
+ "eval_loss": 7.471738189014037,
27
+ "flow/cos_sim": 0.5640374244784495,
28
+ "flow/improvement_ratio": 0.9125845870737836,
29
+ "flow/mag_ratio_mean": 0.005239322681281803,
30
+ "flow/mag_ratio_std": 0.0019533936416349018,
31
+ "step": 1024
32
+ },
33
+ {
34
+ "epoch": 0.04413127329928675,
35
+ "eval_bleu": 0.13973491511585304,
36
+ "eval_ce_clean_loss": 2.321801658886582,
37
+ "eval_ce_pred_loss": 5.398845981687371,
38
+ "eval_flow_cos_loss": 0.4359625861970092,
39
+ "eval_flow_mse_loss": 1.261753735003441,
40
+ "eval_loss": 7.471738189014037,
41
+ "eval_runtime": 200.3486,
42
+ "eval_samples_per_second": 149.739,
43
+ "eval_steps_per_second": 2.341,
44
+ "flow/cos_sim": 0.5640374244784495,
45
+ "flow/improvement_ratio": 0.9125845870737836,
46
+ "flow/mag_ratio_mean": 0.005239322681281803,
47
+ "flow/mag_ratio_std": 0.0019533936416349018,
48
+ "step": 1024
49
+ },
50
+ {
51
+ "epoch": 0.0882625465985735,
52
+ "grad_norm": 2.3836026191711426,
53
+ "learning_rate": 9.9476028157316e-05,
54
+ "loss": 5.076037406921387,
55
+ "step": 2048
56
+ },
57
+ {
58
+ "epoch": 0.0882625465985735,
59
+ "eval_bleu": 0.334789938692836,
60
+ "eval_ce_clean_loss": 0.34960190961355847,
61
+ "eval_ce_pred_loss": 3.634908381301457,
62
+ "eval_flow_cos_loss": 0.4467242890075326,
63
+ "eval_flow_mse_loss": 1.0751057558222366,
64
+ "eval_loss": 4.080824585611632,
65
+ "flow/cos_sim": 0.5532757186177951,
66
+ "flow/improvement_ratio": 0.988054730363492,
67
+ "flow/mag_ratio_mean": 0.4422806202094438,
68
+ "flow/mag_ratio_std": 0.11935382749416681,
69
+ "step": 2048
70
+ },
71
+ {
72
+ "epoch": 0.0882625465985735,
73
+ "eval_bleu": 0.334789938692836,
74
+ "eval_ce_clean_loss": 0.34960190961355847,
75
+ "eval_ce_pred_loss": 3.634908381301457,
76
+ "eval_flow_cos_loss": 0.4467242890075326,
77
+ "eval_flow_mse_loss": 1.0751057558222366,
78
+ "eval_loss": 4.080824585611632,
79
+ "eval_runtime": 198.1742,
80
+ "eval_samples_per_second": 151.382,
81
+ "eval_steps_per_second": 2.367,
82
+ "flow/cos_sim": 0.5532757186177951,
83
+ "flow/improvement_ratio": 0.988054730363492,
84
+ "flow/mag_ratio_mean": 0.4422806202094438,
85
+ "flow/mag_ratio_std": 0.11935382749416681,
86
+ "step": 2048
87
+ },
88
+ {
89
+ "epoch": 0.13239381989786023,
90
+ "grad_norm": 0.8187930583953857,
91
+ "learning_rate": 9.791307026072513e-05,
92
+ "loss": 3.823514938354492,
93
+ "step": 3072
94
+ },
95
+ {
96
+ "epoch": 0.13239381989786023,
97
+ "eval_bleu": 0.372073011692056,
98
+ "eval_ce_clean_loss": 0.13431392060414052,
99
+ "eval_ce_pred_loss": 3.167514971832731,
100
+ "eval_flow_cos_loss": 0.3423818138870857,
101
+ "eval_flow_mse_loss": 1.1092233459578393,
102
+ "eval_loss": 3.5463931677438048,
103
+ "flow/cos_sim": 0.6576181853503815,
104
+ "flow/improvement_ratio": 0.9944826788993787,
105
+ "flow/mag_ratio_mean": 0.6025184103166625,
106
+ "flow/mag_ratio_std": 0.11320645424094536,
107
+ "step": 3072
108
+ },
109
+ {
110
+ "epoch": 0.13239381989786023,
111
+ "eval_bleu": 0.372073011692056,
112
+ "eval_ce_clean_loss": 0.13431392060414052,
113
+ "eval_ce_pred_loss": 3.167514971832731,
114
+ "eval_flow_cos_loss": 0.3423818138870857,
115
+ "eval_flow_mse_loss": 1.1092233459578393,
116
+ "eval_loss": 3.5463931677438048,
117
+ "eval_runtime": 199.1274,
118
+ "eval_samples_per_second": 150.657,
119
+ "eval_steps_per_second": 2.355,
120
+ "flow/cos_sim": 0.6576181853503815,
121
+ "flow/improvement_ratio": 0.9944826788993787,
122
+ "flow/mag_ratio_mean": 0.6025184103166625,
123
+ "flow/mag_ratio_std": 0.11320645424094536,
124
+ "step": 3072
125
+ },
126
+ {
127
+ "epoch": 0.176525093197147,
128
+ "grad_norm": 1.596978783607483,
129
+ "learning_rate": 9.53439476074686e-05,
130
+ "loss": 3.4521546363830566,
131
+ "step": 4096
132
+ },
133
+ {
134
+ "epoch": 0.176525093197147,
135
+ "eval_bleu": 0.3922768378761595,
136
+ "eval_ce_clean_loss": 0.06992586965023327,
137
+ "eval_ce_pred_loss": 2.9707689447951977,
138
+ "eval_flow_cos_loss": 0.18835138766241988,
139
+ "eval_flow_mse_loss": 1.08429173365839,
140
+ "eval_loss": 3.2808436879725345,
141
+ "flow/cos_sim": 0.8116486336884976,
142
+ "flow/improvement_ratio": 0.9945465602091889,
143
+ "flow/mag_ratio_mean": 0.7910831776509153,
144
+ "flow/mag_ratio_std": 0.09716444489544135,
145
+ "step": 4096
146
+ },
147
+ {
148
+ "epoch": 0.176525093197147,
149
+ "eval_bleu": 0.3922768378761595,
150
+ "eval_ce_clean_loss": 0.06992586965023327,
151
+ "eval_ce_pred_loss": 2.9707689447951977,
152
+ "eval_flow_cos_loss": 0.18835138766241988,
153
+ "eval_flow_mse_loss": 1.08429173365839,
154
+ "eval_loss": 3.2808436879725345,
155
+ "eval_runtime": 199.2079,
156
+ "eval_samples_per_second": 150.596,
157
+ "eval_steps_per_second": 2.354,
158
+ "flow/cos_sim": 0.8116486336884976,
159
+ "flow/improvement_ratio": 0.9945465602091889,
160
+ "flow/mag_ratio_mean": 0.7910831776509153,
161
+ "flow/mag_ratio_std": 0.09716444489544135,
162
+ "step": 4096
163
+ },
164
+ {
165
+ "epoch": 0.22065636649643372,
166
+ "grad_norm": 0.8742256760597229,
167
+ "learning_rate": 9.18264920723673e-05,
168
+ "loss": 3.2747244834899902,
169
+ "step": 5120
170
+ },
171
+ {
172
+ "epoch": 0.22065636649643372,
173
+ "eval_bleu": 0.39936848850149986,
174
+ "eval_ce_clean_loss": 0.04289639472310096,
175
+ "eval_ce_pred_loss": 2.848231249780797,
176
+ "eval_flow_cos_loss": 0.13245032197122636,
177
+ "eval_flow_mse_loss": 1.07859928343596,
178
+ "eval_loss": 3.148370101253615,
179
+ "flow/cos_sim": 0.867549685908279,
180
+ "flow/improvement_ratio": 0.9947950637925154,
181
+ "flow/mag_ratio_mean": 0.8595841084716163,
182
+ "flow/mag_ratio_std": 0.08974277026363528,
183
+ "step": 5120
184
+ },
185
+ {
186
+ "epoch": 0.22065636649643372,
187
+ "eval_bleu": 0.39936848850149986,
188
+ "eval_ce_clean_loss": 0.04289639472310096,
189
+ "eval_ce_pred_loss": 2.848231249780797,
190
+ "eval_flow_cos_loss": 0.13245032197122636,
191
+ "eval_flow_mse_loss": 1.07859928343596,
192
+ "eval_loss": 3.148370101253615,
193
+ "eval_runtime": 207.1188,
194
+ "eval_samples_per_second": 144.844,
195
+ "eval_steps_per_second": 2.264,
196
+ "flow/cos_sim": 0.867549685908279,
197
+ "flow/improvement_ratio": 0.9947950637925154,
198
+ "flow/mag_ratio_mean": 0.8595841084716163,
199
+ "flow/mag_ratio_std": 0.08974277026363528,
200
+ "step": 5120
201
+ },
202
+ {
203
+ "epoch": 0.26478763979572045,
204
+ "grad_norm": 1.2167394161224365,
205
+ "learning_rate": 8.742770483354739e-05,
206
+ "loss": 3.171807050704956,
207
+ "step": 6144
208
+ },
209
+ {
210
+ "epoch": 0.26478763979572045,
211
+ "eval_bleu": 0.42114677949263607,
212
+ "eval_ce_clean_loss": 0.02962892580983926,
213
+ "eval_ce_pred_loss": 2.721838645081022,
214
+ "eval_flow_cos_loss": 0.11157757617326687,
215
+ "eval_flow_mse_loss": 1.0874363405109724,
216
+ "eval_loss": 3.0502466992782886,
217
+ "flow/cos_sim": 0.8884224528188644,
218
+ "flow/improvement_ratio": 0.994537014061454,
219
+ "flow/mag_ratio_mean": 0.882329723601148,
220
+ "flow/mag_ratio_std": 0.08614625519653882,
221
+ "step": 6144
222
+ },
223
+ {
224
+ "epoch": 0.26478763979572045,
225
+ "eval_bleu": 0.42114677949263607,
226
+ "eval_ce_clean_loss": 0.02962892580983926,
227
+ "eval_ce_pred_loss": 2.721838645081022,
228
+ "eval_flow_cos_loss": 0.11157757617326687,
229
+ "eval_flow_mse_loss": 1.0874363405109724,
230
+ "eval_loss": 3.0502466992782886,
231
+ "eval_runtime": 201.5414,
232
+ "eval_samples_per_second": 148.853,
233
+ "eval_steps_per_second": 2.327,
234
+ "flow/cos_sim": 0.8884224528188644,
235
+ "flow/improvement_ratio": 0.994537014061454,
236
+ "flow/mag_ratio_mean": 0.882329723601148,
237
+ "flow/mag_ratio_std": 0.08614625519653882,
238
+ "step": 6144
239
+ },
240
+ {
241
+ "epoch": 0.30891891309500724,
242
+ "grad_norm": 1.7619376182556152,
243
+ "learning_rate": 8.22483558761947e-05,
244
+ "loss": 3.073448896408081,
245
+ "step": 7168
246
+ },
247
+ {
248
+ "epoch": 0.30891891309500724,
249
+ "eval_bleu": 0.4370876849406368,
250
+ "eval_ce_clean_loss": 0.02222317859911715,
251
+ "eval_ce_pred_loss": 2.6402354730980226,
252
+ "eval_flow_cos_loss": 0.10016548941765767,
253
+ "eval_flow_mse_loss": 1.0842415976371846,
254
+ "eval_loss": 2.979670949582098,
255
+ "flow/cos_sim": 0.8998345271356578,
256
+ "flow/improvement_ratio": 0.9956297869367132,
257
+ "flow/mag_ratio_mean": 0.8991066802030941,
258
+ "flow/mag_ratio_std": 0.08455248013424721,
259
+ "step": 7168
260
+ },
261
+ {
262
+ "epoch": 0.30891891309500724,
263
+ "eval_bleu": 0.4370876849406368,
264
+ "eval_ce_clean_loss": 0.02222317859911715,
265
+ "eval_ce_pred_loss": 2.6402354730980226,
266
+ "eval_flow_cos_loss": 0.10016548941765767,
267
+ "eval_flow_mse_loss": 1.0842415976371846,
268
+ "eval_loss": 2.979670949582098,
269
+ "eval_runtime": 203.5883,
270
+ "eval_samples_per_second": 147.356,
271
+ "eval_steps_per_second": 2.304,
272
+ "flow/cos_sim": 0.8998345271356578,
273
+ "flow/improvement_ratio": 0.9956297869367132,
274
+ "flow/mag_ratio_mean": 0.8991066802030941,
275
+ "flow/mag_ratio_std": 0.08455248013424721,
276
+ "step": 7168
277
+ },
278
+ {
279
+ "epoch": 0.353050186394294,
280
+ "grad_norm": 1.8936859369277954,
281
+ "learning_rate": 7.639311770076283e-05,
282
+ "loss": 3.0209758281707764,
283
+ "step": 8192
284
+ },
285
+ {
286
+ "epoch": 0.353050186394294,
287
+ "eval_bleu": 0.4462124473905187,
288
+ "eval_ce_clean_loss": 0.017575070791160947,
289
+ "eval_ce_pred_loss": 2.586446302277701,
290
+ "eval_flow_cos_loss": 0.09220715248381406,
291
+ "eval_flow_mse_loss": 1.0823260478373529,
292
+ "eval_loss": 2.933465297288224,
293
+ "flow/cos_sim": 0.9077928694072308,
294
+ "flow/improvement_ratio": 0.9947618440524347,
295
+ "flow/mag_ratio_mean": 0.8988318181495423,
296
+ "flow/mag_ratio_std": 0.08265599157256104,
297
+ "step": 8192
298
+ },
299
+ {
300
+ "epoch": 0.353050186394294,
301
+ "eval_bleu": 0.4462124473905187,
302
+ "eval_ce_clean_loss": 0.017575070791160947,
303
+ "eval_ce_pred_loss": 2.586446302277701,
304
+ "eval_flow_cos_loss": 0.09220715248381406,
305
+ "eval_flow_mse_loss": 1.0823260478373529,
306
+ "eval_loss": 2.933465297288224,
307
+ "eval_runtime": 205.9836,
308
+ "eval_samples_per_second": 145.643,
309
+ "eval_steps_per_second": 2.277,
310
+ "flow/cos_sim": 0.9077928694072308,
311
+ "flow/improvement_ratio": 0.9947618440524347,
312
+ "flow/mag_ratio_mean": 0.8988318181495423,
313
+ "flow/mag_ratio_std": 0.08265599157256104,
314
+ "step": 8192
315
+ },
316
+ {
317
+ "epoch": 0.3971814596935807,
318
+ "grad_norm": 1.6591744422912598,
319
+ "learning_rate": 6.997821756319211e-05,
320
+ "loss": 2.97641921043396,
321
+ "step": 9216
322
+ },
323
+ {
324
+ "epoch": 0.3971814596935807,
325
+ "eval_bleu": 0.4548893990902779,
326
+ "eval_ce_clean_loss": 0.014087324782308421,
327
+ "eval_ce_pred_loss": 2.5131919986403575,
328
+ "eval_flow_cos_loss": 0.08636398176585179,
329
+ "eval_flow_mse_loss": 1.073010201901515,
330
+ "eval_loss": 2.867922893211023,
331
+ "flow/cos_sim": 0.9136360249539682,
332
+ "flow/improvement_ratio": 0.9953014714631445,
333
+ "flow/mag_ratio_mean": 0.9083017234100716,
334
+ "flow/mag_ratio_std": 0.08202346263409678,
335
+ "step": 9216
336
+ },
337
+ {
338
+ "epoch": 0.3971814596935807,
339
+ "eval_bleu": 0.4548893990902779,
340
+ "eval_ce_clean_loss": 0.014087324782308421,
341
+ "eval_ce_pred_loss": 2.5131919986403575,
342
+ "eval_flow_cos_loss": 0.08636398176585179,
343
+ "eval_flow_mse_loss": 1.073010201901515,
344
+ "eval_loss": 2.867922893211023,
345
+ "eval_runtime": 205.454,
346
+ "eval_samples_per_second": 146.018,
347
+ "eval_steps_per_second": 2.283,
348
+ "flow/cos_sim": 0.9136360249539682,
349
+ "flow/improvement_ratio": 0.9953014714631445,
350
+ "flow/mag_ratio_mean": 0.9083017234100716,
351
+ "flow/mag_ratio_std": 0.08202346263409678,
352
+ "step": 9216
353
+ },
354
+ {
355
+ "epoch": 0.44131273299286744,
356
+ "grad_norm": 2.6923091411590576,
357
+ "learning_rate": 6.315061173955019e-05,
358
+ "loss": 2.924818515777588,
359
+ "step": 10240
360
+ },
361
+ {
362
+ "epoch": 0.44131273299286744,
363
+ "eval_bleu": 0.4700031951689637,
364
+ "eval_ce_clean_loss": 0.011653113022033593,
365
+ "eval_ce_pred_loss": 2.5109867228627967,
366
+ "eval_flow_cos_loss": 0.08097743840296386,
367
+ "eval_flow_mse_loss": 1.0633719702010977,
368
+ "eval_loss": 2.8529601102190485,
369
+ "flow/cos_sim": 0.9190225931627156,
370
+ "flow/improvement_ratio": 0.9958856232893238,
371
+ "flow/mag_ratio_mean": 0.9080375746877463,
372
+ "flow/mag_ratio_std": 0.07999324749337076,
373
+ "step": 10240
374
+ },
375
+ {
376
+ "epoch": 0.44131273299286744,
377
+ "eval_bleu": 0.4700031951689637,
378
+ "eval_ce_clean_loss": 0.011653113022033593,
379
+ "eval_ce_pred_loss": 2.5109867228627967,
380
+ "eval_flow_cos_loss": 0.08097743840296386,
381
+ "eval_flow_mse_loss": 1.0633719702010977,
382
+ "eval_loss": 2.8529601102190485,
383
+ "eval_runtime": 203.8367,
384
+ "eval_samples_per_second": 147.177,
385
+ "eval_steps_per_second": 2.301,
386
+ "flow/cos_sim": 0.9190225931627156,
387
+ "flow/improvement_ratio": 0.9958856232893238,
388
+ "flow/mag_ratio_mean": 0.9080375746877463,
389
+ "flow/mag_ratio_std": 0.07999324749337076,
390
+ "step": 10240
391
+ },
392
+ {
393
+ "epoch": 0.4854440062921542,
394
+ "grad_norm": 3.9323318004608154,
395
+ "learning_rate": 5.604035379537632e-05,
396
+ "loss": 2.895775556564331,
397
+ "step": 11264
398
+ },
399
+ {
400
+ "epoch": 0.4854440062921542,
401
+ "eval_bleu": 0.4627536040081024,
402
+ "eval_ce_clean_loss": 0.009901690839934769,
403
+ "eval_ce_pred_loss": 2.51465770151061,
404
+ "eval_flow_cos_loss": 0.07650925458939091,
405
+ "eval_flow_mse_loss": 1.041963977218945,
406
+ "eval_loss": 2.8312533544833216,
407
+ "flow/cos_sim": 0.9234907618209497,
408
+ "flow/improvement_ratio": 0.9943228665191227,
409
+ "flow/mag_ratio_mean": 0.9196320714981063,
410
+ "flow/mag_ratio_std": 0.07909934378381986,
411
+ "step": 11264
412
+ },
413
+ {
414
+ "epoch": 0.4854440062921542,
415
+ "eval_bleu": 0.4627536040081024,
416
+ "eval_ce_clean_loss": 0.009901690839934769,
417
+ "eval_ce_pred_loss": 2.51465770151061,
418
+ "eval_flow_cos_loss": 0.07650925458939091,
419
+ "eval_flow_mse_loss": 1.041963977218945,
420
+ "eval_loss": 2.8312533544833216,
421
+ "eval_runtime": 203.4569,
422
+ "eval_samples_per_second": 147.451,
423
+ "eval_steps_per_second": 2.305,
424
+ "flow/cos_sim": 0.9234907618209497,
425
+ "flow/improvement_ratio": 0.9943228665191227,
426
+ "flow/mag_ratio_mean": 0.9196320714981063,
427
+ "flow/mag_ratio_std": 0.07909934378381986,
428
+ "step": 11264
429
+ },
430
+ {
431
+ "epoch": 0.5295752795914409,
432
+ "grad_norm": 1.9154846668243408,
433
+ "learning_rate": 4.881032966918056e-05,
434
+ "loss": 2.879368305206299,
435
+ "step": 12288
436
+ },
437
+ {
438
+ "epoch": 0.5295752795914409,
439
+ "eval_bleu": 0.4667095049087505,
440
+ "eval_ce_clean_loss": 0.008838278966258838,
441
+ "eval_ce_pred_loss": 2.4798952907895737,
442
+ "eval_flow_cos_loss": 0.07372279336521112,
443
+ "eval_flow_mse_loss": 1.0375378379689606,
444
+ "eval_loss": 2.8007334864724167,
445
+ "flow/cos_sim": 0.9262772229180407,
446
+ "flow/improvement_ratio": 0.9949917554346992,
447
+ "flow/mag_ratio_mean": 0.9212620670098994,
448
+ "flow/mag_ratio_std": 0.07803667050752558,
449
+ "step": 12288
450
+ },
451
+ {
452
+ "epoch": 0.5295752795914409,
453
+ "eval_bleu": 0.4667095049087505,
454
+ "eval_ce_clean_loss": 0.008838278966258838,
455
+ "eval_ce_pred_loss": 2.4798952907895737,
456
+ "eval_flow_cos_loss": 0.07372279336521112,
457
+ "eval_flow_mse_loss": 1.0375378379689606,
458
+ "eval_loss": 2.8007334864724167,
459
+ "eval_runtime": 202.9744,
460
+ "eval_samples_per_second": 147.802,
461
+ "eval_steps_per_second": 2.311,
462
+ "flow/cos_sim": 0.9262772229180407,
463
+ "flow/improvement_ratio": 0.9949917554346992,
464
+ "flow/mag_ratio_mean": 0.9212620670098994,
465
+ "flow/mag_ratio_std": 0.07803667050752558,
466
+ "step": 12288
467
+ },
468
+ {
469
+ "epoch": 0.5737065528907277,
470
+ "grad_norm": 2.099470376968384,
471
+ "learning_rate": 4.159825826870804e-05,
472
+ "loss": 2.84110951423645,
473
+ "step": 13312
474
+ },
475
+ {
476
+ "epoch": 0.5737065528907277,
477
+ "eval_bleu": 0.4697937916499978,
478
+ "eval_ce_clean_loss": 0.008110439863556357,
479
+ "eval_ce_pred_loss": 2.445156754207001,
480
+ "eval_flow_cos_loss": 0.07064609612419662,
481
+ "eval_flow_mse_loss": 1.030803042942527,
482
+ "eval_loss": 2.7681847221053233,
483
+ "flow/cos_sim": 0.9293539301672978,
484
+ "flow/improvement_ratio": 0.9955378606883701,
485
+ "flow/mag_ratio_mean": 0.9253565624578676,
486
+ "flow/mag_ratio_std": 0.07775539313869945,
487
+ "step": 13312
488
+ },
489
+ {
490
+ "epoch": 0.5737065528907277,
491
+ "eval_bleu": 0.4697937916499978,
492
+ "eval_ce_clean_loss": 0.008110439863556357,
493
+ "eval_ce_pred_loss": 2.445156754207001,
494
+ "eval_flow_cos_loss": 0.07064609612419662,
495
+ "eval_flow_mse_loss": 1.030803042942527,
496
+ "eval_loss": 2.7681847221053233,
497
+ "eval_runtime": 202.522,
498
+ "eval_samples_per_second": 148.132,
499
+ "eval_steps_per_second": 2.316,
500
+ "flow/cos_sim": 0.9293539301672978,
501
+ "flow/improvement_ratio": 0.9955378606883701,
502
+ "flow/mag_ratio_mean": 0.9253565624578676,
503
+ "flow/mag_ratio_std": 0.07775539313869945,
504
+ "step": 13312
505
+ },
506
+ {
507
+ "epoch": 0.6178378261900145,
508
+ "grad_norm": 5.420510292053223,
509
+ "learning_rate": 3.456935793454373e-05,
510
+ "loss": 2.829457998275757,
511
+ "step": 14336
512
+ },
513
+ {
514
+ "epoch": 0.6178378261900145,
515
+ "eval_bleu": 0.47719369632160064,
516
+ "eval_ce_clean_loss": 0.007385544868102714,
517
+ "eval_ce_pred_loss": 2.418667951880742,
518
+ "eval_flow_cos_loss": 0.06978921110886754,
519
+ "eval_flow_mse_loss": 1.0323392859399954,
520
+ "eval_loss": 2.7502396734538617,
521
+ "flow/cos_sim": 0.9302108099719862,
522
+ "flow/improvement_ratio": 0.9951674874657507,
523
+ "flow/mag_ratio_mean": 0.9268048170533008,
524
+ "flow/mag_ratio_std": 0.07696014680842092,
525
+ "step": 14336
526
+ },
527
+ {
528
+ "epoch": 0.6178378261900145,
529
+ "eval_bleu": 0.47719369632160064,
530
+ "eval_ce_clean_loss": 0.007385544868102714,
531
+ "eval_ce_pred_loss": 2.418667951880742,
532
+ "eval_flow_cos_loss": 0.06978921110886754,
533
+ "eval_flow_mse_loss": 1.0323392859399954,
534
+ "eval_loss": 2.7502396734538617,
535
+ "eval_runtime": 202.3423,
536
+ "eval_samples_per_second": 148.264,
537
+ "eval_steps_per_second": 2.318,
538
+ "flow/cos_sim": 0.9302108099719862,
539
+ "flow/improvement_ratio": 0.9951674874657507,
540
+ "flow/mag_ratio_mean": 0.9268048170533008,
541
+ "flow/mag_ratio_std": 0.07696014680842092,
542
+ "step": 14336
543
+ }
544
+ ],
545
+ "logging_steps": 1024,
546
+ "max_steps": 23204,
547
+ "num_input_tokens_seen": 0,
548
+ "num_train_epochs": 1,
549
+ "save_steps": 1024,
550
+ "stateful_callbacks": {
551
+ "TrainerControl": {
552
+ "args": {
553
+ "should_epoch_stop": false,
554
+ "should_evaluate": false,
555
+ "should_log": false,
556
+ "should_save": true,
557
+ "should_training_stop": false
558
+ },
559
+ "attributes": {}
560
+ }
561
+ },
562
+ "total_flos": 0.0,
563
+ "train_batch_size": 64,
564
+ "trial_name": null,
565
+ "trial_params": null
566
+ }
checkpoints-v5.6/checkpoint-14336/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8469bbc81a2ba0be2f5b44007faafd15c75615abe30f4f4e56171816d31caa5b
3
+ size 5137