Attila1011 commited on
Commit
c985c7c
·
verified ·
1 Parent(s): 9c36c25

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  checkpoints-v5/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs -text
37
  checkpoints-v5.1/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs -text
 
 
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  checkpoints-v5/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs -text
37
  checkpoints-v5.1/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs -text
38
+ checkpoints-v5.2/checkpoint-11264/eval_state.json filter=lfs diff=lfs merge=lfs -text
checkpoints-v5.2/checkpoint-11264/ema.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f2242e440eec3def700ef54a81144370c3db53b96738cabe892f4077e78668f
3
+ size 54599376
checkpoints-v5.2/checkpoint-11264/eval_state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36f1f4672fd95a45825d62eb47ac7b08bb6ba3249dc46758ac1ffb574e8f9660
3
+ size 44036748
checkpoints-v5.2/checkpoint-11264/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0c28c9d8d56b0f256f779d5c1c3c2d09e5e41b9462bec886e4b8c81c320930a
3
+ size 54599408
checkpoints-v5.2/checkpoint-11264/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:377971d6247a2ad31cdea9a913ab5da9a55b0b5db9068480ab264c26af741097
3
+ size 76550347
checkpoints-v5.2/checkpoint-11264/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45b1ca7f396cc5599f3f64fbc801b07bc6329480ea979bb7f593fcbee2c79ab9
3
+ size 14645
checkpoints-v5.2/checkpoint-11264/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c07c33eb3e2182286288b9d2aa659daed2b68835c0afb29f8d6c7aee7927a822
3
+ size 1383
checkpoints-v5.2/checkpoint-11264/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7672f8041e5b7bc5a00bd3cd6bedae5ac365dd2bdcbcd4bbaa88eefac435f928
3
+ size 1465
checkpoints-v5.2/checkpoint-11264/trainer_state.json ADDED
@@ -0,0 +1,452 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.5202531060920974,
6
+ "eval_steps": 1024,
7
+ "global_step": 11264,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.047295736917463395,
14
+ "grad_norm": 3.1038711071014404,
15
+ "learning_rate": 0.00029970703124999995,
16
+ "loss": 7.864967346191406,
17
+ "step": 1024
18
+ },
19
+ {
20
+ "epoch": 0.047295736917463395,
21
+ "eval_bleu": 0.3418101654679444,
22
+ "eval_ce_clean_loss": 0.2192277379299952,
23
+ "eval_ce_pred_loss": 3.7604843192993234,
24
+ "eval_flow_cos_loss": 0.38796317584166246,
25
+ "eval_flow_mse_loss": 0.8011491894721985,
26
+ "eval_loss": 2.99760987987257,
27
+ "flow/cos_sim": 0.6120368439585107,
28
+ "flow/improvement_ratio": 0.9814901100173933,
29
+ "flow/mag_ratio_mean": 0.48378803026458445,
30
+ "flow/mag_ratio_std": 0.15437949193667058,
31
+ "step": 1024
32
+ },
33
+ {
34
+ "epoch": 0.047295736917463395,
35
+ "eval_bleu": 0.3418101654679444,
36
+ "eval_ce_clean_loss": 0.2192277379299952,
37
+ "eval_ce_pred_loss": 3.7604843192993234,
38
+ "eval_flow_cos_loss": 0.38796317584166246,
39
+ "eval_flow_mse_loss": 0.8011491894721985,
40
+ "eval_loss": 2.99760987987257,
41
+ "eval_runtime": 197.0177,
42
+ "eval_samples_per_second": 142.084,
43
+ "eval_steps_per_second": 2.223,
44
+ "flow/cos_sim": 0.6120368439585107,
45
+ "flow/improvement_ratio": 0.9814901100173933,
46
+ "flow/mag_ratio_mean": 0.48378803026458445,
47
+ "flow/mag_ratio_std": 0.15437949193667058,
48
+ "step": 1024
49
+ },
50
+ {
51
+ "epoch": 0.09459147383492679,
52
+ "grad_norm": 1.4018265008926392,
53
+ "learning_rate": 0.00029818297300322,
54
+ "loss": 2.6447885036468506,
55
+ "step": 2048
56
+ },
57
+ {
58
+ "epoch": 0.09459147383492679,
59
+ "eval_bleu": 0.3947018649544751,
60
+ "eval_ce_clean_loss": 0.03551059079326723,
61
+ "eval_ce_pred_loss": 3.0670854468323867,
62
+ "eval_flow_cos_loss": 0.26620999992438105,
63
+ "eval_flow_mse_loss": 0.7514688725068689,
64
+ "eval_loss": 2.3870746936972282,
65
+ "flow/cos_sim": 0.733790022325298,
66
+ "flow/improvement_ratio": 0.9944704486627013,
67
+ "flow/mag_ratio_mean": 0.6799977410873866,
68
+ "flow/mag_ratio_std": 0.11552925763405077,
69
+ "step": 2048
70
+ },
71
+ {
72
+ "epoch": 0.09459147383492679,
73
+ "eval_bleu": 0.3947018649544751,
74
+ "eval_ce_clean_loss": 0.03551059079326723,
75
+ "eval_ce_pred_loss": 3.0670854468323867,
76
+ "eval_flow_cos_loss": 0.26620999992438105,
77
+ "eval_flow_mse_loss": 0.7514688725068689,
78
+ "eval_loss": 2.3870746936972282,
79
+ "eval_runtime": 192.2679,
80
+ "eval_samples_per_second": 145.594,
81
+ "eval_steps_per_second": 2.278,
82
+ "flow/cos_sim": 0.733790022325298,
83
+ "flow/improvement_ratio": 0.9944704486627013,
84
+ "flow/mag_ratio_mean": 0.6799977410873866,
85
+ "flow/mag_ratio_std": 0.11552925763405077,
86
+ "step": 2048
87
+ },
88
+ {
89
+ "epoch": 0.1418872107523902,
90
+ "grad_norm": 1.5566725730895996,
91
+ "learning_rate": 0.0002927689070858589,
92
+ "loss": 2.3424882888793945,
93
+ "step": 3072
94
+ },
95
+ {
96
+ "epoch": 0.1418872107523902,
97
+ "eval_bleu": 0.41753887403168577,
98
+ "eval_ce_clean_loss": 0.017339187425446427,
99
+ "eval_ce_pred_loss": 2.829169942091589,
100
+ "eval_flow_cos_loss": 0.14888262367684002,
101
+ "eval_flow_mse_loss": 0.7409176882271353,
102
+ "eval_loss": 2.2100625013651913,
103
+ "flow/cos_sim": 0.8511173925171159,
104
+ "flow/improvement_ratio": 0.9950010483939898,
105
+ "flow/mag_ratio_mean": 0.8375713426500695,
106
+ "flow/mag_ratio_std": 0.10142230264604364,
107
+ "step": 3072
108
+ },
109
+ {
110
+ "epoch": 0.1418872107523902,
111
+ "eval_bleu": 0.41753887403168577,
112
+ "eval_ce_clean_loss": 0.017339187425446427,
113
+ "eval_ce_pred_loss": 2.829169942091589,
114
+ "eval_flow_cos_loss": 0.14888262367684002,
115
+ "eval_flow_mse_loss": 0.7409176882271353,
116
+ "eval_loss": 2.2100625013651913,
117
+ "eval_runtime": 194.0365,
118
+ "eval_samples_per_second": 144.267,
119
+ "eval_steps_per_second": 2.257,
120
+ "flow/cos_sim": 0.8511173925171159,
121
+ "flow/improvement_ratio": 0.9950010483939898,
122
+ "flow/mag_ratio_mean": 0.8375713426500695,
123
+ "flow/mag_ratio_std": 0.10142230264604364,
124
+ "step": 3072
125
+ },
126
+ {
127
+ "epoch": 0.18918294766985358,
128
+ "grad_norm": 1.6979767084121704,
129
+ "learning_rate": 0.00028389952193475995,
130
+ "loss": 2.209465742111206,
131
+ "step": 4096
132
+ },
133
+ {
134
+ "epoch": 0.18918294766985358,
135
+ "eval_bleu": 0.4338824863011538,
136
+ "eval_ce_clean_loss": 0.010503773240491612,
137
+ "eval_ce_pred_loss": 2.7355331310398503,
138
+ "eval_flow_cos_loss": 0.10729855863743176,
139
+ "eval_flow_mse_loss": 0.7243417862071294,
140
+ "eval_loss": 2.1294367664480864,
141
+ "flow/cos_sim": 0.8927014652452513,
142
+ "flow/improvement_ratio": 0.9945996228146227,
143
+ "flow/mag_ratio_mean": 0.8834205535176682,
144
+ "flow/mag_ratio_std": 0.09435187315900032,
145
+ "step": 4096
146
+ },
147
+ {
148
+ "epoch": 0.18918294766985358,
149
+ "eval_bleu": 0.4338824863011538,
150
+ "eval_ce_clean_loss": 0.010503773240491612,
151
+ "eval_ce_pred_loss": 2.7355331310398503,
152
+ "eval_flow_cos_loss": 0.10729855863743176,
153
+ "eval_flow_mse_loss": 0.7243417862071294,
154
+ "eval_loss": 2.1294367664480864,
155
+ "eval_runtime": 193.5894,
156
+ "eval_samples_per_second": 144.6,
157
+ "eval_steps_per_second": 2.263,
158
+ "flow/cos_sim": 0.8927014652452513,
159
+ "flow/improvement_ratio": 0.9945996228146227,
160
+ "flow/mag_ratio_mean": 0.8834205535176682,
161
+ "flow/mag_ratio_std": 0.09435187315900032,
162
+ "step": 4096
163
+ },
164
+ {
165
+ "epoch": 0.236478684587317,
166
+ "grad_norm": 1.5808225870132446,
167
+ "learning_rate": 0.00027177281107320826,
168
+ "loss": 2.1267242431640625,
169
+ "step": 5120
170
+ },
171
+ {
172
+ "epoch": 0.236478684587317,
173
+ "eval_bleu": 0.4571689702049482,
174
+ "eval_ce_clean_loss": 0.007218487369381401,
175
+ "eval_ce_pred_loss": 2.582534832224998,
176
+ "eval_flow_cos_loss": 0.08806605126759777,
177
+ "eval_flow_mse_loss": 0.7072322978276641,
178
+ "eval_loss": 2.0277347072074403,
179
+ "flow/cos_sim": 0.9119339856110751,
180
+ "flow/improvement_ratio": 0.9952437794643995,
181
+ "flow/mag_ratio_mean": 0.9055254207626325,
182
+ "flow/mag_ratio_std": 0.0895098245205128,
183
+ "step": 5120
184
+ },
185
+ {
186
+ "epoch": 0.236478684587317,
187
+ "eval_bleu": 0.4571689702049482,
188
+ "eval_ce_clean_loss": 0.007218487369381401,
189
+ "eval_ce_pred_loss": 2.582534832224998,
190
+ "eval_flow_cos_loss": 0.08806605126759777,
191
+ "eval_flow_mse_loss": 0.7072322978276641,
192
+ "eval_loss": 2.0277347072074403,
193
+ "eval_runtime": 193.0062,
194
+ "eval_samples_per_second": 145.037,
195
+ "eval_steps_per_second": 2.269,
196
+ "flow/cos_sim": 0.9119339856110751,
197
+ "flow/improvement_ratio": 0.9952437794643995,
198
+ "flow/mag_ratio_mean": 0.9055254207626325,
199
+ "flow/mag_ratio_std": 0.0895098245205128,
200
+ "step": 5120
201
+ },
202
+ {
203
+ "epoch": 0.2837744215047804,
204
+ "grad_norm": 1.4207789897918701,
205
+ "learning_rate": 0.00025670620574270894,
206
+ "loss": 2.07169508934021,
207
+ "step": 6144
208
+ },
209
+ {
210
+ "epoch": 0.2837744215047804,
211
+ "eval_bleu": 0.4623355951643849,
212
+ "eval_ce_clean_loss": 0.005376053338000747,
213
+ "eval_ce_pred_loss": 2.5099684372884496,
214
+ "eval_flow_cos_loss": 0.07843345983727881,
215
+ "eval_flow_mse_loss": 0.7116553213226197,
216
+ "eval_loss": 1.9916239587683655,
217
+ "flow/cos_sim": 0.9215665681449245,
218
+ "flow/improvement_ratio": 0.9947730746443413,
219
+ "flow/mag_ratio_mean": 0.9184351398792441,
220
+ "flow/mag_ratio_std": 0.08669416432919567,
221
+ "step": 6144
222
+ },
223
+ {
224
+ "epoch": 0.2837744215047804,
225
+ "eval_bleu": 0.4623355951643849,
226
+ "eval_ce_clean_loss": 0.005376053338000747,
227
+ "eval_ce_pred_loss": 2.5099684372884496,
228
+ "eval_flow_cos_loss": 0.07843345983727881,
229
+ "eval_flow_mse_loss": 0.7116553213226197,
230
+ "eval_loss": 1.9916239587683655,
231
+ "eval_runtime": 196.543,
232
+ "eval_samples_per_second": 142.427,
233
+ "eval_steps_per_second": 2.229,
234
+ "flow/cos_sim": 0.9215665681449245,
235
+ "flow/improvement_ratio": 0.9947730746443413,
236
+ "flow/mag_ratio_mean": 0.9184351398792441,
237
+ "flow/mag_ratio_std": 0.08669416432919567,
238
+ "step": 6144
239
+ },
240
+ {
241
+ "epoch": 0.3310701584222438,
242
+ "grad_norm": 3.3632595539093018,
243
+ "learning_rate": 0.0002390360415767374,
244
+ "loss": 2.014681816101074,
245
+ "step": 7168
246
+ },
247
+ {
248
+ "epoch": 0.3310701584222438,
249
+ "eval_bleu": 0.4595571656999387,
250
+ "eval_ce_clean_loss": 0.004155274415994701,
251
+ "eval_ce_pred_loss": 2.4867921506977515,
252
+ "eval_flow_cos_loss": 0.0699176051351986,
253
+ "eval_flow_mse_loss": 0.6959788431859997,
254
+ "eval_loss": 1.9610095941312782,
255
+ "flow/cos_sim": 0.9300824132956327,
256
+ "flow/improvement_ratio": 0.9952014009702151,
257
+ "flow/mag_ratio_mean": 0.9340966411921532,
258
+ "flow/mag_ratio_std": 0.08523170804800508,
259
+ "step": 7168
260
+ },
261
+ {
262
+ "epoch": 0.3310701584222438,
263
+ "eval_bleu": 0.4595571656999387,
264
+ "eval_ce_clean_loss": 0.004155274415994701,
265
+ "eval_ce_pred_loss": 2.4867921506977515,
266
+ "eval_flow_cos_loss": 0.0699176051351986,
267
+ "eval_flow_mse_loss": 0.6959788431859997,
268
+ "eval_loss": 1.9610095941312782,
269
+ "eval_runtime": 195.7842,
270
+ "eval_samples_per_second": 142.979,
271
+ "eval_steps_per_second": 2.237,
272
+ "flow/cos_sim": 0.9300824132956327,
273
+ "flow/improvement_ratio": 0.9952014009702151,
274
+ "flow/mag_ratio_mean": 0.9340966411921532,
275
+ "flow/mag_ratio_std": 0.08523170804800508,
276
+ "step": 7168
277
+ },
278
+ {
279
+ "epoch": 0.37836589533970716,
280
+ "grad_norm": 1.9611129760742188,
281
+ "learning_rate": 0.00021922485637411153,
282
+ "loss": 1.9802610874176025,
283
+ "step": 8192
284
+ },
285
+ {
286
+ "epoch": 0.37836589533970716,
287
+ "eval_bleu": 0.4797964191826387,
288
+ "eval_ce_clean_loss": 0.00341097121557527,
289
+ "eval_ce_pred_loss": 2.4153971993215553,
290
+ "eval_flow_cos_loss": 0.06374154817398008,
291
+ "eval_flow_mse_loss": 0.6859119383439626,
292
+ "eval_loss": 1.9129569013369139,
293
+ "flow/cos_sim": 0.936258481245607,
294
+ "flow/improvement_ratio": 0.9943351938844271,
295
+ "flow/mag_ratio_mean": 0.9286861550318052,
296
+ "flow/mag_ratio_std": 0.08223920938086836,
297
+ "step": 8192
298
+ },
299
+ {
300
+ "epoch": 0.37836589533970716,
301
+ "eval_bleu": 0.4797964191826387,
302
+ "eval_ce_clean_loss": 0.00341097121557527,
303
+ "eval_ce_pred_loss": 2.4153971993215553,
304
+ "eval_flow_cos_loss": 0.06374154817398008,
305
+ "eval_flow_mse_loss": 0.6859119383439626,
306
+ "eval_loss": 1.9129569013369139,
307
+ "eval_runtime": 193.8032,
308
+ "eval_samples_per_second": 144.44,
309
+ "eval_steps_per_second": 2.26,
310
+ "flow/cos_sim": 0.936258481245607,
311
+ "flow/improvement_ratio": 0.9943351938844271,
312
+ "flow/mag_ratio_mean": 0.9286861550318052,
313
+ "flow/mag_ratio_std": 0.08223920938086836,
314
+ "step": 8192
315
+ },
316
+ {
317
+ "epoch": 0.4256616322571706,
318
+ "grad_norm": 1.6621181964874268,
319
+ "learning_rate": 0.00019771490022228366,
320
+ "loss": 1.9591320753097534,
321
+ "step": 9216
322
+ },
323
+ {
324
+ "epoch": 0.4256616322571706,
325
+ "eval_bleu": 0.4845638244135935,
326
+ "eval_ce_clean_loss": 0.0029391252050882197,
327
+ "eval_ce_pred_loss": 2.36602204493736,
328
+ "eval_flow_cos_loss": 0.061222414167164126,
329
+ "eval_flow_mse_loss": 0.6859568288064983,
330
+ "eval_loss": 1.8872125840078207,
331
+ "flow/cos_sim": 0.9387776077338005,
332
+ "flow/improvement_ratio": 0.9949078940909747,
333
+ "flow/mag_ratio_mean": 0.9425890755979982,
334
+ "flow/mag_ratio_std": 0.08194239209639971,
335
+ "step": 9216
336
+ },
337
+ {
338
+ "epoch": 0.4256616322571706,
339
+ "eval_bleu": 0.4845638244135935,
340
+ "eval_ce_clean_loss": 0.0029391252050882197,
341
+ "eval_ce_pred_loss": 2.36602204493736,
342
+ "eval_flow_cos_loss": 0.061222414167164126,
343
+ "eval_flow_mse_loss": 0.6859568288064983,
344
+ "eval_loss": 1.8872125840078207,
345
+ "eval_runtime": 193.8927,
346
+ "eval_samples_per_second": 144.374,
347
+ "eval_steps_per_second": 2.259,
348
+ "flow/cos_sim": 0.9387776077338005,
349
+ "flow/improvement_ratio": 0.9949078940909747,
350
+ "flow/mag_ratio_mean": 0.9425890755979982,
351
+ "flow/mag_ratio_std": 0.08194239209639971,
352
+ "step": 9216
353
+ },
354
+ {
355
+ "epoch": 0.472957369174634,
356
+ "grad_norm": 1.4548691511154175,
357
+ "learning_rate": 0.00017506922209339053,
358
+ "loss": 1.9363099336624146,
359
+ "step": 10240
360
+ },
361
+ {
362
+ "epoch": 0.472957369174634,
363
+ "eval_bleu": 0.48901069646205675,
364
+ "eval_ce_clean_loss": 0.0027231096204202282,
365
+ "eval_ce_pred_loss": 2.3451035591565312,
366
+ "eval_flow_cos_loss": 0.05783578853833077,
367
+ "eval_flow_mse_loss": 0.6834459889969325,
368
+ "eval_loss": 1.8731798227519205,
369
+ "flow/cos_sim": 0.9421642368242621,
370
+ "flow/improvement_ratio": 0.9953297391601893,
371
+ "flow/mag_ratio_mean": 0.9415649660101765,
372
+ "flow/mag_ratio_std": 0.08121724838399452,
373
+ "step": 10240
374
+ },
375
+ {
376
+ "epoch": 0.472957369174634,
377
+ "eval_bleu": 0.48901069646205675,
378
+ "eval_ce_clean_loss": 0.0027231096204202282,
379
+ "eval_ce_pred_loss": 2.3451035591565312,
380
+ "eval_flow_cos_loss": 0.05783578853833077,
381
+ "eval_flow_mse_loss": 0.6834459889969325,
382
+ "eval_loss": 1.8731798227519205,
383
+ "eval_runtime": 192.701,
384
+ "eval_samples_per_second": 145.266,
385
+ "eval_steps_per_second": 2.273,
386
+ "flow/cos_sim": 0.9421642368242621,
387
+ "flow/improvement_ratio": 0.9953297391601893,
388
+ "flow/mag_ratio_mean": 0.9415649660101765,
389
+ "flow/mag_ratio_std": 0.08121724838399452,
390
+ "step": 10240
391
+ },
392
+ {
393
+ "epoch": 0.5202531060920974,
394
+ "grad_norm": 1.612707495689392,
395
+ "learning_rate": 0.00015179334717587922,
396
+ "loss": 1.9111002683639526,
397
+ "step": 11264
398
+ },
399
+ {
400
+ "epoch": 0.5202531060920974,
401
+ "eval_bleu": 0.49133817797781537,
402
+ "eval_ce_clean_loss": 0.00239595649277549,
403
+ "eval_ce_pred_loss": 2.3280042031584265,
404
+ "eval_flow_cos_loss": 0.05585474443524124,
405
+ "eval_flow_mse_loss": 0.6858290074622795,
406
+ "eval_loss": 1.8661907554761459,
407
+ "flow/cos_sim": 0.9441452740806423,
408
+ "flow/improvement_ratio": 0.994876034727924,
409
+ "flow/mag_ratio_mean": 0.9411461739779607,
410
+ "flow/mag_ratio_std": 0.08062856834909143,
411
+ "step": 11264
412
+ },
413
+ {
414
+ "epoch": 0.5202531060920974,
415
+ "eval_bleu": 0.49133817797781537,
416
+ "eval_ce_clean_loss": 0.00239595649277549,
417
+ "eval_ce_pred_loss": 2.3280042031584265,
418
+ "eval_flow_cos_loss": 0.05585474443524124,
419
+ "eval_flow_mse_loss": 0.6858290074622795,
420
+ "eval_loss": 1.8661907554761459,
421
+ "eval_runtime": 193.6778,
422
+ "eval_samples_per_second": 144.534,
423
+ "eval_steps_per_second": 2.261,
424
+ "flow/cos_sim": 0.9441452740806423,
425
+ "flow/improvement_ratio": 0.994876034727924,
426
+ "flow/mag_ratio_mean": 0.9411461739779607,
427
+ "flow/mag_ratio_std": 0.08062856834909143,
428
+ "step": 11264
429
+ }
430
+ ],
431
+ "logging_steps": 1024,
432
+ "max_steps": 21651,
433
+ "num_input_tokens_seen": 0,
434
+ "num_train_epochs": 1,
435
+ "save_steps": 1024,
436
+ "stateful_callbacks": {
437
+ "TrainerControl": {
438
+ "args": {
439
+ "should_epoch_stop": false,
440
+ "should_evaluate": false,
441
+ "should_log": false,
442
+ "should_save": true,
443
+ "should_training_stop": false
444
+ },
445
+ "attributes": {}
446
+ }
447
+ },
448
+ "total_flos": 0.0,
449
+ "train_batch_size": 64,
450
+ "trial_name": null,
451
+ "trial_params": null
452
+ }
checkpoints-v5.2/checkpoint-11264/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e76013f70108cb4ddda27ec8328cf4ef6edec61c75334ae3d6e75eb2e082915f
3
+ size 5137