Attila1011 commited on
Commit
22f4834
·
verified ·
1 Parent(s): 80678ce

Upload folder using huggingface_hub

Browse files
checkpoints-v5.5/checkpoint-30940/ema.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab35a52ed2db75e3b56628c48b92cb445c4c8dfe90f5ddf8a493611cfe988549
3
+ size 54599592
checkpoints-v5.5/checkpoint-30940/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:541e642c76012559a56b6418d96f2ca3bb4c7a2782aac5536a3db68202d9c97c
3
+ size 54599624
checkpoints-v5.5/checkpoint-30940/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70f4e7430bd49270ed946075cca8339f6fecd51f9460232bb7c11dd8d5620c46
3
+ size 76551435
checkpoints-v5.5/checkpoint-30940/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:382c7176d0b9e61c608410c27a5ee3639d2486bf9d04212c4564bfa4059c86a7
3
+ size 14645
checkpoints-v5.5/checkpoint-30940/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa608fedba598c91eda0f48e5a4ffaf0c304af022ac7644a12b0bb6c376236c0
3
+ size 1383
checkpoints-v5.5/checkpoint-30940/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36c23d6c807518dc2d7531de5444dc1f2fb3bd30675345f067304e2bcf3357b9
3
+ size 1465
checkpoints-v5.5/checkpoint-30940/trainer_state.json ADDED
@@ -0,0 +1,1174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 4.0,
6
+ "eval_steps": 1024,
7
+ "global_step": 30940,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.13239381989786023,
14
+ "grad_norm": 7.3368072509765625,
15
+ "learning_rate": 9.990234375e-05,
16
+ "loss": 13.833250999450684,
17
+ "step": 1024
18
+ },
19
+ {
20
+ "epoch": 0.13239381989786023,
21
+ "eval_bleu": 0.08843010093383347,
22
+ "eval_ce_clean_loss": 3.785046512154257,
23
+ "eval_ce_pred_loss": 6.1898151750017885,
24
+ "eval_flow_cos_loss": 0.39729490059955863,
25
+ "eval_flow_mse_loss": 1.2504585077808161,
26
+ "eval_loss": 9.467699330323821,
27
+ "flow/cos_sim": 0.6027051043358578,
28
+ "flow/improvement_ratio": 0.9944390654563904,
29
+ "flow/mag_ratio_mean": 0.6029561613775363,
30
+ "flow/mag_ratio_std": 0.06967356720357944,
31
+ "step": 1024
32
+ },
33
+ {
34
+ "epoch": 0.13239381989786023,
35
+ "eval_bleu": 0.08843010093383347,
36
+ "eval_ce_clean_loss": 3.785046512154257,
37
+ "eval_ce_pred_loss": 6.1898151750017885,
38
+ "eval_flow_cos_loss": 0.39729490059955863,
39
+ "eval_flow_mse_loss": 1.2504585077808161,
40
+ "eval_loss": 9.467699330323821,
41
+ "eval_runtime": 69.1426,
42
+ "eval_samples_per_second": 144.629,
43
+ "eval_steps_per_second": 2.271,
44
+ "flow/cos_sim": 0.6027051043358578,
45
+ "flow/improvement_ratio": 0.9944390654563904,
46
+ "flow/mag_ratio_mean": 0.6029561613775363,
47
+ "flow/mag_ratio_std": 0.06967356720357944,
48
+ "step": 1024
49
+ },
50
+ {
51
+ "epoch": 0.26478763979572045,
52
+ "grad_norm": 1.5976208448410034,
53
+ "learning_rate": 9.971175203561169e-05,
54
+ "loss": 6.556396484375,
55
+ "step": 2048
56
+ },
57
+ {
58
+ "epoch": 0.26478763979572045,
59
+ "eval_bleu": 0.2647666813502405,
60
+ "eval_ce_clean_loss": 0.6921462336922907,
61
+ "eval_ce_pred_loss": 4.159270631279915,
62
+ "eval_flow_cos_loss": 0.33016856043202103,
63
+ "eval_flow_mse_loss": 1.083006814786583,
64
+ "eval_loss": 4.769184565088551,
65
+ "flow/cos_sim": 0.6698314437441005,
66
+ "flow/improvement_ratio": 0.9946717445258122,
67
+ "flow/mag_ratio_mean": 0.6456947994839614,
68
+ "flow/mag_ratio_std": 0.08488734332239552,
69
+ "step": 2048
70
+ },
71
+ {
72
+ "epoch": 0.26478763979572045,
73
+ "eval_bleu": 0.2647666813502405,
74
+ "eval_ce_clean_loss": 0.6921462336922907,
75
+ "eval_ce_pred_loss": 4.159270631279915,
76
+ "eval_flow_cos_loss": 0.33016856043202103,
77
+ "eval_flow_mse_loss": 1.083006814786583,
78
+ "eval_loss": 4.769184565088551,
79
+ "eval_runtime": 67.8586,
80
+ "eval_samples_per_second": 147.365,
81
+ "eval_steps_per_second": 2.314,
82
+ "flow/cos_sim": 0.6698314437441005,
83
+ "flow/improvement_ratio": 0.9946717445258122,
84
+ "flow/mag_ratio_mean": 0.6456947994839614,
85
+ "flow/mag_ratio_std": 0.08488734332239552,
86
+ "step": 2048
87
+ },
88
+ {
89
+ "epoch": 0.3971814596935807,
90
+ "grad_norm": 1.0349175930023193,
91
+ "learning_rate": 9.885033161800567e-05,
92
+ "loss": 4.22868013381958,
93
+ "step": 3072
94
+ },
95
+ {
96
+ "epoch": 0.3971814596935807,
97
+ "eval_bleu": 0.3307477170153146,
98
+ "eval_ce_clean_loss": 0.21776276523140586,
99
+ "eval_ce_pred_loss": 3.5308286536271405,
100
+ "eval_flow_cos_loss": 0.3201853291244264,
101
+ "eval_flow_mse_loss": 1.0728299655732076,
102
+ "eval_loss": 3.8422190869689747,
103
+ "flow/cos_sim": 0.6798147354156349,
104
+ "flow/improvement_ratio": 0.9928556031482235,
105
+ "flow/mag_ratio_mean": 0.6495526474752243,
106
+ "flow/mag_ratio_std": 0.08990857878308388,
107
+ "step": 3072
108
+ },
109
+ {
110
+ "epoch": 0.3971814596935807,
111
+ "eval_bleu": 0.3307477170153146,
112
+ "eval_ce_clean_loss": 0.21776276523140586,
113
+ "eval_ce_pred_loss": 3.5308286536271405,
114
+ "eval_flow_cos_loss": 0.3201853291244264,
115
+ "eval_flow_mse_loss": 1.0728299655732076,
116
+ "eval_loss": 3.8422190869689747,
117
+ "eval_runtime": 67.9151,
118
+ "eval_samples_per_second": 147.243,
119
+ "eval_steps_per_second": 2.312,
120
+ "flow/cos_sim": 0.6798147354156349,
121
+ "flow/improvement_ratio": 0.9928556031482235,
122
+ "flow/mag_ratio_mean": 0.6495526474752243,
123
+ "flow/mag_ratio_std": 0.08990857878308388,
124
+ "step": 3072
125
+ },
126
+ {
127
+ "epoch": 0.5295752795914409,
128
+ "grad_norm": 2.284677028656006,
129
+ "learning_rate": 9.742400750550229e-05,
130
+ "loss": 3.7094979286193848,
131
+ "step": 4096
132
+ },
133
+ {
134
+ "epoch": 0.5295752795914409,
135
+ "eval_bleu": 0.3527590985954599,
136
+ "eval_ce_clean_loss": 0.10325595935818496,
137
+ "eval_ce_pred_loss": 3.3037764221240002,
138
+ "eval_flow_cos_loss": 0.29567099870390195,
139
+ "eval_flow_mse_loss": 1.0334580939286833,
140
+ "eval_loss": 3.5232752614719853,
141
+ "flow/cos_sim": 0.7043290168616423,
142
+ "flow/improvement_ratio": 0.9942314150227103,
143
+ "flow/mag_ratio_mean": 0.6710645951283206,
144
+ "flow/mag_ratio_std": 0.08415729860970929,
145
+ "step": 4096
146
+ },
147
+ {
148
+ "epoch": 0.5295752795914409,
149
+ "eval_bleu": 0.3527590985954599,
150
+ "eval_ce_clean_loss": 0.10325595935818496,
151
+ "eval_ce_pred_loss": 3.3037764221240002,
152
+ "eval_flow_cos_loss": 0.29567099870390195,
153
+ "eval_flow_mse_loss": 1.0334580939286833,
154
+ "eval_loss": 3.5232752614719853,
155
+ "eval_runtime": 67.527,
156
+ "eval_samples_per_second": 148.089,
157
+ "eval_steps_per_second": 2.325,
158
+ "flow/cos_sim": 0.7043290168616423,
159
+ "flow/improvement_ratio": 0.9942314150227103,
160
+ "flow/mag_ratio_mean": 0.6710645951283206,
161
+ "flow/mag_ratio_std": 0.08415729860970929,
162
+ "step": 4096
163
+ },
164
+ {
165
+ "epoch": 0.6619690994893012,
166
+ "grad_norm": 1.0625219345092773,
167
+ "learning_rate": 9.544981995345226e-05,
168
+ "loss": 3.4675893783569336,
169
+ "step": 5120
170
+ },
171
+ {
172
+ "epoch": 0.6619690994893012,
173
+ "eval_bleu": 0.37785369114831263,
174
+ "eval_ce_clean_loss": 0.0570975638404014,
175
+ "eval_ce_pred_loss": 3.1086114409622874,
176
+ "eval_flow_cos_loss": 0.2890773550340324,
177
+ "eval_flow_mse_loss": 1.0530509029983715,
178
+ "eval_loss": 3.3584457916818606,
179
+ "flow/cos_sim": 0.7109226474336757,
180
+ "flow/improvement_ratio": 0.9944721566643685,
181
+ "flow/mag_ratio_mean": 0.6796372863137798,
182
+ "flow/mag_ratio_std": 0.08220032687969268,
183
+ "step": 5120
184
+ },
185
+ {
186
+ "epoch": 0.6619690994893012,
187
+ "eval_bleu": 0.37785369114831263,
188
+ "eval_ce_clean_loss": 0.0570975638404014,
189
+ "eval_ce_pred_loss": 3.1086114409622874,
190
+ "eval_flow_cos_loss": 0.2890773550340324,
191
+ "eval_flow_mse_loss": 1.0530509029983715,
192
+ "eval_loss": 3.3584457916818606,
193
+ "eval_runtime": 67.5831,
194
+ "eval_samples_per_second": 147.966,
195
+ "eval_steps_per_second": 2.323,
196
+ "flow/cos_sim": 0.7109226474336757,
197
+ "flow/improvement_ratio": 0.9944721566643685,
198
+ "flow/mag_ratio_mean": 0.6796372863137798,
199
+ "flow/mag_ratio_std": 0.08220032687969268,
200
+ "step": 5120
201
+ },
202
+ {
203
+ "epoch": 0.7943629193871614,
204
+ "grad_norm": 1.390210747718811,
205
+ "learning_rate": 9.295057566334431e-05,
206
+ "loss": 3.3179638385772705,
207
+ "step": 6144
208
+ },
209
+ {
210
+ "epoch": 0.7943629193871614,
211
+ "eval_bleu": 0.40284924593911503,
212
+ "eval_ce_clean_loss": 0.03560329580999863,
213
+ "eval_ce_pred_loss": 2.935079864635589,
214
+ "eval_flow_cos_loss": 0.2695348363393431,
215
+ "eval_flow_mse_loss": 1.037700882383213,
216
+ "eval_loss": 3.195243762556914,
217
+ "flow/cos_sim": 0.7304651706841341,
218
+ "flow/improvement_ratio": 0.9945835288922498,
219
+ "flow/mag_ratio_mean": 0.7005888793119199,
220
+ "flow/mag_ratio_std": 0.0835665136480787,
221
+ "step": 6144
222
+ },
223
+ {
224
+ "epoch": 0.7943629193871614,
225
+ "eval_bleu": 0.40284924593911503,
226
+ "eval_ce_clean_loss": 0.03560329580999863,
227
+ "eval_ce_pred_loss": 2.935079864635589,
228
+ "eval_flow_cos_loss": 0.2695348363393431,
229
+ "eval_flow_mse_loss": 1.037700882383213,
230
+ "eval_loss": 3.195243762556914,
231
+ "eval_runtime": 67.6818,
232
+ "eval_samples_per_second": 147.75,
233
+ "eval_steps_per_second": 2.32,
234
+ "flow/cos_sim": 0.7304651706841341,
235
+ "flow/improvement_ratio": 0.9945835288922498,
236
+ "flow/mag_ratio_mean": 0.7005888793119199,
237
+ "flow/mag_ratio_std": 0.0835665136480787,
238
+ "step": 6144
239
+ },
240
+ {
241
+ "epoch": 0.9267567392850217,
242
+ "grad_norm": 1.7730882167816162,
243
+ "learning_rate": 8.995830349195804e-05,
244
+ "loss": 3.2147014141082764,
245
+ "step": 7168
246
+ },
247
+ {
248
+ "epoch": 0.9267567392850217,
249
+ "eval_bleu": 0.4108774093856199,
250
+ "eval_ce_clean_loss": 0.025004512588879105,
251
+ "eval_ce_pred_loss": 2.8478709542827243,
252
+ "eval_flow_cos_loss": 0.25457756676871307,
253
+ "eval_flow_mse_loss": 1.0318333833080948,
254
+ "eval_loss": 3.113991931745201,
255
+ "flow/cos_sim": 0.745422419469068,
256
+ "flow/improvement_ratio": 0.9949804590006542,
257
+ "flow/mag_ratio_mean": 0.7171509364607987,
258
+ "flow/mag_ratio_std": 0.08899391105600224,
259
+ "step": 7168
260
+ },
261
+ {
262
+ "epoch": 0.9267567392850217,
263
+ "eval_bleu": 0.4108774093856199,
264
+ "eval_ce_clean_loss": 0.025004512588879105,
265
+ "eval_ce_pred_loss": 2.8478709542827243,
266
+ "eval_flow_cos_loss": 0.25457756676871307,
267
+ "eval_flow_mse_loss": 1.0318333833080948,
268
+ "eval_loss": 3.113991931745201,
269
+ "eval_runtime": 67.7148,
270
+ "eval_samples_per_second": 147.678,
271
+ "eval_steps_per_second": 2.319,
272
+ "flow/cos_sim": 0.745422419469068,
273
+ "flow/improvement_ratio": 0.9949804590006542,
274
+ "flow/mag_ratio_mean": 0.7171509364607987,
275
+ "flow/mag_ratio_std": 0.08899391105600224,
276
+ "step": 7168
277
+ },
278
+ {
279
+ "epoch": 1.05908591376301,
280
+ "grad_norm": 2.208953619003296,
281
+ "learning_rate": 8.650172716103233e-05,
282
+ "loss": 3.119405508041382,
283
+ "step": 8192
284
+ },
285
+ {
286
+ "epoch": 1.05908591376301,
287
+ "eval_bleu": 0.4100243274079566,
288
+ "eval_ce_clean_loss": 0.018418982221632248,
289
+ "eval_ce_pred_loss": 2.801309669853016,
290
+ "eval_flow_cos_loss": 0.24036190160520518,
291
+ "eval_flow_mse_loss": 1.0228236701078475,
292
+ "eval_loss": 3.062249883724626,
293
+ "flow/cos_sim": 0.7596381280072935,
294
+ "flow/improvement_ratio": 0.9959618415042852,
295
+ "flow/mag_ratio_mean": 0.7364322915198697,
296
+ "flow/mag_ratio_std": 0.0917528191949152,
297
+ "step": 8192
298
+ },
299
+ {
300
+ "epoch": 1.05908591376301,
301
+ "eval_bleu": 0.4100243274079566,
302
+ "eval_ce_clean_loss": 0.018418982221632248,
303
+ "eval_ce_pred_loss": 2.801309669853016,
304
+ "eval_flow_cos_loss": 0.24036190160520518,
305
+ "eval_flow_mse_loss": 1.0228236701078475,
306
+ "eval_loss": 3.062249883724626,
307
+ "eval_runtime": 69.3159,
308
+ "eval_samples_per_second": 144.267,
309
+ "eval_steps_per_second": 2.265,
310
+ "flow/cos_sim": 0.7596381280072935,
311
+ "flow/improvement_ratio": 0.9959618415042852,
312
+ "flow/mag_ratio_mean": 0.7364322915198697,
313
+ "flow/mag_ratio_std": 0.0917528191949152,
314
+ "step": 8192
315
+ },
316
+ {
317
+ "epoch": 1.19147973366087,
318
+ "grad_norm": 1.0985896587371826,
319
+ "learning_rate": 8.263142386444264e-05,
320
+ "loss": 3.0675039291381836,
321
+ "step": 9216
322
+ },
323
+ {
324
+ "epoch": 1.19147973366087,
325
+ "eval_bleu": 0.42913809326340735,
326
+ "eval_ce_clean_loss": 0.014243369497904543,
327
+ "eval_ce_pred_loss": 2.679083149903899,
328
+ "eval_flow_cos_loss": 0.2306169035138598,
329
+ "eval_flow_mse_loss": 1.0239125793906534,
330
+ "eval_loss": 2.97116835861449,
331
+ "flow/cos_sim": 0.7693831127160674,
332
+ "flow/improvement_ratio": 0.9939722635184124,
333
+ "flow/mag_ratio_mean": 0.748531128190885,
334
+ "flow/mag_ratio_std": 0.0947970964821281,
335
+ "step": 9216
336
+ },
337
+ {
338
+ "epoch": 1.19147973366087,
339
+ "eval_bleu": 0.42913809326340735,
340
+ "eval_ce_clean_loss": 0.014243369497904543,
341
+ "eval_ce_pred_loss": 2.679083149903899,
342
+ "eval_flow_cos_loss": 0.2306169035138598,
343
+ "eval_flow_mse_loss": 1.0239125793906534,
344
+ "eval_loss": 2.97116835861449,
345
+ "eval_runtime": 67.1108,
346
+ "eval_samples_per_second": 149.007,
347
+ "eval_steps_per_second": 2.339,
348
+ "flow/cos_sim": 0.7693831127160674,
349
+ "flow/improvement_ratio": 0.9939722635184124,
350
+ "flow/mag_ratio_mean": 0.748531128190885,
351
+ "flow/mag_ratio_std": 0.0947970964821281,
352
+ "step": 9216
353
+ },
354
+ {
355
+ "epoch": 1.3238735535587303,
356
+ "grad_norm": 2.3166847229003906,
357
+ "learning_rate": 7.837697175482903e-05,
358
+ "loss": 3.002436399459839,
359
+ "step": 10240
360
+ },
361
+ {
362
+ "epoch": 1.3238735535587303,
363
+ "eval_bleu": 0.42768151912552244,
364
+ "eval_ce_clean_loss": 0.011440879840308883,
365
+ "eval_ce_pred_loss": 2.672361337455215,
366
+ "eval_flow_cos_loss": 0.2179829450739417,
367
+ "eval_flow_mse_loss": 1.0098259748926588,
368
+ "eval_loss": 2.946415506350766,
369
+ "flow/cos_sim": 0.782017103426016,
370
+ "flow/improvement_ratio": 0.9939965146362402,
371
+ "flow/mag_ratio_mean": 0.7625711093283003,
372
+ "flow/mag_ratio_std": 0.09684707447411908,
373
+ "step": 10240
374
+ },
375
+ {
376
+ "epoch": 1.3238735535587303,
377
+ "eval_bleu": 0.42768151912552244,
378
+ "eval_ce_clean_loss": 0.011440879840308883,
379
+ "eval_ce_pred_loss": 2.672361337455215,
380
+ "eval_flow_cos_loss": 0.2179829450739417,
381
+ "eval_flow_mse_loss": 1.0098259748926588,
382
+ "eval_loss": 2.946415506350766,
383
+ "eval_runtime": 67.7743,
384
+ "eval_samples_per_second": 147.549,
385
+ "eval_steps_per_second": 2.317,
386
+ "flow/cos_sim": 0.782017103426016,
387
+ "flow/improvement_ratio": 0.9939965146362402,
388
+ "flow/mag_ratio_mean": 0.7625711093283003,
389
+ "flow/mag_ratio_std": 0.09684707447411908,
390
+ "step": 10240
391
+ },
392
+ {
393
+ "epoch": 1.4562673734565905,
394
+ "grad_norm": 1.1153963804244995,
395
+ "learning_rate": 7.37946961193838e-05,
396
+ "loss": 2.948077917098999,
397
+ "step": 11264
398
+ },
399
+ {
400
+ "epoch": 1.4562673734565905,
401
+ "eval_bleu": 0.44608815072120206,
402
+ "eval_ce_clean_loss": 0.009407547670327554,
403
+ "eval_ce_pred_loss": 2.5671096637750126,
404
+ "eval_flow_cos_loss": 0.21284512444666237,
405
+ "eval_flow_mse_loss": 1.0222040597040942,
406
+ "eval_loss": 2.8817996310580307,
407
+ "flow/cos_sim": 0.7871549156061404,
408
+ "flow/improvement_ratio": 0.9953155885836121,
409
+ "flow/mag_ratio_mean": 0.767739696867147,
410
+ "flow/mag_ratio_std": 0.10089007670142848,
411
+ "step": 11264
412
+ },
413
+ {
414
+ "epoch": 1.4562673734565905,
415
+ "eval_bleu": 0.44608815072120206,
416
+ "eval_ce_clean_loss": 0.009407547670327554,
417
+ "eval_ce_pred_loss": 2.5671096637750126,
418
+ "eval_flow_cos_loss": 0.21284512444666237,
419
+ "eval_flow_mse_loss": 1.0222040597040942,
420
+ "eval_loss": 2.8817996310580307,
421
+ "eval_runtime": 69.2843,
422
+ "eval_samples_per_second": 144.333,
423
+ "eval_steps_per_second": 2.266,
424
+ "flow/cos_sim": 0.7871549156061404,
425
+ "flow/improvement_ratio": 0.9953155885836121,
426
+ "flow/mag_ratio_mean": 0.767739696867147,
427
+ "flow/mag_ratio_std": 0.10089007670142848,
428
+ "step": 11264
429
+ },
430
+ {
431
+ "epoch": 1.5886611933544508,
432
+ "grad_norm": 2.7540619373321533,
433
+ "learning_rate": 6.894239286709331e-05,
434
+ "loss": 2.921593427658081,
435
+ "step": 12288
436
+ },
437
+ {
438
+ "epoch": 1.5886611933544508,
439
+ "eval_bleu": 0.4600711751490207,
440
+ "eval_ce_clean_loss": 0.008017947172129135,
441
+ "eval_ce_pred_loss": 2.4964520749013137,
442
+ "eval_flow_cos_loss": 0.20653787796284742,
443
+ "eval_flow_mse_loss": 1.0247267002512694,
444
+ "eval_loss": 2.8318955564195183,
445
+ "flow/cos_sim": 0.7934621500361497,
446
+ "flow/improvement_ratio": 0.9952654098249545,
447
+ "flow/mag_ratio_mean": 0.7712421458997544,
448
+ "flow/mag_ratio_std": 0.10261066300663979,
449
+ "step": 12288
450
+ },
451
+ {
452
+ "epoch": 1.5886611933544508,
453
+ "eval_bleu": 0.4600711751490207,
454
+ "eval_ce_clean_loss": 0.008017947172129135,
455
+ "eval_ce_pred_loss": 2.4964520749013137,
456
+ "eval_flow_cos_loss": 0.20653787796284742,
457
+ "eval_flow_mse_loss": 1.0247267002512694,
458
+ "eval_loss": 2.8318955564195183,
459
+ "eval_runtime": 68.3644,
460
+ "eval_samples_per_second": 146.275,
461
+ "eval_steps_per_second": 2.297,
462
+ "flow/cos_sim": 0.7934621500361497,
463
+ "flow/improvement_ratio": 0.9952654098249545,
464
+ "flow/mag_ratio_mean": 0.7712421458997544,
465
+ "flow/mag_ratio_std": 0.10261066300663979,
466
+ "step": 12288
467
+ },
468
+ {
469
+ "epoch": 1.721055013252311,
470
+ "grad_norm": 1.0973964929580688,
471
+ "learning_rate": 6.386664071821704e-05,
472
+ "loss": 2.8836159706115723,
473
+ "step": 13312
474
+ },
475
+ {
476
+ "epoch": 1.721055013252311,
477
+ "eval_bleu": 0.45198542436217626,
478
+ "eval_ce_clean_loss": 0.00690584495562799,
479
+ "eval_ce_pred_loss": 2.526137225946803,
480
+ "eval_flow_cos_loss": 0.1992794095900408,
481
+ "eval_flow_mse_loss": 1.0171066932617479,
482
+ "eval_loss": 2.842128411979432,
483
+ "flow/cos_sim": 0.8007206206868409,
484
+ "flow/improvement_ratio": 0.9942179546234714,
485
+ "flow/mag_ratio_mean": 0.7813193399435395,
486
+ "flow/mag_ratio_std": 0.10161699169570473,
487
+ "step": 13312
488
+ },
489
+ {
490
+ "epoch": 1.721055013252311,
491
+ "eval_bleu": 0.45198542436217626,
492
+ "eval_ce_clean_loss": 0.00690584495562799,
493
+ "eval_ce_pred_loss": 2.526137225946803,
494
+ "eval_flow_cos_loss": 0.1992794095900408,
495
+ "eval_flow_mse_loss": 1.0171066932617479,
496
+ "eval_loss": 2.842128411979432,
497
+ "eval_runtime": 67.7624,
498
+ "eval_samples_per_second": 147.574,
499
+ "eval_steps_per_second": 2.317,
500
+ "flow/cos_sim": 0.8007206206868409,
501
+ "flow/improvement_ratio": 0.9942179546234714,
502
+ "flow/mag_ratio_mean": 0.7813193399435395,
503
+ "flow/mag_ratio_std": 0.10161699169570473,
504
+ "step": 13312
505
+ },
506
+ {
507
+ "epoch": 1.8534488331501713,
508
+ "grad_norm": 1.9011768102645874,
509
+ "learning_rate": 5.863069490629029e-05,
510
+ "loss": 2.8486995697021484,
511
+ "step": 14336
512
+ },
513
+ {
514
+ "epoch": 1.8534488331501713,
515
+ "eval_bleu": 0.4661235270091544,
516
+ "eval_ce_clean_loss": 0.0061916545386408355,
517
+ "eval_ce_pred_loss": 2.4525129415427043,
518
+ "eval_flow_cos_loss": 0.19385759503978073,
519
+ "eval_flow_mse_loss": 1.014179768835663,
520
+ "eval_loss": 2.785594868811832,
521
+ "flow/cos_sim": 0.8061424322948334,
522
+ "flow/improvement_ratio": 0.9953108915857448,
523
+ "flow/mag_ratio_mean": 0.7884186832768143,
524
+ "flow/mag_ratio_std": 0.10213647872968844,
525
+ "step": 14336
526
+ },
527
+ {
528
+ "epoch": 1.8534488331501713,
529
+ "eval_bleu": 0.4661235270091544,
530
+ "eval_ce_clean_loss": 0.0061916545386408355,
531
+ "eval_ce_pred_loss": 2.4525129415427043,
532
+ "eval_flow_cos_loss": 0.19385759503978073,
533
+ "eval_flow_mse_loss": 1.014179768835663,
534
+ "eval_loss": 2.785594868811832,
535
+ "eval_runtime": 66.8958,
536
+ "eval_samples_per_second": 149.486,
537
+ "eval_steps_per_second": 2.347,
538
+ "flow/cos_sim": 0.8061424322948334,
539
+ "flow/improvement_ratio": 0.9953108915857448,
540
+ "flow/mag_ratio_mean": 0.7884186832768143,
541
+ "flow/mag_ratio_std": 0.10213647872968844,
542
+ "step": 14336
543
+ },
544
+ {
545
+ "epoch": 1.9858426530480315,
546
+ "grad_norm": 1.449341893196106,
547
+ "learning_rate": 5.330028268696606e-05,
548
+ "loss": 2.8369064331054688,
549
+ "step": 15360
550
+ },
551
+ {
552
+ "epoch": 1.9858426530480315,
553
+ "eval_bleu": 0.47279436776678946,
554
+ "eval_ce_clean_loss": 0.005674782390024062,
555
+ "eval_ce_pred_loss": 2.4236644658313433,
556
+ "eval_flow_cos_loss": 0.18836158210304893,
557
+ "eval_flow_mse_loss": 1.0086946066018123,
558
+ "eval_loss": 2.7580248884334684,
559
+ "flow/cos_sim": 0.8116384444722704,
560
+ "flow/improvement_ratio": 0.9943355352255949,
561
+ "flow/mag_ratio_mean": 0.7959212903763838,
562
+ "flow/mag_ratio_std": 0.10561748859798832,
563
+ "step": 15360
564
+ },
565
+ {
566
+ "epoch": 1.9858426530480315,
567
+ "eval_bleu": 0.47279436776678946,
568
+ "eval_ce_clean_loss": 0.005674782390024062,
569
+ "eval_ce_pred_loss": 2.4236644658313433,
570
+ "eval_flow_cos_loss": 0.18836158210304893,
571
+ "eval_flow_mse_loss": 1.0086946066018123,
572
+ "eval_loss": 2.7580248884334684,
573
+ "eval_runtime": 67.2307,
574
+ "eval_samples_per_second": 148.742,
575
+ "eval_steps_per_second": 2.335,
576
+ "flow/cos_sim": 0.8116384444722704,
577
+ "flow/improvement_ratio": 0.9943355352255949,
578
+ "flow/mag_ratio_mean": 0.7959212903763838,
579
+ "flow/mag_ratio_std": 0.10561748859798832,
580
+ "step": 15360
581
+ },
582
+ {
583
+ "epoch": 2.11817182752602,
584
+ "grad_norm": 3.0230202674865723,
585
+ "learning_rate": 4.792657229246779e-05,
586
+ "loss": 2.816126585006714,
587
+ "step": 16384
588
+ },
589
+ {
590
+ "epoch": 2.11817182752602,
591
+ "eval_bleu": 0.47827420818893907,
592
+ "eval_ce_clean_loss": 0.005247315107162591,
593
+ "eval_ce_pred_loss": 2.396967076951531,
594
+ "eval_flow_cos_loss": 0.18433132226679735,
595
+ "eval_flow_mse_loss": 1.0063348123981695,
596
+ "eval_loss": 2.735541887344069,
597
+ "flow/cos_sim": 0.8156687302194583,
598
+ "flow/improvement_ratio": 0.9953372854336052,
599
+ "flow/mag_ratio_mean": 0.7989175444955279,
600
+ "flow/mag_ratio_std": 0.10495429419598003,
601
+ "step": 16384
602
+ },
603
+ {
604
+ "epoch": 2.11817182752602,
605
+ "eval_bleu": 0.47827420818893907,
606
+ "eval_ce_clean_loss": 0.005247315107162591,
607
+ "eval_ce_pred_loss": 2.396967076951531,
608
+ "eval_flow_cos_loss": 0.18433132226679735,
609
+ "eval_flow_mse_loss": 1.0063348123981695,
610
+ "eval_loss": 2.735541887344069,
611
+ "eval_runtime": 67.2185,
612
+ "eval_samples_per_second": 148.769,
613
+ "eval_steps_per_second": 2.336,
614
+ "flow/cos_sim": 0.8156687302194583,
615
+ "flow/improvement_ratio": 0.9953372854336052,
616
+ "flow/mag_ratio_mean": 0.7989175444955279,
617
+ "flow/mag_ratio_std": 0.10495429419598003,
618
+ "step": 16384
619
+ },
620
+ {
621
+ "epoch": 2.25056564742388,
622
+ "grad_norm": 0.8502700328826904,
623
+ "learning_rate": 4.2582007608037904e-05,
624
+ "loss": 2.789738416671753,
625
+ "step": 17408
626
+ },
627
+ {
628
+ "epoch": 2.25056564742388,
629
+ "eval_bleu": 0.4833431423882271,
630
+ "eval_ce_clean_loss": 0.004915411782059463,
631
+ "eval_ce_pred_loss": 2.399613708447499,
632
+ "eval_flow_cos_loss": 0.18183312740675203,
633
+ "eval_flow_mse_loss": 1.0084371517418296,
634
+ "eval_loss": 2.738540423903496,
635
+ "flow/cos_sim": 0.8181668907214122,
636
+ "flow/improvement_ratio": 0.9949730706822341,
637
+ "flow/mag_ratio_mean": 0.8024839326074928,
638
+ "flow/mag_ratio_std": 0.10586057484719404,
639
+ "step": 17408
640
+ },
641
+ {
642
+ "epoch": 2.25056564742388,
643
+ "eval_bleu": 0.4833431423882271,
644
+ "eval_ce_clean_loss": 0.004915411782059463,
645
+ "eval_ce_pred_loss": 2.399613708447499,
646
+ "eval_flow_cos_loss": 0.18183312740675203,
647
+ "eval_flow_mse_loss": 1.0084371517418296,
648
+ "eval_loss": 2.738540423903496,
649
+ "eval_runtime": 68.0506,
650
+ "eval_samples_per_second": 146.95,
651
+ "eval_steps_per_second": 2.307,
652
+ "flow/cos_sim": 0.8181668907214122,
653
+ "flow/improvement_ratio": 0.9949730706822341,
654
+ "flow/mag_ratio_mean": 0.8024839326074928,
655
+ "flow/mag_ratio_std": 0.10586057484719404,
656
+ "step": 17408
657
+ },
658
+ {
659
+ "epoch": 2.38295946732174,
660
+ "grad_norm": 1.0941267013549805,
661
+ "learning_rate": 3.731789271967459e-05,
662
+ "loss": 2.771674871444702,
663
+ "step": 18432
664
+ },
665
+ {
666
+ "epoch": 2.38295946732174,
667
+ "eval_bleu": 0.4920872409706913,
668
+ "eval_ce_clean_loss": 0.004664461751272724,
669
+ "eval_ce_pred_loss": 2.364128369434624,
670
+ "eval_flow_cos_loss": 0.1794095636363242,
671
+ "eval_flow_mse_loss": 1.0100623141428469,
672
+ "eval_loss": 2.714469000032753,
673
+ "flow/cos_sim": 0.820590474802977,
674
+ "flow/improvement_ratio": 0.9942654329500381,
675
+ "flow/mag_ratio_mean": 0.8044404573501296,
676
+ "flow/mag_ratio_std": 0.1081070894743227,
677
+ "step": 18432
678
+ },
679
+ {
680
+ "epoch": 2.38295946732174,
681
+ "eval_bleu": 0.4920872409706913,
682
+ "eval_ce_clean_loss": 0.004664461751272724,
683
+ "eval_ce_pred_loss": 2.364128369434624,
684
+ "eval_flow_cos_loss": 0.1794095636363242,
685
+ "eval_flow_mse_loss": 1.0100623141428469,
686
+ "eval_loss": 2.714469000032753,
687
+ "eval_runtime": 67.3234,
688
+ "eval_samples_per_second": 148.537,
689
+ "eval_steps_per_second": 2.332,
690
+ "flow/cos_sim": 0.820590474802977,
691
+ "flow/improvement_ratio": 0.9942654329500381,
692
+ "flow/mag_ratio_mean": 0.8044404573501296,
693
+ "flow/mag_ratio_std": 0.1081070894743227,
694
+ "step": 18432
695
+ },
696
+ {
697
+ "epoch": 2.5153532872196003,
698
+ "grad_norm": 1.274901032447815,
699
+ "learning_rate": 3.22002872319454e-05,
700
+ "loss": 2.7526628971099854,
701
+ "step": 19456
702
+ },
703
+ {
704
+ "epoch": 2.5153532872196003,
705
+ "eval_bleu": 0.49953457802460494,
706
+ "eval_ce_clean_loss": 0.004459979124574857,
707
+ "eval_ce_pred_loss": 2.326884046481673,
708
+ "eval_flow_cos_loss": 0.17535958984854874,
709
+ "eval_flow_mse_loss": 1.0013958295439458,
710
+ "eval_loss": 2.6785145139997932,
711
+ "flow/cos_sim": 0.824640438435184,
712
+ "flow/improvement_ratio": 0.9943981740125425,
713
+ "flow/mag_ratio_mean": 0.8087160583514317,
714
+ "flow/mag_ratio_std": 0.10657752490347358,
715
+ "step": 19456
716
+ },
717
+ {
718
+ "epoch": 2.5153532872196003,
719
+ "eval_bleu": 0.49953457802460494,
720
+ "eval_ce_clean_loss": 0.004459979124574857,
721
+ "eval_ce_pred_loss": 2.326884046481673,
722
+ "eval_flow_cos_loss": 0.17535958984854874,
723
+ "eval_flow_mse_loss": 1.0013958295439458,
724
+ "eval_loss": 2.6785145139997932,
725
+ "eval_runtime": 68.0305,
726
+ "eval_samples_per_second": 146.993,
727
+ "eval_steps_per_second": 2.308,
728
+ "flow/cos_sim": 0.824640438435184,
729
+ "flow/improvement_ratio": 0.9943981740125425,
730
+ "flow/mag_ratio_mean": 0.8087160583514317,
731
+ "flow/mag_ratio_std": 0.10657752490347358,
732
+ "step": 19456
733
+ },
734
+ {
735
+ "epoch": 2.6477471071174605,
736
+ "grad_norm": 1.057073950767517,
737
+ "learning_rate": 2.7288312022486472e-05,
738
+ "loss": 2.74858021736145,
739
+ "step": 20480
740
+ },
741
+ {
742
+ "epoch": 2.6477471071174605,
743
+ "eval_bleu": 0.49105986214857095,
744
+ "eval_ce_clean_loss": 0.004321782757783202,
745
+ "eval_ce_pred_loss": 2.3865722205228868,
746
+ "eval_flow_cos_loss": 0.17461080697311718,
747
+ "eval_flow_mse_loss": 1.0059761883346898,
748
+ "eval_loss": 2.724551202385289,
749
+ "flow/cos_sim": 0.8253892109652233,
750
+ "flow/improvement_ratio": 0.994614190736394,
751
+ "flow/mag_ratio_mean": 0.8106758245237314,
752
+ "flow/mag_ratio_std": 0.10591729089712641,
753
+ "step": 20480
754
+ },
755
+ {
756
+ "epoch": 2.6477471071174605,
757
+ "eval_bleu": 0.49105986214857095,
758
+ "eval_ce_clean_loss": 0.004321782757783202,
759
+ "eval_ce_pred_loss": 2.3865722205228868,
760
+ "eval_flow_cos_loss": 0.17461080697311718,
761
+ "eval_flow_mse_loss": 1.0059761883346898,
762
+ "eval_loss": 2.724551202385289,
763
+ "eval_runtime": 65.1786,
764
+ "eval_samples_per_second": 153.425,
765
+ "eval_steps_per_second": 2.409,
766
+ "flow/cos_sim": 0.8253892109652233,
767
+ "flow/improvement_ratio": 0.994614190736394,
768
+ "flow/mag_ratio_mean": 0.8106758245237314,
769
+ "flow/mag_ratio_std": 0.10591729089712641,
770
+ "step": 20480
771
+ },
772
+ {
773
+ "epoch": 2.780140927015321,
774
+ "grad_norm": 0.9793355464935303,
775
+ "learning_rate": 2.264310733522274e-05,
776
+ "loss": 2.733316659927368,
777
+ "step": 21504
778
+ },
779
+ {
780
+ "epoch": 2.780140927015321,
781
+ "eval_bleu": 0.49476989559226753,
782
+ "eval_ce_clean_loss": 0.004185588430512435,
783
+ "eval_ce_pred_loss": 2.348869823346472,
784
+ "eval_flow_cos_loss": 0.1727850574786496,
785
+ "eval_flow_mse_loss": 1.0050820533637028,
786
+ "eval_loss": 2.6966727615162065,
787
+ "flow/cos_sim": 0.827214957802159,
788
+ "flow/improvement_ratio": 0.9955724037376938,
789
+ "flow/mag_ratio_mean": 0.8125384882756859,
790
+ "flow/mag_ratio_std": 0.10621313242965442,
791
+ "step": 21504
792
+ },
793
+ {
794
+ "epoch": 2.780140927015321,
795
+ "eval_bleu": 0.49476989559226753,
796
+ "eval_ce_clean_loss": 0.004185588430512435,
797
+ "eval_ce_pred_loss": 2.348869823346472,
798
+ "eval_flow_cos_loss": 0.1727850574786496,
799
+ "eval_flow_mse_loss": 1.0050820533637028,
800
+ "eval_loss": 2.6966727615162065,
801
+ "eval_runtime": 68.9998,
802
+ "eval_samples_per_second": 144.928,
803
+ "eval_steps_per_second": 2.275,
804
+ "flow/cos_sim": 0.827214957802159,
805
+ "flow/improvement_ratio": 0.9955724037376938,
806
+ "flow/mag_ratio_mean": 0.8125384882756859,
807
+ "flow/mag_ratio_std": 0.10621313242965442,
808
+ "step": 21504
809
+ },
810
+ {
811
+ "epoch": 2.912534746913181,
812
+ "grad_norm": 1.0018037557601929,
813
+ "learning_rate": 1.83092638889173e-05,
814
+ "loss": 2.7381436824798584,
815
+ "step": 22528
816
+ },
817
+ {
818
+ "epoch": 2.912534746913181,
819
+ "eval_bleu": 0.5070712629503589,
820
+ "eval_ce_clean_loss": 0.004101792653684451,
821
+ "eval_ce_pred_loss": 2.294093737177029,
822
+ "eval_flow_cos_loss": 0.16953522318108066,
823
+ "eval_flow_mse_loss": 0.9949391457685239,
824
+ "eval_loss": 2.6472903338207563,
825
+ "flow/cos_sim": 0.8304647761545364,
826
+ "flow/improvement_ratio": 0.9939543570682501,
827
+ "flow/mag_ratio_mean": 0.8164081360883774,
828
+ "flow/mag_ratio_std": 0.10730971989168483,
829
+ "step": 22528
830
+ },
831
+ {
832
+ "epoch": 2.912534746913181,
833
+ "eval_bleu": 0.5070712629503589,
834
+ "eval_ce_clean_loss": 0.004101792653684451,
835
+ "eval_ce_pred_loss": 2.294093737177029,
836
+ "eval_flow_cos_loss": 0.16953522318108066,
837
+ "eval_flow_mse_loss": 0.9949391457685239,
838
+ "eval_loss": 2.6472903338207563,
839
+ "eval_runtime": 67.2838,
840
+ "eval_samples_per_second": 148.624,
841
+ "eval_steps_per_second": 2.333,
842
+ "flow/cos_sim": 0.8304647761545364,
843
+ "flow/improvement_ratio": 0.9939543570682501,
844
+ "flow/mag_ratio_mean": 0.8164081360883774,
845
+ "flow/mag_ratio_std": 0.10730971989168483,
846
+ "step": 22528
847
+ },
848
+ {
849
+ "epoch": 3.0448639213911695,
850
+ "grad_norm": 1.7332323789596558,
851
+ "learning_rate": 1.4345206949212337e-05,
852
+ "loss": 2.7229583263397217,
853
+ "step": 23552
854
+ },
855
+ {
856
+ "epoch": 3.0448639213911695,
857
+ "eval_bleu": 0.502826836403047,
858
+ "eval_ce_clean_loss": 0.0040093657700048324,
859
+ "eval_ce_pred_loss": 2.2946755681068276,
860
+ "eval_flow_cos_loss": 0.16862890978527675,
861
+ "eval_flow_mse_loss": 0.9954340754041247,
862
+ "eval_loss": 2.647873554260108,
863
+ "flow/cos_sim": 0.8313710966687293,
864
+ "flow/improvement_ratio": 0.9944029165681001,
865
+ "flow/mag_ratio_mean": 0.8175566006617941,
866
+ "flow/mag_ratio_std": 0.10629053266754576,
867
+ "step": 23552
868
+ },
869
+ {
870
+ "epoch": 3.0448639213911695,
871
+ "eval_bleu": 0.502826836403047,
872
+ "eval_ce_clean_loss": 0.0040093657700048324,
873
+ "eval_ce_pred_loss": 2.2946755681068276,
874
+ "eval_flow_cos_loss": 0.16862890978527675,
875
+ "eval_flow_mse_loss": 0.9954340754041247,
876
+ "eval_loss": 2.647873554260108,
877
+ "eval_runtime": 66.9795,
878
+ "eval_samples_per_second": 149.299,
879
+ "eval_steps_per_second": 2.344,
880
+ "flow/cos_sim": 0.8313710966687293,
881
+ "flow/improvement_ratio": 0.9944029165681001,
882
+ "flow/mag_ratio_mean": 0.8175566006617941,
883
+ "flow/mag_ratio_std": 0.10629053266754576,
884
+ "step": 23552
885
+ },
886
+ {
887
+ "epoch": 3.1772577412890297,
888
+ "grad_norm": 1.0988035202026367,
889
+ "learning_rate": 1.078898869250472e-05,
890
+ "loss": 2.712104320526123,
891
+ "step": 24576
892
+ },
893
+ {
894
+ "epoch": 3.1772577412890297,
895
+ "eval_bleu": 0.5035843636945546,
896
+ "eval_ce_clean_loss": 0.003988856448981745,
897
+ "eval_ce_pred_loss": 2.3216886368526777,
898
+ "eval_flow_cos_loss": 0.16886111970540066,
899
+ "eval_flow_mse_loss": 1.0004003287120988,
900
+ "eval_loss": 2.6717864836856817,
901
+ "flow/cos_sim": 0.8311388864638699,
902
+ "flow/improvement_ratio": 0.9948292883338442,
903
+ "flow/mag_ratio_mean": 0.8160582591014304,
904
+ "flow/mag_ratio_std": 0.10606027835873282,
905
+ "step": 24576
906
+ },
907
+ {
908
+ "epoch": 3.1772577412890297,
909
+ "eval_bleu": 0.5035843636945546,
910
+ "eval_ce_clean_loss": 0.003988856448981745,
911
+ "eval_ce_pred_loss": 2.3216886368526777,
912
+ "eval_flow_cos_loss": 0.16886111970540066,
913
+ "eval_flow_mse_loss": 1.0004003287120988,
914
+ "eval_loss": 2.6717864836856817,
915
+ "eval_runtime": 69.9629,
916
+ "eval_samples_per_second": 142.933,
917
+ "eval_steps_per_second": 2.244,
918
+ "flow/cos_sim": 0.8311388864638699,
919
+ "flow/improvement_ratio": 0.9948292883338442,
920
+ "flow/mag_ratio_mean": 0.8160582591014304,
921
+ "flow/mag_ratio_std": 0.10606027835873282,
922
+ "step": 24576
923
+ },
924
+ {
925
+ "epoch": 3.30965156118689,
926
+ "grad_norm": 1.0395138263702393,
927
+ "learning_rate": 7.691348992324593e-06,
928
+ "loss": 2.704360008239746,
929
+ "step": 25600
930
+ },
931
+ {
932
+ "epoch": 3.30965156118689,
933
+ "eval_bleu": 0.5039198733858501,
934
+ "eval_ce_clean_loss": 0.003921256066228791,
935
+ "eval_ce_pred_loss": 2.2708368779747348,
936
+ "eval_flow_cos_loss": 0.16833358689857897,
937
+ "eval_flow_mse_loss": 1.000913859932286,
938
+ "eval_loss": 2.636504320582007,
939
+ "flow/cos_sim": 0.831666430090643,
940
+ "flow/improvement_ratio": 0.9942229659694015,
941
+ "flow/mag_ratio_mean": 0.8166019050938309,
942
+ "flow/mag_ratio_std": 0.10622118669710341,
943
+ "step": 25600
944
+ },
945
+ {
946
+ "epoch": 3.30965156118689,
947
+ "eval_bleu": 0.5039198733858501,
948
+ "eval_ce_clean_loss": 0.003921256066228791,
949
+ "eval_ce_pred_loss": 2.2708368779747348,
950
+ "eval_flow_cos_loss": 0.16833358689857897,
951
+ "eval_flow_mse_loss": 1.000913859932286,
952
+ "eval_loss": 2.636504320582007,
953
+ "eval_runtime": 68.2119,
954
+ "eval_samples_per_second": 146.602,
955
+ "eval_steps_per_second": 2.302,
956
+ "flow/cos_sim": 0.831666430090643,
957
+ "flow/improvement_ratio": 0.9942229659694015,
958
+ "flow/mag_ratio_mean": 0.8166019050938309,
959
+ "flow/mag_ratio_std": 0.10622118669710341,
960
+ "step": 25600
961
+ },
962
+ {
963
+ "epoch": 3.44204538108475,
964
+ "grad_norm": 0.6483823657035828,
965
+ "learning_rate": 5.075961047606137e-06,
966
+ "loss": 2.7121200561523438,
967
+ "step": 26624
968
+ },
969
+ {
970
+ "epoch": 3.44204538108475,
971
+ "eval_bleu": 0.49887898452036056,
972
+ "eval_ce_clean_loss": 0.0038942673916863217,
973
+ "eval_ce_pred_loss": 2.3265422575033394,
974
+ "eval_flow_cos_loss": 0.16846198555390546,
975
+ "eval_flow_mse_loss": 1.0037431238563197,
976
+ "eval_loss": 2.6783324229489467,
977
+ "flow/cos_sim": 0.8315380261202526,
978
+ "flow/improvement_ratio": 0.993960594295696,
979
+ "flow/mag_ratio_mean": 0.8175518026777134,
980
+ "flow/mag_ratio_std": 0.10804502756162813,
981
+ "step": 26624
982
+ },
983
+ {
984
+ "epoch": 3.44204538108475,
985
+ "eval_bleu": 0.49887898452036056,
986
+ "eval_ce_clean_loss": 0.0038942673916863217,
987
+ "eval_ce_pred_loss": 2.3265422575033394,
988
+ "eval_flow_cos_loss": 0.16846198555390546,
989
+ "eval_flow_mse_loss": 1.0037431238563197,
990
+ "eval_loss": 2.6783324229489467,
991
+ "eval_runtime": 67.3034,
992
+ "eval_samples_per_second": 148.581,
993
+ "eval_steps_per_second": 2.333,
994
+ "flow/cos_sim": 0.8315380261202526,
995
+ "flow/improvement_ratio": 0.993960594295696,
996
+ "flow/mag_ratio_mean": 0.8175518026777134,
997
+ "flow/mag_ratio_std": 0.10804502756162813,
998
+ "step": 26624
999
+ },
1000
+ {
1001
+ "epoch": 3.5744392009826105,
1002
+ "grad_norm": 1.0585558414459229,
1003
+ "learning_rate": 2.979555782618604e-06,
1004
+ "loss": 2.700247049331665,
1005
+ "step": 27648
1006
+ },
1007
+ {
1008
+ "epoch": 3.5744392009826105,
1009
+ "eval_bleu": 0.49793104088984275,
1010
+ "eval_ce_clean_loss": 0.0038738787963179646,
1011
+ "eval_ce_pred_loss": 2.330296530845059,
1012
+ "eval_flow_cos_loss": 0.16851506719164028,
1013
+ "eval_flow_mse_loss": 1.005235201234271,
1014
+ "eval_loss": 2.6824453955243346,
1015
+ "flow/cos_sim": 0.831484940401308,
1016
+ "flow/improvement_ratio": 0.9950371783250457,
1017
+ "flow/mag_ratio_mean": 0.817362904548645,
1018
+ "flow/mag_ratio_std": 0.10697599723460569,
1019
+ "step": 27648
1020
+ },
1021
+ {
1022
+ "epoch": 3.5744392009826105,
1023
+ "eval_bleu": 0.49793104088984275,
1024
+ "eval_ce_clean_loss": 0.0038738787963179646,
1025
+ "eval_ce_pred_loss": 2.330296530845059,
1026
+ "eval_flow_cos_loss": 0.16851506719164028,
1027
+ "eval_flow_mse_loss": 1.005235201234271,
1028
+ "eval_loss": 2.6824453955243346,
1029
+ "eval_runtime": 66.0461,
1030
+ "eval_samples_per_second": 151.409,
1031
+ "eval_steps_per_second": 2.377,
1032
+ "flow/cos_sim": 0.831484940401308,
1033
+ "flow/improvement_ratio": 0.9950371783250457,
1034
+ "flow/mag_ratio_mean": 0.817362904548645,
1035
+ "flow/mag_ratio_std": 0.10697599723460569,
1036
+ "step": 27648
1037
+ },
1038
+ {
1039
+ "epoch": 3.7068330208804707,
1040
+ "grad_norm": 3.095872640609741,
1041
+ "learning_rate": 1.4263518127758779e-06,
1042
+ "loss": 2.7184364795684814,
1043
+ "step": 28672
1044
+ },
1045
+ {
1046
+ "epoch": 3.7068330208804707,
1047
+ "eval_bleu": 0.5068570358746337,
1048
+ "eval_ce_clean_loss": 0.0038693781436426908,
1049
+ "eval_ce_pred_loss": 2.279218629666954,
1050
+ "eval_flow_cos_loss": 0.16596888252504313,
1051
+ "eval_flow_mse_loss": 0.9924390498240283,
1052
+ "eval_loss": 2.633253667005308,
1053
+ "flow/cos_sim": 0.8340311475620148,
1054
+ "flow/improvement_ratio": 0.9944260948023219,
1055
+ "flow/mag_ratio_mean": 0.8203621438354444,
1056
+ "flow/mag_ratio_std": 0.10705859142883568,
1057
+ "step": 28672
1058
+ },
1059
+ {
1060
+ "epoch": 3.7068330208804707,
1061
+ "eval_bleu": 0.5068570358746337,
1062
+ "eval_ce_clean_loss": 0.0038693781436426908,
1063
+ "eval_ce_pred_loss": 2.279218629666954,
1064
+ "eval_flow_cos_loss": 0.16596888252504313,
1065
+ "eval_flow_mse_loss": 0.9924390498240283,
1066
+ "eval_loss": 2.633253667005308,
1067
+ "eval_runtime": 66.8771,
1068
+ "eval_samples_per_second": 149.528,
1069
+ "eval_steps_per_second": 2.348,
1070
+ "flow/cos_sim": 0.8340311475620148,
1071
+ "flow/improvement_ratio": 0.9944260948023219,
1072
+ "flow/mag_ratio_mean": 0.8203621438354444,
1073
+ "flow/mag_ratio_std": 0.10705859142883568,
1074
+ "step": 28672
1075
+ },
1076
+ {
1077
+ "epoch": 3.839226840778331,
1078
+ "grad_norm": 0.927253782749176,
1079
+ "learning_rate": 4.34983267029615e-07,
1080
+ "loss": 2.713036060333252,
1081
+ "step": 29696
1082
+ },
1083
+ {
1084
+ "epoch": 3.839226840778331,
1085
+ "eval_bleu": 0.5035067453298858,
1086
+ "eval_ce_clean_loss": 0.003862753908660049,
1087
+ "eval_ce_pred_loss": 2.29665062685681,
1088
+ "eval_flow_cos_loss": 0.16619450233544514,
1089
+ "eval_flow_mse_loss": 0.9944761357489665,
1090
+ "eval_loss": 2.647542930712366,
1091
+ "flow/cos_sim": 0.8338055117115094,
1092
+ "flow/improvement_ratio": 0.9944964187919714,
1093
+ "flow/mag_ratio_mean": 0.8199834428775082,
1094
+ "flow/mag_ratio_std": 0.1067680497268203,
1095
+ "step": 29696
1096
+ },
1097
+ {
1098
+ "epoch": 3.839226840778331,
1099
+ "eval_bleu": 0.5035067453298858,
1100
+ "eval_ce_clean_loss": 0.003862753908660049,
1101
+ "eval_ce_pred_loss": 2.29665062685681,
1102
+ "eval_flow_cos_loss": 0.16619450233544514,
1103
+ "eval_flow_mse_loss": 0.9944761357489665,
1104
+ "eval_loss": 2.647542930712366,
1105
+ "eval_runtime": 66.9649,
1106
+ "eval_samples_per_second": 149.332,
1107
+ "eval_steps_per_second": 2.345,
1108
+ "flow/cos_sim": 0.8338055117115094,
1109
+ "flow/improvement_ratio": 0.9944964187919714,
1110
+ "flow/mag_ratio_mean": 0.8199834428775082,
1111
+ "flow/mag_ratio_std": 0.1067680497268203,
1112
+ "step": 29696
1113
+ },
1114
+ {
1115
+ "epoch": 3.9716206606761912,
1116
+ "grad_norm": 1.7707074880599976,
1117
+ "learning_rate": 1.5095328409425736e-08,
1118
+ "loss": 2.715806484222412,
1119
+ "step": 30720
1120
+ },
1121
+ {
1122
+ "epoch": 3.9716206606761912,
1123
+ "eval_bleu": 0.5012037478282969,
1124
+ "eval_ce_clean_loss": 0.003861092226016246,
1125
+ "eval_ce_pred_loss": 2.3029726341271854,
1126
+ "eval_flow_cos_loss": 0.1667034083111271,
1127
+ "eval_flow_mse_loss": 0.9966826051663441,
1128
+ "eval_loss": 2.6543003814235613,
1129
+ "flow/cos_sim": 0.8332966137084232,
1130
+ "flow/improvement_ratio": 0.9941484476350675,
1131
+ "flow/mag_ratio_mean": 0.8199063995081908,
1132
+ "flow/mag_ratio_std": 0.10666013931392863,
1133
+ "step": 30720
1134
+ },
1135
+ {
1136
+ "epoch": 3.9716206606761912,
1137
+ "eval_bleu": 0.5012037478282969,
1138
+ "eval_ce_clean_loss": 0.003861092226016246,
1139
+ "eval_ce_pred_loss": 2.3029726341271854,
1140
+ "eval_flow_cos_loss": 0.1667034083111271,
1141
+ "eval_flow_mse_loss": 0.9966826051663441,
1142
+ "eval_loss": 2.6543003814235613,
1143
+ "eval_runtime": 66.7884,
1144
+ "eval_samples_per_second": 149.727,
1145
+ "eval_steps_per_second": 2.351,
1146
+ "flow/cos_sim": 0.8332966137084232,
1147
+ "flow/improvement_ratio": 0.9941484476350675,
1148
+ "flow/mag_ratio_mean": 0.8199063995081908,
1149
+ "flow/mag_ratio_std": 0.10666013931392863,
1150
+ "step": 30720
1151
+ }
1152
+ ],
1153
+ "logging_steps": 1024,
1154
+ "max_steps": 30940,
1155
+ "num_input_tokens_seen": 0,
1156
+ "num_train_epochs": 4,
1157
+ "save_steps": 1024,
1158
+ "stateful_callbacks": {
1159
+ "TrainerControl": {
1160
+ "args": {
1161
+ "should_epoch_stop": false,
1162
+ "should_evaluate": false,
1163
+ "should_log": false,
1164
+ "should_save": true,
1165
+ "should_training_stop": true
1166
+ },
1167
+ "attributes": {}
1168
+ }
1169
+ },
1170
+ "total_flos": 0.0,
1171
+ "train_batch_size": 64,
1172
+ "trial_name": null,
1173
+ "trial_params": null
1174
+ }
checkpoints-v5.5/checkpoint-30940/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7be4d80b8499f3b5f618b042dcec062719328222caddac0d4e4ce11d371480d
3
+ size 5137