Attila1011 commited on
Commit
778c66e
·
verified ·
1 Parent(s): 254dbcb

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -56,3 +56,4 @@ checkpoints-v4.6/checkpoint-13312/eval_state.json filter=lfs diff=lfs merge=lfs
56
  checkpoints-v5.11-b/checkpoint-9216/eval_state.json filter=lfs diff=lfs merge=lfs -text
57
  checkpoints-v5.11-c/checkpoint-4096/eval_state.json filter=lfs diff=lfs merge=lfs -text
58
  checkpoints-v5.11-c/checkpoint-10240/eval_state.json filter=lfs diff=lfs merge=lfs -text
 
 
56
  checkpoints-v5.11-b/checkpoint-9216/eval_state.json filter=lfs diff=lfs merge=lfs -text
57
  checkpoints-v5.11-c/checkpoint-4096/eval_state.json filter=lfs diff=lfs merge=lfs -text
58
  checkpoints-v5.11-c/checkpoint-10240/eval_state.json filter=lfs diff=lfs merge=lfs -text
59
+ checkpoints-v5.12/checkpoint-11264/eval_state.json filter=lfs diff=lfs merge=lfs -text
checkpoints-v5.12/checkpoint-11264/ema.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49447921d53c4db4a7732879b8ed6ef41aaa5f5dcd5936b849b445ed88234ab4
3
+ size 55150648
checkpoints-v5.12/checkpoint-11264/eval_state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:faddd8d911a462f35f87226f011181597a657945a211544fe4a9ce4aee931234
3
+ size 60242873
checkpoints-v5.12/checkpoint-11264/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:395e2cbb78da104b0bc2dede3c66700b354d73a083dda41b441074e862954551
3
+ size 55150680
checkpoints-v5.12/checkpoint-11264/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00ef4f91f0a8c658831caf28f1a43bf89c2e049d126ecfaaca58e15fd1f99860
3
+ size 77725643
checkpoints-v5.12/checkpoint-11264/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50310c5f8d737208cf5a23571aa56c041c1d6c581482c803aaf1740270054c7b
3
+ size 14645
checkpoints-v5.12/checkpoint-11264/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ada79ea6c7ac33c12c71bea590d5ea2140ee809fed092f4670f19668dcbcc9d1
3
+ size 1383
checkpoints-v5.12/checkpoint-11264/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21c700f8c7da85aa9d4c7cb7c4f2fe1f5cc1460165fd5b9ff6c072c94729a07e
3
+ size 1465
checkpoints-v5.12/checkpoint-11264/trainer_state.json ADDED
@@ -0,0 +1,452 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.4854440062921542,
6
+ "eval_steps": 1024,
7
+ "global_step": 11264,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.04413127329928675,
14
+ "grad_norm": 1.2638587951660156,
15
+ "learning_rate": 9.990234375e-05,
16
+ "loss": 10.142199516296387,
17
+ "step": 1024
18
+ },
19
+ {
20
+ "epoch": 0.04413127329928675,
21
+ "eval_bleu": 0.058932490136422516,
22
+ "eval_ce_clean_loss": 3.8288235781289366,
23
+ "eval_ce_pred_loss": 5.581323942904279,
24
+ "eval_flow_consistency_loss": 0.19790819223755712,
25
+ "eval_flow_mse_loss": 0.9903420607672572,
26
+ "eval_loss": 7.053502456974119,
27
+ "flow/cos_sim": 0.6025428896519676,
28
+ "flow/improvement_ratio": 0.9951722414762989,
29
+ "flow/mag_ratio_mean": 0.5899381368144997,
30
+ "flow/mag_ratio_std": 0.07305362846043065,
31
+ "step": 1024
32
+ },
33
+ {
34
+ "epoch": 0.04413127329928675,
35
+ "eval_bleu": 0.058932490136422516,
36
+ "eval_ce_clean_loss": 3.8288235781289366,
37
+ "eval_ce_pred_loss": 5.581323942904279,
38
+ "eval_flow_consistency_loss": 0.19790819223755712,
39
+ "eval_flow_mse_loss": 0.9903420607672572,
40
+ "eval_loss": 7.053502456974119,
41
+ "eval_runtime": 210.5321,
42
+ "eval_samples_per_second": 142.496,
43
+ "eval_steps_per_second": 2.228,
44
+ "flow/cos_sim": 0.6025428896519676,
45
+ "flow/improvement_ratio": 0.9951722414762989,
46
+ "flow/mag_ratio_mean": 0.5899381368144997,
47
+ "flow/mag_ratio_std": 0.07305362846043065,
48
+ "step": 1024
49
+ },
50
+ {
51
+ "epoch": 0.0882625465985735,
52
+ "grad_norm": 2.830021619796753,
53
+ "learning_rate": 9.9476028157316e-05,
54
+ "loss": 5.8488240242004395,
55
+ "step": 2048
56
+ },
57
+ {
58
+ "epoch": 0.0882625465985735,
59
+ "eval_bleu": 0.25233143434843397,
60
+ "eval_ce_clean_loss": 1.226465684010276,
61
+ "eval_ce_pred_loss": 3.66637077921235,
62
+ "eval_flow_consistency_loss": 0.19513546225867037,
63
+ "eval_flow_mse_loss": 1.1100789928741293,
64
+ "eval_loss": 4.996664083842784,
65
+ "flow/cos_sim": 0.5829344555767361,
66
+ "flow/improvement_ratio": 0.9943469047292209,
67
+ "flow/mag_ratio_mean": 0.5073343900475167,
68
+ "flow/mag_ratio_std": 0.06693376618217049,
69
+ "step": 2048
70
+ },
71
+ {
72
+ "epoch": 0.0882625465985735,
73
+ "eval_bleu": 0.25233143434843397,
74
+ "eval_ce_clean_loss": 1.226465684010276,
75
+ "eval_ce_pred_loss": 3.66637077921235,
76
+ "eval_flow_consistency_loss": 0.19513546225867037,
77
+ "eval_flow_mse_loss": 1.1100789928741293,
78
+ "eval_loss": 4.996664083842784,
79
+ "eval_runtime": 218.0358,
80
+ "eval_samples_per_second": 137.592,
81
+ "eval_steps_per_second": 2.151,
82
+ "flow/cos_sim": 0.5829344555767361,
83
+ "flow/improvement_ratio": 0.9943469047292209,
84
+ "flow/mag_ratio_mean": 0.5073343900475167,
85
+ "flow/mag_ratio_std": 0.06693376618217049,
86
+ "step": 2048
87
+ },
88
+ {
89
+ "epoch": 0.13239381989786023,
90
+ "grad_norm": 0.8616206645965576,
91
+ "learning_rate": 9.791307026072513e-05,
92
+ "loss": 4.643683433532715,
93
+ "step": 3072
94
+ },
95
+ {
96
+ "epoch": 0.13239381989786023,
97
+ "eval_bleu": 0.35994713222566843,
98
+ "eval_ce_clean_loss": 0.49520395553188284,
99
+ "eval_ce_pred_loss": 2.976453432396277,
100
+ "eval_flow_consistency_loss": 0.20647781791844602,
101
+ "eval_flow_mse_loss": 1.1147571771637972,
102
+ "eval_loss": 4.24396990713026,
103
+ "flow/cos_sim": 0.5977321162915179,
104
+ "flow/improvement_ratio": 0.9930047064956063,
105
+ "flow/mag_ratio_mean": 0.5432682163171423,
106
+ "flow/mag_ratio_std": 0.0777663038547105,
107
+ "step": 3072
108
+ },
109
+ {
110
+ "epoch": 0.13239381989786023,
111
+ "eval_bleu": 0.35994713222566843,
112
+ "eval_ce_clean_loss": 0.49520395553188284,
113
+ "eval_ce_pred_loss": 2.976453432396277,
114
+ "eval_flow_consistency_loss": 0.20647781791844602,
115
+ "eval_flow_mse_loss": 1.1147571771637972,
116
+ "eval_loss": 4.24396990713026,
117
+ "eval_runtime": 209.3799,
118
+ "eval_samples_per_second": 143.28,
119
+ "eval_steps_per_second": 2.24,
120
+ "flow/cos_sim": 0.5977321162915179,
121
+ "flow/improvement_ratio": 0.9930047064956063,
122
+ "flow/mag_ratio_mean": 0.5432682163171423,
123
+ "flow/mag_ratio_std": 0.0777663038547105,
124
+ "step": 3072
125
+ },
126
+ {
127
+ "epoch": 0.176525093197147,
128
+ "grad_norm": 1.9151488542556763,
129
+ "learning_rate": 9.53439476074686e-05,
130
+ "loss": 4.1653642654418945,
131
+ "step": 4096
132
+ },
133
+ {
134
+ "epoch": 0.176525093197147,
135
+ "eval_bleu": 0.40944445209476404,
136
+ "eval_ce_clean_loss": 0.25719291623086055,
137
+ "eval_ce_pred_loss": 2.6760470996787555,
138
+ "eval_flow_consistency_loss": 0.22640585670593197,
139
+ "eval_flow_mse_loss": 1.137088912382309,
140
+ "eval_loss": 3.952058234448626,
141
+ "flow/cos_sim": 0.6167135178915727,
142
+ "flow/improvement_ratio": 0.9935523163535194,
143
+ "flow/mag_ratio_mean": 0.5722723617228364,
144
+ "flow/mag_ratio_std": 0.10071539680268973,
145
+ "step": 4096
146
+ },
147
+ {
148
+ "epoch": 0.176525093197147,
149
+ "eval_bleu": 0.40944445209476404,
150
+ "eval_ce_clean_loss": 0.25719291623086055,
151
+ "eval_ce_pred_loss": 2.6760470996787555,
152
+ "eval_flow_consistency_loss": 0.22640585670593197,
153
+ "eval_flow_mse_loss": 1.137088912382309,
154
+ "eval_loss": 3.952058234448626,
155
+ "eval_runtime": 210.8013,
156
+ "eval_samples_per_second": 142.314,
157
+ "eval_steps_per_second": 2.225,
158
+ "flow/cos_sim": 0.6167135178915727,
159
+ "flow/improvement_ratio": 0.9935523163535194,
160
+ "flow/mag_ratio_mean": 0.5722723617228364,
161
+ "flow/mag_ratio_std": 0.10071539680268973,
162
+ "step": 4096
163
+ },
164
+ {
165
+ "epoch": 0.22065636649643372,
166
+ "grad_norm": 1.6371338367462158,
167
+ "learning_rate": 9.182261125213742e-05,
168
+ "loss": 3.9546115398406982,
169
+ "step": 5120
170
+ },
171
+ {
172
+ "epoch": 0.22065636649643372,
173
+ "eval_bleu": 0.44116786132852503,
174
+ "eval_ce_clean_loss": 0.15586312782408587,
175
+ "eval_ce_pred_loss": 2.4723648534400633,
176
+ "eval_flow_consistency_loss": 0.2307965545766135,
177
+ "eval_flow_mse_loss": 1.1533130791141535,
178
+ "eval_loss": 3.756662518230837,
179
+ "flow/cos_sim": 0.6392486124658889,
180
+ "flow/improvement_ratio": 0.9932308851528778,
181
+ "flow/mag_ratio_mean": 0.5966055151750284,
182
+ "flow/mag_ratio_std": 0.12516595324727772,
183
+ "step": 5120
184
+ },
185
+ {
186
+ "epoch": 0.22065636649643372,
187
+ "eval_bleu": 0.44116786132852503,
188
+ "eval_ce_clean_loss": 0.15586312782408587,
189
+ "eval_ce_pred_loss": 2.4723648534400633,
190
+ "eval_flow_consistency_loss": 0.2307965545766135,
191
+ "eval_flow_mse_loss": 1.1533130791141535,
192
+ "eval_loss": 3.756662518230837,
193
+ "eval_runtime": 211.3834,
194
+ "eval_samples_per_second": 141.922,
195
+ "eval_steps_per_second": 2.219,
196
+ "flow/cos_sim": 0.6392486124658889,
197
+ "flow/improvement_ratio": 0.9932308851528778,
198
+ "flow/mag_ratio_mean": 0.5966055151750284,
199
+ "flow/mag_ratio_std": 0.12516595324727772,
200
+ "step": 5120
201
+ },
202
+ {
203
+ "epoch": 0.26478763979572045,
204
+ "grad_norm": 1.4187636375427246,
205
+ "learning_rate": 8.742770483354739e-05,
206
+ "loss": 3.803468704223633,
207
+ "step": 6144
208
+ },
209
+ {
210
+ "epoch": 0.26478763979572045,
211
+ "eval_bleu": 0.4623410022491807,
212
+ "eval_ce_clean_loss": 0.10419444447514345,
213
+ "eval_ce_pred_loss": 2.367216001441484,
214
+ "eval_flow_consistency_loss": 0.23397522535659612,
215
+ "eval_flow_mse_loss": 1.1800828803576895,
216
+ "eval_loss": 3.6747059415398375,
217
+ "flow/cos_sim": 0.655801643059452,
218
+ "flow/improvement_ratio": 0.993431042887763,
219
+ "flow/mag_ratio_mean": 0.6149344101135157,
220
+ "flow/mag_ratio_std": 0.14765548493180955,
221
+ "step": 6144
222
+ },
223
+ {
224
+ "epoch": 0.26478763979572045,
225
+ "eval_bleu": 0.4623410022491807,
226
+ "eval_ce_clean_loss": 0.10419444447514345,
227
+ "eval_ce_pred_loss": 2.367216001441484,
228
+ "eval_flow_consistency_loss": 0.23397522535659612,
229
+ "eval_flow_mse_loss": 1.1800828803576895,
230
+ "eval_loss": 3.6747059415398375,
231
+ "eval_runtime": 213.1063,
232
+ "eval_samples_per_second": 140.775,
233
+ "eval_steps_per_second": 2.201,
234
+ "flow/cos_sim": 0.655801643059452,
235
+ "flow/improvement_ratio": 0.993431042887763,
236
+ "flow/mag_ratio_mean": 0.6149344101135157,
237
+ "flow/mag_ratio_std": 0.14765548493180955,
238
+ "step": 6144
239
+ },
240
+ {
241
+ "epoch": 0.30891891309500724,
242
+ "grad_norm": 0.977324366569519,
243
+ "learning_rate": 8.22483558761947e-05,
244
+ "loss": 3.6876096725463867,
245
+ "step": 7168
246
+ },
247
+ {
248
+ "epoch": 0.30891891309500724,
249
+ "eval_bleu": 0.48154751490891107,
250
+ "eval_ce_clean_loss": 0.07341260211204668,
251
+ "eval_ce_pred_loss": 2.2317311003772433,
252
+ "eval_flow_consistency_loss": 0.23049222644585282,
253
+ "eval_flow_mse_loss": 1.1846269220431476,
254
+ "eval_loss": 3.53894539021734,
255
+ "flow/cos_sim": 0.6778708653155167,
256
+ "flow/improvement_ratio": 0.9941845817098232,
257
+ "flow/mag_ratio_mean": 0.6407407661999213,
258
+ "flow/mag_ratio_std": 0.15953632788871652,
259
+ "step": 7168
260
+ },
261
+ {
262
+ "epoch": 0.30891891309500724,
263
+ "eval_bleu": 0.48154751490891107,
264
+ "eval_ce_clean_loss": 0.07341260211204668,
265
+ "eval_ce_pred_loss": 2.2317311003772433,
266
+ "eval_flow_consistency_loss": 0.23049222644585282,
267
+ "eval_flow_mse_loss": 1.1846269220431476,
268
+ "eval_loss": 3.53894539021734,
269
+ "eval_runtime": 211.8258,
270
+ "eval_samples_per_second": 141.626,
271
+ "eval_steps_per_second": 2.214,
272
+ "flow/cos_sim": 0.6778708653155167,
273
+ "flow/improvement_ratio": 0.9941845817098232,
274
+ "flow/mag_ratio_mean": 0.6407407661999213,
275
+ "flow/mag_ratio_std": 0.15953632788871652,
276
+ "step": 7168
277
+ },
278
+ {
279
+ "epoch": 0.353050186394294,
280
+ "grad_norm": 0.8968266844749451,
281
+ "learning_rate": 7.638710244802891e-05,
282
+ "loss": 3.5899417400360107,
283
+ "step": 8192
284
+ },
285
+ {
286
+ "epoch": 0.353050186394294,
287
+ "eval_bleu": 0.49368891647433405,
288
+ "eval_ce_clean_loss": 0.05484104749839951,
289
+ "eval_ce_pred_loss": 2.1758488762353276,
290
+ "eval_flow_consistency_loss": 0.2250279579589616,
291
+ "eval_flow_mse_loss": 1.2083756652976405,
292
+ "eval_loss": 3.5022226300066723,
293
+ "flow/cos_sim": 0.6907487210434383,
294
+ "flow/improvement_ratio": 0.9930832450832131,
295
+ "flow/mag_ratio_mean": 0.652735139006999,
296
+ "flow/mag_ratio_std": 0.16740097594794942,
297
+ "step": 8192
298
+ },
299
+ {
300
+ "epoch": 0.353050186394294,
301
+ "eval_bleu": 0.49368891647433405,
302
+ "eval_ce_clean_loss": 0.05484104749839951,
303
+ "eval_ce_pred_loss": 2.1758488762353276,
304
+ "eval_flow_consistency_loss": 0.2250279579589616,
305
+ "eval_flow_mse_loss": 1.2083756652976405,
306
+ "eval_loss": 3.5022226300066723,
307
+ "eval_runtime": 210.2696,
308
+ "eval_samples_per_second": 142.674,
309
+ "eval_steps_per_second": 2.23,
310
+ "flow/cos_sim": 0.6907487210434383,
311
+ "flow/improvement_ratio": 0.9930832450832131,
312
+ "flow/mag_ratio_mean": 0.652735139006999,
313
+ "flow/mag_ratio_std": 0.16740097594794942,
314
+ "step": 8192
315
+ },
316
+ {
317
+ "epoch": 0.3971814596935807,
318
+ "grad_norm": 1.9928840398788452,
319
+ "learning_rate": 6.997172522088177e-05,
320
+ "loss": 3.5251195430755615,
321
+ "step": 9216
322
+ },
323
+ {
324
+ "epoch": 0.3971814596935807,
325
+ "eval_bleu": 0.5060081279142332,
326
+ "eval_ce_clean_loss": 0.043055545256685604,
327
+ "eval_ce_pred_loss": 2.106957233282549,
328
+ "eval_flow_consistency_loss": 0.22306220640124544,
329
+ "eval_flow_mse_loss": 1.2162765316617514,
330
+ "eval_loss": 3.4390704296291004,
331
+ "flow/cos_sim": 0.7040034491878583,
332
+ "flow/improvement_ratio": 0.9930018180214775,
333
+ "flow/mag_ratio_mean": 0.6716202778348537,
334
+ "flow/mag_ratio_std": 0.17294156087486984,
335
+ "step": 9216
336
+ },
337
+ {
338
+ "epoch": 0.3971814596935807,
339
+ "eval_bleu": 0.5060081279142332,
340
+ "eval_ce_clean_loss": 0.043055545256685604,
341
+ "eval_ce_pred_loss": 2.106957233282549,
342
+ "eval_flow_consistency_loss": 0.22306220640124544,
343
+ "eval_flow_mse_loss": 1.2162765316617514,
344
+ "eval_loss": 3.4390704296291004,
345
+ "eval_runtime": 209.3455,
346
+ "eval_samples_per_second": 143.304,
347
+ "eval_steps_per_second": 2.24,
348
+ "flow/cos_sim": 0.7040034491878583,
349
+ "flow/improvement_ratio": 0.9930018180214775,
350
+ "flow/mag_ratio_mean": 0.6716202778348537,
351
+ "flow/mag_ratio_std": 0.17294156087486984,
352
+ "step": 9216
353
+ },
354
+ {
355
+ "epoch": 0.44131273299286744,
356
+ "grad_norm": 1.3912804126739502,
357
+ "learning_rate": 6.314377890922702e-05,
358
+ "loss": 3.487394332885742,
359
+ "step": 10240
360
+ },
361
+ {
362
+ "epoch": 0.44131273299286744,
363
+ "eval_bleu": 0.5124308712326037,
364
+ "eval_ce_clean_loss": 0.034350075578289244,
365
+ "eval_ce_pred_loss": 2.0600304321439538,
366
+ "eval_flow_consistency_loss": 0.21802372556886693,
367
+ "eval_flow_mse_loss": 1.2173504947599318,
368
+ "eval_loss": 3.3898277974077886,
369
+ "flow/cos_sim": 0.7159046165978731,
370
+ "flow/improvement_ratio": 0.9939875282458405,
371
+ "flow/mag_ratio_mean": 0.6801346318045659,
372
+ "flow/mag_ratio_std": 0.17366625556051096,
373
+ "step": 10240
374
+ },
375
+ {
376
+ "epoch": 0.44131273299286744,
377
+ "eval_bleu": 0.5124308712326037,
378
+ "eval_ce_clean_loss": 0.034350075578289244,
379
+ "eval_ce_pred_loss": 2.0600304321439538,
380
+ "eval_flow_consistency_loss": 0.21802372556886693,
381
+ "eval_flow_mse_loss": 1.2173504947599318,
382
+ "eval_loss": 3.3898277974077886,
383
+ "eval_runtime": 209.8004,
384
+ "eval_samples_per_second": 142.993,
385
+ "eval_steps_per_second": 2.235,
386
+ "flow/cos_sim": 0.7159046165978731,
387
+ "flow/improvement_ratio": 0.9939875282458405,
388
+ "flow/mag_ratio_mean": 0.6801346318045659,
389
+ "flow/mag_ratio_std": 0.17366625556051096,
390
+ "step": 10240
391
+ },
392
+ {
393
+ "epoch": 0.4854440062921542,
394
+ "grad_norm": 1.2457709312438965,
395
+ "learning_rate": 5.603332356428589e-05,
396
+ "loss": 3.454292058944702,
397
+ "step": 11264
398
+ },
399
+ {
400
+ "epoch": 0.4854440062921542,
401
+ "eval_bleu": 0.5169710884245263,
402
+ "eval_ce_clean_loss": 0.0282955012286205,
403
+ "eval_ce_pred_loss": 2.015394574289383,
404
+ "eval_flow_consistency_loss": 0.2058143128972572,
405
+ "eval_flow_mse_loss": 1.222560783693277,
406
+ "eval_loss": 3.343692081060999,
407
+ "flow/cos_sim": 0.7249747821008727,
408
+ "flow/improvement_ratio": 0.9951503481437911,
409
+ "flow/mag_ratio_mean": 0.692842490891658,
410
+ "flow/mag_ratio_std": 0.17570793422173336,
411
+ "step": 11264
412
+ },
413
+ {
414
+ "epoch": 0.4854440062921542,
415
+ "eval_bleu": 0.5169710884245263,
416
+ "eval_ce_clean_loss": 0.0282955012286205,
417
+ "eval_ce_pred_loss": 2.015394574289383,
418
+ "eval_flow_consistency_loss": 0.2058143128972572,
419
+ "eval_flow_mse_loss": 1.222560783693277,
420
+ "eval_loss": 3.343692081060999,
421
+ "eval_runtime": 208.4095,
422
+ "eval_samples_per_second": 143.947,
423
+ "eval_steps_per_second": 2.25,
424
+ "flow/cos_sim": 0.7249747821008727,
425
+ "flow/improvement_ratio": 0.9951503481437911,
426
+ "flow/mag_ratio_mean": 0.692842490891658,
427
+ "flow/mag_ratio_std": 0.17570793422173336,
428
+ "step": 11264
429
+ }
430
+ ],
431
+ "logging_steps": 1024,
432
+ "max_steps": 23204,
433
+ "num_input_tokens_seen": 0,
434
+ "num_train_epochs": 1,
435
+ "save_steps": 1024,
436
+ "stateful_callbacks": {
437
+ "TrainerControl": {
438
+ "args": {
439
+ "should_epoch_stop": false,
440
+ "should_evaluate": false,
441
+ "should_log": false,
442
+ "should_save": true,
443
+ "should_training_stop": false
444
+ },
445
+ "attributes": {}
446
+ }
447
+ },
448
+ "total_flos": 0.0,
449
+ "train_batch_size": 64,
450
+ "trial_name": null,
451
+ "trial_params": null
452
+ }
checkpoints-v5.12/checkpoint-11264/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8469bbc81a2ba0be2f5b44007faafd15c75615abe30f4f4e56171816d31caa5b
3
+ size 5137