Attila1011 commited on
Commit
651c1b1
·
verified ·
1 Parent(s): c985c7c

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -36,3 +36,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
36
  checkpoints-v5/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs -text
37
  checkpoints-v5.1/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs -text
38
  checkpoints-v5.2/checkpoint-11264/eval_state.json filter=lfs diff=lfs merge=lfs -text
 
 
36
  checkpoints-v5/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs -text
37
  checkpoints-v5.1/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs -text
38
  checkpoints-v5.2/checkpoint-11264/eval_state.json filter=lfs diff=lfs merge=lfs -text
39
+ checkpoints-v5.3/checkpoint-11264/eval_state.json filter=lfs diff=lfs merge=lfs -text
checkpoints-v5.3/checkpoint-11264/ema.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:918386e13ebed823cbd0a173c6a42adbba76cfe94106fda8c32135b8f19d9a89
3
+ size 54599592
checkpoints-v5.3/checkpoint-11264/eval_state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fd0ea4c4a0a732f97ffcf272be5602de37eab06e7ce60f4873fa47112f116fa
3
+ size 42301743
checkpoints-v5.3/checkpoint-11264/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8fe6a8fc4de104f21451dc60300a18071a9bf4d342222de71ec8967c03f770f
3
+ size 54599624
checkpoints-v5.3/checkpoint-11264/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cde21da133499f0e7071a8f8855f1b5163ee2ea4e78a99423f101930ebb60003
3
+ size 76551435
checkpoints-v5.3/checkpoint-11264/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45b1ca7f396cc5599f3f64fbc801b07bc6329480ea979bb7f593fcbee2c79ab9
3
+ size 14645
checkpoints-v5.3/checkpoint-11264/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50166b6cf492c5af2bf0da013ef651526fcf32a8101bc3fb9382b3bc7ba301dc
3
+ size 1383
checkpoints-v5.3/checkpoint-11264/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b6571b97a5ee63d34ba933d2df4e57394f03e1f6410b5a28aa769998b78ac31
3
+ size 1465
checkpoints-v5.3/checkpoint-11264/trainer_state.json ADDED
@@ -0,0 +1,452 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.5202531060920974,
6
+ "eval_steps": 1024,
7
+ "global_step": 11264,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.047295736917463395,
14
+ "grad_norm": 2.416111707687378,
15
+ "learning_rate": 0.000298828125,
16
+ "loss": 9.019423484802246,
17
+ "step": 1024
18
+ },
19
+ {
20
+ "epoch": 0.047295736917463395,
21
+ "eval_bleu": 0.2106642247482737,
22
+ "eval_ce_clean_loss": 0.29124547343820195,
23
+ "eval_ce_pred_loss": 4.992772727252142,
24
+ "eval_flow_cos_loss": 0.2647965398008965,
25
+ "eval_flow_mse_loss": 0.8835841697644969,
26
+ "eval_loss": 3.571403525191355,
27
+ "flow/cos_sim": 0.7352034720383822,
28
+ "flow/improvement_ratio": 0.9943101596614542,
29
+ "flow/mag_ratio_mean": 0.7476322943489301,
30
+ "flow/mag_ratio_std": 0.06709049885335579,
31
+ "step": 1024
32
+ },
33
+ {
34
+ "epoch": 0.047295736917463395,
35
+ "eval_bleu": 0.2106642247482737,
36
+ "eval_ce_clean_loss": 0.29124547343820195,
37
+ "eval_ce_pred_loss": 4.992772727252142,
38
+ "eval_flow_cos_loss": 0.2647965398008965,
39
+ "eval_flow_mse_loss": 0.8835841697644969,
40
+ "eval_loss": 3.571403525191355,
41
+ "eval_runtime": 183.9899,
42
+ "eval_samples_per_second": 152.144,
43
+ "eval_steps_per_second": 2.381,
44
+ "flow/cos_sim": 0.7352034720383822,
45
+ "flow/improvement_ratio": 0.9943101596614542,
46
+ "flow/mag_ratio_mean": 0.7476322943489301,
47
+ "flow/mag_ratio_std": 0.06709049885335579,
48
+ "step": 1024
49
+ },
50
+ {
51
+ "epoch": 0.09459147383492679,
52
+ "grad_norm": 3.0350542068481445,
53
+ "learning_rate": 0.0002981935930165064,
54
+ "loss": 2.5434765815734863,
55
+ "step": 2048
56
+ },
57
+ {
58
+ "epoch": 0.09459147383492679,
59
+ "eval_bleu": 0.1579851884808921,
60
+ "eval_ce_clean_loss": 0.03997791357355439,
61
+ "eval_ce_pred_loss": 5.100844090387701,
62
+ "eval_flow_cos_loss": 0.07959831905759633,
63
+ "eval_flow_mse_loss": 0.31194442517409043,
64
+ "eval_loss": 2.018676113048101,
65
+ "flow/cos_sim": 0.9204016981059557,
66
+ "flow/improvement_ratio": 0.9950934457996664,
67
+ "flow/mag_ratio_mean": 0.9062496831972305,
68
+ "flow/mag_ratio_std": 0.08489629088743636,
69
+ "step": 2048
70
+ },
71
+ {
72
+ "epoch": 0.09459147383492679,
73
+ "eval_bleu": 0.1579851884808921,
74
+ "eval_ce_clean_loss": 0.03997791357355439,
75
+ "eval_ce_pred_loss": 5.100844090387701,
76
+ "eval_flow_cos_loss": 0.07959831905759633,
77
+ "eval_flow_mse_loss": 0.31194442517409043,
78
+ "eval_loss": 2.018676113048101,
79
+ "eval_runtime": 178.4606,
80
+ "eval_samples_per_second": 156.858,
81
+ "eval_steps_per_second": 2.454,
82
+ "flow/cos_sim": 0.9204016981059557,
83
+ "flow/improvement_ratio": 0.9950934457996664,
84
+ "flow/mag_ratio_mean": 0.9062496831972305,
85
+ "flow/mag_ratio_std": 0.08489629088743636,
86
+ "step": 2048
87
+ },
88
+ {
89
+ "epoch": 0.1418872107523902,
90
+ "grad_norm": 2.6396632194519043,
91
+ "learning_rate": 0.0002927969115673581,
92
+ "loss": 1.9693392515182495,
93
+ "step": 3072
94
+ },
95
+ {
96
+ "epoch": 0.1418872107523902,
97
+ "eval_bleu": 0.11643667444645887,
98
+ "eval_ce_clean_loss": 0.01767831638489412,
99
+ "eval_ce_pred_loss": 5.5003395265640185,
100
+ "eval_flow_cos_loss": 0.053572113553410794,
101
+ "eval_flow_mse_loss": 0.22004157059813198,
102
+ "eval_loss": 1.8864184528180998,
103
+ "flow/cos_sim": 0.9464279026745661,
104
+ "flow/improvement_ratio": 0.9951256691865181,
105
+ "flow/mag_ratio_mean": 0.928381720226105,
106
+ "flow/mag_ratio_std": 0.08207591311776474,
107
+ "step": 3072
108
+ },
109
+ {
110
+ "epoch": 0.1418872107523902,
111
+ "eval_bleu": 0.11643667444645887,
112
+ "eval_ce_clean_loss": 0.01767831638489412,
113
+ "eval_ce_pred_loss": 5.5003395265640185,
114
+ "eval_flow_cos_loss": 0.053572113553410794,
115
+ "eval_flow_mse_loss": 0.22004157059813198,
116
+ "eval_loss": 1.8864184528180998,
117
+ "eval_runtime": 181.1859,
118
+ "eval_samples_per_second": 154.499,
119
+ "eval_steps_per_second": 2.417,
120
+ "flow/cos_sim": 0.9464279026745661,
121
+ "flow/improvement_ratio": 0.9951256691865181,
122
+ "flow/mag_ratio_mean": 0.928381720226105,
123
+ "flow/mag_ratio_std": 0.08207591311776474,
124
+ "step": 3072
125
+ },
126
+ {
127
+ "epoch": 0.18918294766985358,
128
+ "grad_norm": 2.8408706188201904,
129
+ "learning_rate": 0.0002839406855398806,
130
+ "loss": 1.8455302715301514,
131
+ "step": 4096
132
+ },
133
+ {
134
+ "epoch": 0.18918294766985358,
135
+ "eval_bleu": 0.12642263806175938,
136
+ "eval_ce_clean_loss": 0.009541632419932616,
137
+ "eval_ce_pred_loss": 5.360420514459479,
138
+ "eval_flow_cos_loss": 0.049334879088551484,
139
+ "eval_flow_mse_loss": 0.20619700709570488,
140
+ "eval_loss": 1.811375651185371,
141
+ "flow/cos_sim": 0.9506651357700836,
142
+ "flow/improvement_ratio": 0.9946121761243637,
143
+ "flow/mag_ratio_mean": 0.9399185652874377,
144
+ "flow/mag_ratio_std": 0.07947734282373294,
145
+ "step": 4096
146
+ },
147
+ {
148
+ "epoch": 0.18918294766985358,
149
+ "eval_bleu": 0.12642263806175938,
150
+ "eval_ce_clean_loss": 0.009541632419932616,
151
+ "eval_ce_pred_loss": 5.360420514459479,
152
+ "eval_flow_cos_loss": 0.049334879088551484,
153
+ "eval_flow_mse_loss": 0.20619700709570488,
154
+ "eval_loss": 1.811375651185371,
155
+ "eval_runtime": 180.9735,
156
+ "eval_samples_per_second": 154.68,
157
+ "eval_steps_per_second": 2.42,
158
+ "flow/cos_sim": 0.9506651357700836,
159
+ "flow/improvement_ratio": 0.9946121761243637,
160
+ "flow/mag_ratio_mean": 0.9399185652874377,
161
+ "flow/mag_ratio_std": 0.07947734282373294,
162
+ "step": 4096
163
+ },
164
+ {
165
+ "epoch": 0.236478684587317,
166
+ "grad_norm": 1.581679344177246,
167
+ "learning_rate": 0.0002718261478322326,
168
+ "loss": 1.7856016159057617,
169
+ "step": 5120
170
+ },
171
+ {
172
+ "epoch": 0.236478684587317,
173
+ "eval_bleu": 0.12365900485552281,
174
+ "eval_ce_clean_loss": 0.007601529405001534,
175
+ "eval_ce_pred_loss": 5.553882997329921,
176
+ "eval_flow_cos_loss": 0.04624706012575321,
177
+ "eval_flow_mse_loss": 0.19693581780342206,
178
+ "eval_loss": 1.8361909707931623,
179
+ "flow/cos_sim": 0.9537529678649554,
180
+ "flow/improvement_ratio": 0.9952095367320596,
181
+ "flow/mag_ratio_mean": 0.9415775195920848,
182
+ "flow/mag_ratio_std": 0.07597032391793651,
183
+ "step": 5120
184
+ },
185
+ {
186
+ "epoch": 0.236478684587317,
187
+ "eval_bleu": 0.12365900485552281,
188
+ "eval_ce_clean_loss": 0.007601529405001534,
189
+ "eval_ce_pred_loss": 5.553882997329921,
190
+ "eval_flow_cos_loss": 0.04624706012575321,
191
+ "eval_flow_mse_loss": 0.19693581780342206,
192
+ "eval_loss": 1.8361909707931623,
193
+ "eval_runtime": 183.4659,
194
+ "eval_samples_per_second": 152.579,
195
+ "eval_steps_per_second": 2.387,
196
+ "flow/cos_sim": 0.9537529678649554,
197
+ "flow/improvement_ratio": 0.9952095367320596,
198
+ "flow/mag_ratio_mean": 0.9415775195920848,
199
+ "flow/mag_ratio_std": 0.07597032391793651,
200
+ "step": 5120
201
+ },
202
+ {
203
+ "epoch": 0.2837744215047804,
204
+ "grad_norm": 6.030252933502197,
205
+ "learning_rate": 0.00025675436342907166,
206
+ "loss": 1.7446959018707275,
207
+ "step": 6144
208
+ },
209
+ {
210
+ "epoch": 0.2837744215047804,
211
+ "eval_bleu": 0.12054442540488737,
212
+ "eval_ce_clean_loss": 0.005119796283649425,
213
+ "eval_ce_pred_loss": 5.576888217229277,
214
+ "eval_flow_cos_loss": 0.04454986083419083,
215
+ "eval_flow_mse_loss": 0.19221621612299522,
216
+ "eval_loss": 1.8283241480997163,
217
+ "flow/cos_sim": 0.955450158140975,
218
+ "flow/improvement_ratio": 0.9951490931314965,
219
+ "flow/mag_ratio_mean": 0.9462693515977904,
220
+ "flow/mag_ratio_std": 0.07402643641312373,
221
+ "step": 6144
222
+ },
223
+ {
224
+ "epoch": 0.2837744215047804,
225
+ "eval_bleu": 0.12054442540488737,
226
+ "eval_ce_clean_loss": 0.005119796283649425,
227
+ "eval_ce_pred_loss": 5.576888217229277,
228
+ "eval_flow_cos_loss": 0.04454986083419083,
229
+ "eval_flow_mse_loss": 0.19221621612299522,
230
+ "eval_loss": 1.8283241480997163,
231
+ "eval_runtime": 182.7448,
232
+ "eval_samples_per_second": 153.181,
233
+ "eval_steps_per_second": 2.397,
234
+ "flow/cos_sim": 0.955450158140975,
235
+ "flow/improvement_ratio": 0.9951490931314965,
236
+ "flow/mag_ratio_mean": 0.9462693515977904,
237
+ "flow/mag_ratio_std": 0.07402643641312373,
238
+ "step": 6144
239
+ },
240
+ {
241
+ "epoch": 0.3310701584222438,
242
+ "grad_norm": 3.546534299850464,
243
+ "learning_rate": 0.00023909118962306302,
244
+ "loss": 1.71308434009552,
245
+ "step": 7168
246
+ },
247
+ {
248
+ "epoch": 0.3310701584222438,
249
+ "eval_bleu": 0.12181793641122035,
250
+ "eval_ce_clean_loss": 0.0037171931636464446,
251
+ "eval_ce_pred_loss": 5.587970645460364,
252
+ "eval_flow_cos_loss": 0.0436794480742657,
253
+ "eval_flow_mse_loss": 0.191167407724411,
254
+ "eval_loss": 1.8267241172050233,
255
+ "flow/cos_sim": 0.9563205816430044,
256
+ "flow/improvement_ratio": 0.9955269416717634,
257
+ "flow/mag_ratio_mean": 0.9423067518020873,
258
+ "flow/mag_ratio_std": 0.0696161582929903,
259
+ "step": 7168
260
+ },
261
+ {
262
+ "epoch": 0.3310701584222438,
263
+ "eval_bleu": 0.12181793641122035,
264
+ "eval_ce_clean_loss": 0.0037171931636464446,
265
+ "eval_ce_pred_loss": 5.587970645460364,
266
+ "eval_flow_cos_loss": 0.0436794480742657,
267
+ "eval_flow_mse_loss": 0.191167407724411,
268
+ "eval_loss": 1.8267241172050233,
269
+ "eval_runtime": 184.7771,
270
+ "eval_samples_per_second": 151.496,
271
+ "eval_steps_per_second": 2.37,
272
+ "flow/cos_sim": 0.9563205816430044,
273
+ "flow/improvement_ratio": 0.9955269416717634,
274
+ "flow/mag_ratio_mean": 0.9423067518020873,
275
+ "flow/mag_ratio_std": 0.0696161582929903,
276
+ "step": 7168
277
+ },
278
+ {
279
+ "epoch": 0.37836589533970716,
280
+ "grad_norm": 1.8774560689926147,
281
+ "learning_rate": 0.00021926538791819868,
282
+ "loss": 1.6859357357025146,
283
+ "step": 8192
284
+ },
285
+ {
286
+ "epoch": 0.37836589533970716,
287
+ "eval_bleu": 0.12412716525130273,
288
+ "eval_ce_clean_loss": 0.003235808240098785,
289
+ "eval_ce_pred_loss": 5.586532745187141,
290
+ "eval_flow_cos_loss": 0.041930202632870305,
291
+ "eval_flow_mse_loss": 0.18674346998538058,
292
+ "eval_loss": 1.815286138830664,
293
+ "flow/cos_sim": 0.9580698176606061,
294
+ "flow/improvement_ratio": 0.9943266426591568,
295
+ "flow/mag_ratio_mean": 0.9418132807293983,
296
+ "flow/mag_ratio_std": 0.06752052529795802,
297
+ "step": 8192
298
+ },
299
+ {
300
+ "epoch": 0.37836589533970716,
301
+ "eval_bleu": 0.12412716525130273,
302
+ "eval_ce_clean_loss": 0.003235808240098785,
303
+ "eval_ce_pred_loss": 5.586532745187141,
304
+ "eval_flow_cos_loss": 0.041930202632870305,
305
+ "eval_flow_mse_loss": 0.18674346998538058,
306
+ "eval_loss": 1.815286138830664,
307
+ "eval_runtime": 184.5163,
308
+ "eval_samples_per_second": 151.71,
309
+ "eval_steps_per_second": 2.374,
310
+ "flow/cos_sim": 0.9580698176606061,
311
+ "flow/improvement_ratio": 0.9943266426591568,
312
+ "flow/mag_ratio_mean": 0.9418132807293983,
313
+ "flow/mag_ratio_std": 0.06752052529795802,
314
+ "step": 8192
315
+ },
316
+ {
317
+ "epoch": 0.4256616322571706,
318
+ "grad_norm": 2.329709768295288,
319
+ "learning_rate": 0.00019775821613655416,
320
+ "loss": 1.6662598848342896,
321
+ "step": 9216
322
+ },
323
+ {
324
+ "epoch": 0.4256616322571706,
325
+ "eval_bleu": 0.11950310892550009,
326
+ "eval_ce_clean_loss": 0.0025977063480207537,
327
+ "eval_ce_pred_loss": 5.67026076686981,
328
+ "eval_flow_cos_loss": 0.04075824715129044,
329
+ "eval_flow_mse_loss": 0.18378718476317243,
330
+ "eval_loss": 1.8284955149916209,
331
+ "flow/cos_sim": 0.9592417728955343,
332
+ "flow/improvement_ratio": 0.9952241491237187,
333
+ "flow/mag_ratio_mean": 0.9439138316944854,
334
+ "flow/mag_ratio_std": 0.06612530227359299,
335
+ "step": 9216
336
+ },
337
+ {
338
+ "epoch": 0.4256616322571706,
339
+ "eval_bleu": 0.11950310892550009,
340
+ "eval_ce_clean_loss": 0.0025977063480207537,
341
+ "eval_ce_pred_loss": 5.67026076686981,
342
+ "eval_flow_cos_loss": 0.04075824715129044,
343
+ "eval_flow_mse_loss": 0.18378718476317243,
344
+ "eval_loss": 1.8284955149916209,
345
+ "eval_runtime": 186.0874,
346
+ "eval_samples_per_second": 150.429,
347
+ "eval_steps_per_second": 2.354,
348
+ "flow/cos_sim": 0.9592417728955343,
349
+ "flow/improvement_ratio": 0.9952241491237187,
350
+ "flow/mag_ratio_mean": 0.9439138316944854,
351
+ "flow/mag_ratio_std": 0.06612530227359299,
352
+ "step": 9216
353
+ },
354
+ {
355
+ "epoch": 0.472957369174634,
356
+ "grad_norm": 2.7362329959869385,
357
+ "learning_rate": 0.0001751142697499792,
358
+ "loss": 1.6503983736038208,
359
+ "step": 10240
360
+ },
361
+ {
362
+ "epoch": 0.472957369174634,
363
+ "eval_bleu": 0.11976017597763533,
364
+ "eval_ce_clean_loss": 0.002652165328458666,
365
+ "eval_ce_pred_loss": 5.855250802758622,
366
+ "eval_flow_cos_loss": 0.03927829848883087,
367
+ "eval_flow_mse_loss": 0.17931374369963118,
368
+ "eval_loss": 1.864370654162751,
369
+ "flow/cos_sim": 0.9607217273755705,
370
+ "flow/improvement_ratio": 0.9954166140186188,
371
+ "flow/mag_ratio_mean": 0.9475667691666242,
372
+ "flow/mag_ratio_std": 0.06483245904655217,
373
+ "step": 10240
374
+ },
375
+ {
376
+ "epoch": 0.472957369174634,
377
+ "eval_bleu": 0.11976017597763533,
378
+ "eval_ce_clean_loss": 0.002652165328458666,
379
+ "eval_ce_pred_loss": 5.855250802758622,
380
+ "eval_flow_cos_loss": 0.03927829848883087,
381
+ "eval_flow_mse_loss": 0.17931374369963118,
382
+ "eval_loss": 1.864370654162751,
383
+ "eval_runtime": 186.3375,
384
+ "eval_samples_per_second": 150.227,
385
+ "eval_steps_per_second": 2.351,
386
+ "flow/cos_sim": 0.9607217273755705,
387
+ "flow/improvement_ratio": 0.9954166140186188,
388
+ "flow/mag_ratio_mean": 0.9475667691666242,
389
+ "flow/mag_ratio_std": 0.06483245904655217,
390
+ "step": 10240
391
+ },
392
+ {
393
+ "epoch": 0.5202531060920974,
394
+ "grad_norm": 3.2729897499084473,
395
+ "learning_rate": 0.00015186187928153583,
396
+ "loss": 1.6382803916931152,
397
+ "step": 11264
398
+ },
399
+ {
400
+ "epoch": 0.5202531060920974,
401
+ "eval_bleu": 0.1233228239119365,
402
+ "eval_ce_clean_loss": 0.0019490949249535068,
403
+ "eval_ce_pred_loss": 5.623051446322437,
404
+ "eval_flow_cos_loss": 0.03965857877612931,
405
+ "eval_flow_mse_loss": 0.18266330101446474,
406
+ "eval_loss": 1.8126971375996663,
407
+ "flow/cos_sim": 0.9603414374943737,
408
+ "flow/improvement_ratio": 0.9949628561054735,
409
+ "flow/mag_ratio_mean": 0.9479109590456366,
410
+ "flow/mag_ratio_std": 0.06405468670370644,
411
+ "step": 11264
412
+ },
413
+ {
414
+ "epoch": 0.5202531060920974,
415
+ "eval_bleu": 0.1233228239119365,
416
+ "eval_ce_clean_loss": 0.0019490949249535068,
417
+ "eval_ce_pred_loss": 5.623051446322437,
418
+ "eval_flow_cos_loss": 0.03965857877612931,
419
+ "eval_flow_mse_loss": 0.18266330101446474,
420
+ "eval_loss": 1.8126971375996663,
421
+ "eval_runtime": 185.5321,
422
+ "eval_samples_per_second": 150.88,
423
+ "eval_steps_per_second": 2.361,
424
+ "flow/cos_sim": 0.9603414374943737,
425
+ "flow/improvement_ratio": 0.9949628561054735,
426
+ "flow/mag_ratio_mean": 0.9479109590456366,
427
+ "flow/mag_ratio_std": 0.06405468670370644,
428
+ "step": 11264
429
+ }
430
+ ],
431
+ "logging_steps": 1024,
432
+ "max_steps": 21651,
433
+ "num_input_tokens_seen": 0,
434
+ "num_train_epochs": 1,
435
+ "save_steps": 1024,
436
+ "stateful_callbacks": {
437
+ "TrainerControl": {
438
+ "args": {
439
+ "should_epoch_stop": false,
440
+ "should_evaluate": false,
441
+ "should_log": false,
442
+ "should_save": true,
443
+ "should_training_stop": false
444
+ },
445
+ "attributes": {}
446
+ }
447
+ },
448
+ "total_flos": 0.0,
449
+ "train_batch_size": 64,
450
+ "trial_name": null,
451
+ "trial_params": null
452
+ }
checkpoints-v5.3/checkpoint-11264/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e76013f70108cb4ddda27ec8328cf4ef6edec61c75334ae3d6e75eb2e082915f
3
+ size 5137