Attila1011 commited on
Commit
7ddfe6f
·
verified ·
1 Parent(s): 51bca73

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -51,3 +51,4 @@ checkpoints-v4.5/checkpoint-15360/eval_state.json filter=lfs diff=lfs merge=lfs
51
  checkpoints-v5.9/checkpoint-11264/eval_state.json filter=lfs diff=lfs merge=lfs -text
52
  checkpoints-v5.10/checkpoint-7168/eval_state.json filter=lfs diff=lfs merge=lfs -text
53
  checkpoints-v4.5/checkpoint-21504/eval_state.json filter=lfs diff=lfs merge=lfs -text
 
 
51
  checkpoints-v5.9/checkpoint-11264/eval_state.json filter=lfs diff=lfs merge=lfs -text
52
  checkpoints-v5.10/checkpoint-7168/eval_state.json filter=lfs diff=lfs merge=lfs -text
53
  checkpoints-v4.5/checkpoint-21504/eval_state.json filter=lfs diff=lfs merge=lfs -text
54
+ checkpoints-v5.11/checkpoint-10240/eval_state.json filter=lfs diff=lfs merge=lfs -text
checkpoints-v5.11/checkpoint-10240/ema.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41499b4d7bb1556af3feff86eda28b00e8d3dba3def7737434f6b80e5ae78f4c
3
+ size 55150648
checkpoints-v5.11/checkpoint-10240/eval_state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a083769287760925a8dd73ab66868102703115faee1d1657ed250c789373753c
3
+ size 59162021
checkpoints-v5.11/checkpoint-10240/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4038bd5206f65c2615de3198601fbacb5b23e99ca6b2ffc48a43fb2014a82001
3
+ size 55150680
checkpoints-v5.11/checkpoint-10240/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f78465715bb28134a98872ad456f604761b722e3a8dc69cb430807fdb4af763
3
+ size 77724619
checkpoints-v5.11/checkpoint-10240/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:395a99c9379c14b57848072fe8b6246133ccdfbc9ea5f8c57ebc64605d710240
3
+ size 14645
checkpoints-v5.11/checkpoint-10240/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52b7da564e6046cbf689eeaeb9be88fd2c9342eb25ebe600069b5454c6acfd21
3
+ size 1383
checkpoints-v5.11/checkpoint-10240/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:012f7b23a26c3659b3d642258b23d2a9db5c112a8101c3ff10a13ac93f47a91e
3
+ size 1465
checkpoints-v5.11/checkpoint-10240/trainer_state.json ADDED
@@ -0,0 +1,414 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.44131273299286744,
6
+ "eval_steps": 1024,
7
+ "global_step": 10240,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.04413127329928675,
14
+ "grad_norm": 2.9264373779296875,
15
+ "learning_rate": 9.990234375e-05,
16
+ "loss": 12.596921920776367,
17
+ "step": 1024
18
+ },
19
+ {
20
+ "epoch": 0.04413127329928675,
21
+ "eval_bleu": 0.13005541908821117,
22
+ "eval_ce_clean_loss": 2.286205404602897,
23
+ "eval_ce_pred_loss": 5.3831145524470285,
24
+ "eval_flow_consistency_loss": 0.13552273369864867,
25
+ "eval_flow_mse_loss": 0.9414198791294464,
26
+ "eval_loss": 7.06356679757775,
27
+ "flow/cos_sim": 0.35049253765708094,
28
+ "flow/improvement_ratio": 0.9494910390138118,
29
+ "flow/mag_ratio_mean": 0.2741034229172827,
30
+ "flow/mag_ratio_std": 0.0854220198892327,
31
+ "step": 1024
32
+ },
33
+ {
34
+ "epoch": 0.04413127329928675,
35
+ "eval_bleu": 0.13005541908821117,
36
+ "eval_ce_clean_loss": 2.286205404602897,
37
+ "eval_ce_pred_loss": 5.3831145524470285,
38
+ "eval_flow_consistency_loss": 0.13552273369864867,
39
+ "eval_flow_mse_loss": 0.9414198791294464,
40
+ "eval_loss": 7.06356679757775,
41
+ "eval_runtime": 212.0249,
42
+ "eval_samples_per_second": 141.493,
43
+ "eval_steps_per_second": 2.212,
44
+ "flow/cos_sim": 0.35049253765708094,
45
+ "flow/improvement_ratio": 0.9494910390138118,
46
+ "flow/mag_ratio_mean": 0.2741034229172827,
47
+ "flow/mag_ratio_std": 0.0854220198892327,
48
+ "step": 1024
49
+ },
50
+ {
51
+ "epoch": 0.0882625465985735,
52
+ "grad_norm": 1.9657727479934692,
53
+ "learning_rate": 9.9476028157316e-05,
54
+ "loss": 5.091745853424072,
55
+ "step": 2048
56
+ },
57
+ {
58
+ "epoch": 0.0882625465985735,
59
+ "eval_bleu": 0.32513472477417255,
60
+ "eval_ce_clean_loss": 0.36479977980605577,
61
+ "eval_ce_pred_loss": 3.6980401625765413,
62
+ "eval_flow_consistency_loss": 0.1417246216427543,
63
+ "eval_flow_mse_loss": 1.0055452192515961,
64
+ "eval_loss": 4.0298353888586895,
65
+ "flow/cos_sim": 0.4995506982813512,
66
+ "flow/improvement_ratio": 0.9729367145088944,
67
+ "flow/mag_ratio_mean": 0.4342197393303487,
68
+ "flow/mag_ratio_std": 0.1229526477772544,
69
+ "step": 2048
70
+ },
71
+ {
72
+ "epoch": 0.0882625465985735,
73
+ "eval_bleu": 0.32513472477417255,
74
+ "eval_ce_clean_loss": 0.36479977980605577,
75
+ "eval_ce_pred_loss": 3.6980401625765413,
76
+ "eval_flow_consistency_loss": 0.1417246216427543,
77
+ "eval_flow_mse_loss": 1.0055452192515961,
78
+ "eval_loss": 4.0298353888586895,
79
+ "eval_runtime": 209.9563,
80
+ "eval_samples_per_second": 142.887,
81
+ "eval_steps_per_second": 2.234,
82
+ "flow/cos_sim": 0.4995506982813512,
83
+ "flow/improvement_ratio": 0.9729367145088944,
84
+ "flow/mag_ratio_mean": 0.4342197393303487,
85
+ "flow/mag_ratio_std": 0.1229526477772544,
86
+ "step": 2048
87
+ },
88
+ {
89
+ "epoch": 0.13239381989786023,
90
+ "grad_norm": 1.1680883169174194,
91
+ "learning_rate": 9.791307026072513e-05,
92
+ "loss": 3.746001720428467,
93
+ "step": 3072
94
+ },
95
+ {
96
+ "epoch": 0.13239381989786023,
97
+ "eval_bleu": 0.3841303115629711,
98
+ "eval_ce_clean_loss": 0.13881633935897336,
99
+ "eval_ce_pred_loss": 3.082805846037387,
100
+ "eval_flow_consistency_loss": 0.1823698268262054,
101
+ "eval_flow_mse_loss": 1.0654335448991006,
102
+ "eval_loss": 3.45339886466069,
103
+ "flow/cos_sim": 0.5774906538188584,
104
+ "flow/improvement_ratio": 0.9910711745209277,
105
+ "flow/mag_ratio_mean": 0.5197838679559703,
106
+ "flow/mag_ratio_std": 0.101421576716117,
107
+ "step": 3072
108
+ },
109
+ {
110
+ "epoch": 0.13239381989786023,
111
+ "eval_bleu": 0.3841303115629711,
112
+ "eval_ce_clean_loss": 0.13881633935897336,
113
+ "eval_ce_pred_loss": 3.082805846037387,
114
+ "eval_flow_consistency_loss": 0.1823698268262054,
115
+ "eval_flow_mse_loss": 1.0654335448991006,
116
+ "eval_loss": 3.45339886466069,
117
+ "eval_runtime": 212.9337,
118
+ "eval_samples_per_second": 140.889,
119
+ "eval_steps_per_second": 2.203,
120
+ "flow/cos_sim": 0.5774906538188584,
121
+ "flow/improvement_ratio": 0.9910711745209277,
122
+ "flow/mag_ratio_mean": 0.5197838679559703,
123
+ "flow/mag_ratio_std": 0.101421576716117,
124
+ "step": 3072
125
+ },
126
+ {
127
+ "epoch": 0.176525093197147,
128
+ "grad_norm": 0.6403291821479797,
129
+ "learning_rate": 9.53439476074686e-05,
130
+ "loss": 3.401794672012329,
131
+ "step": 4096
132
+ },
133
+ {
134
+ "epoch": 0.176525093197147,
135
+ "eval_bleu": 0.41164963024752277,
136
+ "eval_ce_clean_loss": 0.075089068999931,
137
+ "eval_ce_pred_loss": 2.8144510768369826,
138
+ "eval_flow_consistency_loss": 0.20391849938359088,
139
+ "eval_flow_mse_loss": 1.0832029329434132,
140
+ "eval_loss": 3.230366972717903,
141
+ "flow/cos_sim": 0.6372234959846367,
142
+ "flow/improvement_ratio": 0.9945152276106226,
143
+ "flow/mag_ratio_mean": 0.590258006602208,
144
+ "flow/mag_ratio_std": 0.08975283501308355,
145
+ "step": 4096
146
+ },
147
+ {
148
+ "epoch": 0.176525093197147,
149
+ "eval_bleu": 0.41164963024752277,
150
+ "eval_ce_clean_loss": 0.075089068999931,
151
+ "eval_ce_pred_loss": 2.8144510768369826,
152
+ "eval_flow_consistency_loss": 0.20391849938359088,
153
+ "eval_flow_mse_loss": 1.0832029329434132,
154
+ "eval_loss": 3.230366972717903,
155
+ "eval_runtime": 212.6089,
156
+ "eval_samples_per_second": 141.104,
157
+ "eval_steps_per_second": 2.206,
158
+ "flow/cos_sim": 0.6372234959846367,
159
+ "flow/improvement_ratio": 0.9945152276106226,
160
+ "flow/mag_ratio_mean": 0.590258006602208,
161
+ "flow/mag_ratio_std": 0.08975283501308355,
162
+ "step": 4096
163
+ },
164
+ {
165
+ "epoch": 0.22065636649643372,
166
+ "grad_norm": 0.6607010364532471,
167
+ "learning_rate": 9.18264920723673e-05,
168
+ "loss": 3.239504337310791,
169
+ "step": 5120
170
+ },
171
+ {
172
+ "epoch": 0.22065636649643372,
173
+ "eval_bleu": 0.4271325450649275,
174
+ "eval_ce_clean_loss": 0.04691080389611884,
175
+ "eval_ce_pred_loss": 2.6821144310904463,
176
+ "eval_flow_consistency_loss": 0.19756401179314675,
177
+ "eval_flow_mse_loss": 1.05885077488702,
178
+ "eval_loss": 3.0820236617822383,
179
+ "flow/cos_sim": 0.6784213738146622,
180
+ "flow/improvement_ratio": 0.9945986220069023,
181
+ "flow/mag_ratio_mean": 0.6420265626805678,
182
+ "flow/mag_ratio_std": 0.09060588493339543,
183
+ "step": 5120
184
+ },
185
+ {
186
+ "epoch": 0.22065636649643372,
187
+ "eval_bleu": 0.4271325450649275,
188
+ "eval_ce_clean_loss": 0.04691080389611884,
189
+ "eval_ce_pred_loss": 2.6821144310904463,
190
+ "eval_flow_consistency_loss": 0.19756401179314675,
191
+ "eval_flow_mse_loss": 1.05885077488702,
192
+ "eval_loss": 3.0820236617822383,
193
+ "eval_runtime": 211.4363,
194
+ "eval_samples_per_second": 141.887,
195
+ "eval_steps_per_second": 2.218,
196
+ "flow/cos_sim": 0.6784213738146622,
197
+ "flow/improvement_ratio": 0.9945986220069023,
198
+ "flow/mag_ratio_mean": 0.6420265626805678,
199
+ "flow/mag_ratio_std": 0.09060588493339543,
200
+ "step": 5120
201
+ },
202
+ {
203
+ "epoch": 0.26478763979572045,
204
+ "grad_norm": 1.069400429725647,
205
+ "learning_rate": 8.742770483354739e-05,
206
+ "loss": 3.1248061656951904,
207
+ "step": 6144
208
+ },
209
+ {
210
+ "epoch": 0.26478763979572045,
211
+ "eval_bleu": 0.4376915429907597,
212
+ "eval_ce_clean_loss": 0.03250600234754304,
213
+ "eval_ce_pred_loss": 2.6012275836615166,
214
+ "eval_flow_consistency_loss": 0.18590141693030848,
215
+ "eval_flow_mse_loss": 1.0617919451138105,
216
+ "eval_loss": 3.0081079306124625,
217
+ "flow/cos_sim": 0.7023057956685389,
218
+ "flow/improvement_ratio": 0.9948139922705286,
219
+ "flow/mag_ratio_mean": 0.6713837680023617,
220
+ "flow/mag_ratio_std": 0.10080839297982422,
221
+ "step": 6144
222
+ },
223
+ {
224
+ "epoch": 0.26478763979572045,
225
+ "eval_bleu": 0.4376915429907597,
226
+ "eval_ce_clean_loss": 0.03250600234754304,
227
+ "eval_ce_pred_loss": 2.6012275836615166,
228
+ "eval_flow_consistency_loss": 0.18590141693030848,
229
+ "eval_flow_mse_loss": 1.0617919451138105,
230
+ "eval_loss": 3.0081079306124625,
231
+ "eval_runtime": 213.8497,
232
+ "eval_samples_per_second": 140.285,
233
+ "eval_steps_per_second": 2.193,
234
+ "flow/cos_sim": 0.7023057956685389,
235
+ "flow/improvement_ratio": 0.9948139922705286,
236
+ "flow/mag_ratio_mean": 0.6713837680023617,
237
+ "flow/mag_ratio_std": 0.10080839297982422,
238
+ "step": 6144
239
+ },
240
+ {
241
+ "epoch": 0.30891891309500724,
242
+ "grad_norm": 1.219580054283142,
243
+ "learning_rate": 8.22483558761947e-05,
244
+ "loss": 3.0330727100372314,
245
+ "step": 7168
246
+ },
247
+ {
248
+ "epoch": 0.30891891309500724,
249
+ "eval_bleu": 0.4551949901017014,
250
+ "eval_ce_clean_loss": 0.02424235607205487,
251
+ "eval_ce_pred_loss": 2.500773771993641,
252
+ "eval_flow_consistency_loss": 0.17524592572056663,
253
+ "eval_flow_mse_loss": 1.0443366509256586,
254
+ "eval_loss": 2.9067435889864273,
255
+ "flow/cos_sim": 0.7204479055363995,
256
+ "flow/improvement_ratio": 0.9950863978247653,
257
+ "flow/mag_ratio_mean": 0.6897140079215646,
258
+ "flow/mag_ratio_std": 0.10839694198260684,
259
+ "step": 7168
260
+ },
261
+ {
262
+ "epoch": 0.30891891309500724,
263
+ "eval_bleu": 0.4551949901017014,
264
+ "eval_ce_clean_loss": 0.02424235607205487,
265
+ "eval_ce_pred_loss": 2.500773771993641,
266
+ "eval_flow_consistency_loss": 0.17524592572056663,
267
+ "eval_flow_mse_loss": 1.0443366509256586,
268
+ "eval_loss": 2.9067435889864273,
269
+ "eval_runtime": 215.0284,
270
+ "eval_samples_per_second": 139.516,
271
+ "eval_steps_per_second": 2.181,
272
+ "flow/cos_sim": 0.7204479055363995,
273
+ "flow/improvement_ratio": 0.9950863978247653,
274
+ "flow/mag_ratio_mean": 0.6897140079215646,
275
+ "flow/mag_ratio_std": 0.10839694198260684,
276
+ "step": 7168
277
+ },
278
+ {
279
+ "epoch": 0.353050186394294,
280
+ "grad_norm": 2.246631145477295,
281
+ "learning_rate": 7.638710244802891e-05,
282
+ "loss": 2.95896315574646,
283
+ "step": 8192
284
+ },
285
+ {
286
+ "epoch": 0.353050186394294,
287
+ "eval_bleu": 0.46141542061426044,
288
+ "eval_ce_clean_loss": 0.01877150087674925,
289
+ "eval_ce_pred_loss": 2.4870052022466274,
290
+ "eval_flow_consistency_loss": 0.16635312886634615,
291
+ "eval_flow_mse_loss": 1.0426371824512604,
292
+ "eval_loss": 2.8854888682680597,
293
+ "flow/cos_sim": 0.7327545807559861,
294
+ "flow/improvement_ratio": 0.9935595951100656,
295
+ "flow/mag_ratio_mean": 0.7086031040403126,
296
+ "flow/mag_ratio_std": 0.11536222337278476,
297
+ "step": 8192
298
+ },
299
+ {
300
+ "epoch": 0.353050186394294,
301
+ "eval_bleu": 0.46141542061426044,
302
+ "eval_ce_clean_loss": 0.01877150087674925,
303
+ "eval_ce_pred_loss": 2.4870052022466274,
304
+ "eval_flow_consistency_loss": 0.16635312886634615,
305
+ "eval_flow_mse_loss": 1.0426371824512604,
306
+ "eval_loss": 2.8854888682680597,
307
+ "eval_runtime": 212.0987,
308
+ "eval_samples_per_second": 141.444,
309
+ "eval_steps_per_second": 2.211,
310
+ "flow/cos_sim": 0.7327545807559861,
311
+ "flow/improvement_ratio": 0.9935595951100656,
312
+ "flow/mag_ratio_mean": 0.7086031040403126,
313
+ "flow/mag_ratio_std": 0.11536222337278476,
314
+ "step": 8192
315
+ },
316
+ {
317
+ "epoch": 0.3971814596935807,
318
+ "grad_norm": 1.2935556173324585,
319
+ "learning_rate": 6.997821756319211e-05,
320
+ "loss": 2.9052517414093018,
321
+ "step": 9216
322
+ },
323
+ {
324
+ "epoch": 0.3971814596935807,
325
+ "eval_bleu": 0.46944041426382027,
326
+ "eval_ce_clean_loss": 0.014995425907787737,
327
+ "eval_ce_pred_loss": 2.4030428840153255,
328
+ "eval_flow_consistency_loss": 0.1594380565098862,
329
+ "eval_flow_mse_loss": 1.0466467982162035,
330
+ "eval_loss": 2.8234912571368187,
331
+ "flow/cos_sim": 0.744667167983838,
332
+ "flow/improvement_ratio": 0.993155452361239,
333
+ "flow/mag_ratio_mean": 0.7179579849182162,
334
+ "flow/mag_ratio_std": 0.11927568821955337,
335
+ "step": 9216
336
+ },
337
+ {
338
+ "epoch": 0.3971814596935807,
339
+ "eval_bleu": 0.46944041426382027,
340
+ "eval_ce_clean_loss": 0.014995425907787737,
341
+ "eval_ce_pred_loss": 2.4030428840153255,
342
+ "eval_flow_consistency_loss": 0.1594380565098862,
343
+ "eval_flow_mse_loss": 1.0466467982162035,
344
+ "eval_loss": 2.8234912571368187,
345
+ "eval_runtime": 210.1213,
346
+ "eval_samples_per_second": 142.775,
347
+ "eval_steps_per_second": 2.232,
348
+ "flow/cos_sim": 0.744667167983838,
349
+ "flow/improvement_ratio": 0.993155452361239,
350
+ "flow/mag_ratio_mean": 0.7179579849182162,
351
+ "flow/mag_ratio_std": 0.11927568821955337,
352
+ "step": 9216
353
+ },
354
+ {
355
+ "epoch": 0.44131273299286744,
356
+ "grad_norm": 0.7428159117698669,
357
+ "learning_rate": 6.314377890922702e-05,
358
+ "loss": 2.8689093589782715,
359
+ "step": 10240
360
+ },
361
+ {
362
+ "epoch": 0.44131273299286744,
363
+ "eval_bleu": 0.4685034093487891,
364
+ "eval_ce_clean_loss": 0.01237281793450464,
365
+ "eval_ce_pred_loss": 2.384004484361677,
366
+ "eval_flow_consistency_loss": 0.1589052606302538,
367
+ "eval_flow_mse_loss": 1.0222321362383584,
368
+ "eval_loss": 2.7828606944094334,
369
+ "flow/cos_sim": 0.7520584338255274,
370
+ "flow/improvement_ratio": 0.9938689404204964,
371
+ "flow/mag_ratio_mean": 0.7304074718499742,
372
+ "flow/mag_ratio_std": 0.12232410612263914,
373
+ "step": 10240
374
+ },
375
+ {
376
+ "epoch": 0.44131273299286744,
377
+ "eval_bleu": 0.4685034093487891,
378
+ "eval_ce_clean_loss": 0.01237281793450464,
379
+ "eval_ce_pred_loss": 2.384004484361677,
380
+ "eval_flow_consistency_loss": 0.1589052606302538,
381
+ "eval_flow_mse_loss": 1.0222321362383584,
382
+ "eval_loss": 2.7828606944094334,
383
+ "eval_runtime": 211.3456,
384
+ "eval_samples_per_second": 141.948,
385
+ "eval_steps_per_second": 2.219,
386
+ "flow/cos_sim": 0.7520584338255274,
387
+ "flow/improvement_ratio": 0.9938689404204964,
388
+ "flow/mag_ratio_mean": 0.7304074718499742,
389
+ "flow/mag_ratio_std": 0.12232410612263914,
390
+ "step": 10240
391
+ }
392
+ ],
393
+ "logging_steps": 1024,
394
+ "max_steps": 23204,
395
+ "num_input_tokens_seen": 0,
396
+ "num_train_epochs": 1,
397
+ "save_steps": 1024,
398
+ "stateful_callbacks": {
399
+ "TrainerControl": {
400
+ "args": {
401
+ "should_epoch_stop": false,
402
+ "should_evaluate": false,
403
+ "should_log": false,
404
+ "should_save": true,
405
+ "should_training_stop": false
406
+ },
407
+ "attributes": {}
408
+ }
409
+ },
410
+ "total_flos": 0.0,
411
+ "train_batch_size": 64,
412
+ "trial_name": null,
413
+ "trial_params": null
414
+ }
checkpoints-v5.11/checkpoint-10240/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8469bbc81a2ba0be2f5b44007faafd15c75615abe30f4f4e56171816d31caa5b
3
+ size 5137