Attila1011 commited on
Commit
b99dada
·
verified ·
1 Parent(s): d8a33cf

Upload folder using huggingface_hub

Browse files
checkpoints-v2.8-h-2/checkpoint-21651/ema.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6cab88e892c6d3d398146ba3e47141b43f87d7a5854120892dae7c50a7d5a44
3
+ size 550088
checkpoints-v2.8-h-2/checkpoint-21651/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71f293191018d1ae9cebc826c13788048803dbb6c59d84bd30c059b8f5e5a8e5
3
+ size 19318464
checkpoints-v2.8-h-2/checkpoint-21651/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2972d42888f6f806590a53719723fbe36f4b3bb3d0cff683e1abb63e5ed184c2
3
+ size 1175115
checkpoints-v2.8-h-2/checkpoint-21651/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13e90563fac8d100230efe8ed83cb90f5dba5cae4ef0fd5b21a7a334409667e3
3
+ size 14645
checkpoints-v2.8-h-2/checkpoint-21651/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:292ad3fa2862942d57211b5dc215b794892749e4755a645c17374656a0c1b093
3
+ size 1383
checkpoints-v2.8-h-2/checkpoint-21651/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:832d6d1252c8f66c3851354151edb931a66032460323a9421e37c6a2a7b4fa65
3
+ size 1465
checkpoints-v2.8-h-2/checkpoint-21651/trainer_state.json ADDED
@@ -0,0 +1,706 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 1024,
7
+ "global_step": 21651,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.047295736917463395,
14
+ "grad_norm": 0.13216529786586761,
15
+ "learning_rate": 0.0003330078125,
16
+ "loss": 2.2540314197540283,
17
+ "step": 1024
18
+ },
19
+ {
20
+ "epoch": 0.047295736917463395,
21
+ "eval_cos_loss": 0.5608168508364185,
22
+ "eval_loss": 1.840172099740538,
23
+ "eval_mse_loss": 1.5597636724171573,
24
+ "flow/cos_sim": 0.4391831588255216,
25
+ "flow/improvement_ratio": 0.9548978971564062,
26
+ "flow/mag_ratio_mean": 0.4252509521022779,
27
+ "flow/mag_ratio_std": 0.15304358966954767,
28
+ "step": 1024
29
+ },
30
+ {
31
+ "epoch": 0.047295736917463395,
32
+ "eval_cos_loss": 0.5608168508364185,
33
+ "eval_loss": 1.840172099740538,
34
+ "eval_mse_loss": 1.5597636724171573,
35
+ "eval_runtime": 36.5199,
36
+ "eval_samples_per_second": 766.514,
37
+ "eval_steps_per_second": 11.993,
38
+ "flow/cos_sim": 0.4391831588255216,
39
+ "flow/improvement_ratio": 0.9548978971564062,
40
+ "flow/mag_ratio_mean": 0.4252509521022779,
41
+ "flow/mag_ratio_std": 0.15304358966954767,
42
+ "step": 1024
43
+ },
44
+ {
45
+ "epoch": 0.09459147383492679,
46
+ "grad_norm": 0.4216933250427246,
47
+ "learning_rate": 0.0006663411458333333,
48
+ "loss": 1.7353554964065552,
49
+ "step": 2048
50
+ },
51
+ {
52
+ "epoch": 0.09459147383492679,
53
+ "eval_cos_loss": 0.470765205482914,
54
+ "eval_loss": 1.593590806063996,
55
+ "eval_mse_loss": 1.3582082039689365,
56
+ "flow/cos_sim": 0.5292348125481714,
57
+ "flow/improvement_ratio": 0.9650679060585423,
58
+ "flow/mag_ratio_mean": 0.5083687867475971,
59
+ "flow/mag_ratio_std": 0.18558434835852008,
60
+ "step": 2048
61
+ },
62
+ {
63
+ "epoch": 0.09459147383492679,
64
+ "eval_cos_loss": 0.470765205482914,
65
+ "eval_loss": 1.593590806063996,
66
+ "eval_mse_loss": 1.3582082039689365,
67
+ "eval_runtime": 36.3508,
68
+ "eval_samples_per_second": 770.08,
69
+ "eval_steps_per_second": 12.049,
70
+ "flow/cos_sim": 0.5292348125481714,
71
+ "flow/improvement_ratio": 0.9650679060585423,
72
+ "flow/mag_ratio_mean": 0.5083687867475971,
73
+ "flow/mag_ratio_std": 0.18558434835852008,
74
+ "step": 2048
75
+ },
76
+ {
77
+ "epoch": 0.1418872107523902,
78
+ "grad_norm": 0.27868369221687317,
79
+ "learning_rate": 0.0009996744791666667,
80
+ "loss": 1.6089088916778564,
81
+ "step": 3072
82
+ },
83
+ {
84
+ "epoch": 0.1418872107523902,
85
+ "eval_cos_loss": 0.44682060161682025,
86
+ "eval_loss": 1.5164495962395517,
87
+ "eval_mse_loss": 1.293039296043518,
88
+ "flow/cos_sim": 0.5531794064121159,
89
+ "flow/improvement_ratio": 0.9667330909537398,
90
+ "flow/mag_ratio_mean": 0.5384583678691899,
91
+ "flow/mag_ratio_std": 0.2053203195265439,
92
+ "step": 3072
93
+ },
94
+ {
95
+ "epoch": 0.1418872107523902,
96
+ "eval_cos_loss": 0.44682060161682025,
97
+ "eval_loss": 1.5164495962395517,
98
+ "eval_mse_loss": 1.293039296043518,
99
+ "eval_runtime": 36.3439,
100
+ "eval_samples_per_second": 770.225,
101
+ "eval_steps_per_second": 12.052,
102
+ "flow/cos_sim": 0.5531794064121159,
103
+ "flow/improvement_ratio": 0.9667330909537398,
104
+ "flow/mag_ratio_mean": 0.5384583678691899,
105
+ "flow/mag_ratio_std": 0.2053203195265439,
106
+ "step": 3072
107
+ },
108
+ {
109
+ "epoch": 0.18918294766985358,
110
+ "grad_norm": 0.15250740945339203,
111
+ "learning_rate": 0.0009925378645256155,
112
+ "loss": 1.5571845769882202,
113
+ "step": 4096
114
+ },
115
+ {
116
+ "epoch": 0.18918294766985358,
117
+ "eval_cos_loss": 0.43560234120447344,
118
+ "eval_loss": 1.4811601312193152,
119
+ "eval_mse_loss": 1.2633589582356144,
120
+ "flow/cos_sim": 0.5643976608367816,
121
+ "flow/improvement_ratio": 0.967472982324966,
122
+ "flow/mag_ratio_mean": 0.5474448244865626,
123
+ "flow/mag_ratio_std": 0.21047595840746955,
124
+ "step": 4096
125
+ },
126
+ {
127
+ "epoch": 0.18918294766985358,
128
+ "eval_cos_loss": 0.43560234120447344,
129
+ "eval_loss": 1.4811601312193152,
130
+ "eval_mse_loss": 1.2633589582356144,
131
+ "eval_runtime": 37.7437,
132
+ "eval_samples_per_second": 741.661,
133
+ "eval_steps_per_second": 11.605,
134
+ "flow/cos_sim": 0.5643976608367816,
135
+ "flow/improvement_ratio": 0.967472982324966,
136
+ "flow/mag_ratio_mean": 0.5474448244865626,
137
+ "flow/mag_ratio_std": 0.21047595840746955,
138
+ "step": 4096
139
+ },
140
+ {
141
+ "epoch": 0.236478684587317,
142
+ "grad_norm": 0.13595673441886902,
143
+ "learning_rate": 0.0009703455149398919,
144
+ "loss": 1.531019687652588,
145
+ "step": 5120
146
+ },
147
+ {
148
+ "epoch": 0.236478684587317,
149
+ "eval_cos_loss": 0.42780890196697896,
150
+ "eval_loss": 1.4542832099683753,
151
+ "eval_mse_loss": 1.2403787578621956,
152
+ "flow/cos_sim": 0.572191093202051,
153
+ "flow/improvement_ratio": 0.9686366873516884,
154
+ "flow/mag_ratio_mean": 0.5504985138705877,
155
+ "flow/mag_ratio_std": 0.21518446338367245,
156
+ "step": 5120
157
+ },
158
+ {
159
+ "epoch": 0.236478684587317,
160
+ "eval_cos_loss": 0.42780890196697896,
161
+ "eval_loss": 1.4542832099683753,
162
+ "eval_mse_loss": 1.2403787578621956,
163
+ "eval_runtime": 36.5108,
164
+ "eval_samples_per_second": 766.704,
165
+ "eval_steps_per_second": 11.996,
166
+ "flow/cos_sim": 0.572191093202051,
167
+ "flow/improvement_ratio": 0.9686366873516884,
168
+ "flow/mag_ratio_mean": 0.5504985138705877,
169
+ "flow/mag_ratio_std": 0.21518446338367245,
170
+ "step": 5120
171
+ },
172
+ {
173
+ "epoch": 0.2837744215047804,
174
+ "grad_norm": 0.1523830145597458,
175
+ "learning_rate": 0.0009340866457980386,
176
+ "loss": 1.5127094984054565,
177
+ "step": 6144
178
+ },
179
+ {
180
+ "epoch": 0.2837744215047804,
181
+ "eval_cos_loss": 0.4244332621767096,
182
+ "eval_loss": 1.4425126316884882,
183
+ "eval_mse_loss": 1.2302960026754093,
184
+ "flow/cos_sim": 0.5755667399325871,
185
+ "flow/improvement_ratio": 0.9678758921688551,
186
+ "flow/mag_ratio_mean": 0.5517790397824762,
187
+ "flow/mag_ratio_std": 0.21870258596798056,
188
+ "step": 6144
189
+ },
190
+ {
191
+ "epoch": 0.2837744215047804,
192
+ "eval_cos_loss": 0.4244332621767096,
193
+ "eval_loss": 1.4425126316884882,
194
+ "eval_mse_loss": 1.2302960026754093,
195
+ "eval_runtime": 37.204,
196
+ "eval_samples_per_second": 752.419,
197
+ "eval_steps_per_second": 11.773,
198
+ "flow/cos_sim": 0.5755667399325871,
199
+ "flow/improvement_ratio": 0.9678758921688551,
200
+ "flow/mag_ratio_mean": 0.5517790397824762,
201
+ "flow/mag_ratio_std": 0.21870258596798056,
202
+ "step": 6144
203
+ },
204
+ {
205
+ "epoch": 0.3310701584222438,
206
+ "grad_norm": 0.3283827602863312,
207
+ "learning_rate": 0.0008848456431007006,
208
+ "loss": 1.4992624521255493,
209
+ "step": 7168
210
+ },
211
+ {
212
+ "epoch": 0.3310701584222438,
213
+ "eval_cos_loss": 0.42202579097388543,
214
+ "eval_loss": 1.4335319138553044,
215
+ "eval_mse_loss": 1.2225190172456715,
216
+ "flow/cos_sim": 0.5779742277376184,
217
+ "flow/improvement_ratio": 0.9681859147058774,
218
+ "flow/mag_ratio_mean": 0.556161379705281,
219
+ "flow/mag_ratio_std": 0.22328348849053797,
220
+ "step": 7168
221
+ },
222
+ {
223
+ "epoch": 0.3310701584222438,
224
+ "eval_cos_loss": 0.42202579097388543,
225
+ "eval_loss": 1.4335319138553044,
226
+ "eval_mse_loss": 1.2225190172456715,
227
+ "eval_runtime": 37.4238,
228
+ "eval_samples_per_second": 748.0,
229
+ "eval_steps_per_second": 11.704,
230
+ "flow/cos_sim": 0.5779742277376184,
231
+ "flow/improvement_ratio": 0.9681859147058774,
232
+ "flow/mag_ratio_mean": 0.556161379705281,
233
+ "flow/mag_ratio_std": 0.22328348849053797,
234
+ "step": 7168
235
+ },
236
+ {
237
+ "epoch": 0.37836589533970716,
238
+ "grad_norm": 0.132944718003273,
239
+ "learning_rate": 0.0008240951466528818,
240
+ "loss": 1.4886243343353271,
241
+ "step": 8192
242
+ },
243
+ {
244
+ "epoch": 0.37836589533970716,
245
+ "eval_cos_loss": 0.41840511710132094,
246
+ "eval_loss": 1.4239725503202987,
247
+ "eval_mse_loss": 1.21476999183768,
248
+ "flow/cos_sim": 0.581594914061838,
249
+ "flow/improvement_ratio": 0.9676709669093563,
250
+ "flow/mag_ratio_mean": 0.5514429012934366,
251
+ "flow/mag_ratio_std": 0.21821844441705643,
252
+ "step": 8192
253
+ },
254
+ {
255
+ "epoch": 0.37836589533970716,
256
+ "eval_cos_loss": 0.41840511710132094,
257
+ "eval_loss": 1.4239725503202987,
258
+ "eval_mse_loss": 1.21476999183768,
259
+ "eval_runtime": 37.4921,
260
+ "eval_samples_per_second": 746.637,
261
+ "eval_steps_per_second": 11.682,
262
+ "flow/cos_sim": 0.581594914061838,
263
+ "flow/improvement_ratio": 0.9676709669093563,
264
+ "flow/mag_ratio_mean": 0.5514429012934366,
265
+ "flow/mag_ratio_std": 0.21821844441705643,
266
+ "step": 8192
267
+ },
268
+ {
269
+ "epoch": 0.4256616322571706,
270
+ "grad_norm": 0.20388394594192505,
271
+ "learning_rate": 0.0007536520081501641,
272
+ "loss": 1.4808920621871948,
273
+ "step": 9216
274
+ },
275
+ {
276
+ "epoch": 0.4256616322571706,
277
+ "eval_cos_loss": 0.4158792528387618,
278
+ "eval_loss": 1.4131894620586203,
279
+ "eval_mse_loss": 1.20524983863308,
280
+ "flow/cos_sim": 0.584120759408768,
281
+ "flow/improvement_ratio": 0.9670752337261966,
282
+ "flow/mag_ratio_mean": 0.5632989504565932,
283
+ "flow/mag_ratio_std": 0.22649330343013485,
284
+ "step": 9216
285
+ },
286
+ {
287
+ "epoch": 0.4256616322571706,
288
+ "eval_cos_loss": 0.4158792528387618,
289
+ "eval_loss": 1.4131894620586203,
290
+ "eval_mse_loss": 1.20524983863308,
291
+ "eval_runtime": 37.6013,
292
+ "eval_samples_per_second": 744.468,
293
+ "eval_steps_per_second": 11.649,
294
+ "flow/cos_sim": 0.584120759408768,
295
+ "flow/improvement_ratio": 0.9670752337261966,
296
+ "flow/mag_ratio_mean": 0.5632989504565932,
297
+ "flow/mag_ratio_std": 0.22649330343013485,
298
+ "step": 9216
299
+ },
300
+ {
301
+ "epoch": 0.472957369174634,
302
+ "grad_norm": 0.16685892641544342,
303
+ "learning_rate": 0.0006756229549958484,
304
+ "loss": 1.4752637147903442,
305
+ "step": 10240
306
+ },
307
+ {
308
+ "epoch": 0.472957369174634,
309
+ "eval_cos_loss": 0.4138242877917747,
310
+ "eval_loss": 1.409086674043577,
311
+ "eval_mse_loss": 1.2021745326312165,
312
+ "flow/cos_sim": 0.5861757269733028,
313
+ "flow/improvement_ratio": 0.9699954621868047,
314
+ "flow/mag_ratio_mean": 0.5634166458153833,
315
+ "flow/mag_ratio_std": 0.22239429697598498,
316
+ "step": 10240
317
+ },
318
+ {
319
+ "epoch": 0.472957369174634,
320
+ "eval_cos_loss": 0.4138242877917747,
321
+ "eval_loss": 1.409086674043577,
322
+ "eval_mse_loss": 1.2021745326312165,
323
+ "eval_runtime": 37.5678,
324
+ "eval_samples_per_second": 745.133,
325
+ "eval_steps_per_second": 11.659,
326
+ "flow/cos_sim": 0.5861757269733028,
327
+ "flow/improvement_ratio": 0.9699954621868047,
328
+ "flow/mag_ratio_mean": 0.5634166458153833,
329
+ "flow/mag_ratio_std": 0.22239429697598498,
330
+ "step": 10240
331
+ },
332
+ {
333
+ "epoch": 0.5202531060920974,
334
+ "grad_norm": 0.19414471089839935,
335
+ "learning_rate": 0.0005923415848692534,
336
+ "loss": 1.4696978330612183,
337
+ "step": 11264
338
+ },
339
+ {
340
+ "epoch": 0.5202531060920974,
341
+ "eval_cos_loss": 0.4130789393295436,
342
+ "eval_loss": 1.4056184373489797,
343
+ "eval_mse_loss": 1.1990789692151491,
344
+ "flow/cos_sim": 0.5869210932624939,
345
+ "flow/improvement_ratio": 0.9684095639888555,
346
+ "flow/mag_ratio_mean": 0.5640594227128921,
347
+ "flow/mag_ratio_std": 0.22638205625967348,
348
+ "step": 11264
349
+ },
350
+ {
351
+ "epoch": 0.5202531060920974,
352
+ "eval_cos_loss": 0.4130789393295436,
353
+ "eval_loss": 1.4056184373489797,
354
+ "eval_mse_loss": 1.1990789692151491,
355
+ "eval_runtime": 37.4767,
356
+ "eval_samples_per_second": 746.944,
357
+ "eval_steps_per_second": 11.687,
358
+ "flow/cos_sim": 0.5869210932624939,
359
+ "flow/improvement_ratio": 0.9684095639888555,
360
+ "flow/mag_ratio_mean": 0.5640594227128921,
361
+ "flow/mag_ratio_std": 0.22638205625967348,
362
+ "step": 11264
363
+ },
364
+ {
365
+ "epoch": 0.5675488430095608,
366
+ "grad_norm": 0.21493041515350342,
367
+ "learning_rate": 0.000506383115407268,
368
+ "loss": 1.4641783237457275,
369
+ "step": 12288
370
+ },
371
+ {
372
+ "epoch": 0.5675488430095608,
373
+ "eval_cos_loss": 0.4106343852602728,
374
+ "eval_loss": 1.397022104426606,
375
+ "eval_mse_loss": 1.1917049117284277,
376
+ "flow/cos_sim": 0.5893656173253168,
377
+ "flow/improvement_ratio": 0.9678930984512312,
378
+ "flow/mag_ratio_mean": 0.565856612164136,
379
+ "flow/mag_ratio_std": 0.2252416678079187,
380
+ "step": 12288
381
+ },
382
+ {
383
+ "epoch": 0.5675488430095608,
384
+ "eval_cos_loss": 0.4106343852602728,
385
+ "eval_loss": 1.397022104426606,
386
+ "eval_mse_loss": 1.1917049117284277,
387
+ "eval_runtime": 37.5318,
388
+ "eval_samples_per_second": 745.848,
389
+ "eval_steps_per_second": 11.67,
390
+ "flow/cos_sim": 0.5893656173253168,
391
+ "flow/improvement_ratio": 0.9678930984512312,
392
+ "flow/mag_ratio_mean": 0.565856612164136,
393
+ "flow/mag_ratio_std": 0.2252416678079187,
394
+ "step": 12288
395
+ },
396
+ {
397
+ "epoch": 0.6148445799270241,
398
+ "grad_norm": 0.1553519070148468,
399
+ "learning_rate": 0.00042015065633581203,
400
+ "loss": 1.460407018661499,
401
+ "step": 13312
402
+ },
403
+ {
404
+ "epoch": 0.6148445799270241,
405
+ "eval_cos_loss": 0.41024401954047757,
406
+ "eval_loss": 1.3953286480141558,
407
+ "eval_mse_loss": 1.190206638209896,
408
+ "flow/cos_sim": 0.5897559979462732,
409
+ "flow/improvement_ratio": 0.9680638820885523,
410
+ "flow/mag_ratio_mean": 0.5661686862984748,
411
+ "flow/mag_ratio_std": 0.22554758708226627,
412
+ "step": 13312
413
+ },
414
+ {
415
+ "epoch": 0.6148445799270241,
416
+ "eval_cos_loss": 0.41024401954047757,
417
+ "eval_loss": 1.3953286480141558,
418
+ "eval_mse_loss": 1.190206638209896,
419
+ "eval_runtime": 37.5911,
420
+ "eval_samples_per_second": 744.671,
421
+ "eval_steps_per_second": 11.652,
422
+ "flow/cos_sim": 0.5897559979462732,
423
+ "flow/improvement_ratio": 0.9680638820885523,
424
+ "flow/mag_ratio_mean": 0.5661686862984748,
425
+ "flow/mag_ratio_std": 0.22554758708226627,
426
+ "step": 13312
427
+ },
428
+ {
429
+ "epoch": 0.6621403168444876,
430
+ "grad_norm": 0.17843684554100037,
431
+ "learning_rate": 0.0003363861238167604,
432
+ "loss": 1.456861138343811,
433
+ "step": 14336
434
+ },
435
+ {
436
+ "epoch": 0.6621403168444876,
437
+ "eval_cos_loss": 0.4097557538720571,
438
+ "eval_loss": 1.3941423579982426,
439
+ "eval_mse_loss": 1.1892644820147997,
440
+ "flow/cos_sim": 0.5902442677652455,
441
+ "flow/improvement_ratio": 0.9690267289882382,
442
+ "flow/mag_ratio_mean": 0.5675587929002771,
443
+ "flow/mag_ratio_std": 0.22511094835795223,
444
+ "step": 14336
445
+ },
446
+ {
447
+ "epoch": 0.6621403168444876,
448
+ "eval_cos_loss": 0.4097557538720571,
449
+ "eval_loss": 1.3941423579982426,
450
+ "eval_mse_loss": 1.1892644820147997,
451
+ "eval_runtime": 37.725,
452
+ "eval_samples_per_second": 742.027,
453
+ "eval_steps_per_second": 11.61,
454
+ "flow/cos_sim": 0.5902442677652455,
455
+ "flow/improvement_ratio": 0.9690267289882382,
456
+ "flow/mag_ratio_mean": 0.5675587929002771,
457
+ "flow/mag_ratio_std": 0.22511094835795223,
458
+ "step": 14336
459
+ },
460
+ {
461
+ "epoch": 0.709436053761951,
462
+ "grad_norm": 0.16084638237953186,
463
+ "learning_rate": 0.0002574312924773668,
464
+ "loss": 1.4563266038894653,
465
+ "step": 15360
466
+ },
467
+ {
468
+ "epoch": 0.709436053761951,
469
+ "eval_cos_loss": 0.4082320715602674,
470
+ "eval_loss": 1.3902559054496626,
471
+ "eval_mse_loss": 1.1861398674581694,
472
+ "flow/cos_sim": 0.5917679436130611,
473
+ "flow/improvement_ratio": 0.9698736070225772,
474
+ "flow/mag_ratio_mean": 0.5668952558955102,
475
+ "flow/mag_ratio_std": 0.22523612784196254,
476
+ "step": 15360
477
+ },
478
+ {
479
+ "epoch": 0.709436053761951,
480
+ "eval_cos_loss": 0.4082320715602674,
481
+ "eval_loss": 1.3902559054496626,
482
+ "eval_mse_loss": 1.1861398674581694,
483
+ "eval_runtime": 37.6782,
484
+ "eval_samples_per_second": 742.949,
485
+ "eval_steps_per_second": 11.625,
486
+ "flow/cos_sim": 0.5917679436130611,
487
+ "flow/improvement_ratio": 0.9698736070225772,
488
+ "flow/mag_ratio_mean": 0.5668952558955102,
489
+ "flow/mag_ratio_std": 0.22523612784196254,
490
+ "step": 15360
491
+ },
492
+ {
493
+ "epoch": 0.7567317906794143,
494
+ "grad_norm": 0.1939423829317093,
495
+ "learning_rate": 0.0001857309099627023,
496
+ "loss": 1.4537469148635864,
497
+ "step": 16384
498
+ },
499
+ {
500
+ "epoch": 0.7567317906794143,
501
+ "eval_cos_loss": 0.4083600237489291,
502
+ "eval_loss": 1.3896791839708476,
503
+ "eval_mse_loss": 1.18549917087163,
504
+ "flow/cos_sim": 0.5916399929893615,
505
+ "flow/improvement_ratio": 0.9679969251155853,
506
+ "flow/mag_ratio_mean": 0.5672158710488445,
507
+ "flow/mag_ratio_std": 0.2283102631841076,
508
+ "step": 16384
509
+ },
510
+ {
511
+ "epoch": 0.7567317906794143,
512
+ "eval_cos_loss": 0.4083600237489291,
513
+ "eval_loss": 1.3896791839708476,
514
+ "eval_mse_loss": 1.18549917087163,
515
+ "eval_runtime": 37.6784,
516
+ "eval_samples_per_second": 742.946,
517
+ "eval_steps_per_second": 11.625,
518
+ "flow/cos_sim": 0.5916399929893615,
519
+ "flow/improvement_ratio": 0.9679969251155853,
520
+ "flow/mag_ratio_mean": 0.5672158710488445,
521
+ "flow/mag_ratio_std": 0.2283102631841076,
522
+ "step": 16384
523
+ },
524
+ {
525
+ "epoch": 0.8040275275968778,
526
+ "grad_norm": 0.16456730663776398,
527
+ "learning_rate": 0.0001234293037953827,
528
+ "loss": 1.4506418704986572,
529
+ "step": 17408
530
+ },
531
+ {
532
+ "epoch": 0.8040275275968778,
533
+ "eval_cos_loss": 0.4077592275730551,
534
+ "eval_loss": 1.3878156499231242,
535
+ "eval_mse_loss": 1.1839360382458934,
536
+ "flow/cos_sim": 0.5922407781424588,
537
+ "flow/improvement_ratio": 0.9692025500345448,
538
+ "flow/mag_ratio_mean": 0.5673814232218756,
539
+ "flow/mag_ratio_std": 0.22620373494821053,
540
+ "step": 17408
541
+ },
542
+ {
543
+ "epoch": 0.8040275275968778,
544
+ "eval_cos_loss": 0.4077592275730551,
545
+ "eval_loss": 1.3878156499231242,
546
+ "eval_mse_loss": 1.1839360382458934,
547
+ "eval_runtime": 37.7046,
548
+ "eval_samples_per_second": 742.43,
549
+ "eval_steps_per_second": 11.617,
550
+ "flow/cos_sim": 0.5922407781424588,
551
+ "flow/improvement_ratio": 0.9692025500345448,
552
+ "flow/mag_ratio_mean": 0.5673814232218756,
553
+ "flow/mag_ratio_std": 0.22620373494821053,
554
+ "step": 17408
555
+ },
556
+ {
557
+ "epoch": 0.8513232645143411,
558
+ "grad_norm": 0.16574722528457642,
559
+ "learning_rate": 7.243353809647096e-05,
560
+ "loss": 1.450889229774475,
561
+ "step": 18432
562
+ },
563
+ {
564
+ "epoch": 0.8513232645143411,
565
+ "eval_cos_loss": 0.4075628997392306,
566
+ "eval_loss": 1.389668641025073,
567
+ "eval_mse_loss": 1.185887193026608,
568
+ "flow/cos_sim": 0.5924371161145162,
569
+ "flow/improvement_ratio": 0.9703709175597587,
570
+ "flow/mag_ratio_mean": 0.5677012568467283,
571
+ "flow/mag_ratio_std": 0.22380314015498445,
572
+ "step": 18432
573
+ },
574
+ {
575
+ "epoch": 0.8513232645143411,
576
+ "eval_cos_loss": 0.4075628997392306,
577
+ "eval_loss": 1.389668641025073,
578
+ "eval_mse_loss": 1.185887193026608,
579
+ "eval_runtime": 37.4946,
580
+ "eval_samples_per_second": 746.588,
581
+ "eval_steps_per_second": 11.682,
582
+ "flow/cos_sim": 0.5924371161145162,
583
+ "flow/improvement_ratio": 0.9703709175597587,
584
+ "flow/mag_ratio_mean": 0.5677012568467283,
585
+ "flow/mag_ratio_std": 0.22380314015498445,
586
+ "step": 18432
587
+ },
588
+ {
589
+ "epoch": 0.8986190014318045,
590
+ "grad_norm": 0.1395236700773239,
591
+ "learning_rate": 3.416928297501892e-05,
592
+ "loss": 1.4504221677780151,
593
+ "step": 19456
594
+ },
595
+ {
596
+ "epoch": 0.8986190014318045,
597
+ "eval_cos_loss": 0.4083901912518288,
598
+ "eval_loss": 1.390020596926615,
599
+ "eval_mse_loss": 1.185825499497592,
600
+ "flow/cos_sim": 0.5916098165729818,
601
+ "flow/improvement_ratio": 0.9690576137200883,
602
+ "flow/mag_ratio_mean": 0.5676344796402814,
603
+ "flow/mag_ratio_std": 0.22553705900363183,
604
+ "step": 19456
605
+ },
606
+ {
607
+ "epoch": 0.8986190014318045,
608
+ "eval_cos_loss": 0.4083901912518288,
609
+ "eval_loss": 1.390020596926615,
610
+ "eval_mse_loss": 1.185825499497592,
611
+ "eval_runtime": 37.3515,
612
+ "eval_samples_per_second": 749.448,
613
+ "eval_steps_per_second": 11.726,
614
+ "flow/cos_sim": 0.5916098165729818,
615
+ "flow/improvement_ratio": 0.9690576137200883,
616
+ "flow/mag_ratio_mean": 0.5676344796402814,
617
+ "flow/mag_ratio_std": 0.22553705900363183,
618
+ "step": 19456
619
+ },
620
+ {
621
+ "epoch": 0.945914738349268,
622
+ "grad_norm": 0.1532638967037201,
623
+ "learning_rate": 9.853219257125512e-06,
624
+ "loss": 1.4503555297851562,
625
+ "step": 20480
626
+ },
627
+ {
628
+ "epoch": 0.945914738349268,
629
+ "eval_cos_loss": 0.4074147270830799,
630
+ "eval_loss": 1.3868377687724214,
631
+ "eval_mse_loss": 1.1831304028153964,
632
+ "flow/cos_sim": 0.5925852951665992,
633
+ "flow/improvement_ratio": 0.9679995860955487,
634
+ "flow/mag_ratio_mean": 0.5671717142133408,
635
+ "flow/mag_ratio_std": 0.2263773354833529,
636
+ "step": 20480
637
+ },
638
+ {
639
+ "epoch": 0.945914738349268,
640
+ "eval_cos_loss": 0.4074147270830799,
641
+ "eval_loss": 1.3868377687724214,
642
+ "eval_mse_loss": 1.1831304028153964,
643
+ "eval_runtime": 37.367,
644
+ "eval_samples_per_second": 749.138,
645
+ "eval_steps_per_second": 11.722,
646
+ "flow/cos_sim": 0.5925852951665992,
647
+ "flow/improvement_ratio": 0.9679995860955487,
648
+ "flow/mag_ratio_mean": 0.5671717142133408,
649
+ "flow/mag_ratio_std": 0.2263773354833529,
650
+ "step": 20480
651
+ },
652
+ {
653
+ "epoch": 0.9932104752667313,
654
+ "grad_norm": 0.11826281249523163,
655
+ "learning_rate": 1.651423088821491e-07,
656
+ "loss": 1.450731635093689,
657
+ "step": 21504
658
+ },
659
+ {
660
+ "epoch": 0.9932104752667313,
661
+ "eval_cos_loss": 0.4083047036983107,
662
+ "eval_loss": 1.3886951202671278,
663
+ "eval_mse_loss": 1.184542767805596,
664
+ "flow/cos_sim": 0.5916953210689161,
665
+ "flow/improvement_ratio": 0.9686407382901945,
666
+ "flow/mag_ratio_mean": 0.5667071160116152,
667
+ "flow/mag_ratio_std": 0.22649903095340077,
668
+ "step": 21504
669
+ },
670
+ {
671
+ "epoch": 0.9932104752667313,
672
+ "eval_cos_loss": 0.4083047036983107,
673
+ "eval_loss": 1.3886951202671278,
674
+ "eval_mse_loss": 1.184542767805596,
675
+ "eval_runtime": 37.4955,
676
+ "eval_samples_per_second": 746.569,
677
+ "eval_steps_per_second": 11.681,
678
+ "flow/cos_sim": 0.5916953210689161,
679
+ "flow/improvement_ratio": 0.9686407382901945,
680
+ "flow/mag_ratio_mean": 0.5667071160116152,
681
+ "flow/mag_ratio_std": 0.22649903095340077,
682
+ "step": 21504
683
+ }
684
+ ],
685
+ "logging_steps": 1024,
686
+ "max_steps": 21651,
687
+ "num_input_tokens_seen": 0,
688
+ "num_train_epochs": 1,
689
+ "save_steps": 1024,
690
+ "stateful_callbacks": {
691
+ "TrainerControl": {
692
+ "args": {
693
+ "should_epoch_stop": false,
694
+ "should_evaluate": false,
695
+ "should_log": false,
696
+ "should_save": true,
697
+ "should_training_stop": true
698
+ },
699
+ "attributes": {}
700
+ }
701
+ },
702
+ "total_flos": 0.0,
703
+ "train_batch_size": 64,
704
+ "trial_name": null,
705
+ "trial_params": null
706
+ }
checkpoints-v2.8-h-2/checkpoint-21651/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd8764e705e3d3a01283602f7bbb7db8de6272269197e8d0fb5f615cd86459fe
3
+ size 5137