Attila1011 commited on
Commit
de58ebb
·
verified ·
1 Parent(s): b7c586c

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -49,3 +49,4 @@ checkpoints-v5.8/checkpoint-10240/eval_state.json filter=lfs diff=lfs merge=lfs
49
  checkpoints-v4.5/checkpoint-7168/eval_state.json filter=lfs diff=lfs merge=lfs -text
50
  checkpoints-v4.5/checkpoint-15360/eval_state.json filter=lfs diff=lfs merge=lfs -text
51
  checkpoints-v5.9/checkpoint-11264/eval_state.json filter=lfs diff=lfs merge=lfs -text
 
 
49
  checkpoints-v4.5/checkpoint-7168/eval_state.json filter=lfs diff=lfs merge=lfs -text
50
  checkpoints-v4.5/checkpoint-15360/eval_state.json filter=lfs diff=lfs merge=lfs -text
51
  checkpoints-v5.9/checkpoint-11264/eval_state.json filter=lfs diff=lfs merge=lfs -text
52
+ checkpoints-v5.10/checkpoint-7168/eval_state.json filter=lfs diff=lfs merge=lfs -text
checkpoints-v5.10/checkpoint-7168/ema.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:859b524c6f08cb8e4de0cdeac3d1078754a838adbe2bb2b52253d00d572d8949
3
+ size 55150648
checkpoints-v5.10/checkpoint-7168/eval_state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6172bae3b0d393a89c588da917a75b35b245d09483a6f2e1d40162c71c7a1b08
3
+ size 58881754
checkpoints-v5.10/checkpoint-7168/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d73ce65202c4a4403f56015696a3586e66b41a2df367b58ac4eab6399e1b7796
3
+ size 55150680
checkpoints-v5.10/checkpoint-7168/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0191582a7b4e76bacb808a998eca18354df820c14390b9ea35ba5b5a749a826b
3
+ size 77724619
checkpoints-v5.10/checkpoint-7168/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:179c71fcead7f0f1a71ca3f32ded1de0ad5b7d83affaf2bafaa286d4b4ab393f
3
+ size 14645
checkpoints-v5.10/checkpoint-7168/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d595dc04756955b718dbe40e12e3b42e9a74ec09bbdeec39a22714665de3cd13
3
+ size 1383
checkpoints-v5.10/checkpoint-7168/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59ae6fdfd0cae3f8c7c6f88ba50d2b929ba08e8f73177205bb5416fc134c7f7b
3
+ size 1465
checkpoints-v5.10/checkpoint-7168/trainer_state.json ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.30891891309500724,
6
+ "eval_steps": 1024,
7
+ "global_step": 7168,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.04413127329928675,
14
+ "grad_norm": 0.7938550114631653,
15
+ "learning_rate": 9.990234375e-05,
16
+ "loss": 12.336601257324219,
17
+ "step": 1024
18
+ },
19
+ {
20
+ "epoch": 0.04413127329928675,
21
+ "eval_bleu": 0.16407143517604253,
22
+ "eval_ce_clean_loss": 1.724545789679993,
23
+ "eval_ce_pred_loss": 5.058502770690267,
24
+ "eval_flow_mse_loss": 0.929569506823127,
25
+ "eval_loss": 6.195067207442164,
26
+ "flow/cos_sim": 0.055166260543853235,
27
+ "flow/improvement_ratio": 0.596057767425773,
28
+ "flow/mag_ratio_mean": 0.4713213621680416,
29
+ "flow/mag_ratio_std": 0.4101032533371118,
30
+ "step": 1024
31
+ },
32
+ {
33
+ "epoch": 0.04413127329928675,
34
+ "eval_bleu": 0.16407143517604253,
35
+ "eval_ce_clean_loss": 1.724545789679993,
36
+ "eval_ce_pred_loss": 5.058502770690267,
37
+ "eval_flow_mse_loss": 0.929569506823127,
38
+ "eval_loss": 6.195067207442164,
39
+ "eval_runtime": 212.5631,
40
+ "eval_samples_per_second": 141.135,
41
+ "eval_steps_per_second": 2.206,
42
+ "flow/cos_sim": 0.055166260543853235,
43
+ "flow/improvement_ratio": 0.596057767425773,
44
+ "flow/mag_ratio_mean": 0.4713213621680416,
45
+ "flow/mag_ratio_std": 0.4101032533371118,
46
+ "step": 1024
47
+ },
48
+ {
49
+ "epoch": 0.0882625465985735,
50
+ "grad_norm": 0.450653612613678,
51
+ "learning_rate": 9.9476028157316e-05,
52
+ "loss": 4.612443923950195,
53
+ "step": 2048
54
+ },
55
+ {
56
+ "epoch": 0.0882625465985735,
57
+ "eval_bleu": 0.3321661463154056,
58
+ "eval_ce_clean_loss": 0.2987517699885216,
59
+ "eval_ce_pred_loss": 3.625808963389285,
60
+ "eval_flow_mse_loss": 1.015768651387839,
61
+ "eval_loss": 3.8525866531868225,
62
+ "flow/cos_sim": 0.1616891078921015,
63
+ "flow/improvement_ratio": 0.7423238582702588,
64
+ "flow/mag_ratio_mean": 0.6468003783017587,
65
+ "flow/mag_ratio_std": 0.5354506893834071,
66
+ "step": 2048
67
+ },
68
+ {
69
+ "epoch": 0.0882625465985735,
70
+ "eval_bleu": 0.3321661463154056,
71
+ "eval_ce_clean_loss": 0.2987517699885216,
72
+ "eval_ce_pred_loss": 3.625808963389285,
73
+ "eval_flow_mse_loss": 1.015768651387839,
74
+ "eval_loss": 3.8525866531868225,
75
+ "eval_runtime": 208.1773,
76
+ "eval_samples_per_second": 144.108,
77
+ "eval_steps_per_second": 2.253,
78
+ "flow/cos_sim": 0.1616891078921015,
79
+ "flow/improvement_ratio": 0.7423238582702588,
80
+ "flow/mag_ratio_mean": 0.6468003783017587,
81
+ "flow/mag_ratio_std": 0.5354506893834071,
82
+ "step": 2048
83
+ },
84
+ {
85
+ "epoch": 0.13239381989786023,
86
+ "grad_norm": 0.37855076789855957,
87
+ "learning_rate": 9.791307026072513e-05,
88
+ "loss": 3.617213010787964,
89
+ "step": 3072
90
+ },
91
+ {
92
+ "epoch": 0.13239381989786023,
93
+ "eval_bleu": 0.3836053070830755,
94
+ "eval_ce_clean_loss": 0.12278842976860908,
95
+ "eval_ce_pred_loss": 3.1099086214484437,
96
+ "eval_flow_mse_loss": 1.0722369169121357,
97
+ "eval_loss": 3.3719613491090885,
98
+ "flow/cos_sim": 0.22163581698815196,
99
+ "flow/improvement_ratio": 0.8109787233602772,
100
+ "flow/mag_ratio_mean": 0.6704668670829171,
101
+ "flow/mag_ratio_std": 0.4992133626805694,
102
+ "step": 3072
103
+ },
104
+ {
105
+ "epoch": 0.13239381989786023,
106
+ "eval_bleu": 0.3836053070830755,
107
+ "eval_ce_clean_loss": 0.12278842976860908,
108
+ "eval_ce_pred_loss": 3.1099086214484437,
109
+ "eval_flow_mse_loss": 1.0722369169121357,
110
+ "eval_loss": 3.3719613491090885,
111
+ "eval_runtime": 209.1152,
112
+ "eval_samples_per_second": 143.462,
113
+ "eval_steps_per_second": 2.243,
114
+ "flow/cos_sim": 0.22163581698815196,
115
+ "flow/improvement_ratio": 0.8109787233602772,
116
+ "flow/mag_ratio_mean": 0.6704668670829171,
117
+ "flow/mag_ratio_std": 0.4992133626805694,
118
+ "step": 3072
119
+ },
120
+ {
121
+ "epoch": 0.176525093197147,
122
+ "grad_norm": 0.4389702081680298,
123
+ "learning_rate": 9.53439476074686e-05,
124
+ "loss": 3.2931454181671143,
125
+ "step": 4096
126
+ },
127
+ {
128
+ "epoch": 0.176525093197147,
129
+ "eval_bleu": 0.40933125434293716,
130
+ "eval_ce_clean_loss": 0.06711880822997612,
131
+ "eval_ce_pred_loss": 2.8558058159183592,
132
+ "eval_flow_mse_loss": 1.081392692604553,
133
+ "eval_loss": 3.147575543125047,
134
+ "flow/cos_sim": 0.25172679529769587,
135
+ "flow/improvement_ratio": 0.8514864339248966,
136
+ "flow/mag_ratio_mean": 0.6699917584594125,
137
+ "flow/mag_ratio_std": 0.44682612055654464,
138
+ "step": 4096
139
+ },
140
+ {
141
+ "epoch": 0.176525093197147,
142
+ "eval_bleu": 0.40933125434293716,
143
+ "eval_ce_clean_loss": 0.06711880822997612,
144
+ "eval_ce_pred_loss": 2.8558058159183592,
145
+ "eval_flow_mse_loss": 1.081392692604553,
146
+ "eval_loss": 3.147575543125047,
147
+ "eval_runtime": 210.459,
148
+ "eval_samples_per_second": 142.546,
149
+ "eval_steps_per_second": 2.228,
150
+ "flow/cos_sim": 0.25172679529769587,
151
+ "flow/improvement_ratio": 0.8514864339248966,
152
+ "flow/mag_ratio_mean": 0.6699917584594125,
153
+ "flow/mag_ratio_std": 0.44682612055654464,
154
+ "step": 4096
155
+ },
156
+ {
157
+ "epoch": 0.22065636649643372,
158
+ "grad_norm": 0.44552379846572876,
159
+ "learning_rate": 9.182261125213742e-05,
160
+ "loss": 3.127476692199707,
161
+ "step": 5120
162
+ },
163
+ {
164
+ "epoch": 0.22065636649643372,
165
+ "eval_bleu": 0.42390360625279244,
166
+ "eval_ce_clean_loss": 0.04179192618377554,
167
+ "eval_ce_pred_loss": 2.713604238495898,
168
+ "eval_flow_mse_loss": 1.0599637749606867,
169
+ "eval_loss": 3.00127863375617,
170
+ "flow/cos_sim": 0.25444072618413327,
171
+ "flow/improvement_ratio": 0.8746775842424649,
172
+ "flow/mag_ratio_mean": 0.671740317395501,
173
+ "flow/mag_ratio_std": 0.40047251164659,
174
+ "step": 5120
175
+ },
176
+ {
177
+ "epoch": 0.22065636649643372,
178
+ "eval_bleu": 0.42390360625279244,
179
+ "eval_ce_clean_loss": 0.04179192618377554,
180
+ "eval_ce_pred_loss": 2.713604238495898,
181
+ "eval_flow_mse_loss": 1.0599637749606867,
182
+ "eval_loss": 3.00127863375617,
183
+ "eval_runtime": 211.2198,
184
+ "eval_samples_per_second": 142.032,
185
+ "eval_steps_per_second": 2.22,
186
+ "flow/cos_sim": 0.25444072618413327,
187
+ "flow/improvement_ratio": 0.8746775842424649,
188
+ "flow/mag_ratio_mean": 0.671740317395501,
189
+ "flow/mag_ratio_std": 0.40047251164659,
190
+ "step": 5120
191
+ },
192
+ {
193
+ "epoch": 0.26478763979572045,
194
+ "grad_norm": 0.4731499254703522,
195
+ "learning_rate": 8.742300854391668e-05,
196
+ "loss": 3.012479782104492,
197
+ "step": 6144
198
+ },
199
+ {
200
+ "epoch": 0.26478763979572045,
201
+ "eval_bleu": 0.4346563578061454,
202
+ "eval_ce_clean_loss": 0.028318230050808586,
203
+ "eval_ce_pred_loss": 2.6327081789085858,
204
+ "eval_flow_mse_loss": 1.056245713854141,
205
+ "eval_loss": 2.927459636985112,
206
+ "flow/cos_sim": 0.2519478304808074,
207
+ "flow/improvement_ratio": 0.8869821624969368,
208
+ "flow/mag_ratio_mean": 0.6784741002867725,
209
+ "flow/mag_ratio_std": 0.37580890851869764,
210
+ "step": 6144
211
+ },
212
+ {
213
+ "epoch": 0.26478763979572045,
214
+ "eval_bleu": 0.4346563578061454,
215
+ "eval_ce_clean_loss": 0.028318230050808586,
216
+ "eval_ce_pred_loss": 2.6327081789085858,
217
+ "eval_flow_mse_loss": 1.056245713854141,
218
+ "eval_loss": 2.927459636985112,
219
+ "eval_runtime": 211.7504,
220
+ "eval_samples_per_second": 141.676,
221
+ "eval_steps_per_second": 2.215,
222
+ "flow/cos_sim": 0.2519478304808074,
223
+ "flow/improvement_ratio": 0.8869821624969368,
224
+ "flow/mag_ratio_mean": 0.6784741002867725,
225
+ "flow/mag_ratio_std": 0.37580890851869764,
226
+ "step": 6144
227
+ },
228
+ {
229
+ "epoch": 0.30891891309500724,
230
+ "grad_norm": 0.6240633726119995,
231
+ "learning_rate": 8.223753024725232e-05,
232
+ "loss": 2.9197511672973633,
233
+ "step": 7168
234
+ },
235
+ {
236
+ "epoch": 0.30891891309500724,
237
+ "eval_bleu": 0.44790457353944046,
238
+ "eval_ce_clean_loss": 0.020565879328656934,
239
+ "eval_ce_pred_loss": 2.529554044768246,
240
+ "eval_flow_mse_loss": 1.036346342009522,
241
+ "eval_loss": 2.8276000312650638,
242
+ "flow/cos_sim": 0.24807281547517918,
243
+ "flow/improvement_ratio": 0.8986854375298343,
244
+ "flow/mag_ratio_mean": 0.6740282399059613,
245
+ "flow/mag_ratio_std": 0.3316722640287139,
246
+ "step": 7168
247
+ },
248
+ {
249
+ "epoch": 0.30891891309500724,
250
+ "eval_bleu": 0.44790457353944046,
251
+ "eval_ce_clean_loss": 0.020565879328656934,
252
+ "eval_ce_pred_loss": 2.529554044768246,
253
+ "eval_flow_mse_loss": 1.036346342009522,
254
+ "eval_loss": 2.8276000312650638,
255
+ "eval_runtime": 211.5611,
256
+ "eval_samples_per_second": 141.803,
257
+ "eval_steps_per_second": 2.217,
258
+ "flow/cos_sim": 0.24807281547517918,
259
+ "flow/improvement_ratio": 0.8986854375298343,
260
+ "flow/mag_ratio_mean": 0.6740282399059613,
261
+ "flow/mag_ratio_std": 0.3316722640287139,
262
+ "step": 7168
263
+ }
264
+ ],
265
+ "logging_steps": 1024,
266
+ "max_steps": 23204,
267
+ "num_input_tokens_seen": 0,
268
+ "num_train_epochs": 1,
269
+ "save_steps": 1024,
270
+ "stateful_callbacks": {
271
+ "TrainerControl": {
272
+ "args": {
273
+ "should_epoch_stop": false,
274
+ "should_evaluate": false,
275
+ "should_log": false,
276
+ "should_save": true,
277
+ "should_training_stop": false
278
+ },
279
+ "attributes": {}
280
+ }
281
+ },
282
+ "total_flos": 0.0,
283
+ "train_batch_size": 64,
284
+ "trial_name": null,
285
+ "trial_params": null
286
+ }
checkpoints-v5.10/checkpoint-7168/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8469bbc81a2ba0be2f5b44007faafd15c75615abe30f4f4e56171816d31caa5b
3
+ size 5137