nvan15 commited on
Commit
1360496
·
verified ·
1 Parent(s): ba22426

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. reproduction/env.yaml +63 -0
  2. reproduction/glue_exp/cola/dr0.0,mlr9e-03,clr9e-03,ep=34.0t=21d10h53m36/CoLA.tsv +1064 -0
  3. reproduction/glue_exp/cola/dr0.0,mlr9e-03,clr9e-03,ep=34.0t=21d10h53m36/all_results.json +9 -0
  4. reproduction/glue_exp/cola/dr0.0,mlr9e-03,clr9e-03,ep=34.0t=21d10h53m36/eval_results.json +9 -0
  5. reproduction/glue_exp/cola/dr0.0,mlr9e-03,clr9e-03,ep=34.0t=21d10h53m36/ft/added_tokens.json +3 -0
  6. reproduction/glue_exp/cola/dr0.0,mlr9e-03,clr9e-03,ep=34.0t=21d10h53m36/ft/special_tokens_map.json +15 -0
  7. reproduction/glue_exp/cola/dr0.0,mlr9e-03,clr9e-03,ep=34.0t=21d10h53m36/ft/tokenizer.json +0 -0
  8. reproduction/glue_exp/cola/dr0.0,mlr9e-03,clr9e-03,ep=34.0t=21d10h53m36/ft/tokenizer_config.json +60 -0
  9. reproduction/glue_exp/cola/dr0.0,mlr9e-03,clr9e-03,ep=34.0t=21d10h53m36/ft2/README.md +205 -0
  10. reproduction/glue_exp/cola/dr0.0,mlr9e-03,clr9e-03,ep=34.0t=21d10h53m36/ft2/adapter_config.json +30 -0
  11. reproduction/glue_exp/cola/dr0.0,mlr9e-03,clr9e-03,ep=34.0t=21d10h53m36/trainer_state.json +1499 -0
  12. reproduction/glue_exp/mnli/dr0.0,mlr1e-02,clr1e-02,ep=8.0t=20d22h22m35/MNLI-m.tsv +0 -0
  13. reproduction/glue_exp/mnli/dr0.0,mlr1e-02,clr1e-02,ep=8.0t=20d22h22m35/MNLI-mm.tsv +0 -0
  14. reproduction/glue_exp/mnli/dr0.0,mlr1e-02,clr1e-02,ep=8.0t=20d22h22m35/all_results.json +16 -0
  15. reproduction/glue_exp/mnli/dr0.0,mlr1e-02,clr1e-02,ep=8.0t=20d22h22m35/eval_results.json +16 -0
  16. reproduction/glue_exp/mnli/dr0.0,mlr1e-02,clr1e-02,ep=8.0t=20d22h22m35/ft/added_tokens.json +3 -0
  17. reproduction/glue_exp/mnli/dr0.0,mlr1e-02,clr1e-02,ep=8.0t=20d22h22m35/ft/special_tokens_map.json +15 -0
  18. reproduction/glue_exp/mnli/dr0.0,mlr1e-02,clr1e-02,ep=8.0t=20d22h22m35/ft/tokenizer.json +0 -0
  19. reproduction/glue_exp/mnli/dr0.0,mlr1e-02,clr1e-02,ep=8.0t=20d22h22m35/ft/tokenizer_config.json +60 -0
  20. reproduction/glue_exp/mnli/dr0.0,mlr1e-02,clr1e-02,ep=8.0t=20d22h22m35/ft2/README.md +205 -0
  21. reproduction/glue_exp/mnli/dr0.0,mlr1e-02,clr1e-02,ep=8.0t=20d22h22m35/ft2/adapter_config.json +30 -0
  22. reproduction/glue_exp/mnli/dr0.0,mlr1e-02,clr1e-02,ep=8.0t=20d22h22m35/trainer_state.json +1611 -0
  23. reproduction/glue_exp/mrpc/dr0.0,mlr6e-03,clr6e-03,ep=60.0t=22d12h19m17/MRPC.tsv +1726 -0
  24. reproduction/glue_exp/mrpc/dr0.0,mlr6e-03,clr6e-03,ep=60.0t=22d12h19m17/all_results.json +11 -0
  25. reproduction/glue_exp/mrpc/dr0.0,mlr6e-03,clr6e-03,ep=60.0t=22d12h19m17/eval_results.json +11 -0
  26. reproduction/glue_exp/mrpc/dr0.0,mlr6e-03,clr6e-03,ep=60.0t=22d12h19m17/ft/added_tokens.json +3 -0
  27. reproduction/glue_exp/mrpc/dr0.0,mlr6e-03,clr6e-03,ep=60.0t=22d12h19m17/ft/special_tokens_map.json +15 -0
  28. reproduction/glue_exp/mrpc/dr0.0,mlr6e-03,clr6e-03,ep=60.0t=22d12h19m17/ft/tokenizer.json +0 -0
  29. reproduction/glue_exp/mrpc/dr0.0,mlr6e-03,clr6e-03,ep=60.0t=22d12h19m17/ft/tokenizer_config.json +60 -0
  30. reproduction/glue_exp/mrpc/dr0.0,mlr6e-03,clr6e-03,ep=60.0t=22d12h19m17/ft2/README.md +205 -0
  31. reproduction/glue_exp/mrpc/dr0.0,mlr6e-03,clr6e-03,ep=60.0t=22d12h19m17/ft2/adapter_config.json +30 -0
  32. reproduction/glue_exp/mrpc/dr0.0,mlr6e-03,clr6e-03,ep=60.0t=22d12h19m17/trainer_state.json +1285 -0
  33. reproduction/glue_exp/qnli/dr0.0,mlr1e-02,clr1e-02,ep=12.0t=20d22h35m39/QNLI.tsv +0 -0
  34. reproduction/glue_exp/qnli/dr0.0,mlr1e-02,clr1e-02,ep=12.0t=20d22h35m39/all_results.json +9 -0
  35. reproduction/glue_exp/qnli/dr0.0,mlr1e-02,clr1e-02,ep=12.0t=20d22h35m39/eval_results.json +9 -0
  36. reproduction/glue_exp/qnli/dr0.0,mlr1e-02,clr1e-02,ep=12.0t=20d22h35m39/ft/added_tokens.json +3 -0
  37. reproduction/glue_exp/qnli/dr0.0,mlr1e-02,clr1e-02,ep=12.0t=20d22h35m39/ft/special_tokens_map.json +15 -0
  38. reproduction/glue_exp/qnli/dr0.0,mlr1e-02,clr1e-02,ep=12.0t=20d22h35m39/ft/tokenizer.json +0 -0
  39. reproduction/glue_exp/qnli/dr0.0,mlr1e-02,clr1e-02,ep=12.0t=20d22h35m39/ft/tokenizer_config.json +60 -0
  40. reproduction/glue_exp/qnli/dr0.0,mlr1e-02,clr1e-02,ep=12.0t=20d22h35m39/ft2/README.md +205 -0
  41. reproduction/glue_exp/qnli/dr0.0,mlr1e-02,clr1e-02,ep=12.0t=20d22h35m39/ft2/adapter_config.json +30 -0
  42. reproduction/glue_exp/qnli/dr0.0,mlr1e-02,clr1e-02,ep=12.0t=20d22h35m39/trainer_state.json +1291 -0
  43. reproduction/glue_exp/qqp/dr0.0,mlr8e-03,clr8e-03,ep=12.0t=21d03h04m16/QQP.tsv +0 -0
  44. reproduction/glue_exp/qqp/dr0.0,mlr8e-03,clr8e-03,ep=12.0t=21d03h04m16/all_results.json +11 -0
  45. reproduction/glue_exp/qqp/dr0.0,mlr8e-03,clr8e-03,ep=12.0t=21d03h04m16/eval_results.json +11 -0
  46. reproduction/glue_exp/qqp/dr0.0,mlr8e-03,clr8e-03,ep=12.0t=21d03h04m16/ft/added_tokens.json +3 -0
  47. reproduction/glue_exp/qqp/dr0.0,mlr8e-03,clr8e-03,ep=12.0t=21d03h04m16/ft/special_tokens_map.json +15 -0
  48. reproduction/glue_exp/qqp/dr0.0,mlr8e-03,clr8e-03,ep=12.0t=21d03h04m16/ft/tokenizer.json +0 -0
  49. reproduction/glue_exp/qqp/dr0.0,mlr8e-03,clr8e-03,ep=12.0t=21d03h04m16/ft/tokenizer_config.json +60 -0
  50. reproduction/glue_exp/qqp/dr0.0,mlr8e-03,clr8e-03,ep=12.0t=21d03h04m16/ft2/README.md +205 -0
reproduction/env.yaml ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # environment.yml
2
+ name: an_rep # The name of the environment
3
+
4
+ channels: # The conda channels to search for packages
5
+ # - pytorch
6
+ - conda-forge
7
+ # - dnachun
8
+ # - anaconda
9
+ channel_priority: strict
10
+
11
+ dependencies:
12
+ # Packages to install with conda
13
+ # - python=3.11.3
14
+ #- pytorch-cuda=12.4
15
+ # - pytorch >= 2.6
16
+ # - numpy
17
+
18
+ # - tensorboard
19
+ # - omegaconf
20
+
21
+
22
+ - wandb
23
+
24
+ - scipy
25
+ - pandas
26
+ - matplotlib
27
+ - scikit-image
28
+ - scikit-learn
29
+ - joblib
30
+ - pillow
31
+ ## NO - huggingface_hub
32
+ - tqdm
33
+ - nltk
34
+ # - future
35
+
36
+
37
+ # - defusedxml
38
+ # - ipdb
39
+ # - torchinfo
40
+
41
+
42
+
43
+ # - timm
44
+ # - graphviz #anaconda::graphviz
45
+ # - dnachun::torchviz
46
+ - pip:
47
+ - --index-url https://download.pytorch.org/whl/cu126
48
+ - torch
49
+ - torchvision
50
+ - torchaudio
51
+ - --index-url https://pypi.org/simple
52
+ - transformers>=4.55
53
+ - accelerate
54
+
55
+ - einops
56
+ - jaxtyping
57
+ - peft
58
+ - datasets
59
+ # - fraction
60
+ - draccus
61
+ - vllm
62
+ - evaluate
63
+
reproduction/glue_exp/cola/dr0.0,mlr9e-03,clr9e-03,ep=34.0t=21d10h53m36/CoLA.tsv ADDED
@@ -0,0 +1,1064 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ index prediction
2
+ 0 1
3
+ 1 1
4
+ 2 1
5
+ 3 1
6
+ 4 1
7
+ 5 1
8
+ 6 1
9
+ 7 1
10
+ 8 1
11
+ 9 1
12
+ 10 1
13
+ 11 1
14
+ 12 1
15
+ 13 1
16
+ 14 1
17
+ 15 1
18
+ 16 1
19
+ 17 1
20
+ 18 1
21
+ 19 1
22
+ 20 1
23
+ 21 1
24
+ 22 1
25
+ 23 1
26
+ 24 1
27
+ 25 1
28
+ 26 1
29
+ 27 1
30
+ 28 1
31
+ 29 1
32
+ 30 1
33
+ 31 1
34
+ 32 1
35
+ 33 1
36
+ 34 1
37
+ 35 1
38
+ 36 1
39
+ 37 1
40
+ 38 1
41
+ 39 1
42
+ 40 1
43
+ 41 1
44
+ 42 1
45
+ 43 1
46
+ 44 1
47
+ 45 1
48
+ 46 1
49
+ 47 1
50
+ 48 1
51
+ 49 1
52
+ 50 1
53
+ 51 1
54
+ 52 1
55
+ 53 1
56
+ 54 1
57
+ 55 1
58
+ 56 1
59
+ 57 1
60
+ 58 1
61
+ 59 1
62
+ 60 1
63
+ 61 1
64
+ 62 1
65
+ 63 1
66
+ 64 1
67
+ 65 1
68
+ 66 1
69
+ 67 1
70
+ 68 1
71
+ 69 1
72
+ 70 1
73
+ 71 1
74
+ 72 1
75
+ 73 1
76
+ 74 1
77
+ 75 1
78
+ 76 1
79
+ 77 1
80
+ 78 1
81
+ 79 1
82
+ 80 1
83
+ 81 1
84
+ 82 1
85
+ 83 1
86
+ 84 1
87
+ 85 1
88
+ 86 1
89
+ 87 1
90
+ 88 1
91
+ 89 1
92
+ 90 1
93
+ 91 1
94
+ 92 1
95
+ 93 1
96
+ 94 1
97
+ 95 1
98
+ 96 1
99
+ 97 1
100
+ 98 1
101
+ 99 1
102
+ 100 1
103
+ 101 1
104
+ 102 1
105
+ 103 1
106
+ 104 1
107
+ 105 1
108
+ 106 1
109
+ 107 1
110
+ 108 1
111
+ 109 1
112
+ 110 1
113
+ 111 1
114
+ 112 1
115
+ 113 1
116
+ 114 1
117
+ 115 1
118
+ 116 1
119
+ 117 1
120
+ 118 1
121
+ 119 1
122
+ 120 1
123
+ 121 1
124
+ 122 1
125
+ 123 1
126
+ 124 1
127
+ 125 1
128
+ 126 1
129
+ 127 1
130
+ 128 1
131
+ 129 1
132
+ 130 1
133
+ 131 1
134
+ 132 1
135
+ 133 1
136
+ 134 1
137
+ 135 1
138
+ 136 1
139
+ 137 1
140
+ 138 1
141
+ 139 1
142
+ 140 1
143
+ 141 1
144
+ 142 1
145
+ 143 1
146
+ 144 1
147
+ 145 1
148
+ 146 1
149
+ 147 1
150
+ 148 1
151
+ 149 1
152
+ 150 1
153
+ 151 1
154
+ 152 1
155
+ 153 1
156
+ 154 1
157
+ 155 1
158
+ 156 1
159
+ 157 1
160
+ 158 1
161
+ 159 1
162
+ 160 1
163
+ 161 1
164
+ 162 1
165
+ 163 1
166
+ 164 1
167
+ 165 1
168
+ 166 1
169
+ 167 1
170
+ 168 1
171
+ 169 1
172
+ 170 1
173
+ 171 1
174
+ 172 1
175
+ 173 1
176
+ 174 1
177
+ 175 1
178
+ 176 1
179
+ 177 1
180
+ 178 1
181
+ 179 1
182
+ 180 1
183
+ 181 1
184
+ 182 1
185
+ 183 1
186
+ 184 1
187
+ 185 1
188
+ 186 1
189
+ 187 1
190
+ 188 1
191
+ 189 1
192
+ 190 1
193
+ 191 1
194
+ 192 1
195
+ 193 1
196
+ 194 1
197
+ 195 1
198
+ 196 1
199
+ 197 1
200
+ 198 1
201
+ 199 1
202
+ 200 1
203
+ 201 1
204
+ 202 1
205
+ 203 1
206
+ 204 1
207
+ 205 1
208
+ 206 1
209
+ 207 1
210
+ 208 1
211
+ 209 1
212
+ 210 1
213
+ 211 1
214
+ 212 1
215
+ 213 1
216
+ 214 1
217
+ 215 1
218
+ 216 1
219
+ 217 1
220
+ 218 1
221
+ 219 1
222
+ 220 1
223
+ 221 1
224
+ 222 1
225
+ 223 1
226
+ 224 1
227
+ 225 1
228
+ 226 1
229
+ 227 1
230
+ 228 1
231
+ 229 1
232
+ 230 1
233
+ 231 1
234
+ 232 1
235
+ 233 1
236
+ 234 1
237
+ 235 1
238
+ 236 1
239
+ 237 1
240
+ 238 1
241
+ 239 1
242
+ 240 1
243
+ 241 1
244
+ 242 1
245
+ 243 1
246
+ 244 1
247
+ 245 1
248
+ 246 1
249
+ 247 1
250
+ 248 1
251
+ 249 1
252
+ 250 1
253
+ 251 1
254
+ 252 1
255
+ 253 1
256
+ 254 1
257
+ 255 1
258
+ 256 1
259
+ 257 1
260
+ 258 1
261
+ 259 1
262
+ 260 1
263
+ 261 1
264
+ 262 1
265
+ 263 1
266
+ 264 1
267
+ 265 1
268
+ 266 1
269
+ 267 1
270
+ 268 1
271
+ 269 1
272
+ 270 1
273
+ 271 1
274
+ 272 1
275
+ 273 1
276
+ 274 1
277
+ 275 1
278
+ 276 1
279
+ 277 1
280
+ 278 1
281
+ 279 1
282
+ 280 1
283
+ 281 1
284
+ 282 1
285
+ 283 1
286
+ 284 1
287
+ 285 1
288
+ 286 1
289
+ 287 1
290
+ 288 1
291
+ 289 1
292
+ 290 1
293
+ 291 1
294
+ 292 1
295
+ 293 1
296
+ 294 1
297
+ 295 1
298
+ 296 1
299
+ 297 1
300
+ 298 1
301
+ 299 1
302
+ 300 1
303
+ 301 1
304
+ 302 1
305
+ 303 1
306
+ 304 1
307
+ 305 1
308
+ 306 1
309
+ 307 1
310
+ 308 1
311
+ 309 1
312
+ 310 1
313
+ 311 1
314
+ 312 1
315
+ 313 1
316
+ 314 1
317
+ 315 1
318
+ 316 1
319
+ 317 1
320
+ 318 1
321
+ 319 1
322
+ 320 1
323
+ 321 1
324
+ 322 1
325
+ 323 1
326
+ 324 1
327
+ 325 1
328
+ 326 1
329
+ 327 1
330
+ 328 1
331
+ 329 1
332
+ 330 1
333
+ 331 1
334
+ 332 1
335
+ 333 1
336
+ 334 1
337
+ 335 1
338
+ 336 1
339
+ 337 1
340
+ 338 1
341
+ 339 1
342
+ 340 1
343
+ 341 1
344
+ 342 1
345
+ 343 1
346
+ 344 1
347
+ 345 1
348
+ 346 1
349
+ 347 1
350
+ 348 1
351
+ 349 1
352
+ 350 1
353
+ 351 1
354
+ 352 1
355
+ 353 1
356
+ 354 1
357
+ 355 1
358
+ 356 1
359
+ 357 1
360
+ 358 1
361
+ 359 1
362
+ 360 1
363
+ 361 1
364
+ 362 1
365
+ 363 1
366
+ 364 1
367
+ 365 1
368
+ 366 1
369
+ 367 1
370
+ 368 1
371
+ 369 1
372
+ 370 1
373
+ 371 1
374
+ 372 1
375
+ 373 1
376
+ 374 1
377
+ 375 1
378
+ 376 1
379
+ 377 1
380
+ 378 1
381
+ 379 1
382
+ 380 1
383
+ 381 1
384
+ 382 1
385
+ 383 1
386
+ 384 1
387
+ 385 1
388
+ 386 1
389
+ 387 1
390
+ 388 1
391
+ 389 1
392
+ 390 1
393
+ 391 1
394
+ 392 1
395
+ 393 1
396
+ 394 1
397
+ 395 1
398
+ 396 1
399
+ 397 1
400
+ 398 1
401
+ 399 1
402
+ 400 1
403
+ 401 1
404
+ 402 1
405
+ 403 1
406
+ 404 1
407
+ 405 1
408
+ 406 1
409
+ 407 1
410
+ 408 1
411
+ 409 1
412
+ 410 1
413
+ 411 1
414
+ 412 1
415
+ 413 1
416
+ 414 1
417
+ 415 1
418
+ 416 1
419
+ 417 1
420
+ 418 1
421
+ 419 1
422
+ 420 1
423
+ 421 1
424
+ 422 1
425
+ 423 1
426
+ 424 1
427
+ 425 1
428
+ 426 1
429
+ 427 1
430
+ 428 1
431
+ 429 1
432
+ 430 1
433
+ 431 1
434
+ 432 1
435
+ 433 1
436
+ 434 1
437
+ 435 1
438
+ 436 1
439
+ 437 1
440
+ 438 1
441
+ 439 1
442
+ 440 1
443
+ 441 1
444
+ 442 1
445
+ 443 1
446
+ 444 1
447
+ 445 1
448
+ 446 1
449
+ 447 1
450
+ 448 1
451
+ 449 1
452
+ 450 1
453
+ 451 1
454
+ 452 1
455
+ 453 1
456
+ 454 1
457
+ 455 1
458
+ 456 1
459
+ 457 1
460
+ 458 1
461
+ 459 1
462
+ 460 1
463
+ 461 1
464
+ 462 1
465
+ 463 1
466
+ 464 1
467
+ 465 1
468
+ 466 1
469
+ 467 1
470
+ 468 1
471
+ 469 1
472
+ 470 1
473
+ 471 1
474
+ 472 1
475
+ 473 1
476
+ 474 1
477
+ 475 1
478
+ 476 1
479
+ 477 1
480
+ 478 1
481
+ 479 1
482
+ 480 1
483
+ 481 1
484
+ 482 1
485
+ 483 1
486
+ 484 1
487
+ 485 1
488
+ 486 1
489
+ 487 1
490
+ 488 1
491
+ 489 1
492
+ 490 1
493
+ 491 1
494
+ 492 1
495
+ 493 1
496
+ 494 1
497
+ 495 1
498
+ 496 1
499
+ 497 1
500
+ 498 1
501
+ 499 1
502
+ 500 1
503
+ 501 1
504
+ 502 1
505
+ 503 1
506
+ 504 1
507
+ 505 1
508
+ 506 1
509
+ 507 1
510
+ 508 1
511
+ 509 1
512
+ 510 1
513
+ 511 1
514
+ 512 1
515
+ 513 1
516
+ 514 1
517
+ 515 1
518
+ 516 1
519
+ 517 1
520
+ 518 1
521
+ 519 1
522
+ 520 1
523
+ 521 1
524
+ 522 1
525
+ 523 1
526
+ 524 1
527
+ 525 1
528
+ 526 1
529
+ 527 1
530
+ 528 1
531
+ 529 1
532
+ 530 1
533
+ 531 1
534
+ 532 1
535
+ 533 1
536
+ 534 1
537
+ 535 1
538
+ 536 1
539
+ 537 1
540
+ 538 1
541
+ 539 1
542
+ 540 1
543
+ 541 1
544
+ 542 1
545
+ 543 1
546
+ 544 1
547
+ 545 1
548
+ 546 1
549
+ 547 1
550
+ 548 1
551
+ 549 1
552
+ 550 1
553
+ 551 1
554
+ 552 1
555
+ 553 1
556
+ 554 1
557
+ 555 1
558
+ 556 1
559
+ 557 1
560
+ 558 1
561
+ 559 1
562
+ 560 1
563
+ 561 1
564
+ 562 1
565
+ 563 1
566
+ 564 1
567
+ 565 1
568
+ 566 1
569
+ 567 1
570
+ 568 1
571
+ 569 1
572
+ 570 1
573
+ 571 1
574
+ 572 1
575
+ 573 1
576
+ 574 1
577
+ 575 1
578
+ 576 1
579
+ 577 1
580
+ 578 1
581
+ 579 1
582
+ 580 1
583
+ 581 1
584
+ 582 1
585
+ 583 1
586
+ 584 1
587
+ 585 1
588
+ 586 1
589
+ 587 1
590
+ 588 1
591
+ 589 1
592
+ 590 1
593
+ 591 1
594
+ 592 1
595
+ 593 1
596
+ 594 1
597
+ 595 1
598
+ 596 1
599
+ 597 1
600
+ 598 1
601
+ 599 1
602
+ 600 1
603
+ 601 1
604
+ 602 1
605
+ 603 1
606
+ 604 1
607
+ 605 1
608
+ 606 1
609
+ 607 1
610
+ 608 1
611
+ 609 1
612
+ 610 1
613
+ 611 1
614
+ 612 1
615
+ 613 1
616
+ 614 1
617
+ 615 1
618
+ 616 1
619
+ 617 1
620
+ 618 1
621
+ 619 1
622
+ 620 1
623
+ 621 1
624
+ 622 1
625
+ 623 1
626
+ 624 1
627
+ 625 1
628
+ 626 1
629
+ 627 1
630
+ 628 1
631
+ 629 1
632
+ 630 1
633
+ 631 1
634
+ 632 1
635
+ 633 1
636
+ 634 1
637
+ 635 1
638
+ 636 1
639
+ 637 1
640
+ 638 1
641
+ 639 1
642
+ 640 1
643
+ 641 1
644
+ 642 1
645
+ 643 1
646
+ 644 1
647
+ 645 1
648
+ 646 1
649
+ 647 1
650
+ 648 1
651
+ 649 1
652
+ 650 1
653
+ 651 1
654
+ 652 1
655
+ 653 1
656
+ 654 1
657
+ 655 1
658
+ 656 1
659
+ 657 1
660
+ 658 1
661
+ 659 1
662
+ 660 1
663
+ 661 1
664
+ 662 1
665
+ 663 1
666
+ 664 1
667
+ 665 1
668
+ 666 1
669
+ 667 1
670
+ 668 1
671
+ 669 1
672
+ 670 1
673
+ 671 1
674
+ 672 1
675
+ 673 1
676
+ 674 1
677
+ 675 1
678
+ 676 1
679
+ 677 1
680
+ 678 1
681
+ 679 1
682
+ 680 1
683
+ 681 1
684
+ 682 1
685
+ 683 1
686
+ 684 1
687
+ 685 1
688
+ 686 1
689
+ 687 1
690
+ 688 1
691
+ 689 1
692
+ 690 1
693
+ 691 1
694
+ 692 1
695
+ 693 1
696
+ 694 1
697
+ 695 1
698
+ 696 1
699
+ 697 1
700
+ 698 1
701
+ 699 1
702
+ 700 1
703
+ 701 1
704
+ 702 1
705
+ 703 1
706
+ 704 1
707
+ 705 1
708
+ 706 1
709
+ 707 1
710
+ 708 1
711
+ 709 1
712
+ 710 1
713
+ 711 1
714
+ 712 1
715
+ 713 1
716
+ 714 1
717
+ 715 1
718
+ 716 1
719
+ 717 1
720
+ 718 1
721
+ 719 1
722
+ 720 1
723
+ 721 1
724
+ 722 1
725
+ 723 1
726
+ 724 1
727
+ 725 1
728
+ 726 1
729
+ 727 1
730
+ 728 1
731
+ 729 1
732
+ 730 1
733
+ 731 1
734
+ 732 1
735
+ 733 1
736
+ 734 1
737
+ 735 1
738
+ 736 1
739
+ 737 1
740
+ 738 1
741
+ 739 1
742
+ 740 1
743
+ 741 1
744
+ 742 1
745
+ 743 1
746
+ 744 1
747
+ 745 1
748
+ 746 1
749
+ 747 1
750
+ 748 1
751
+ 749 1
752
+ 750 1
753
+ 751 1
754
+ 752 1
755
+ 753 1
756
+ 754 1
757
+ 755 1
758
+ 756 1
759
+ 757 1
760
+ 758 1
761
+ 759 1
762
+ 760 1
763
+ 761 1
764
+ 762 1
765
+ 763 1
766
+ 764 1
767
+ 765 1
768
+ 766 1
769
+ 767 1
770
+ 768 1
771
+ 769 1
772
+ 770 1
773
+ 771 1
774
+ 772 1
775
+ 773 1
776
+ 774 1
777
+ 775 1
778
+ 776 1
779
+ 777 1
780
+ 778 1
781
+ 779 1
782
+ 780 1
783
+ 781 1
784
+ 782 1
785
+ 783 1
786
+ 784 1
787
+ 785 1
788
+ 786 1
789
+ 787 1
790
+ 788 1
791
+ 789 1
792
+ 790 1
793
+ 791 1
794
+ 792 1
795
+ 793 1
796
+ 794 1
797
+ 795 1
798
+ 796 1
799
+ 797 1
800
+ 798 1
801
+ 799 1
802
+ 800 1
803
+ 801 1
804
+ 802 1
805
+ 803 1
806
+ 804 1
807
+ 805 1
808
+ 806 1
809
+ 807 1
810
+ 808 1
811
+ 809 1
812
+ 810 1
813
+ 811 1
814
+ 812 1
815
+ 813 1
816
+ 814 1
817
+ 815 1
818
+ 816 1
819
+ 817 1
820
+ 818 1
821
+ 819 1
822
+ 820 1
823
+ 821 1
824
+ 822 1
825
+ 823 1
826
+ 824 1
827
+ 825 1
828
+ 826 1
829
+ 827 1
830
+ 828 1
831
+ 829 1
832
+ 830 1
833
+ 831 1
834
+ 832 1
835
+ 833 1
836
+ 834 1
837
+ 835 1
838
+ 836 1
839
+ 837 1
840
+ 838 1
841
+ 839 1
842
+ 840 1
843
+ 841 1
844
+ 842 1
845
+ 843 1
846
+ 844 1
847
+ 845 1
848
+ 846 1
849
+ 847 1
850
+ 848 1
851
+ 849 1
852
+ 850 1
853
+ 851 1
854
+ 852 1
855
+ 853 1
856
+ 854 1
857
+ 855 1
858
+ 856 1
859
+ 857 1
860
+ 858 1
861
+ 859 1
862
+ 860 1
863
+ 861 1
864
+ 862 1
865
+ 863 1
866
+ 864 1
867
+ 865 1
868
+ 866 1
869
+ 867 1
870
+ 868 1
871
+ 869 1
872
+ 870 1
873
+ 871 1
874
+ 872 1
875
+ 873 1
876
+ 874 1
877
+ 875 1
878
+ 876 1
879
+ 877 1
880
+ 878 1
881
+ 879 1
882
+ 880 1
883
+ 881 1
884
+ 882 1
885
+ 883 1
886
+ 884 1
887
+ 885 1
888
+ 886 1
889
+ 887 1
890
+ 888 1
891
+ 889 1
892
+ 890 1
893
+ 891 1
894
+ 892 1
895
+ 893 1
896
+ 894 1
897
+ 895 1
898
+ 896 1
899
+ 897 1
900
+ 898 1
901
+ 899 1
902
+ 900 1
903
+ 901 1
904
+ 902 1
905
+ 903 1
906
+ 904 1
907
+ 905 1
908
+ 906 1
909
+ 907 1
910
+ 908 1
911
+ 909 1
912
+ 910 1
913
+ 911 1
914
+ 912 1
915
+ 913 1
916
+ 914 1
917
+ 915 1
918
+ 916 1
919
+ 917 1
920
+ 918 1
921
+ 919 1
922
+ 920 1
923
+ 921 1
924
+ 922 1
925
+ 923 1
926
+ 924 1
927
+ 925 1
928
+ 926 1
929
+ 927 1
930
+ 928 1
931
+ 929 1
932
+ 930 1
933
+ 931 1
934
+ 932 1
935
+ 933 1
936
+ 934 1
937
+ 935 1
938
+ 936 1
939
+ 937 1
940
+ 938 1
941
+ 939 1
942
+ 940 1
943
+ 941 1
944
+ 942 1
945
+ 943 1
946
+ 944 1
947
+ 945 1
948
+ 946 1
949
+ 947 1
950
+ 948 1
951
+ 949 1
952
+ 950 1
953
+ 951 1
954
+ 952 1
955
+ 953 1
956
+ 954 1
957
+ 955 1
958
+ 956 1
959
+ 957 1
960
+ 958 1
961
+ 959 1
962
+ 960 1
963
+ 961 1
964
+ 962 1
965
+ 963 1
966
+ 964 1
967
+ 965 1
968
+ 966 1
969
+ 967 1
970
+ 968 1
971
+ 969 1
972
+ 970 1
973
+ 971 1
974
+ 972 1
975
+ 973 1
976
+ 974 1
977
+ 975 1
978
+ 976 1
979
+ 977 1
980
+ 978 1
981
+ 979 1
982
+ 980 1
983
+ 981 1
984
+ 982 1
985
+ 983 1
986
+ 984 1
987
+ 985 1
988
+ 986 1
989
+ 987 1
990
+ 988 1
991
+ 989 1
992
+ 990 1
993
+ 991 1
994
+ 992 1
995
+ 993 1
996
+ 994 1
997
+ 995 1
998
+ 996 1
999
+ 997 1
1000
+ 998 1
1001
+ 999 1
1002
+ 1000 1
1003
+ 1001 1
1004
+ 1002 1
1005
+ 1003 1
1006
+ 1004 1
1007
+ 1005 1
1008
+ 1006 1
1009
+ 1007 1
1010
+ 1008 1
1011
+ 1009 1
1012
+ 1010 1
1013
+ 1011 1
1014
+ 1012 1
1015
+ 1013 1
1016
+ 1014 1
1017
+ 1015 1
1018
+ 1016 1
1019
+ 1017 1
1020
+ 1018 1
1021
+ 1019 1
1022
+ 1020 1
1023
+ 1021 1
1024
+ 1022 1
1025
+ 1023 1
1026
+ 1024 1
1027
+ 1025 1
1028
+ 1026 1
1029
+ 1027 1
1030
+ 1028 1
1031
+ 1029 1
1032
+ 1030 1
1033
+ 1031 1
1034
+ 1032 1
1035
+ 1033 1
1036
+ 1034 1
1037
+ 1035 1
1038
+ 1036 1
1039
+ 1037 1
1040
+ 1038 1
1041
+ 1039 1
1042
+ 1040 1
1043
+ 1041 1
1044
+ 1042 1
1045
+ 1043 1
1046
+ 1044 1
1047
+ 1045 1
1048
+ 1046 1
1049
+ 1047 1
1050
+ 1048 1
1051
+ 1049 1
1052
+ 1050 1
1053
+ 1051 1
1054
+ 1052 1
1055
+ 1053 1
1056
+ 1054 1
1057
+ 1055 1
1058
+ 1056 1
1059
+ 1057 1
1060
+ 1058 1
1061
+ 1059 1
1062
+ 1060 1
1063
+ 1061 1
1064
+ 1062 1
reproduction/glue_exp/cola/dr0.0,mlr9e-03,clr9e-03,ep=34.0t=21d10h53m36/all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 34.0,
3
+ "eval_loss": 0.6913050413131714,
4
+ "eval_matthews_correlation": 0.0,
5
+ "eval_runtime": 0.4247,
6
+ "eval_samples": 1043,
7
+ "eval_samples_per_second": 2455.672,
8
+ "eval_steps_per_second": 7.063
9
+ }
reproduction/glue_exp/cola/dr0.0,mlr9e-03,clr9e-03,ep=34.0t=21d10h53m36/eval_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 34.0,
3
+ "eval_loss": 0.6913050413131714,
4
+ "eval_matthews_correlation": 0.0,
5
+ "eval_runtime": 0.4247,
6
+ "eval_samples": 1043,
7
+ "eval_samples_per_second": 2455.672,
8
+ "eval_steps_per_second": 7.063
9
+ }
reproduction/glue_exp/cola/dr0.0,mlr9e-03,clr9e-03,ep=34.0t=21d10h53m36/ft/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[MASK]": 128000
3
+ }
reproduction/glue_exp/cola/dr0.0,mlr9e-03,clr9e-03,ep=34.0t=21d10h53m36/ft/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[CLS]",
3
+ "cls_token": "[CLS]",
4
+ "eos_token": "[SEP]",
5
+ "mask_token": "[MASK]",
6
+ "pad_token": "[PAD]",
7
+ "sep_token": "[SEP]",
8
+ "unk_token": {
9
+ "content": "[UNK]",
10
+ "lstrip": false,
11
+ "normalized": true,
12
+ "rstrip": false,
13
+ "single_word": false
14
+ }
15
+ }
reproduction/glue_exp/cola/dr0.0,mlr9e-03,clr9e-03,ep=34.0t=21d10h53m36/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
reproduction/glue_exp/cola/dr0.0,mlr9e-03,clr9e-03,ep=34.0t=21d10h53m36/ft/tokenizer_config.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[UNK]",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "128000": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "[CLS]",
45
+ "clean_up_tokenization_spaces": false,
46
+ "cls_token": "[CLS]",
47
+ "do_lower_case": false,
48
+ "eos_token": "[SEP]",
49
+ "extra_special_tokens": {},
50
+ "mask_token": "[MASK]",
51
+ "model_max_length": 512,
52
+ "pad_token": "[PAD]",
53
+ "padding_side": "right",
54
+ "sep_token": "[SEP]",
55
+ "sp_model_kwargs": {},
56
+ "split_by_punct": false,
57
+ "tokenizer_class": "DebertaV2Tokenizer",
58
+ "unk_token": "[UNK]",
59
+ "vocab_type": "spm"
60
+ }
reproduction/glue_exp/cola/dr0.0,mlr9e-03,clr9e-03,ep=34.0t=21d10h53m36/ft2/README.md ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: microsoft/deberta-v3-base
3
+ library_name: peft
4
+ tags:
5
+ - base_model:adapter:microsoft/deberta-v3-base
6
+ - transformers
7
+ ---
8
+
9
+ # Model Card for Model ID
10
+
11
+ <!-- Provide a quick summary of what the model is/does. -->
12
+
13
+
14
+
15
+ ## Model Details
16
+
17
+ ### Model Description
18
+
19
+ <!-- Provide a longer summary of what this model is. -->
20
+
21
+
22
+
23
+ - **Developed by:** [More Information Needed]
24
+ - **Funded by [optional]:** [More Information Needed]
25
+ - **Shared by [optional]:** [More Information Needed]
26
+ - **Model type:** [More Information Needed]
27
+ - **Language(s) (NLP):** [More Information Needed]
28
+ - **License:** [More Information Needed]
29
+ - **Finetuned from model [optional]:** [More Information Needed]
30
+
31
+ ### Model Sources [optional]
32
+
33
+ <!-- Provide the basic links for the model. -->
34
+
35
+ - **Repository:** [More Information Needed]
36
+ - **Paper [optional]:** [More Information Needed]
37
+ - **Demo [optional]:** [More Information Needed]
38
+
39
+ ## Uses
40
+
41
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
42
+
43
+ ### Direct Use
44
+
45
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
46
+
47
+ [More Information Needed]
48
+
49
+ ### Downstream Use [optional]
50
+
51
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
52
+
53
+ [More Information Needed]
54
+
55
+ ### Out-of-Scope Use
56
+
57
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
58
+
59
+ [More Information Needed]
60
+
61
+ ## Bias, Risks, and Limitations
62
+
63
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
64
+
65
+ [More Information Needed]
66
+
67
+ ### Recommendations
68
+
69
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
70
+
71
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
72
+
73
+ ## How to Get Started with the Model
74
+
75
+ Use the code below to get started with the model.
76
+
77
+ [More Information Needed]
78
+
79
+ ## Training Details
80
+
81
+ ### Training Data
82
+
83
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
84
+
85
+ [More Information Needed]
86
+
87
+ ### Training Procedure
88
+
89
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
90
+
91
+ #### Preprocessing [optional]
92
+
93
+ [More Information Needed]
94
+
95
+
96
+ #### Training Hyperparameters
97
+
98
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
99
+
100
+ #### Speeds, Sizes, Times [optional]
101
+
102
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
103
+
104
+ [More Information Needed]
105
+
106
+ ## Evaluation
107
+
108
+ <!-- This section describes the evaluation protocols and provides the results. -->
109
+
110
+ ### Testing Data, Factors & Metrics
111
+
112
+ #### Testing Data
113
+
114
+ <!-- This should link to a Dataset Card if possible. -->
115
+
116
+ [More Information Needed]
117
+
118
+ #### Factors
119
+
120
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
121
+
122
+ [More Information Needed]
123
+
124
+ #### Metrics
125
+
126
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
127
+
128
+ [More Information Needed]
129
+
130
+ ### Results
131
+
132
+ [More Information Needed]
133
+
134
+ #### Summary
135
+
136
+
137
+
138
+ ## Model Examination [optional]
139
+
140
+ <!-- Relevant interpretability work for the model goes here -->
141
+
142
+ [More Information Needed]
143
+
144
+ ## Environmental Impact
145
+
146
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
147
+
148
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
149
+
150
+ - **Hardware Type:** [More Information Needed]
151
+ - **Hours used:** [More Information Needed]
152
+ - **Cloud Provider:** [More Information Needed]
153
+ - **Compute Region:** [More Information Needed]
154
+ - **Carbon Emitted:** [More Information Needed]
155
+
156
+ ## Technical Specifications [optional]
157
+
158
+ ### Model Architecture and Objective
159
+
160
+ [More Information Needed]
161
+
162
+ ### Compute Infrastructure
163
+
164
+ [More Information Needed]
165
+
166
+ #### Hardware
167
+
168
+ [More Information Needed]
169
+
170
+ #### Software
171
+
172
+ [More Information Needed]
173
+
174
+ ## Citation [optional]
175
+
176
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
177
+
178
+ **BibTeX:**
179
+
180
+ [More Information Needed]
181
+
182
+ **APA:**
183
+
184
+ [More Information Needed]
185
+
186
+ ## Glossary [optional]
187
+
188
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
189
+
190
+ [More Information Needed]
191
+
192
+ ## More Information [optional]
193
+
194
+ [More Information Needed]
195
+
196
+ ## Model Card Authors [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Contact
201
+
202
+ [More Information Needed]
203
+ ### Framework versions
204
+
205
+ - PEFT 0.18.0
reproduction/glue_exp/cola/dr0.0,mlr9e-03,clr9e-03,ep=34.0t=21d10h53m36/ft2/adapter_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "apply_GS": false,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "microsoft/deberta-v3-base",
5
+ "bias": "none",
6
+ "exclude_modules": null,
7
+ "inference_mode": true,
8
+ "init_weights": true,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "modules_to_save": [
12
+ "classifier",
13
+ "pooler",
14
+ "classifier",
15
+ "score"
16
+ ],
17
+ "peft_type": "HRA",
18
+ "peft_version": "0.18.0",
19
+ "r": 8,
20
+ "revision": null,
21
+ "target_modules": [
22
+ "attention.output.dense",
23
+ "intermediate.dense",
24
+ "value_proj",
25
+ "output.dense",
26
+ "key_proj",
27
+ "query_proj"
28
+ ],
29
+ "task_type": "SEQ_CLS"
30
+ }
reproduction/glue_exp/cola/dr0.0,mlr9e-03,clr9e-03,ep=34.0t=21d10h53m36/trainer_state.json ADDED
@@ -0,0 +1,1499 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 100,
3
+ "best_metric": 0.0,
4
+ "best_model_checkpoint": "./glue_exp/cola/dr0.0,mlr9e-03,clr9e-03,ep=34.0t=21d10h53m36/checkpoint-100",
5
+ "epoch": 34.0,
6
+ "eval_steps": 100,
7
+ "global_step": 9112,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.373134328358209,
14
+ "grad_norm": 2.7437047958374023,
15
+ "learning_rate": 0.00891,
16
+ "loss": 0.597,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.373134328358209,
21
+ "eval_loss": 0.6913050413131714,
22
+ "eval_matthews_correlation": 0.0,
23
+ "eval_runtime": 0.5068,
24
+ "eval_samples_per_second": 2057.829,
25
+ "eval_steps_per_second": 5.919,
26
+ "step": 100
27
+ },
28
+ {
29
+ "epoch": 0.746268656716418,
30
+ "grad_norm": 0.46689021587371826,
31
+ "learning_rate": 0.00899730008546003,
32
+ "loss": 0.6175,
33
+ "step": 200
34
+ },
35
+ {
36
+ "epoch": 0.746268656716418,
37
+ "eval_loss": 0.6228563785552979,
38
+ "eval_matthews_correlation": 0.0,
39
+ "eval_runtime": 0.4323,
40
+ "eval_samples_per_second": 2412.816,
41
+ "eval_steps_per_second": 6.94,
42
+ "step": 200
43
+ },
44
+ {
45
+ "epoch": 1.1194029850746268,
46
+ "grad_norm": 0.27211815118789673,
47
+ "learning_rate": 0.008989094295694706,
48
+ "loss": 0.63,
49
+ "step": 300
50
+ },
51
+ {
52
+ "epoch": 1.1194029850746268,
53
+ "eval_loss": 0.623540997505188,
54
+ "eval_matthews_correlation": 0.0,
55
+ "eval_runtime": 0.4265,
56
+ "eval_samples_per_second": 2445.319,
57
+ "eval_steps_per_second": 7.034,
58
+ "step": 300
59
+ },
60
+ {
61
+ "epoch": 1.4925373134328357,
62
+ "grad_norm": 0.8687720894813538,
63
+ "learning_rate": 0.00897539240174535,
64
+ "loss": 0.6336,
65
+ "step": 400
66
+ },
67
+ {
68
+ "epoch": 1.4925373134328357,
69
+ "eval_loss": 0.6220645308494568,
70
+ "eval_matthews_correlation": 0.0,
71
+ "eval_runtime": 0.4279,
72
+ "eval_samples_per_second": 2437.49,
73
+ "eval_steps_per_second": 7.011,
74
+ "step": 400
75
+ },
76
+ {
77
+ "epoch": 1.8656716417910446,
78
+ "grad_norm": 0.050571564584970474,
79
+ "learning_rate": 0.008956211179164375,
80
+ "loss": 0.6548,
81
+ "step": 500
82
+ },
83
+ {
84
+ "epoch": 1.8656716417910446,
85
+ "eval_loss": 0.6198702454566956,
86
+ "eval_matthews_correlation": 0.0,
87
+ "eval_runtime": 0.4234,
88
+ "eval_samples_per_second": 2463.12,
89
+ "eval_steps_per_second": 7.085,
90
+ "step": 500
91
+ },
92
+ {
93
+ "epoch": 2.2388059701492535,
94
+ "grad_norm": 0.22496835887432098,
95
+ "learning_rate": 0.008931574111975675,
96
+ "loss": 0.6166,
97
+ "step": 600
98
+ },
99
+ {
100
+ "epoch": 2.2388059701492535,
101
+ "eval_loss": 0.6180900931358337,
102
+ "eval_matthews_correlation": 0.0,
103
+ "eval_runtime": 0.4225,
104
+ "eval_samples_per_second": 2468.392,
105
+ "eval_steps_per_second": 7.1,
106
+ "step": 600
107
+ },
108
+ {
109
+ "epoch": 2.611940298507463,
110
+ "grad_norm": 0.14902736246585846,
111
+ "learning_rate": 0.008901511363922565,
112
+ "loss": 0.6049,
113
+ "step": 700
114
+ },
115
+ {
116
+ "epoch": 2.611940298507463,
117
+ "eval_loss": 0.6187416911125183,
118
+ "eval_matthews_correlation": 0.0,
119
+ "eval_runtime": 0.4299,
120
+ "eval_samples_per_second": 2426.415,
121
+ "eval_steps_per_second": 6.979,
122
+ "step": 700
123
+ },
124
+ {
125
+ "epoch": 2.9850746268656714,
126
+ "grad_norm": 0.10430820286273956,
127
+ "learning_rate": 0.008866059741537603,
128
+ "loss": 0.5999,
129
+ "step": 800
130
+ },
131
+ {
132
+ "epoch": 2.9850746268656714,
133
+ "eval_loss": 0.6188157200813293,
134
+ "eval_matthews_correlation": 0.0,
135
+ "eval_runtime": 0.428,
136
+ "eval_samples_per_second": 2437.088,
137
+ "eval_steps_per_second": 7.01,
138
+ "step": 800
139
+ },
140
+ {
141
+ "epoch": 3.3582089552238807,
142
+ "grad_norm": 0.18977157771587372,
143
+ "learning_rate": 0.008825262649079487,
144
+ "loss": 0.618,
145
+ "step": 900
146
+ },
147
+ {
148
+ "epoch": 3.3582089552238807,
149
+ "eval_loss": 0.6181120872497559,
150
+ "eval_matthews_correlation": 0.0,
151
+ "eval_runtime": 0.4218,
152
+ "eval_samples_per_second": 2472.875,
153
+ "eval_steps_per_second": 7.113,
154
+ "step": 900
155
+ },
156
+ {
157
+ "epoch": 3.7313432835820897,
158
+ "grad_norm": 0.024376844987273216,
159
+ "learning_rate": 0.008779170035392189,
160
+ "loss": 0.6018,
161
+ "step": 1000
162
+ },
163
+ {
164
+ "epoch": 3.7313432835820897,
165
+ "eval_loss": 0.6186581254005432,
166
+ "eval_matthews_correlation": 0.0,
167
+ "eval_runtime": 0.4232,
168
+ "eval_samples_per_second": 2464.683,
169
+ "eval_steps_per_second": 7.089,
170
+ "step": 1000
171
+ },
172
+ {
173
+ "epoch": 4.104477611940299,
174
+ "grad_norm": 0.04904789477586746,
175
+ "learning_rate": 0.008727838332751408,
176
+ "loss": 0.602,
177
+ "step": 1100
178
+ },
179
+ {
180
+ "epoch": 4.104477611940299,
181
+ "eval_loss": 0.6193557381629944,
182
+ "eval_matthews_correlation": 0.0,
183
+ "eval_runtime": 0.4255,
184
+ "eval_samples_per_second": 2451.227,
185
+ "eval_steps_per_second": 7.051,
186
+ "step": 1100
187
+ },
188
+ {
189
+ "epoch": 4.477611940298507,
190
+ "grad_norm": 0.17297646403312683,
191
+ "learning_rate": 0.008671330387773206,
192
+ "loss": 0.6021,
193
+ "step": 1200
194
+ },
195
+ {
196
+ "epoch": 4.477611940298507,
197
+ "eval_loss": 0.618083655834198,
198
+ "eval_matthews_correlation": 0.0,
199
+ "eval_runtime": 0.4223,
200
+ "eval_samples_per_second": 2469.707,
201
+ "eval_steps_per_second": 7.104,
202
+ "step": 1200
203
+ },
204
+ {
205
+ "epoch": 4.850746268656716,
206
+ "grad_norm": 0.2165410965681076,
207
+ "learning_rate": 0.008609715384469405,
208
+ "loss": 0.6144,
209
+ "step": 1300
210
+ },
211
+ {
212
+ "epoch": 4.850746268656716,
213
+ "eval_loss": 0.6193823218345642,
214
+ "eval_matthews_correlation": 0.0,
215
+ "eval_runtime": 1.379,
216
+ "eval_samples_per_second": 756.328,
217
+ "eval_steps_per_second": 2.175,
218
+ "step": 1300
219
+ },
220
+ {
221
+ "epoch": 5.223880597014926,
222
+ "grad_norm": 0.033694978803396225,
223
+ "learning_rate": 0.00854306875954397,
224
+ "loss": 0.6039,
225
+ "step": 1400
226
+ },
227
+ {
228
+ "epoch": 5.223880597014926,
229
+ "eval_loss": 0.618939220905304,
230
+ "eval_matthews_correlation": 0.0,
231
+ "eval_runtime": 0.426,
232
+ "eval_samples_per_second": 2448.118,
233
+ "eval_steps_per_second": 7.042,
234
+ "step": 1400
235
+ },
236
+ {
237
+ "epoch": 5.597014925373134,
238
+ "grad_norm": 0.245077446103096,
239
+ "learning_rate": 0.008471472110034071,
240
+ "loss": 0.6012,
241
+ "step": 1500
242
+ },
243
+ {
244
+ "epoch": 5.597014925373134,
245
+ "eval_loss": 0.6183524131774902,
246
+ "eval_matthews_correlation": 0.0,
247
+ "eval_runtime": 0.4218,
248
+ "eval_samples_per_second": 2472.844,
249
+ "eval_steps_per_second": 7.113,
250
+ "step": 1500
251
+ },
252
+ {
253
+ "epoch": 5.970149253731344,
254
+ "grad_norm": 0.14359234273433685,
255
+ "learning_rate": 0.008395013093408902,
256
+ "loss": 0.6142,
257
+ "step": 1600
258
+ },
259
+ {
260
+ "epoch": 5.970149253731344,
261
+ "eval_loss": 0.6183721423149109,
262
+ "eval_matthews_correlation": 0.0,
263
+ "eval_runtime": 0.4212,
264
+ "eval_samples_per_second": 2476.373,
265
+ "eval_steps_per_second": 7.123,
266
+ "step": 1600
267
+ },
268
+ {
269
+ "epoch": 6.343283582089552,
270
+ "grad_norm": 0.12752141058444977,
271
+ "learning_rate": 0.008313785320248571,
272
+ "loss": 0.6068,
273
+ "step": 1700
274
+ },
275
+ {
276
+ "epoch": 6.343283582089552,
277
+ "eval_loss": 0.6180772185325623,
278
+ "eval_matthews_correlation": 0.0,
279
+ "eval_runtime": 0.4259,
280
+ "eval_samples_per_second": 2448.766,
281
+ "eval_steps_per_second": 7.043,
282
+ "step": 1700
283
+ },
284
+ {
285
+ "epoch": 6.7164179104477615,
286
+ "grad_norm": 0.012222304940223694,
287
+ "learning_rate": 0.008227888239634457,
288
+ "loss": 0.6014,
289
+ "step": 1800
290
+ },
291
+ {
292
+ "epoch": 6.7164179104477615,
293
+ "eval_loss": 0.6189431548118591,
294
+ "eval_matthews_correlation": 0.0,
295
+ "eval_runtime": 0.4217,
296
+ "eval_samples_per_second": 2473.387,
297
+ "eval_steps_per_second": 7.114,
298
+ "step": 1800
299
+ },
300
+ {
301
+ "epoch": 7.08955223880597,
302
+ "grad_norm": 0.24005922675132751,
303
+ "learning_rate": 0.008137427017391348,
304
+ "loss": 0.6138,
305
+ "step": 1900
306
+ },
307
+ {
308
+ "epoch": 7.08955223880597,
309
+ "eval_loss": 0.6186363101005554,
310
+ "eval_matthews_correlation": 0.0,
311
+ "eval_runtime": 0.4261,
312
+ "eval_samples_per_second": 2447.668,
313
+ "eval_steps_per_second": 7.04,
314
+ "step": 1900
315
+ },
316
+ {
317
+ "epoch": 7.462686567164179,
318
+ "grad_norm": 0.046122558414936066,
319
+ "learning_rate": 0.008042512407330438,
320
+ "loss": 0.6127,
321
+ "step": 2000
322
+ },
323
+ {
324
+ "epoch": 7.462686567164179,
325
+ "eval_loss": 0.6181656718254089,
326
+ "eval_matthews_correlation": 0.0,
327
+ "eval_runtime": 0.4259,
328
+ "eval_samples_per_second": 2448.791,
329
+ "eval_steps_per_second": 7.044,
330
+ "step": 2000
331
+ },
332
+ {
333
+ "epoch": 7.835820895522388,
334
+ "grad_norm": 0.10897961258888245,
335
+ "learning_rate": 0.007943260615650823,
336
+ "loss": 0.6052,
337
+ "step": 2100
338
+ },
339
+ {
340
+ "epoch": 7.835820895522388,
341
+ "eval_loss": 0.618504524230957,
342
+ "eval_matthews_correlation": 0.0,
343
+ "eval_runtime": 0.4215,
344
+ "eval_samples_per_second": 2474.598,
345
+ "eval_steps_per_second": 7.118,
346
+ "step": 2100
347
+ },
348
+ {
349
+ "epoch": 8.208955223880597,
350
+ "grad_norm": 0.0767781138420105,
351
+ "learning_rate": 0.007839793158665505,
352
+ "loss": 0.6021,
353
+ "step": 2200
354
+ },
355
+ {
356
+ "epoch": 8.208955223880597,
357
+ "eval_loss": 0.6189298629760742,
358
+ "eval_matthews_correlation": 0.0,
359
+ "eval_runtime": 0.4304,
360
+ "eval_samples_per_second": 2423.202,
361
+ "eval_steps_per_second": 6.97,
362
+ "step": 2200
363
+ },
364
+ {
365
+ "epoch": 8.582089552238806,
366
+ "grad_norm": 0.012093938887119293,
367
+ "learning_rate": 0.007732236714026093,
368
+ "loss": 0.6103,
369
+ "step": 2300
370
+ },
371
+ {
372
+ "epoch": 8.582089552238806,
373
+ "eval_loss": 0.6190155148506165,
374
+ "eval_matthews_correlation": 0.0,
375
+ "eval_runtime": 0.4283,
376
+ "eval_samples_per_second": 2435.13,
377
+ "eval_steps_per_second": 7.004,
378
+ "step": 2300
379
+ },
380
+ {
381
+ "epoch": 8.955223880597014,
382
+ "grad_norm": 0.10195652395486832,
383
+ "learning_rate": 0.007620722965628375,
384
+ "loss": 0.6084,
385
+ "step": 2400
386
+ },
387
+ {
388
+ "epoch": 8.955223880597014,
389
+ "eval_loss": 0.618133008480072,
390
+ "eval_matthews_correlation": 0.0,
391
+ "eval_runtime": 0.4257,
392
+ "eval_samples_per_second": 2450.299,
393
+ "eval_steps_per_second": 7.048,
394
+ "step": 2400
395
+ },
396
+ {
397
+ "epoch": 9.328358208955224,
398
+ "grad_norm": 0.14498992264270782,
399
+ "learning_rate": 0.007505388442388603,
400
+ "loss": 0.6028,
401
+ "step": 2500
402
+ },
403
+ {
404
+ "epoch": 9.328358208955224,
405
+ "eval_loss": 0.6181071400642395,
406
+ "eval_matthews_correlation": 0.0,
407
+ "eval_runtime": 0.426,
408
+ "eval_samples_per_second": 2448.253,
409
+ "eval_steps_per_second": 7.042,
410
+ "step": 2500
411
+ },
412
+ {
413
+ "epoch": 9.701492537313433,
414
+ "grad_norm": 0.16252005100250244,
415
+ "learning_rate": 0.007386374351087919,
416
+ "loss": 0.6104,
417
+ "step": 2600
418
+ },
419
+ {
420
+ "epoch": 9.701492537313433,
421
+ "eval_loss": 0.6188204288482666,
422
+ "eval_matthews_correlation": 0.0,
423
+ "eval_runtime": 0.414,
424
+ "eval_samples_per_second": 2519.216,
425
+ "eval_steps_per_second": 7.246,
426
+ "step": 2600
427
+ },
428
+ {
429
+ "epoch": 10.074626865671641,
430
+ "grad_norm": 0.07220250368118286,
431
+ "learning_rate": 0.007263826403489559,
432
+ "loss": 0.6066,
433
+ "step": 2700
434
+ },
435
+ {
436
+ "epoch": 10.074626865671641,
437
+ "eval_loss": 0.6182201504707336,
438
+ "eval_matthews_correlation": 0.0,
439
+ "eval_runtime": 0.4238,
440
+ "eval_samples_per_second": 2461.247,
441
+ "eval_steps_per_second": 7.079,
442
+ "step": 2700
443
+ },
444
+ {
445
+ "epoch": 10.447761194029852,
446
+ "grad_norm": 0.022631853818893433,
447
+ "learning_rate": 0.007137894637940499,
448
+ "loss": 0.602,
449
+ "step": 2800
450
+ },
451
+ {
452
+ "epoch": 10.447761194029852,
453
+ "eval_loss": 0.6184264421463013,
454
+ "eval_matthews_correlation": 0.0,
455
+ "eval_runtime": 0.4255,
456
+ "eval_samples_per_second": 2451.035,
457
+ "eval_steps_per_second": 7.05,
458
+ "step": 2800
459
+ },
460
+ {
461
+ "epoch": 10.82089552238806,
462
+ "grad_norm": 0.03383394703269005,
463
+ "learning_rate": 0.007008733235675957,
464
+ "loss": 0.6063,
465
+ "step": 2900
466
+ },
467
+ {
468
+ "epoch": 10.82089552238806,
469
+ "eval_loss": 0.6181037425994873,
470
+ "eval_matthews_correlation": 0.0,
471
+ "eval_runtime": 0.4243,
472
+ "eval_samples_per_second": 2458.313,
473
+ "eval_steps_per_second": 7.071,
474
+ "step": 2900
475
+ },
476
+ {
477
+ "epoch": 11.194029850746269,
478
+ "grad_norm": 0.09707663208246231,
479
+ "learning_rate": 0.006876500332051677,
480
+ "loss": 0.6069,
481
+ "step": 3000
482
+ },
483
+ {
484
+ "epoch": 11.194029850746269,
485
+ "eval_loss": 0.6191997528076172,
486
+ "eval_matthews_correlation": 0.0,
487
+ "eval_runtime": 0.4275,
488
+ "eval_samples_per_second": 2439.676,
489
+ "eval_steps_per_second": 7.017,
490
+ "step": 3000
491
+ },
492
+ {
493
+ "epoch": 11.567164179104477,
494
+ "grad_norm": 0.051615312695503235,
495
+ "learning_rate": 0.006741357822935066,
496
+ "loss": 0.6116,
497
+ "step": 3100
498
+ },
499
+ {
500
+ "epoch": 11.567164179104477,
501
+ "eval_loss": 0.6180777549743652,
502
+ "eval_matthews_correlation": 0.0,
503
+ "eval_runtime": 0.4269,
504
+ "eval_samples_per_second": 2443.257,
505
+ "eval_steps_per_second": 7.028,
506
+ "step": 3100
507
+ },
508
+ {
509
+ "epoch": 11.940298507462687,
510
+ "grad_norm": 0.07059226930141449,
511
+ "learning_rate": 0.006603471166492263,
512
+ "loss": 0.6106,
513
+ "step": 3200
514
+ },
515
+ {
516
+ "epoch": 11.940298507462687,
517
+ "eval_loss": 0.6182488203048706,
518
+ "eval_matthews_correlation": 0.0,
519
+ "eval_runtime": 0.3997,
520
+ "eval_samples_per_second": 2609.668,
521
+ "eval_steps_per_second": 7.506,
522
+ "step": 3200
523
+ },
524
+ {
525
+ "epoch": 12.313432835820896,
526
+ "grad_norm": 0.08970490097999573,
527
+ "learning_rate": 0.006463009180613802,
528
+ "loss": 0.6014,
529
+ "step": 3300
530
+ },
531
+ {
532
+ "epoch": 12.313432835820896,
533
+ "eval_loss": 0.6201241612434387,
534
+ "eval_matthews_correlation": 0.0,
535
+ "eval_runtime": 0.4247,
536
+ "eval_samples_per_second": 2455.698,
537
+ "eval_steps_per_second": 7.063,
538
+ "step": 3300
539
+ },
540
+ {
541
+ "epoch": 12.686567164179104,
542
+ "grad_norm": 0.038416266441345215,
543
+ "learning_rate": 0.006320143836226874,
544
+ "loss": 0.6143,
545
+ "step": 3400
546
+ },
547
+ {
548
+ "epoch": 12.686567164179104,
549
+ "eval_loss": 0.6180766820907593,
550
+ "eval_matthews_correlation": 0.0,
551
+ "eval_runtime": 0.4246,
552
+ "eval_samples_per_second": 2456.156,
553
+ "eval_steps_per_second": 7.065,
554
+ "step": 3400
555
+ },
556
+ {
557
+ "epoch": 13.059701492537313,
558
+ "grad_norm": 0.10318686068058014,
559
+ "learning_rate": 0.006175050046747262,
560
+ "loss": 0.6053,
561
+ "step": 3500
562
+ },
563
+ {
564
+ "epoch": 13.059701492537313,
565
+ "eval_loss": 0.6187887191772461,
566
+ "eval_matthews_correlation": 0.0,
567
+ "eval_runtime": 0.4219,
568
+ "eval_samples_per_second": 2472.32,
569
+ "eval_steps_per_second": 7.111,
570
+ "step": 3500
571
+ },
572
+ {
573
+ "epoch": 13.432835820895523,
574
+ "grad_norm": 0.08013023436069489,
575
+ "learning_rate": 0.0060279054539287185,
576
+ "loss": 0.5995,
577
+ "step": 3600
578
+ },
579
+ {
580
+ "epoch": 13.432835820895523,
581
+ "eval_loss": 0.619225800037384,
582
+ "eval_matthews_correlation": 0.0,
583
+ "eval_runtime": 0.4284,
584
+ "eval_samples_per_second": 2434.517,
585
+ "eval_steps_per_second": 7.002,
586
+ "step": 3600
587
+ },
588
+ {
589
+ "epoch": 13.805970149253731,
590
+ "grad_norm": 0.04627981036901474,
591
+ "learning_rate": 0.005878890210371962,
592
+ "loss": 0.6185,
593
+ "step": 3700
594
+ },
595
+ {
596
+ "epoch": 13.805970149253731,
597
+ "eval_loss": 0.6181351542472839,
598
+ "eval_matthews_correlation": 0.0,
599
+ "eval_runtime": 0.4275,
600
+ "eval_samples_per_second": 2439.928,
601
+ "eval_steps_per_second": 7.018,
602
+ "step": 3700
603
+ },
604
+ {
605
+ "epoch": 14.17910447761194,
606
+ "grad_norm": 0.07548993080854416,
607
+ "learning_rate": 0.0057281867589596,
608
+ "loss": 0.6032,
609
+ "step": 3800
610
+ },
611
+ {
612
+ "epoch": 14.17910447761194,
613
+ "eval_loss": 0.6188670992851257,
614
+ "eval_matthews_correlation": 0.0,
615
+ "eval_runtime": 0.427,
616
+ "eval_samples_per_second": 2442.391,
617
+ "eval_steps_per_second": 7.025,
618
+ "step": 3800
619
+ },
620
+ {
621
+ "epoch": 14.552238805970148,
622
+ "grad_norm": 0.037377819418907166,
623
+ "learning_rate": 0.005575979609486994,
624
+ "loss": 0.6177,
625
+ "step": 3900
626
+ },
627
+ {
628
+ "epoch": 14.552238805970148,
629
+ "eval_loss": 0.6180774569511414,
630
+ "eval_matthews_correlation": 0.0,
631
+ "eval_runtime": 0.428,
632
+ "eval_samples_per_second": 2437.098,
633
+ "eval_steps_per_second": 7.01,
634
+ "step": 3900
635
+ },
636
+ {
637
+ "epoch": 14.925373134328359,
638
+ "grad_norm": 0.010882982052862644,
639
+ "learning_rate": 0.005422455112762556,
640
+ "loss": 0.605,
641
+ "step": 4000
642
+ },
643
+ {
644
+ "epoch": 14.925373134328359,
645
+ "eval_loss": 0.6190503835678101,
646
+ "eval_matthews_correlation": 0.0,
647
+ "eval_runtime": 0.4278,
648
+ "eval_samples_per_second": 2437.827,
649
+ "eval_steps_per_second": 7.012,
650
+ "step": 4000
651
+ },
652
+ {
653
+ "epoch": 15.298507462686567,
654
+ "grad_norm": 0.17184129357337952,
655
+ "learning_rate": 0.005267801232454065,
656
+ "loss": 0.6104,
657
+ "step": 4100
658
+ },
659
+ {
660
+ "epoch": 15.298507462686567,
661
+ "eval_loss": 0.6180769801139832,
662
+ "eval_matthews_correlation": 0.0,
663
+ "eval_runtime": 0.4282,
664
+ "eval_samples_per_second": 2435.989,
665
+ "eval_steps_per_second": 7.007,
666
+ "step": 4100
667
+ },
668
+ {
669
+ "epoch": 15.671641791044776,
670
+ "grad_norm": 0.16398368775844574,
671
+ "learning_rate": 0.005112207314960288,
672
+ "loss": 0.637,
673
+ "step": 4200
674
+ },
675
+ {
676
+ "epoch": 15.671641791044776,
677
+ "eval_loss": 0.618771493434906,
678
+ "eval_matthews_correlation": 0.0,
679
+ "eval_runtime": 0.427,
680
+ "eval_samples_per_second": 2442.389,
681
+ "eval_steps_per_second": 7.025,
682
+ "step": 4200
683
+ },
684
+ {
685
+ "epoch": 16.044776119402986,
686
+ "grad_norm": 0.09540565311908722,
687
+ "learning_rate": 0.004955863857589732,
688
+ "loss": 0.5997,
689
+ "step": 4300
690
+ },
691
+ {
692
+ "epoch": 16.044776119402986,
693
+ "eval_loss": 0.6193976998329163,
694
+ "eval_matthews_correlation": 0.0,
695
+ "eval_runtime": 0.4275,
696
+ "eval_samples_per_second": 2439.988,
697
+ "eval_steps_per_second": 7.018,
698
+ "step": 4300
699
+ },
700
+ {
701
+ "epoch": 16.417910447761194,
702
+ "grad_norm": 0.057080455124378204,
703
+ "learning_rate": 0.004798962275330275,
704
+ "loss": 0.6051,
705
+ "step": 4400
706
+ },
707
+ {
708
+ "epoch": 16.417910447761194,
709
+ "eval_loss": 0.6188737750053406,
710
+ "eval_matthews_correlation": 0.0,
711
+ "eval_runtime": 1.4148,
712
+ "eval_samples_per_second": 737.205,
713
+ "eval_steps_per_second": 2.12,
714
+ "step": 4400
715
+ },
716
+ {
717
+ "epoch": 16.791044776119403,
718
+ "grad_norm": 0.11938146501779556,
719
+ "learning_rate": 0.004641694666495282,
720
+ "loss": 0.61,
721
+ "step": 4500
722
+ },
723
+ {
724
+ "epoch": 16.791044776119403,
725
+ "eval_loss": 0.6181567907333374,
726
+ "eval_matthews_correlation": 0.0,
727
+ "eval_runtime": 0.4265,
728
+ "eval_samples_per_second": 2445.718,
729
+ "eval_steps_per_second": 7.035,
730
+ "step": 4500
731
+ },
732
+ {
733
+ "epoch": 17.16417910447761,
734
+ "grad_norm": 0.06280449777841568,
735
+ "learning_rate": 0.004484253577533101,
736
+ "loss": 0.6067,
737
+ "step": 4600
738
+ },
739
+ {
740
+ "epoch": 17.16417910447761,
741
+ "eval_loss": 0.6186078786849976,
742
+ "eval_matthews_correlation": 0.0,
743
+ "eval_runtime": 0.427,
744
+ "eval_samples_per_second": 2442.396,
745
+ "eval_steps_per_second": 7.025,
746
+ "step": 4600
747
+ },
748
+ {
749
+ "epoch": 17.53731343283582,
750
+ "grad_norm": 0.04914025962352753,
751
+ "learning_rate": 0.004326831767287894,
752
+ "loss": 0.6025,
753
+ "step": 4700
754
+ },
755
+ {
756
+ "epoch": 17.53731343283582,
757
+ "eval_loss": 0.6194823384284973,
758
+ "eval_matthews_correlation": 0.0,
759
+ "eval_runtime": 0.4279,
760
+ "eval_samples_per_second": 2437.228,
761
+ "eval_steps_per_second": 7.01,
762
+ "step": 4700
763
+ },
764
+ {
765
+ "epoch": 17.91044776119403,
766
+ "grad_norm": 0.05280961096286774,
767
+ "learning_rate": 0.00416962197100044,
768
+ "loss": 0.6198,
769
+ "step": 4800
770
+ },
771
+ {
772
+ "epoch": 17.91044776119403,
773
+ "eval_loss": 0.6180894374847412,
774
+ "eval_matthews_correlation": 0.0,
775
+ "eval_runtime": 0.4304,
776
+ "eval_samples_per_second": 2423.377,
777
+ "eval_steps_per_second": 6.97,
778
+ "step": 4800
779
+ },
780
+ {
781
+ "epoch": 18.28358208955224,
782
+ "grad_norm": 0.12214235216379166,
783
+ "learning_rate": 0.004012816664337817,
784
+ "loss": 0.6092,
785
+ "step": 4900
786
+ },
787
+ {
788
+ "epoch": 18.28358208955224,
789
+ "eval_loss": 0.6181216239929199,
790
+ "eval_matthews_correlation": 0.0,
791
+ "eval_runtime": 0.4239,
792
+ "eval_samples_per_second": 2460.505,
793
+ "eval_steps_per_second": 7.077,
794
+ "step": 4900
795
+ },
796
+ {
797
+ "epoch": 18.65671641791045,
798
+ "grad_norm": 0.16166962683200836,
799
+ "learning_rate": 0.0038566078277409025,
800
+ "loss": 0.6072,
801
+ "step": 5000
802
+ },
803
+ {
804
+ "epoch": 18.65671641791045,
805
+ "eval_loss": 0.6187422275543213,
806
+ "eval_matthews_correlation": 0.0,
807
+ "eval_runtime": 0.4252,
808
+ "eval_samples_per_second": 2452.844,
809
+ "eval_steps_per_second": 7.055,
810
+ "step": 5000
811
+ },
812
+ {
813
+ "epoch": 19.029850746268657,
814
+ "grad_norm": 0.23016615211963654,
815
+ "learning_rate": 0.003701186711378183,
816
+ "loss": 0.6004,
817
+ "step": 5100
818
+ },
819
+ {
820
+ "epoch": 19.029850746268657,
821
+ "eval_loss": 0.6192060708999634,
822
+ "eval_matthews_correlation": 0.0,
823
+ "eval_runtime": 0.4288,
824
+ "eval_samples_per_second": 2432.269,
825
+ "eval_steps_per_second": 6.996,
826
+ "step": 5100
827
+ },
828
+ {
829
+ "epoch": 19.402985074626866,
830
+ "grad_norm": 0.0806824266910553,
831
+ "learning_rate": 0.003546743600993655,
832
+ "loss": 0.5981,
833
+ "step": 5200
834
+ },
835
+ {
836
+ "epoch": 19.402985074626866,
837
+ "eval_loss": 0.6192720532417297,
838
+ "eval_matthews_correlation": 0.0,
839
+ "eval_runtime": 0.4228,
840
+ "eval_samples_per_second": 2466.987,
841
+ "eval_steps_per_second": 7.096,
842
+ "step": 5200
843
+ },
844
+ {
845
+ "epoch": 19.776119402985074,
846
+ "grad_norm": 0.007001778110861778,
847
+ "learning_rate": 0.0033934675849354953,
848
+ "loss": 0.6196,
849
+ "step": 5300
850
+ },
851
+ {
852
+ "epoch": 19.776119402985074,
853
+ "eval_loss": 0.6180797815322876,
854
+ "eval_matthews_correlation": 0.0,
855
+ "eval_runtime": 0.428,
856
+ "eval_samples_per_second": 2436.791,
857
+ "eval_steps_per_second": 7.009,
858
+ "step": 5300
859
+ },
860
+ {
861
+ "epoch": 20.149253731343283,
862
+ "grad_norm": 0.15153659880161285,
863
+ "learning_rate": 0.0032415463226507265,
864
+ "loss": 0.605,
865
+ "step": 5400
866
+ },
867
+ {
868
+ "epoch": 20.149253731343283,
869
+ "eval_loss": 0.618575930595398,
870
+ "eval_matthews_correlation": 0.0,
871
+ "eval_runtime": 0.4015,
872
+ "eval_samples_per_second": 2597.772,
873
+ "eval_steps_per_second": 7.472,
874
+ "step": 5400
875
+ },
876
+ {
877
+ "epoch": 20.52238805970149,
878
+ "grad_norm": 0.030688393861055374,
879
+ "learning_rate": 0.0030911658149293288,
880
+ "loss": 0.6121,
881
+ "step": 5500
882
+ },
883
+ {
884
+ "epoch": 20.52238805970149,
885
+ "eval_loss": 0.6181564331054688,
886
+ "eval_matthews_correlation": 0.0,
887
+ "eval_runtime": 0.4236,
888
+ "eval_samples_per_second": 2462.078,
889
+ "eval_steps_per_second": 7.082,
890
+ "step": 5500
891
+ },
892
+ {
893
+ "epoch": 20.895522388059703,
894
+ "grad_norm": 0.15170888602733612,
895
+ "learning_rate": 0.0029425101761790745,
896
+ "loss": 0.603,
897
+ "step": 5600
898
+ },
899
+ {
900
+ "epoch": 20.895522388059703,
901
+ "eval_loss": 0.6185344457626343,
902
+ "eval_matthews_correlation": 0.0,
903
+ "eval_runtime": 0.4255,
904
+ "eval_samples_per_second": 2451.138,
905
+ "eval_steps_per_second": 7.05,
906
+ "step": 5600
907
+ },
908
+ {
909
+ "epoch": 21.26865671641791,
910
+ "grad_norm": 0.2022939920425415,
911
+ "learning_rate": 0.002795761409009909,
912
+ "loss": 0.6104,
913
+ "step": 5700
914
+ },
915
+ {
916
+ "epoch": 21.26865671641791,
917
+ "eval_loss": 0.6182953715324402,
918
+ "eval_matthews_correlation": 0.0,
919
+ "eval_runtime": 0.4293,
920
+ "eval_samples_per_second": 2429.467,
921
+ "eval_steps_per_second": 6.988,
922
+ "step": 5700
923
+ },
924
+ {
925
+ "epoch": 21.64179104477612,
926
+ "grad_norm": 0.019533611834049225,
927
+ "learning_rate": 0.0026510991814038634,
928
+ "loss": 0.6059,
929
+ "step": 5800
930
+ },
931
+ {
932
+ "epoch": 21.64179104477612,
933
+ "eval_loss": 0.6185408234596252,
934
+ "eval_matthews_correlation": 0.0,
935
+ "eval_runtime": 0.4215,
936
+ "eval_samples_per_second": 2474.539,
937
+ "eval_steps_per_second": 7.118,
938
+ "step": 5800
939
+ },
940
+ {
941
+ "epoch": 22.01492537313433,
942
+ "grad_norm": 0.014536094851791859,
943
+ "learning_rate": 0.002508700606743287,
944
+ "loss": 0.6067,
945
+ "step": 5900
946
+ },
947
+ {
948
+ "epoch": 22.01492537313433,
949
+ "eval_loss": 0.618817150592804,
950
+ "eval_matthews_correlation": 0.0,
951
+ "eval_runtime": 0.4272,
952
+ "eval_samples_per_second": 2441.747,
953
+ "eval_steps_per_second": 7.023,
954
+ "step": 5900
955
+ },
956
+ {
957
+ "epoch": 22.388059701492537,
958
+ "grad_norm": 0.03431111201643944,
959
+ "learning_rate": 0.002368740026966765,
960
+ "loss": 0.5992,
961
+ "step": 6000
962
+ },
963
+ {
964
+ "epoch": 22.388059701492537,
965
+ "eval_loss": 0.6190950870513916,
966
+ "eval_matthews_correlation": 0.0,
967
+ "eval_runtime": 0.4256,
968
+ "eval_samples_per_second": 2450.522,
969
+ "eval_steps_per_second": 7.048,
970
+ "step": 6000
971
+ },
972
+ {
973
+ "epoch": 22.761194029850746,
974
+ "grad_norm": 0.11788914352655411,
975
+ "learning_rate": 0.002231388799118156,
976
+ "loss": 0.6037,
977
+ "step": 6100
978
+ },
979
+ {
980
+ "epoch": 22.761194029850746,
981
+ "eval_loss": 0.6187638640403748,
982
+ "eval_matthews_correlation": 0.0,
983
+ "eval_runtime": 0.4282,
984
+ "eval_samples_per_second": 2435.607,
985
+ "eval_steps_per_second": 7.006,
986
+ "step": 6100
987
+ },
988
+ {
989
+ "epoch": 23.134328358208954,
990
+ "grad_norm": 0.03254074975848198,
991
+ "learning_rate": 0.002096815085550116,
992
+ "loss": 0.6222,
993
+ "step": 6200
994
+ },
995
+ {
996
+ "epoch": 23.134328358208954,
997
+ "eval_loss": 0.6181178092956543,
998
+ "eval_matthews_correlation": 0.0,
999
+ "eval_runtime": 0.4246,
1000
+ "eval_samples_per_second": 2456.419,
1001
+ "eval_steps_per_second": 7.065,
1002
+ "step": 6200
1003
+ },
1004
+ {
1005
+ "epoch": 23.507462686567163,
1006
+ "grad_norm": 0.019555965438485146,
1007
+ "learning_rate": 0.001965183648038961,
1008
+ "loss": 0.6021,
1009
+ "step": 6300
1010
+ },
1011
+ {
1012
+ "epoch": 23.507462686567163,
1013
+ "eval_loss": 0.6185086369514465,
1014
+ "eval_matthews_correlation": 0.0,
1015
+ "eval_runtime": 0.4245,
1016
+ "eval_samples_per_second": 2456.833,
1017
+ "eval_steps_per_second": 7.067,
1018
+ "step": 6300
1019
+ },
1020
+ {
1021
+ "epoch": 23.880597014925375,
1022
+ "grad_norm": 0.15938392281532288,
1023
+ "learning_rate": 0.001836655646062926,
1024
+ "loss": 0.6107,
1025
+ "step": 6400
1026
+ },
1027
+ {
1028
+ "epoch": 23.880597014925375,
1029
+ "eval_loss": 0.6182588934898376,
1030
+ "eval_matthews_correlation": 0.0,
1031
+ "eval_runtime": 0.426,
1032
+ "eval_samples_per_second": 2448.279,
1033
+ "eval_steps_per_second": 7.042,
1034
+ "step": 6400
1035
+ },
1036
+ {
1037
+ "epoch": 24.253731343283583,
1038
+ "grad_norm": 0.11047554016113281,
1039
+ "learning_rate": 0.0017113884394908182,
1040
+ "loss": 0.6059,
1041
+ "step": 6500
1042
+ },
1043
+ {
1044
+ "epoch": 24.253731343283583,
1045
+ "eval_loss": 0.6184186339378357,
1046
+ "eval_matthews_correlation": 0.0,
1047
+ "eval_runtime": 0.4223,
1048
+ "eval_samples_per_second": 2469.805,
1049
+ "eval_steps_per_second": 7.104,
1050
+ "step": 6500
1051
+ },
1052
+ {
1053
+ "epoch": 24.62686567164179,
1054
+ "grad_norm": 0.19051669538021088,
1055
+ "learning_rate": 0.0015895353959226057,
1056
+ "loss": 0.6227,
1057
+ "step": 6600
1058
+ },
1059
+ {
1060
+ "epoch": 24.62686567164179,
1061
+ "eval_loss": 0.6181543469429016,
1062
+ "eval_matthews_correlation": 0.0,
1063
+ "eval_runtime": 0.4244,
1064
+ "eval_samples_per_second": 2457.726,
1065
+ "eval_steps_per_second": 7.069,
1066
+ "step": 6600
1067
+ },
1068
+ {
1069
+ "epoch": 25.0,
1070
+ "grad_norm": 0.21268145740032196,
1071
+ "learning_rate": 0.0014712457029178454,
1072
+ "loss": 0.5919,
1073
+ "step": 6700
1074
+ },
1075
+ {
1076
+ "epoch": 25.0,
1077
+ "eval_loss": 0.618682861328125,
1078
+ "eval_matthews_correlation": 0.0,
1079
+ "eval_runtime": 0.4275,
1080
+ "eval_samples_per_second": 2439.493,
1081
+ "eval_steps_per_second": 7.017,
1082
+ "step": 6700
1083
+ },
1084
+ {
1085
+ "epoch": 25.37313432835821,
1086
+ "grad_norm": 0.057016950100660324,
1087
+ "learning_rate": 0.001356664185341829,
1088
+ "loss": 0.6053,
1089
+ "step": 6800
1090
+ },
1091
+ {
1092
+ "epoch": 25.37313432835821,
1093
+ "eval_loss": 0.6188849806785583,
1094
+ "eval_matthews_correlation": 0.0,
1095
+ "eval_runtime": 0.426,
1096
+ "eval_samples_per_second": 2448.377,
1097
+ "eval_steps_per_second": 7.042,
1098
+ "step": 6800
1099
+ },
1100
+ {
1101
+ "epoch": 25.746268656716417,
1102
+ "grad_norm": 0.19724732637405396,
1103
+ "learning_rate": 0.0012459311280530751,
1104
+ "loss": 0.616,
1105
+ "step": 6900
1106
+ },
1107
+ {
1108
+ "epoch": 25.746268656716417,
1109
+ "eval_loss": 0.6183460354804993,
1110
+ "eval_matthews_correlation": 0.0,
1111
+ "eval_runtime": 0.4235,
1112
+ "eval_samples_per_second": 2462.79,
1113
+ "eval_steps_per_second": 7.084,
1114
+ "step": 6900
1115
+ },
1116
+ {
1117
+ "epoch": 26.119402985074625,
1118
+ "grad_norm": 0.017139364033937454,
1119
+ "learning_rate": 0.0011391821041492733,
1120
+ "loss": 0.5993,
1121
+ "step": 7000
1122
+ },
1123
+ {
1124
+ "epoch": 26.119402985074625,
1125
+ "eval_loss": 0.6186425089836121,
1126
+ "eval_matthews_correlation": 0.0,
1127
+ "eval_runtime": 0.4225,
1128
+ "eval_samples_per_second": 2468.694,
1129
+ "eval_steps_per_second": 7.101,
1130
+ "step": 7000
1131
+ },
1132
+ {
1133
+ "epoch": 26.492537313432837,
1134
+ "grad_norm": 0.16466005146503448,
1135
+ "learning_rate": 0.001036547808981928,
1136
+ "loss": 0.6017,
1137
+ "step": 7100
1138
+ },
1139
+ {
1140
+ "epoch": 26.492537313432837,
1141
+ "eval_loss": 0.6189157366752625,
1142
+ "eval_matthews_correlation": 0.0,
1143
+ "eval_runtime": 0.4295,
1144
+ "eval_samples_per_second": 2428.64,
1145
+ "eval_steps_per_second": 6.986,
1146
+ "step": 7100
1147
+ },
1148
+ {
1149
+ "epoch": 26.865671641791046,
1150
+ "grad_norm": 0.11627840995788574,
1151
+ "learning_rate": 0.0009381539001429589,
1152
+ "loss": 0.6066,
1153
+ "step": 7200
1154
+ },
1155
+ {
1156
+ "epoch": 26.865671641791046,
1157
+ "eval_loss": 0.6186762452125549,
1158
+ "eval_matthews_correlation": 0.0,
1159
+ "eval_runtime": 0.4269,
1160
+ "eval_samples_per_second": 2443.358,
1161
+ "eval_steps_per_second": 7.028,
1162
+ "step": 7200
1163
+ },
1164
+ {
1165
+ "epoch": 27.238805970149254,
1166
+ "grad_norm": 0.025689436122775078,
1167
+ "learning_rate": 0.000844120843619142,
1168
+ "loss": 0.6188,
1169
+ "step": 7300
1170
+ },
1171
+ {
1172
+ "epoch": 27.238805970149254,
1173
+ "eval_loss": 0.6182805299758911,
1174
+ "eval_matthews_correlation": 0.0,
1175
+ "eval_runtime": 0.4281,
1176
+ "eval_samples_per_second": 2436.449,
1177
+ "eval_steps_per_second": 7.008,
1178
+ "step": 7300
1179
+ },
1180
+ {
1181
+ "epoch": 27.611940298507463,
1182
+ "grad_norm": 0.06709582358598709,
1183
+ "learning_rate": 0.0007545637663027499,
1184
+ "loss": 0.6024,
1185
+ "step": 7400
1186
+ },
1187
+ {
1188
+ "epoch": 27.611940298507463,
1189
+ "eval_loss": 0.6183831691741943,
1190
+ "eval_matthews_correlation": 0.0,
1191
+ "eval_runtime": 0.4261,
1192
+ "eval_samples_per_second": 2447.942,
1193
+ "eval_steps_per_second": 7.041,
1194
+ "step": 7400
1195
+ },
1196
+ {
1197
+ "epoch": 27.98507462686567,
1198
+ "grad_norm": 0.15548723936080933,
1199
+ "learning_rate": 0.0006695923150389905,
1200
+ "loss": 0.6074,
1201
+ "step": 7500
1202
+ },
1203
+ {
1204
+ "epoch": 27.98507462686567,
1205
+ "eval_loss": 0.6184422373771667,
1206
+ "eval_matthews_correlation": 0.0,
1207
+ "eval_runtime": 0.3918,
1208
+ "eval_samples_per_second": 2661.921,
1209
+ "eval_steps_per_second": 7.657,
1210
+ "step": 7500
1211
+ },
1212
+ {
1213
+ "epoch": 28.35820895522388,
1214
+ "grad_norm": 0.11029838770627975,
1215
+ "learning_rate": 0.0005893105223827713,
1216
+ "loss": 0.606,
1217
+ "step": 7600
1218
+ },
1219
+ {
1220
+ "epoch": 28.35820895522388,
1221
+ "eval_loss": 0.6184215545654297,
1222
+ "eval_matthews_correlation": 0.0,
1223
+ "eval_runtime": 0.4213,
1224
+ "eval_samples_per_second": 2475.871,
1225
+ "eval_steps_per_second": 7.121,
1226
+ "step": 7600
1227
+ },
1228
+ {
1229
+ "epoch": 28.73134328358209,
1230
+ "grad_norm": 0.02234813943505287,
1231
+ "learning_rate": 0.000513816679229194,
1232
+ "loss": 0.6088,
1233
+ "step": 7700
1234
+ },
1235
+ {
1236
+ "epoch": 28.73134328358209,
1237
+ "eval_loss": 0.6184214949607849,
1238
+ "eval_matthews_correlation": 0.0,
1239
+ "eval_runtime": 0.4289,
1240
+ "eval_samples_per_second": 2431.751,
1241
+ "eval_steps_per_second": 6.994,
1242
+ "step": 7700
1243
+ },
1244
+ {
1245
+ "epoch": 29.104477611940297,
1246
+ "grad_norm": 0.06740151345729828,
1247
+ "learning_rate": 0.00044320321447369003,
1248
+ "loss": 0.6091,
1249
+ "step": 7800
1250
+ },
1251
+ {
1252
+ "epoch": 29.104477611940297,
1253
+ "eval_loss": 0.6184511780738831,
1254
+ "eval_matthews_correlation": 0.0,
1255
+ "eval_runtime": 0.429,
1256
+ "eval_samples_per_second": 2431.159,
1257
+ "eval_steps_per_second": 6.993,
1258
+ "step": 7800
1259
+ },
1260
+ {
1261
+ "epoch": 29.47761194029851,
1262
+ "grad_norm": 0.1996515542268753,
1263
+ "learning_rate": 0.00037755658184913206,
1264
+ "loss": 0.6042,
1265
+ "step": 7900
1266
+ },
1267
+ {
1268
+ "epoch": 29.47761194029851,
1269
+ "eval_loss": 0.6184438467025757,
1270
+ "eval_matthews_correlation": 0.0,
1271
+ "eval_runtime": 0.4206,
1272
+ "eval_samples_per_second": 2480.062,
1273
+ "eval_steps_per_second": 7.133,
1274
+ "step": 7900
1275
+ },
1276
+ {
1277
+ "epoch": 29.850746268656717,
1278
+ "grad_norm": 0.020066693425178528,
1279
+ "learning_rate": 0.00031695715407849663,
1280
+ "loss": 0.6022,
1281
+ "step": 8000
1282
+ },
1283
+ {
1284
+ "epoch": 29.850746268656717,
1285
+ "eval_loss": 0.6184985041618347,
1286
+ "eval_matthews_correlation": 0.0,
1287
+ "eval_runtime": 0.4229,
1288
+ "eval_samples_per_second": 2466.537,
1289
+ "eval_steps_per_second": 7.095,
1290
+ "step": 8000
1291
+ },
1292
+ {
1293
+ "epoch": 30.223880597014926,
1294
+ "grad_norm": 0.11092804372310638,
1295
+ "learning_rate": 0.00026147912447263273,
1296
+ "loss": 0.6168,
1297
+ "step": 8100
1298
+ },
1299
+ {
1300
+ "epoch": 30.223880597014926,
1301
+ "eval_loss": 0.6184263229370117,
1302
+ "eval_matthews_correlation": 0.0,
1303
+ "eval_runtime": 0.424,
1304
+ "eval_samples_per_second": 2459.842,
1305
+ "eval_steps_per_second": 7.075,
1306
+ "step": 8100
1307
+ },
1308
+ {
1309
+ "epoch": 30.597014925373134,
1310
+ "grad_norm": 0.06512635201215744,
1311
+ "learning_rate": 0.00021119041609364567,
1312
+ "loss": 0.6047,
1313
+ "step": 8200
1314
+ },
1315
+ {
1316
+ "epoch": 30.597014925373134,
1317
+ "eval_loss": 0.6184566617012024,
1318
+ "eval_matthews_correlation": 0.0,
1319
+ "eval_runtime": 0.4256,
1320
+ "eval_samples_per_second": 2450.699,
1321
+ "eval_steps_per_second": 7.049,
1322
+ "step": 8200
1323
+ },
1324
+ {
1325
+ "epoch": 30.970149253731343,
1326
+ "grad_norm": 0.19852015376091003,
1327
+ "learning_rate": 0.00016615259859508396,
1328
+ "loss": 0.6074,
1329
+ "step": 8300
1330
+ },
1331
+ {
1332
+ "epoch": 30.970149253731343,
1333
+ "eval_loss": 0.6184257864952087,
1334
+ "eval_matthews_correlation": 0.0,
1335
+ "eval_runtime": 0.4249,
1336
+ "eval_samples_per_second": 2454.848,
1337
+ "eval_steps_per_second": 7.061,
1338
+ "step": 8300
1339
+ },
1340
+ {
1341
+ "epoch": 31.34328358208955,
1342
+ "grad_norm": 0.11174537986516953,
1343
+ "learning_rate": 0.00012642081284075857,
1344
+ "loss": 0.6022,
1345
+ "step": 8400
1346
+ },
1347
+ {
1348
+ "epoch": 31.34328358208955,
1349
+ "eval_loss": 0.618462860584259,
1350
+ "eval_matthews_correlation": 0.0,
1351
+ "eval_runtime": 0.4246,
1352
+ "eval_samples_per_second": 2456.502,
1353
+ "eval_steps_per_second": 7.066,
1354
+ "step": 8400
1355
+ },
1356
+ {
1357
+ "epoch": 31.71641791044776,
1358
+ "grad_norm": 0.1555200070142746,
1359
+ "learning_rate": 9.204370339448152e-05,
1360
+ "loss": 0.6144,
1361
+ "step": 8500
1362
+ },
1363
+ {
1364
+ "epoch": 31.71641791044776,
1365
+ "eval_loss": 0.6184436082839966,
1366
+ "eval_matthews_correlation": 0.0,
1367
+ "eval_runtime": 1.379,
1368
+ "eval_samples_per_second": 756.349,
1369
+ "eval_steps_per_second": 2.175,
1370
+ "step": 8500
1371
+ },
1372
+ {
1373
+ "epoch": 32.08955223880597,
1374
+ "grad_norm": 0.022661428898572922,
1375
+ "learning_rate": 6.306335896337538e-05,
1376
+ "loss": 0.6129,
1377
+ "step": 8600
1378
+ },
1379
+ {
1380
+ "epoch": 32.08955223880597,
1381
+ "eval_loss": 0.6184321641921997,
1382
+ "eval_matthews_correlation": 0.0,
1383
+ "eval_runtime": 0.4274,
1384
+ "eval_samples_per_second": 2440.463,
1385
+ "eval_steps_per_second": 7.02,
1386
+ "step": 8600
1387
+ },
1388
+ {
1389
+ "epoch": 32.46268656716418,
1390
+ "grad_norm": 0.15528610348701477,
1391
+ "learning_rate": 3.951526086767903e-05,
1392
+ "loss": 0.6028,
1393
+ "step": 8700
1394
+ },
1395
+ {
1396
+ "epoch": 32.46268656716418,
1397
+ "eval_loss": 0.6184345483779907,
1398
+ "eval_matthews_correlation": 0.0,
1399
+ "eval_runtime": 0.4242,
1400
+ "eval_samples_per_second": 2458.86,
1401
+ "eval_steps_per_second": 7.072,
1402
+ "step": 8700
1403
+ },
1404
+ {
1405
+ "epoch": 32.83582089552239,
1406
+ "grad_norm": 0.24232083559036255,
1407
+ "learning_rate": 2.142823960012463e-05,
1408
+ "loss": 0.6033,
1409
+ "step": 8800
1410
+ },
1411
+ {
1412
+ "epoch": 32.83582089552239,
1413
+ "eval_loss": 0.6184403896331787,
1414
+ "eval_matthews_correlation": 0.0,
1415
+ "eval_runtime": 0.4234,
1416
+ "eval_samples_per_second": 2463.571,
1417
+ "eval_steps_per_second": 7.086,
1418
+ "step": 8800
1419
+ },
1420
+ {
1421
+ "epoch": 33.208955223880594,
1422
+ "grad_norm": 0.022899480536580086,
1423
+ "learning_rate": 8.82443952808798e-06,
1424
+ "loss": 0.6058,
1425
+ "step": 8900
1426
+ },
1427
+ {
1428
+ "epoch": 33.208955223880594,
1429
+ "eval_loss": 0.6184419989585876,
1430
+ "eval_matthews_correlation": 0.0,
1431
+ "eval_runtime": 0.4208,
1432
+ "eval_samples_per_second": 2478.774,
1433
+ "eval_steps_per_second": 7.13,
1434
+ "step": 8900
1435
+ },
1436
+ {
1437
+ "epoch": 33.582089552238806,
1438
+ "grad_norm": 0.1096784919500351,
1439
+ "learning_rate": 1.719291781713761e-06,
1440
+ "loss": 0.596,
1441
+ "step": 9000
1442
+ },
1443
+ {
1444
+ "epoch": 33.582089552238806,
1445
+ "eval_loss": 0.618442952632904,
1446
+ "eval_matthews_correlation": 0.0,
1447
+ "eval_runtime": 0.4249,
1448
+ "eval_samples_per_second": 2454.479,
1449
+ "eval_steps_per_second": 7.06,
1450
+ "step": 9000
1451
+ },
1452
+ {
1453
+ "epoch": 33.95522388059702,
1454
+ "grad_norm": 0.1113029196858406,
1455
+ "learning_rate": 1.2149536122013637e-07,
1456
+ "loss": 0.6187,
1457
+ "step": 9100
1458
+ },
1459
+ {
1460
+ "epoch": 33.95522388059702,
1461
+ "eval_loss": 0.618442952632904,
1462
+ "eval_matthews_correlation": 0.0,
1463
+ "eval_runtime": 0.4223,
1464
+ "eval_samples_per_second": 2469.6,
1465
+ "eval_steps_per_second": 7.103,
1466
+ "step": 9100
1467
+ },
1468
+ {
1469
+ "epoch": 34.0,
1470
+ "step": 9112,
1471
+ "total_flos": 9702274347045888.0,
1472
+ "train_loss": 0.608722202394802,
1473
+ "train_runtime": 3174.1388,
1474
+ "train_samples_per_second": 91.595,
1475
+ "train_steps_per_second": 2.871
1476
+ }
1477
+ ],
1478
+ "logging_steps": 100,
1479
+ "max_steps": 9112,
1480
+ "num_input_tokens_seen": 0,
1481
+ "num_train_epochs": 34,
1482
+ "save_steps": 100,
1483
+ "stateful_callbacks": {
1484
+ "TrainerControl": {
1485
+ "args": {
1486
+ "should_epoch_stop": false,
1487
+ "should_evaluate": false,
1488
+ "should_log": false,
1489
+ "should_save": true,
1490
+ "should_training_stop": true
1491
+ },
1492
+ "attributes": {}
1493
+ }
1494
+ },
1495
+ "total_flos": 9702274347045888.0,
1496
+ "train_batch_size": 32,
1497
+ "trial_name": null,
1498
+ "trial_params": null
1499
+ }
reproduction/glue_exp/mnli/dr0.0,mlr1e-02,clr1e-02,ep=8.0t=20d22h22m35/MNLI-m.tsv ADDED
The diff for this file is too large to render. See raw diff
 
reproduction/glue_exp/mnli/dr0.0,mlr1e-02,clr1e-02,ep=8.0t=20d22h22m35/MNLI-mm.tsv ADDED
The diff for this file is too large to render. See raw diff
 
reproduction/glue_exp/mnli/dr0.0,mlr1e-02,clr1e-02,ep=8.0t=20d22h22m35/all_results.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 8.0,
3
+ "epoch_mm": 8.0,
4
+ "eval_accuracy": 0.9054508405501783,
5
+ "eval_accuracy_mm": 0.9054508405501783,
6
+ "eval_loss": 0.3119710385799408,
7
+ "eval_loss_mm": 0.3119710385799408,
8
+ "eval_runtime": 7.482,
9
+ "eval_runtime_mm": 7.4867,
10
+ "eval_samples": 9815,
11
+ "eval_samples_mm": 9832,
12
+ "eval_samples_per_second": 1311.821,
13
+ "eval_samples_per_second_mm": 1310.991,
14
+ "eval_steps_per_second": 2.673,
15
+ "eval_steps_per_second_mm": 2.671
16
+ }
reproduction/glue_exp/mnli/dr0.0,mlr1e-02,clr1e-02,ep=8.0t=20d22h22m35/eval_results.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 8.0,
3
+ "epoch_mm": 8.0,
4
+ "eval_accuracy": 0.9054508405501783,
5
+ "eval_accuracy_mm": 0.9054508405501783,
6
+ "eval_loss": 0.3119710385799408,
7
+ "eval_loss_mm": 0.3119710385799408,
8
+ "eval_runtime": 7.482,
9
+ "eval_runtime_mm": 7.4867,
10
+ "eval_samples": 9815,
11
+ "eval_samples_mm": 9832,
12
+ "eval_samples_per_second": 1311.821,
13
+ "eval_samples_per_second_mm": 1310.991,
14
+ "eval_steps_per_second": 2.673,
15
+ "eval_steps_per_second_mm": 2.671
16
+ }
reproduction/glue_exp/mnli/dr0.0,mlr1e-02,clr1e-02,ep=8.0t=20d22h22m35/ft/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[MASK]": 128000
3
+ }
reproduction/glue_exp/mnli/dr0.0,mlr1e-02,clr1e-02,ep=8.0t=20d22h22m35/ft/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[CLS]",
3
+ "cls_token": "[CLS]",
4
+ "eos_token": "[SEP]",
5
+ "mask_token": "[MASK]",
6
+ "pad_token": "[PAD]",
7
+ "sep_token": "[SEP]",
8
+ "unk_token": {
9
+ "content": "[UNK]",
10
+ "lstrip": false,
11
+ "normalized": true,
12
+ "rstrip": false,
13
+ "single_word": false
14
+ }
15
+ }
reproduction/glue_exp/mnli/dr0.0,mlr1e-02,clr1e-02,ep=8.0t=20d22h22m35/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
reproduction/glue_exp/mnli/dr0.0,mlr1e-02,clr1e-02,ep=8.0t=20d22h22m35/ft/tokenizer_config.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[UNK]",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "128000": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "[CLS]",
45
+ "clean_up_tokenization_spaces": false,
46
+ "cls_token": "[CLS]",
47
+ "do_lower_case": false,
48
+ "eos_token": "[SEP]",
49
+ "extra_special_tokens": {},
50
+ "mask_token": "[MASK]",
51
+ "model_max_length": 512,
52
+ "pad_token": "[PAD]",
53
+ "padding_side": "right",
54
+ "sep_token": "[SEP]",
55
+ "sp_model_kwargs": {},
56
+ "split_by_punct": false,
57
+ "tokenizer_class": "DebertaV2Tokenizer",
58
+ "unk_token": "[UNK]",
59
+ "vocab_type": "spm"
60
+ }
reproduction/glue_exp/mnli/dr0.0,mlr1e-02,clr1e-02,ep=8.0t=20d22h22m35/ft2/README.md ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: microsoft/deberta-v3-base
3
+ library_name: peft
4
+ tags:
5
+ - base_model:adapter:microsoft/deberta-v3-base
6
+ - transformers
7
+ ---
8
+
9
+ # Model Card for Model ID
10
+
11
+ <!-- Provide a quick summary of what the model is/does. -->
12
+
13
+
14
+
15
+ ## Model Details
16
+
17
+ ### Model Description
18
+
19
+ <!-- Provide a longer summary of what this model is. -->
20
+
21
+
22
+
23
+ - **Developed by:** [More Information Needed]
24
+ - **Funded by [optional]:** [More Information Needed]
25
+ - **Shared by [optional]:** [More Information Needed]
26
+ - **Model type:** [More Information Needed]
27
+ - **Language(s) (NLP):** [More Information Needed]
28
+ - **License:** [More Information Needed]
29
+ - **Finetuned from model [optional]:** [More Information Needed]
30
+
31
+ ### Model Sources [optional]
32
+
33
+ <!-- Provide the basic links for the model. -->
34
+
35
+ - **Repository:** [More Information Needed]
36
+ - **Paper [optional]:** [More Information Needed]
37
+ - **Demo [optional]:** [More Information Needed]
38
+
39
+ ## Uses
40
+
41
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
42
+
43
+ ### Direct Use
44
+
45
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
46
+
47
+ [More Information Needed]
48
+
49
+ ### Downstream Use [optional]
50
+
51
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
52
+
53
+ [More Information Needed]
54
+
55
+ ### Out-of-Scope Use
56
+
57
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
58
+
59
+ [More Information Needed]
60
+
61
+ ## Bias, Risks, and Limitations
62
+
63
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
64
+
65
+ [More Information Needed]
66
+
67
+ ### Recommendations
68
+
69
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
70
+
71
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
72
+
73
+ ## How to Get Started with the Model
74
+
75
+ Use the code below to get started with the model.
76
+
77
+ [More Information Needed]
78
+
79
+ ## Training Details
80
+
81
+ ### Training Data
82
+
83
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
84
+
85
+ [More Information Needed]
86
+
87
+ ### Training Procedure
88
+
89
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
90
+
91
+ #### Preprocessing [optional]
92
+
93
+ [More Information Needed]
94
+
95
+
96
+ #### Training Hyperparameters
97
+
98
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
99
+
100
+ #### Speeds, Sizes, Times [optional]
101
+
102
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
103
+
104
+ [More Information Needed]
105
+
106
+ ## Evaluation
107
+
108
+ <!-- This section describes the evaluation protocols and provides the results. -->
109
+
110
+ ### Testing Data, Factors & Metrics
111
+
112
+ #### Testing Data
113
+
114
+ <!-- This should link to a Dataset Card if possible. -->
115
+
116
+ [More Information Needed]
117
+
118
+ #### Factors
119
+
120
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
121
+
122
+ [More Information Needed]
123
+
124
+ #### Metrics
125
+
126
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
127
+
128
+ [More Information Needed]
129
+
130
+ ### Results
131
+
132
+ [More Information Needed]
133
+
134
+ #### Summary
135
+
136
+
137
+
138
+ ## Model Examination [optional]
139
+
140
+ <!-- Relevant interpretability work for the model goes here -->
141
+
142
+ [More Information Needed]
143
+
144
+ ## Environmental Impact
145
+
146
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
147
+
148
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
149
+
150
+ - **Hardware Type:** [More Information Needed]
151
+ - **Hours used:** [More Information Needed]
152
+ - **Cloud Provider:** [More Information Needed]
153
+ - **Compute Region:** [More Information Needed]
154
+ - **Carbon Emitted:** [More Information Needed]
155
+
156
+ ## Technical Specifications [optional]
157
+
158
+ ### Model Architecture and Objective
159
+
160
+ [More Information Needed]
161
+
162
+ ### Compute Infrastructure
163
+
164
+ [More Information Needed]
165
+
166
+ #### Hardware
167
+
168
+ [More Information Needed]
169
+
170
+ #### Software
171
+
172
+ [More Information Needed]
173
+
174
+ ## Citation [optional]
175
+
176
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
177
+
178
+ **BibTeX:**
179
+
180
+ [More Information Needed]
181
+
182
+ **APA:**
183
+
184
+ [More Information Needed]
185
+
186
+ ## Glossary [optional]
187
+
188
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
189
+
190
+ [More Information Needed]
191
+
192
+ ## More Information [optional]
193
+
194
+ [More Information Needed]
195
+
196
+ ## Model Card Authors [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Contact
201
+
202
+ [More Information Needed]
203
+ ### Framework versions
204
+
205
+ - PEFT 0.18.0
reproduction/glue_exp/mnli/dr0.0,mlr1e-02,clr1e-02,ep=8.0t=20d22h22m35/ft2/adapter_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "apply_GS": false,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "microsoft/deberta-v3-base",
5
+ "bias": "none",
6
+ "exclude_modules": null,
7
+ "inference_mode": true,
8
+ "init_weights": true,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "modules_to_save": [
12
+ "classifier",
13
+ "pooler",
14
+ "classifier",
15
+ "score"
16
+ ],
17
+ "peft_type": "HRA",
18
+ "peft_version": "0.18.0",
19
+ "r": 8,
20
+ "revision": null,
21
+ "target_modules": [
22
+ "query_proj",
23
+ "output.dense",
24
+ "value_proj",
25
+ "attention.output.dense",
26
+ "intermediate.dense",
27
+ "key_proj"
28
+ ],
29
+ "task_type": "SEQ_CLS"
30
+ }
reproduction/glue_exp/mnli/dr0.0,mlr1e-02,clr1e-02,ep=8.0t=20d22h22m35/trainer_state.json ADDED
@@ -0,0 +1,1611 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 89000,
3
+ "best_metric": 0.9054508405501783,
4
+ "best_model_checkpoint": "./glue_exp/mnli/dr0.0,mlr1e-02,clr1e-02,ep=8.0t=20d22h22m35/checkpoint-89000",
5
+ "epoch": 8.0,
6
+ "eval_steps": 1000,
7
+ "global_step": 98176,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.08148631029986962,
14
+ "grad_norm": 2.2060513496398926,
15
+ "learning_rate": 0.00999,
16
+ "loss": 0.6185,
17
+ "step": 1000
18
+ },
19
+ {
20
+ "epoch": 0.08148631029986962,
21
+ "eval_accuracy": 0.83015792154865,
22
+ "eval_loss": 0.4527137875556946,
23
+ "eval_runtime": 7.5871,
24
+ "eval_samples_per_second": 1293.637,
25
+ "eval_steps_per_second": 2.636,
26
+ "step": 1000
27
+ },
28
+ {
29
+ "epoch": 0.16297262059973924,
30
+ "grad_norm": 2.3141701221466064,
31
+ "learning_rate": 0.00999739212694888,
32
+ "loss": 0.5538,
33
+ "step": 2000
34
+ },
35
+ {
36
+ "epoch": 0.16297262059973924,
37
+ "eval_accuracy": 0.8308711156393276,
38
+ "eval_loss": 0.48633873462677,
39
+ "eval_runtime": 6.5933,
40
+ "eval_samples_per_second": 1488.627,
41
+ "eval_steps_per_second": 3.033,
42
+ "step": 2000
43
+ },
44
+ {
45
+ "epoch": 0.24445893089960888,
46
+ "grad_norm": 1.5351825952529907,
47
+ "learning_rate": 0.009989560790007823,
48
+ "loss": 0.5116,
49
+ "step": 3000
50
+ },
51
+ {
52
+ "epoch": 0.24445893089960888,
53
+ "eval_accuracy": 0.8529801324503311,
54
+ "eval_loss": 0.46232548356056213,
55
+ "eval_runtime": 8.0442,
56
+ "eval_samples_per_second": 1220.131,
57
+ "eval_steps_per_second": 2.486,
58
+ "step": 3000
59
+ },
60
+ {
61
+ "epoch": 0.3259452411994785,
62
+ "grad_norm": 1.9609521627426147,
63
+ "learning_rate": 0.009976514172178154,
64
+ "loss": 0.4814,
65
+ "step": 4000
66
+ },
67
+ {
68
+ "epoch": 0.3259452411994785,
69
+ "eval_accuracy": 0.8528782475802343,
70
+ "eval_loss": 0.44729259610176086,
71
+ "eval_runtime": 8.0863,
72
+ "eval_samples_per_second": 1213.785,
73
+ "eval_steps_per_second": 2.473,
74
+ "step": 4000
75
+ },
76
+ {
77
+ "epoch": 0.4074315514993481,
78
+ "grad_norm": 1.7375686168670654,
79
+ "learning_rate": 0.009958265910286741,
80
+ "loss": 0.4778,
81
+ "step": 5000
82
+ },
83
+ {
84
+ "epoch": 0.4074315514993481,
85
+ "eval_accuracy": 0.8511462047885889,
86
+ "eval_loss": 0.4127245545387268,
87
+ "eval_runtime": 8.0582,
88
+ "eval_samples_per_second": 1218.011,
89
+ "eval_steps_per_second": 2.482,
90
+ "step": 5000
91
+ },
92
+ {
93
+ "epoch": 0.48891786179921776,
94
+ "grad_norm": 1.112365484237671,
95
+ "learning_rate": 0.009934835078118927,
96
+ "loss": 0.462,
97
+ "step": 6000
98
+ },
99
+ {
100
+ "epoch": 0.48891786179921776,
101
+ "eval_accuracy": 0.8568517575140092,
102
+ "eval_loss": 0.40465885400772095,
103
+ "eval_runtime": 8.0396,
104
+ "eval_samples_per_second": 1220.826,
105
+ "eval_steps_per_second": 2.488,
106
+ "step": 6000
107
+ },
108
+ {
109
+ "epoch": 0.5704041720990873,
110
+ "grad_norm": 1.2553290128707886,
111
+ "learning_rate": 0.009906246166481895,
112
+ "loss": 0.4442,
113
+ "step": 7000
114
+ },
115
+ {
116
+ "epoch": 0.5704041720990873,
117
+ "eval_accuracy": 0.8553234844625573,
118
+ "eval_loss": 0.39058223366737366,
119
+ "eval_runtime": 8.063,
120
+ "eval_samples_per_second": 1217.286,
121
+ "eval_steps_per_second": 2.48,
122
+ "step": 7000
123
+ },
124
+ {
125
+ "epoch": 0.651890482398957,
126
+ "grad_norm": 0.8650873303413391,
127
+ "learning_rate": 0.009872529057605913,
128
+ "loss": 0.447,
129
+ "step": 8000
130
+ },
131
+ {
132
+ "epoch": 0.651890482398957,
133
+ "eval_accuracy": 0.8709118695873663,
134
+ "eval_loss": 0.35326236486434937,
135
+ "eval_runtime": 8.0589,
136
+ "eval_samples_per_second": 1217.909,
137
+ "eval_steps_per_second": 2.482,
138
+ "step": 8000
139
+ },
140
+ {
141
+ "epoch": 0.7333767926988266,
142
+ "grad_norm": 0.7614707946777344,
143
+ "learning_rate": 0.009833718993910296,
144
+ "loss": 0.4425,
145
+ "step": 9000
146
+ },
147
+ {
148
+ "epoch": 0.7333767926988266,
149
+ "eval_accuracy": 0.8624554253693326,
150
+ "eval_loss": 0.4045211970806122,
151
+ "eval_runtime": 7.9662,
152
+ "eval_samples_per_second": 1232.085,
153
+ "eval_steps_per_second": 2.511,
154
+ "step": 9000
155
+ },
156
+ {
157
+ "epoch": 0.8148631029986962,
158
+ "grad_norm": 1.2573055028915405,
159
+ "learning_rate": 0.009789856541166658,
160
+ "loss": 0.4291,
161
+ "step": 10000
162
+ },
163
+ {
164
+ "epoch": 0.8148631029986962,
165
+ "eval_accuracy": 0.8760061130922058,
166
+ "eval_loss": 0.3816593289375305,
167
+ "eval_runtime": 8.0807,
168
+ "eval_samples_per_second": 1214.621,
169
+ "eval_steps_per_second": 2.475,
170
+ "step": 10000
171
+ },
172
+ {
173
+ "epoch": 0.8963494132985659,
174
+ "grad_norm": 0.9861263036727905,
175
+ "learning_rate": 0.00974098754609802,
176
+ "loss": 0.4305,
177
+ "step": 11000
178
+ },
179
+ {
180
+ "epoch": 0.8963494132985659,
181
+ "eval_accuracy": 0.8772287315333673,
182
+ "eval_loss": 0.3414895534515381,
183
+ "eval_runtime": 8.0756,
184
+ "eval_samples_per_second": 1215.385,
185
+ "eval_steps_per_second": 2.477,
186
+ "step": 11000
187
+ },
188
+ {
189
+ "epoch": 0.9778357235984355,
190
+ "grad_norm": 1.3806720972061157,
191
+ "learning_rate": 0.009687163088458042,
192
+ "loss": 0.4161,
193
+ "step": 12000
194
+ },
195
+ {
196
+ "epoch": 0.9778357235984355,
197
+ "eval_accuracy": 0.8665308201732043,
198
+ "eval_loss": 0.34990960359573364,
199
+ "eval_runtime": 8.084,
200
+ "eval_samples_per_second": 1214.129,
201
+ "eval_steps_per_second": 2.474,
202
+ "step": 12000
203
+ },
204
+ {
205
+ "epoch": 1.0593220338983051,
206
+ "grad_norm": 1.9694197177886963,
207
+ "learning_rate": 0.0096284394276405,
208
+ "loss": 0.406,
209
+ "step": 13000
210
+ },
211
+ {
212
+ "epoch": 1.0593220338983051,
213
+ "eval_accuracy": 0.8722363728986245,
214
+ "eval_loss": 0.34625330567359924,
215
+ "eval_runtime": 8.0547,
216
+ "eval_samples_per_second": 1218.537,
217
+ "eval_steps_per_second": 2.483,
218
+ "step": 13000
219
+ },
220
+ {
221
+ "epoch": 1.1408083441981747,
222
+ "grad_norm": 0.6525322198867798,
223
+ "learning_rate": 0.009564877943874813,
224
+ "loss": 0.4115,
225
+ "step": 14000
226
+ },
227
+ {
228
+ "epoch": 1.1408083441981747,
229
+ "eval_accuracy": 0.8772287315333673,
230
+ "eval_loss": 0.3376229405403137,
231
+ "eval_runtime": 8.0754,
232
+ "eval_samples_per_second": 1215.421,
233
+ "eval_steps_per_second": 2.477,
234
+ "step": 14000
235
+ },
236
+ {
237
+ "epoch": 1.2222946544980444,
238
+ "grad_norm": 0.8454155325889587,
239
+ "learning_rate": 0.009496545074069052,
240
+ "loss": 0.4013,
241
+ "step": 15000
242
+ },
243
+ {
244
+ "epoch": 1.2222946544980444,
245
+ "eval_accuracy": 0.8723382577687213,
246
+ "eval_loss": 0.37961477041244507,
247
+ "eval_runtime": 8.0556,
248
+ "eval_samples_per_second": 1218.412,
249
+ "eval_steps_per_second": 2.483,
250
+ "step": 15000
251
+ },
252
+ {
253
+ "epoch": 1.303780964797914,
254
+ "grad_norm": 2.028202533721924,
255
+ "learning_rate": 0.00942351224236754,
256
+ "loss": 0.4032,
257
+ "step": 16000
258
+ },
259
+ {
260
+ "epoch": 1.303780964797914,
261
+ "eval_accuracy": 0.8673458991339786,
262
+ "eval_loss": 0.36960068345069885,
263
+ "eval_runtime": 11.6809,
264
+ "eval_samples_per_second": 840.26,
265
+ "eval_steps_per_second": 1.712,
266
+ "step": 16000
267
+ },
268
+ {
269
+ "epoch": 1.3852672750977835,
270
+ "grad_norm": 1.3635278940200806,
271
+ "learning_rate": 0.009345855785495578,
272
+ "loss": 0.4042,
273
+ "step": 17000
274
+ },
275
+ {
276
+ "epoch": 1.3852672750977835,
277
+ "eval_accuracy": 0.8787570045848192,
278
+ "eval_loss": 0.3371247947216034,
279
+ "eval_runtime": 16.8955,
280
+ "eval_samples_per_second": 580.925,
281
+ "eval_steps_per_second": 1.184,
282
+ "step": 17000
283
+ },
284
+ {
285
+ "epoch": 1.4667535853976532,
286
+ "grad_norm": 0.8526794910430908,
287
+ "learning_rate": 0.009263656872969374,
288
+ "loss": 0.4046,
289
+ "step": 18000
290
+ },
291
+ {
292
+ "epoch": 1.4667535853976532,
293
+ "eval_accuracy": 0.87396841569027,
294
+ "eval_loss": 0.37868475914001465,
295
+ "eval_runtime": 16.7972,
296
+ "eval_samples_per_second": 584.325,
297
+ "eval_steps_per_second": 1.191,
298
+ "step": 18000
299
+ },
300
+ {
301
+ "epoch": 1.548239895697523,
302
+ "grad_norm": 0.8174905180931091,
303
+ "learning_rate": 0.009177001422254532,
304
+ "loss": 0.4021,
305
+ "step": 19000
306
+ },
307
+ {
308
+ "epoch": 1.548239895697523,
309
+ "eval_accuracy": 0.8821192052980132,
310
+ "eval_loss": 0.3413024842739105,
311
+ "eval_runtime": 14.5484,
312
+ "eval_samples_per_second": 674.643,
313
+ "eval_steps_per_second": 1.375,
314
+ "step": 19000
315
+ },
316
+ {
317
+ "epoch": 1.6297262059973925,
318
+ "grad_norm": 0.7442933917045593,
319
+ "learning_rate": 0.00908598000896182,
320
+ "loss": 0.3978,
321
+ "step": 20000
322
+ },
323
+ {
324
+ "epoch": 1.6297262059973925,
325
+ "eval_accuracy": 0.8796739684156902,
326
+ "eval_loss": 0.3163932263851166,
327
+ "eval_runtime": 8.0598,
328
+ "eval_samples_per_second": 1217.765,
329
+ "eval_steps_per_second": 2.481,
330
+ "step": 20000
331
+ },
332
+ {
333
+ "epoch": 1.711212516297262,
334
+ "grad_norm": 2.230134963989258,
335
+ "learning_rate": 0.008990687772174046,
336
+ "loss": 0.3981,
337
+ "step": 21000
338
+ },
339
+ {
340
+ "epoch": 1.711212516297262,
341
+ "eval_accuracy": 0.873458991339786,
342
+ "eval_loss": 0.38318774104118347,
343
+ "eval_runtime": 8.0862,
344
+ "eval_samples_per_second": 1213.802,
345
+ "eval_steps_per_second": 2.473,
346
+ "step": 21000
347
+ },
348
+ {
349
+ "epoch": 1.7926988265971318,
350
+ "grad_norm": 0.8759768009185791,
351
+ "learning_rate": 0.008891224315003048,
352
+ "loss": 0.3969,
353
+ "step": 22000
354
+ },
355
+ {
356
+ "epoch": 1.7926988265971318,
357
+ "eval_accuracy": 0.8774325012735609,
358
+ "eval_loss": 0.3344375491142273,
359
+ "eval_runtime": 8.0567,
360
+ "eval_samples_per_second": 1218.245,
361
+ "eval_steps_per_second": 2.482,
362
+ "step": 22000
363
+ },
364
+ {
365
+ "epoch": 1.8741851368970013,
366
+ "grad_norm": 1.1339051723480225,
367
+ "learning_rate": 0.008787693600480663,
368
+ "loss": 0.4036,
369
+ "step": 23000
370
+ },
371
+ {
372
+ "epoch": 1.8741851368970013,
373
+ "eval_accuracy": 0.8791645440652063,
374
+ "eval_loss": 0.3310340344905853,
375
+ "eval_runtime": 8.0576,
376
+ "eval_samples_per_second": 1218.11,
377
+ "eval_steps_per_second": 2.482,
378
+ "step": 23000
379
+ },
380
+ {
381
+ "epoch": 1.9556714471968708,
382
+ "grad_norm": 0.7995139360427856,
383
+ "learning_rate": 0.008680203842892588,
384
+ "loss": 0.3967,
385
+ "step": 24000
386
+ },
387
+ {
388
+ "epoch": 1.9556714471968708,
389
+ "eval_accuracy": 0.8821192052980132,
390
+ "eval_loss": 0.3221582770347595,
391
+ "eval_runtime": 8.042,
392
+ "eval_samples_per_second": 1220.462,
393
+ "eval_steps_per_second": 2.487,
394
+ "step": 24000
395
+ },
396
+ {
397
+ "epoch": 2.0371577574967406,
398
+ "grad_norm": 1.8584222793579102,
399
+ "learning_rate": 0.008568867394668646,
400
+ "loss": 0.3872,
401
+ "step": 25000
402
+ },
403
+ {
404
+ "epoch": 2.0371577574967406,
405
+ "eval_accuracy": 0.8768211920529801,
406
+ "eval_loss": 0.32817786931991577,
407
+ "eval_runtime": 8.0703,
408
+ "eval_samples_per_second": 1216.184,
409
+ "eval_steps_per_second": 2.478,
410
+ "step": 25000
411
+ },
412
+ {
413
+ "epoch": 2.1186440677966103,
414
+ "grad_norm": 0.7538495063781738,
415
+ "learning_rate": 0.008453800628947683,
416
+ "loss": 0.3793,
417
+ "step": 26000
418
+ },
419
+ {
420
+ "epoch": 2.1186440677966103,
421
+ "eval_accuracy": 0.8758023433520122,
422
+ "eval_loss": 0.3552575707435608,
423
+ "eval_runtime": 8.0675,
424
+ "eval_samples_per_second": 1216.614,
425
+ "eval_steps_per_second": 2.479,
426
+ "step": 26000
427
+ },
428
+ {
429
+ "epoch": 2.2001303780964796,
430
+ "grad_norm": 0.8285762071609497,
431
+ "learning_rate": 0.008335123817939908,
432
+ "loss": 0.3765,
433
+ "step": 27000
434
+ },
435
+ {
436
+ "epoch": 2.2001303780964796,
437
+ "eval_accuracy": 0.8812022414671421,
438
+ "eval_loss": 0.3156055510044098,
439
+ "eval_runtime": 8.0952,
440
+ "eval_samples_per_second": 1212.442,
441
+ "eval_steps_per_second": 2.471,
442
+ "step": 27000
443
+ },
444
+ {
445
+ "epoch": 2.2816166883963493,
446
+ "grad_norm": 1.013386845588684,
447
+ "learning_rate": 0.008212961007213745,
448
+ "loss": 0.3834,
449
+ "step": 28000
450
+ },
451
+ {
452
+ "epoch": 2.2816166883963493,
453
+ "eval_accuracy": 0.8802852776362711,
454
+ "eval_loss": 0.32749319076538086,
455
+ "eval_runtime": 8.0328,
456
+ "eval_samples_per_second": 1221.871,
457
+ "eval_steps_per_second": 2.49,
458
+ "step": 28000
459
+ },
460
+ {
461
+ "epoch": 2.363102998696219,
462
+ "grad_norm": 0.3707163333892822,
463
+ "learning_rate": 0.008087439886038625,
464
+ "loss": 0.3705,
465
+ "step": 29000
466
+ },
467
+ {
468
+ "epoch": 2.363102998696219,
469
+ "eval_accuracy": 0.8877228731533368,
470
+ "eval_loss": 0.3311881124973297,
471
+ "eval_runtime": 8.083,
472
+ "eval_samples_per_second": 1214.28,
473
+ "eval_steps_per_second": 2.474,
474
+ "step": 29000
475
+ },
476
+ {
477
+ "epoch": 2.444589308996089,
478
+ "grad_norm": 0.6465876698493958,
479
+ "learning_rate": 0.007958691653919263,
480
+ "loss": 0.3749,
481
+ "step": 30000
482
+ },
483
+ {
484
+ "epoch": 2.444589308996089,
485
+ "eval_accuracy": 0.8703005603667855,
486
+ "eval_loss": 0.3598898947238922,
487
+ "eval_runtime": 8.1121,
488
+ "eval_samples_per_second": 1209.92,
489
+ "eval_steps_per_second": 2.465,
490
+ "step": 30000
491
+ },
492
+ {
493
+ "epoch": 2.526075619295958,
494
+ "grad_norm": 1.221585750579834,
495
+ "learning_rate": 0.007826850883460879,
496
+ "loss": 0.3838,
497
+ "step": 31000
498
+ },
499
+ {
500
+ "epoch": 2.526075619295958,
501
+ "eval_accuracy": 0.8767193071828834,
502
+ "eval_loss": 0.34019705653190613,
503
+ "eval_runtime": 8.0675,
504
+ "eval_samples_per_second": 1216.612,
505
+ "eval_steps_per_second": 2.479,
506
+ "step": 31000
507
+ },
508
+ {
509
+ "epoch": 2.607561929595828,
510
+ "grad_norm": 2.0899202823638916,
511
+ "learning_rate": 0.0076920553797087355,
512
+ "loss": 0.3761,
513
+ "step": 32000
514
+ },
515
+ {
516
+ "epoch": 2.607561929595828,
517
+ "eval_accuracy": 0.8805909322465614,
518
+ "eval_loss": 0.33326148986816406,
519
+ "eval_runtime": 8.0683,
520
+ "eval_samples_per_second": 1216.494,
521
+ "eval_steps_per_second": 2.479,
522
+ "step": 32000
523
+ },
524
+ {
525
+ "epoch": 2.6890482398956976,
526
+ "grad_norm": 0.9150403738021851,
527
+ "learning_rate": 0.007554446036109006,
528
+ "loss": 0.3829,
529
+ "step": 33000
530
+ },
531
+ {
532
+ "epoch": 2.6890482398956976,
533
+ "eval_accuracy": 0.878349465104432,
534
+ "eval_loss": 0.3460722267627716,
535
+ "eval_runtime": 8.0705,
536
+ "eval_samples_per_second": 1216.154,
537
+ "eval_steps_per_second": 2.478,
538
+ "step": 33000
539
+ },
540
+ {
541
+ "epoch": 2.770534550195567,
542
+ "grad_norm": 1.091302752494812,
543
+ "learning_rate": 0.0074141666872415204,
544
+ "loss": 0.3816,
545
+ "step": 34000
546
+ },
547
+ {
548
+ "epoch": 2.770534550195567,
549
+ "eval_accuracy": 0.875904228222109,
550
+ "eval_loss": 0.35716861486434937,
551
+ "eval_runtime": 8.0896,
552
+ "eval_samples_per_second": 1213.281,
553
+ "eval_steps_per_second": 2.472,
554
+ "step": 34000
555
+ },
556
+ {
557
+ "epoch": 2.8520208604954367,
558
+ "grad_norm": 0.9538711309432983,
559
+ "learning_rate": 0.00727136395847833,
560
+ "loss": 0.3739,
561
+ "step": 35000
562
+ },
563
+ {
564
+ "epoch": 2.8520208604954367,
565
+ "eval_accuracy": 0.8781456953642384,
566
+ "eval_loss": 0.3289404809474945,
567
+ "eval_runtime": 8.0726,
568
+ "eval_samples_per_second": 1215.843,
569
+ "eval_steps_per_second": 2.478,
570
+ "step": 35000
571
+ },
572
+ {
573
+ "epoch": 2.9335071707953064,
574
+ "grad_norm": 0.9174529314041138,
575
+ "learning_rate": 0.0071261871127252345,
576
+ "loss": 0.3749,
577
+ "step": 36000
578
+ },
579
+ {
580
+ "epoch": 2.9335071707953064,
581
+ "eval_accuracy": 0.8872134488028528,
582
+ "eval_loss": 0.3154527246952057,
583
+ "eval_runtime": 8.0759,
584
+ "eval_samples_per_second": 1215.351,
585
+ "eval_steps_per_second": 2.477,
586
+ "step": 36000
587
+ },
588
+ {
589
+ "epoch": 3.014993481095176,
590
+ "grad_norm": 0.7346525192260742,
591
+ "learning_rate": 0.006978787894406435,
592
+ "loss": 0.3665,
593
+ "step": 37000
594
+ },
595
+ {
596
+ "epoch": 3.014993481095176,
597
+ "eval_accuracy": 0.8840550178298523,
598
+ "eval_loss": 0.3637458086013794,
599
+ "eval_runtime": 8.0723,
600
+ "eval_samples_per_second": 1215.881,
601
+ "eval_steps_per_second": 2.478,
602
+ "step": 37000
603
+ },
604
+ {
605
+ "epoch": 3.0964797913950455,
606
+ "grad_norm": 1.5922387838363647,
607
+ "learning_rate": 0.006829320370855446,
608
+ "loss": 0.3504,
609
+ "step": 38000
610
+ },
611
+ {
612
+ "epoch": 3.0964797913950455,
613
+ "eval_accuracy": 0.8765155374426897,
614
+ "eval_loss": 0.35780608654022217,
615
+ "eval_runtime": 8.0625,
616
+ "eval_samples_per_second": 1217.363,
617
+ "eval_steps_per_second": 2.481,
618
+ "step": 38000
619
+ },
620
+ {
621
+ "epoch": 3.1779661016949152,
622
+ "grad_norm": 0.9475153088569641,
623
+ "learning_rate": 0.006677940771277968,
624
+ "loss": 0.3589,
625
+ "step": 39000
626
+ },
627
+ {
628
+ "epoch": 3.1779661016949152,
629
+ "eval_accuracy": 0.8893530310748854,
630
+ "eval_loss": 0.3214789628982544,
631
+ "eval_runtime": 8.0814,
632
+ "eval_samples_per_second": 1214.512,
633
+ "eval_steps_per_second": 2.475,
634
+ "step": 39000
635
+ },
636
+ {
637
+ "epoch": 3.259452411994785,
638
+ "grad_norm": 1.2292866706848145,
639
+ "learning_rate": 0.006524807323455133,
640
+ "loss": 0.3534,
641
+ "step": 40000
642
+ },
643
+ {
644
+ "epoch": 3.259452411994785,
645
+ "eval_accuracy": 0.8806928171166581,
646
+ "eval_loss": 0.37568399310112,
647
+ "eval_runtime": 8.0686,
648
+ "eval_samples_per_second": 1216.449,
649
+ "eval_steps_per_second": 2.479,
650
+ "step": 40000
651
+ },
652
+ {
653
+ "epoch": 3.3409387222946547,
654
+ "grad_norm": 0.9425502419471741,
655
+ "learning_rate": 0.006370080088357722,
656
+ "loss": 0.3501,
657
+ "step": 41000
658
+ },
659
+ {
660
+ "epoch": 3.3409387222946547,
661
+ "eval_accuracy": 0.8826286296484972,
662
+ "eval_loss": 0.3434010446071625,
663
+ "eval_runtime": 8.0828,
664
+ "eval_samples_per_second": 1214.31,
665
+ "eval_steps_per_second": 2.474,
666
+ "step": 41000
667
+ },
668
+ {
669
+ "epoch": 3.422425032594524,
670
+ "grad_norm": 0.6420383453369141,
671
+ "learning_rate": 0.006213920792844295,
672
+ "loss": 0.352,
673
+ "step": 42000
674
+ },
675
+ {
676
+ "epoch": 3.422425032594524,
677
+ "eval_accuracy": 0.8902699949057565,
678
+ "eval_loss": 0.30623751878738403,
679
+ "eval_runtime": 6.591,
680
+ "eval_samples_per_second": 1489.15,
681
+ "eval_steps_per_second": 3.034,
682
+ "step": 42000
683
+ },
684
+ {
685
+ "epoch": 3.5039113428943938,
686
+ "grad_norm": 1.2944601774215698,
687
+ "learning_rate": 0.006056492660618047,
688
+ "loss": 0.3504,
689
+ "step": 43000
690
+ },
691
+ {
692
+ "epoch": 3.5039113428943938,
693
+ "eval_accuracy": 0.8889454915944982,
694
+ "eval_loss": 0.31484460830688477,
695
+ "eval_runtime": 7.3757,
696
+ "eval_samples_per_second": 1330.722,
697
+ "eval_steps_per_second": 2.712,
698
+ "step": 43000
699
+ },
700
+ {
701
+ "epoch": 3.5853976531942635,
702
+ "grad_norm": 2.653141498565674,
703
+ "learning_rate": 0.00589796024161912,
704
+ "loss": 0.3533,
705
+ "step": 44000
706
+ },
707
+ {
708
+ "epoch": 3.5853976531942635,
709
+ "eval_accuracy": 0.8906775343861436,
710
+ "eval_loss": 0.3489537835121155,
711
+ "eval_runtime": 7.3968,
712
+ "eval_samples_per_second": 1326.921,
713
+ "eval_steps_per_second": 2.704,
714
+ "step": 44000
715
+ },
716
+ {
717
+ "epoch": 3.666883963494133,
718
+ "grad_norm": 0.9806082248687744,
719
+ "learning_rate": 0.005738489240030675,
720
+ "loss": 0.3538,
721
+ "step": 45000
722
+ },
723
+ {
724
+ "epoch": 3.666883963494133,
725
+ "eval_accuracy": 0.8881304126337239,
726
+ "eval_loss": 0.3089299499988556,
727
+ "eval_runtime": 7.3711,
728
+ "eval_samples_per_second": 1331.544,
729
+ "eval_steps_per_second": 2.713,
730
+ "step": 45000
731
+ },
732
+ {
733
+ "epoch": 3.7483702737940026,
734
+ "grad_norm": 0.6134408712387085,
735
+ "learning_rate": 0.005578246341078499,
736
+ "loss": 0.3438,
737
+ "step": 46000
738
+ },
739
+ {
740
+ "epoch": 3.7483702737940026,
741
+ "eval_accuracy": 0.8898624554253693,
742
+ "eval_loss": 0.30195650458335876,
743
+ "eval_runtime": 7.4856,
744
+ "eval_samples_per_second": 1311.177,
745
+ "eval_steps_per_second": 2.672,
746
+ "step": 46000
747
+ },
748
+ {
749
+ "epoch": 3.8298565840938723,
750
+ "grad_norm": 1.5439763069152832,
751
+ "learning_rate": 0.005417399036805203,
752
+ "loss": 0.3461,
753
+ "step": 47000
754
+ },
755
+ {
756
+ "epoch": 3.8298565840938723,
757
+ "eval_accuracy": 0.8915944982170148,
758
+ "eval_loss": 0.33832383155822754,
759
+ "eval_runtime": 7.5014,
760
+ "eval_samples_per_second": 1308.424,
761
+ "eval_steps_per_second": 2.666,
762
+ "step": 47000
763
+ },
764
+ {
765
+ "epoch": 3.9113428943937416,
766
+ "grad_norm": 0.9223654270172119,
767
+ "learning_rate": 0.005256115451001088,
768
+ "loss": 0.355,
769
+ "step": 48000
770
+ },
771
+ {
772
+ "epoch": 3.9113428943937416,
773
+ "eval_accuracy": 0.8920020376974019,
774
+ "eval_loss": 0.30014675855636597,
775
+ "eval_runtime": 7.4942,
776
+ "eval_samples_per_second": 1309.68,
777
+ "eval_steps_per_second": 2.669,
778
+ "step": 48000
779
+ },
780
+ {
781
+ "epoch": 3.9928292046936114,
782
+ "grad_norm": 1.4363678693771362,
783
+ "learning_rate": 0.0050945641634746975,
784
+ "loss": 0.3435,
785
+ "step": 49000
786
+ },
787
+ {
788
+ "epoch": 3.9928292046936114,
789
+ "eval_accuracy": 0.8847682119205298,
790
+ "eval_loss": 0.3268650770187378,
791
+ "eval_runtime": 7.503,
792
+ "eval_samples_per_second": 1308.136,
793
+ "eval_steps_per_second": 2.666,
794
+ "step": 49000
795
+ },
796
+ {
797
+ "epoch": 4.074315514993481,
798
+ "grad_norm": 0.8659418821334839,
799
+ "learning_rate": 0.004932914033846713,
800
+ "loss": 0.316,
801
+ "step": 50000
802
+ },
803
+ {
804
+ "epoch": 4.074315514993481,
805
+ "eval_accuracy": 0.8877228731533368,
806
+ "eval_loss": 0.2992446720600128,
807
+ "eval_runtime": 7.4725,
808
+ "eval_samples_per_second": 1313.479,
809
+ "eval_steps_per_second": 2.676,
810
+ "step": 50000
811
+ },
812
+ {
813
+ "epoch": 4.15580182529335,
814
+ "grad_norm": 0.6110065579414368,
815
+ "learning_rate": 0.004771334025051382,
816
+ "loss": 0.3138,
817
+ "step": 51000
818
+ },
819
+ {
820
+ "epoch": 4.15580182529335,
821
+ "eval_accuracy": 0.8908813041263373,
822
+ "eval_loss": 0.29451584815979004,
823
+ "eval_runtime": 7.4809,
824
+ "eval_samples_per_second": 1312.002,
825
+ "eval_steps_per_second": 2.673,
826
+ "step": 51000
827
+ },
828
+ {
829
+ "epoch": 4.237288135593221,
830
+ "grad_norm": 1.414892554283142,
831
+ "learning_rate": 0.004609993026729961,
832
+ "loss": 0.3189,
833
+ "step": 52000
834
+ },
835
+ {
836
+ "epoch": 4.237288135593221,
837
+ "eval_accuracy": 0.8854814060112073,
838
+ "eval_loss": 0.3163043260574341,
839
+ "eval_runtime": 7.4918,
840
+ "eval_samples_per_second": 1310.097,
841
+ "eval_steps_per_second": 2.67,
842
+ "step": 52000
843
+ },
844
+ {
845
+ "epoch": 4.31877444589309,
846
+ "grad_norm": 1.5064332485198975,
847
+ "learning_rate": 0.004449059678700766,
848
+ "loss": 0.3222,
849
+ "step": 53000
850
+ },
851
+ {
852
+ "epoch": 4.31877444589309,
853
+ "eval_accuracy": 0.8927152317880794,
854
+ "eval_loss": 0.31491199135780334,
855
+ "eval_runtime": 7.4988,
856
+ "eval_samples_per_second": 1308.87,
857
+ "eval_steps_per_second": 2.667,
858
+ "step": 53000
859
+ },
860
+ {
861
+ "epoch": 4.400260756192959,
862
+ "grad_norm": 0.49173757433891296,
863
+ "learning_rate": 0.004288702194690342,
864
+ "loss": 0.3194,
865
+ "step": 54000
866
+ },
867
+ {
868
+ "epoch": 4.400260756192959,
869
+ "eval_accuracy": 0.8904737646459501,
870
+ "eval_loss": 0.30408918857574463,
871
+ "eval_runtime": 7.5071,
872
+ "eval_samples_per_second": 1307.425,
873
+ "eval_steps_per_second": 2.664,
874
+ "step": 54000
875
+ },
876
+ {
877
+ "epoch": 4.481747066492829,
878
+ "grad_norm": 0.6415348649024963,
879
+ "learning_rate": 0.00412908818651001,
880
+ "loss": 0.3214,
881
+ "step": 55000
882
+ },
883
+ {
884
+ "epoch": 4.481747066492829,
885
+ "eval_accuracy": 0.8858889454915945,
886
+ "eval_loss": 0.30896589159965515,
887
+ "eval_runtime": 7.4849,
888
+ "eval_samples_per_second": 1311.309,
889
+ "eval_steps_per_second": 2.672,
890
+ "step": 55000
891
+ },
892
+ {
893
+ "epoch": 4.563233376792699,
894
+ "grad_norm": 0.7397938370704651,
895
+ "learning_rate": 0.003970384488861551,
896
+ "loss": 0.3167,
897
+ "step": 56000
898
+ },
899
+ {
900
+ "epoch": 4.563233376792699,
901
+ "eval_accuracy": 0.8891492613346917,
902
+ "eval_loss": 0.2900329530239105,
903
+ "eval_runtime": 7.4757,
904
+ "eval_samples_per_second": 1312.918,
905
+ "eval_steps_per_second": 2.675,
906
+ "step": 56000
907
+ },
908
+ {
909
+ "epoch": 4.644719687092568,
910
+ "grad_norm": 0.8142307996749878,
911
+ "learning_rate": 0.003812756984955154,
912
+ "loss": 0.321,
913
+ "step": 57000
914
+ },
915
+ {
916
+ "epoch": 4.644719687092568,
917
+ "eval_accuracy": 0.8921039225674987,
918
+ "eval_loss": 0.3090360760688782,
919
+ "eval_runtime": 7.4786,
920
+ "eval_samples_per_second": 1312.415,
921
+ "eval_steps_per_second": 2.674,
922
+ "step": 57000
923
+ },
924
+ {
925
+ "epoch": 4.726205997392438,
926
+ "grad_norm": 1.6489217281341553,
927
+ "learning_rate": 0.0036563704331219154,
928
+ "loss": 0.316,
929
+ "step": 58000
930
+ },
931
+ {
932
+ "epoch": 4.726205997392438,
933
+ "eval_accuracy": 0.8911869587366276,
934
+ "eval_loss": 0.31844472885131836,
935
+ "eval_runtime": 7.5116,
936
+ "eval_samples_per_second": 1306.645,
937
+ "eval_steps_per_second": 2.663,
938
+ "step": 58000
939
+ },
940
+ {
941
+ "epoch": 4.8076923076923075,
942
+ "grad_norm": 0.6077815890312195,
943
+ "learning_rate": 0.00350138829460208,
944
+ "loss": 0.3212,
945
+ "step": 59000
946
+ },
947
+ {
948
+ "epoch": 4.8076923076923075,
949
+ "eval_accuracy": 0.8929190015282731,
950
+ "eval_loss": 0.30086687207221985,
951
+ "eval_runtime": 7.4977,
952
+ "eval_samples_per_second": 1309.06,
953
+ "eval_steps_per_second": 2.667,
954
+ "step": 59000
955
+ },
956
+ {
957
+ "epoch": 4.889178617992178,
958
+ "grad_norm": 0.4297344386577606,
959
+ "learning_rate": 0.0033479725626890658,
960
+ "loss": 0.3129,
961
+ "step": 60000
962
+ },
963
+ {
964
+ "epoch": 4.889178617992178,
965
+ "eval_accuracy": 0.8930208863983699,
966
+ "eval_loss": 0.3006724417209625,
967
+ "eval_runtime": 7.4834,
968
+ "eval_samples_per_second": 1311.564,
969
+ "eval_steps_per_second": 2.673,
970
+ "step": 60000
971
+ },
972
+ {
973
+ "epoch": 4.970664928292047,
974
+ "grad_norm": 0.5105591416358948,
975
+ "learning_rate": 0.0031962835934078488,
976
+ "loss": 0.3093,
977
+ "step": 61000
978
+ },
979
+ {
980
+ "epoch": 4.970664928292047,
981
+ "eval_accuracy": 0.8922058074375955,
982
+ "eval_loss": 0.2791294753551483,
983
+ "eval_runtime": 7.4805,
984
+ "eval_samples_per_second": 1312.076,
985
+ "eval_steps_per_second": 2.674,
986
+ "step": 61000
987
+ },
988
+ {
989
+ "epoch": 5.052151238591916,
990
+ "grad_norm": 1.757141351699829,
991
+ "learning_rate": 0.0030464799379046744,
992
+ "loss": 0.2951,
993
+ "step": 62000
994
+ },
995
+ {
996
+ "epoch": 5.052151238591916,
997
+ "eval_accuracy": 0.8961793173713704,
998
+ "eval_loss": 0.3046811521053314,
999
+ "eval_runtime": 7.5049,
1000
+ "eval_samples_per_second": 1307.809,
1001
+ "eval_steps_per_second": 2.665,
1002
+ "step": 62000
1003
+ },
1004
+ {
1005
+ "epoch": 5.1336375488917865,
1006
+ "grad_norm": 0.8879550099372864,
1007
+ "learning_rate": 0.0028987181767232946,
1008
+ "loss": 0.2752,
1009
+ "step": 63000
1010
+ },
1011
+ {
1012
+ "epoch": 5.1336375488917865,
1013
+ "eval_accuracy": 0.8965868568517575,
1014
+ "eval_loss": 0.28206223249435425,
1015
+ "eval_runtime": 14.9757,
1016
+ "eval_samples_per_second": 655.394,
1017
+ "eval_steps_per_second": 1.335,
1018
+ "step": 63000
1019
+ },
1020
+ {
1021
+ "epoch": 5.215123859191656,
1022
+ "grad_norm": 1.276357650756836,
1023
+ "learning_rate": 0.0027531527561409663,
1024
+ "loss": 0.2813,
1025
+ "step": 64000
1026
+ },
1027
+ {
1028
+ "epoch": 5.215123859191656,
1029
+ "eval_accuracy": 0.8965868568517575,
1030
+ "eval_loss": 0.30695512890815735,
1031
+ "eval_runtime": 7.4973,
1032
+ "eval_samples_per_second": 1309.135,
1033
+ "eval_steps_per_second": 2.668,
1034
+ "step": 64000
1035
+ },
1036
+ {
1037
+ "epoch": 5.296610169491525,
1038
+ "grad_norm": 0.6619582176208496,
1039
+ "learning_rate": 0.0026099358267352548,
1040
+ "loss": 0.2838,
1041
+ "step": 65000
1042
+ },
1043
+ {
1044
+ "epoch": 5.296610169491525,
1045
+ "eval_accuracy": 0.8966887417218543,
1046
+ "eval_loss": 0.2938063144683838,
1047
+ "eval_runtime": 7.4918,
1048
+ "eval_samples_per_second": 1310.108,
1049
+ "eval_steps_per_second": 2.67,
1050
+ "step": 65000
1051
+ },
1052
+ {
1053
+ "epoch": 5.378096479791395,
1054
+ "grad_norm": 0.8515479564666748,
1055
+ "learning_rate": 0.002469217084350399,
1056
+ "loss": 0.2782,
1057
+ "step": 66000
1058
+ },
1059
+ {
1060
+ "epoch": 5.378096479791395,
1061
+ "eval_accuracy": 0.8956698930208864,
1062
+ "eval_loss": 0.29661813378334045,
1063
+ "eval_runtime": 7.494,
1064
+ "eval_samples_per_second": 1309.711,
1065
+ "eval_steps_per_second": 2.669,
1066
+ "step": 66000
1067
+ },
1068
+ {
1069
+ "epoch": 5.459582790091265,
1070
+ "grad_norm": 1.3585212230682373,
1071
+ "learning_rate": 0.0023311436136294657,
1072
+ "loss": 0.2812,
1073
+ "step": 67000
1074
+ },
1075
+ {
1076
+ "epoch": 5.459582790091265,
1077
+ "eval_accuracy": 0.8964849719816608,
1078
+ "eval_loss": 0.2987619638442993,
1079
+ "eval_runtime": 7.5037,
1080
+ "eval_samples_per_second": 1308.029,
1081
+ "eval_steps_per_second": 2.665,
1082
+ "step": 67000
1083
+ },
1084
+ {
1085
+ "epoch": 5.541069100391134,
1086
+ "grad_norm": 0.3554459512233734,
1087
+ "learning_rate": 0.0021958597342758073,
1088
+ "loss": 0.2838,
1089
+ "step": 68000
1090
+ },
1091
+ {
1092
+ "epoch": 5.541069100391134,
1093
+ "eval_accuracy": 0.8991339786041773,
1094
+ "eval_loss": 0.2801837921142578,
1095
+ "eval_runtime": 7.4834,
1096
+ "eval_samples_per_second": 1311.562,
1097
+ "eval_steps_per_second": 2.673,
1098
+ "step": 68000
1099
+ },
1100
+ {
1101
+ "epoch": 5.622555410691004,
1102
+ "grad_norm": 1.9772554636001587,
1103
+ "learning_rate": 0.0020635068502045583,
1104
+ "loss": 0.2755,
1105
+ "step": 69000
1106
+ },
1107
+ {
1108
+ "epoch": 5.622555410691004,
1109
+ "eval_accuracy": 0.9006622516556292,
1110
+ "eval_loss": 0.2981088161468506,
1111
+ "eval_runtime": 7.4836,
1112
+ "eval_samples_per_second": 1311.527,
1113
+ "eval_steps_per_second": 2.672,
1114
+ "step": 69000
1115
+ },
1116
+ {
1117
+ "epoch": 5.704041720990873,
1118
+ "grad_norm": 0.5913628935813904,
1119
+ "learning_rate": 0.0019342233017418326,
1120
+ "loss": 0.2817,
1121
+ "step": 70000
1122
+ },
1123
+ {
1124
+ "epoch": 5.704041720990873,
1125
+ "eval_accuracy": 0.894854814060112,
1126
+ "eval_loss": 0.2903667092323303,
1127
+ "eval_runtime": 7.4986,
1128
+ "eval_samples_per_second": 1308.911,
1129
+ "eval_steps_per_second": 2.667,
1130
+ "step": 70000
1131
+ },
1132
+ {
1133
+ "epoch": 5.7855280312907436,
1134
+ "grad_norm": 0.5924209952354431,
1135
+ "learning_rate": 0.001808144221026089,
1136
+ "loss": 0.2783,
1137
+ "step": 71000
1138
+ },
1139
+ {
1140
+ "epoch": 5.7855280312907436,
1141
+ "eval_accuracy": 0.8975038206826287,
1142
+ "eval_loss": 0.29570597410202026,
1143
+ "eval_runtime": 7.4989,
1144
+ "eval_samples_per_second": 1308.859,
1145
+ "eval_steps_per_second": 2.667,
1146
+ "step": 71000
1147
+ },
1148
+ {
1149
+ "epoch": 5.867014341590613,
1150
+ "grad_norm": 0.6399027109146118,
1151
+ "learning_rate": 0.0016854013907628169,
1152
+ "loss": 0.2733,
1153
+ "step": 72000
1154
+ },
1155
+ {
1156
+ "epoch": 5.867014341590613,
1157
+ "eval_accuracy": 0.8976057055527255,
1158
+ "eval_loss": 0.2965139150619507,
1159
+ "eval_runtime": 7.4938,
1160
+ "eval_samples_per_second": 1309.743,
1161
+ "eval_steps_per_second": 2.669,
1162
+ "step": 72000
1163
+ },
1164
+ {
1165
+ "epoch": 5.948500651890482,
1166
+ "grad_norm": 0.46064063906669617,
1167
+ "learning_rate": 0.0015661231064801817,
1168
+ "loss": 0.2698,
1169
+ "step": 73000
1170
+ },
1171
+ {
1172
+ "epoch": 5.948500651890482,
1173
+ "eval_accuracy": 0.894854814060112,
1174
+ "eval_loss": 0.3313537836074829,
1175
+ "eval_runtime": 7.5019,
1176
+ "eval_samples_per_second": 1308.336,
1177
+ "eval_steps_per_second": 2.666,
1178
+ "step": 73000
1179
+ },
1180
+ {
1181
+ "epoch": 6.029986962190352,
1182
+ "grad_norm": 0.9792414903640747,
1183
+ "learning_rate": 0.001450434042429613,
1184
+ "loss": 0.2617,
1185
+ "step": 74000
1186
+ },
1187
+ {
1188
+ "epoch": 6.029986962190352,
1189
+ "eval_accuracy": 0.8960774325012736,
1190
+ "eval_loss": 0.32309064269065857,
1191
+ "eval_runtime": 7.4944,
1192
+ "eval_samples_per_second": 1309.64,
1193
+ "eval_steps_per_second": 2.669,
1194
+ "step": 74000
1195
+ },
1196
+ {
1197
+ "epoch": 6.111473272490222,
1198
+ "grad_norm": 0.49711874127388,
1199
+ "learning_rate": 0.0013384551212714713,
1200
+ "loss": 0.2349,
1201
+ "step": 75000
1202
+ },
1203
+ {
1204
+ "epoch": 6.111473272490222,
1205
+ "eval_accuracy": 0.8949566989302089,
1206
+ "eval_loss": 0.3282994329929352,
1207
+ "eval_runtime": 7.4789,
1208
+ "eval_samples_per_second": 1312.363,
1209
+ "eval_steps_per_second": 2.674,
1210
+ "step": 75000
1211
+ },
1212
+ {
1213
+ "epoch": 6.192959582790091,
1214
+ "grad_norm": 0.6636607646942139,
1215
+ "learning_rate": 0.0012303033876820402,
1216
+ "loss": 0.2359,
1217
+ "step": 76000
1218
+ },
1219
+ {
1220
+ "epoch": 6.192959582790091,
1221
+ "eval_accuracy": 0.903616912888436,
1222
+ "eval_loss": 0.3025311529636383,
1223
+ "eval_runtime": 7.5059,
1224
+ "eval_samples_per_second": 1307.643,
1225
+ "eval_steps_per_second": 2.665,
1226
+ "step": 76000
1227
+ },
1228
+ {
1229
+ "epoch": 6.274445893089961,
1230
+ "grad_norm": 0.6344679594039917,
1231
+ "learning_rate": 0.0011260918860139136,
1232
+ "loss": 0.2384,
1233
+ "step": 77000
1234
+ },
1235
+ {
1236
+ "epoch": 6.274445893089961,
1237
+ "eval_accuracy": 0.900254712175242,
1238
+ "eval_loss": 0.30031833052635193,
1239
+ "eval_runtime": 7.5071,
1240
+ "eval_samples_per_second": 1307.423,
1241
+ "eval_steps_per_second": 2.664,
1242
+ "step": 77000
1243
+ },
1244
+ {
1245
+ "epoch": 6.3559322033898304,
1246
+ "grad_norm": 0.8250097036361694,
1247
+ "learning_rate": 0.0010259295421377063,
1248
+ "loss": 0.2356,
1249
+ "step": 78000
1250
+ },
1251
+ {
1252
+ "epoch": 6.3559322033898304,
1253
+ "eval_accuracy": 0.8990320937340804,
1254
+ "eval_loss": 0.31037458777427673,
1255
+ "eval_runtime": 7.5017,
1256
+ "eval_samples_per_second": 1308.377,
1257
+ "eval_steps_per_second": 2.666,
1258
+ "step": 78000
1259
+ },
1260
+ {
1261
+ "epoch": 6.4374185136897,
1262
+ "grad_norm": 0.41535335779190063,
1263
+ "learning_rate": 0.0009299210495885319,
1264
+ "loss": 0.237,
1265
+ "step": 79000
1266
+ },
1267
+ {
1268
+ "epoch": 6.4374185136897,
1269
+ "eval_accuracy": 0.8959755476311768,
1270
+ "eval_loss": 0.3192022144794464,
1271
+ "eval_runtime": 7.5095,
1272
+ "eval_samples_per_second": 1307.011,
1273
+ "eval_steps_per_second": 2.663,
1274
+ "step": 79000
1275
+ },
1276
+ {
1277
+ "epoch": 6.51890482398957,
1278
+ "grad_norm": 1.490874171257019,
1279
+ "learning_rate": 0.000838166760136298,
1280
+ "loss": 0.2377,
1281
+ "step": 80000
1282
+ },
1283
+ {
1284
+ "epoch": 6.51890482398957,
1285
+ "eval_accuracy": 0.9045338767193072,
1286
+ "eval_loss": 0.2835454046726227,
1287
+ "eval_runtime": 7.4926,
1288
+ "eval_samples_per_second": 1309.951,
1289
+ "eval_steps_per_second": 2.669,
1290
+ "step": 80000
1291
+ },
1292
+ {
1293
+ "epoch": 6.600391134289439,
1294
+ "grad_norm": 1.9151830673217773,
1295
+ "learning_rate": 0.0007507625788941736,
1296
+ "loss": 0.2327,
1297
+ "step": 81000
1298
+ },
1299
+ {
1300
+ "epoch": 6.600391134289439,
1301
+ "eval_accuracy": 0.9035150280183393,
1302
+ "eval_loss": 0.3065315783023834,
1303
+ "eval_runtime": 7.4827,
1304
+ "eval_samples_per_second": 1311.698,
1305
+ "eval_steps_per_second": 2.673,
1306
+ "step": 81000
1307
+ },
1308
+ {
1309
+ "epoch": 6.681877444589309,
1310
+ "grad_norm": 0.849884033203125,
1311
+ "learning_rate": 0.0006677998640748751,
1312
+ "loss": 0.2337,
1313
+ "step": 82000
1314
+ },
1315
+ {
1316
+ "epoch": 6.681877444589309,
1317
+ "eval_accuracy": 0.9020886398369842,
1318
+ "eval_loss": 0.29341772198677063,
1319
+ "eval_runtime": 7.4817,
1320
+ "eval_samples_per_second": 1311.872,
1321
+ "eval_steps_per_second": 2.673,
1322
+ "step": 82000
1323
+ },
1324
+ {
1325
+ "epoch": 6.763363754889179,
1326
+ "grad_norm": 1.3731963634490967,
1327
+ "learning_rate": 0.000589365331499549,
1328
+ "loss": 0.2319,
1329
+ "step": 83000
1330
+ },
1331
+ {
1332
+ "epoch": 6.763363754889179,
1333
+ "eval_accuracy": 0.9004584819154355,
1334
+ "eval_loss": 0.3038472533226013,
1335
+ "eval_runtime": 7.5,
1336
+ "eval_samples_per_second": 1308.675,
1337
+ "eval_steps_per_second": 2.667,
1338
+ "step": 83000
1339
+ },
1340
+ {
1341
+ "epoch": 6.844850065189048,
1342
+ "grad_norm": 1.3203397989273071,
1343
+ "learning_rate": 0.0005155409639590586,
1344
+ "loss": 0.2318,
1345
+ "step": 84000
1346
+ },
1347
+ {
1348
+ "epoch": 6.844850065189048,
1349
+ "eval_accuracy": 0.9033112582781457,
1350
+ "eval_loss": 0.2874143421649933,
1351
+ "eval_runtime": 7.5053,
1352
+ "eval_samples_per_second": 1307.745,
1353
+ "eval_steps_per_second": 2.665,
1354
+ "step": 84000
1355
+ },
1356
+ {
1357
+ "epoch": 6.926336375488918,
1358
+ "grad_norm": 0.5277544856071472,
1359
+ "learning_rate": 0.0004464039255224173,
1360
+ "loss": 0.2251,
1361
+ "step": 85000
1362
+ },
1363
+ {
1364
+ "epoch": 6.926336375488918,
1365
+ "eval_accuracy": 0.9032093734080489,
1366
+ "eval_loss": 0.29026326537132263,
1367
+ "eval_runtime": 11.8762,
1368
+ "eval_samples_per_second": 826.439,
1369
+ "eval_steps_per_second": 1.684,
1370
+ "step": 85000
1371
+ },
1372
+ {
1373
+ "epoch": 7.0078226857887875,
1374
+ "grad_norm": 0.9861716628074646,
1375
+ "learning_rate": 0.0003820264808819357,
1376
+ "loss": 0.2212,
1377
+ "step": 86000
1378
+ },
1379
+ {
1380
+ "epoch": 7.0078226857887875,
1381
+ "eval_accuracy": 0.9033112582781457,
1382
+ "eval_loss": 0.3097410500049591,
1383
+ "eval_runtime": 7.5021,
1384
+ "eval_samples_per_second": 1308.292,
1385
+ "eval_steps_per_second": 2.666,
1386
+ "step": 86000
1387
+ },
1388
+ {
1389
+ "epoch": 7.089308996088657,
1390
+ "grad_norm": 0.5879834890365601,
1391
+ "learning_rate": 0.0003224759198193844,
1392
+ "loss": 0.2014,
1393
+ "step": 87000
1394
+ },
1395
+ {
1396
+ "epoch": 7.089308996088657,
1397
+ "eval_accuracy": 0.90412633723892,
1398
+ "eval_loss": 0.31044328212738037,
1399
+ "eval_runtime": 7.4793,
1400
+ "eval_samples_per_second": 1312.29,
1401
+ "eval_steps_per_second": 2.674,
1402
+ "step": 87000
1403
+ },
1404
+ {
1405
+ "epoch": 7.170795306388527,
1406
+ "grad_norm": 0.5867053270339966,
1407
+ "learning_rate": 0.00026781448687212606,
1408
+ "loss": 0.201,
1409
+ "step": 88000
1410
+ },
1411
+ {
1412
+ "epoch": 7.170795306388527,
1413
+ "eval_accuracy": 0.9038206826286297,
1414
+ "eval_loss": 0.3128410577774048,
1415
+ "eval_runtime": 7.5118,
1416
+ "eval_samples_per_second": 1306.612,
1417
+ "eval_steps_per_second": 2.662,
1418
+ "step": 88000
1419
+ },
1420
+ {
1421
+ "epoch": 7.252281616688396,
1422
+ "grad_norm": 0.5377621650695801,
1423
+ "learning_rate": 0.0002180993162727296,
1424
+ "loss": 0.2005,
1425
+ "step": 89000
1426
+ },
1427
+ {
1428
+ "epoch": 7.252281616688396,
1429
+ "eval_accuracy": 0.9054508405501783,
1430
+ "eval_loss": 0.3119710385799408,
1431
+ "eval_runtime": 7.4957,
1432
+ "eval_samples_per_second": 1309.416,
1433
+ "eval_steps_per_second": 2.668,
1434
+ "step": 89000
1435
+ },
1436
+ {
1437
+ "epoch": 7.333767926988266,
1438
+ "grad_norm": 0.5833735466003418,
1439
+ "learning_rate": 0.00017338237223007313,
1440
+ "loss": 0.2062,
1441
+ "step": 90000
1442
+ },
1443
+ {
1444
+ "epoch": 7.333767926988266,
1445
+ "eval_accuracy": 0.9047376464595007,
1446
+ "eval_loss": 0.3042852580547333,
1447
+ "eval_runtime": 7.4986,
1448
+ "eval_samples_per_second": 1308.908,
1449
+ "eval_steps_per_second": 2.667,
1450
+ "step": 90000
1451
+ },
1452
+ {
1453
+ "epoch": 7.415254237288136,
1454
+ "grad_norm": 0.7836347222328186,
1455
+ "learning_rate": 0.00013371039461435253,
1456
+ "loss": 0.2047,
1457
+ "step": 91000
1458
+ },
1459
+ {
1460
+ "epoch": 7.415254237288136,
1461
+ "eval_accuracy": 0.9048395313295976,
1462
+ "eval_loss": 0.3043782711029053,
1463
+ "eval_runtime": 7.4753,
1464
+ "eval_samples_per_second": 1312.983,
1465
+ "eval_steps_per_second": 2.675,
1466
+ "step": 91000
1467
+ },
1468
+ {
1469
+ "epoch": 7.496740547588005,
1470
+ "grad_norm": 0.4800412654876709,
1471
+ "learning_rate": 9.912485010277361e-05,
1472
+ "loss": 0.2038,
1473
+ "step": 92000
1474
+ },
1475
+ {
1476
+ "epoch": 7.496740547588005,
1477
+ "eval_accuracy": 0.9048395313295976,
1478
+ "eval_loss": 0.3026777505874634,
1479
+ "eval_runtime": 7.5009,
1480
+ "eval_samples_per_second": 1308.515,
1481
+ "eval_steps_per_second": 2.666,
1482
+ "step": 92000
1483
+ },
1484
+ {
1485
+ "epoch": 7.578226857887875,
1486
+ "grad_norm": 0.3577961325645447,
1487
+ "learning_rate": 6.966188883698266e-05,
1488
+ "loss": 0.2043,
1489
+ "step": 93000
1490
+ },
1491
+ {
1492
+ "epoch": 7.578226857887875,
1493
+ "eval_accuracy": 0.9043301069791136,
1494
+ "eval_loss": 0.2996033728122711,
1495
+ "eval_runtime": 7.4793,
1496
+ "eval_samples_per_second": 1312.283,
1497
+ "eval_steps_per_second": 2.674,
1498
+ "step": 93000
1499
+ },
1500
+ {
1501
+ "epoch": 7.659713168187745,
1502
+ "grad_norm": 0.7479351162910461,
1503
+ "learning_rate": 4.53523066375483e-05,
1504
+ "loss": 0.1958,
1505
+ "step": 94000
1506
+ },
1507
+ {
1508
+ "epoch": 7.659713168187745,
1509
+ "eval_accuracy": 0.9039225674987265,
1510
+ "eval_loss": 0.3070015609264374,
1511
+ "eval_runtime": 7.4944,
1512
+ "eval_samples_per_second": 1309.642,
1513
+ "eval_steps_per_second": 2.669,
1514
+ "step": 94000
1515
+ },
1516
+ {
1517
+ "epoch": 7.741199478487614,
1518
+ "grad_norm": 0.593128502368927,
1519
+ "learning_rate": 2.6221512814988413e-05,
1520
+ "loss": 0.2021,
1521
+ "step": 95000
1522
+ },
1523
+ {
1524
+ "epoch": 7.741199478487614,
1525
+ "eval_accuracy": 0.9050433010697911,
1526
+ "eval_loss": 0.3058023154735565,
1527
+ "eval_runtime": 7.5046,
1528
+ "eval_samples_per_second": 1307.858,
1529
+ "eval_steps_per_second": 2.665,
1530
+ "step": 95000
1531
+ },
1532
+ {
1533
+ "epoch": 7.822685788787483,
1534
+ "grad_norm": 0.8814780116081238,
1535
+ "learning_rate": 1.2289503610977692e-05,
1536
+ "loss": 0.2039,
1537
+ "step": 96000
1538
+ },
1539
+ {
1540
+ "epoch": 7.822685788787483,
1541
+ "eval_accuracy": 0.9054508405501783,
1542
+ "eval_loss": 0.30380532145500183,
1543
+ "eval_runtime": 7.4946,
1544
+ "eval_samples_per_second": 1309.603,
1545
+ "eval_steps_per_second": 2.669,
1546
+ "step": 96000
1547
+ },
1548
+ {
1549
+ "epoch": 7.904172099087353,
1550
+ "grad_norm": 1.0658342838287354,
1551
+ "learning_rate": 3.570841297507177e-06,
1552
+ "loss": 0.2,
1553
+ "step": 97000
1554
+ },
1555
+ {
1556
+ "epoch": 7.904172099087353,
1557
+ "eval_accuracy": 0.9051451859398879,
1558
+ "eval_loss": 0.30474814772605896,
1559
+ "eval_runtime": 7.4886,
1560
+ "eval_samples_per_second": 1310.65,
1561
+ "eval_steps_per_second": 2.671,
1562
+ "step": 97000
1563
+ },
1564
+ {
1565
+ "epoch": 7.985658409387223,
1566
+ "grad_norm": 0.4510676860809326,
1567
+ "learning_rate": 7.463895583814661e-08,
1568
+ "loss": 0.2031,
1569
+ "step": 98000
1570
+ },
1571
+ {
1572
+ "epoch": 7.985658409387223,
1573
+ "eval_accuracy": 0.9050433010697911,
1574
+ "eval_loss": 0.3046664297580719,
1575
+ "eval_runtime": 7.4982,
1576
+ "eval_samples_per_second": 1308.98,
1577
+ "eval_steps_per_second": 2.667,
1578
+ "step": 98000
1579
+ },
1580
+ {
1581
+ "epoch": 8.0,
1582
+ "step": 98176,
1583
+ "total_flos": 4.1937110564010394e+17,
1584
+ "train_loss": 0.3294937685617723,
1585
+ "train_runtime": 35336.2891,
1586
+ "train_samples_per_second": 88.906,
1587
+ "train_steps_per_second": 2.778
1588
+ }
1589
+ ],
1590
+ "logging_steps": 1000,
1591
+ "max_steps": 98176,
1592
+ "num_input_tokens_seen": 0,
1593
+ "num_train_epochs": 8,
1594
+ "save_steps": 1000,
1595
+ "stateful_callbacks": {
1596
+ "TrainerControl": {
1597
+ "args": {
1598
+ "should_epoch_stop": false,
1599
+ "should_evaluate": false,
1600
+ "should_log": false,
1601
+ "should_save": true,
1602
+ "should_training_stop": true
1603
+ },
1604
+ "attributes": {}
1605
+ }
1606
+ },
1607
+ "total_flos": 4.1937110564010394e+17,
1608
+ "train_batch_size": 32,
1609
+ "trial_name": null,
1610
+ "trial_params": null
1611
+ }
reproduction/glue_exp/mrpc/dr0.0,mlr6e-03,clr6e-03,ep=60.0t=22d12h19m17/MRPC.tsv ADDED
@@ -0,0 +1,1726 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ index prediction
2
+ 0 1
3
+ 1 1
4
+ 2 1
5
+ 3 0
6
+ 4 0
7
+ 5 1
8
+ 6 0
9
+ 7 1
10
+ 8 1
11
+ 9 0
12
+ 10 1
13
+ 11 1
14
+ 12 1
15
+ 13 0
16
+ 14 1
17
+ 15 0
18
+ 16 1
19
+ 17 1
20
+ 18 1
21
+ 19 1
22
+ 20 1
23
+ 21 1
24
+ 22 1
25
+ 23 1
26
+ 24 0
27
+ 25 1
28
+ 26 1
29
+ 27 1
30
+ 28 0
31
+ 29 0
32
+ 30 1
33
+ 31 0
34
+ 32 0
35
+ 33 0
36
+ 34 0
37
+ 35 0
38
+ 36 1
39
+ 37 1
40
+ 38 0
41
+ 39 1
42
+ 40 1
43
+ 41 1
44
+ 42 0
45
+ 43 1
46
+ 44 1
47
+ 45 0
48
+ 46 0
49
+ 47 0
50
+ 48 1
51
+ 49 1
52
+ 50 1
53
+ 51 1
54
+ 52 0
55
+ 53 1
56
+ 54 0
57
+ 55 1
58
+ 56 1
59
+ 57 1
60
+ 58 0
61
+ 59 0
62
+ 60 1
63
+ 61 1
64
+ 62 1
65
+ 63 1
66
+ 64 1
67
+ 65 1
68
+ 66 1
69
+ 67 1
70
+ 68 1
71
+ 69 1
72
+ 70 0
73
+ 71 1
74
+ 72 1
75
+ 73 0
76
+ 74 1
77
+ 75 0
78
+ 76 1
79
+ 77 1
80
+ 78 0
81
+ 79 0
82
+ 80 1
83
+ 81 0
84
+ 82 1
85
+ 83 0
86
+ 84 0
87
+ 85 0
88
+ 86 1
89
+ 87 1
90
+ 88 0
91
+ 89 0
92
+ 90 1
93
+ 91 1
94
+ 92 1
95
+ 93 0
96
+ 94 1
97
+ 95 0
98
+ 96 0
99
+ 97 1
100
+ 98 0
101
+ 99 1
102
+ 100 1
103
+ 101 0
104
+ 102 1
105
+ 103 1
106
+ 104 1
107
+ 105 0
108
+ 106 0
109
+ 107 1
110
+ 108 1
111
+ 109 1
112
+ 110 1
113
+ 111 0
114
+ 112 0
115
+ 113 1
116
+ 114 1
117
+ 115 0
118
+ 116 0
119
+ 117 0
120
+ 118 1
121
+ 119 1
122
+ 120 1
123
+ 121 0
124
+ 122 1
125
+ 123 1
126
+ 124 1
127
+ 125 1
128
+ 126 1
129
+ 127 1
130
+ 128 0
131
+ 129 1
132
+ 130 0
133
+ 131 1
134
+ 132 1
135
+ 133 0
136
+ 134 1
137
+ 135 0
138
+ 136 1
139
+ 137 1
140
+ 138 1
141
+ 139 1
142
+ 140 1
143
+ 141 1
144
+ 142 1
145
+ 143 1
146
+ 144 1
147
+ 145 1
148
+ 146 0
149
+ 147 1
150
+ 148 1
151
+ 149 1
152
+ 150 1
153
+ 151 1
154
+ 152 0
155
+ 153 0
156
+ 154 1
157
+ 155 1
158
+ 156 1
159
+ 157 1
160
+ 158 1
161
+ 159 1
162
+ 160 1
163
+ 161 1
164
+ 162 1
165
+ 163 1
166
+ 164 0
167
+ 165 1
168
+ 166 1
169
+ 167 0
170
+ 168 1
171
+ 169 0
172
+ 170 1
173
+ 171 1
174
+ 172 1
175
+ 173 0
176
+ 174 0
177
+ 175 1
178
+ 176 1
179
+ 177 0
180
+ 178 1
181
+ 179 1
182
+ 180 0
183
+ 181 0
184
+ 182 1
185
+ 183 1
186
+ 184 1
187
+ 185 1
188
+ 186 0
189
+ 187 0
190
+ 188 1
191
+ 189 1
192
+ 190 0
193
+ 191 0
194
+ 192 1
195
+ 193 1
196
+ 194 0
197
+ 195 0
198
+ 196 0
199
+ 197 1
200
+ 198 1
201
+ 199 0
202
+ 200 0
203
+ 201 1
204
+ 202 1
205
+ 203 1
206
+ 204 0
207
+ 205 0
208
+ 206 1
209
+ 207 1
210
+ 208 0
211
+ 209 1
212
+ 210 0
213
+ 211 0
214
+ 212 1
215
+ 213 0
216
+ 214 1
217
+ 215 0
218
+ 216 1
219
+ 217 1
220
+ 218 1
221
+ 219 1
222
+ 220 1
223
+ 221 0
224
+ 222 1
225
+ 223 0
226
+ 224 1
227
+ 225 1
228
+ 226 1
229
+ 227 0
230
+ 228 0
231
+ 229 1
232
+ 230 1
233
+ 231 1
234
+ 232 1
235
+ 233 1
236
+ 234 0
237
+ 235 0
238
+ 236 1
239
+ 237 1
240
+ 238 0
241
+ 239 0
242
+ 240 1
243
+ 241 1
244
+ 242 1
245
+ 243 1
246
+ 244 0
247
+ 245 1
248
+ 246 1
249
+ 247 0
250
+ 248 1
251
+ 249 0
252
+ 250 1
253
+ 251 0
254
+ 252 0
255
+ 253 1
256
+ 254 1
257
+ 255 1
258
+ 256 1
259
+ 257 1
260
+ 258 1
261
+ 259 1
262
+ 260 1
263
+ 261 1
264
+ 262 1
265
+ 263 1
266
+ 264 0
267
+ 265 1
268
+ 266 0
269
+ 267 1
270
+ 268 1
271
+ 269 1
272
+ 270 1
273
+ 271 0
274
+ 272 1
275
+ 273 0
276
+ 274 0
277
+ 275 0
278
+ 276 0
279
+ 277 1
280
+ 278 1
281
+ 279 1
282
+ 280 1
283
+ 281 0
284
+ 282 1
285
+ 283 1
286
+ 284 1
287
+ 285 0
288
+ 286 1
289
+ 287 0
290
+ 288 0
291
+ 289 0
292
+ 290 0
293
+ 291 1
294
+ 292 1
295
+ 293 1
296
+ 294 1
297
+ 295 0
298
+ 296 0
299
+ 297 0
300
+ 298 1
301
+ 299 1
302
+ 300 1
303
+ 301 1
304
+ 302 0
305
+ 303 1
306
+ 304 1
307
+ 305 0
308
+ 306 1
309
+ 307 1
310
+ 308 0
311
+ 309 1
312
+ 310 1
313
+ 311 1
314
+ 312 0
315
+ 313 1
316
+ 314 1
317
+ 315 1
318
+ 316 1
319
+ 317 1
320
+ 318 1
321
+ 319 0
322
+ 320 1
323
+ 321 0
324
+ 322 1
325
+ 323 0
326
+ 324 1
327
+ 325 1
328
+ 326 1
329
+ 327 0
330
+ 328 0
331
+ 329 0
332
+ 330 0
333
+ 331 1
334
+ 332 1
335
+ 333 1
336
+ 334 1
337
+ 335 0
338
+ 336 1
339
+ 337 1
340
+ 338 1
341
+ 339 1
342
+ 340 1
343
+ 341 1
344
+ 342 1
345
+ 343 0
346
+ 344 0
347
+ 345 1
348
+ 346 0
349
+ 347 0
350
+ 348 1
351
+ 349 0
352
+ 350 1
353
+ 351 1
354
+ 352 0
355
+ 353 1
356
+ 354 0
357
+ 355 0
358
+ 356 0
359
+ 357 0
360
+ 358 1
361
+ 359 0
362
+ 360 0
363
+ 361 0
364
+ 362 0
365
+ 363 1
366
+ 364 0
367
+ 365 1
368
+ 366 0
369
+ 367 0
370
+ 368 0
371
+ 369 0
372
+ 370 1
373
+ 371 1
374
+ 372 1
375
+ 373 1
376
+ 374 1
377
+ 375 0
378
+ 376 0
379
+ 377 1
380
+ 378 0
381
+ 379 0
382
+ 380 1
383
+ 381 1
384
+ 382 0
385
+ 383 1
386
+ 384 1
387
+ 385 0
388
+ 386 1
389
+ 387 0
390
+ 388 1
391
+ 389 1
392
+ 390 1
393
+ 391 0
394
+ 392 0
395
+ 393 1
396
+ 394 0
397
+ 395 1
398
+ 396 1
399
+ 397 1
400
+ 398 0
401
+ 399 0
402
+ 400 1
403
+ 401 0
404
+ 402 1
405
+ 403 1
406
+ 404 1
407
+ 405 0
408
+ 406 0
409
+ 407 1
410
+ 408 1
411
+ 409 1
412
+ 410 0
413
+ 411 0
414
+ 412 1
415
+ 413 1
416
+ 414 1
417
+ 415 0
418
+ 416 0
419
+ 417 1
420
+ 418 1
421
+ 419 1
422
+ 420 1
423
+ 421 1
424
+ 422 1
425
+ 423 1
426
+ 424 0
427
+ 425 0
428
+ 426 0
429
+ 427 1
430
+ 428 0
431
+ 429 1
432
+ 430 0
433
+ 431 1
434
+ 432 1
435
+ 433 1
436
+ 434 1
437
+ 435 1
438
+ 436 1
439
+ 437 0
440
+ 438 1
441
+ 439 0
442
+ 440 0
443
+ 441 0
444
+ 442 1
445
+ 443 1
446
+ 444 1
447
+ 445 1
448
+ 446 1
449
+ 447 1
450
+ 448 1
451
+ 449 1
452
+ 450 0
453
+ 451 1
454
+ 452 1
455
+ 453 0
456
+ 454 1
457
+ 455 1
458
+ 456 1
459
+ 457 1
460
+ 458 0
461
+ 459 1
462
+ 460 1
463
+ 461 1
464
+ 462 1
465
+ 463 0
466
+ 464 0
467
+ 465 0
468
+ 466 1
469
+ 467 1
470
+ 468 1
471
+ 469 1
472
+ 470 0
473
+ 471 1
474
+ 472 1
475
+ 473 1
476
+ 474 1
477
+ 475 1
478
+ 476 1
479
+ 477 1
480
+ 478 1
481
+ 479 0
482
+ 480 1
483
+ 481 1
484
+ 482 1
485
+ 483 1
486
+ 484 0
487
+ 485 1
488
+ 486 1
489
+ 487 1
490
+ 488 1
491
+ 489 1
492
+ 490 1
493
+ 491 1
494
+ 492 1
495
+ 493 1
496
+ 494 0
497
+ 495 1
498
+ 496 0
499
+ 497 1
500
+ 498 0
501
+ 499 1
502
+ 500 1
503
+ 501 1
504
+ 502 1
505
+ 503 0
506
+ 504 1
507
+ 505 1
508
+ 506 0
509
+ 507 1
510
+ 508 0
511
+ 509 0
512
+ 510 1
513
+ 511 1
514
+ 512 0
515
+ 513 1
516
+ 514 1
517
+ 515 1
518
+ 516 1
519
+ 517 0
520
+ 518 1
521
+ 519 1
522
+ 520 1
523
+ 521 0
524
+ 522 1
525
+ 523 1
526
+ 524 1
527
+ 525 1
528
+ 526 0
529
+ 527 1
530
+ 528 1
531
+ 529 1
532
+ 530 1
533
+ 531 1
534
+ 532 1
535
+ 533 0
536
+ 534 1
537
+ 535 1
538
+ 536 1
539
+ 537 1
540
+ 538 0
541
+ 539 1
542
+ 540 1
543
+ 541 1
544
+ 542 1
545
+ 543 1
546
+ 544 1
547
+ 545 1
548
+ 546 1
549
+ 547 0
550
+ 548 0
551
+ 549 0
552
+ 550 0
553
+ 551 1
554
+ 552 0
555
+ 553 1
556
+ 554 0
557
+ 555 1
558
+ 556 0
559
+ 557 1
560
+ 558 1
561
+ 559 1
562
+ 560 1
563
+ 561 0
564
+ 562 0
565
+ 563 0
566
+ 564 0
567
+ 565 1
568
+ 566 1
569
+ 567 1
570
+ 568 0
571
+ 569 0
572
+ 570 0
573
+ 571 1
574
+ 572 1
575
+ 573 0
576
+ 574 1
577
+ 575 0
578
+ 576 1
579
+ 577 0
580
+ 578 1
581
+ 579 0
582
+ 580 1
583
+ 581 0
584
+ 582 1
585
+ 583 1
586
+ 584 1
587
+ 585 1
588
+ 586 0
589
+ 587 1
590
+ 588 1
591
+ 589 1
592
+ 590 1
593
+ 591 1
594
+ 592 1
595
+ 593 0
596
+ 594 0
597
+ 595 1
598
+ 596 1
599
+ 597 0
600
+ 598 1
601
+ 599 1
602
+ 600 1
603
+ 601 1
604
+ 602 1
605
+ 603 1
606
+ 604 1
607
+ 605 1
608
+ 606 1
609
+ 607 0
610
+ 608 0
611
+ 609 1
612
+ 610 1
613
+ 611 1
614
+ 612 0
615
+ 613 1
616
+ 614 1
617
+ 615 1
618
+ 616 1
619
+ 617 0
620
+ 618 1
621
+ 619 1
622
+ 620 1
623
+ 621 0
624
+ 622 1
625
+ 623 0
626
+ 624 1
627
+ 625 0
628
+ 626 1
629
+ 627 1
630
+ 628 0
631
+ 629 1
632
+ 630 0
633
+ 631 1
634
+ 632 1
635
+ 633 0
636
+ 634 0
637
+ 635 1
638
+ 636 0
639
+ 637 1
640
+ 638 1
641
+ 639 1
642
+ 640 1
643
+ 641 1
644
+ 642 1
645
+ 643 1
646
+ 644 1
647
+ 645 0
648
+ 646 1
649
+ 647 1
650
+ 648 1
651
+ 649 1
652
+ 650 1
653
+ 651 0
654
+ 652 0
655
+ 653 1
656
+ 654 0
657
+ 655 1
658
+ 656 1
659
+ 657 1
660
+ 658 1
661
+ 659 1
662
+ 660 1
663
+ 661 1
664
+ 662 1
665
+ 663 1
666
+ 664 1
667
+ 665 0
668
+ 666 1
669
+ 667 0
670
+ 668 0
671
+ 669 0
672
+ 670 1
673
+ 671 1
674
+ 672 0
675
+ 673 1
676
+ 674 1
677
+ 675 0
678
+ 676 1
679
+ 677 1
680
+ 678 1
681
+ 679 1
682
+ 680 0
683
+ 681 1
684
+ 682 1
685
+ 683 1
686
+ 684 0
687
+ 685 1
688
+ 686 1
689
+ 687 0
690
+ 688 1
691
+ 689 1
692
+ 690 1
693
+ 691 0
694
+ 692 0
695
+ 693 1
696
+ 694 0
697
+ 695 0
698
+ 696 0
699
+ 697 1
700
+ 698 0
701
+ 699 1
702
+ 700 0
703
+ 701 1
704
+ 702 1
705
+ 703 0
706
+ 704 1
707
+ 705 0
708
+ 706 0
709
+ 707 0
710
+ 708 1
711
+ 709 1
712
+ 710 0
713
+ 711 0
714
+ 712 1
715
+ 713 1
716
+ 714 1
717
+ 715 1
718
+ 716 1
719
+ 717 0
720
+ 718 1
721
+ 719 1
722
+ 720 1
723
+ 721 1
724
+ 722 0
725
+ 723 0
726
+ 724 1
727
+ 725 1
728
+ 726 1
729
+ 727 1
730
+ 728 1
731
+ 729 0
732
+ 730 1
733
+ 731 1
734
+ 732 1
735
+ 733 0
736
+ 734 1
737
+ 735 1
738
+ 736 1
739
+ 737 1
740
+ 738 1
741
+ 739 1
742
+ 740 1
743
+ 741 1
744
+ 742 1
745
+ 743 1
746
+ 744 1
747
+ 745 0
748
+ 746 0
749
+ 747 0
750
+ 748 1
751
+ 749 0
752
+ 750 1
753
+ 751 1
754
+ 752 1
755
+ 753 0
756
+ 754 1
757
+ 755 1
758
+ 756 1
759
+ 757 0
760
+ 758 1
761
+ 759 1
762
+ 760 1
763
+ 761 1
764
+ 762 1
765
+ 763 1
766
+ 764 1
767
+ 765 1
768
+ 766 1
769
+ 767 1
770
+ 768 1
771
+ 769 1
772
+ 770 1
773
+ 771 1
774
+ 772 1
775
+ 773 1
776
+ 774 1
777
+ 775 1
778
+ 776 1
779
+ 777 1
780
+ 778 0
781
+ 779 1
782
+ 780 0
783
+ 781 1
784
+ 782 1
785
+ 783 0
786
+ 784 1
787
+ 785 1
788
+ 786 1
789
+ 787 0
790
+ 788 1
791
+ 789 1
792
+ 790 1
793
+ 791 1
794
+ 792 0
795
+ 793 1
796
+ 794 1
797
+ 795 1
798
+ 796 0
799
+ 797 0
800
+ 798 1
801
+ 799 0
802
+ 800 0
803
+ 801 0
804
+ 802 1
805
+ 803 1
806
+ 804 0
807
+ 805 1
808
+ 806 1
809
+ 807 0
810
+ 808 1
811
+ 809 0
812
+ 810 1
813
+ 811 1
814
+ 812 1
815
+ 813 1
816
+ 814 0
817
+ 815 1
818
+ 816 0
819
+ 817 0
820
+ 818 0
821
+ 819 1
822
+ 820 0
823
+ 821 0
824
+ 822 0
825
+ 823 1
826
+ 824 1
827
+ 825 1
828
+ 826 1
829
+ 827 0
830
+ 828 0
831
+ 829 1
832
+ 830 1
833
+ 831 1
834
+ 832 1
835
+ 833 0
836
+ 834 1
837
+ 835 0
838
+ 836 1
839
+ 837 1
840
+ 838 1
841
+ 839 0
842
+ 840 0
843
+ 841 0
844
+ 842 1
845
+ 843 1
846
+ 844 0
847
+ 845 1
848
+ 846 1
849
+ 847 1
850
+ 848 0
851
+ 849 1
852
+ 850 0
853
+ 851 1
854
+ 852 1
855
+ 853 0
856
+ 854 1
857
+ 855 0
858
+ 856 1
859
+ 857 1
860
+ 858 0
861
+ 859 1
862
+ 860 1
863
+ 861 1
864
+ 862 1
865
+ 863 1
866
+ 864 0
867
+ 865 1
868
+ 866 1
869
+ 867 1
870
+ 868 1
871
+ 869 1
872
+ 870 0
873
+ 871 1
874
+ 872 1
875
+ 873 1
876
+ 874 1
877
+ 875 0
878
+ 876 0
879
+ 877 0
880
+ 878 0
881
+ 879 0
882
+ 880 0
883
+ 881 0
884
+ 882 1
885
+ 883 0
886
+ 884 0
887
+ 885 1
888
+ 886 1
889
+ 887 1
890
+ 888 0
891
+ 889 1
892
+ 890 1
893
+ 891 1
894
+ 892 0
895
+ 893 0
896
+ 894 0
897
+ 895 1
898
+ 896 0
899
+ 897 1
900
+ 898 1
901
+ 899 1
902
+ 900 1
903
+ 901 1
904
+ 902 0
905
+ 903 1
906
+ 904 1
907
+ 905 1
908
+ 906 1
909
+ 907 1
910
+ 908 1
911
+ 909 1
912
+ 910 1
913
+ 911 1
914
+ 912 1
915
+ 913 0
916
+ 914 1
917
+ 915 1
918
+ 916 1
919
+ 917 1
920
+ 918 0
921
+ 919 1
922
+ 920 1
923
+ 921 1
924
+ 922 1
925
+ 923 1
926
+ 924 1
927
+ 925 1
928
+ 926 1
929
+ 927 0
930
+ 928 1
931
+ 929 1
932
+ 930 1
933
+ 931 0
934
+ 932 0
935
+ 933 0
936
+ 934 0
937
+ 935 1
938
+ 936 0
939
+ 937 0
940
+ 938 1
941
+ 939 1
942
+ 940 1
943
+ 941 1
944
+ 942 0
945
+ 943 1
946
+ 944 1
947
+ 945 1
948
+ 946 1
949
+ 947 1
950
+ 948 0
951
+ 949 1
952
+ 950 0
953
+ 951 1
954
+ 952 0
955
+ 953 1
956
+ 954 1
957
+ 955 1
958
+ 956 1
959
+ 957 1
960
+ 958 1
961
+ 959 1
962
+ 960 1
963
+ 961 1
964
+ 962 1
965
+ 963 0
966
+ 964 0
967
+ 965 1
968
+ 966 1
969
+ 967 1
970
+ 968 1
971
+ 969 1
972
+ 970 1
973
+ 971 0
974
+ 972 0
975
+ 973 1
976
+ 974 1
977
+ 975 1
978
+ 976 1
979
+ 977 1
980
+ 978 1
981
+ 979 1
982
+ 980 1
983
+ 981 1
984
+ 982 0
985
+ 983 1
986
+ 984 0
987
+ 985 0
988
+ 986 1
989
+ 987 0
990
+ 988 1
991
+ 989 0
992
+ 990 0
993
+ 991 1
994
+ 992 1
995
+ 993 1
996
+ 994 0
997
+ 995 1
998
+ 996 1
999
+ 997 1
1000
+ 998 1
1001
+ 999 1
1002
+ 1000 0
1003
+ 1001 1
1004
+ 1002 0
1005
+ 1003 0
1006
+ 1004 1
1007
+ 1005 1
1008
+ 1006 1
1009
+ 1007 1
1010
+ 1008 1
1011
+ 1009 0
1012
+ 1010 1
1013
+ 1011 0
1014
+ 1012 1
1015
+ 1013 1
1016
+ 1014 1
1017
+ 1015 1
1018
+ 1016 1
1019
+ 1017 1
1020
+ 1018 1
1021
+ 1019 1
1022
+ 1020 1
1023
+ 1021 1
1024
+ 1022 1
1025
+ 1023 0
1026
+ 1024 1
1027
+ 1025 0
1028
+ 1026 0
1029
+ 1027 1
1030
+ 1028 0
1031
+ 1029 1
1032
+ 1030 0
1033
+ 1031 1
1034
+ 1032 1
1035
+ 1033 1
1036
+ 1034 1
1037
+ 1035 0
1038
+ 1036 1
1039
+ 1037 0
1040
+ 1038 1
1041
+ 1039 0
1042
+ 1040 0
1043
+ 1041 1
1044
+ 1042 0
1045
+ 1043 0
1046
+ 1044 1
1047
+ 1045 1
1048
+ 1046 0
1049
+ 1047 1
1050
+ 1048 1
1051
+ 1049 1
1052
+ 1050 1
1053
+ 1051 1
1054
+ 1052 0
1055
+ 1053 1
1056
+ 1054 0
1057
+ 1055 1
1058
+ 1056 1
1059
+ 1057 1
1060
+ 1058 1
1061
+ 1059 1
1062
+ 1060 1
1063
+ 1061 1
1064
+ 1062 1
1065
+ 1063 1
1066
+ 1064 1
1067
+ 1065 1
1068
+ 1066 1
1069
+ 1067 1
1070
+ 1068 0
1071
+ 1069 1
1072
+ 1070 1
1073
+ 1071 1
1074
+ 1072 1
1075
+ 1073 1
1076
+ 1074 1
1077
+ 1075 1
1078
+ 1076 1
1079
+ 1077 1
1080
+ 1078 1
1081
+ 1079 1
1082
+ 1080 0
1083
+ 1081 0
1084
+ 1082 1
1085
+ 1083 1
1086
+ 1084 1
1087
+ 1085 1
1088
+ 1086 1
1089
+ 1087 1
1090
+ 1088 1
1091
+ 1089 1
1092
+ 1090 1
1093
+ 1091 0
1094
+ 1092 1
1095
+ 1093 0
1096
+ 1094 1
1097
+ 1095 1
1098
+ 1096 1
1099
+ 1097 1
1100
+ 1098 1
1101
+ 1099 1
1102
+ 1100 1
1103
+ 1101 1
1104
+ 1102 0
1105
+ 1103 1
1106
+ 1104 1
1107
+ 1105 0
1108
+ 1106 1
1109
+ 1107 0
1110
+ 1108 1
1111
+ 1109 1
1112
+ 1110 1
1113
+ 1111 1
1114
+ 1112 0
1115
+ 1113 0
1116
+ 1114 1
1117
+ 1115 1
1118
+ 1116 0
1119
+ 1117 1
1120
+ 1118 1
1121
+ 1119 1
1122
+ 1120 0
1123
+ 1121 0
1124
+ 1122 1
1125
+ 1123 1
1126
+ 1124 0
1127
+ 1125 1
1128
+ 1126 0
1129
+ 1127 0
1130
+ 1128 1
1131
+ 1129 1
1132
+ 1130 0
1133
+ 1131 1
1134
+ 1132 0
1135
+ 1133 1
1136
+ 1134 0
1137
+ 1135 1
1138
+ 1136 0
1139
+ 1137 1
1140
+ 1138 0
1141
+ 1139 0
1142
+ 1140 1
1143
+ 1141 1
1144
+ 1142 1
1145
+ 1143 1
1146
+ 1144 1
1147
+ 1145 1
1148
+ 1146 1
1149
+ 1147 1
1150
+ 1148 0
1151
+ 1149 1
1152
+ 1150 0
1153
+ 1151 1
1154
+ 1152 0
1155
+ 1153 0
1156
+ 1154 0
1157
+ 1155 1
1158
+ 1156 1
1159
+ 1157 0
1160
+ 1158 0
1161
+ 1159 1
1162
+ 1160 0
1163
+ 1161 0
1164
+ 1162 1
1165
+ 1163 1
1166
+ 1164 1
1167
+ 1165 0
1168
+ 1166 0
1169
+ 1167 1
1170
+ 1168 1
1171
+ 1169 0
1172
+ 1170 0
1173
+ 1171 1
1174
+ 1172 1
1175
+ 1173 1
1176
+ 1174 1
1177
+ 1175 0
1178
+ 1176 1
1179
+ 1177 0
1180
+ 1178 1
1181
+ 1179 1
1182
+ 1180 0
1183
+ 1181 1
1184
+ 1182 1
1185
+ 1183 1
1186
+ 1184 1
1187
+ 1185 1
1188
+ 1186 0
1189
+ 1187 1
1190
+ 1188 1
1191
+ 1189 0
1192
+ 1190 0
1193
+ 1191 0
1194
+ 1192 1
1195
+ 1193 1
1196
+ 1194 1
1197
+ 1195 0
1198
+ 1196 0
1199
+ 1197 1
1200
+ 1198 1
1201
+ 1199 1
1202
+ 1200 1
1203
+ 1201 1
1204
+ 1202 0
1205
+ 1203 1
1206
+ 1204 0
1207
+ 1205 1
1208
+ 1206 0
1209
+ 1207 0
1210
+ 1208 1
1211
+ 1209 1
1212
+ 1210 1
1213
+ 1211 1
1214
+ 1212 1
1215
+ 1213 1
1216
+ 1214 1
1217
+ 1215 1
1218
+ 1216 1
1219
+ 1217 1
1220
+ 1218 1
1221
+ 1219 1
1222
+ 1220 1
1223
+ 1221 0
1224
+ 1222 1
1225
+ 1223 1
1226
+ 1224 0
1227
+ 1225 1
1228
+ 1226 0
1229
+ 1227 1
1230
+ 1228 1
1231
+ 1229 1
1232
+ 1230 1
1233
+ 1231 0
1234
+ 1232 1
1235
+ 1233 1
1236
+ 1234 1
1237
+ 1235 1
1238
+ 1236 1
1239
+ 1237 1
1240
+ 1238 1
1241
+ 1239 1
1242
+ 1240 1
1243
+ 1241 0
1244
+ 1242 1
1245
+ 1243 1
1246
+ 1244 1
1247
+ 1245 1
1248
+ 1246 0
1249
+ 1247 0
1250
+ 1248 0
1251
+ 1249 1
1252
+ 1250 1
1253
+ 1251 1
1254
+ 1252 1
1255
+ 1253 1
1256
+ 1254 1
1257
+ 1255 1
1258
+ 1256 1
1259
+ 1257 1
1260
+ 1258 0
1261
+ 1259 1
1262
+ 1260 1
1263
+ 1261 1
1264
+ 1262 1
1265
+ 1263 1
1266
+ 1264 1
1267
+ 1265 1
1268
+ 1266 1
1269
+ 1267 1
1270
+ 1268 1
1271
+ 1269 1
1272
+ 1270 1
1273
+ 1271 1
1274
+ 1272 0
1275
+ 1273 0
1276
+ 1274 0
1277
+ 1275 1
1278
+ 1276 0
1279
+ 1277 1
1280
+ 1278 0
1281
+ 1279 0
1282
+ 1280 0
1283
+ 1281 1
1284
+ 1282 0
1285
+ 1283 0
1286
+ 1284 1
1287
+ 1285 1
1288
+ 1286 1
1289
+ 1287 0
1290
+ 1288 1
1291
+ 1289 1
1292
+ 1290 1
1293
+ 1291 1
1294
+ 1292 0
1295
+ 1293 0
1296
+ 1294 0
1297
+ 1295 1
1298
+ 1296 1
1299
+ 1297 1
1300
+ 1298 1
1301
+ 1299 1
1302
+ 1300 0
1303
+ 1301 1
1304
+ 1302 1
1305
+ 1303 1
1306
+ 1304 1
1307
+ 1305 0
1308
+ 1306 0
1309
+ 1307 1
1310
+ 1308 1
1311
+ 1309 0
1312
+ 1310 1
1313
+ 1311 1
1314
+ 1312 1
1315
+ 1313 1
1316
+ 1314 1
1317
+ 1315 1
1318
+ 1316 1
1319
+ 1317 1
1320
+ 1318 0
1321
+ 1319 1
1322
+ 1320 1
1323
+ 1321 0
1324
+ 1322 1
1325
+ 1323 1
1326
+ 1324 1
1327
+ 1325 1
1328
+ 1326 0
1329
+ 1327 1
1330
+ 1328 1
1331
+ 1329 0
1332
+ 1330 0
1333
+ 1331 1
1334
+ 1332 1
1335
+ 1333 0
1336
+ 1334 1
1337
+ 1335 0
1338
+ 1336 1
1339
+ 1337 0
1340
+ 1338 0
1341
+ 1339 1
1342
+ 1340 0
1343
+ 1341 1
1344
+ 1342 0
1345
+ 1343 0
1346
+ 1344 1
1347
+ 1345 1
1348
+ 1346 1
1349
+ 1347 1
1350
+ 1348 1
1351
+ 1349 1
1352
+ 1350 1
1353
+ 1351 1
1354
+ 1352 1
1355
+ 1353 1
1356
+ 1354 0
1357
+ 1355 0
1358
+ 1356 1
1359
+ 1357 1
1360
+ 1358 0
1361
+ 1359 1
1362
+ 1360 1
1363
+ 1361 1
1364
+ 1362 1
1365
+ 1363 0
1366
+ 1364 1
1367
+ 1365 0
1368
+ 1366 0
1369
+ 1367 0
1370
+ 1368 1
1371
+ 1369 1
1372
+ 1370 0
1373
+ 1371 0
1374
+ 1372 1
1375
+ 1373 1
1376
+ 1374 0
1377
+ 1375 1
1378
+ 1376 0
1379
+ 1377 1
1380
+ 1378 1
1381
+ 1379 1
1382
+ 1380 1
1383
+ 1381 1
1384
+ 1382 1
1385
+ 1383 0
1386
+ 1384 1
1387
+ 1385 1
1388
+ 1386 0
1389
+ 1387 0
1390
+ 1388 1
1391
+ 1389 1
1392
+ 1390 1
1393
+ 1391 0
1394
+ 1392 0
1395
+ 1393 1
1396
+ 1394 1
1397
+ 1395 1
1398
+ 1396 0
1399
+ 1397 1
1400
+ 1398 1
1401
+ 1399 1
1402
+ 1400 0
1403
+ 1401 0
1404
+ 1402 1
1405
+ 1403 0
1406
+ 1404 1
1407
+ 1405 0
1408
+ 1406 1
1409
+ 1407 1
1410
+ 1408 1
1411
+ 1409 1
1412
+ 1410 0
1413
+ 1411 0
1414
+ 1412 1
1415
+ 1413 1
1416
+ 1414 0
1417
+ 1415 1
1418
+ 1416 1
1419
+ 1417 0
1420
+ 1418 1
1421
+ 1419 1
1422
+ 1420 1
1423
+ 1421 1
1424
+ 1422 0
1425
+ 1423 0
1426
+ 1424 0
1427
+ 1425 1
1428
+ 1426 0
1429
+ 1427 1
1430
+ 1428 0
1431
+ 1429 1
1432
+ 1430 0
1433
+ 1431 1
1434
+ 1432 1
1435
+ 1433 0
1436
+ 1434 0
1437
+ 1435 0
1438
+ 1436 1
1439
+ 1437 1
1440
+ 1438 1
1441
+ 1439 1
1442
+ 1440 0
1443
+ 1441 1
1444
+ 1442 0
1445
+ 1443 0
1446
+ 1444 0
1447
+ 1445 1
1448
+ 1446 1
1449
+ 1447 1
1450
+ 1448 0
1451
+ 1449 1
1452
+ 1450 1
1453
+ 1451 0
1454
+ 1452 1
1455
+ 1453 1
1456
+ 1454 1
1457
+ 1455 1
1458
+ 1456 1
1459
+ 1457 1
1460
+ 1458 1
1461
+ 1459 1
1462
+ 1460 1
1463
+ 1461 1
1464
+ 1462 1
1465
+ 1463 0
1466
+ 1464 1
1467
+ 1465 1
1468
+ 1466 1
1469
+ 1467 0
1470
+ 1468 1
1471
+ 1469 0
1472
+ 1470 1
1473
+ 1471 1
1474
+ 1472 0
1475
+ 1473 0
1476
+ 1474 1
1477
+ 1475 0
1478
+ 1476 0
1479
+ 1477 1
1480
+ 1478 0
1481
+ 1479 0
1482
+ 1480 1
1483
+ 1481 0
1484
+ 1482 1
1485
+ 1483 0
1486
+ 1484 0
1487
+ 1485 1
1488
+ 1486 0
1489
+ 1487 1
1490
+ 1488 1
1491
+ 1489 1
1492
+ 1490 0
1493
+ 1491 1
1494
+ 1492 1
1495
+ 1493 0
1496
+ 1494 0
1497
+ 1495 1
1498
+ 1496 1
1499
+ 1497 0
1500
+ 1498 0
1501
+ 1499 0
1502
+ 1500 1
1503
+ 1501 1
1504
+ 1502 0
1505
+ 1503 0
1506
+ 1504 0
1507
+ 1505 1
1508
+ 1506 1
1509
+ 1507 1
1510
+ 1508 1
1511
+ 1509 0
1512
+ 1510 1
1513
+ 1511 1
1514
+ 1512 1
1515
+ 1513 1
1516
+ 1514 0
1517
+ 1515 1
1518
+ 1516 0
1519
+ 1517 1
1520
+ 1518 0
1521
+ 1519 0
1522
+ 1520 0
1523
+ 1521 0
1524
+ 1522 0
1525
+ 1523 1
1526
+ 1524 1
1527
+ 1525 1
1528
+ 1526 1
1529
+ 1527 1
1530
+ 1528 0
1531
+ 1529 1
1532
+ 1530 1
1533
+ 1531 1
1534
+ 1532 1
1535
+ 1533 1
1536
+ 1534 1
1537
+ 1535 0
1538
+ 1536 1
1539
+ 1537 1
1540
+ 1538 1
1541
+ 1539 0
1542
+ 1540 0
1543
+ 1541 1
1544
+ 1542 0
1545
+ 1543 1
1546
+ 1544 0
1547
+ 1545 0
1548
+ 1546 1
1549
+ 1547 0
1550
+ 1548 1
1551
+ 1549 1
1552
+ 1550 1
1553
+ 1551 0
1554
+ 1552 0
1555
+ 1553 0
1556
+ 1554 1
1557
+ 1555 1
1558
+ 1556 0
1559
+ 1557 1
1560
+ 1558 0
1561
+ 1559 1
1562
+ 1560 1
1563
+ 1561 0
1564
+ 1562 1
1565
+ 1563 1
1566
+ 1564 0
1567
+ 1565 0
1568
+ 1566 1
1569
+ 1567 1
1570
+ 1568 1
1571
+ 1569 1
1572
+ 1570 1
1573
+ 1571 1
1574
+ 1572 1
1575
+ 1573 0
1576
+ 1574 1
1577
+ 1575 1
1578
+ 1576 1
1579
+ 1577 0
1580
+ 1578 0
1581
+ 1579 1
1582
+ 1580 1
1583
+ 1581 1
1584
+ 1582 0
1585
+ 1583 1
1586
+ 1584 0
1587
+ 1585 1
1588
+ 1586 1
1589
+ 1587 1
1590
+ 1588 1
1591
+ 1589 1
1592
+ 1590 0
1593
+ 1591 1
1594
+ 1592 1
1595
+ 1593 0
1596
+ 1594 1
1597
+ 1595 1
1598
+ 1596 1
1599
+ 1597 0
1600
+ 1598 0
1601
+ 1599 1
1602
+ 1600 0
1603
+ 1601 1
1604
+ 1602 1
1605
+ 1603 0
1606
+ 1604 0
1607
+ 1605 0
1608
+ 1606 1
1609
+ 1607 1
1610
+ 1608 0
1611
+ 1609 1
1612
+ 1610 0
1613
+ 1611 1
1614
+ 1612 1
1615
+ 1613 1
1616
+ 1614 1
1617
+ 1615 1
1618
+ 1616 1
1619
+ 1617 0
1620
+ 1618 1
1621
+ 1619 1
1622
+ 1620 0
1623
+ 1621 0
1624
+ 1622 1
1625
+ 1623 1
1626
+ 1624 0
1627
+ 1625 0
1628
+ 1626 1
1629
+ 1627 1
1630
+ 1628 1
1631
+ 1629 0
1632
+ 1630 0
1633
+ 1631 1
1634
+ 1632 1
1635
+ 1633 1
1636
+ 1634 1
1637
+ 1635 1
1638
+ 1636 1
1639
+ 1637 1
1640
+ 1638 0
1641
+ 1639 0
1642
+ 1640 0
1643
+ 1641 1
1644
+ 1642 1
1645
+ 1643 1
1646
+ 1644 1
1647
+ 1645 1
1648
+ 1646 1
1649
+ 1647 0
1650
+ 1648 1
1651
+ 1649 1
1652
+ 1650 0
1653
+ 1651 0
1654
+ 1652 0
1655
+ 1653 1
1656
+ 1654 0
1657
+ 1655 1
1658
+ 1656 0
1659
+ 1657 0
1660
+ 1658 1
1661
+ 1659 1
1662
+ 1660 0
1663
+ 1661 0
1664
+ 1662 0
1665
+ 1663 1
1666
+ 1664 1
1667
+ 1665 0
1668
+ 1666 0
1669
+ 1667 1
1670
+ 1668 0
1671
+ 1669 1
1672
+ 1670 0
1673
+ 1671 0
1674
+ 1672 1
1675
+ 1673 1
1676
+ 1674 1
1677
+ 1675 0
1678
+ 1676 1
1679
+ 1677 1
1680
+ 1678 1
1681
+ 1679 1
1682
+ 1680 0
1683
+ 1681 1
1684
+ 1682 1
1685
+ 1683 0
1686
+ 1684 1
1687
+ 1685 1
1688
+ 1686 0
1689
+ 1687 0
1690
+ 1688 1
1691
+ 1689 1
1692
+ 1690 1
1693
+ 1691 0
1694
+ 1692 1
1695
+ 1693 1
1696
+ 1694 1
1697
+ 1695 1
1698
+ 1696 1
1699
+ 1697 0
1700
+ 1698 0
1701
+ 1699 1
1702
+ 1700 0
1703
+ 1701 1
1704
+ 1702 0
1705
+ 1703 1
1706
+ 1704 1
1707
+ 1705 1
1708
+ 1706 0
1709
+ 1707 0
1710
+ 1708 1
1711
+ 1709 1
1712
+ 1710 1
1713
+ 1711 1
1714
+ 1712 1
1715
+ 1713 1
1716
+ 1714 1
1717
+ 1715 1
1718
+ 1716 1
1719
+ 1717 1
1720
+ 1718 0
1721
+ 1719 1
1722
+ 1720 0
1723
+ 1721 0
1724
+ 1722 0
1725
+ 1723 1
1726
+ 1724 1
reproduction/glue_exp/mrpc/dr0.0,mlr6e-03,clr6e-03,ep=60.0t=22d12h19m17/all_results.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 60.0,
3
+ "eval_accuracy": 0.9068627450980392,
4
+ "eval_combined_score": 0.9196235433675249,
5
+ "eval_f1": 0.9323843416370107,
6
+ "eval_loss": 0.38074013590812683,
7
+ "eval_runtime": 0.5403,
8
+ "eval_samples": 408,
9
+ "eval_samples_per_second": 755.198,
10
+ "eval_steps_per_second": 1.851
11
+ }
reproduction/glue_exp/mrpc/dr0.0,mlr6e-03,clr6e-03,ep=60.0t=22d12h19m17/eval_results.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 60.0,
3
+ "eval_accuracy": 0.9068627450980392,
4
+ "eval_combined_score": 0.9196235433675249,
5
+ "eval_f1": 0.9323843416370107,
6
+ "eval_loss": 0.38074013590812683,
7
+ "eval_runtime": 0.5403,
8
+ "eval_samples": 408,
9
+ "eval_samples_per_second": 755.198,
10
+ "eval_steps_per_second": 1.851
11
+ }
reproduction/glue_exp/mrpc/dr0.0,mlr6e-03,clr6e-03,ep=60.0t=22d12h19m17/ft/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[MASK]": 128000
3
+ }
reproduction/glue_exp/mrpc/dr0.0,mlr6e-03,clr6e-03,ep=60.0t=22d12h19m17/ft/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[CLS]",
3
+ "cls_token": "[CLS]",
4
+ "eos_token": "[SEP]",
5
+ "mask_token": "[MASK]",
6
+ "pad_token": "[PAD]",
7
+ "sep_token": "[SEP]",
8
+ "unk_token": {
9
+ "content": "[UNK]",
10
+ "lstrip": false,
11
+ "normalized": true,
12
+ "rstrip": false,
13
+ "single_word": false
14
+ }
15
+ }
reproduction/glue_exp/mrpc/dr0.0,mlr6e-03,clr6e-03,ep=60.0t=22d12h19m17/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
reproduction/glue_exp/mrpc/dr0.0,mlr6e-03,clr6e-03,ep=60.0t=22d12h19m17/ft/tokenizer_config.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[UNK]",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "128000": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "[CLS]",
45
+ "clean_up_tokenization_spaces": false,
46
+ "cls_token": "[CLS]",
47
+ "do_lower_case": false,
48
+ "eos_token": "[SEP]",
49
+ "extra_special_tokens": {},
50
+ "mask_token": "[MASK]",
51
+ "model_max_length": 512,
52
+ "pad_token": "[PAD]",
53
+ "padding_side": "right",
54
+ "sep_token": "[SEP]",
55
+ "sp_model_kwargs": {},
56
+ "split_by_punct": false,
57
+ "tokenizer_class": "DebertaV2Tokenizer",
58
+ "unk_token": "[UNK]",
59
+ "vocab_type": "spm"
60
+ }
reproduction/glue_exp/mrpc/dr0.0,mlr6e-03,clr6e-03,ep=60.0t=22d12h19m17/ft2/README.md ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: microsoft/deberta-v3-base
3
+ library_name: peft
4
+ tags:
5
+ - base_model:adapter:microsoft/deberta-v3-base
6
+ - transformers
7
+ ---
8
+
9
+ # Model Card for Model ID
10
+
11
+ <!-- Provide a quick summary of what the model is/does. -->
12
+
13
+
14
+
15
+ ## Model Details
16
+
17
+ ### Model Description
18
+
19
+ <!-- Provide a longer summary of what this model is. -->
20
+
21
+
22
+
23
+ - **Developed by:** [More Information Needed]
24
+ - **Funded by [optional]:** [More Information Needed]
25
+ - **Shared by [optional]:** [More Information Needed]
26
+ - **Model type:** [More Information Needed]
27
+ - **Language(s) (NLP):** [More Information Needed]
28
+ - **License:** [More Information Needed]
29
+ - **Finetuned from model [optional]:** [More Information Needed]
30
+
31
+ ### Model Sources [optional]
32
+
33
+ <!-- Provide the basic links for the model. -->
34
+
35
+ - **Repository:** [More Information Needed]
36
+ - **Paper [optional]:** [More Information Needed]
37
+ - **Demo [optional]:** [More Information Needed]
38
+
39
+ ## Uses
40
+
41
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
42
+
43
+ ### Direct Use
44
+
45
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
46
+
47
+ [More Information Needed]
48
+
49
+ ### Downstream Use [optional]
50
+
51
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
52
+
53
+ [More Information Needed]
54
+
55
+ ### Out-of-Scope Use
56
+
57
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
58
+
59
+ [More Information Needed]
60
+
61
+ ## Bias, Risks, and Limitations
62
+
63
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
64
+
65
+ [More Information Needed]
66
+
67
+ ### Recommendations
68
+
69
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
70
+
71
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
72
+
73
+ ## How to Get Started with the Model
74
+
75
+ Use the code below to get started with the model.
76
+
77
+ [More Information Needed]
78
+
79
+ ## Training Details
80
+
81
+ ### Training Data
82
+
83
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
84
+
85
+ [More Information Needed]
86
+
87
+ ### Training Procedure
88
+
89
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
90
+
91
+ #### Preprocessing [optional]
92
+
93
+ [More Information Needed]
94
+
95
+
96
+ #### Training Hyperparameters
97
+
98
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
99
+
100
+ #### Speeds, Sizes, Times [optional]
101
+
102
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
103
+
104
+ [More Information Needed]
105
+
106
+ ## Evaluation
107
+
108
+ <!-- This section describes the evaluation protocols and provides the results. -->
109
+
110
+ ### Testing Data, Factors & Metrics
111
+
112
+ #### Testing Data
113
+
114
+ <!-- This should link to a Dataset Card if possible. -->
115
+
116
+ [More Information Needed]
117
+
118
+ #### Factors
119
+
120
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
121
+
122
+ [More Information Needed]
123
+
124
+ #### Metrics
125
+
126
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
127
+
128
+ [More Information Needed]
129
+
130
+ ### Results
131
+
132
+ [More Information Needed]
133
+
134
+ #### Summary
135
+
136
+
137
+
138
+ ## Model Examination [optional]
139
+
140
+ <!-- Relevant interpretability work for the model goes here -->
141
+
142
+ [More Information Needed]
143
+
144
+ ## Environmental Impact
145
+
146
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
147
+
148
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
149
+
150
+ - **Hardware Type:** [More Information Needed]
151
+ - **Hours used:** [More Information Needed]
152
+ - **Cloud Provider:** [More Information Needed]
153
+ - **Compute Region:** [More Information Needed]
154
+ - **Carbon Emitted:** [More Information Needed]
155
+
156
+ ## Technical Specifications [optional]
157
+
158
+ ### Model Architecture and Objective
159
+
160
+ [More Information Needed]
161
+
162
+ ### Compute Infrastructure
163
+
164
+ [More Information Needed]
165
+
166
+ #### Hardware
167
+
168
+ [More Information Needed]
169
+
170
+ #### Software
171
+
172
+ [More Information Needed]
173
+
174
+ ## Citation [optional]
175
+
176
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
177
+
178
+ **BibTeX:**
179
+
180
+ [More Information Needed]
181
+
182
+ **APA:**
183
+
184
+ [More Information Needed]
185
+
186
+ ## Glossary [optional]
187
+
188
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
189
+
190
+ [More Information Needed]
191
+
192
+ ## More Information [optional]
193
+
194
+ [More Information Needed]
195
+
196
+ ## Model Card Authors [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Contact
201
+
202
+ [More Information Needed]
203
+ ### Framework versions
204
+
205
+ - PEFT 0.18.0
reproduction/glue_exp/mrpc/dr0.0,mlr6e-03,clr6e-03,ep=60.0t=22d12h19m17/ft2/adapter_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "apply_GS": false,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "microsoft/deberta-v3-base",
5
+ "bias": "none",
6
+ "exclude_modules": null,
7
+ "inference_mode": true,
8
+ "init_weights": true,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "modules_to_save": [
12
+ "classifier",
13
+ "pooler",
14
+ "classifier",
15
+ "score"
16
+ ],
17
+ "peft_type": "HRA",
18
+ "peft_version": "0.18.0",
19
+ "r": 8,
20
+ "revision": null,
21
+ "target_modules": [
22
+ "key_proj",
23
+ "value_proj",
24
+ "output.dense",
25
+ "intermediate.dense",
26
+ "query_proj",
27
+ "attention.output.dense"
28
+ ],
29
+ "task_type": "SEQ_CLS"
30
+ }
reproduction/glue_exp/mrpc/dr0.0,mlr6e-03,clr6e-03,ep=60.0t=22d12h19m17/trainer_state.json ADDED
@@ -0,0 +1,1285 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 500,
3
+ "best_metric": 0.9068627450980392,
4
+ "best_model_checkpoint": "./glue_exp/mrpc/dr0.0,mlr6e-03,clr6e-03,ep=60.0t=22d12h19m17/checkpoint-500",
5
+ "epoch": 60.0,
6
+ "eval_steps": 100,
7
+ "global_step": 6900,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.8695652173913043,
14
+ "grad_norm": 2.6364493370056152,
15
+ "learning_rate": 0.0059992290521279626,
16
+ "loss": 0.5476,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.8695652173913043,
21
+ "eval_accuracy": 0.8848039215686274,
22
+ "eval_combined_score": 0.9006612858287186,
23
+ "eval_f1": 0.91651865008881,
24
+ "eval_loss": 0.28565412759780884,
25
+ "eval_runtime": 0.8654,
26
+ "eval_samples_per_second": 471.485,
27
+ "eval_steps_per_second": 1.156,
28
+ "step": 100
29
+ },
30
+ {
31
+ "epoch": 1.7391304347826086,
32
+ "grad_norm": 0.4555858075618744,
33
+ "learning_rate": 0.005992873898843122,
34
+ "loss": 0.399,
35
+ "step": 200
36
+ },
37
+ {
38
+ "epoch": 1.7391304347826086,
39
+ "eval_accuracy": 0.8553921568627451,
40
+ "eval_combined_score": 0.8792560127499276,
41
+ "eval_f1": 0.90311986863711,
42
+ "eval_loss": 0.34397462010383606,
43
+ "eval_runtime": 0.7166,
44
+ "eval_samples_per_second": 569.383,
45
+ "eval_steps_per_second": 1.396,
46
+ "step": 200
47
+ },
48
+ {
49
+ "epoch": 2.608695652173913,
50
+ "grad_norm": 2.3028626441955566,
51
+ "learning_rate": 0.0059801129785423835,
52
+ "loss": 0.3385,
53
+ "step": 300
54
+ },
55
+ {
56
+ "epoch": 2.608695652173913,
57
+ "eval_accuracy": 0.8627450980392157,
58
+ "eval_combined_score": 0.8851679285575617,
59
+ "eval_f1": 0.9075907590759076,
60
+ "eval_loss": 0.3621212840080261,
61
+ "eval_runtime": 0.7189,
62
+ "eval_samples_per_second": 567.504,
63
+ "eval_steps_per_second": 1.391,
64
+ "step": 300
65
+ },
66
+ {
67
+ "epoch": 3.4782608695652173,
68
+ "grad_norm": 0.3876798152923584,
69
+ "learning_rate": 0.005960973603930901,
70
+ "loss": 0.2599,
71
+ "step": 400
72
+ },
73
+ {
74
+ "epoch": 3.4782608695652173,
75
+ "eval_accuracy": 0.8774509803921569,
76
+ "eval_combined_score": 0.8963526088401462,
77
+ "eval_f1": 0.9152542372881356,
78
+ "eval_loss": 0.3706170618534088,
79
+ "eval_runtime": 0.7175,
80
+ "eval_samples_per_second": 568.678,
81
+ "eval_steps_per_second": 1.394,
82
+ "step": 400
83
+ },
84
+ {
85
+ "epoch": 4.3478260869565215,
86
+ "grad_norm": 2.169461965560913,
87
+ "learning_rate": 0.00593549673977322,
88
+ "loss": 0.2635,
89
+ "step": 500
90
+ },
91
+ {
92
+ "epoch": 4.3478260869565215,
93
+ "eval_accuracy": 0.9068627450980392,
94
+ "eval_combined_score": 0.9196235433675249,
95
+ "eval_f1": 0.9323843416370107,
96
+ "eval_loss": 0.38074013590812683,
97
+ "eval_runtime": 0.7024,
98
+ "eval_samples_per_second": 580.861,
99
+ "eval_steps_per_second": 1.424,
100
+ "step": 500
101
+ },
102
+ {
103
+ "epoch": 5.217391304347826,
104
+ "grad_norm": 1.3256586790084839,
105
+ "learning_rate": 0.005903736915214777,
106
+ "loss": 0.2076,
107
+ "step": 600
108
+ },
109
+ {
110
+ "epoch": 5.217391304347826,
111
+ "eval_accuracy": 0.875,
112
+ "eval_combined_score": 0.8946428571428571,
113
+ "eval_f1": 0.9142857142857143,
114
+ "eval_loss": 0.3881168067455292,
115
+ "eval_runtime": 0.7474,
116
+ "eval_samples_per_second": 545.861,
117
+ "eval_steps_per_second": 1.338,
118
+ "step": 600
119
+ },
120
+ {
121
+ "epoch": 6.086956521739131,
122
+ "grad_norm": 0.6423662900924683,
123
+ "learning_rate": 0.005865762107070985,
124
+ "loss": 0.1466,
125
+ "step": 700
126
+ },
127
+ {
128
+ "epoch": 6.086956521739131,
129
+ "eval_accuracy": 0.8504901960784313,
130
+ "eval_combined_score": 0.8736376530307555,
131
+ "eval_f1": 0.8967851099830795,
132
+ "eval_loss": 0.6260377764701843,
133
+ "eval_runtime": 0.7506,
134
+ "eval_samples_per_second": 543.577,
135
+ "eval_steps_per_second": 1.332,
136
+ "step": 700
137
+ },
138
+ {
139
+ "epoch": 6.956521739130435,
140
+ "grad_norm": 1.2912263870239258,
141
+ "learning_rate": 0.005821653594333766,
142
+ "loss": 0.125,
143
+ "step": 800
144
+ },
145
+ {
146
+ "epoch": 6.956521739130435,
147
+ "eval_accuracy": 0.8872549019607843,
148
+ "eval_combined_score": 0.9041085506367494,
149
+ "eval_f1": 0.9209621993127147,
150
+ "eval_loss": 0.34258151054382324,
151
+ "eval_runtime": 0.7263,
152
+ "eval_samples_per_second": 561.786,
153
+ "eval_steps_per_second": 1.377,
154
+ "step": 800
155
+ },
156
+ {
157
+ "epoch": 7.826086956521739,
158
+ "grad_norm": 1.239829182624817,
159
+ "learning_rate": 0.005771505784206885,
160
+ "loss": 0.109,
161
+ "step": 900
162
+ },
163
+ {
164
+ "epoch": 7.826086956521739,
165
+ "eval_accuracy": 0.8897058823529411,
166
+ "eval_combined_score": 0.9065224471389919,
167
+ "eval_f1": 0.9233390119250426,
168
+ "eval_loss": 0.5580412149429321,
169
+ "eval_runtime": 0.7066,
170
+ "eval_samples_per_second": 577.381,
171
+ "eval_steps_per_second": 1.415,
172
+ "step": 900
173
+ },
174
+ {
175
+ "epoch": 8.695652173913043,
176
+ "grad_norm": 0.47330760955810547,
177
+ "learning_rate": 0.005715426010042459,
178
+ "loss": 0.0882,
179
+ "step": 1000
180
+ },
181
+ {
182
+ "epoch": 8.695652173913043,
183
+ "eval_accuracy": 0.875,
184
+ "eval_combined_score": 0.8949290484140233,
185
+ "eval_f1": 0.9148580968280468,
186
+ "eval_loss": 0.4662654995918274,
187
+ "eval_runtime": 0.5408,
188
+ "eval_samples_per_second": 754.477,
189
+ "eval_steps_per_second": 1.849,
190
+ "step": 1000
191
+ },
192
+ {
193
+ "epoch": 9.565217391304348,
194
+ "grad_norm": 0.23926566541194916,
195
+ "learning_rate": 0.00565353430161112,
196
+ "loss": 0.0773,
197
+ "step": 1100
198
+ },
199
+ {
200
+ "epoch": 9.565217391304348,
201
+ "eval_accuracy": 0.8970588235294118,
202
+ "eval_combined_score": 0.9108950031625553,
203
+ "eval_f1": 0.9247311827956989,
204
+ "eval_loss": 0.3885922133922577,
205
+ "eval_runtime": 0.5403,
206
+ "eval_samples_per_second": 755.163,
207
+ "eval_steps_per_second": 1.851,
208
+ "step": 1100
209
+ },
210
+ {
211
+ "epoch": 10.434782608695652,
212
+ "grad_norm": 0.9964650869369507,
213
+ "learning_rate": 0.005585963128197517,
214
+ "loss": 0.0588,
215
+ "step": 1200
216
+ },
217
+ {
218
+ "epoch": 10.434782608695652,
219
+ "eval_accuracy": 0.8995098039215687,
220
+ "eval_combined_score": 0.9133428238080312,
221
+ "eval_f1": 0.9271758436944938,
222
+ "eval_loss": 0.6083597540855408,
223
+ "eval_runtime": 0.5378,
224
+ "eval_samples_per_second": 758.652,
225
+ "eval_steps_per_second": 1.859,
226
+ "step": 1200
227
+ },
228
+ {
229
+ "epoch": 11.304347826086957,
230
+ "grad_norm": 0.002499501220881939,
231
+ "learning_rate": 0.005512857115071042,
232
+ "loss": 0.054,
233
+ "step": 1300
234
+ },
235
+ {
236
+ "epoch": 11.304347826086957,
237
+ "eval_accuracy": 0.8897058823529411,
238
+ "eval_combined_score": 0.9055859254696643,
239
+ "eval_f1": 0.9214659685863874,
240
+ "eval_loss": 0.7781792283058167,
241
+ "eval_runtime": 0.5407,
242
+ "eval_samples_per_second": 754.578,
243
+ "eval_steps_per_second": 1.849,
244
+ "step": 1300
245
+ },
246
+ {
247
+ "epoch": 12.173913043478262,
248
+ "grad_norm": 0.10482348501682281,
249
+ "learning_rate": 0.005434372733938616,
250
+ "loss": 0.0627,
251
+ "step": 1400
252
+ },
253
+ {
254
+ "epoch": 12.173913043478262,
255
+ "eval_accuracy": 0.9044117647058824,
256
+ "eval_combined_score": 0.9180552693932214,
257
+ "eval_f1": 0.9316987740805605,
258
+ "eval_loss": 0.4779199957847595,
259
+ "eval_runtime": 0.5432,
260
+ "eval_samples_per_second": 751.158,
261
+ "eval_steps_per_second": 1.841,
262
+ "step": 1400
263
+ },
264
+ {
265
+ "epoch": 13.043478260869565,
266
+ "grad_norm": 0.16774187982082367,
267
+ "learning_rate": 0.00535067796804207,
268
+ "loss": 0.0439,
269
+ "step": 1500
270
+ },
271
+ {
272
+ "epoch": 13.043478260869565,
273
+ "eval_accuracy": 0.8995098039215687,
274
+ "eval_combined_score": 0.9137267824528756,
275
+ "eval_f1": 0.9279437609841827,
276
+ "eval_loss": 0.7853822112083435,
277
+ "eval_runtime": 0.5436,
278
+ "eval_samples_per_second": 750.58,
279
+ "eval_steps_per_second": 1.84,
280
+ "step": 1500
281
+ },
282
+ {
283
+ "epoch": 13.91304347826087,
284
+ "grad_norm": 1.0404678583145142,
285
+ "learning_rate": 0.005261951952616936,
286
+ "loss": 0.0698,
287
+ "step": 1600
288
+ },
289
+ {
290
+ "epoch": 13.91304347826087,
291
+ "eval_accuracy": 0.8897058823529411,
292
+ "eval_combined_score": 0.9063914027149321,
293
+ "eval_f1": 0.9230769230769231,
294
+ "eval_loss": 0.996893048286438,
295
+ "eval_runtime": 0.5398,
296
+ "eval_samples_per_second": 755.831,
297
+ "eval_steps_per_second": 1.853,
298
+ "step": 1600
299
+ },
300
+ {
301
+ "epoch": 14.782608695652174,
302
+ "grad_norm": 0.4925846457481384,
303
+ "learning_rate": 0.005168384591482175,
304
+ "loss": 0.0524,
305
+ "step": 1700
306
+ },
307
+ {
308
+ "epoch": 14.782608695652174,
309
+ "eval_accuracy": 0.8872549019607843,
310
+ "eval_combined_score": 0.9043783042227129,
311
+ "eval_f1": 0.9215017064846417,
312
+ "eval_loss": 0.5238045454025269,
313
+ "eval_runtime": 0.5425,
314
+ "eval_samples_per_second": 752.115,
315
+ "eval_steps_per_second": 1.843,
316
+ "step": 1700
317
+ },
318
+ {
319
+ "epoch": 15.652173913043478,
320
+ "grad_norm": 0.03307371959090233,
321
+ "learning_rate": 0.005070176150581485,
322
+ "loss": 0.0558,
323
+ "step": 1800
324
+ },
325
+ {
326
+ "epoch": 15.652173913043478,
327
+ "eval_accuracy": 0.8872549019607843,
328
+ "eval_combined_score": 0.9032765737874097,
329
+ "eval_f1": 0.9192982456140351,
330
+ "eval_loss": 0.7846351861953735,
331
+ "eval_runtime": 0.5402,
332
+ "eval_samples_per_second": 755.211,
333
+ "eval_steps_per_second": 1.851,
334
+ "step": 1800
335
+ },
336
+ {
337
+ "epoch": 16.52173913043478,
338
+ "grad_norm": 0.20798969268798828,
339
+ "learning_rate": 0.004967536829346139,
340
+ "loss": 0.0504,
341
+ "step": 1900
342
+ },
343
+ {
344
+ "epoch": 16.52173913043478,
345
+ "eval_accuracy": 0.8872549019607843,
346
+ "eval_combined_score": 0.9042438893365565,
347
+ "eval_f1": 0.9212328767123288,
348
+ "eval_loss": 0.7127761244773865,
349
+ "eval_runtime": 0.5411,
350
+ "eval_samples_per_second": 753.985,
351
+ "eval_steps_per_second": 1.848,
352
+ "step": 1900
353
+ },
354
+ {
355
+ "epoch": 17.391304347826086,
356
+ "grad_norm": 0.006357874721288681,
357
+ "learning_rate": 0.004860686310796779,
358
+ "loss": 0.0235,
359
+ "step": 2000
360
+ },
361
+ {
362
+ "epoch": 17.391304347826086,
363
+ "eval_accuracy": 0.8946078431372549,
364
+ "eval_combined_score": 0.9102987580574399,
365
+ "eval_f1": 0.9259896729776248,
366
+ "eval_loss": 0.8576990962028503,
367
+ "eval_runtime": 0.5397,
368
+ "eval_samples_per_second": 755.98,
369
+ "eval_steps_per_second": 1.853,
370
+ "step": 2000
371
+ },
372
+ {
373
+ "epoch": 18.26086956521739,
374
+ "grad_norm": 0.04779256880283356,
375
+ "learning_rate": 0.004749853291347118,
376
+ "loss": 0.0319,
377
+ "step": 2100
378
+ },
379
+ {
380
+ "epoch": 18.26086956521739,
381
+ "eval_accuracy": 0.8897058823529411,
382
+ "eval_combined_score": 0.9059928375495275,
383
+ "eval_f1": 0.9222797927461139,
384
+ "eval_loss": 0.6823519468307495,
385
+ "eval_runtime": 0.5406,
386
+ "eval_samples_per_second": 754.673,
387
+ "eval_steps_per_second": 1.85,
388
+ "step": 2100
389
+ },
390
+ {
391
+ "epoch": 19.130434782608695,
392
+ "grad_norm": 0.08435946702957153,
393
+ "learning_rate": 0.004635274991315905,
394
+ "loss": 0.0229,
395
+ "step": 2200
396
+ },
397
+ {
398
+ "epoch": 19.130434782608695,
399
+ "eval_accuracy": 0.8921568627450981,
400
+ "eval_combined_score": 0.9078839869281046,
401
+ "eval_f1": 0.9236111111111112,
402
+ "eval_loss": 0.7368404865264893,
403
+ "eval_runtime": 1.5391,
404
+ "eval_samples_per_second": 265.095,
405
+ "eval_steps_per_second": 0.65,
406
+ "step": 2200
407
+ },
408
+ {
409
+ "epoch": 20.0,
410
+ "grad_norm": 0.1149434819817543,
411
+ "learning_rate": 0.004517196647194848,
412
+ "loss": 0.0166,
413
+ "step": 2300
414
+ },
415
+ {
416
+ "epoch": 20.0,
417
+ "eval_accuracy": 0.8799019607843137,
418
+ "eval_combined_score": 0.8976366453317077,
419
+ "eval_f1": 0.9153713298791019,
420
+ "eval_loss": 0.9132684469223022,
421
+ "eval_runtime": 0.5393,
422
+ "eval_samples_per_second": 756.59,
423
+ "eval_steps_per_second": 1.854,
424
+ "step": 2300
425
+ },
426
+ {
427
+ "epoch": 20.869565217391305,
428
+ "grad_norm": 0.17054922878742218,
429
+ "learning_rate": 0.004395870986759199,
430
+ "loss": 0.0187,
431
+ "step": 2400
432
+ },
433
+ {
434
+ "epoch": 20.869565217391305,
435
+ "eval_accuracy": 0.8799019607843137,
436
+ "eval_combined_score": 0.8964341065022812,
437
+ "eval_f1": 0.9129662522202486,
438
+ "eval_loss": 0.45014092326164246,
439
+ "eval_runtime": 0.543,
440
+ "eval_samples_per_second": 751.419,
441
+ "eval_steps_per_second": 1.842,
442
+ "step": 2400
443
+ },
444
+ {
445
+ "epoch": 21.73913043478261,
446
+ "grad_norm": 0.4253406226634979,
447
+ "learning_rate": 0.004271557688144445,
448
+ "loss": 0.0201,
449
+ "step": 2500
450
+ },
451
+ {
452
+ "epoch": 21.73913043478261,
453
+ "eval_accuracy": 0.8602941176470589,
454
+ "eval_combined_score": 0.8795253891965312,
455
+ "eval_f1": 0.8987566607460036,
456
+ "eval_loss": 0.5724949836730957,
457
+ "eval_runtime": 0.5422,
458
+ "eval_samples_per_second": 752.426,
459
+ "eval_steps_per_second": 1.844,
460
+ "step": 2500
461
+ },
462
+ {
463
+ "epoch": 22.608695652173914,
464
+ "grad_norm": 0.06475929915904999,
465
+ "learning_rate": 0.00414452282404687,
466
+ "loss": 0.0342,
467
+ "step": 2600
468
+ },
469
+ {
470
+ "epoch": 22.608695652173914,
471
+ "eval_accuracy": 0.8872549019607843,
472
+ "eval_combined_score": 0.9041085506367494,
473
+ "eval_f1": 0.9209621993127147,
474
+ "eval_loss": 0.45861247181892395,
475
+ "eval_runtime": 0.5425,
476
+ "eval_samples_per_second": 752.044,
477
+ "eval_steps_per_second": 1.843,
478
+ "step": 2600
479
+ },
480
+ {
481
+ "epoch": 23.47826086956522,
482
+ "grad_norm": 0.0008664391352795064,
483
+ "learning_rate": 0.004015038292237584,
484
+ "loss": 0.0206,
485
+ "step": 2700
486
+ },
487
+ {
488
+ "epoch": 23.47826086956522,
489
+ "eval_accuracy": 0.8872549019607843,
490
+ "eval_combined_score": 0.903972278566599,
491
+ "eval_f1": 0.9206896551724137,
492
+ "eval_loss": 0.7849236130714417,
493
+ "eval_runtime": 0.7931,
494
+ "eval_samples_per_second": 514.44,
495
+ "eval_steps_per_second": 1.261,
496
+ "step": 2700
497
+ },
498
+ {
499
+ "epoch": 24.347826086956523,
500
+ "grad_norm": 0.3173099756240845,
501
+ "learning_rate": 0.0038833812336089082,
502
+ "loss": 0.0141,
503
+ "step": 2800
504
+ },
505
+ {
506
+ "epoch": 24.347826086956523,
507
+ "eval_accuracy": 0.8725490196078431,
508
+ "eval_combined_score": 0.8920568227290917,
509
+ "eval_f1": 0.9115646258503401,
510
+ "eval_loss": 0.9240702986717224,
511
+ "eval_runtime": 1.7547,
512
+ "eval_samples_per_second": 232.52,
513
+ "eval_steps_per_second": 0.57,
514
+ "step": 2800
515
+ },
516
+ {
517
+ "epoch": 25.217391304347824,
518
+ "grad_norm": 0.25173866748809814,
519
+ "learning_rate": 0.003749833438998706,
520
+ "loss": 0.0275,
521
+ "step": 2900
522
+ },
523
+ {
524
+ "epoch": 25.217391304347824,
525
+ "eval_accuracy": 0.8872549019607843,
526
+ "eval_combined_score": 0.9028473091364205,
527
+ "eval_f1": 0.9184397163120568,
528
+ "eval_loss": 0.5891702771186829,
529
+ "eval_runtime": 0.8016,
530
+ "eval_samples_per_second": 509.011,
531
+ "eval_steps_per_second": 1.248,
532
+ "step": 2900
533
+ },
534
+ {
535
+ "epoch": 26.08695652173913,
536
+ "grad_norm": 0.08256181329488754,
537
+ "learning_rate": 0.0036146807460622453,
538
+ "loss": 0.0152,
539
+ "step": 3000
540
+ },
541
+ {
542
+ "epoch": 26.08695652173913,
543
+ "eval_accuracy": 0.8897058823529411,
544
+ "eval_combined_score": 0.9059928375495275,
545
+ "eval_f1": 0.9222797927461139,
546
+ "eval_loss": 0.7931877374649048,
547
+ "eval_runtime": 1.7507,
548
+ "eval_samples_per_second": 233.049,
549
+ "eval_steps_per_second": 0.571,
550
+ "step": 3000
551
+ },
552
+ {
553
+ "epoch": 26.956521739130434,
554
+ "grad_norm": 0.08613915741443634,
555
+ "learning_rate": 0.0034782124274825013,
556
+ "loss": 0.0184,
557
+ "step": 3100
558
+ },
559
+ {
560
+ "epoch": 26.956521739130434,
561
+ "eval_accuracy": 0.8872549019607843,
562
+ "eval_combined_score": 0.9008765216123624,
563
+ "eval_f1": 0.9144981412639405,
564
+ "eval_loss": 0.5767039656639099,
565
+ "eval_runtime": 1.3941,
566
+ "eval_samples_per_second": 292.665,
567
+ "eval_steps_per_second": 0.717,
568
+ "step": 3100
569
+ },
570
+ {
571
+ "epoch": 27.82608695652174,
572
+ "grad_norm": 0.07717280089855194,
573
+ "learning_rate": 0.0033407205718283266,
574
+ "loss": 0.0206,
575
+ "step": 3200
576
+ },
577
+ {
578
+ "epoch": 27.82608695652174,
579
+ "eval_accuracy": 0.8946078431372549,
580
+ "eval_combined_score": 0.9092508242234947,
581
+ "eval_f1": 0.9238938053097345,
582
+ "eval_loss": 0.6821743249893188,
583
+ "eval_runtime": 1.6183,
584
+ "eval_samples_per_second": 252.122,
585
+ "eval_steps_per_second": 0.618,
586
+ "step": 3200
587
+ },
588
+ {
589
+ "epoch": 28.695652173913043,
590
+ "grad_norm": 1.320895791053772,
591
+ "learning_rate": 0.0032024994583856776,
592
+ "loss": 0.0174,
593
+ "step": 3300
594
+ },
595
+ {
596
+ "epoch": 28.695652173913043,
597
+ "eval_accuracy": 0.8921568627450981,
598
+ "eval_combined_score": 0.9081473968897904,
599
+ "eval_f1": 0.9241379310344827,
600
+ "eval_loss": 0.5172853469848633,
601
+ "eval_runtime": 0.5397,
602
+ "eval_samples_per_second": 756.022,
603
+ "eval_steps_per_second": 1.853,
604
+ "step": 3300
605
+ },
606
+ {
607
+ "epoch": 29.565217391304348,
608
+ "grad_norm": 0.01968921534717083,
609
+ "learning_rate": 0.0030638449272999513,
610
+ "loss": 0.0179,
611
+ "step": 3400
612
+ },
613
+ {
614
+ "epoch": 29.565217391304348,
615
+ "eval_accuracy": 0.8921568627450981,
616
+ "eval_combined_score": 0.9069325238992394,
617
+ "eval_f1": 0.9217081850533808,
618
+ "eval_loss": 0.7016431093215942,
619
+ "eval_runtime": 0.5391,
620
+ "eval_samples_per_second": 756.825,
621
+ "eval_steps_per_second": 1.855,
622
+ "step": 3400
623
+ },
624
+ {
625
+ "epoch": 30.434782608695652,
626
+ "grad_norm": 0.003688236465677619,
627
+ "learning_rate": 0.0029250537463775617,
628
+ "loss": 0.0086,
629
+ "step": 3500
630
+ },
631
+ {
632
+ "epoch": 30.434782608695652,
633
+ "eval_accuracy": 0.8848039215686274,
634
+ "eval_combined_score": 0.9015323955669223,
635
+ "eval_f1": 0.9182608695652174,
636
+ "eval_loss": 0.805417001247406,
637
+ "eval_runtime": 0.5431,
638
+ "eval_samples_per_second": 751.254,
639
+ "eval_steps_per_second": 1.841,
640
+ "step": 3500
641
+ },
642
+ {
643
+ "epoch": 31.304347826086957,
644
+ "grad_norm": 0.005005138926208019,
645
+ "learning_rate": 0.0027864229759020013,
646
+ "loss": 0.0044,
647
+ "step": 3600
648
+ },
649
+ {
650
+ "epoch": 31.304347826086957,
651
+ "eval_accuracy": 0.8872549019607843,
652
+ "eval_combined_score": 0.9043783042227129,
653
+ "eval_f1": 0.9215017064846417,
654
+ "eval_loss": 1.432096242904663,
655
+ "eval_runtime": 0.5394,
656
+ "eval_samples_per_second": 756.399,
657
+ "eval_steps_per_second": 1.854,
658
+ "step": 3600
659
+ },
660
+ {
661
+ "epoch": 32.17391304347826,
662
+ "grad_norm": 0.00238438555970788,
663
+ "learning_rate": 0.0026482493328239083,
664
+ "loss": 0.0168,
665
+ "step": 3700
666
+ },
667
+ {
668
+ "epoch": 32.17391304347826,
669
+ "eval_accuracy": 0.8897058823529411,
670
+ "eval_combined_score": 0.9054483877614092,
671
+ "eval_f1": 0.9211908931698775,
672
+ "eval_loss": 0.9366185665130615,
673
+ "eval_runtime": 0.5425,
674
+ "eval_samples_per_second": 752.1,
675
+ "eval_steps_per_second": 1.843,
676
+ "step": 3700
677
+ },
678
+ {
679
+ "epoch": 33.04347826086956,
680
+ "grad_norm": 0.0020455929916352034,
681
+ "learning_rate": 0.0025108285556859683,
682
+ "loss": 0.0116,
683
+ "step": 3800
684
+ },
685
+ {
686
+ "epoch": 33.04347826086956,
687
+ "eval_accuracy": 0.8897058823529411,
688
+ "eval_combined_score": 0.9041657802361451,
689
+ "eval_f1": 0.918625678119349,
690
+ "eval_loss": 0.7671505808830261,
691
+ "eval_runtime": 0.5401,
692
+ "eval_samples_per_second": 755.459,
693
+ "eval_steps_per_second": 1.852,
694
+ "step": 3800
695
+ },
696
+ {
697
+ "epoch": 33.91304347826087,
698
+ "grad_norm": 0.0003456902632024139,
699
+ "learning_rate": 0.0023744547716419632,
700
+ "loss": 0.0074,
701
+ "step": 3900
702
+ },
703
+ {
704
+ "epoch": 33.91304347826087,
705
+ "eval_accuracy": 0.8946078431372549,
706
+ "eval_combined_score": 0.9089795008912656,
707
+ "eval_f1": 0.9233511586452763,
708
+ "eval_loss": 0.8340775966644287,
709
+ "eval_runtime": 0.5432,
710
+ "eval_samples_per_second": 751.15,
711
+ "eval_steps_per_second": 1.841,
712
+ "step": 3900
713
+ },
714
+ {
715
+ "epoch": 34.78260869565217,
716
+ "grad_norm": 0.0021141970064491034,
717
+ "learning_rate": 0.00223941986692472,
718
+ "loss": 0.0045,
719
+ "step": 4000
720
+ },
721
+ {
722
+ "epoch": 34.78260869565217,
723
+ "eval_accuracy": 0.9044117647058824,
724
+ "eval_combined_score": 0.9174465240641712,
725
+ "eval_f1": 0.93048128342246,
726
+ "eval_loss": 0.7846585512161255,
727
+ "eval_runtime": 0.5416,
728
+ "eval_samples_per_second": 753.34,
729
+ "eval_steps_per_second": 1.846,
730
+ "step": 4000
731
+ },
732
+ {
733
+ "epoch": 35.65217391304348,
734
+ "grad_norm": 0.0007770864176563919,
735
+ "learning_rate": 0.002106012862110405,
736
+ "loss": 0.0028,
737
+ "step": 4100
738
+ },
739
+ {
740
+ "epoch": 35.65217391304348,
741
+ "eval_accuracy": 0.9044117647058824,
742
+ "eval_combined_score": 0.9178143479614067,
743
+ "eval_f1": 0.9312169312169312,
744
+ "eval_loss": 0.7687886357307434,
745
+ "eval_runtime": 0.5399,
746
+ "eval_samples_per_second": 755.746,
747
+ "eval_steps_per_second": 1.852,
748
+ "step": 4100
749
+ },
750
+ {
751
+ "epoch": 36.52173913043478,
752
+ "grad_norm": 0.012195412069559097,
753
+ "learning_rate": 0.0019745192935162865,
754
+ "loss": 0.0031,
755
+ "step": 4200
756
+ },
757
+ {
758
+ "epoch": 36.52173913043478,
759
+ "eval_accuracy": 0.8897058823529411,
760
+ "eval_combined_score": 0.9044579681064526,
761
+ "eval_f1": 0.9192100538599641,
762
+ "eval_loss": 0.7898321747779846,
763
+ "eval_runtime": 0.5404,
764
+ "eval_samples_per_second": 754.959,
765
+ "eval_steps_per_second": 1.85,
766
+ "step": 4200
767
+ },
768
+ {
769
+ "epoch": 37.391304347826086,
770
+ "grad_norm": 0.0025567489210516214,
771
+ "learning_rate": 0.0018452206020560069,
772
+ "loss": 0.004,
773
+ "step": 4300
774
+ },
775
+ {
776
+ "epoch": 37.391304347826086,
777
+ "eval_accuracy": 0.8897058823529411,
778
+ "eval_combined_score": 0.9050299323269131,
779
+ "eval_f1": 0.9203539823008849,
780
+ "eval_loss": 0.7720369100570679,
781
+ "eval_runtime": 0.5397,
782
+ "eval_samples_per_second": 755.98,
783
+ "eval_steps_per_second": 1.853,
784
+ "step": 4300
785
+ },
786
+ {
787
+ "epoch": 38.26086956521739,
788
+ "grad_norm": 0.0025629671290516853,
789
+ "learning_rate": 0.0017183935308603985,
790
+ "loss": 0.0033,
791
+ "step": 4400
792
+ },
793
+ {
794
+ "epoch": 38.26086956521739,
795
+ "eval_accuracy": 0.8872549019607843,
796
+ "eval_combined_score": 0.9042438893365565,
797
+ "eval_f1": 0.9212328767123288,
798
+ "eval_loss": 0.9692898988723755,
799
+ "eval_runtime": 0.538,
800
+ "eval_samples_per_second": 758.342,
801
+ "eval_steps_per_second": 1.859,
802
+ "step": 4400
803
+ },
804
+ {
805
+ "epoch": 39.130434782608695,
806
+ "grad_norm": 5.716999658034183e-05,
807
+ "learning_rate": 0.0015943095329531598,
808
+ "loss": 0.0021,
809
+ "step": 4500
810
+ },
811
+ {
812
+ "epoch": 39.130434782608695,
813
+ "eval_accuracy": 0.9019607843137255,
814
+ "eval_combined_score": 0.9160153571918278,
815
+ "eval_f1": 0.9300699300699301,
816
+ "eval_loss": 1.1709413528442383,
817
+ "eval_runtime": 0.5404,
818
+ "eval_samples_per_second": 755.034,
819
+ "eval_steps_per_second": 1.851,
820
+ "step": 4500
821
+ },
822
+ {
823
+ "epoch": 40.0,
824
+ "grad_norm": 5.226567736826837e-05,
825
+ "learning_rate": 0.001473234190249152,
826
+ "loss": 0.0029,
827
+ "step": 4600
828
+ },
829
+ {
830
+ "epoch": 40.0,
831
+ "eval_accuracy": 0.8946078431372549,
832
+ "eval_combined_score": 0.9095183328164307,
833
+ "eval_f1": 0.9244288224956063,
834
+ "eval_loss": 1.1188499927520752,
835
+ "eval_runtime": 0.5428,
836
+ "eval_samples_per_second": 751.701,
837
+ "eval_steps_per_second": 1.842,
838
+ "step": 4600
839
+ },
840
+ {
841
+ "epoch": 40.869565217391305,
842
+ "grad_norm": 0.00022964319214224815,
843
+ "learning_rate": 0.001355426645118869,
844
+ "loss": 0.0039,
845
+ "step": 4700
846
+ },
847
+ {
848
+ "epoch": 40.869565217391305,
849
+ "eval_accuracy": 0.8970588235294118,
850
+ "eval_combined_score": 0.9118161250514192,
851
+ "eval_f1": 0.9265734265734266,
852
+ "eval_loss": 1.0468120574951172,
853
+ "eval_runtime": 0.538,
854
+ "eval_samples_per_second": 758.342,
855
+ "eval_steps_per_second": 1.859,
856
+ "step": 4700
857
+ },
858
+ {
859
+ "epoch": 41.73913043478261,
860
+ "grad_norm": 0.00023312283155974,
861
+ "learning_rate": 0.0012411390457357126,
862
+ "loss": 0.0016,
863
+ "step": 4800
864
+ },
865
+ {
866
+ "epoch": 41.73913043478261,
867
+ "eval_accuracy": 0.8970588235294118,
868
+ "eval_combined_score": 0.9114269382664727,
869
+ "eval_f1": 0.9257950530035336,
870
+ "eval_loss": 1.1018487215042114,
871
+ "eval_runtime": 0.5401,
872
+ "eval_samples_per_second": 755.406,
873
+ "eval_steps_per_second": 1.851,
874
+ "step": 4800
875
+ },
876
+ {
877
+ "epoch": 42.608695652173914,
878
+ "grad_norm": 1.0152802133234218e-05,
879
+ "learning_rate": 0.0011306160063932343,
880
+ "loss": 0.0017,
881
+ "step": 4900
882
+ },
883
+ {
884
+ "epoch": 42.608695652173914,
885
+ "eval_accuracy": 0.8921568627450981,
886
+ "eval_combined_score": 0.907209173422019,
887
+ "eval_f1": 0.9222614840989399,
888
+ "eval_loss": 1.0843183994293213,
889
+ "eval_runtime": 0.5403,
890
+ "eval_samples_per_second": 755.111,
891
+ "eval_steps_per_second": 1.851,
892
+ "step": 4900
893
+ },
894
+ {
895
+ "epoch": 43.47826086956522,
896
+ "grad_norm": 0.0011523779248818755,
897
+ "learning_rate": 0.0010240940839474372,
898
+ "loss": 0.0059,
899
+ "step": 5000
900
+ },
901
+ {
902
+ "epoch": 43.47826086956522,
903
+ "eval_accuracy": 0.9019607843137255,
904
+ "eval_combined_score": 0.9163783160322954,
905
+ "eval_f1": 0.9307958477508651,
906
+ "eval_loss": 0.9805396795272827,
907
+ "eval_runtime": 0.54,
908
+ "eval_samples_per_second": 755.537,
909
+ "eval_steps_per_second": 1.852,
910
+ "step": 5000
911
+ },
912
+ {
913
+ "epoch": 44.34782608695652,
914
+ "grad_norm": 0.003142524277791381,
915
+ "learning_rate": 0.0009218012715047219,
916
+ "loss": 0.0017,
917
+ "step": 5100
918
+ },
919
+ {
920
+ "epoch": 44.34782608695652,
921
+ "eval_accuracy": 0.8921568627450981,
922
+ "eval_combined_score": 0.9069325238992394,
923
+ "eval_f1": 0.9217081850533808,
924
+ "eval_loss": 0.8588517308235168,
925
+ "eval_runtime": 0.5423,
926
+ "eval_samples_per_second": 752.371,
927
+ "eval_steps_per_second": 1.844,
928
+ "step": 5100
929
+ },
930
+ {
931
+ "epoch": 45.21739130434783,
932
+ "grad_norm": 0.00017512345220893621,
933
+ "learning_rate": 0.0008239565104391876,
934
+ "loss": 0.0017,
935
+ "step": 5200
936
+ },
937
+ {
938
+ "epoch": 45.21739130434783,
939
+ "eval_accuracy": 0.8995098039215687,
940
+ "eval_combined_score": 0.9138529755159507,
941
+ "eval_f1": 0.9281961471103327,
942
+ "eval_loss": 1.147383689880371,
943
+ "eval_runtime": 0.54,
944
+ "eval_samples_per_second": 755.561,
945
+ "eval_steps_per_second": 1.852,
946
+ "step": 5200
947
+ },
948
+ {
949
+ "epoch": 46.08695652173913,
950
+ "grad_norm": 0.0001278241106774658,
951
+ "learning_rate": 0.0007307692217836915,
952
+ "loss": 0.0009,
953
+ "step": 5300
954
+ },
955
+ {
956
+ "epoch": 46.08695652173913,
957
+ "eval_accuracy": 0.8995098039215687,
958
+ "eval_combined_score": 0.9138529755159507,
959
+ "eval_f1": 0.9281961471103327,
960
+ "eval_loss": 1.2277849912643433,
961
+ "eval_runtime": 0.5402,
962
+ "eval_samples_per_second": 755.325,
963
+ "eval_steps_per_second": 1.851,
964
+ "step": 5300
965
+ },
966
+ {
967
+ "epoch": 46.95652173913044,
968
+ "grad_norm": 0.0006020450382493436,
969
+ "learning_rate": 0.0006424388579976914,
970
+ "loss": 0.0031,
971
+ "step": 5400
972
+ },
973
+ {
974
+ "epoch": 46.95652173913044,
975
+ "eval_accuracy": 0.8946078431372549,
976
+ "eval_combined_score": 0.9097821065599014,
977
+ "eval_f1": 0.924956369982548,
978
+ "eval_loss": 1.220462441444397,
979
+ "eval_runtime": 0.543,
980
+ "eval_samples_per_second": 751.421,
981
+ "eval_steps_per_second": 1.842,
982
+ "step": 5400
983
+ },
984
+ {
985
+ "epoch": 47.82608695652174,
986
+ "grad_norm": 2.0725792637676932e-05,
987
+ "learning_rate": 0.0005591544760712127,
988
+ "loss": 0.001,
989
+ "step": 5500
990
+ },
991
+ {
992
+ "epoch": 47.82608695652174,
993
+ "eval_accuracy": 0.8946078431372549,
994
+ "eval_combined_score": 0.9101709336584375,
995
+ "eval_f1": 0.92573402417962,
996
+ "eval_loss": 1.453669786453247,
997
+ "eval_runtime": 0.5399,
998
+ "eval_samples_per_second": 755.634,
999
+ "eval_steps_per_second": 1.852,
1000
+ "step": 5500
1001
+ },
1002
+ {
1003
+ "epoch": 48.69565217391305,
1004
+ "grad_norm": 0.00041490638977847993,
1005
+ "learning_rate": 0.0004810943328786581,
1006
+ "loss": 0.0031,
1007
+ "step": 5600
1008
+ },
1009
+ {
1010
+ "epoch": 48.69565217391305,
1011
+ "eval_accuracy": 0.8921568627450981,
1012
+ "eval_combined_score": 0.907209173422019,
1013
+ "eval_f1": 0.9222614840989399,
1014
+ "eval_loss": 1.014076590538025,
1015
+ "eval_runtime": 0.5383,
1016
+ "eval_samples_per_second": 757.912,
1017
+ "eval_steps_per_second": 1.858,
1018
+ "step": 5600
1019
+ },
1020
+ {
1021
+ "epoch": 49.56521739130435,
1022
+ "grad_norm": 0.0008342143846675754,
1023
+ "learning_rate": 0.0004084255036485247,
1024
+ "loss": 0.0008,
1025
+ "step": 5700
1026
+ },
1027
+ {
1028
+ "epoch": 49.56521739130435,
1029
+ "eval_accuracy": 0.8995098039215687,
1030
+ "eval_combined_score": 0.9137267824528756,
1031
+ "eval_f1": 0.9279437609841827,
1032
+ "eval_loss": 1.0565646886825562,
1033
+ "eval_runtime": 0.5406,
1034
+ "eval_samples_per_second": 754.734,
1035
+ "eval_steps_per_second": 1.85,
1036
+ "step": 5700
1037
+ },
1038
+ {
1039
+ "epoch": 50.43478260869565,
1040
+ "grad_norm": 0.00030943751335144043,
1041
+ "learning_rate": 0.0003413035243656567,
1042
+ "loss": 0.0006,
1043
+ "step": 5800
1044
+ },
1045
+ {
1046
+ "epoch": 50.43478260869565,
1047
+ "eval_accuracy": 0.9044117647058824,
1048
+ "eval_combined_score": 0.9181744687403757,
1049
+ "eval_f1": 0.9319371727748691,
1050
+ "eval_loss": 1.1727582216262817,
1051
+ "eval_runtime": 0.542,
1052
+ "eval_samples_per_second": 752.737,
1053
+ "eval_steps_per_second": 1.845,
1054
+ "step": 5800
1055
+ },
1056
+ {
1057
+ "epoch": 51.30434782608695,
1058
+ "grad_norm": 1.8360608009970747e-05,
1059
+ "learning_rate": 0.00027987205887138636,
1060
+ "loss": 0.0019,
1061
+ "step": 5900
1062
+ },
1063
+ {
1064
+ "epoch": 51.30434782608695,
1065
+ "eval_accuracy": 0.9019607843137255,
1066
+ "eval_combined_score": 0.9163783160322954,
1067
+ "eval_f1": 0.9307958477508651,
1068
+ "eval_loss": 1.293489694595337,
1069
+ "eval_runtime": 0.5401,
1070
+ "eval_samples_per_second": 755.456,
1071
+ "eval_steps_per_second": 1.852,
1072
+ "step": 5900
1073
+ },
1074
+ {
1075
+ "epoch": 52.17391304347826,
1076
+ "grad_norm": 0.2986358106136322,
1077
+ "learning_rate": 0.00022426259137410366,
1078
+ "loss": 0.0026,
1079
+ "step": 6000
1080
+ },
1081
+ {
1082
+ "epoch": 52.17391304347826,
1083
+ "eval_accuracy": 0.8970588235294118,
1084
+ "eval_combined_score": 0.9123225152129817,
1085
+ "eval_f1": 0.9275862068965517,
1086
+ "eval_loss": 1.2229700088500977,
1087
+ "eval_runtime": 0.5412,
1088
+ "eval_samples_per_second": 753.884,
1089
+ "eval_steps_per_second": 1.848,
1090
+ "step": 6000
1091
+ },
1092
+ {
1093
+ "epoch": 53.04347826086956,
1094
+ "grad_norm": 0.00012920332665089518,
1095
+ "learning_rate": 0.00017459414502837722,
1096
+ "loss": 0.0007,
1097
+ "step": 6100
1098
+ },
1099
+ {
1100
+ "epoch": 53.04347826086956,
1101
+ "eval_accuracy": 0.9019607843137255,
1102
+ "eval_combined_score": 0.9163783160322954,
1103
+ "eval_f1": 0.9307958477508651,
1104
+ "eval_loss": 1.1866824626922607,
1105
+ "eval_runtime": 0.5394,
1106
+ "eval_samples_per_second": 756.357,
1107
+ "eval_steps_per_second": 1.854,
1108
+ "step": 6100
1109
+ },
1110
+ {
1111
+ "epoch": 53.91304347826087,
1112
+ "grad_norm": 0.00025677334633655846,
1113
+ "learning_rate": 0.00013097302718496274,
1114
+ "loss": 0.0007,
1115
+ "step": 6200
1116
+ },
1117
+ {
1118
+ "epoch": 53.91304347826087,
1119
+ "eval_accuracy": 0.9019607843137255,
1120
+ "eval_combined_score": 0.9163783160322954,
1121
+ "eval_f1": 0.9307958477508651,
1122
+ "eval_loss": 1.1940966844558716,
1123
+ "eval_runtime": 0.5396,
1124
+ "eval_samples_per_second": 756.156,
1125
+ "eval_steps_per_second": 1.853,
1126
+ "step": 6200
1127
+ },
1128
+ {
1129
+ "epoch": 54.78260869565217,
1130
+ "grad_norm": 6.180995114846155e-05,
1131
+ "learning_rate": 9.349260185695385e-05,
1132
+ "loss": 0.0006,
1133
+ "step": 6300
1134
+ },
1135
+ {
1136
+ "epoch": 54.78260869565217,
1137
+ "eval_accuracy": 0.9019607843137255,
1138
+ "eval_combined_score": 0.9163783160322954,
1139
+ "eval_f1": 0.9307958477508651,
1140
+ "eval_loss": 1.1967113018035889,
1141
+ "eval_runtime": 0.5396,
1142
+ "eval_samples_per_second": 756.101,
1143
+ "eval_steps_per_second": 1.853,
1144
+ "step": 6300
1145
+ },
1146
+ {
1147
+ "epoch": 55.65217391304348,
1148
+ "grad_norm": 3.121919871773571e-05,
1149
+ "learning_rate": 6.223308988907062e-05,
1150
+ "loss": 0.0009,
1151
+ "step": 6400
1152
+ },
1153
+ {
1154
+ "epoch": 55.65217391304348,
1155
+ "eval_accuracy": 0.8995098039215687,
1156
+ "eval_combined_score": 0.9139782876501386,
1157
+ "eval_f1": 0.9284467713787086,
1158
+ "eval_loss": 1.158166766166687,
1159
+ "eval_runtime": 0.5404,
1160
+ "eval_samples_per_second": 755.002,
1161
+ "eval_steps_per_second": 1.85,
1162
+ "step": 6400
1163
+ },
1164
+ {
1165
+ "epoch": 56.52173913043478,
1166
+ "grad_norm": 8.268425881396979e-05,
1167
+ "learning_rate": 3.726139725779809e-05,
1168
+ "loss": 0.0013,
1169
+ "step": 6500
1170
+ },
1171
+ {
1172
+ "epoch": 56.52173913043478,
1173
+ "eval_accuracy": 0.8970588235294118,
1174
+ "eval_combined_score": 0.9118161250514192,
1175
+ "eval_f1": 0.9265734265734266,
1176
+ "eval_loss": 1.143733263015747,
1177
+ "eval_runtime": 0.5392,
1178
+ "eval_samples_per_second": 756.74,
1179
+ "eval_steps_per_second": 1.855,
1180
+ "step": 6500
1181
+ },
1182
+ {
1183
+ "epoch": 57.391304347826086,
1184
+ "grad_norm": 4.56096458947286e-05,
1185
+ "learning_rate": 1.8630971869861734e-05,
1186
+ "loss": 0.0009,
1187
+ "step": 6600
1188
+ },
1189
+ {
1190
+ "epoch": 57.391304347826086,
1191
+ "eval_accuracy": 0.8970588235294118,
1192
+ "eval_combined_score": 0.9118161250514192,
1193
+ "eval_f1": 0.9265734265734266,
1194
+ "eval_loss": 1.1513959169387817,
1195
+ "eval_runtime": 0.5406,
1196
+ "eval_samples_per_second": 754.69,
1197
+ "eval_steps_per_second": 1.85,
1198
+ "step": 6600
1199
+ },
1200
+ {
1201
+ "epoch": 58.26086956521739,
1202
+ "grad_norm": 0.005534951575100422,
1203
+ "learning_rate": 6.381689165550264e-06,
1204
+ "loss": 0.0009,
1205
+ "step": 6700
1206
+ },
1207
+ {
1208
+ "epoch": 58.26086956521739,
1209
+ "eval_accuracy": 0.8970588235294118,
1210
+ "eval_combined_score": 0.9118161250514192,
1211
+ "eval_f1": 0.9265734265734266,
1212
+ "eval_loss": 1.1539251804351807,
1213
+ "eval_runtime": 0.5414,
1214
+ "eval_samples_per_second": 753.553,
1215
+ "eval_steps_per_second": 1.847,
1216
+ "step": 6700
1217
+ },
1218
+ {
1219
+ "epoch": 59.130434782608695,
1220
+ "grad_norm": 0.0001028048136504367,
1221
+ "learning_rate": 5.397667717218502e-07,
1222
+ "loss": 0.0006,
1223
+ "step": 6800
1224
+ },
1225
+ {
1226
+ "epoch": 59.130434782608695,
1227
+ "eval_accuracy": 0.8970588235294118,
1228
+ "eval_combined_score": 0.9118161250514192,
1229
+ "eval_f1": 0.9265734265734266,
1230
+ "eval_loss": 1.153951644897461,
1231
+ "eval_runtime": 0.5392,
1232
+ "eval_samples_per_second": 756.67,
1233
+ "eval_steps_per_second": 1.855,
1234
+ "step": 6800
1235
+ },
1236
+ {
1237
+ "epoch": 60.0,
1238
+ "grad_norm": 0.00017295156430918723,
1239
+ "learning_rate": 1.1177083871778404e-06,
1240
+ "loss": 0.0011,
1241
+ "step": 6900
1242
+ },
1243
+ {
1244
+ "epoch": 60.0,
1245
+ "eval_accuracy": 0.8970588235294118,
1246
+ "eval_combined_score": 0.9118161250514192,
1247
+ "eval_f1": 0.9265734265734266,
1248
+ "eval_loss": 1.1541002988815308,
1249
+ "eval_runtime": 0.5394,
1250
+ "eval_samples_per_second": 756.444,
1251
+ "eval_steps_per_second": 1.854,
1252
+ "step": 6900
1253
+ },
1254
+ {
1255
+ "epoch": 60.0,
1256
+ "step": 6900,
1257
+ "total_flos": 3.67221676566528e+16,
1258
+ "train_loss": 0.04985012825902389,
1259
+ "train_runtime": 3922.5682,
1260
+ "train_samples_per_second": 56.106,
1261
+ "train_steps_per_second": 1.759
1262
+ }
1263
+ ],
1264
+ "logging_steps": 100,
1265
+ "max_steps": 6900,
1266
+ "num_input_tokens_seen": 0,
1267
+ "num_train_epochs": 60,
1268
+ "save_steps": 100,
1269
+ "stateful_callbacks": {
1270
+ "TrainerControl": {
1271
+ "args": {
1272
+ "should_epoch_stop": false,
1273
+ "should_evaluate": false,
1274
+ "should_log": false,
1275
+ "should_save": true,
1276
+ "should_training_stop": true
1277
+ },
1278
+ "attributes": {}
1279
+ }
1280
+ },
1281
+ "total_flos": 3.67221676566528e+16,
1282
+ "train_batch_size": 32,
1283
+ "trial_name": null,
1284
+ "trial_params": null
1285
+ }
reproduction/glue_exp/qnli/dr0.0,mlr1e-02,clr1e-02,ep=12.0t=20d22h35m39/QNLI.tsv ADDED
The diff for this file is too large to render. See raw diff
 
reproduction/glue_exp/qnli/dr0.0,mlr1e-02,clr1e-02,ep=12.0t=20d22h35m39/all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 12.0,
3
+ "eval_accuracy": 0.942339373970346,
4
+ "eval_loss": 0.16657477617263794,
5
+ "eval_runtime": 10.6426,
6
+ "eval_samples": 5463,
7
+ "eval_samples_per_second": 513.315,
8
+ "eval_steps_per_second": 0.564
9
+ }
reproduction/glue_exp/qnli/dr0.0,mlr1e-02,clr1e-02,ep=12.0t=20d22h35m39/eval_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 12.0,
3
+ "eval_accuracy": 0.942339373970346,
4
+ "eval_loss": 0.16657477617263794,
5
+ "eval_runtime": 10.6426,
6
+ "eval_samples": 5463,
7
+ "eval_samples_per_second": 513.315,
8
+ "eval_steps_per_second": 0.564
9
+ }
reproduction/glue_exp/qnli/dr0.0,mlr1e-02,clr1e-02,ep=12.0t=20d22h35m39/ft/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[MASK]": 128000
3
+ }
reproduction/glue_exp/qnli/dr0.0,mlr1e-02,clr1e-02,ep=12.0t=20d22h35m39/ft/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[CLS]",
3
+ "cls_token": "[CLS]",
4
+ "eos_token": "[SEP]",
5
+ "mask_token": "[MASK]",
6
+ "pad_token": "[PAD]",
7
+ "sep_token": "[SEP]",
8
+ "unk_token": {
9
+ "content": "[UNK]",
10
+ "lstrip": false,
11
+ "normalized": true,
12
+ "rstrip": false,
13
+ "single_word": false
14
+ }
15
+ }
reproduction/glue_exp/qnli/dr0.0,mlr1e-02,clr1e-02,ep=12.0t=20d22h35m39/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
reproduction/glue_exp/qnli/dr0.0,mlr1e-02,clr1e-02,ep=12.0t=20d22h35m39/ft/tokenizer_config.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[UNK]",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "128000": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "[CLS]",
45
+ "clean_up_tokenization_spaces": false,
46
+ "cls_token": "[CLS]",
47
+ "do_lower_case": false,
48
+ "eos_token": "[SEP]",
49
+ "extra_special_tokens": {},
50
+ "mask_token": "[MASK]",
51
+ "model_max_length": 512,
52
+ "pad_token": "[PAD]",
53
+ "padding_side": "right",
54
+ "sep_token": "[SEP]",
55
+ "sp_model_kwargs": {},
56
+ "split_by_punct": false,
57
+ "tokenizer_class": "DebertaV2Tokenizer",
58
+ "unk_token": "[UNK]",
59
+ "vocab_type": "spm"
60
+ }
reproduction/glue_exp/qnli/dr0.0,mlr1e-02,clr1e-02,ep=12.0t=20d22h35m39/ft2/README.md ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: microsoft/deberta-v3-base
3
+ library_name: peft
4
+ tags:
5
+ - base_model:adapter:microsoft/deberta-v3-base
6
+ - transformers
7
+ ---
8
+
9
+ # Model Card for Model ID
10
+
11
+ <!-- Provide a quick summary of what the model is/does. -->
12
+
13
+
14
+
15
+ ## Model Details
16
+
17
+ ### Model Description
18
+
19
+ <!-- Provide a longer summary of what this model is. -->
20
+
21
+
22
+
23
+ - **Developed by:** [More Information Needed]
24
+ - **Funded by [optional]:** [More Information Needed]
25
+ - **Shared by [optional]:** [More Information Needed]
26
+ - **Model type:** [More Information Needed]
27
+ - **Language(s) (NLP):** [More Information Needed]
28
+ - **License:** [More Information Needed]
29
+ - **Finetuned from model [optional]:** [More Information Needed]
30
+
31
+ ### Model Sources [optional]
32
+
33
+ <!-- Provide the basic links for the model. -->
34
+
35
+ - **Repository:** [More Information Needed]
36
+ - **Paper [optional]:** [More Information Needed]
37
+ - **Demo [optional]:** [More Information Needed]
38
+
39
+ ## Uses
40
+
41
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
42
+
43
+ ### Direct Use
44
+
45
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
46
+
47
+ [More Information Needed]
48
+
49
+ ### Downstream Use [optional]
50
+
51
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
52
+
53
+ [More Information Needed]
54
+
55
+ ### Out-of-Scope Use
56
+
57
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
58
+
59
+ [More Information Needed]
60
+
61
+ ## Bias, Risks, and Limitations
62
+
63
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
64
+
65
+ [More Information Needed]
66
+
67
+ ### Recommendations
68
+
69
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
70
+
71
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
72
+
73
+ ## How to Get Started with the Model
74
+
75
+ Use the code below to get started with the model.
76
+
77
+ [More Information Needed]
78
+
79
+ ## Training Details
80
+
81
+ ### Training Data
82
+
83
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
84
+
85
+ [More Information Needed]
86
+
87
+ ### Training Procedure
88
+
89
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
90
+
91
+ #### Preprocessing [optional]
92
+
93
+ [More Information Needed]
94
+
95
+
96
+ #### Training Hyperparameters
97
+
98
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
99
+
100
+ #### Speeds, Sizes, Times [optional]
101
+
102
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
103
+
104
+ [More Information Needed]
105
+
106
+ ## Evaluation
107
+
108
+ <!-- This section describes the evaluation protocols and provides the results. -->
109
+
110
+ ### Testing Data, Factors & Metrics
111
+
112
+ #### Testing Data
113
+
114
+ <!-- This should link to a Dataset Card if possible. -->
115
+
116
+ [More Information Needed]
117
+
118
+ #### Factors
119
+
120
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
121
+
122
+ [More Information Needed]
123
+
124
+ #### Metrics
125
+
126
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
127
+
128
+ [More Information Needed]
129
+
130
+ ### Results
131
+
132
+ [More Information Needed]
133
+
134
+ #### Summary
135
+
136
+
137
+
138
+ ## Model Examination [optional]
139
+
140
+ <!-- Relevant interpretability work for the model goes here -->
141
+
142
+ [More Information Needed]
143
+
144
+ ## Environmental Impact
145
+
146
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
147
+
148
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
149
+
150
+ - **Hardware Type:** [More Information Needed]
151
+ - **Hours used:** [More Information Needed]
152
+ - **Cloud Provider:** [More Information Needed]
153
+ - **Compute Region:** [More Information Needed]
154
+ - **Carbon Emitted:** [More Information Needed]
155
+
156
+ ## Technical Specifications [optional]
157
+
158
+ ### Model Architecture and Objective
159
+
160
+ [More Information Needed]
161
+
162
+ ### Compute Infrastructure
163
+
164
+ [More Information Needed]
165
+
166
+ #### Hardware
167
+
168
+ [More Information Needed]
169
+
170
+ #### Software
171
+
172
+ [More Information Needed]
173
+
174
+ ## Citation [optional]
175
+
176
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
177
+
178
+ **BibTeX:**
179
+
180
+ [More Information Needed]
181
+
182
+ **APA:**
183
+
184
+ [More Information Needed]
185
+
186
+ ## Glossary [optional]
187
+
188
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
189
+
190
+ [More Information Needed]
191
+
192
+ ## More Information [optional]
193
+
194
+ [More Information Needed]
195
+
196
+ ## Model Card Authors [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Contact
201
+
202
+ [More Information Needed]
203
+ ### Framework versions
204
+
205
+ - PEFT 0.18.0
reproduction/glue_exp/qnli/dr0.0,mlr1e-02,clr1e-02,ep=12.0t=20d22h35m39/ft2/adapter_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "apply_GS": false,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "microsoft/deberta-v3-base",
5
+ "bias": "none",
6
+ "exclude_modules": null,
7
+ "inference_mode": true,
8
+ "init_weights": true,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "modules_to_save": [
12
+ "classifier",
13
+ "pooler",
14
+ "classifier",
15
+ "score"
16
+ ],
17
+ "peft_type": "HRA",
18
+ "peft_version": "0.18.0",
19
+ "r": 8,
20
+ "revision": null,
21
+ "target_modules": [
22
+ "attention.output.dense",
23
+ "query_proj",
24
+ "key_proj",
25
+ "value_proj",
26
+ "intermediate.dense",
27
+ "output.dense"
28
+ ],
29
+ "task_type": "SEQ_CLS"
30
+ }
reproduction/glue_exp/qnli/dr0.0,mlr1e-02,clr1e-02,ep=12.0t=20d22h35m39/trainer_state.json ADDED
@@ -0,0 +1,1291 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 22500,
3
+ "best_metric": 0.942339373970346,
4
+ "best_model_checkpoint": "./glue_exp/qnli/dr0.0,mlr1e-02,clr1e-02,ep=12.0t=20d22h35m39/checkpoint-22500",
5
+ "epoch": 12.0,
6
+ "eval_steps": 500,
7
+ "global_step": 39288,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.15271838729383017,
14
+ "grad_norm": 1.5037044286727905,
15
+ "learning_rate": 0.009980000000000001,
16
+ "loss": 0.4534,
17
+ "step": 500
18
+ },
19
+ {
20
+ "epoch": 0.15271838729383017,
21
+ "eval_accuracy": 0.7565440234303497,
22
+ "eval_loss": 0.47564518451690674,
23
+ "eval_runtime": 10.841,
24
+ "eval_samples_per_second": 503.92,
25
+ "eval_steps_per_second": 0.553,
26
+ "step": 500
27
+ },
28
+ {
29
+ "epoch": 0.30543677458766033,
30
+ "grad_norm": 1.301108717918396,
31
+ "learning_rate": 0.009995914401570357,
32
+ "loss": 0.3969,
33
+ "step": 1000
34
+ },
35
+ {
36
+ "epoch": 0.30543677458766033,
37
+ "eval_accuracy": 0.8936481786564159,
38
+ "eval_loss": 0.2647385895252228,
39
+ "eval_runtime": 10.7003,
40
+ "eval_samples_per_second": 510.546,
41
+ "eval_steps_per_second": 0.561,
42
+ "step": 1000
43
+ },
44
+ {
45
+ "epoch": 0.4581551618814905,
46
+ "grad_norm": 1.488311767578125,
47
+ "learning_rate": 0.009983631547700822,
48
+ "loss": 0.3658,
49
+ "step": 1500
50
+ },
51
+ {
52
+ "epoch": 0.4581551618814905,
53
+ "eval_accuracy": 0.9055464030752334,
54
+ "eval_loss": 0.27435341477394104,
55
+ "eval_runtime": 10.6807,
56
+ "eval_samples_per_second": 511.481,
57
+ "eval_steps_per_second": 0.562,
58
+ "step": 1500
59
+ },
60
+ {
61
+ "epoch": 0.6108735491753207,
62
+ "grad_norm": 0.742245614528656,
63
+ "learning_rate": 0.009963171575627594,
64
+ "loss": 0.3348,
65
+ "step": 2000
66
+ },
67
+ {
68
+ "epoch": 0.6108735491753207,
69
+ "eval_accuracy": 0.9137836353651839,
70
+ "eval_loss": 0.2198512852191925,
71
+ "eval_runtime": 10.6447,
72
+ "eval_samples_per_second": 513.212,
73
+ "eval_steps_per_second": 0.564,
74
+ "step": 2000
75
+ },
76
+ {
77
+ "epoch": 0.7635919364691509,
78
+ "grad_norm": 0.7200786471366882,
79
+ "learning_rate": 0.009934568055972514,
80
+ "loss": 0.3216,
81
+ "step": 2500
82
+ },
83
+ {
84
+ "epoch": 0.7635919364691509,
85
+ "eval_accuracy": 0.8881566904631155,
86
+ "eval_loss": 0.25786492228507996,
87
+ "eval_runtime": 10.6621,
88
+ "eval_samples_per_second": 512.377,
89
+ "eval_steps_per_second": 0.563,
90
+ "step": 2500
91
+ },
92
+ {
93
+ "epoch": 0.916310323762981,
94
+ "grad_norm": 1.2832411527633667,
95
+ "learning_rate": 0.00989786792125036,
96
+ "loss": 0.296,
97
+ "step": 3000
98
+ },
99
+ {
100
+ "epoch": 0.916310323762981,
101
+ "eval_accuracy": 0.9161632802489474,
102
+ "eval_loss": 0.23129652440547943,
103
+ "eval_runtime": 10.6609,
104
+ "eval_samples_per_second": 512.432,
105
+ "eval_steps_per_second": 0.563,
106
+ "step": 3000
107
+ },
108
+ {
109
+ "epoch": 1.0690287110568113,
110
+ "grad_norm": 0.39881283044815063,
111
+ "learning_rate": 0.00985313138886221,
112
+ "loss": 0.2923,
113
+ "step": 3500
114
+ },
115
+ {
116
+ "epoch": 1.0690287110568113,
117
+ "eval_accuracy": 0.9082921471718836,
118
+ "eval_loss": 0.2395196557044983,
119
+ "eval_runtime": 10.6865,
120
+ "eval_samples_per_second": 511.205,
121
+ "eval_steps_per_second": 0.561,
122
+ "step": 3500
123
+ },
124
+ {
125
+ "epoch": 1.2217470983506413,
126
+ "grad_norm": 0.7452009916305542,
127
+ "learning_rate": 0.009800431862291011,
128
+ "loss": 0.2746,
129
+ "step": 4000
130
+ },
131
+ {
132
+ "epoch": 1.2217470983506413,
133
+ "eval_accuracy": 0.8844956983342486,
134
+ "eval_loss": 0.2552923262119293,
135
+ "eval_runtime": 10.6494,
136
+ "eval_samples_per_second": 512.988,
137
+ "eval_steps_per_second": 0.563,
138
+ "step": 4000
139
+ },
140
+ {
141
+ "epoch": 1.3744654856444716,
142
+ "grad_norm": 4.700128078460693,
143
+ "learning_rate": 0.009739855810661528,
144
+ "loss": 0.2829,
145
+ "step": 4500
146
+ },
147
+ {
148
+ "epoch": 1.3744654856444716,
149
+ "eval_accuracy": 0.8802855573860516,
150
+ "eval_loss": 0.3161483108997345,
151
+ "eval_runtime": 10.6701,
152
+ "eval_samples_per_second": 511.994,
153
+ "eval_steps_per_second": 0.562,
154
+ "step": 4500
155
+ },
156
+ {
157
+ "epoch": 1.5271838729383018,
158
+ "grad_norm": 0.5145873427391052,
159
+ "learning_rate": 0.009671502626862208,
160
+ "loss": 0.2666,
161
+ "step": 5000
162
+ },
163
+ {
164
+ "epoch": 1.5271838729383018,
165
+ "eval_accuracy": 0.9220208676551346,
166
+ "eval_loss": 0.19354809820652008,
167
+ "eval_runtime": 10.6591,
168
+ "eval_samples_per_second": 512.522,
169
+ "eval_steps_per_second": 0.563,
170
+ "step": 5000
171
+ },
172
+ {
173
+ "epoch": 1.679902260232132,
174
+ "grad_norm": 0.9298949241638184,
175
+ "learning_rate": 0.009595484464461823,
176
+ "loss": 0.2742,
177
+ "step": 5500
178
+ },
179
+ {
180
+ "epoch": 1.679902260232132,
181
+ "eval_accuracy": 0.919092073952041,
182
+ "eval_loss": 0.2683914303779602,
183
+ "eval_runtime": 10.6958,
184
+ "eval_samples_per_second": 510.76,
185
+ "eval_steps_per_second": 0.561,
186
+ "step": 5500
187
+ },
188
+ {
189
+ "epoch": 1.832620647525962,
190
+ "grad_norm": 1.6296656131744385,
191
+ "learning_rate": 0.009511926053688446,
192
+ "loss": 0.2815,
193
+ "step": 6000
194
+ },
195
+ {
196
+ "epoch": 1.832620647525962,
197
+ "eval_accuracy": 0.9176276771004942,
198
+ "eval_loss": 0.22844888269901276,
199
+ "eval_runtime": 10.6542,
200
+ "eval_samples_per_second": 512.758,
201
+ "eval_steps_per_second": 0.563,
202
+ "step": 6000
203
+ },
204
+ {
205
+ "epoch": 1.9853390348197923,
206
+ "grad_norm": 0.6419379115104675,
207
+ "learning_rate": 0.009420964496772703,
208
+ "loss": 0.2716,
209
+ "step": 6500
210
+ },
211
+ {
212
+ "epoch": 1.9853390348197923,
213
+ "eval_accuracy": 0.9209225700164745,
214
+ "eval_loss": 0.21448443830013275,
215
+ "eval_runtime": 10.6684,
216
+ "eval_samples_per_second": 512.075,
217
+ "eval_steps_per_second": 0.562,
218
+ "step": 6500
219
+ },
220
+ {
221
+ "epoch": 2.1380574221136226,
222
+ "grad_norm": 1.490606665611267,
223
+ "learning_rate": 0.009322749042991107,
224
+ "loss": 0.2386,
225
+ "step": 7000
226
+ },
227
+ {
228
+ "epoch": 2.1380574221136226,
229
+ "eval_accuracy": 0.8916346329855391,
230
+ "eval_loss": 0.28844568133354187,
231
+ "eval_runtime": 10.6717,
232
+ "eval_samples_per_second": 511.915,
233
+ "eval_steps_per_second": 0.562,
234
+ "step": 7000
235
+ },
236
+ {
237
+ "epoch": 2.2907758094074526,
238
+ "grad_norm": 2.767303705215454,
239
+ "learning_rate": 0.00921744084377857,
240
+ "loss": 0.241,
241
+ "step": 7500
242
+ },
243
+ {
244
+ "epoch": 2.2907758094074526,
245
+ "eval_accuracy": 0.9306241991579718,
246
+ "eval_loss": 0.19840888679027557,
247
+ "eval_runtime": 10.6553,
248
+ "eval_samples_per_second": 512.701,
249
+ "eval_steps_per_second": 0.563,
250
+ "step": 7500
251
+ },
252
+ {
253
+ "epoch": 2.4434941967012827,
254
+ "grad_norm": 1.0674463510513306,
255
+ "learning_rate": 0.00910521268831193,
256
+ "loss": 0.2458,
257
+ "step": 8000
258
+ },
259
+ {
260
+ "epoch": 2.4434941967012827,
261
+ "eval_accuracy": 0.9276954054548783,
262
+ "eval_loss": 0.2097393274307251,
263
+ "eval_runtime": 10.6626,
264
+ "eval_samples_per_second": 512.35,
265
+ "eval_steps_per_second": 0.563,
266
+ "step": 8000
267
+ },
268
+ {
269
+ "epoch": 2.596212583995113,
270
+ "grad_norm": 0.6073241829872131,
271
+ "learning_rate": 0.008986248719998306,
272
+ "loss": 0.2404,
273
+ "step": 8500
274
+ },
275
+ {
276
+ "epoch": 2.596212583995113,
277
+ "eval_accuracy": 0.914881933003844,
278
+ "eval_loss": 0.2356455773115158,
279
+ "eval_runtime": 10.6615,
280
+ "eval_samples_per_second": 512.402,
281
+ "eval_steps_per_second": 0.563,
282
+ "step": 8500
283
+ },
284
+ {
285
+ "epoch": 2.748930971288943,
286
+ "grad_norm": 0.36522233486175537,
287
+ "learning_rate": 0.008860744134333512,
288
+ "loss": 0.2528,
289
+ "step": 9000
290
+ },
291
+ {
292
+ "epoch": 2.748930971288943,
293
+ "eval_accuracy": 0.9315394471901886,
294
+ "eval_loss": 0.17801421880722046,
295
+ "eval_runtime": 10.6373,
296
+ "eval_samples_per_second": 513.572,
297
+ "eval_steps_per_second": 0.564,
298
+ "step": 9000
299
+ },
300
+ {
301
+ "epoch": 2.901649358582773,
302
+ "grad_norm": 0.3105764091014862,
303
+ "learning_rate": 0.008728904858626225,
304
+ "loss": 0.2493,
305
+ "step": 9500
306
+ },
307
+ {
308
+ "epoch": 2.901649358582773,
309
+ "eval_accuracy": 0.9265971078162182,
310
+ "eval_loss": 0.19014635682106018,
311
+ "eval_runtime": 10.6935,
312
+ "eval_samples_per_second": 510.872,
313
+ "eval_steps_per_second": 0.561,
314
+ "step": 9500
315
+ },
316
+ {
317
+ "epoch": 3.0543677458766036,
318
+ "grad_norm": 0.7906205058097839,
319
+ "learning_rate": 0.008590947214113487,
320
+ "loss": 0.2601,
321
+ "step": 10000
322
+ },
323
+ {
324
+ "epoch": 3.0543677458766036,
325
+ "eval_accuracy": 0.924400512538898,
326
+ "eval_loss": 0.20123699307441711,
327
+ "eval_runtime": 10.6623,
328
+ "eval_samples_per_second": 512.368,
329
+ "eval_steps_per_second": 0.563,
330
+ "step": 10000
331
+ },
332
+ {
333
+ "epoch": 3.2070861331704337,
334
+ "grad_norm": 0.40712904930114746,
335
+ "learning_rate": 0.00844709756102187,
336
+ "loss": 0.2952,
337
+ "step": 10500
338
+ },
339
+ {
340
+ "epoch": 3.2070861331704337,
341
+ "eval_accuracy": 0.933369943254622,
342
+ "eval_loss": 0.1751944124698639,
343
+ "eval_runtime": 10.6795,
344
+ "eval_samples_per_second": 511.542,
345
+ "eval_steps_per_second": 0.562,
346
+ "step": 10500
347
+ },
348
+ {
349
+ "epoch": 3.359804520464264,
350
+ "grad_norm": 1.2962415218353271,
351
+ "learning_rate": 0.008297591927156726,
352
+ "loss": 0.2187,
353
+ "step": 11000
354
+ },
355
+ {
356
+ "epoch": 3.359804520464264,
357
+ "eval_accuracy": 0.9264140582097748,
358
+ "eval_loss": 0.20032303035259247,
359
+ "eval_runtime": 10.6692,
360
+ "eval_samples_per_second": 512.033,
361
+ "eval_steps_per_second": 0.562,
362
+ "step": 11000
363
+ },
364
+ {
365
+ "epoch": 3.512522907758094,
366
+ "grad_norm": 0.606886088848114,
367
+ "learning_rate": 0.008142675620628925,
368
+ "loss": 0.2084,
369
+ "step": 11500
370
+ },
371
+ {
372
+ "epoch": 3.512522907758094,
373
+ "eval_accuracy": 0.9254988101775581,
374
+ "eval_loss": 0.17964500188827515,
375
+ "eval_runtime": 10.66,
376
+ "eval_samples_per_second": 512.476,
377
+ "eval_steps_per_second": 0.563,
378
+ "step": 11500
379
+ },
380
+ {
381
+ "epoch": 3.665241295051924,
382
+ "grad_norm": 0.4789172410964966,
383
+ "learning_rate": 0.007982602827354504,
384
+ "loss": 0.2421,
385
+ "step": 12000
386
+ },
387
+ {
388
+ "epoch": 3.665241295051924,
389
+ "eval_accuracy": 0.9280615046677649,
390
+ "eval_loss": 0.1904270052909851,
391
+ "eval_runtime": 10.6645,
392
+ "eval_samples_per_second": 512.261,
393
+ "eval_steps_per_second": 0.563,
394
+ "step": 12000
395
+ },
396
+ {
397
+ "epoch": 3.8179596823457542,
398
+ "grad_norm": 0.5354307293891907,
399
+ "learning_rate": 0.007817636193987672,
400
+ "loss": 0.2239,
401
+ "step": 12500
402
+ },
403
+ {
404
+ "epoch": 3.8179596823457542,
405
+ "eval_accuracy": 0.9225700164744646,
406
+ "eval_loss": 0.17951762676239014,
407
+ "eval_runtime": 10.6419,
408
+ "eval_samples_per_second": 513.347,
409
+ "eval_steps_per_second": 0.564,
410
+ "step": 12500
411
+ },
412
+ {
413
+ "epoch": 3.9706780696395847,
414
+ "grad_norm": 0.33672019839286804,
415
+ "learning_rate": 0.007648046396971458,
416
+ "loss": 0.2241,
417
+ "step": 13000
418
+ },
419
+ {
420
+ "epoch": 3.9706780696395847,
421
+ "eval_accuracy": 0.9335529928610653,
422
+ "eval_loss": 0.17079654335975647,
423
+ "eval_runtime": 10.6698,
424
+ "eval_samples_per_second": 512.008,
425
+ "eval_steps_per_second": 0.562,
426
+ "step": 13000
427
+ },
428
+ {
429
+ "epoch": 4.123396456933415,
430
+ "grad_norm": 1.8505825996398926,
431
+ "learning_rate": 0.0074741116984131375,
432
+ "loss": 0.1942,
433
+ "step": 13500
434
+ },
435
+ {
436
+ "epoch": 4.123396456933415,
437
+ "eval_accuracy": 0.932820794435292,
438
+ "eval_loss": 0.17642812430858612,
439
+ "eval_runtime": 11.649,
440
+ "eval_samples_per_second": 468.969,
441
+ "eval_steps_per_second": 0.515,
442
+ "step": 13500
443
+ },
444
+ {
445
+ "epoch": 4.276114844227245,
446
+ "grad_norm": 1.2131459712982178,
447
+ "learning_rate": 0.007296117489513126,
448
+ "loss": 0.1926,
449
+ "step": 14000
450
+ },
451
+ {
452
+ "epoch": 4.276114844227245,
453
+ "eval_accuracy": 0.9267801574226615,
454
+ "eval_loss": 0.17633135616779327,
455
+ "eval_runtime": 10.6901,
456
+ "eval_samples_per_second": 511.034,
457
+ "eval_steps_per_second": 0.561,
458
+ "step": 14000
459
+ },
460
+ {
461
+ "epoch": 4.428833231521075,
462
+ "grad_norm": 0.5870394110679626,
463
+ "learning_rate": 0.007114355822296504,
464
+ "loss": 0.2021,
465
+ "step": 14500
466
+ },
467
+ {
468
+ "epoch": 4.428833231521075,
469
+ "eval_accuracy": 0.9368478857770456,
470
+ "eval_loss": 0.1699468046426773,
471
+ "eval_runtime": 12.8884,
472
+ "eval_samples_per_second": 423.868,
473
+ "eval_steps_per_second": 0.466,
474
+ "step": 14500
475
+ },
476
+ {
477
+ "epoch": 4.581551618814905,
478
+ "grad_norm": 1.3461284637451172,
479
+ "learning_rate": 0.006929124930415475,
480
+ "loss": 0.1951,
481
+ "step": 15000
482
+ },
483
+ {
484
+ "epoch": 4.581551618814905,
485
+ "eval_accuracy": 0.9416071755445726,
486
+ "eval_loss": 0.1666867583990097,
487
+ "eval_runtime": 10.6488,
488
+ "eval_samples_per_second": 513.017,
489
+ "eval_steps_per_second": 0.563,
490
+ "step": 15000
491
+ },
492
+ {
493
+ "epoch": 4.734270006108735,
494
+ "grad_norm": 0.5517491102218628,
495
+ "learning_rate": 0.006740728739809068,
496
+ "loss": 0.2014,
497
+ "step": 15500
498
+ },
499
+ {
500
+ "epoch": 4.734270006108735,
501
+ "eval_accuracy": 0.9339190920739521,
502
+ "eval_loss": 0.1630578488111496,
503
+ "eval_runtime": 12.8458,
504
+ "eval_samples_per_second": 425.275,
505
+ "eval_steps_per_second": 0.467,
506
+ "step": 15500
507
+ },
508
+ {
509
+ "epoch": 4.886988393402565,
510
+ "grad_norm": 1.1786259412765503,
511
+ "learning_rate": 0.006549476370022931,
512
+ "loss": 0.2076,
513
+ "step": 16000
514
+ },
515
+ {
516
+ "epoch": 4.886988393402565,
517
+ "eval_accuracy": 0.9353834889254988,
518
+ "eval_loss": 0.1612890213727951,
519
+ "eval_runtime": 10.6764,
520
+ "eval_samples_per_second": 511.687,
521
+ "eval_steps_per_second": 0.562,
522
+ "step": 16000
523
+ },
524
+ {
525
+ "epoch": 5.039706780696396,
526
+ "grad_norm": 0.5138195753097534,
527
+ "learning_rate": 0.006355681627007508,
528
+ "loss": 0.1913,
529
+ "step": 16500
530
+ },
531
+ {
532
+ "epoch": 5.039706780696396,
533
+ "eval_accuracy": 0.929159802306425,
534
+ "eval_loss": 0.19614355266094208,
535
+ "eval_runtime": 11.8468,
536
+ "eval_samples_per_second": 461.137,
537
+ "eval_steps_per_second": 0.506,
538
+ "step": 16500
539
+ },
540
+ {
541
+ "epoch": 5.192425167990226,
542
+ "grad_norm": 0.2939035892486572,
543
+ "learning_rate": 0.006159662488226765,
544
+ "loss": 0.1691,
545
+ "step": 17000
546
+ },
547
+ {
548
+ "epoch": 5.192425167990226,
549
+ "eval_accuracy": 0.9331868936481786,
550
+ "eval_loss": 0.17631573975086212,
551
+ "eval_runtime": 10.6578,
552
+ "eval_samples_per_second": 512.581,
553
+ "eval_steps_per_second": 0.563,
554
+ "step": 17000
555
+ },
556
+ {
557
+ "epoch": 5.345143555284056,
558
+ "grad_norm": 0.5290659070014954,
559
+ "learning_rate": 0.005961740580922335,
560
+ "loss": 0.1783,
561
+ "step": 17500
562
+ },
563
+ {
564
+ "epoch": 5.345143555284056,
565
+ "eval_accuracy": 0.9341021416803954,
566
+ "eval_loss": 0.18246591091156006,
567
+ "eval_runtime": 10.6395,
568
+ "eval_samples_per_second": 513.465,
569
+ "eval_steps_per_second": 0.564,
570
+ "step": 17500
571
+ },
572
+ {
573
+ "epoch": 5.497861942577886,
574
+ "grad_norm": 0.19821959733963013,
575
+ "learning_rate": 0.005762240654389111,
576
+ "loss": 0.1725,
577
+ "step": 18000
578
+ },
579
+ {
580
+ "epoch": 5.497861942577886,
581
+ "eval_accuracy": 0.9284276038806517,
582
+ "eval_loss": 0.18397371470928192,
583
+ "eval_runtime": 10.6953,
584
+ "eval_samples_per_second": 510.786,
585
+ "eval_steps_per_second": 0.561,
586
+ "step": 18000
587
+ },
588
+ {
589
+ "epoch": 5.650580329871716,
590
+ "grad_norm": 0.5750489234924316,
591
+ "learning_rate": 0.005561490047128203,
592
+ "loss": 0.1782,
593
+ "step": 18500
594
+ },
595
+ {
596
+ "epoch": 5.650580329871716,
597
+ "eval_accuracy": 0.9362987369577156,
598
+ "eval_loss": 0.1769886612892151,
599
+ "eval_runtime": 10.6828,
600
+ "eval_samples_per_second": 511.381,
601
+ "eval_steps_per_second": 0.562,
602
+ "step": 18500
603
+ },
604
+ {
605
+ "epoch": 5.803298717165546,
606
+ "grad_norm": 1.0194318294525146,
607
+ "learning_rate": 0.0053598181497515164,
608
+ "loss": 0.1864,
609
+ "step": 19000
610
+ },
611
+ {
612
+ "epoch": 5.803298717165546,
613
+ "eval_accuracy": 0.9359326377448288,
614
+ "eval_loss": 0.16293832659721375,
615
+ "eval_runtime": 10.6821,
616
+ "eval_samples_per_second": 511.415,
617
+ "eval_steps_per_second": 0.562,
618
+ "step": 19000
619
+ },
620
+ {
621
+ "epoch": 5.956017104459377,
622
+ "grad_norm": 1.04185950756073,
623
+ "learning_rate": 0.00515755586451927,
624
+ "loss": 0.1873,
625
+ "step": 19500
626
+ },
627
+ {
628
+ "epoch": 5.956017104459377,
629
+ "eval_accuracy": 0.938678381841479,
630
+ "eval_loss": 0.16236957907676697,
631
+ "eval_runtime": 10.667,
632
+ "eval_samples_per_second": 512.142,
633
+ "eval_steps_per_second": 0.562,
634
+ "step": 19500
635
+ },
636
+ {
637
+ "epoch": 6.108735491753207,
638
+ "grad_norm": 0.5527487397193909,
639
+ "learning_rate": 0.004955035062397176,
640
+ "loss": 0.1481,
641
+ "step": 20000
642
+ },
643
+ {
644
+ "epoch": 6.108735491753207,
645
+ "eval_accuracy": 0.9339190920739521,
646
+ "eval_loss": 0.1655975580215454,
647
+ "eval_runtime": 10.6678,
648
+ "eval_samples_per_second": 512.102,
649
+ "eval_steps_per_second": 0.562,
650
+ "step": 20000
651
+ },
652
+ {
653
+ "epoch": 6.261453879047037,
654
+ "grad_norm": 0.2231854945421219,
655
+ "learning_rate": 0.004752588038524194,
656
+ "loss": 0.1435,
657
+ "step": 20500
658
+ },
659
+ {
660
+ "epoch": 6.261453879047037,
661
+ "eval_accuracy": 0.9352004393190555,
662
+ "eval_loss": 0.16785979270935059,
663
+ "eval_runtime": 10.6639,
664
+ "eval_samples_per_second": 512.291,
665
+ "eval_steps_per_second": 0.563,
666
+ "step": 20500
667
+ },
668
+ {
669
+ "epoch": 6.414172266340867,
670
+ "grad_norm": 0.6651498079299927,
671
+ "learning_rate": 0.004550546966984289,
672
+ "loss": 0.1466,
673
+ "step": 21000
674
+ },
675
+ {
676
+ "epoch": 6.414172266340867,
677
+ "eval_accuracy": 0.9350173897126122,
678
+ "eval_loss": 0.20046856999397278,
679
+ "eval_runtime": 10.647,
680
+ "eval_samples_per_second": 513.1,
681
+ "eval_steps_per_second": 0.564,
682
+ "step": 21000
683
+ },
684
+ {
685
+ "epoch": 6.566890653634697,
686
+ "grad_norm": 0.2773177921772003,
687
+ "learning_rate": 0.004349243355776835,
688
+ "loss": 0.1503,
689
+ "step": 21500
690
+ },
691
+ {
692
+ "epoch": 6.566890653634697,
693
+ "eval_accuracy": 0.9408749771187992,
694
+ "eval_loss": 0.18849799036979675,
695
+ "eval_runtime": 10.6557,
696
+ "eval_samples_per_second": 512.683,
697
+ "eval_steps_per_second": 0.563,
698
+ "step": 21500
699
+ },
700
+ {
701
+ "epoch": 6.719609040928528,
702
+ "grad_norm": 0.3191971778869629,
703
+ "learning_rate": 0.004149007502879905,
704
+ "loss": 0.152,
705
+ "step": 22000
706
+ },
707
+ {
708
+ "epoch": 6.719609040928528,
709
+ "eval_accuracy": 0.9390444810543658,
710
+ "eval_loss": 0.20619872212409973,
711
+ "eval_runtime": 10.6681,
712
+ "eval_samples_per_second": 512.088,
713
+ "eval_steps_per_second": 0.562,
714
+ "step": 22000
715
+ },
716
+ {
717
+ "epoch": 6.872327428222358,
718
+ "grad_norm": 0.3263784646987915,
719
+ "learning_rate": 0.003950167954298976,
720
+ "loss": 0.1509,
721
+ "step": 22500
722
+ },
723
+ {
724
+ "epoch": 6.872327428222358,
725
+ "eval_accuracy": 0.942339373970346,
726
+ "eval_loss": 0.16657477617263794,
727
+ "eval_runtime": 10.6628,
728
+ "eval_samples_per_second": 512.344,
729
+ "eval_steps_per_second": 0.563,
730
+ "step": 22500
731
+ },
732
+ {
733
+ "epoch": 7.025045815516188,
734
+ "grad_norm": 0.6678400039672852,
735
+ "learning_rate": 0.0037530509649902465,
736
+ "loss": 0.1422,
737
+ "step": 23000
738
+ },
739
+ {
740
+ "epoch": 7.025045815516188,
741
+ "eval_accuracy": 0.9383122826285923,
742
+ "eval_loss": 0.2081904113292694,
743
+ "eval_runtime": 10.6962,
744
+ "eval_samples_per_second": 510.74,
745
+ "eval_steps_per_second": 0.561,
746
+ "step": 23000
747
+ },
748
+ {
749
+ "epoch": 7.177764202810018,
750
+ "grad_norm": 0.31175652146339417,
751
+ "learning_rate": 0.003557979963543113,
752
+ "loss": 0.1175,
753
+ "step": 23500
754
+ },
755
+ {
756
+ "epoch": 7.177764202810018,
757
+ "eval_accuracy": 0.9352004393190555,
758
+ "eval_loss": 0.20077426731586456,
759
+ "eval_runtime": 10.6573,
760
+ "eval_samples_per_second": 512.605,
761
+ "eval_steps_per_second": 0.563,
762
+ "step": 23500
763
+ },
764
+ {
765
+ "epoch": 7.330482590103848,
766
+ "grad_norm": 1.3799458742141724,
767
+ "learning_rate": 0.003365275021500116,
768
+ "loss": 0.1192,
769
+ "step": 24000
770
+ },
771
+ {
772
+ "epoch": 7.330482590103848,
773
+ "eval_accuracy": 0.9372139849899322,
774
+ "eval_loss": 0.20808292925357819,
775
+ "eval_runtime": 10.6649,
776
+ "eval_samples_per_second": 512.239,
777
+ "eval_steps_per_second": 0.563,
778
+ "step": 24000
779
+ },
780
+ {
781
+ "epoch": 7.483200977397678,
782
+ "grad_norm": 1.1041522026062012,
783
+ "learning_rate": 0.0031752523281851387,
784
+ "loss": 0.1183,
785
+ "step": 24500
786
+ },
787
+ {
788
+ "epoch": 7.483200977397678,
789
+ "eval_accuracy": 0.9394105802672524,
790
+ "eval_loss": 0.20463068783283234,
791
+ "eval_runtime": 10.65,
792
+ "eval_samples_per_second": 512.958,
793
+ "eval_steps_per_second": 0.563,
794
+ "step": 24500
795
+ },
796
+ {
797
+ "epoch": 7.6359193646915084,
798
+ "grad_norm": 0.9938382506370544,
799
+ "learning_rate": 0.0029882236719014944,
800
+ "loss": 0.1232,
801
+ "step": 25000
802
+ },
803
+ {
804
+ "epoch": 7.6359193646915084,
805
+ "eval_accuracy": 0.9388614314479223,
806
+ "eval_loss": 0.202561154961586,
807
+ "eval_runtime": 10.6361,
808
+ "eval_samples_per_second": 513.627,
809
+ "eval_steps_per_second": 0.564,
810
+ "step": 25000
811
+ },
812
+ {
813
+ "epoch": 7.788637751985339,
814
+ "grad_norm": 0.23097889125347137,
815
+ "learning_rate": 0.0028044959283512255,
816
+ "loss": 0.1258,
817
+ "step": 25500
818
+ },
819
+ {
820
+ "epoch": 7.788637751985339,
821
+ "eval_accuracy": 0.9370309353834889,
822
+ "eval_loss": 0.20270980894565582,
823
+ "eval_runtime": 10.6659,
824
+ "eval_samples_per_second": 512.192,
825
+ "eval_steps_per_second": 0.563,
826
+ "step": 25500
827
+ },
828
+ {
829
+ "epoch": 7.941356139279169,
830
+ "grad_norm": 0.2913364768028259,
831
+ "learning_rate": 0.0026243705571149458,
832
+ "loss": 0.1204,
833
+ "step": 26000
834
+ },
835
+ {
836
+ "epoch": 7.941356139279169,
837
+ "eval_accuracy": 0.9397766794801391,
838
+ "eval_loss": 0.17229758203029633,
839
+ "eval_runtime": 10.6644,
840
+ "eval_samples_per_second": 512.266,
841
+ "eval_steps_per_second": 0.563,
842
+ "step": 26000
843
+ },
844
+ {
845
+ "epoch": 8.094074526573,
846
+ "grad_norm": 0.3294864892959595,
847
+ "learning_rate": 0.002448143107018443,
848
+ "loss": 0.0987,
849
+ "step": 26500
850
+ },
851
+ {
852
+ "epoch": 8.094074526573,
853
+ "eval_accuracy": 0.9377631338092624,
854
+ "eval_loss": 0.21938388049602509,
855
+ "eval_runtime": 10.6536,
856
+ "eval_samples_per_second": 512.786,
857
+ "eval_steps_per_second": 0.563,
858
+ "step": 26500
859
+ },
860
+ {
861
+ "epoch": 8.24679291386683,
862
+ "grad_norm": 1.273834228515625,
863
+ "learning_rate": 0.0022761027311976175,
864
+ "loss": 0.0878,
865
+ "step": 27000
866
+ },
867
+ {
868
+ "epoch": 8.24679291386683,
869
+ "eval_accuracy": 0.9392275306608091,
870
+ "eval_loss": 0.1930987387895584,
871
+ "eval_runtime": 10.6614,
872
+ "eval_samples_per_second": 512.412,
873
+ "eval_steps_per_second": 0.563,
874
+ "step": 27000
875
+ },
876
+ {
877
+ "epoch": 8.39951130116066,
878
+ "grad_norm": 0.23500996828079224,
879
+ "learning_rate": 0.0021085317126574217,
880
+ "loss": 0.096,
881
+ "step": 27500
882
+ },
883
+ {
884
+ "epoch": 8.39951130116066,
885
+ "eval_accuracy": 0.9366648361706023,
886
+ "eval_loss": 0.2221830040216446,
887
+ "eval_runtime": 10.6715,
888
+ "eval_samples_per_second": 511.925,
889
+ "eval_steps_per_second": 0.562,
890
+ "step": 27500
891
+ },
892
+ {
893
+ "epoch": 8.55222968845449,
894
+ "grad_norm": 1.5383870601654053,
895
+ "learning_rate": 0.001945705001103315,
896
+ "loss": 0.0968,
897
+ "step": 28000
898
+ },
899
+ {
900
+ "epoch": 8.55222968845449,
901
+ "eval_accuracy": 0.9405088779059125,
902
+ "eval_loss": 0.19442974030971527,
903
+ "eval_runtime": 10.66,
904
+ "eval_samples_per_second": 512.479,
905
+ "eval_steps_per_second": 0.563,
906
+ "step": 28000
907
+ },
908
+ {
909
+ "epoch": 8.70494807574832,
910
+ "grad_norm": 0.3537600636482239,
911
+ "learning_rate": 0.001787889761805106,
912
+ "loss": 0.0963,
913
+ "step": 28500
914
+ },
915
+ {
916
+ "epoch": 8.70494807574832,
917
+ "eval_accuracy": 0.9394105802672524,
918
+ "eval_loss": 0.2216956913471222,
919
+ "eval_runtime": 10.6564,
920
+ "eval_samples_per_second": 512.651,
921
+ "eval_steps_per_second": 0.563,
922
+ "step": 28500
923
+ },
924
+ {
925
+ "epoch": 8.85766646304215,
926
+ "grad_norm": 0.2855343818664551,
927
+ "learning_rate": 0.0016353449372335095,
928
+ "loss": 0.0895,
929
+ "step": 29000
930
+ },
931
+ {
932
+ "epoch": 8.85766646304215,
933
+ "eval_accuracy": 0.9410580267252425,
934
+ "eval_loss": 0.2340826839208603,
935
+ "eval_runtime": 10.655,
936
+ "eval_samples_per_second": 512.716,
937
+ "eval_steps_per_second": 0.563,
938
+ "step": 29000
939
+ },
940
+ {
941
+ "epoch": 9.01038485033598,
942
+ "grad_norm": 0.24387948215007782,
943
+ "learning_rate": 0.0014883208221886013,
944
+ "loss": 0.0912,
945
+ "step": 29500
946
+ },
947
+ {
948
+ "epoch": 9.01038485033598,
949
+ "eval_accuracy": 0.9359326377448288,
950
+ "eval_loss": 0.22634004056453705,
951
+ "eval_runtime": 10.6493,
952
+ "eval_samples_per_second": 512.994,
953
+ "eval_steps_per_second": 0.563,
954
+ "step": 29500
955
+ },
956
+ {
957
+ "epoch": 9.16310323762981,
958
+ "grad_norm": 0.1418183296918869,
959
+ "learning_rate": 0.0013470586531173394,
960
+ "loss": 0.064,
961
+ "step": 30000
962
+ },
963
+ {
964
+ "epoch": 9.16310323762981,
965
+ "eval_accuracy": 0.9390444810543658,
966
+ "eval_loss": 0.2190869152545929,
967
+ "eval_runtime": 10.662,
968
+ "eval_samples_per_second": 512.38,
969
+ "eval_steps_per_second": 0.563,
970
+ "step": 30000
971
+ },
972
+ {
973
+ "epoch": 9.315821624923641,
974
+ "grad_norm": 0.16249431669712067,
975
+ "learning_rate": 0.0012117902122939861,
976
+ "loss": 0.0635,
977
+ "step": 30500
978
+ },
979
+ {
980
+ "epoch": 9.315821624923641,
981
+ "eval_accuracy": 0.9357495881383855,
982
+ "eval_loss": 0.24300245940685272,
983
+ "eval_runtime": 10.6649,
984
+ "eval_samples_per_second": 512.24,
985
+ "eval_steps_per_second": 0.563,
986
+ "step": 30500
987
+ },
988
+ {
989
+ "epoch": 9.46854001221747,
990
+ "grad_norm": 0.40274578332901,
991
+ "learning_rate": 0.0010827374475128864,
992
+ "loss": 0.0697,
993
+ "step": 31000
994
+ },
995
+ {
996
+ "epoch": 9.46854001221747,
997
+ "eval_accuracy": 0.9372139849899322,
998
+ "eval_loss": 0.19327227771282196,
999
+ "eval_runtime": 10.6762,
1000
+ "eval_samples_per_second": 511.697,
1001
+ "eval_steps_per_second": 0.562,
1002
+ "step": 31000
1003
+ },
1004
+ {
1005
+ "epoch": 9.621258399511301,
1006
+ "grad_norm": 0.7273208498954773,
1007
+ "learning_rate": 0.0009601121079176139,
1008
+ "loss": 0.0736,
1009
+ "step": 31500
1010
+ },
1011
+ {
1012
+ "epoch": 9.621258399511301,
1013
+ "eval_accuracy": 0.9406919275123559,
1014
+ "eval_loss": 0.22759296000003815,
1015
+ "eval_runtime": 10.6452,
1016
+ "eval_samples_per_second": 513.188,
1017
+ "eval_steps_per_second": 0.564,
1018
+ "step": 31500
1019
+ },
1020
+ {
1021
+ "epoch": 9.77397678680513,
1022
+ "grad_norm": 0.26268601417541504,
1023
+ "learning_rate": 0.0008441153965640264,
1024
+ "loss": 0.069,
1025
+ "step": 32000
1026
+ },
1027
+ {
1028
+ "epoch": 9.77397678680513,
1029
+ "eval_accuracy": 0.941790225151016,
1030
+ "eval_loss": 0.18988043069839478,
1031
+ "eval_runtime": 10.6469,
1032
+ "eval_samples_per_second": 513.109,
1033
+ "eval_steps_per_second": 0.564,
1034
+ "step": 32000
1035
+ },
1036
+ {
1037
+ "epoch": 9.926695174098962,
1038
+ "grad_norm": 0.23675082623958588,
1039
+ "learning_rate": 0.0007349376402872593,
1040
+ "loss": 0.0679,
1041
+ "step": 32500
1042
+ },
1043
+ {
1044
+ "epoch": 9.926695174098962,
1045
+ "eval_accuracy": 0.9414241259381292,
1046
+ "eval_loss": 0.19748558104038239,
1047
+ "eval_runtime": 10.6699,
1048
+ "eval_samples_per_second": 512.002,
1049
+ "eval_steps_per_second": 0.562,
1050
+ "step": 32500
1051
+ },
1052
+ {
1053
+ "epoch": 10.079413561392792,
1054
+ "grad_norm": 0.19973616302013397,
1055
+ "learning_rate": 0.0006327579774144044,
1056
+ "loss": 0.0537,
1057
+ "step": 33000
1058
+ },
1059
+ {
1060
+ "epoch": 10.079413561392792,
1061
+ "eval_accuracy": 0.942339373970346,
1062
+ "eval_loss": 0.27014923095703125,
1063
+ "eval_runtime": 10.663,
1064
+ "eval_samples_per_second": 512.332,
1065
+ "eval_steps_per_second": 0.563,
1066
+ "step": 33000
1067
+ },
1068
+ {
1069
+ "epoch": 10.232131948686622,
1070
+ "grad_norm": 0.16148479282855988,
1071
+ "learning_rate": 0.0005377440638352149,
1072
+ "loss": 0.0565,
1073
+ "step": 33500
1074
+ },
1075
+ {
1076
+ "epoch": 10.232131948686622,
1077
+ "eval_accuracy": 0.9410580267252425,
1078
+ "eval_loss": 0.2295636087656021,
1079
+ "eval_runtime": 10.6399,
1080
+ "eval_samples_per_second": 513.444,
1081
+ "eval_steps_per_second": 0.564,
1082
+ "step": 33500
1083
+ },
1084
+ {
1085
+ "epoch": 10.384850335980452,
1086
+ "grad_norm": 0.15913018584251404,
1087
+ "learning_rate": 0.00045005179791313913,
1088
+ "loss": 0.0476,
1089
+ "step": 34000
1090
+ },
1091
+ {
1092
+ "epoch": 10.384850335980452,
1093
+ "eval_accuracy": 0.9408749771187992,
1094
+ "eval_loss": 0.23820237815380096,
1095
+ "eval_runtime": 10.646,
1096
+ "eval_samples_per_second": 513.152,
1097
+ "eval_steps_per_second": 0.564,
1098
+ "step": 34000
1099
+ },
1100
+ {
1101
+ "epoch": 10.537568723274282,
1102
+ "grad_norm": 0.36721134185791016,
1103
+ "learning_rate": 0.0003698250646880624,
1104
+ "loss": 0.053,
1105
+ "step": 34500
1106
+ },
1107
+ {
1108
+ "epoch": 10.537568723274282,
1109
+ "eval_accuracy": 0.9399597290865824,
1110
+ "eval_loss": 0.2536531984806061,
1111
+ "eval_runtime": 10.6385,
1112
+ "eval_samples_per_second": 513.513,
1113
+ "eval_steps_per_second": 0.564,
1114
+ "step": 34500
1115
+ },
1116
+ {
1117
+ "epoch": 10.690287110568113,
1118
+ "grad_norm": 0.371605783700943,
1119
+ "learning_rate": 0.00029719549979042415,
1120
+ "loss": 0.0476,
1121
+ "step": 35000
1122
+ },
1123
+ {
1124
+ "epoch": 10.690287110568113,
1125
+ "eval_accuracy": 0.9403258282994692,
1126
+ "eval_loss": 0.2486487627029419,
1127
+ "eval_runtime": 10.6558,
1128
+ "eval_samples_per_second": 512.678,
1129
+ "eval_steps_per_second": 0.563,
1130
+ "step": 35000
1131
+ },
1132
+ {
1133
+ "epoch": 10.843005497861942,
1134
+ "grad_norm": 0.08992467820644379,
1135
+ "learning_rate": 0.00023228227345413466,
1136
+ "loss": 0.0465,
1137
+ "step": 35500
1138
+ },
1139
+ {
1140
+ "epoch": 10.843005497861942,
1141
+ "eval_accuracy": 0.9406919275123559,
1142
+ "eval_loss": 0.26264604926109314,
1143
+ "eval_runtime": 10.658,
1144
+ "eval_samples_per_second": 512.573,
1145
+ "eval_steps_per_second": 0.563,
1146
+ "step": 35500
1147
+ },
1148
+ {
1149
+ "epoch": 10.995723885155773,
1150
+ "grad_norm": 0.2935340702533722,
1151
+ "learning_rate": 0.0001751918949826309,
1152
+ "loss": 0.0506,
1153
+ "step": 36000
1154
+ },
1155
+ {
1156
+ "epoch": 10.995723885155773,
1157
+ "eval_accuracy": 0.9414241259381292,
1158
+ "eval_loss": 0.24543774127960205,
1159
+ "eval_runtime": 10.6849,
1160
+ "eval_samples_per_second": 511.283,
1161
+ "eval_steps_per_second": 0.562,
1162
+ "step": 36000
1163
+ },
1164
+ {
1165
+ "epoch": 11.148442272449604,
1166
+ "grad_norm": 0.09359394758939743,
1167
+ "learning_rate": 0.00012601803798893642,
1168
+ "loss": 0.0455,
1169
+ "step": 36500
1170
+ },
1171
+ {
1172
+ "epoch": 11.148442272449604,
1173
+ "eval_accuracy": 0.9412410763316859,
1174
+ "eval_loss": 0.2553355097770691,
1175
+ "eval_runtime": 10.6713,
1176
+ "eval_samples_per_second": 511.936,
1177
+ "eval_steps_per_second": 0.562,
1178
+ "step": 36500
1179
+ },
1180
+ {
1181
+ "epoch": 11.301160659743433,
1182
+ "grad_norm": 0.5179402232170105,
1183
+ "learning_rate": 8.484138669645869e-05,
1184
+ "loss": 0.0431,
1185
+ "step": 37000
1186
+ },
1187
+ {
1188
+ "epoch": 11.301160659743433,
1189
+ "eval_accuracy": 0.9408749771187992,
1190
+ "eval_loss": 0.2569965124130249,
1191
+ "eval_runtime": 10.6427,
1192
+ "eval_samples_per_second": 513.309,
1193
+ "eval_steps_per_second": 0.564,
1194
+ "step": 37000
1195
+ },
1196
+ {
1197
+ "epoch": 11.453879047037264,
1198
+ "grad_norm": 0.0961860865354538,
1199
+ "learning_rate": 5.172950355270711e-05,
1200
+ "loss": 0.043,
1201
+ "step": 37500
1202
+ },
1203
+ {
1204
+ "epoch": 11.453879047037264,
1205
+ "eval_accuracy": 0.9403258282994692,
1206
+ "eval_loss": 0.2571694254875183,
1207
+ "eval_runtime": 10.6504,
1208
+ "eval_samples_per_second": 512.941,
1209
+ "eval_steps_per_second": 0.563,
1210
+ "step": 37500
1211
+ },
1212
+ {
1213
+ "epoch": 11.606597434331093,
1214
+ "grad_norm": 0.7020719051361084,
1215
+ "learning_rate": 2.6736718373162605e-05,
1216
+ "loss": 0.0376,
1217
+ "step": 38000
1218
+ },
1219
+ {
1220
+ "epoch": 11.606597434331093,
1221
+ "eval_accuracy": 0.9403258282994692,
1222
+ "eval_loss": 0.260541707277298,
1223
+ "eval_runtime": 10.6414,
1224
+ "eval_samples_per_second": 513.371,
1225
+ "eval_steps_per_second": 0.564,
1226
+ "step": 38000
1227
+ },
1228
+ {
1229
+ "epoch": 11.759315821624924,
1230
+ "grad_norm": 0.0348581038415432,
1231
+ "learning_rate": 9.9040391971833e-06,
1232
+ "loss": 0.0384,
1233
+ "step": 38500
1234
+ },
1235
+ {
1236
+ "epoch": 11.759315821624924,
1237
+ "eval_accuracy": 0.9405088779059125,
1238
+ "eval_loss": 0.26105377078056335,
1239
+ "eval_runtime": 10.6519,
1240
+ "eval_samples_per_second": 512.867,
1241
+ "eval_steps_per_second": 0.563,
1242
+ "step": 38500
1243
+ },
1244
+ {
1245
+ "epoch": 11.912034208918755,
1246
+ "grad_norm": 0.17020323872566223,
1247
+ "learning_rate": 1.2590850022137223e-06,
1248
+ "loss": 0.0431,
1249
+ "step": 39000
1250
+ },
1251
+ {
1252
+ "epoch": 11.912034208918755,
1253
+ "eval_accuracy": 0.9406919275123559,
1254
+ "eval_loss": 0.2613375186920166,
1255
+ "eval_runtime": 10.6681,
1256
+ "eval_samples_per_second": 512.087,
1257
+ "eval_steps_per_second": 0.562,
1258
+ "step": 39000
1259
+ },
1260
+ {
1261
+ "epoch": 12.0,
1262
+ "step": 39288,
1263
+ "total_flos": 3.355629231721513e+17,
1264
+ "train_loss": 0.16749728762479646,
1265
+ "train_runtime": 15045.4573,
1266
+ "train_samples_per_second": 83.541,
1267
+ "train_steps_per_second": 2.611
1268
+ }
1269
+ ],
1270
+ "logging_steps": 500,
1271
+ "max_steps": 39288,
1272
+ "num_input_tokens_seen": 0,
1273
+ "num_train_epochs": 12,
1274
+ "save_steps": 500,
1275
+ "stateful_callbacks": {
1276
+ "TrainerControl": {
1277
+ "args": {
1278
+ "should_epoch_stop": false,
1279
+ "should_evaluate": false,
1280
+ "should_log": false,
1281
+ "should_save": true,
1282
+ "should_training_stop": true
1283
+ },
1284
+ "attributes": {}
1285
+ }
1286
+ },
1287
+ "total_flos": 3.355629231721513e+17,
1288
+ "train_batch_size": 32,
1289
+ "trial_name": null,
1290
+ "trial_params": null
1291
+ }
reproduction/glue_exp/qqp/dr0.0,mlr8e-03,clr8e-03,ep=12.0t=21d03h04m16/QQP.tsv ADDED
The diff for this file is too large to render. See raw diff
 
reproduction/glue_exp/qqp/dr0.0,mlr8e-03,clr8e-03,ep=12.0t=21d03h04m16/all_results.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 12.0,
3
+ "eval_accuracy": 0.9238436804353203,
4
+ "eval_combined_score": 0.9113618875098537,
5
+ "eval_f1": 0.898880094584387,
6
+ "eval_loss": 0.227777898311615,
7
+ "eval_runtime": 44.7109,
8
+ "eval_samples": 40430,
9
+ "eval_samples_per_second": 904.254,
10
+ "eval_steps_per_second": 1.767
11
+ }
reproduction/glue_exp/qqp/dr0.0,mlr8e-03,clr8e-03,ep=12.0t=21d03h04m16/eval_results.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 12.0,
3
+ "eval_accuracy": 0.9238436804353203,
4
+ "eval_combined_score": 0.9113618875098537,
5
+ "eval_f1": 0.898880094584387,
6
+ "eval_loss": 0.227777898311615,
7
+ "eval_runtime": 44.7109,
8
+ "eval_samples": 40430,
9
+ "eval_samples_per_second": 904.254,
10
+ "eval_steps_per_second": 1.767
11
+ }
reproduction/glue_exp/qqp/dr0.0,mlr8e-03,clr8e-03,ep=12.0t=21d03h04m16/ft/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[MASK]": 128000
3
+ }
reproduction/glue_exp/qqp/dr0.0,mlr8e-03,clr8e-03,ep=12.0t=21d03h04m16/ft/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[CLS]",
3
+ "cls_token": "[CLS]",
4
+ "eos_token": "[SEP]",
5
+ "mask_token": "[MASK]",
6
+ "pad_token": "[PAD]",
7
+ "sep_token": "[SEP]",
8
+ "unk_token": {
9
+ "content": "[UNK]",
10
+ "lstrip": false,
11
+ "normalized": true,
12
+ "rstrip": false,
13
+ "single_word": false
14
+ }
15
+ }
reproduction/glue_exp/qqp/dr0.0,mlr8e-03,clr8e-03,ep=12.0t=21d03h04m16/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
reproduction/glue_exp/qqp/dr0.0,mlr8e-03,clr8e-03,ep=12.0t=21d03h04m16/ft/tokenizer_config.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[UNK]",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "128000": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "[CLS]",
45
+ "clean_up_tokenization_spaces": false,
46
+ "cls_token": "[CLS]",
47
+ "do_lower_case": false,
48
+ "eos_token": "[SEP]",
49
+ "extra_special_tokens": {},
50
+ "mask_token": "[MASK]",
51
+ "model_max_length": 512,
52
+ "pad_token": "[PAD]",
53
+ "padding_side": "right",
54
+ "sep_token": "[SEP]",
55
+ "sp_model_kwargs": {},
56
+ "split_by_punct": false,
57
+ "tokenizer_class": "DebertaV2Tokenizer",
58
+ "unk_token": "[UNK]",
59
+ "vocab_type": "spm"
60
+ }
reproduction/glue_exp/qqp/dr0.0,mlr8e-03,clr8e-03,ep=12.0t=21d03h04m16/ft2/README.md ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: microsoft/deberta-v3-base
3
+ library_name: peft
4
+ tags:
5
+ - base_model:adapter:microsoft/deberta-v3-base
6
+ - transformers
7
+ ---
8
+
9
+ # Model Card for Model ID
10
+
11
+ <!-- Provide a quick summary of what the model is/does. -->
12
+
13
+
14
+
15
+ ## Model Details
16
+
17
+ ### Model Description
18
+
19
+ <!-- Provide a longer summary of what this model is. -->
20
+
21
+
22
+
23
+ - **Developed by:** [More Information Needed]
24
+ - **Funded by [optional]:** [More Information Needed]
25
+ - **Shared by [optional]:** [More Information Needed]
26
+ - **Model type:** [More Information Needed]
27
+ - **Language(s) (NLP):** [More Information Needed]
28
+ - **License:** [More Information Needed]
29
+ - **Finetuned from model [optional]:** [More Information Needed]
30
+
31
+ ### Model Sources [optional]
32
+
33
+ <!-- Provide the basic links for the model. -->
34
+
35
+ - **Repository:** [More Information Needed]
36
+ - **Paper [optional]:** [More Information Needed]
37
+ - **Demo [optional]:** [More Information Needed]
38
+
39
+ ## Uses
40
+
41
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
42
+
43
+ ### Direct Use
44
+
45
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
46
+
47
+ [More Information Needed]
48
+
49
+ ### Downstream Use [optional]
50
+
51
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
52
+
53
+ [More Information Needed]
54
+
55
+ ### Out-of-Scope Use
56
+
57
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
58
+
59
+ [More Information Needed]
60
+
61
+ ## Bias, Risks, and Limitations
62
+
63
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
64
+
65
+ [More Information Needed]
66
+
67
+ ### Recommendations
68
+
69
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
70
+
71
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
72
+
73
+ ## How to Get Started with the Model
74
+
75
+ Use the code below to get started with the model.
76
+
77
+ [More Information Needed]
78
+
79
+ ## Training Details
80
+
81
+ ### Training Data
82
+
83
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
84
+
85
+ [More Information Needed]
86
+
87
+ ### Training Procedure
88
+
89
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
90
+
91
+ #### Preprocessing [optional]
92
+
93
+ [More Information Needed]
94
+
95
+
96
+ #### Training Hyperparameters
97
+
98
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
99
+
100
+ #### Speeds, Sizes, Times [optional]
101
+
102
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
103
+
104
+ [More Information Needed]
105
+
106
+ ## Evaluation
107
+
108
+ <!-- This section describes the evaluation protocols and provides the results. -->
109
+
110
+ ### Testing Data, Factors & Metrics
111
+
112
+ #### Testing Data
113
+
114
+ <!-- This should link to a Dataset Card if possible. -->
115
+
116
+ [More Information Needed]
117
+
118
+ #### Factors
119
+
120
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
121
+
122
+ [More Information Needed]
123
+
124
+ #### Metrics
125
+
126
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
127
+
128
+ [More Information Needed]
129
+
130
+ ### Results
131
+
132
+ [More Information Needed]
133
+
134
+ #### Summary
135
+
136
+
137
+
138
+ ## Model Examination [optional]
139
+
140
+ <!-- Relevant interpretability work for the model goes here -->
141
+
142
+ [More Information Needed]
143
+
144
+ ## Environmental Impact
145
+
146
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
147
+
148
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
149
+
150
+ - **Hardware Type:** [More Information Needed]
151
+ - **Hours used:** [More Information Needed]
152
+ - **Cloud Provider:** [More Information Needed]
153
+ - **Compute Region:** [More Information Needed]
154
+ - **Carbon Emitted:** [More Information Needed]
155
+
156
+ ## Technical Specifications [optional]
157
+
158
+ ### Model Architecture and Objective
159
+
160
+ [More Information Needed]
161
+
162
+ ### Compute Infrastructure
163
+
164
+ [More Information Needed]
165
+
166
+ #### Hardware
167
+
168
+ [More Information Needed]
169
+
170
+ #### Software
171
+
172
+ [More Information Needed]
173
+
174
+ ## Citation [optional]
175
+
176
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
177
+
178
+ **BibTeX:**
179
+
180
+ [More Information Needed]
181
+
182
+ **APA:**
183
+
184
+ [More Information Needed]
185
+
186
+ ## Glossary [optional]
187
+
188
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
189
+
190
+ [More Information Needed]
191
+
192
+ ## More Information [optional]
193
+
194
+ [More Information Needed]
195
+
196
+ ## Model Card Authors [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Contact
201
+
202
+ [More Information Needed]
203
+ ### Framework versions
204
+
205
+ - PEFT 0.18.0