Guen commited on
Commit
68a4af1
·
verified ·
1 Parent(s): 6058c6f

sync 2026-04-24T07:52:39+00:00

Browse files
sft/checkpoint-400/adapter_config.json CHANGED
@@ -30,8 +30,8 @@
30
  "rank_pattern": {},
31
  "revision": null,
32
  "target_modules": [
33
- "q_proj",
34
  "k_proj",
 
35
  "v_proj",
36
  "o_proj"
37
  ],
 
30
  "rank_pattern": {},
31
  "revision": null,
32
  "target_modules": [
 
33
  "k_proj",
34
+ "q_proj",
35
  "v_proj",
36
  "o_proj"
37
  ],
sft/checkpoint-400/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b152714f3de9276f49eb38c48fa0911fb388519f0094cc4a1f17b7bbe1907129
3
  size 61380432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4160a7f58c058e509163ce331774f03fdfa138d52a65bc19dcba71f10f0cf86
3
  size 61380432
sft/checkpoint-400/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:afd44d2d8ede5e8e25d59fe05c88b0f0f201b83fb64dd0ba80ed0f93c7cc52f1
3
  size 122930379
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d995674cea91aba25b767784162d650dc974d9d00e1673050234bd7b0db3ce2
3
  size 122930379
sft/checkpoint-400/trainer_state.json CHANGED
@@ -11,282 +11,282 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.04094165813715456,
14
- "grad_norm": 6.233852386474609,
15
  "learning_rate": 9.000000000000001e-07,
16
- "loss": 6.206051254272461,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.08188331627430911,
21
- "grad_norm": 5.675178527832031,
22
  "learning_rate": 1.9000000000000002e-06,
23
- "loss": 6.079831314086914,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.12282497441146366,
28
- "grad_norm": 5.132958889007568,
29
  "learning_rate": 2.9e-06,
30
- "loss": 5.749290466308594,
31
  "step": 30
32
  },
33
  {
34
  "epoch": 0.16376663254861823,
35
- "grad_norm": 5.81948184967041,
36
  "learning_rate": 3.900000000000001e-06,
37
- "loss": 6.033080673217773,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 0.2047082906857728,
42
- "grad_norm": 6.442906379699707,
43
  "learning_rate": 4.9000000000000005e-06,
44
- "loss": 5.972381973266602,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.24564994882292732,
49
- "grad_norm": 5.304522514343262,
50
  "learning_rate": 5.9e-06,
51
- "loss": 5.472136306762695,
52
  "step": 60
53
  },
54
  {
55
  "epoch": 0.2865916069600819,
56
- "grad_norm": 5.261083126068115,
57
  "learning_rate": 6.9e-06,
58
- "loss": 5.041688537597656,
59
  "step": 70
60
  },
61
  {
62
  "epoch": 0.32753326509723646,
63
- "grad_norm": 6.387202739715576,
64
  "learning_rate": 7.9e-06,
65
- "loss": 4.916830062866211,
66
  "step": 80
67
  },
68
  {
69
  "epoch": 0.368474923234391,
70
- "grad_norm": 7.068334102630615,
71
  "learning_rate": 8.900000000000001e-06,
72
- "loss": 3.531498336791992,
73
  "step": 90
74
  },
75
  {
76
  "epoch": 0.4094165813715456,
77
- "grad_norm": 4.787615776062012,
78
  "learning_rate": 9.9e-06,
79
- "loss": 2.2226877212524414,
80
  "step": 100
81
  },
82
  {
83
  "epoch": 0.4503582395087001,
84
- "grad_norm": 3.4490268230438232,
85
  "learning_rate": 9.76923076923077e-06,
86
- "loss": 1.955281639099121,
87
  "step": 110
88
  },
89
  {
90
  "epoch": 0.49129989764585463,
91
- "grad_norm": 2.435624361038208,
92
  "learning_rate": 9.512820512820514e-06,
93
- "loss": 1.5814408302307128,
94
  "step": 120
95
  },
96
  {
97
  "epoch": 0.5322415557830092,
98
- "grad_norm": 2.0870230197906494,
99
  "learning_rate": 9.256410256410257e-06,
100
- "loss": 1.4773900985717774,
101
  "step": 130
102
  },
103
  {
104
  "epoch": 0.5731832139201638,
105
- "grad_norm": 2.1049163341522217,
106
  "learning_rate": 9e-06,
107
- "loss": 1.508018970489502,
108
  "step": 140
109
  },
110
  {
111
  "epoch": 0.6141248720573184,
112
- "grad_norm": 2.7535016536712646,
113
  "learning_rate": 8.743589743589743e-06,
114
- "loss": 1.6693695068359375,
115
  "step": 150
116
  },
117
  {
118
  "epoch": 0.6550665301944729,
119
- "grad_norm": 2.2710325717926025,
120
  "learning_rate": 8.487179487179488e-06,
121
- "loss": 1.590962028503418,
122
  "step": 160
123
  },
124
  {
125
  "epoch": 0.6960081883316275,
126
- "grad_norm": 2.007903575897217,
127
  "learning_rate": 8.230769230769232e-06,
128
- "loss": 1.6514751434326171,
129
  "step": 170
130
  },
131
  {
132
  "epoch": 0.736949846468782,
133
- "grad_norm": 2.2933390140533447,
134
  "learning_rate": 7.974358974358975e-06,
135
- "loss": 1.2684261322021484,
136
  "step": 180
137
  },
138
  {
139
  "epoch": 0.7778915046059366,
140
- "grad_norm": 2.4877562522888184,
141
  "learning_rate": 7.717948717948718e-06,
142
- "loss": 1.4245257377624512,
143
  "step": 190
144
  },
145
  {
146
  "epoch": 0.8188331627430911,
147
- "grad_norm": 2.4900834560394287,
148
  "learning_rate": 7.461538461538462e-06,
149
- "loss": 1.4417343139648438,
150
  "step": 200
151
  },
152
  {
153
  "epoch": 0.8597748208802457,
154
- "grad_norm": 1.8601328134536743,
155
  "learning_rate": 7.205128205128206e-06,
156
- "loss": 1.3829060554504395,
157
  "step": 210
158
  },
159
  {
160
  "epoch": 0.9007164790174002,
161
- "grad_norm": 1.3816406726837158,
162
  "learning_rate": 6.948717948717949e-06,
163
- "loss": 1.2489395141601562,
164
  "step": 220
165
  },
166
  {
167
  "epoch": 0.9416581371545547,
168
- "grad_norm": 2.6820783615112305,
169
  "learning_rate": 6.692307692307692e-06,
170
- "loss": 1.2914579391479493,
171
  "step": 230
172
  },
173
  {
174
  "epoch": 0.9825997952917093,
175
- "grad_norm": 4.043029308319092,
176
  "learning_rate": 6.435897435897437e-06,
177
- "loss": 1.2840445518493653,
178
  "step": 240
179
  },
180
  {
181
  "epoch": 1.0204708290685773,
182
- "grad_norm": 1.33917236328125,
183
  "learning_rate": 6.17948717948718e-06,
184
- "loss": 1.20665283203125,
185
  "step": 250
186
  },
187
  {
188
  "epoch": 1.0614124872057318,
189
- "grad_norm": 3.203650712966919,
190
  "learning_rate": 5.923076923076924e-06,
191
- "loss": 1.0558320999145507,
192
  "step": 260
193
  },
194
  {
195
  "epoch": 1.1023541453428864,
196
- "grad_norm": 2.068211078643799,
197
  "learning_rate": 5.666666666666667e-06,
198
- "loss": 1.2426029205322267,
199
  "step": 270
200
  },
201
  {
202
  "epoch": 1.143295803480041,
203
- "grad_norm": 1.6644175052642822,
204
  "learning_rate": 5.41025641025641e-06,
205
- "loss": 1.2411796569824218,
206
  "step": 280
207
  },
208
  {
209
  "epoch": 1.1842374616171956,
210
- "grad_norm": 2.9029407501220703,
211
  "learning_rate": 5.1538461538461534e-06,
212
- "loss": 1.0601702690124513,
213
  "step": 290
214
  },
215
  {
216
  "epoch": 1.22517911975435,
217
- "grad_norm": 2.7912986278533936,
218
  "learning_rate": 4.8974358974358975e-06,
219
- "loss": 1.2574929237365722,
220
  "step": 300
221
  },
222
  {
223
  "epoch": 1.2661207778915047,
224
- "grad_norm": 3.1852738857269287,
225
  "learning_rate": 4.641025641025642e-06,
226
- "loss": 1.3007983207702636,
227
  "step": 310
228
  },
229
  {
230
  "epoch": 1.3070624360286591,
231
- "grad_norm": 2.374436616897583,
232
  "learning_rate": 4.384615384615385e-06,
233
- "loss": 1.1643206596374511,
234
  "step": 320
235
  },
236
  {
237
  "epoch": 1.3480040941658138,
238
- "grad_norm": 2.4103164672851562,
239
  "learning_rate": 4.128205128205128e-06,
240
- "loss": 1.0461477279663085,
241
  "step": 330
242
  },
243
  {
244
  "epoch": 1.3889457523029682,
245
- "grad_norm": 2.588911533355713,
246
  "learning_rate": 3.871794871794872e-06,
247
- "loss": 1.152013111114502,
248
  "step": 340
249
  },
250
  {
251
  "epoch": 1.429887410440123,
252
- "grad_norm": 2.1230835914611816,
253
  "learning_rate": 3.6153846153846156e-06,
254
- "loss": 1.2414540290832519,
255
  "step": 350
256
  },
257
  {
258
  "epoch": 1.4708290685772774,
259
- "grad_norm": 2.0331356525421143,
260
  "learning_rate": 3.358974358974359e-06,
261
- "loss": 1.221489429473877,
262
  "step": 360
263
  },
264
  {
265
  "epoch": 1.511770726714432,
266
- "grad_norm": 2.5309641361236572,
267
  "learning_rate": 3.102564102564103e-06,
268
- "loss": 0.9268945693969727,
269
  "step": 370
270
  },
271
  {
272
  "epoch": 1.5527123848515865,
273
- "grad_norm": 2.3488223552703857,
274
  "learning_rate": 2.846153846153846e-06,
275
- "loss": 1.5277278900146485,
276
  "step": 380
277
  },
278
  {
279
  "epoch": 1.593654042988741,
280
- "grad_norm": 2.196445941925049,
281
  "learning_rate": 2.5897435897435903e-06,
282
- "loss": 1.1361653327941894,
283
  "step": 390
284
  },
285
  {
286
  "epoch": 1.6345957011258956,
287
- "grad_norm": 2.2993838787078857,
288
  "learning_rate": 2.3333333333333336e-06,
289
- "loss": 1.1851417541503906,
290
  "step": 400
291
  }
292
  ],
 
11
  "log_history": [
12
  {
13
  "epoch": 0.04094165813715456,
14
+ "grad_norm": 7.529000759124756,
15
  "learning_rate": 9.000000000000001e-07,
16
+ "loss": 6.639524841308594,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.08188331627430911,
21
+ "grad_norm": 7.1427388191223145,
22
  "learning_rate": 1.9000000000000002e-06,
23
+ "loss": 6.582770538330078,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.12282497441146366,
28
+ "grad_norm": 6.251121997833252,
29
  "learning_rate": 2.9e-06,
30
+ "loss": 6.114261245727539,
31
  "step": 30
32
  },
33
  {
34
  "epoch": 0.16376663254861823,
35
+ "grad_norm": 7.014133453369141,
36
  "learning_rate": 3.900000000000001e-06,
37
+ "loss": 6.333962631225586,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 0.2047082906857728,
42
+ "grad_norm": 7.072969913482666,
43
  "learning_rate": 4.9000000000000005e-06,
44
+ "loss": 6.248543930053711,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.24564994882292732,
49
+ "grad_norm": 5.674424171447754,
50
  "learning_rate": 5.9e-06,
51
+ "loss": 5.6220745086669925,
52
  "step": 60
53
  },
54
  {
55
  "epoch": 0.2865916069600819,
56
+ "grad_norm": 5.3739800453186035,
57
  "learning_rate": 6.9e-06,
58
+ "loss": 5.0251930236816404,
59
  "step": 70
60
  },
61
  {
62
  "epoch": 0.32753326509723646,
63
+ "grad_norm": 6.372293472290039,
64
  "learning_rate": 7.9e-06,
65
+ "loss": 5.033017349243164,
66
  "step": 80
67
  },
68
  {
69
  "epoch": 0.368474923234391,
70
+ "grad_norm": 7.22106409072876,
71
  "learning_rate": 8.900000000000001e-06,
72
+ "loss": 3.514456939697266,
73
  "step": 90
74
  },
75
  {
76
  "epoch": 0.4094165813715456,
77
+ "grad_norm": 4.990958213806152,
78
  "learning_rate": 9.9e-06,
79
+ "loss": 2.256962776184082,
80
  "step": 100
81
  },
82
  {
83
  "epoch": 0.4503582395087001,
84
+ "grad_norm": 3.2661993503570557,
85
  "learning_rate": 9.76923076923077e-06,
86
+ "loss": 2.0482555389404298,
87
  "step": 110
88
  },
89
  {
90
  "epoch": 0.49129989764585463,
91
+ "grad_norm": 2.997323513031006,
92
  "learning_rate": 9.512820512820514e-06,
93
+ "loss": 1.6077747344970703,
94
  "step": 120
95
  },
96
  {
97
  "epoch": 0.5322415557830092,
98
+ "grad_norm": 2.422741413116455,
99
  "learning_rate": 9.256410256410257e-06,
100
+ "loss": 1.538028621673584,
101
  "step": 130
102
  },
103
  {
104
  "epoch": 0.5731832139201638,
105
+ "grad_norm": 2.4594290256500244,
106
  "learning_rate": 9e-06,
107
+ "loss": 1.5614049911499024,
108
  "step": 140
109
  },
110
  {
111
  "epoch": 0.6141248720573184,
112
+ "grad_norm": 2.7256577014923096,
113
  "learning_rate": 8.743589743589743e-06,
114
+ "loss": 1.7180768966674804,
115
  "step": 150
116
  },
117
  {
118
  "epoch": 0.6550665301944729,
119
+ "grad_norm": 2.6614902019500732,
120
  "learning_rate": 8.487179487179488e-06,
121
+ "loss": 1.6412044525146485,
122
  "step": 160
123
  },
124
  {
125
  "epoch": 0.6960081883316275,
126
+ "grad_norm": 2.1517934799194336,
127
  "learning_rate": 8.230769230769232e-06,
128
+ "loss": 1.729467010498047,
129
  "step": 170
130
  },
131
  {
132
  "epoch": 0.736949846468782,
133
+ "grad_norm": 2.4588229656219482,
134
  "learning_rate": 7.974358974358975e-06,
135
+ "loss": 1.305363941192627,
136
  "step": 180
137
  },
138
  {
139
  "epoch": 0.7778915046059366,
140
+ "grad_norm": 2.5282061100006104,
141
  "learning_rate": 7.717948717948718e-06,
142
+ "loss": 1.4986873626708985,
143
  "step": 190
144
  },
145
  {
146
  "epoch": 0.8188331627430911,
147
+ "grad_norm": 3.699396848678589,
148
  "learning_rate": 7.461538461538462e-06,
149
+ "loss": 1.5059691429138184,
150
  "step": 200
151
  },
152
  {
153
  "epoch": 0.8597748208802457,
154
+ "grad_norm": 2.2632241249084473,
155
  "learning_rate": 7.205128205128206e-06,
156
+ "loss": 1.4249659538269044,
157
  "step": 210
158
  },
159
  {
160
  "epoch": 0.9007164790174002,
161
+ "grad_norm": 1.6525421142578125,
162
  "learning_rate": 6.948717948717949e-06,
163
+ "loss": 1.2684237480163574,
164
  "step": 220
165
  },
166
  {
167
  "epoch": 0.9416581371545547,
168
+ "grad_norm": 4.144842624664307,
169
  "learning_rate": 6.692307692307692e-06,
170
+ "loss": 1.3331901550292968,
171
  "step": 230
172
  },
173
  {
174
  "epoch": 0.9825997952917093,
175
+ "grad_norm": 4.287148475646973,
176
  "learning_rate": 6.435897435897437e-06,
177
+ "loss": 1.3455743789672852,
178
  "step": 240
179
  },
180
  {
181
  "epoch": 1.0204708290685773,
182
+ "grad_norm": 1.7868481874465942,
183
  "learning_rate": 6.17948717948718e-06,
184
+ "loss": 1.2180957794189453,
185
  "step": 250
186
  },
187
  {
188
  "epoch": 1.0614124872057318,
189
+ "grad_norm": 3.4123318195343018,
190
  "learning_rate": 5.923076923076924e-06,
191
+ "loss": 1.0844655990600587,
192
  "step": 260
193
  },
194
  {
195
  "epoch": 1.1023541453428864,
196
+ "grad_norm": 2.9107143878936768,
197
  "learning_rate": 5.666666666666667e-06,
198
+ "loss": 1.2759632110595702,
199
  "step": 270
200
  },
201
  {
202
  "epoch": 1.143295803480041,
203
+ "grad_norm": 2.166071653366089,
204
  "learning_rate": 5.41025641025641e-06,
205
+ "loss": 1.2902444839477538,
206
  "step": 280
207
  },
208
  {
209
  "epoch": 1.1842374616171956,
210
+ "grad_norm": 3.7286741733551025,
211
  "learning_rate": 5.1538461538461534e-06,
212
+ "loss": 1.1132170677185058,
213
  "step": 290
214
  },
215
  {
216
  "epoch": 1.22517911975435,
217
+ "grad_norm": 3.644495964050293,
218
  "learning_rate": 4.8974358974358975e-06,
219
+ "loss": 1.3006702423095704,
220
  "step": 300
221
  },
222
  {
223
  "epoch": 1.2661207778915047,
224
+ "grad_norm": 2.9774715900421143,
225
  "learning_rate": 4.641025641025642e-06,
226
+ "loss": 1.3277738571166993,
227
  "step": 310
228
  },
229
  {
230
  "epoch": 1.3070624360286591,
231
+ "grad_norm": 3.5428693294525146,
232
  "learning_rate": 4.384615384615385e-06,
233
+ "loss": 1.2050466537475586,
234
  "step": 320
235
  },
236
  {
237
  "epoch": 1.3480040941658138,
238
+ "grad_norm": 2.9129176139831543,
239
  "learning_rate": 4.128205128205128e-06,
240
+ "loss": 1.0705522537231444,
241
  "step": 330
242
  },
243
  {
244
  "epoch": 1.3889457523029682,
245
+ "grad_norm": 3.6323633193969727,
246
  "learning_rate": 3.871794871794872e-06,
247
+ "loss": 1.1811490058898926,
248
  "step": 340
249
  },
250
  {
251
  "epoch": 1.429887410440123,
252
+ "grad_norm": 2.344156265258789,
253
  "learning_rate": 3.6153846153846156e-06,
254
+ "loss": 1.2702527046203613,
255
  "step": 350
256
  },
257
  {
258
  "epoch": 1.4708290685772774,
259
+ "grad_norm": 2.93733286857605,
260
  "learning_rate": 3.358974358974359e-06,
261
+ "loss": 1.2678390502929688,
262
  "step": 360
263
  },
264
  {
265
  "epoch": 1.511770726714432,
266
+ "grad_norm": 3.3147552013397217,
267
  "learning_rate": 3.102564102564103e-06,
268
+ "loss": 0.975819206237793,
269
  "step": 370
270
  },
271
  {
272
  "epoch": 1.5527123848515865,
273
+ "grad_norm": 3.5217061042785645,
274
  "learning_rate": 2.846153846153846e-06,
275
+ "loss": 1.5488268852233886,
276
  "step": 380
277
  },
278
  {
279
  "epoch": 1.593654042988741,
280
+ "grad_norm": 2.5279595851898193,
281
  "learning_rate": 2.5897435897435903e-06,
282
+ "loss": 1.1411288261413575,
283
  "step": 390
284
  },
285
  {
286
  "epoch": 1.6345957011258956,
287
+ "grad_norm": 2.848078489303589,
288
  "learning_rate": 2.3333333333333336e-06,
289
+ "loss": 1.1716268539428711,
290
  "step": 400
291
  }
292
  ],
sft/checkpoint-490/adapter_config.json CHANGED
@@ -30,8 +30,8 @@
30
  "rank_pattern": {},
31
  "revision": null,
32
  "target_modules": [
33
- "q_proj",
34
  "k_proj",
 
35
  "v_proj",
36
  "o_proj"
37
  ],
 
30
  "rank_pattern": {},
31
  "revision": null,
32
  "target_modules": [
 
33
  "k_proj",
34
+ "q_proj",
35
  "v_proj",
36
  "o_proj"
37
  ],
sft/checkpoint-490/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7d5efc1629290bcfa82fca8d6fe0a1fa199e6edd6673b12516e20f84ef8bf8f9
3
  size 61380432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f17065f015b60b2b71094531a596650f064bc0d4e2c5ae78ced1765b8daf35ac
3
  size 61380432
sft/checkpoint-490/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:35e3f008ba58149c4d3cf026a50e809c2dc5af6a08fc7f325152af82499501e0
3
  size 122930379
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3a60a8ba30916dbad0a22c74ab53e349e337af2bc29188447ad2e67d2854f41
3
  size 122930379
sft/checkpoint-490/trainer_state.json CHANGED
@@ -11,345 +11,345 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.04094165813715456,
14
- "grad_norm": 6.233852386474609,
15
  "learning_rate": 9.000000000000001e-07,
16
- "loss": 6.206051254272461,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.08188331627430911,
21
- "grad_norm": 5.675178527832031,
22
  "learning_rate": 1.9000000000000002e-06,
23
- "loss": 6.079831314086914,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.12282497441146366,
28
- "grad_norm": 5.132958889007568,
29
  "learning_rate": 2.9e-06,
30
- "loss": 5.749290466308594,
31
  "step": 30
32
  },
33
  {
34
  "epoch": 0.16376663254861823,
35
- "grad_norm": 5.81948184967041,
36
  "learning_rate": 3.900000000000001e-06,
37
- "loss": 6.033080673217773,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 0.2047082906857728,
42
- "grad_norm": 6.442906379699707,
43
  "learning_rate": 4.9000000000000005e-06,
44
- "loss": 5.972381973266602,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.24564994882292732,
49
- "grad_norm": 5.304522514343262,
50
  "learning_rate": 5.9e-06,
51
- "loss": 5.472136306762695,
52
  "step": 60
53
  },
54
  {
55
  "epoch": 0.2865916069600819,
56
- "grad_norm": 5.261083126068115,
57
  "learning_rate": 6.9e-06,
58
- "loss": 5.041688537597656,
59
  "step": 70
60
  },
61
  {
62
  "epoch": 0.32753326509723646,
63
- "grad_norm": 6.387202739715576,
64
  "learning_rate": 7.9e-06,
65
- "loss": 4.916830062866211,
66
  "step": 80
67
  },
68
  {
69
  "epoch": 0.368474923234391,
70
- "grad_norm": 7.068334102630615,
71
  "learning_rate": 8.900000000000001e-06,
72
- "loss": 3.531498336791992,
73
  "step": 90
74
  },
75
  {
76
  "epoch": 0.4094165813715456,
77
- "grad_norm": 4.787615776062012,
78
  "learning_rate": 9.9e-06,
79
- "loss": 2.2226877212524414,
80
  "step": 100
81
  },
82
  {
83
  "epoch": 0.4503582395087001,
84
- "grad_norm": 3.4490268230438232,
85
  "learning_rate": 9.76923076923077e-06,
86
- "loss": 1.955281639099121,
87
  "step": 110
88
  },
89
  {
90
  "epoch": 0.49129989764585463,
91
- "grad_norm": 2.435624361038208,
92
  "learning_rate": 9.512820512820514e-06,
93
- "loss": 1.5814408302307128,
94
  "step": 120
95
  },
96
  {
97
  "epoch": 0.5322415557830092,
98
- "grad_norm": 2.0870230197906494,
99
  "learning_rate": 9.256410256410257e-06,
100
- "loss": 1.4773900985717774,
101
  "step": 130
102
  },
103
  {
104
  "epoch": 0.5731832139201638,
105
- "grad_norm": 2.1049163341522217,
106
  "learning_rate": 9e-06,
107
- "loss": 1.508018970489502,
108
  "step": 140
109
  },
110
  {
111
  "epoch": 0.6141248720573184,
112
- "grad_norm": 2.7535016536712646,
113
  "learning_rate": 8.743589743589743e-06,
114
- "loss": 1.6693695068359375,
115
  "step": 150
116
  },
117
  {
118
  "epoch": 0.6550665301944729,
119
- "grad_norm": 2.2710325717926025,
120
  "learning_rate": 8.487179487179488e-06,
121
- "loss": 1.590962028503418,
122
  "step": 160
123
  },
124
  {
125
  "epoch": 0.6960081883316275,
126
- "grad_norm": 2.007903575897217,
127
  "learning_rate": 8.230769230769232e-06,
128
- "loss": 1.6514751434326171,
129
  "step": 170
130
  },
131
  {
132
  "epoch": 0.736949846468782,
133
- "grad_norm": 2.2933390140533447,
134
  "learning_rate": 7.974358974358975e-06,
135
- "loss": 1.2684261322021484,
136
  "step": 180
137
  },
138
  {
139
  "epoch": 0.7778915046059366,
140
- "grad_norm": 2.4877562522888184,
141
  "learning_rate": 7.717948717948718e-06,
142
- "loss": 1.4245257377624512,
143
  "step": 190
144
  },
145
  {
146
  "epoch": 0.8188331627430911,
147
- "grad_norm": 2.4900834560394287,
148
  "learning_rate": 7.461538461538462e-06,
149
- "loss": 1.4417343139648438,
150
  "step": 200
151
  },
152
  {
153
  "epoch": 0.8597748208802457,
154
- "grad_norm": 1.8601328134536743,
155
  "learning_rate": 7.205128205128206e-06,
156
- "loss": 1.3829060554504395,
157
  "step": 210
158
  },
159
  {
160
  "epoch": 0.9007164790174002,
161
- "grad_norm": 1.3816406726837158,
162
  "learning_rate": 6.948717948717949e-06,
163
- "loss": 1.2489395141601562,
164
  "step": 220
165
  },
166
  {
167
  "epoch": 0.9416581371545547,
168
- "grad_norm": 2.6820783615112305,
169
  "learning_rate": 6.692307692307692e-06,
170
- "loss": 1.2914579391479493,
171
  "step": 230
172
  },
173
  {
174
  "epoch": 0.9825997952917093,
175
- "grad_norm": 4.043029308319092,
176
  "learning_rate": 6.435897435897437e-06,
177
- "loss": 1.2840445518493653,
178
  "step": 240
179
  },
180
  {
181
  "epoch": 1.0204708290685773,
182
- "grad_norm": 1.33917236328125,
183
  "learning_rate": 6.17948717948718e-06,
184
- "loss": 1.20665283203125,
185
  "step": 250
186
  },
187
  {
188
  "epoch": 1.0614124872057318,
189
- "grad_norm": 3.203650712966919,
190
  "learning_rate": 5.923076923076924e-06,
191
- "loss": 1.0558320999145507,
192
  "step": 260
193
  },
194
  {
195
  "epoch": 1.1023541453428864,
196
- "grad_norm": 2.068211078643799,
197
  "learning_rate": 5.666666666666667e-06,
198
- "loss": 1.2426029205322267,
199
  "step": 270
200
  },
201
  {
202
  "epoch": 1.143295803480041,
203
- "grad_norm": 1.6644175052642822,
204
  "learning_rate": 5.41025641025641e-06,
205
- "loss": 1.2411796569824218,
206
  "step": 280
207
  },
208
  {
209
  "epoch": 1.1842374616171956,
210
- "grad_norm": 2.9029407501220703,
211
  "learning_rate": 5.1538461538461534e-06,
212
- "loss": 1.0601702690124513,
213
  "step": 290
214
  },
215
  {
216
  "epoch": 1.22517911975435,
217
- "grad_norm": 2.7912986278533936,
218
  "learning_rate": 4.8974358974358975e-06,
219
- "loss": 1.2574929237365722,
220
  "step": 300
221
  },
222
  {
223
  "epoch": 1.2661207778915047,
224
- "grad_norm": 3.1852738857269287,
225
  "learning_rate": 4.641025641025642e-06,
226
- "loss": 1.3007983207702636,
227
  "step": 310
228
  },
229
  {
230
  "epoch": 1.3070624360286591,
231
- "grad_norm": 2.374436616897583,
232
  "learning_rate": 4.384615384615385e-06,
233
- "loss": 1.1643206596374511,
234
  "step": 320
235
  },
236
  {
237
  "epoch": 1.3480040941658138,
238
- "grad_norm": 2.4103164672851562,
239
  "learning_rate": 4.128205128205128e-06,
240
- "loss": 1.0461477279663085,
241
  "step": 330
242
  },
243
  {
244
  "epoch": 1.3889457523029682,
245
- "grad_norm": 2.588911533355713,
246
  "learning_rate": 3.871794871794872e-06,
247
- "loss": 1.152013111114502,
248
  "step": 340
249
  },
250
  {
251
  "epoch": 1.429887410440123,
252
- "grad_norm": 2.1230835914611816,
253
  "learning_rate": 3.6153846153846156e-06,
254
- "loss": 1.2414540290832519,
255
  "step": 350
256
  },
257
  {
258
  "epoch": 1.4708290685772774,
259
- "grad_norm": 2.0331356525421143,
260
  "learning_rate": 3.358974358974359e-06,
261
- "loss": 1.221489429473877,
262
  "step": 360
263
  },
264
  {
265
  "epoch": 1.511770726714432,
266
- "grad_norm": 2.5309641361236572,
267
  "learning_rate": 3.102564102564103e-06,
268
- "loss": 0.9268945693969727,
269
  "step": 370
270
  },
271
  {
272
  "epoch": 1.5527123848515865,
273
- "grad_norm": 2.3488223552703857,
274
  "learning_rate": 2.846153846153846e-06,
275
- "loss": 1.5277278900146485,
276
  "step": 380
277
  },
278
  {
279
  "epoch": 1.593654042988741,
280
- "grad_norm": 2.196445941925049,
281
  "learning_rate": 2.5897435897435903e-06,
282
- "loss": 1.1361653327941894,
283
  "step": 390
284
  },
285
  {
286
  "epoch": 1.6345957011258956,
287
- "grad_norm": 2.2993838787078857,
288
  "learning_rate": 2.3333333333333336e-06,
289
- "loss": 1.1851417541503906,
290
  "step": 400
291
  },
292
  {
293
  "epoch": 1.6755373592630503,
294
- "grad_norm": 1.7422572374343872,
295
  "learning_rate": 2.0769230769230773e-06,
296
- "loss": 1.0863821983337403,
297
  "step": 410
298
  },
299
  {
300
  "epoch": 1.7164790174002047,
301
- "grad_norm": 2.5725717544555664,
302
  "learning_rate": 1.8205128205128205e-06,
303
- "loss": 0.9950428962707519,
304
  "step": 420
305
  },
306
  {
307
  "epoch": 1.7574206755373591,
308
- "grad_norm": 2.113790988922119,
309
  "learning_rate": 1.5641025641025642e-06,
310
- "loss": 0.9922480583190918,
311
  "step": 430
312
  },
313
  {
314
  "epoch": 1.7983623336745138,
315
- "grad_norm": 1.7282646894454956,
316
  "learning_rate": 1.307692307692308e-06,
317
- "loss": 1.1919940948486327,
318
  "step": 440
319
  },
320
  {
321
  "epoch": 1.8393039918116685,
322
- "grad_norm": 2.1598236560821533,
323
  "learning_rate": 1.0512820512820514e-06,
324
- "loss": 1.173375701904297,
325
  "step": 450
326
  },
327
  {
328
  "epoch": 1.880245649948823,
329
- "grad_norm": 2.2551820278167725,
330
  "learning_rate": 7.948717948717949e-07,
331
- "loss": 1.1812771797180175,
332
  "step": 460
333
  },
334
  {
335
  "epoch": 1.9211873080859774,
336
- "grad_norm": 2.8052544593811035,
337
  "learning_rate": 5.384615384615386e-07,
338
- "loss": 0.9614218711853028,
339
  "step": 470
340
  },
341
  {
342
  "epoch": 1.962128966223132,
343
- "grad_norm": 3.274524450302124,
344
  "learning_rate": 2.820512820512821e-07,
345
- "loss": 1.0670873641967773,
346
  "step": 480
347
  },
348
  {
349
  "epoch": 2.0,
350
- "grad_norm": 6.130195617675781,
351
  "learning_rate": 2.5641025641025643e-08,
352
- "loss": 1.051560401916504,
353
  "step": 490
354
  }
355
  ],
 
11
  "log_history": [
12
  {
13
  "epoch": 0.04094165813715456,
14
+ "grad_norm": 7.529000759124756,
15
  "learning_rate": 9.000000000000001e-07,
16
+ "loss": 6.639524841308594,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.08188331627430911,
21
+ "grad_norm": 7.1427388191223145,
22
  "learning_rate": 1.9000000000000002e-06,
23
+ "loss": 6.582770538330078,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.12282497441146366,
28
+ "grad_norm": 6.251121997833252,
29
  "learning_rate": 2.9e-06,
30
+ "loss": 6.114261245727539,
31
  "step": 30
32
  },
33
  {
34
  "epoch": 0.16376663254861823,
35
+ "grad_norm": 7.014133453369141,
36
  "learning_rate": 3.900000000000001e-06,
37
+ "loss": 6.333962631225586,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 0.2047082906857728,
42
+ "grad_norm": 7.072969913482666,
43
  "learning_rate": 4.9000000000000005e-06,
44
+ "loss": 6.248543930053711,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.24564994882292732,
49
+ "grad_norm": 5.674424171447754,
50
  "learning_rate": 5.9e-06,
51
+ "loss": 5.6220745086669925,
52
  "step": 60
53
  },
54
  {
55
  "epoch": 0.2865916069600819,
56
+ "grad_norm": 5.3739800453186035,
57
  "learning_rate": 6.9e-06,
58
+ "loss": 5.0251930236816404,
59
  "step": 70
60
  },
61
  {
62
  "epoch": 0.32753326509723646,
63
+ "grad_norm": 6.372293472290039,
64
  "learning_rate": 7.9e-06,
65
+ "loss": 5.033017349243164,
66
  "step": 80
67
  },
68
  {
69
  "epoch": 0.368474923234391,
70
+ "grad_norm": 7.22106409072876,
71
  "learning_rate": 8.900000000000001e-06,
72
+ "loss": 3.514456939697266,
73
  "step": 90
74
  },
75
  {
76
  "epoch": 0.4094165813715456,
77
+ "grad_norm": 4.990958213806152,
78
  "learning_rate": 9.9e-06,
79
+ "loss": 2.256962776184082,
80
  "step": 100
81
  },
82
  {
83
  "epoch": 0.4503582395087001,
84
+ "grad_norm": 3.2661993503570557,
85
  "learning_rate": 9.76923076923077e-06,
86
+ "loss": 2.0482555389404298,
87
  "step": 110
88
  },
89
  {
90
  "epoch": 0.49129989764585463,
91
+ "grad_norm": 2.997323513031006,
92
  "learning_rate": 9.512820512820514e-06,
93
+ "loss": 1.6077747344970703,
94
  "step": 120
95
  },
96
  {
97
  "epoch": 0.5322415557830092,
98
+ "grad_norm": 2.422741413116455,
99
  "learning_rate": 9.256410256410257e-06,
100
+ "loss": 1.538028621673584,
101
  "step": 130
102
  },
103
  {
104
  "epoch": 0.5731832139201638,
105
+ "grad_norm": 2.4594290256500244,
106
  "learning_rate": 9e-06,
107
+ "loss": 1.5614049911499024,
108
  "step": 140
109
  },
110
  {
111
  "epoch": 0.6141248720573184,
112
+ "grad_norm": 2.7256577014923096,
113
  "learning_rate": 8.743589743589743e-06,
114
+ "loss": 1.7180768966674804,
115
  "step": 150
116
  },
117
  {
118
  "epoch": 0.6550665301944729,
119
+ "grad_norm": 2.6614902019500732,
120
  "learning_rate": 8.487179487179488e-06,
121
+ "loss": 1.6412044525146485,
122
  "step": 160
123
  },
124
  {
125
  "epoch": 0.6960081883316275,
126
+ "grad_norm": 2.1517934799194336,
127
  "learning_rate": 8.230769230769232e-06,
128
+ "loss": 1.729467010498047,
129
  "step": 170
130
  },
131
  {
132
  "epoch": 0.736949846468782,
133
+ "grad_norm": 2.4588229656219482,
134
  "learning_rate": 7.974358974358975e-06,
135
+ "loss": 1.305363941192627,
136
  "step": 180
137
  },
138
  {
139
  "epoch": 0.7778915046059366,
140
+ "grad_norm": 2.5282061100006104,
141
  "learning_rate": 7.717948717948718e-06,
142
+ "loss": 1.4986873626708985,
143
  "step": 190
144
  },
145
  {
146
  "epoch": 0.8188331627430911,
147
+ "grad_norm": 3.699396848678589,
148
  "learning_rate": 7.461538461538462e-06,
149
+ "loss": 1.5059691429138184,
150
  "step": 200
151
  },
152
  {
153
  "epoch": 0.8597748208802457,
154
+ "grad_norm": 2.2632241249084473,
155
  "learning_rate": 7.205128205128206e-06,
156
+ "loss": 1.4249659538269044,
157
  "step": 210
158
  },
159
  {
160
  "epoch": 0.9007164790174002,
161
+ "grad_norm": 1.6525421142578125,
162
  "learning_rate": 6.948717948717949e-06,
163
+ "loss": 1.2684237480163574,
164
  "step": 220
165
  },
166
  {
167
  "epoch": 0.9416581371545547,
168
+ "grad_norm": 4.144842624664307,
169
  "learning_rate": 6.692307692307692e-06,
170
+ "loss": 1.3331901550292968,
171
  "step": 230
172
  },
173
  {
174
  "epoch": 0.9825997952917093,
175
+ "grad_norm": 4.287148475646973,
176
  "learning_rate": 6.435897435897437e-06,
177
+ "loss": 1.3455743789672852,
178
  "step": 240
179
  },
180
  {
181
  "epoch": 1.0204708290685773,
182
+ "grad_norm": 1.7868481874465942,
183
  "learning_rate": 6.17948717948718e-06,
184
+ "loss": 1.2180957794189453,
185
  "step": 250
186
  },
187
  {
188
  "epoch": 1.0614124872057318,
189
+ "grad_norm": 3.4123318195343018,
190
  "learning_rate": 5.923076923076924e-06,
191
+ "loss": 1.0844655990600587,
192
  "step": 260
193
  },
194
  {
195
  "epoch": 1.1023541453428864,
196
+ "grad_norm": 2.9107143878936768,
197
  "learning_rate": 5.666666666666667e-06,
198
+ "loss": 1.2759632110595702,
199
  "step": 270
200
  },
201
  {
202
  "epoch": 1.143295803480041,
203
+ "grad_norm": 2.166071653366089,
204
  "learning_rate": 5.41025641025641e-06,
205
+ "loss": 1.2902444839477538,
206
  "step": 280
207
  },
208
  {
209
  "epoch": 1.1842374616171956,
210
+ "grad_norm": 3.7286741733551025,
211
  "learning_rate": 5.1538461538461534e-06,
212
+ "loss": 1.1132170677185058,
213
  "step": 290
214
  },
215
  {
216
  "epoch": 1.22517911975435,
217
+ "grad_norm": 3.644495964050293,
218
  "learning_rate": 4.8974358974358975e-06,
219
+ "loss": 1.3006702423095704,
220
  "step": 300
221
  },
222
  {
223
  "epoch": 1.2661207778915047,
224
+ "grad_norm": 2.9774715900421143,
225
  "learning_rate": 4.641025641025642e-06,
226
+ "loss": 1.3277738571166993,
227
  "step": 310
228
  },
229
  {
230
  "epoch": 1.3070624360286591,
231
+ "grad_norm": 3.5428693294525146,
232
  "learning_rate": 4.384615384615385e-06,
233
+ "loss": 1.2050466537475586,
234
  "step": 320
235
  },
236
  {
237
  "epoch": 1.3480040941658138,
238
+ "grad_norm": 2.9129176139831543,
239
  "learning_rate": 4.128205128205128e-06,
240
+ "loss": 1.0705522537231444,
241
  "step": 330
242
  },
243
  {
244
  "epoch": 1.3889457523029682,
245
+ "grad_norm": 3.6323633193969727,
246
  "learning_rate": 3.871794871794872e-06,
247
+ "loss": 1.1811490058898926,
248
  "step": 340
249
  },
250
  {
251
  "epoch": 1.429887410440123,
252
+ "grad_norm": 2.344156265258789,
253
  "learning_rate": 3.6153846153846156e-06,
254
+ "loss": 1.2702527046203613,
255
  "step": 350
256
  },
257
  {
258
  "epoch": 1.4708290685772774,
259
+ "grad_norm": 2.93733286857605,
260
  "learning_rate": 3.358974358974359e-06,
261
+ "loss": 1.2678390502929688,
262
  "step": 360
263
  },
264
  {
265
  "epoch": 1.511770726714432,
266
+ "grad_norm": 3.3147552013397217,
267
  "learning_rate": 3.102564102564103e-06,
268
+ "loss": 0.975819206237793,
269
  "step": 370
270
  },
271
  {
272
  "epoch": 1.5527123848515865,
273
+ "grad_norm": 3.5217061042785645,
274
  "learning_rate": 2.846153846153846e-06,
275
+ "loss": 1.5488268852233886,
276
  "step": 380
277
  },
278
  {
279
  "epoch": 1.593654042988741,
280
+ "grad_norm": 2.5279595851898193,
281
  "learning_rate": 2.5897435897435903e-06,
282
+ "loss": 1.1411288261413575,
283
  "step": 390
284
  },
285
  {
286
  "epoch": 1.6345957011258956,
287
+ "grad_norm": 2.848078489303589,
288
  "learning_rate": 2.3333333333333336e-06,
289
+ "loss": 1.1716268539428711,
290
  "step": 400
291
  },
292
  {
293
  "epoch": 1.6755373592630503,
294
+ "grad_norm": 2.8351571559906006,
295
  "learning_rate": 2.0769230769230773e-06,
296
+ "loss": 1.139915370941162,
297
  "step": 410
298
  },
299
  {
300
  "epoch": 1.7164790174002047,
301
+ "grad_norm": 5.1406707763671875,
302
  "learning_rate": 1.8205128205128205e-06,
303
+ "loss": 1.016810417175293,
304
  "step": 420
305
  },
306
  {
307
  "epoch": 1.7574206755373591,
308
+ "grad_norm": 2.3416874408721924,
309
  "learning_rate": 1.5641025641025642e-06,
310
+ "loss": 1.0270769119262695,
311
  "step": 430
312
  },
313
  {
314
  "epoch": 1.7983623336745138,
315
+ "grad_norm": 2.4181466102600098,
316
  "learning_rate": 1.307692307692308e-06,
317
+ "loss": 1.2266251564025878,
318
  "step": 440
319
  },
320
  {
321
  "epoch": 1.8393039918116685,
322
+ "grad_norm": 2.907052755355835,
323
  "learning_rate": 1.0512820512820514e-06,
324
+ "loss": 1.1883393287658692,
325
  "step": 450
326
  },
327
  {
328
  "epoch": 1.880245649948823,
329
+ "grad_norm": 2.765097141265869,
330
  "learning_rate": 7.948717948717949e-07,
331
+ "loss": 1.2259196281433105,
332
  "step": 460
333
  },
334
  {
335
  "epoch": 1.9211873080859774,
336
+ "grad_norm": 3.7463090419769287,
337
  "learning_rate": 5.384615384615386e-07,
338
+ "loss": 1.016909694671631,
339
  "step": 470
340
  },
341
  {
342
  "epoch": 1.962128966223132,
343
+ "grad_norm": 3.9370245933532715,
344
  "learning_rate": 2.820512820512821e-07,
345
+ "loss": 1.1455986976623536,
346
  "step": 480
347
  },
348
  {
349
  "epoch": 2.0,
350
+ "grad_norm": 4.772556304931641,
351
  "learning_rate": 2.5641025641025643e-08,
352
+ "loss": 1.0747014999389648,
353
  "step": 490
354
  }
355
  ],
sft/final/adapter_config.json CHANGED
@@ -30,8 +30,8 @@
30
  "rank_pattern": {},
31
  "revision": null,
32
  "target_modules": [
33
- "q_proj",
34
  "k_proj",
 
35
  "v_proj",
36
  "o_proj"
37
  ],
 
30
  "rank_pattern": {},
31
  "revision": null,
32
  "target_modules": [
 
33
  "k_proj",
34
+ "q_proj",
35
  "v_proj",
36
  "o_proj"
37
  ],
sft/final/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7d5efc1629290bcfa82fca8d6fe0a1fa199e6edd6673b12516e20f84ef8bf8f9
3
  size 61380432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f17065f015b60b2b71094531a596650f064bc0d4e2c5ae78ced1765b8daf35ac
3
  size 61380432