Safetensors
File size: 14,593 Bytes
7d0cb16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 237,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.06389776357827476,
      "grad_norm": 17.501014709472656,
      "learning_rate": 9.992973140107996e-07,
      "loss": 1.3507,
      "num_input_tokens_seen": 93984,
      "step": 5,
      "train_runtime": 556.6226,
      "train_tokens_per_second": 168.847
    },
    {
      "epoch": 0.12779552715654952,
      "grad_norm": 11.144261360168457,
      "learning_rate": 9.964460368509865e-07,
      "loss": 1.0678,
      "num_input_tokens_seen": 187200,
      "step": 10,
      "train_runtime": 1095.0218,
      "train_tokens_per_second": 170.956
    },
    {
      "epoch": 0.19169329073482427,
      "grad_norm": 5.366312026977539,
      "learning_rate": 9.914147615517526e-07,
      "loss": 0.8995,
      "num_input_tokens_seen": 282560,
      "step": 15,
      "train_runtime": 1632.6431,
      "train_tokens_per_second": 173.069
    },
    {
      "epoch": 0.25559105431309903,
      "grad_norm": 5.131296157836914,
      "learning_rate": 9.842255814927944e-07,
      "loss": 0.8231,
      "num_input_tokens_seen": 376064,
      "step": 20,
      "train_runtime": 2172.7033,
      "train_tokens_per_second": 173.086
    },
    {
      "epoch": 0.3194888178913738,
      "grad_norm": 3.8358519077301025,
      "learning_rate": 9.749100658638914e-07,
      "loss": 0.7679,
      "num_input_tokens_seen": 471616,
      "step": 25,
      "train_runtime": 2712.1273,
      "train_tokens_per_second": 173.892
    },
    {
      "epoch": 0.38338658146964855,
      "grad_norm": 3.4662442207336426,
      "learning_rate": 9.63509121038005e-07,
      "loss": 0.7301,
      "num_input_tokens_seen": 564352,
      "step": 30,
      "train_runtime": 3636.2616,
      "train_tokens_per_second": 155.201
    },
    {
      "epoch": 0.4472843450479233,
      "grad_norm": 3.0572352409362793,
      "learning_rate": 9.500728109428603e-07,
      "loss": 0.7017,
      "num_input_tokens_seen": 657568,
      "step": 35,
      "train_runtime": 3832.9972,
      "train_tokens_per_second": 171.555
    },
    {
      "epoch": 0.5111821086261981,
      "grad_norm": 3.1490511894226074,
      "learning_rate": 9.346601372197913e-07,
      "loss": 0.7058,
      "num_input_tokens_seen": 750720,
      "step": 40,
      "train_runtime": 4012.6034,
      "train_tokens_per_second": 187.091
    },
    {
      "epoch": 0.5750798722044729,
      "grad_norm": 3.2475409507751465,
      "learning_rate": 9.17338780135223e-07,
      "loss": 0.6841,
      "num_input_tokens_seen": 842784,
      "step": 45,
      "train_runtime": 4192.5527,
      "train_tokens_per_second": 201.019
    },
    {
      "epoch": 0.6389776357827476,
      "grad_norm": 3.1415064334869385,
      "learning_rate": 8.981848013824993e-07,
      "loss": 0.6738,
      "num_input_tokens_seen": 936128,
      "step": 50,
      "train_runtime": 4379.3543,
      "train_tokens_per_second": 213.759
    },
    {
      "epoch": 0.7028753993610224,
      "grad_norm": 3.2890915870666504,
      "learning_rate": 8.77282310079115e-07,
      "loss": 0.6643,
      "num_input_tokens_seen": 1028160,
      "step": 55,
      "train_runtime": 4592.6801,
      "train_tokens_per_second": 223.869
    },
    {
      "epoch": 0.7667731629392971,
      "grad_norm": 2.9755921363830566,
      "learning_rate": 8.547230934260311e-07,
      "loss": 0.6449,
      "num_input_tokens_seen": 1119552,
      "step": 60,
      "train_runtime": 5103.7665,
      "train_tokens_per_second": 219.358
    },
    {
      "epoch": 0.8306709265175719,
      "grad_norm": 3.0064377784729004,
      "learning_rate": 8.306062136509219e-07,
      "loss": 0.6547,
      "num_input_tokens_seen": 1212032,
      "step": 65,
      "train_runtime": 5281.6563,
      "train_tokens_per_second": 229.48
    },
    {
      "epoch": 0.8945686900958466,
      "grad_norm": 3.0753512382507324,
      "learning_rate": 8.050375730052621e-07,
      "loss": 0.6543,
      "num_input_tokens_seen": 1306368,
      "step": 70,
      "train_runtime": 5458.516,
      "train_tokens_per_second": 239.327
    },
    {
      "epoch": 0.9584664536741214,
      "grad_norm": 2.9098427295684814,
      "learning_rate": 7.781294487254435e-07,
      "loss": 0.6579,
      "num_input_tokens_seen": 1400576,
      "step": 75,
      "train_runtime": 5636.7125,
      "train_tokens_per_second": 248.474
    },
    {
      "epoch": 1.012779552715655,
      "grad_norm": 3.033903121948242,
      "learning_rate": 7.5e-07,
      "loss": 0.6344,
      "num_input_tokens_seen": 1481248,
      "step": 80,
      "train_runtime": 5831.8252,
      "train_tokens_per_second": 253.994
    },
    {
      "epoch": 1.0766773162939298,
      "grad_norm": 2.6656150817871094,
      "learning_rate": 7.207727491079559e-07,
      "loss": 0.6292,
      "num_input_tokens_seen": 1575104,
      "step": 85,
      "train_runtime": 6012.044,
      "train_tokens_per_second": 261.991
    },
    {
      "epoch": 1.1405750798722045,
      "grad_norm": 2.7004282474517822,
      "learning_rate": 6.905760390067234e-07,
      "loss": 0.6239,
      "num_input_tokens_seen": 1668064,
      "step": 90,
      "train_runtime": 6510.4742,
      "train_tokens_per_second": 256.212
    },
    {
      "epoch": 1.2044728434504792,
      "grad_norm": 2.72955060005188,
      "learning_rate": 6.595424697513963e-07,
      "loss": 0.6157,
      "num_input_tokens_seen": 1764128,
      "step": 95,
      "train_runtime": 6697.2327,
      "train_tokens_per_second": 263.411
    },
    {
      "epoch": 1.268370607028754,
      "grad_norm": 2.819629192352295,
      "learning_rate": 6.278083162202373e-07,
      "loss": 0.6096,
      "num_input_tokens_seen": 1858912,
      "step": 100,
      "train_runtime": 6880.9117,
      "train_tokens_per_second": 270.155
    },
    {
      "epoch": 1.3322683706070286,
      "grad_norm": 2.837791919708252,
      "learning_rate": 5.955129297032538e-07,
      "loss": 0.5967,
      "num_input_tokens_seen": 1952640,
      "step": 105,
      "train_runtime": 7062.8237,
      "train_tokens_per_second": 276.467
    },
    {
      "epoch": 1.3961661341853036,
      "grad_norm": 2.6342546939849854,
      "learning_rate": 5.62798125981604e-07,
      "loss": 0.6051,
      "num_input_tokens_seen": 2045792,
      "step": 110,
      "train_runtime": 7245.471,
      "train_tokens_per_second": 282.355
    },
    {
      "epoch": 1.4600638977635783,
      "grad_norm": 2.5401482582092285,
      "learning_rate": 5.298075625849099e-07,
      "loss": 0.5899,
      "num_input_tokens_seen": 2140736,
      "step": 115,
      "train_runtime": 7427.2464,
      "train_tokens_per_second": 288.227
    },
    {
      "epoch": 1.5239616613418532,
      "grad_norm": 2.6414806842803955,
      "learning_rate": 4.966861079610687e-07,
      "loss": 0.5901,
      "num_input_tokens_seen": 2233280,
      "step": 120,
      "train_runtime": 7929.4515,
      "train_tokens_per_second": 281.644
    },
    {
      "epoch": 1.5878594249201279,
      "grad_norm": 2.817983865737915,
      "learning_rate": 4.6357920532866816e-07,
      "loss": 0.6011,
      "num_input_tokens_seen": 2326144,
      "step": 125,
      "train_runtime": 8118.8041,
      "train_tokens_per_second": 286.513
    },
    {
      "epoch": 1.6517571884984026,
      "grad_norm": 2.9443130493164062,
      "learning_rate": 4.306322340054659e-07,
      "loss": 0.5969,
      "num_input_tokens_seen": 2418592,
      "step": 130,
      "train_runtime": 8306.8501,
      "train_tokens_per_second": 291.156
    },
    {
      "epoch": 1.7156549520766773,
      "grad_norm": 2.630876302719116,
      "learning_rate": 3.979898710174677e-07,
      "loss": 0.5948,
      "num_input_tokens_seen": 2512320,
      "step": 135,
      "train_runtime": 8501.9493,
      "train_tokens_per_second": 295.499
    },
    {
      "epoch": 1.779552715654952,
      "grad_norm": 2.6901042461395264,
      "learning_rate": 3.657954557919183e-07,
      "loss": 0.598,
      "num_input_tokens_seen": 2606112,
      "step": 140,
      "train_runtime": 8694.019,
      "train_tokens_per_second": 299.759
    },
    {
      "epoch": 1.8434504792332267,
      "grad_norm": 2.8361966609954834,
      "learning_rate": 3.3419036072396614e-07,
      "loss": 0.5902,
      "num_input_tokens_seen": 2699936,
      "step": 145,
      "train_runtime": 8889.9576,
      "train_tokens_per_second": 303.706
    },
    {
      "epoch": 1.9073482428115016,
      "grad_norm": 2.87080979347229,
      "learning_rate": 3.033133703809759e-07,
      "loss": 0.5978,
      "num_input_tokens_seen": 2795136,
      "step": 150,
      "train_runtime": 9406.664,
      "train_tokens_per_second": 297.144
    },
    {
      "epoch": 1.9712460063897763,
      "grad_norm": 2.7429561614990234,
      "learning_rate": 2.7330007207053406e-07,
      "loss": 0.5946,
      "num_input_tokens_seen": 2888960,
      "step": 155,
      "train_runtime": 9589.8224,
      "train_tokens_per_second": 301.253
    },
    {
      "epoch": 2.02555910543131,
      "grad_norm": 2.6952402591705322,
      "learning_rate": 2.442822604482889e-07,
      "loss": 0.5918,
      "num_input_tokens_seen": 2968224,
      "step": 160,
      "train_runtime": 9773.1495,
      "train_tokens_per_second": 303.712
    },
    {
      "epoch": 2.0894568690095845,
      "grad_norm": 2.673067569732666,
      "learning_rate": 2.16387358780116e-07,
      "loss": 0.5663,
      "num_input_tokens_seen": 3062400,
      "step": 165,
      "train_runtime": 9959.8127,
      "train_tokens_per_second": 307.476
    },
    {
      "epoch": 2.1533546325878596,
      "grad_norm": 2.7801618576049805,
      "learning_rate": 1.8973785939996927e-07,
      "loss": 0.5791,
      "num_input_tokens_seen": 3155520,
      "step": 170,
      "train_runtime": 10148.3528,
      "train_tokens_per_second": 310.939
    },
    {
      "epoch": 2.2172523961661343,
      "grad_norm": 2.5863192081451416,
      "learning_rate": 1.6445078582048154e-07,
      "loss": 0.5695,
      "num_input_tokens_seen": 3250496,
      "step": 175,
      "train_runtime": 10338.7555,
      "train_tokens_per_second": 314.399
    },
    {
      "epoch": 2.281150159744409,
      "grad_norm": 2.7501046657562256,
      "learning_rate": 1.4063717885830373e-07,
      "loss": 0.5675,
      "num_input_tokens_seen": 3344672,
      "step": 180,
      "train_runtime": 10847.5534,
      "train_tokens_per_second": 308.334
    },
    {
      "epoch": 2.3450479233226837,
      "grad_norm": 2.7307002544403076,
      "learning_rate": 1.184016090307059e-07,
      "loss": 0.5657,
      "num_input_tokens_seen": 3438784,
      "step": 185,
      "train_runtime": 11033.7768,
      "train_tokens_per_second": 311.66
    },
    {
      "epoch": 2.4089456869009584,
      "grad_norm": 2.663017988204956,
      "learning_rate": 9.78417173646176e-08,
      "loss": 0.5745,
      "num_input_tokens_seen": 3531840,
      "step": 190,
      "train_runtime": 11217.6988,
      "train_tokens_per_second": 314.845
    },
    {
      "epoch": 2.472843450479233,
      "grad_norm": 2.6426873207092285,
      "learning_rate": 7.904778663450323e-08,
      "loss": 0.5885,
      "num_input_tokens_seen": 3625984,
      "step": 195,
      "train_runtime": 11403.1929,
      "train_tokens_per_second": 317.98
    },
    {
      "epoch": 2.536741214057508,
      "grad_norm": 2.7548089027404785,
      "learning_rate": 6.210234491186079e-08,
      "loss": 0.5748,
      "num_input_tokens_seen": 3720352,
      "step": 200,
      "train_runtime": 11603.6555,
      "train_tokens_per_second": 320.619
    },
    {
      "epoch": 2.600638977635783,
      "grad_norm": 2.6678242683410645,
      "learning_rate": 4.7079803167238366e-08,
      "loss": 0.5741,
      "num_input_tokens_seen": 3812992,
      "step": 205,
      "train_runtime": 11793.3317,
      "train_tokens_per_second": 323.318
    },
    {
      "epoch": 2.6645367412140573,
      "grad_norm": 2.8868303298950195,
      "learning_rate": 3.4046128516136754e-08,
      "loss": 0.5642,
      "num_input_tokens_seen": 3905280,
      "step": 210,
      "train_runtime": 12299.9488,
      "train_tokens_per_second": 317.504
    },
    {
      "epoch": 2.7284345047923324,
      "grad_norm": 2.5737545490264893,
      "learning_rate": 2.3058554543638698e-08,
      "loss": 0.5741,
      "num_input_tokens_seen": 3999680,
      "step": 215,
      "train_runtime": 12494.1775,
      "train_tokens_per_second": 320.124
    },
    {
      "epoch": 2.792332268370607,
      "grad_norm": 2.635117292404175,
      "learning_rate": 1.4165329979794971e-08,
      "loss": 0.5805,
      "num_input_tokens_seen": 4094720,
      "step": 220,
      "train_runtime": 12687.8777,
      "train_tokens_per_second": 322.727
    },
    {
      "epoch": 2.856230031948882,
      "grad_norm": 2.665903329849243,
      "learning_rate": 7.405506829382735e-09,
      "loss": 0.5779,
      "num_input_tokens_seen": 4189248,
      "step": 225,
      "train_runtime": 12880.2851,
      "train_tokens_per_second": 325.245
    },
    {
      "epoch": 2.9201277955271565,
      "grad_norm": 2.6100857257843018,
      "learning_rate": 2.808768886403301e-09,
      "loss": 0.5671,
      "num_input_tokens_seen": 4282208,
      "step": 230,
      "train_runtime": 13076.8487,
      "train_tokens_per_second": 327.465
    },
    {
      "epoch": 2.984025559105431,
      "grad_norm": 2.5199291706085205,
      "learning_rate": 3.9530138634907837e-10,
      "loss": 0.5685,
      "num_input_tokens_seen": 4376096,
      "step": 235,
      "train_runtime": 13275.1963,
      "train_tokens_per_second": 329.645
    },
    {
      "epoch": 3.0,
      "num_input_tokens_seen": 4400096,
      "step": 237,
      "total_flos": 1.8666841676395315e+17,
      "train_loss": 0.6493978349468376,
      "train_runtime": 13351.2629,
      "train_samples_per_second": 2.247,
      "train_steps_per_second": 0.018
    }
  ],
  "logging_steps": 5,
  "max_steps": 237,
  "num_input_tokens_seen": 4400096,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.8666841676395315e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}