sdar_4b_trace_sft-final / trainer_state.json
autoprogrammer's picture
Upload sdar_4b_trace_sft-final
e682cfb verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1404,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0106951871657754,
"grad_norm": 5.869461536407471,
"learning_rate": 9.30232558139535e-07,
"loss": 0.3054,
"step": 5
},
{
"epoch": 0.0213903743315508,
"grad_norm": 4.7690653800964355,
"learning_rate": 2.0930232558139536e-06,
"loss": 0.2661,
"step": 10
},
{
"epoch": 0.03208556149732621,
"grad_norm": 4.248436450958252,
"learning_rate": 3.2558139534883724e-06,
"loss": 0.2409,
"step": 15
},
{
"epoch": 0.0427807486631016,
"grad_norm": 4.08921480178833,
"learning_rate": 4.418604651162791e-06,
"loss": 0.2228,
"step": 20
},
{
"epoch": 0.053475935828877004,
"grad_norm": 4.846882343292236,
"learning_rate": 5.58139534883721e-06,
"loss": 0.2258,
"step": 25
},
{
"epoch": 0.06417112299465241,
"grad_norm": 4.230620384216309,
"learning_rate": 6.744186046511628e-06,
"loss": 0.2103,
"step": 30
},
{
"epoch": 0.0748663101604278,
"grad_norm": 4.4064130783081055,
"learning_rate": 7.906976744186048e-06,
"loss": 0.2122,
"step": 35
},
{
"epoch": 0.0855614973262032,
"grad_norm": 4.86531400680542,
"learning_rate": 9.069767441860465e-06,
"loss": 0.1996,
"step": 40
},
{
"epoch": 0.0962566844919786,
"grad_norm": 5.53223991394043,
"learning_rate": 9.999986679414613e-06,
"loss": 0.2311,
"step": 45
},
{
"epoch": 0.10695187165775401,
"grad_norm": 4.2916178703308105,
"learning_rate": 9.999520466378376e-06,
"loss": 0.2319,
"step": 50
},
{
"epoch": 0.11764705882352941,
"grad_norm": 4.200897216796875,
"learning_rate": 9.998388295046227e-06,
"loss": 0.2368,
"step": 55
},
{
"epoch": 0.12834224598930483,
"grad_norm": 4.808861255645752,
"learning_rate": 9.996590316228402e-06,
"loss": 0.2354,
"step": 60
},
{
"epoch": 0.13903743315508021,
"grad_norm": 4.5134077072143555,
"learning_rate": 9.994126769423656e-06,
"loss": 0.234,
"step": 65
},
{
"epoch": 0.1497326203208556,
"grad_norm": 4.348900318145752,
"learning_rate": 9.990997982787348e-06,
"loss": 0.2541,
"step": 70
},
{
"epoch": 0.16042780748663102,
"grad_norm": 4.4039306640625,
"learning_rate": 9.98720437308773e-06,
"loss": 0.2496,
"step": 75
},
{
"epoch": 0.1711229946524064,
"grad_norm": 4.726532936096191,
"learning_rate": 9.982746445650437e-06,
"loss": 0.2797,
"step": 80
},
{
"epoch": 0.18181818181818182,
"grad_norm": 4.256592273712158,
"learning_rate": 9.977624794291172e-06,
"loss": 0.2518,
"step": 85
},
{
"epoch": 0.1925133689839572,
"grad_norm": 4.330168724060059,
"learning_rate": 9.97184010123661e-06,
"loss": 0.261,
"step": 90
},
{
"epoch": 0.20320855614973263,
"grad_norm": 4.281119346618652,
"learning_rate": 9.965393137033512e-06,
"loss": 0.2564,
"step": 95
},
{
"epoch": 0.21390374331550802,
"grad_norm": 4.247673511505127,
"learning_rate": 9.958284760446104e-06,
"loss": 0.2501,
"step": 100
},
{
"epoch": 0.22459893048128343,
"grad_norm": 3.930504322052002,
"learning_rate": 9.950515918341666e-06,
"loss": 0.2504,
"step": 105
},
{
"epoch": 0.23529411764705882,
"grad_norm": 4.084280967712402,
"learning_rate": 9.942087645564415e-06,
"loss": 0.2706,
"step": 110
},
{
"epoch": 0.24598930481283424,
"grad_norm": 3.869400978088379,
"learning_rate": 9.93300106479766e-06,
"loss": 0.2499,
"step": 115
},
{
"epoch": 0.25668449197860965,
"grad_norm": 4.0887675285339355,
"learning_rate": 9.923257386414253e-06,
"loss": 0.2628,
"step": 120
},
{
"epoch": 0.26737967914438504,
"grad_norm": 4.140570163726807,
"learning_rate": 9.912857908315363e-06,
"loss": 0.2642,
"step": 125
},
{
"epoch": 0.27807486631016043,
"grad_norm": 4.731294631958008,
"learning_rate": 9.901804015757588e-06,
"loss": 0.2669,
"step": 130
},
{
"epoch": 0.2887700534759358,
"grad_norm": 4.113650321960449,
"learning_rate": 9.89009718116843e-06,
"loss": 0.2809,
"step": 135
},
{
"epoch": 0.2994652406417112,
"grad_norm": 3.8770499229431152,
"learning_rate": 9.877738963950175e-06,
"loss": 0.2705,
"step": 140
},
{
"epoch": 0.31016042780748665,
"grad_norm": 4.077112197875977,
"learning_rate": 9.864731010272152e-06,
"loss": 0.2548,
"step": 145
},
{
"epoch": 0.32085561497326204,
"grad_norm": 4.771427631378174,
"learning_rate": 9.851075052851476e-06,
"loss": 0.2534,
"step": 150
},
{
"epoch": 0.3315508021390374,
"grad_norm": 3.84387469291687,
"learning_rate": 9.83677291072223e-06,
"loss": 0.2438,
"step": 155
},
{
"epoch": 0.3422459893048128,
"grad_norm": 4.427935600280762,
"learning_rate": 9.821826488993168e-06,
"loss": 0.2791,
"step": 160
},
{
"epoch": 0.35294117647058826,
"grad_norm": 4.1868767738342285,
"learning_rate": 9.806237778593941e-06,
"loss": 0.2734,
"step": 165
},
{
"epoch": 0.36363636363636365,
"grad_norm": 3.8055779933929443,
"learning_rate": 9.790008856009902e-06,
"loss": 0.2463,
"step": 170
},
{
"epoch": 0.37433155080213903,
"grad_norm": 4.041962623596191,
"learning_rate": 9.773141883005507e-06,
"loss": 0.2813,
"step": 175
},
{
"epoch": 0.3850267379679144,
"grad_norm": 3.6363329887390137,
"learning_rate": 9.755639106336347e-06,
"loss": 0.2724,
"step": 180
},
{
"epoch": 0.39572192513368987,
"grad_norm": 3.50486421585083,
"learning_rate": 9.737502857449894e-06,
"loss": 0.271,
"step": 185
},
{
"epoch": 0.40641711229946526,
"grad_norm": 3.4967920780181885,
"learning_rate": 9.718735552174923e-06,
"loss": 0.2467,
"step": 190
},
{
"epoch": 0.41711229946524064,
"grad_norm": 3.3797831535339355,
"learning_rate": 9.699339690399717e-06,
"loss": 0.2621,
"step": 195
},
{
"epoch": 0.42780748663101603,
"grad_norm": 15.109771728515625,
"learning_rate": 9.679317855739073e-06,
"loss": 0.2791,
"step": 200
},
{
"epoch": 0.4385026737967914,
"grad_norm": 3.828474760055542,
"learning_rate": 9.658672715190151e-06,
"loss": 0.2829,
"step": 205
},
{
"epoch": 0.44919786096256686,
"grad_norm": 3.2750980854034424,
"learning_rate": 9.637407018777224e-06,
"loss": 0.2866,
"step": 210
},
{
"epoch": 0.45989304812834225,
"grad_norm": 3.615290880203247,
"learning_rate": 9.615523599185353e-06,
"loss": 0.2422,
"step": 215
},
{
"epoch": 0.47058823529411764,
"grad_norm": 3.5919339656829834,
"learning_rate": 9.593025371383064e-06,
"loss": 0.269,
"step": 220
},
{
"epoch": 0.48128342245989303,
"grad_norm": 4.708985805511475,
"learning_rate": 9.569915332234068e-06,
"loss": 0.2686,
"step": 225
},
{
"epoch": 0.4919786096256685,
"grad_norm": 3.5988545417785645,
"learning_rate": 9.546196560098062e-06,
"loss": 0.273,
"step": 230
},
{
"epoch": 0.5026737967914439,
"grad_norm": 3.4772067070007324,
"learning_rate": 9.521872214420668e-06,
"loss": 0.2395,
"step": 235
},
{
"epoch": 0.5133689839572193,
"grad_norm": 3.7974278926849365,
"learning_rate": 9.496945535312597e-06,
"loss": 0.2682,
"step": 240
},
{
"epoch": 0.5240641711229946,
"grad_norm": 3.69970440864563,
"learning_rate": 9.471419843118036e-06,
"loss": 0.2557,
"step": 245
},
{
"epoch": 0.5347593582887701,
"grad_norm": 3.470737934112549,
"learning_rate": 9.44529853797238e-06,
"loss": 0.266,
"step": 250
},
{
"epoch": 0.5454545454545454,
"grad_norm": 3.733236312866211,
"learning_rate": 9.418585099349306e-06,
"loss": 0.2779,
"step": 255
},
{
"epoch": 0.5561497326203209,
"grad_norm": 3.434349298477173,
"learning_rate": 9.391283085597299e-06,
"loss": 0.2508,
"step": 260
},
{
"epoch": 0.5668449197860963,
"grad_norm": 3.5163183212280273,
"learning_rate": 9.36339613346565e-06,
"loss": 0.2727,
"step": 265
},
{
"epoch": 0.5775401069518716,
"grad_norm": 4.07936429977417,
"learning_rate": 9.33492795762005e-06,
"loss": 0.2714,
"step": 270
},
{
"epoch": 0.5882352941176471,
"grad_norm": 3.171149492263794,
"learning_rate": 9.305882350147763e-06,
"loss": 0.2585,
"step": 275
},
{
"epoch": 0.5989304812834224,
"grad_norm": 3.424302101135254,
"learning_rate": 9.276263180052498e-06,
"loss": 0.2584,
"step": 280
},
{
"epoch": 0.6096256684491979,
"grad_norm": 3.800421714782715,
"learning_rate": 9.246074392739057e-06,
"loss": 0.2771,
"step": 285
},
{
"epoch": 0.6203208556149733,
"grad_norm": 3.4711737632751465,
"learning_rate": 9.21532000948778e-06,
"loss": 0.265,
"step": 290
},
{
"epoch": 0.6310160427807486,
"grad_norm": 3.5598926544189453,
"learning_rate": 9.184004126918891e-06,
"loss": 0.2653,
"step": 295
},
{
"epoch": 0.6417112299465241,
"grad_norm": 3.320875644683838,
"learning_rate": 9.152130916446817e-06,
"loss": 0.2373,
"step": 300
},
{
"epoch": 0.6524064171122995,
"grad_norm": 3.606067419052124,
"learning_rate": 9.119704623724528e-06,
"loss": 0.2779,
"step": 305
},
{
"epoch": 0.6631016042780749,
"grad_norm": 3.189316511154175,
"learning_rate": 9.086729568078006e-06,
"loss": 0.2813,
"step": 310
},
{
"epoch": 0.6737967914438503,
"grad_norm": 3.725550651550293,
"learning_rate": 9.05321014193089e-06,
"loss": 0.284,
"step": 315
},
{
"epoch": 0.6844919786096256,
"grad_norm": 3.6915953159332275,
"learning_rate": 9.019150810219376e-06,
"loss": 0.2646,
"step": 320
},
{
"epoch": 0.6951871657754011,
"grad_norm": 3.499095916748047,
"learning_rate": 8.984556109797484e-06,
"loss": 0.2693,
"step": 325
},
{
"epoch": 0.7058823529411765,
"grad_norm": 3.671689510345459,
"learning_rate": 8.949430648832716e-06,
"loss": 0.2662,
"step": 330
},
{
"epoch": 0.7165775401069518,
"grad_norm": 3.549403429031372,
"learning_rate": 8.91377910619223e-06,
"loss": 0.2745,
"step": 335
},
{
"epoch": 0.7272727272727273,
"grad_norm": 3.2107975482940674,
"learning_rate": 8.8776062308196e-06,
"loss": 0.2462,
"step": 340
},
{
"epoch": 0.7379679144385026,
"grad_norm": 3.040982246398926,
"learning_rate": 8.84091684110223e-06,
"loss": 0.2663,
"step": 345
},
{
"epoch": 0.7486631016042781,
"grad_norm": 3.460141658782959,
"learning_rate": 8.803715824229525e-06,
"loss": 0.2595,
"step": 350
},
{
"epoch": 0.7593582887700535,
"grad_norm": 3.5115394592285156,
"learning_rate": 8.766008135541896e-06,
"loss": 0.2476,
"step": 355
},
{
"epoch": 0.7700534759358288,
"grad_norm": 3.2345032691955566,
"learning_rate": 8.727798797870688e-06,
"loss": 0.2399,
"step": 360
},
{
"epoch": 0.7807486631016043,
"grad_norm": 3.489104986190796,
"learning_rate": 8.689092900869112e-06,
"loss": 0.2568,
"step": 365
},
{
"epoch": 0.7914438502673797,
"grad_norm": 3.4822070598602295,
"learning_rate": 8.649895600334284e-06,
"loss": 0.2779,
"step": 370
},
{
"epoch": 0.8021390374331551,
"grad_norm": 3.6014904975891113,
"learning_rate": 8.610212117520453e-06,
"loss": 0.2738,
"step": 375
},
{
"epoch": 0.8128342245989305,
"grad_norm": 3.3353493213653564,
"learning_rate": 8.570047738443502e-06,
"loss": 0.2564,
"step": 380
},
{
"epoch": 0.8235294117647058,
"grad_norm": 3.1294198036193848,
"learning_rate": 8.52940781317683e-06,
"loss": 0.2459,
"step": 385
},
{
"epoch": 0.8342245989304813,
"grad_norm": 3.2122206687927246,
"learning_rate": 8.48829775513869e-06,
"loss": 0.2639,
"step": 390
},
{
"epoch": 0.8449197860962567,
"grad_norm": 3.2475640773773193,
"learning_rate": 8.446723040371114e-06,
"loss": 0.2545,
"step": 395
},
{
"epoch": 0.8556149732620321,
"grad_norm": 3.113609552383423,
"learning_rate": 8.40468920681047e-06,
"loss": 0.2712,
"step": 400
},
{
"epoch": 0.8663101604278075,
"grad_norm": 3.231935501098633,
"learning_rate": 8.362201853549777e-06,
"loss": 0.27,
"step": 405
},
{
"epoch": 0.8770053475935828,
"grad_norm": 3.3826744556427,
"learning_rate": 8.319266640092899e-06,
"loss": 0.2823,
"step": 410
},
{
"epoch": 0.8877005347593583,
"grad_norm": 3.3691983222961426,
"learning_rate": 8.275889285600656e-06,
"loss": 0.2555,
"step": 415
},
{
"epoch": 0.8983957219251337,
"grad_norm": 3.4460556507110596,
"learning_rate": 8.23207556812902e-06,
"loss": 0.2615,
"step": 420
},
{
"epoch": 0.9090909090909091,
"grad_norm": 3.0713820457458496,
"learning_rate": 8.187831323859445e-06,
"loss": 0.2512,
"step": 425
},
{
"epoch": 0.9197860962566845,
"grad_norm": 2.781736373901367,
"learning_rate": 8.143162446321465e-06,
"loss": 0.2568,
"step": 430
},
{
"epoch": 0.93048128342246,
"grad_norm": 3.0802292823791504,
"learning_rate": 8.098074885607646e-06,
"loss": 0.2506,
"step": 435
},
{
"epoch": 0.9411764705882353,
"grad_norm": 3.0498411655426025,
"learning_rate": 8.052574647581009e-06,
"loss": 0.2586,
"step": 440
},
{
"epoch": 0.9518716577540107,
"grad_norm": 3.0598437786102295,
"learning_rate": 8.006667793075026e-06,
"loss": 0.2576,
"step": 445
},
{
"epoch": 0.9625668449197861,
"grad_norm": 2.935920238494873,
"learning_rate": 7.960360437086287e-06,
"loss": 0.2363,
"step": 450
},
{
"epoch": 0.9732620320855615,
"grad_norm": 3.2348110675811768,
"learning_rate": 7.91365874795995e-06,
"loss": 0.2677,
"step": 455
},
{
"epoch": 0.983957219251337,
"grad_norm": 3.4138996601104736,
"learning_rate": 7.866568946568107e-06,
"loss": 0.2702,
"step": 460
},
{
"epoch": 0.9946524064171123,
"grad_norm": 2.949201822280884,
"learning_rate": 7.819097305481112e-06,
"loss": 0.2487,
"step": 465
},
{
"epoch": 1.0042780748663103,
"grad_norm": 2.0031890869140625,
"learning_rate": 7.771250148132067e-06,
"loss": 0.1638,
"step": 470
},
{
"epoch": 1.0149732620320855,
"grad_norm": 2.1339635848999023,
"learning_rate": 7.723033847974503e-06,
"loss": 0.1013,
"step": 475
},
{
"epoch": 1.025668449197861,
"grad_norm": 2.822927474975586,
"learning_rate": 7.674454827633413e-06,
"loss": 0.0924,
"step": 480
},
{
"epoch": 1.0363636363636364,
"grad_norm": 3.7076971530914307,
"learning_rate": 7.625519558049722e-06,
"loss": 0.1075,
"step": 485
},
{
"epoch": 1.0470588235294118,
"grad_norm": 2.7152915000915527,
"learning_rate": 7.576234557618336e-06,
"loss": 0.0926,
"step": 490
},
{
"epoch": 1.0577540106951873,
"grad_norm": 3.0349481105804443,
"learning_rate": 7.526606391319862e-06,
"loss": 0.1054,
"step": 495
},
{
"epoch": 1.0684491978609625,
"grad_norm": 2.7987284660339355,
"learning_rate": 7.476641669846121e-06,
"loss": 0.0999,
"step": 500
},
{
"epoch": 1.079144385026738,
"grad_norm": 2.2019879817962646,
"learning_rate": 7.426347048719577e-06,
"loss": 0.1082,
"step": 505
},
{
"epoch": 1.0898395721925134,
"grad_norm": 2.8885598182678223,
"learning_rate": 7.375729227406789e-06,
"loss": 0.1227,
"step": 510
},
{
"epoch": 1.1005347593582888,
"grad_norm": 2.8309147357940674,
"learning_rate": 7.324794948426015e-06,
"loss": 0.0943,
"step": 515
},
{
"epoch": 1.1112299465240643,
"grad_norm": 2.788823366165161,
"learning_rate": 7.273550996449077e-06,
"loss": 0.1038,
"step": 520
},
{
"epoch": 1.1219251336898395,
"grad_norm": 2.1967334747314453,
"learning_rate": 7.222004197397613e-06,
"loss": 0.0956,
"step": 525
},
{
"epoch": 1.132620320855615,
"grad_norm": 2.6466832160949707,
"learning_rate": 7.170161417533836e-06,
"loss": 0.0946,
"step": 530
},
{
"epoch": 1.1433155080213904,
"grad_norm": 2.3864052295684814,
"learning_rate": 7.118029562545915e-06,
"loss": 0.0868,
"step": 535
},
{
"epoch": 1.1540106951871658,
"grad_norm": 2.6868040561676025,
"learning_rate": 7.065615576628107e-06,
"loss": 0.1009,
"step": 540
},
{
"epoch": 1.1647058823529413,
"grad_norm": 2.748037815093994,
"learning_rate": 7.0129264415557585e-06,
"loss": 0.1067,
"step": 545
},
{
"epoch": 1.1754010695187165,
"grad_norm": 3.183288097381592,
"learning_rate": 6.959969175755306e-06,
"loss": 0.1009,
"step": 550
},
{
"epoch": 1.186096256684492,
"grad_norm": 2.472684383392334,
"learning_rate": 6.906750833369386e-06,
"loss": 0.0912,
"step": 555
},
{
"epoch": 1.1967914438502674,
"grad_norm": 3.301072359085083,
"learning_rate": 6.8532785033171975e-06,
"loss": 0.1026,
"step": 560
},
{
"epoch": 1.2074866310160428,
"grad_norm": 2.607532262802124,
"learning_rate": 6.799559308350219e-06,
"loss": 0.0985,
"step": 565
},
{
"epoch": 1.2181818181818183,
"grad_norm": 2.714355230331421,
"learning_rate": 6.745600404103431e-06,
"loss": 0.0982,
"step": 570
},
{
"epoch": 1.2288770053475937,
"grad_norm": 3.1020829677581787,
"learning_rate": 6.6914089781421535e-06,
"loss": 0.1084,
"step": 575
},
{
"epoch": 1.239572192513369,
"grad_norm": 3.3140673637390137,
"learning_rate": 6.636992249004629e-06,
"loss": 0.1054,
"step": 580
},
{
"epoch": 1.2502673796791444,
"grad_norm": 2.7101094722747803,
"learning_rate": 6.582357465240488e-06,
"loss": 0.1045,
"step": 585
},
{
"epoch": 1.2609625668449198,
"grad_norm": 2.9788272380828857,
"learning_rate": 6.527511904445194e-06,
"loss": 0.0883,
"step": 590
},
{
"epoch": 1.2716577540106953,
"grad_norm": 3.2309136390686035,
"learning_rate": 6.472462872290654e-06,
"loss": 0.0946,
"step": 595
},
{
"epoch": 1.2823529411764705,
"grad_norm": 2.4168930053710938,
"learning_rate": 6.417217701552059e-06,
"loss": 0.0996,
"step": 600
},
{
"epoch": 1.293048128342246,
"grad_norm": 2.6719398498535156,
"learning_rate": 6.36178375113113e-06,
"loss": 0.0963,
"step": 605
},
{
"epoch": 1.3037433155080214,
"grad_norm": 2.4402902126312256,
"learning_rate": 6.3061684050758776e-06,
"loss": 0.0986,
"step": 610
},
{
"epoch": 1.3144385026737968,
"grad_norm": 3.2404541969299316,
"learning_rate": 6.250379071597018e-06,
"loss": 0.0925,
"step": 615
},
{
"epoch": 1.3251336898395722,
"grad_norm": 2.904754161834717,
"learning_rate": 6.194423182081161e-06,
"loss": 0.1125,
"step": 620
},
{
"epoch": 1.3358288770053477,
"grad_norm": 3.329714059829712,
"learning_rate": 6.138308190100918e-06,
"loss": 0.1038,
"step": 625
},
{
"epoch": 1.346524064171123,
"grad_norm": 3.2199363708496094,
"learning_rate": 6.082041570422059e-06,
"loss": 0.1097,
"step": 630
},
{
"epoch": 1.3572192513368984,
"grad_norm": 2.6863086223602295,
"learning_rate": 6.025630818007833e-06,
"loss": 0.1001,
"step": 635
},
{
"epoch": 1.3679144385026738,
"grad_norm": 2.825577735900879,
"learning_rate": 5.969083447020606e-06,
"loss": 0.1018,
"step": 640
},
{
"epoch": 1.3786096256684492,
"grad_norm": 2.606800079345703,
"learning_rate": 5.912406989820948e-06,
"loss": 0.1043,
"step": 645
},
{
"epoch": 1.3893048128342245,
"grad_norm": 2.4628851413726807,
"learning_rate": 5.855608995964283e-06,
"loss": 0.0961,
"step": 650
},
{
"epoch": 1.4,
"grad_norm": 2.584479331970215,
"learning_rate": 5.798697031195257e-06,
"loss": 0.1025,
"step": 655
},
{
"epoch": 1.4106951871657754,
"grad_norm": 2.4871041774749756,
"learning_rate": 5.741678676439946e-06,
"loss": 0.0993,
"step": 660
},
{
"epoch": 1.4213903743315508,
"grad_norm": 2.876145601272583,
"learning_rate": 5.684561526796045e-06,
"loss": 0.0968,
"step": 665
},
{
"epoch": 1.4320855614973262,
"grad_norm": 3.0748062133789062,
"learning_rate": 5.627353190521168e-06,
"loss": 0.0824,
"step": 670
},
{
"epoch": 1.4427807486631017,
"grad_norm": 2.5074338912963867,
"learning_rate": 5.570061288019385e-06,
"loss": 0.0994,
"step": 675
},
{
"epoch": 1.4534759358288771,
"grad_norm": 2.8043177127838135,
"learning_rate": 5.51269345082617e-06,
"loss": 0.1007,
"step": 680
},
{
"epoch": 1.4641711229946524,
"grad_norm": 2.6914560794830322,
"learning_rate": 5.455257320591825e-06,
"loss": 0.1021,
"step": 685
},
{
"epoch": 1.4748663101604278,
"grad_norm": 2.208425521850586,
"learning_rate": 5.397760548063591e-06,
"loss": 0.0918,
"step": 690
},
{
"epoch": 1.4855614973262032,
"grad_norm": 2.6503446102142334,
"learning_rate": 5.340210792066531e-06,
"loss": 0.0967,
"step": 695
},
{
"epoch": 1.4962566844919787,
"grad_norm": 2.4122114181518555,
"learning_rate": 5.282615718483344e-06,
"loss": 0.0859,
"step": 700
},
{
"epoch": 1.506951871657754,
"grad_norm": 2.084665060043335,
"learning_rate": 5.224982999233228e-06,
"loss": 0.0858,
"step": 705
},
{
"epoch": 1.5176470588235293,
"grad_norm": 3.171128034591675,
"learning_rate": 5.167320311249951e-06,
"loss": 0.1037,
"step": 710
},
{
"epoch": 1.5283422459893048,
"grad_norm": 3.304917573928833,
"learning_rate": 5.109635335459256e-06,
"loss": 0.0845,
"step": 715
},
{
"epoch": 1.5390374331550802,
"grad_norm": 2.538336992263794,
"learning_rate": 5.051935755755713e-06,
"loss": 0.0875,
"step": 720
},
{
"epoch": 1.5497326203208557,
"grad_norm": 3.2048041820526123,
"learning_rate": 4.9942292579791965e-06,
"loss": 0.0957,
"step": 725
},
{
"epoch": 1.5604278074866311,
"grad_norm": 2.32966685295105,
"learning_rate": 4.936523528891111e-06,
"loss": 0.08,
"step": 730
},
{
"epoch": 1.5711229946524066,
"grad_norm": 2.5276477336883545,
"learning_rate": 4.878826255150453e-06,
"loss": 0.1025,
"step": 735
},
{
"epoch": 1.5818181818181818,
"grad_norm": 2.489506244659424,
"learning_rate": 4.821145122289941e-06,
"loss": 0.0905,
"step": 740
},
{
"epoch": 1.5925133689839572,
"grad_norm": 2.0386905670166016,
"learning_rate": 4.763487813692252e-06,
"loss": 0.0863,
"step": 745
},
{
"epoch": 1.6032085561497325,
"grad_norm": 3.1131086349487305,
"learning_rate": 4.705862009566564e-06,
"loss": 0.0884,
"step": 750
},
{
"epoch": 1.613903743315508,
"grad_norm": 2.19130277633667,
"learning_rate": 4.648275385925522e-06,
"loss": 0.0919,
"step": 755
},
{
"epoch": 1.6245989304812833,
"grad_norm": 2.7279891967773438,
"learning_rate": 4.5907356135627605e-06,
"loss": 0.0956,
"step": 760
},
{
"epoch": 1.6352941176470588,
"grad_norm": 2.4997522830963135,
"learning_rate": 4.533250357031104e-06,
"loss": 0.1017,
"step": 765
},
{
"epoch": 1.6459893048128342,
"grad_norm": 2.488523006439209,
"learning_rate": 4.475827273621639e-06,
"loss": 0.0968,
"step": 770
},
{
"epoch": 1.6566844919786097,
"grad_norm": 2.9442784786224365,
"learning_rate": 4.418474012343711e-06,
"loss": 0.0973,
"step": 775
},
{
"epoch": 1.6673796791443851,
"grad_norm": 2.7047040462493896,
"learning_rate": 4.361198212906048e-06,
"loss": 0.0884,
"step": 780
},
{
"epoch": 1.6780748663101606,
"grad_norm": 2.4104325771331787,
"learning_rate": 4.304007504699118e-06,
"loss": 0.087,
"step": 785
},
{
"epoch": 1.6887700534759358,
"grad_norm": 2.9229371547698975,
"learning_rate": 4.246909505778862e-06,
"loss": 0.0772,
"step": 790
},
{
"epoch": 1.6994652406417112,
"grad_norm": 2.7569501399993896,
"learning_rate": 4.189911821851928e-06,
"loss": 0.0973,
"step": 795
},
{
"epoch": 1.7101604278074867,
"grad_norm": 2.2701845169067383,
"learning_rate": 4.1330220452625644e-06,
"loss": 0.0869,
"step": 800
},
{
"epoch": 1.720855614973262,
"grad_norm": 2.91239857673645,
"learning_rate": 4.076247753981285e-06,
"loss": 0.0985,
"step": 805
},
{
"epoch": 1.7315508021390373,
"grad_norm": 2.3714444637298584,
"learning_rate": 4.019596510595447e-06,
"loss": 0.0862,
"step": 810
},
{
"epoch": 1.7422459893048128,
"grad_norm": 2.617003917694092,
"learning_rate": 3.963075861301886e-06,
"loss": 0.0933,
"step": 815
},
{
"epoch": 1.7529411764705882,
"grad_norm": 2.6759443283081055,
"learning_rate": 3.9066933349017165e-06,
"loss": 0.0883,
"step": 820
},
{
"epoch": 1.7636363636363637,
"grad_norm": 3.302290439605713,
"learning_rate": 3.8504564417974795e-06,
"loss": 0.0988,
"step": 825
},
{
"epoch": 1.7743315508021391,
"grad_norm": 2.1597912311553955,
"learning_rate": 3.7943726729927154e-06,
"loss": 0.0922,
"step": 830
},
{
"epoch": 1.7850267379679146,
"grad_norm": 2.2167508602142334,
"learning_rate": 3.738449499094121e-06,
"loss": 0.0807,
"step": 835
},
{
"epoch": 1.79572192513369,
"grad_norm": 2.180342197418213,
"learning_rate": 3.682694369316446e-06,
"loss": 0.0741,
"step": 840
},
{
"epoch": 1.8064171122994652,
"grad_norm": 3.1924726963043213,
"learning_rate": 3.6271147104902192e-06,
"loss": 0.0836,
"step": 845
},
{
"epoch": 1.8171122994652407,
"grad_norm": 3.1651058197021484,
"learning_rate": 3.5717179260724544e-06,
"loss": 0.0911,
"step": 850
},
{
"epoch": 1.8278074866310159,
"grad_norm": 3.11163330078125,
"learning_rate": 3.5165113951604874e-06,
"loss": 0.094,
"step": 855
},
{
"epoch": 1.8385026737967913,
"grad_norm": 2.2718892097473145,
"learning_rate": 3.461502471509045e-06,
"loss": 0.0818,
"step": 860
},
{
"epoch": 1.8491978609625668,
"grad_norm": 2.625342607498169,
"learning_rate": 3.4066984825506855e-06,
"loss": 0.0891,
"step": 865
},
{
"epoch": 1.8598930481283422,
"grad_norm": 2.286201000213623,
"learning_rate": 3.35210672841976e-06,
"loss": 0.0906,
"step": 870
},
{
"epoch": 1.8705882352941177,
"grad_norm": 2.516383171081543,
"learning_rate": 3.297734480980002e-06,
"loss": 0.0856,
"step": 875
},
{
"epoch": 1.881283422459893,
"grad_norm": 3.1512904167175293,
"learning_rate": 3.2435889828558753e-06,
"loss": 0.0845,
"step": 880
},
{
"epoch": 1.8919786096256686,
"grad_norm": 3.0289194583892822,
"learning_rate": 3.1896774464678327e-06,
"loss": 0.0956,
"step": 885
},
{
"epoch": 1.902673796791444,
"grad_norm": 2.4393064975738525,
"learning_rate": 3.1360070530715885e-06,
"loss": 0.0824,
"step": 890
},
{
"epoch": 1.9133689839572192,
"grad_norm": 2.774142026901245,
"learning_rate": 3.0825849518015334e-06,
"loss": 0.0873,
"step": 895
},
{
"epoch": 1.9240641711229947,
"grad_norm": 2.386697292327881,
"learning_rate": 3.029418258718454e-06,
"loss": 0.0836,
"step": 900
},
{
"epoch": 1.93475935828877,
"grad_norm": 2.7990691661834717,
"learning_rate": 2.9765140558616287e-06,
"loss": 0.0875,
"step": 905
},
{
"epoch": 1.9454545454545453,
"grad_norm": 2.4611012935638428,
"learning_rate": 2.9238793903054757e-06,
"loss": 0.0791,
"step": 910
},
{
"epoch": 1.9561497326203208,
"grad_norm": 2.292494058609009,
"learning_rate": 2.8715212732208523e-06,
"loss": 0.0777,
"step": 915
},
{
"epoch": 1.9668449197860962,
"grad_norm": 2.5736472606658936,
"learning_rate": 2.819446678941126e-06,
"loss": 0.0853,
"step": 920
},
{
"epoch": 1.9775401069518717,
"grad_norm": 2.792259693145752,
"learning_rate": 2.7676625440331756e-06,
"loss": 0.08,
"step": 925
},
{
"epoch": 1.988235294117647,
"grad_norm": 2.7590394020080566,
"learning_rate": 2.7161757663734012e-06,
"loss": 0.0876,
"step": 930
},
{
"epoch": 1.9989304812834225,
"grad_norm": 3.141918420791626,
"learning_rate": 2.6649932042288994e-06,
"loss": 0.0874,
"step": 935
},
{
"epoch": 2.0085561497326205,
"grad_norm": 1.2265775203704834,
"learning_rate": 2.6141216753439115e-06,
"loss": 0.0265,
"step": 940
},
{
"epoch": 2.0192513368983955,
"grad_norm": 1.1748132705688477,
"learning_rate": 2.5635679560316703e-06,
"loss": 0.026,
"step": 945
},
{
"epoch": 2.029946524064171,
"grad_norm": 1.9212442636489868,
"learning_rate": 2.5133387802717585e-06,
"loss": 0.0245,
"step": 950
},
{
"epoch": 2.0406417112299464,
"grad_norm": 1.0849891901016235,
"learning_rate": 2.4634408388131255e-06,
"loss": 0.0213,
"step": 955
},
{
"epoch": 2.051336898395722,
"grad_norm": 2.341118812561035,
"learning_rate": 2.4138807782828423e-06,
"loss": 0.0217,
"step": 960
},
{
"epoch": 2.0620320855614973,
"grad_norm": 1.3080765008926392,
"learning_rate": 2.3646652003007348e-06,
"loss": 0.0213,
"step": 965
},
{
"epoch": 2.0727272727272728,
"grad_norm": 1.187409520149231,
"learning_rate": 2.3158006606000345e-06,
"loss": 0.0278,
"step": 970
},
{
"epoch": 2.083422459893048,
"grad_norm": 2.2584495544433594,
"learning_rate": 2.267293668154114e-06,
"loss": 0.0239,
"step": 975
},
{
"epoch": 2.0941176470588236,
"grad_norm": 2.2861719131469727,
"learning_rate": 2.2191506843094628e-06,
"loss": 0.0254,
"step": 980
},
{
"epoch": 2.104812834224599,
"grad_norm": 1.3212783336639404,
"learning_rate": 2.1713781219250157e-06,
"loss": 0.022,
"step": 985
},
{
"epoch": 2.1155080213903745,
"grad_norm": 1.6825640201568604,
"learning_rate": 2.1239823445179236e-06,
"loss": 0.0216,
"step": 990
},
{
"epoch": 2.1262032085561495,
"grad_norm": 2.0671329498291016,
"learning_rate": 2.076969665415908e-06,
"loss": 0.0211,
"step": 995
},
{
"epoch": 2.136898395721925,
"grad_norm": 1.9904191493988037,
"learning_rate": 2.0303463469163004e-06,
"loss": 0.0246,
"step": 1000
},
{
"epoch": 2.1475935828877004,
"grad_norm": 1.6527910232543945,
"learning_rate": 1.984118599451866e-06,
"loss": 0.024,
"step": 1005
},
{
"epoch": 2.158288770053476,
"grad_norm": 1.7971508502960205,
"learning_rate": 1.938292580763561e-06,
"loss": 0.025,
"step": 1010
},
{
"epoch": 2.1689839572192513,
"grad_norm": 1.7470921277999878,
"learning_rate": 1.8928743950802863e-06,
"loss": 0.0295,
"step": 1015
},
{
"epoch": 2.1796791443850267,
"grad_norm": 1.0844634771347046,
"learning_rate": 1.847870092305773e-06,
"loss": 0.0158,
"step": 1020
},
{
"epoch": 2.190374331550802,
"grad_norm": 1.632693886756897,
"learning_rate": 1.8032856672127191e-06,
"loss": 0.0211,
"step": 1025
},
{
"epoch": 2.2010695187165776,
"grad_norm": 1.0532259941101074,
"learning_rate": 1.7591270586442517e-06,
"loss": 0.0189,
"step": 1030
},
{
"epoch": 2.211764705882353,
"grad_norm": 1.5501878261566162,
"learning_rate": 1.7154001487228478e-06,
"loss": 0.0175,
"step": 1035
},
{
"epoch": 2.2224598930481285,
"grad_norm": 1.4238228797912598,
"learning_rate": 1.672110762066811e-06,
"loss": 0.0215,
"step": 1040
},
{
"epoch": 2.233155080213904,
"grad_norm": 1.4649953842163086,
"learning_rate": 1.6292646650144072e-06,
"loss": 0.0182,
"step": 1045
},
{
"epoch": 2.243850267379679,
"grad_norm": 2.2017998695373535,
"learning_rate": 1.5868675648557568e-06,
"loss": 0.0176,
"step": 1050
},
{
"epoch": 2.2545454545454544,
"grad_norm": 1.6893876791000366,
"learning_rate": 1.544925109072607e-06,
"loss": 0.02,
"step": 1055
},
{
"epoch": 2.26524064171123,
"grad_norm": 1.3076281547546387,
"learning_rate": 1.50344288458606e-06,
"loss": 0.0185,
"step": 1060
},
{
"epoch": 2.2759358288770053,
"grad_norm": 1.342536449432373,
"learning_rate": 1.462426417012361e-06,
"loss": 0.0165,
"step": 1065
},
{
"epoch": 2.2866310160427807,
"grad_norm": 2.3762521743774414,
"learning_rate": 1.4218811699268752e-06,
"loss": 0.019,
"step": 1070
},
{
"epoch": 2.297326203208556,
"grad_norm": 1.6990896463394165,
"learning_rate": 1.38181254413631e-06,
"loss": 0.0179,
"step": 1075
},
{
"epoch": 2.3080213903743316,
"grad_norm": 1.9363665580749512,
"learning_rate": 1.3422258769592967e-06,
"loss": 0.0199,
"step": 1080
},
{
"epoch": 2.318716577540107,
"grad_norm": 1.7887779474258423,
"learning_rate": 1.3031264415154477e-06,
"loss": 0.0194,
"step": 1085
},
{
"epoch": 2.3294117647058825,
"grad_norm": 1.9284300804138184,
"learning_rate": 1.2645194460229454e-06,
"loss": 0.0213,
"step": 1090
},
{
"epoch": 2.3401069518716575,
"grad_norm": 2.6546132564544678,
"learning_rate": 1.2264100331047878e-06,
"loss": 0.0216,
"step": 1095
},
{
"epoch": 2.350802139037433,
"grad_norm": 1.0779314041137695,
"learning_rate": 1.1888032791037697e-06,
"loss": 0.0198,
"step": 1100
},
{
"epoch": 2.3614973262032084,
"grad_norm": 2.0439534187316895,
"learning_rate": 1.1517041934062834e-06,
"loss": 0.019,
"step": 1105
},
{
"epoch": 2.372192513368984,
"grad_norm": 1.9515999555587769,
"learning_rate": 1.115117717775056e-06,
"loss": 0.0169,
"step": 1110
},
{
"epoch": 2.3828877005347593,
"grad_norm": 1.5731124877929688,
"learning_rate": 1.0790487256908826e-06,
"loss": 0.0219,
"step": 1115
},
{
"epoch": 2.3935828877005347,
"grad_norm": 2.9910495281219482,
"learning_rate": 1.043502021703449e-06,
"loss": 0.0273,
"step": 1120
},
{
"epoch": 2.40427807486631,
"grad_norm": 1.1649227142333984,
"learning_rate": 1.0084823407913563e-06,
"loss": 0.0242,
"step": 1125
},
{
"epoch": 2.4149732620320856,
"grad_norm": 1.618787407875061,
"learning_rate": 9.739943477313918e-07,
"loss": 0.0171,
"step": 1130
},
{
"epoch": 2.425668449197861,
"grad_norm": 1.1850640773773193,
"learning_rate": 9.400426364771648e-07,
"loss": 0.0207,
"step": 1135
},
{
"epoch": 2.4363636363636365,
"grad_norm": 1.6684306859970093,
"learning_rate": 9.066317295471688e-07,
"loss": 0.0154,
"step": 1140
},
{
"epoch": 2.447058823529412,
"grad_norm": 1.5794020891189575,
"learning_rate": 8.737660774223655e-07,
"loss": 0.017,
"step": 1145
},
{
"epoch": 2.4577540106951874,
"grad_norm": 1.608679175376892,
"learning_rate": 8.414500579533536e-07,
"loss": 0.0174,
"step": 1150
},
{
"epoch": 2.4684491978609624,
"grad_norm": 2.3195552825927734,
"learning_rate": 8.096879757772297e-07,
"loss": 0.0268,
"step": 1155
},
{
"epoch": 2.479144385026738,
"grad_norm": 1.937566876411438,
"learning_rate": 7.784840617441858e-07,
"loss": 0.0192,
"step": 1160
},
{
"epoch": 2.4898395721925133,
"grad_norm": 1.0479762554168701,
"learning_rate": 7.47842472353939e-07,
"loss": 0.0163,
"step": 1165
},
{
"epoch": 2.5005347593582887,
"grad_norm": 1.6187900304794312,
"learning_rate": 7.177672892020743e-07,
"loss": 0.0195,
"step": 1170
},
{
"epoch": 2.511229946524064,
"grad_norm": 2.198720932006836,
"learning_rate": 6.882625184363534e-07,
"loss": 0.0182,
"step": 1175
},
{
"epoch": 2.5219251336898396,
"grad_norm": 1.8759390115737915,
"learning_rate": 6.593320902230748e-07,
"loss": 0.0232,
"step": 1180
},
{
"epoch": 2.532620320855615,
"grad_norm": 1.2502473592758179,
"learning_rate": 6.309798582235671e-07,
"loss": 0.0178,
"step": 1185
},
{
"epoch": 2.5433155080213905,
"grad_norm": 1.6022732257843018,
"learning_rate": 6.032095990808567e-07,
"loss": 0.0193,
"step": 1190
},
{
"epoch": 2.5540106951871655,
"grad_norm": 1.390811562538147,
"learning_rate": 5.760250119166061e-07,
"loss": 0.0203,
"step": 1195
},
{
"epoch": 2.564705882352941,
"grad_norm": 1.4281519651412964,
"learning_rate": 5.494297178383729e-07,
"loss": 0.0194,
"step": 1200
},
{
"epoch": 2.5754010695187164,
"grad_norm": 1.2555127143859863,
"learning_rate": 5.234272594572604e-07,
"loss": 0.0172,
"step": 1205
},
{
"epoch": 2.586096256684492,
"grad_norm": 2.223867654800415,
"learning_rate": 4.980211004160306e-07,
"loss": 0.0222,
"step": 1210
},
{
"epoch": 2.5967914438502673,
"grad_norm": 0.8818890452384949,
"learning_rate": 4.7321462492772975e-07,
"loss": 0.0168,
"step": 1215
},
{
"epoch": 2.6074866310160427,
"grad_norm": 2.0135138034820557,
"learning_rate": 4.490111373248918e-07,
"loss": 0.0188,
"step": 1220
},
{
"epoch": 2.618181818181818,
"grad_norm": 2.228736400604248,
"learning_rate": 4.2541386161939424e-07,
"loss": 0.024,
"step": 1225
},
{
"epoch": 2.6288770053475936,
"grad_norm": 1.993802547454834,
"learning_rate": 4.024259410730008e-07,
"loss": 0.0204,
"step": 1230
},
{
"epoch": 2.639572192513369,
"grad_norm": 1.5609642267227173,
"learning_rate": 3.800504377786651e-07,
"loss": 0.0207,
"step": 1235
},
{
"epoch": 2.6502673796791445,
"grad_norm": 2.098813772201538,
"learning_rate": 3.582903322526482e-07,
"loss": 0.0199,
"step": 1240
},
{
"epoch": 2.66096256684492,
"grad_norm": 1.1627248525619507,
"learning_rate": 3.371485230374988e-07,
"loss": 0.0226,
"step": 1245
},
{
"epoch": 2.6716577540106954,
"grad_norm": 1.6938384771347046,
"learning_rate": 3.166278263159539e-07,
"loss": 0.0166,
"step": 1250
},
{
"epoch": 2.682352941176471,
"grad_norm": 2.4418842792510986,
"learning_rate": 2.9673097553581385e-07,
"loss": 0.0226,
"step": 1255
},
{
"epoch": 2.693048128342246,
"grad_norm": 1.8153109550476074,
"learning_rate": 2.7746062104583473e-07,
"loss": 0.0164,
"step": 1260
},
{
"epoch": 2.7037433155080213,
"grad_norm": 1.7912057638168335,
"learning_rate": 2.588193297426844e-07,
"loss": 0.0222,
"step": 1265
},
{
"epoch": 2.7144385026737967,
"grad_norm": 1.2618776559829712,
"learning_rate": 2.4080958472902873e-07,
"loss": 0.0226,
"step": 1270
},
{
"epoch": 2.725133689839572,
"grad_norm": 1.1699613332748413,
"learning_rate": 2.2343378498276391e-07,
"loss": 0.0145,
"step": 1275
},
{
"epoch": 2.7358288770053476,
"grad_norm": 1.2414964437484741,
"learning_rate": 2.0669424503746871e-07,
"loss": 0.0173,
"step": 1280
},
{
"epoch": 2.746524064171123,
"grad_norm": 1.5833706855773926,
"learning_rate": 1.9059319467409244e-07,
"loss": 0.0185,
"step": 1285
},
{
"epoch": 2.7572192513368985,
"grad_norm": 1.5452425479888916,
"learning_rate": 1.7513277862394263e-07,
"loss": 0.0143,
"step": 1290
},
{
"epoch": 2.767914438502674,
"grad_norm": 2.001614809036255,
"learning_rate": 1.603150562829936e-07,
"loss": 0.0212,
"step": 1295
},
{
"epoch": 2.778609625668449,
"grad_norm": 1.535340428352356,
"learning_rate": 1.4614200143756975e-07,
"loss": 0.0156,
"step": 1300
},
{
"epoch": 2.7893048128342244,
"grad_norm": 2.188175678253174,
"learning_rate": 1.3261550200142415e-07,
"loss": 0.0163,
"step": 1305
},
{
"epoch": 2.8,
"grad_norm": 1.8125184774398804,
"learning_rate": 1.197373597642637e-07,
"loss": 0.0176,
"step": 1310
},
{
"epoch": 2.8106951871657753,
"grad_norm": 1.2068591117858887,
"learning_rate": 1.0750929015174216e-07,
"loss": 0.0127,
"step": 1315
},
{
"epoch": 2.8213903743315507,
"grad_norm": 2.0283000469207764,
"learning_rate": 9.593292199695403e-08,
"loss": 0.0166,
"step": 1320
},
{
"epoch": 2.832085561497326,
"grad_norm": 1.5842273235321045,
"learning_rate": 8.500979732347259e-08,
"loss": 0.0189,
"step": 1325
},
{
"epoch": 2.8427807486631016,
"grad_norm": 1.509087085723877,
"learning_rate": 7.47413711399414e-08,
"loss": 0.0178,
"step": 1330
},
{
"epoch": 2.853475935828877,
"grad_norm": 1.6988673210144043,
"learning_rate": 6.51290112462627e-08,
"loss": 0.0139,
"step": 1335
},
{
"epoch": 2.8641711229946525,
"grad_norm": 1.3389947414398193,
"learning_rate": 5.617399805139867e-08,
"loss": 0.0144,
"step": 1340
},
{
"epoch": 2.874866310160428,
"grad_norm": 1.2916324138641357,
"learning_rate": 4.7877524402817345e-08,
"loss": 0.0129,
"step": 1345
},
{
"epoch": 2.8855614973262034,
"grad_norm": 1.6901161670684814,
"learning_rate": 4.024069542759801e-08,
"loss": 0.0191,
"step": 1350
},
{
"epoch": 2.896256684491979,
"grad_norm": 2.0415706634521484,
"learning_rate": 3.3264528385225094e-08,
"loss": 0.0169,
"step": 1355
},
{
"epoch": 2.9069518716577543,
"grad_norm": 1.594831943511963,
"learning_rate": 2.6949952532083768e-08,
"loss": 0.0157,
"step": 1360
},
{
"epoch": 2.9176470588235293,
"grad_norm": 1.7152241468429565,
"learning_rate": 2.1297808997678417e-08,
"loss": 0.0194,
"step": 1365
},
{
"epoch": 2.9283422459893047,
"grad_norm": 1.6763319969177246,
"learning_rate": 1.6308850672591692e-08,
"loss": 0.0148,
"step": 1370
},
{
"epoch": 2.93903743315508,
"grad_norm": 1.905222773551941,
"learning_rate": 1.1983742108195862e-08,
"loss": 0.0168,
"step": 1375
},
{
"epoch": 2.9497326203208556,
"grad_norm": 0.8023597002029419,
"learning_rate": 8.323059428130831e-09,
"loss": 0.0153,
"step": 1380
},
{
"epoch": 2.960427807486631,
"grad_norm": 1.5736637115478516,
"learning_rate": 5.3272902515622e-09,
"loss": 0.0208,
"step": 1385
},
{
"epoch": 2.9711229946524065,
"grad_norm": 1.540812373161316,
"learning_rate": 2.996833628228779e-09,
"loss": 0.017,
"step": 1390
},
{
"epoch": 2.981818181818182,
"grad_norm": 1.0774410963058472,
"learning_rate": 1.3319999852867692e-09,
"loss": 0.0144,
"step": 1395
},
{
"epoch": 2.9925133689839574,
"grad_norm": 1.853932499885559,
"learning_rate": 3.330110859600666e-10,
"loss": 0.02,
"step": 1400
},
{
"epoch": 3.0,
"step": 1404,
"total_flos": 1.0868263403315528e+18,
"train_loss": 0.12404776059961387,
"train_runtime": 1235.844,
"train_samples_per_second": 72.563,
"train_steps_per_second": 1.136
}
],
"logging_steps": 5,
"max_steps": 1404,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.0868263403315528e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}