{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.994492525570417,
"eval_steps": 500,
"global_step": 1585,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003147128245476003,
"grad_norm": 6.929598040614756,
"learning_rate": 2.5157232704402517e-07,
"loss": 1.0552,
"step": 1
},
{
"epoch": 0.006294256490952006,
"grad_norm": 6.905796769662321,
"learning_rate": 5.031446540880503e-07,
"loss": 1.0526,
"step": 2
},
{
"epoch": 0.00944138473642801,
"grad_norm": 6.968574034942788,
"learning_rate": 7.547169811320755e-07,
"loss": 1.0558,
"step": 3
},
{
"epoch": 0.012588512981904013,
"grad_norm": 6.851623182903484,
"learning_rate": 1.0062893081761007e-06,
"loss": 1.0581,
"step": 4
},
{
"epoch": 0.015735641227380016,
"grad_norm": 6.816097196762909,
"learning_rate": 1.257861635220126e-06,
"loss": 1.0691,
"step": 5
},
{
"epoch": 0.01888276947285602,
"grad_norm": 6.269657845555971,
"learning_rate": 1.509433962264151e-06,
"loss": 1.0442,
"step": 6
},
{
"epoch": 0.022029897718332022,
"grad_norm": 6.136829210521579,
"learning_rate": 1.7610062893081762e-06,
"loss": 1.0331,
"step": 7
},
{
"epoch": 0.025177025963808025,
"grad_norm": 4.635434838113676,
"learning_rate": 2.0125786163522013e-06,
"loss": 0.9988,
"step": 8
},
{
"epoch": 0.02832415420928403,
"grad_norm": 4.321448680290915,
"learning_rate": 2.2641509433962266e-06,
"loss": 0.9861,
"step": 9
},
{
"epoch": 0.03147128245476003,
"grad_norm": 2.927976062577963,
"learning_rate": 2.515723270440252e-06,
"loss": 0.9606,
"step": 10
},
{
"epoch": 0.03461841070023604,
"grad_norm": 2.853095690681028,
"learning_rate": 2.767295597484277e-06,
"loss": 0.9645,
"step": 11
},
{
"epoch": 0.03776553894571204,
"grad_norm": 2.7462594823658737,
"learning_rate": 3.018867924528302e-06,
"loss": 0.9743,
"step": 12
},
{
"epoch": 0.040912667191188044,
"grad_norm": 3.562428829453312,
"learning_rate": 3.270440251572327e-06,
"loss": 0.9278,
"step": 13
},
{
"epoch": 0.044059795436664044,
"grad_norm": 4.263574542340476,
"learning_rate": 3.5220125786163524e-06,
"loss": 0.9396,
"step": 14
},
{
"epoch": 0.04720692368214005,
"grad_norm": 3.985495537747823,
"learning_rate": 3.7735849056603777e-06,
"loss": 0.9377,
"step": 15
},
{
"epoch": 0.05035405192761605,
"grad_norm": 3.739310444555421,
"learning_rate": 4.025157232704403e-06,
"loss": 0.9249,
"step": 16
},
{
"epoch": 0.05350118017309206,
"grad_norm": 2.9474915656983605,
"learning_rate": 4.276729559748428e-06,
"loss": 0.9276,
"step": 17
},
{
"epoch": 0.05664830841856806,
"grad_norm": 2.270408122661505,
"learning_rate": 4.528301886792453e-06,
"loss": 0.8794,
"step": 18
},
{
"epoch": 0.05979543666404406,
"grad_norm": 2.019432371584577,
"learning_rate": 4.779874213836478e-06,
"loss": 0.8753,
"step": 19
},
{
"epoch": 0.06294256490952006,
"grad_norm": 1.647937215366361,
"learning_rate": 5.031446540880504e-06,
"loss": 0.8624,
"step": 20
},
{
"epoch": 0.06608969315499606,
"grad_norm": 1.2550304135134105,
"learning_rate": 5.283018867924529e-06,
"loss": 0.8442,
"step": 21
},
{
"epoch": 0.06923682140047208,
"grad_norm": 1.1653152615859128,
"learning_rate": 5.534591194968554e-06,
"loss": 0.8329,
"step": 22
},
{
"epoch": 0.07238394964594808,
"grad_norm": 0.9858900450552275,
"learning_rate": 5.786163522012579e-06,
"loss": 0.8175,
"step": 23
},
{
"epoch": 0.07553107789142408,
"grad_norm": 1.022081335979346,
"learning_rate": 6.037735849056604e-06,
"loss": 0.8111,
"step": 24
},
{
"epoch": 0.07867820613690008,
"grad_norm": 0.8753797612953995,
"learning_rate": 6.289308176100629e-06,
"loss": 0.8,
"step": 25
},
{
"epoch": 0.08182533438237609,
"grad_norm": 0.9128953216524504,
"learning_rate": 6.540880503144654e-06,
"loss": 0.7959,
"step": 26
},
{
"epoch": 0.08497246262785209,
"grad_norm": 0.977168784939036,
"learning_rate": 6.792452830188679e-06,
"loss": 0.794,
"step": 27
},
{
"epoch": 0.08811959087332809,
"grad_norm": 0.8642439163002466,
"learning_rate": 7.044025157232705e-06,
"loss": 0.7739,
"step": 28
},
{
"epoch": 0.09126671911880409,
"grad_norm": 0.7734428298213224,
"learning_rate": 7.29559748427673e-06,
"loss": 0.7886,
"step": 29
},
{
"epoch": 0.0944138473642801,
"grad_norm": 0.9003447759767896,
"learning_rate": 7.5471698113207555e-06,
"loss": 0.7766,
"step": 30
},
{
"epoch": 0.0975609756097561,
"grad_norm": 0.9359339373099113,
"learning_rate": 7.79874213836478e-06,
"loss": 0.7634,
"step": 31
},
{
"epoch": 0.1007081038552321,
"grad_norm": 0.7278099390374022,
"learning_rate": 8.050314465408805e-06,
"loss": 0.7609,
"step": 32
},
{
"epoch": 0.1038552321007081,
"grad_norm": 0.7509292280032696,
"learning_rate": 8.301886792452832e-06,
"loss": 0.7633,
"step": 33
},
{
"epoch": 0.10700236034618411,
"grad_norm": 0.7689335379022276,
"learning_rate": 8.553459119496857e-06,
"loss": 0.7442,
"step": 34
},
{
"epoch": 0.11014948859166011,
"grad_norm": 0.7122656766942688,
"learning_rate": 8.805031446540882e-06,
"loss": 0.7424,
"step": 35
},
{
"epoch": 0.11329661683713611,
"grad_norm": 0.6581326216708504,
"learning_rate": 9.056603773584907e-06,
"loss": 0.7356,
"step": 36
},
{
"epoch": 0.11644374508261211,
"grad_norm": 0.6078493790197033,
"learning_rate": 9.308176100628931e-06,
"loss": 0.7439,
"step": 37
},
{
"epoch": 0.11959087332808813,
"grad_norm": 0.5534379930110489,
"learning_rate": 9.559748427672956e-06,
"loss": 0.7392,
"step": 38
},
{
"epoch": 0.12273800157356413,
"grad_norm": 0.6054365759524477,
"learning_rate": 9.811320754716981e-06,
"loss": 0.7362,
"step": 39
},
{
"epoch": 0.12588512981904013,
"grad_norm": 0.6171188975167723,
"learning_rate": 1.0062893081761008e-05,
"loss": 0.7282,
"step": 40
},
{
"epoch": 0.12903225806451613,
"grad_norm": 0.5954823809501688,
"learning_rate": 1.0314465408805033e-05,
"loss": 0.7209,
"step": 41
},
{
"epoch": 0.13217938630999213,
"grad_norm": 0.5100657067756379,
"learning_rate": 1.0566037735849058e-05,
"loss": 0.7276,
"step": 42
},
{
"epoch": 0.13532651455546812,
"grad_norm": 0.5616309144822732,
"learning_rate": 1.0817610062893083e-05,
"loss": 0.7381,
"step": 43
},
{
"epoch": 0.13847364280094415,
"grad_norm": 0.5909101450008207,
"learning_rate": 1.1069182389937107e-05,
"loss": 0.7269,
"step": 44
},
{
"epoch": 0.14162077104642015,
"grad_norm": 0.5890949559740806,
"learning_rate": 1.1320754716981132e-05,
"loss": 0.7096,
"step": 45
},
{
"epoch": 0.14476789929189615,
"grad_norm": 0.5975890175035441,
"learning_rate": 1.1572327044025157e-05,
"loss": 0.7218,
"step": 46
},
{
"epoch": 0.14791502753737215,
"grad_norm": 0.5619034090767466,
"learning_rate": 1.1823899371069182e-05,
"loss": 0.7251,
"step": 47
},
{
"epoch": 0.15106215578284815,
"grad_norm": 0.5774632138045557,
"learning_rate": 1.2075471698113209e-05,
"loss": 0.7137,
"step": 48
},
{
"epoch": 0.15420928402832415,
"grad_norm": 0.5961867782994129,
"learning_rate": 1.2327044025157234e-05,
"loss": 0.7081,
"step": 49
},
{
"epoch": 0.15735641227380015,
"grad_norm": 0.6521015732677928,
"learning_rate": 1.2578616352201259e-05,
"loss": 0.6987,
"step": 50
},
{
"epoch": 0.16050354051927615,
"grad_norm": 0.506284287234856,
"learning_rate": 1.2830188679245283e-05,
"loss": 0.7063,
"step": 51
},
{
"epoch": 0.16365066876475218,
"grad_norm": 0.5516498749754162,
"learning_rate": 1.3081761006289308e-05,
"loss": 0.6963,
"step": 52
},
{
"epoch": 0.16679779701022818,
"grad_norm": 0.6024465952086706,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.7056,
"step": 53
},
{
"epoch": 0.16994492525570418,
"grad_norm": 0.5160023824672929,
"learning_rate": 1.3584905660377358e-05,
"loss": 0.7008,
"step": 54
},
{
"epoch": 0.17309205350118018,
"grad_norm": 0.5791286636248881,
"learning_rate": 1.3836477987421383e-05,
"loss": 0.6996,
"step": 55
},
{
"epoch": 0.17623918174665618,
"grad_norm": 0.5576591803332213,
"learning_rate": 1.408805031446541e-05,
"loss": 0.7102,
"step": 56
},
{
"epoch": 0.17938630999213218,
"grad_norm": 0.6330972767133404,
"learning_rate": 1.4339622641509435e-05,
"loss": 0.6938,
"step": 57
},
{
"epoch": 0.18253343823760818,
"grad_norm": 0.5239745977140408,
"learning_rate": 1.459119496855346e-05,
"loss": 0.6961,
"step": 58
},
{
"epoch": 0.18568056648308418,
"grad_norm": 0.628316605306883,
"learning_rate": 1.4842767295597484e-05,
"loss": 0.6902,
"step": 59
},
{
"epoch": 0.1888276947285602,
"grad_norm": 0.5368099681662118,
"learning_rate": 1.5094339622641511e-05,
"loss": 0.6829,
"step": 60
},
{
"epoch": 0.1919748229740362,
"grad_norm": 0.5734668071542257,
"learning_rate": 1.5345911949685536e-05,
"loss": 0.6968,
"step": 61
},
{
"epoch": 0.1951219512195122,
"grad_norm": 0.6868079128761005,
"learning_rate": 1.559748427672956e-05,
"loss": 0.6858,
"step": 62
},
{
"epoch": 0.1982690794649882,
"grad_norm": 0.5683575072486896,
"learning_rate": 1.5849056603773586e-05,
"loss": 0.6932,
"step": 63
},
{
"epoch": 0.2014162077104642,
"grad_norm": 0.5001123586647835,
"learning_rate": 1.610062893081761e-05,
"loss": 0.6897,
"step": 64
},
{
"epoch": 0.2045633359559402,
"grad_norm": 0.5621386700847243,
"learning_rate": 1.635220125786164e-05,
"loss": 0.6874,
"step": 65
},
{
"epoch": 0.2077104642014162,
"grad_norm": 0.5174313413850266,
"learning_rate": 1.6603773584905664e-05,
"loss": 0.6831,
"step": 66
},
{
"epoch": 0.2108575924468922,
"grad_norm": 0.5217210506959701,
"learning_rate": 1.685534591194969e-05,
"loss": 0.6687,
"step": 67
},
{
"epoch": 0.21400472069236823,
"grad_norm": 0.5035280266030444,
"learning_rate": 1.7106918238993714e-05,
"loss": 0.6834,
"step": 68
},
{
"epoch": 0.21715184893784423,
"grad_norm": 0.5678994919125128,
"learning_rate": 1.735849056603774e-05,
"loss": 0.6951,
"step": 69
},
{
"epoch": 0.22029897718332023,
"grad_norm": 0.5707567615688384,
"learning_rate": 1.7610062893081763e-05,
"loss": 0.6847,
"step": 70
},
{
"epoch": 0.22344610542879623,
"grad_norm": 0.7809985354857447,
"learning_rate": 1.7861635220125788e-05,
"loss": 0.6774,
"step": 71
},
{
"epoch": 0.22659323367427223,
"grad_norm": 1.046093110923802,
"learning_rate": 1.8113207547169813e-05,
"loss": 0.6708,
"step": 72
},
{
"epoch": 0.22974036191974823,
"grad_norm": 1.1135255501368588,
"learning_rate": 1.8364779874213838e-05,
"loss": 0.6706,
"step": 73
},
{
"epoch": 0.23288749016522423,
"grad_norm": 0.7442706325757177,
"learning_rate": 1.8616352201257863e-05,
"loss": 0.688,
"step": 74
},
{
"epoch": 0.23603461841070023,
"grad_norm": 0.7511109436645367,
"learning_rate": 1.8867924528301888e-05,
"loss": 0.6856,
"step": 75
},
{
"epoch": 0.23918174665617625,
"grad_norm": 0.9996568960492868,
"learning_rate": 1.9119496855345913e-05,
"loss": 0.6743,
"step": 76
},
{
"epoch": 0.24232887490165225,
"grad_norm": 1.0404763602720255,
"learning_rate": 1.9371069182389938e-05,
"loss": 0.6769,
"step": 77
},
{
"epoch": 0.24547600314712825,
"grad_norm": 0.9916721164476111,
"learning_rate": 1.9622641509433963e-05,
"loss": 0.6673,
"step": 78
},
{
"epoch": 0.24862313139260425,
"grad_norm": 1.1612341326633233,
"learning_rate": 1.9874213836477987e-05,
"loss": 0.6669,
"step": 79
},
{
"epoch": 0.25177025963808025,
"grad_norm": 0.759493553538397,
"learning_rate": 2.0125786163522016e-05,
"loss": 0.6585,
"step": 80
},
{
"epoch": 0.2549173878835563,
"grad_norm": 1.1730448836839678,
"learning_rate": 2.037735849056604e-05,
"loss": 0.6689,
"step": 81
},
{
"epoch": 0.25806451612903225,
"grad_norm": 1.3145750241994978,
"learning_rate": 2.0628930817610066e-05,
"loss": 0.6666,
"step": 82
},
{
"epoch": 0.2612116443745083,
"grad_norm": 0.8560831722543999,
"learning_rate": 2.088050314465409e-05,
"loss": 0.6605,
"step": 83
},
{
"epoch": 0.26435877261998425,
"grad_norm": 1.307515350251066,
"learning_rate": 2.1132075471698115e-05,
"loss": 0.6777,
"step": 84
},
{
"epoch": 0.2675059008654603,
"grad_norm": 0.7959450309653999,
"learning_rate": 2.138364779874214e-05,
"loss": 0.6628,
"step": 85
},
{
"epoch": 0.27065302911093625,
"grad_norm": 1.3601493926178994,
"learning_rate": 2.1635220125786165e-05,
"loss": 0.6684,
"step": 86
},
{
"epoch": 0.2738001573564123,
"grad_norm": 1.0335217441808127,
"learning_rate": 2.188679245283019e-05,
"loss": 0.6644,
"step": 87
},
{
"epoch": 0.2769472856018883,
"grad_norm": 1.1840289122844756,
"learning_rate": 2.2138364779874215e-05,
"loss": 0.6621,
"step": 88
},
{
"epoch": 0.2800944138473643,
"grad_norm": 0.8482919292822308,
"learning_rate": 2.238993710691824e-05,
"loss": 0.6546,
"step": 89
},
{
"epoch": 0.2832415420928403,
"grad_norm": 1.219466252432804,
"learning_rate": 2.2641509433962265e-05,
"loss": 0.66,
"step": 90
},
{
"epoch": 0.2863886703383163,
"grad_norm": 1.0676120235119377,
"learning_rate": 2.289308176100629e-05,
"loss": 0.6592,
"step": 91
},
{
"epoch": 0.2895357985837923,
"grad_norm": 1.2108955484478103,
"learning_rate": 2.3144654088050315e-05,
"loss": 0.6721,
"step": 92
},
{
"epoch": 0.2926829268292683,
"grad_norm": 0.9120063102836191,
"learning_rate": 2.339622641509434e-05,
"loss": 0.6659,
"step": 93
},
{
"epoch": 0.2958300550747443,
"grad_norm": 1.2734105963895854,
"learning_rate": 2.3647798742138364e-05,
"loss": 0.6707,
"step": 94
},
{
"epoch": 0.2989771833202203,
"grad_norm": 1.0314294710331138,
"learning_rate": 2.3899371069182393e-05,
"loss": 0.6717,
"step": 95
},
{
"epoch": 0.3021243115656963,
"grad_norm": 1.1455964773414549,
"learning_rate": 2.4150943396226418e-05,
"loss": 0.6665,
"step": 96
},
{
"epoch": 0.30527143981117233,
"grad_norm": 1.285838785053339,
"learning_rate": 2.4402515723270442e-05,
"loss": 0.6639,
"step": 97
},
{
"epoch": 0.3084185680566483,
"grad_norm": 1.263209287431304,
"learning_rate": 2.4654088050314467e-05,
"loss": 0.6601,
"step": 98
},
{
"epoch": 0.31156569630212433,
"grad_norm": 0.8978757077780872,
"learning_rate": 2.4905660377358492e-05,
"loss": 0.6545,
"step": 99
},
{
"epoch": 0.3147128245476003,
"grad_norm": 0.9825339416853001,
"learning_rate": 2.5157232704402517e-05,
"loss": 0.6476,
"step": 100
},
{
"epoch": 0.31785995279307633,
"grad_norm": 1.308507693155568,
"learning_rate": 2.5408805031446542e-05,
"loss": 0.6689,
"step": 101
},
{
"epoch": 0.3210070810385523,
"grad_norm": 1.1553828783407727,
"learning_rate": 2.5660377358490567e-05,
"loss": 0.6667,
"step": 102
},
{
"epoch": 0.3241542092840283,
"grad_norm": 1.099078324578191,
"learning_rate": 2.5911949685534592e-05,
"loss": 0.6485,
"step": 103
},
{
"epoch": 0.32730133752950435,
"grad_norm": 1.0681608956982722,
"learning_rate": 2.6163522012578617e-05,
"loss": 0.6492,
"step": 104
},
{
"epoch": 0.3304484657749803,
"grad_norm": 1.0413270700584025,
"learning_rate": 2.641509433962264e-05,
"loss": 0.6565,
"step": 105
},
{
"epoch": 0.33359559402045635,
"grad_norm": 1.1879394567955261,
"learning_rate": 2.6666666666666667e-05,
"loss": 0.6523,
"step": 106
},
{
"epoch": 0.3367427222659323,
"grad_norm": 1.4725003254144884,
"learning_rate": 2.691823899371069e-05,
"loss": 0.6556,
"step": 107
},
{
"epoch": 0.33988985051140835,
"grad_norm": 0.6126879058489698,
"learning_rate": 2.7169811320754716e-05,
"loss": 0.6572,
"step": 108
},
{
"epoch": 0.3430369787568843,
"grad_norm": 1.180095012071221,
"learning_rate": 2.742138364779874e-05,
"loss": 0.6451,
"step": 109
},
{
"epoch": 0.34618410700236035,
"grad_norm": 1.5775784222686664,
"learning_rate": 2.7672955974842766e-05,
"loss": 0.6479,
"step": 110
},
{
"epoch": 0.3493312352478363,
"grad_norm": 1.0102816587441463,
"learning_rate": 2.7924528301886794e-05,
"loss": 0.6537,
"step": 111
},
{
"epoch": 0.35247836349331235,
"grad_norm": 1.844674890611079,
"learning_rate": 2.817610062893082e-05,
"loss": 0.6632,
"step": 112
},
{
"epoch": 0.3556254917387884,
"grad_norm": 1.1133503786547416,
"learning_rate": 2.8427672955974844e-05,
"loss": 0.6588,
"step": 113
},
{
"epoch": 0.35877261998426435,
"grad_norm": 2.0384135105494825,
"learning_rate": 2.867924528301887e-05,
"loss": 0.6566,
"step": 114
},
{
"epoch": 0.3619197482297404,
"grad_norm": 1.6427197756524008,
"learning_rate": 2.8930817610062894e-05,
"loss": 0.6536,
"step": 115
},
{
"epoch": 0.36506687647521635,
"grad_norm": 1.7204143290151555,
"learning_rate": 2.918238993710692e-05,
"loss": 0.6516,
"step": 116
},
{
"epoch": 0.3682140047206924,
"grad_norm": 1.6120937604495096,
"learning_rate": 2.9433962264150944e-05,
"loss": 0.643,
"step": 117
},
{
"epoch": 0.37136113296616835,
"grad_norm": 1.4095110771525443,
"learning_rate": 2.968553459119497e-05,
"loss": 0.6621,
"step": 118
},
{
"epoch": 0.3745082612116444,
"grad_norm": 1.459313179796121,
"learning_rate": 2.9937106918238994e-05,
"loss": 0.6594,
"step": 119
},
{
"epoch": 0.3776553894571204,
"grad_norm": 1.0827125230585957,
"learning_rate": 3.0188679245283022e-05,
"loss": 0.6614,
"step": 120
},
{
"epoch": 0.3808025177025964,
"grad_norm": 1.935573484990545,
"learning_rate": 3.044025157232705e-05,
"loss": 0.6586,
"step": 121
},
{
"epoch": 0.3839496459480724,
"grad_norm": 1.38748901967822,
"learning_rate": 3.069182389937107e-05,
"loss": 0.6587,
"step": 122
},
{
"epoch": 0.3870967741935484,
"grad_norm": 1.949825044536692,
"learning_rate": 3.09433962264151e-05,
"loss": 0.6541,
"step": 123
},
{
"epoch": 0.3902439024390244,
"grad_norm": 1.7165782756639372,
"learning_rate": 3.119496855345912e-05,
"loss": 0.649,
"step": 124
},
{
"epoch": 0.3933910306845004,
"grad_norm": 1.4753090682690373,
"learning_rate": 3.144654088050315e-05,
"loss": 0.6472,
"step": 125
},
{
"epoch": 0.3965381589299764,
"grad_norm": 1.7890266274768591,
"learning_rate": 3.169811320754717e-05,
"loss": 0.6649,
"step": 126
},
{
"epoch": 0.3996852871754524,
"grad_norm": 1.335629902067876,
"learning_rate": 3.19496855345912e-05,
"loss": 0.6544,
"step": 127
},
{
"epoch": 0.4028324154209284,
"grad_norm": 1.5980486069135038,
"learning_rate": 3.220125786163522e-05,
"loss": 0.6511,
"step": 128
},
{
"epoch": 0.40597954366640443,
"grad_norm": 1.3401486224120511,
"learning_rate": 3.245283018867925e-05,
"loss": 0.6435,
"step": 129
},
{
"epoch": 0.4091266719118804,
"grad_norm": 1.5147750007237795,
"learning_rate": 3.270440251572328e-05,
"loss": 0.6493,
"step": 130
},
{
"epoch": 0.41227380015735643,
"grad_norm": 1.5267838685199988,
"learning_rate": 3.29559748427673e-05,
"loss": 0.6535,
"step": 131
},
{
"epoch": 0.4154209284028324,
"grad_norm": 1.2875034331430568,
"learning_rate": 3.320754716981133e-05,
"loss": 0.6434,
"step": 132
},
{
"epoch": 0.41856805664830843,
"grad_norm": 1.6371277196579286,
"learning_rate": 3.345911949685535e-05,
"loss": 0.6584,
"step": 133
},
{
"epoch": 0.4217151848937844,
"grad_norm": 1.5486461984110311,
"learning_rate": 3.371069182389938e-05,
"loss": 0.6458,
"step": 134
},
{
"epoch": 0.42486231313926043,
"grad_norm": 1.3892293338547828,
"learning_rate": 3.39622641509434e-05,
"loss": 0.6447,
"step": 135
},
{
"epoch": 0.42800944138473646,
"grad_norm": 1.6407567446245384,
"learning_rate": 3.421383647798743e-05,
"loss": 0.651,
"step": 136
},
{
"epoch": 0.4311565696302124,
"grad_norm": 1.1366311200838441,
"learning_rate": 3.446540880503145e-05,
"loss": 0.6406,
"step": 137
},
{
"epoch": 0.43430369787568845,
"grad_norm": 1.764056894253217,
"learning_rate": 3.471698113207548e-05,
"loss": 0.649,
"step": 138
},
{
"epoch": 0.4374508261211644,
"grad_norm": 1.2131870613834437,
"learning_rate": 3.49685534591195e-05,
"loss": 0.6433,
"step": 139
},
{
"epoch": 0.44059795436664045,
"grad_norm": 1.617902481723902,
"learning_rate": 3.522012578616353e-05,
"loss": 0.6442,
"step": 140
},
{
"epoch": 0.4437450826121164,
"grad_norm": 1.2609898385086604,
"learning_rate": 3.547169811320755e-05,
"loss": 0.6329,
"step": 141
},
{
"epoch": 0.44689221085759245,
"grad_norm": 1.7132073427376908,
"learning_rate": 3.5723270440251577e-05,
"loss": 0.6503,
"step": 142
},
{
"epoch": 0.4500393391030684,
"grad_norm": 1.2177364936643433,
"learning_rate": 3.59748427672956e-05,
"loss": 0.6319,
"step": 143
},
{
"epoch": 0.45318646734854445,
"grad_norm": 1.4831866244496776,
"learning_rate": 3.6226415094339626e-05,
"loss": 0.6447,
"step": 144
},
{
"epoch": 0.4563335955940205,
"grad_norm": 1.9962471519744966,
"learning_rate": 3.6477987421383655e-05,
"loss": 0.6449,
"step": 145
},
{
"epoch": 0.45948072383949645,
"grad_norm": 1.1203633745786987,
"learning_rate": 3.6729559748427676e-05,
"loss": 0.6447,
"step": 146
},
{
"epoch": 0.4626278520849725,
"grad_norm": 1.7428487048510457,
"learning_rate": 3.6981132075471704e-05,
"loss": 0.6473,
"step": 147
},
{
"epoch": 0.46577498033044845,
"grad_norm": 1.6557025727591974,
"learning_rate": 3.7232704402515726e-05,
"loss": 0.6521,
"step": 148
},
{
"epoch": 0.4689221085759245,
"grad_norm": 1.633226108310503,
"learning_rate": 3.7484276729559754e-05,
"loss": 0.6424,
"step": 149
},
{
"epoch": 0.47206923682140045,
"grad_norm": 1.500343658328648,
"learning_rate": 3.7735849056603776e-05,
"loss": 0.6599,
"step": 150
},
{
"epoch": 0.4752163650668765,
"grad_norm": 1.7811230140847056,
"learning_rate": 3.7987421383647804e-05,
"loss": 0.6365,
"step": 151
},
{
"epoch": 0.4783634933123525,
"grad_norm": 1.4623016887202551,
"learning_rate": 3.8238993710691826e-05,
"loss": 0.6484,
"step": 152
},
{
"epoch": 0.4815106215578285,
"grad_norm": 1.4962594002979546,
"learning_rate": 3.8490566037735854e-05,
"loss": 0.6529,
"step": 153
},
{
"epoch": 0.4846577498033045,
"grad_norm": 1.2968251690411272,
"learning_rate": 3.8742138364779875e-05,
"loss": 0.6405,
"step": 154
},
{
"epoch": 0.4878048780487805,
"grad_norm": 1.6038094180226874,
"learning_rate": 3.8993710691823904e-05,
"loss": 0.6465,
"step": 155
},
{
"epoch": 0.4909520062942565,
"grad_norm": 1.349439118443086,
"learning_rate": 3.9245283018867925e-05,
"loss": 0.6296,
"step": 156
},
{
"epoch": 0.4940991345397325,
"grad_norm": 1.5954765047689583,
"learning_rate": 3.9496855345911953e-05,
"loss": 0.6464,
"step": 157
},
{
"epoch": 0.4972462627852085,
"grad_norm": 1.2357067055631126,
"learning_rate": 3.9748427672955975e-05,
"loss": 0.6476,
"step": 158
},
{
"epoch": 0.5003933910306845,
"grad_norm": 2.005111041334352,
"learning_rate": 4e-05,
"loss": 0.6451,
"step": 159
},
{
"epoch": 0.5035405192761605,
"grad_norm": 1.6640763044427809,
"learning_rate": 3.999995146438705e-05,
"loss": 0.6457,
"step": 160
},
{
"epoch": 0.5066876475216365,
"grad_norm": 1.6470772155507238,
"learning_rate": 3.999980585778375e-05,
"loss": 0.6307,
"step": 161
},
{
"epoch": 0.5098347757671126,
"grad_norm": 1.5584873476563954,
"learning_rate": 3.999956318089682e-05,
"loss": 0.6365,
"step": 162
},
{
"epoch": 0.5129819040125885,
"grad_norm": 0.9821030562408402,
"learning_rate": 3.9999223434904104e-05,
"loss": 0.6618,
"step": 163
},
{
"epoch": 0.5161290322580645,
"grad_norm": 2.065476843178622,
"learning_rate": 3.9998786621454584e-05,
"loss": 0.6486,
"step": 164
},
{
"epoch": 0.5192761605035405,
"grad_norm": 0.9162285189594601,
"learning_rate": 3.999825274266836e-05,
"loss": 0.6344,
"step": 165
},
{
"epoch": 0.5224232887490166,
"grad_norm": 1.817254046632479,
"learning_rate": 3.9997621801136645e-05,
"loss": 0.6578,
"step": 166
},
{
"epoch": 0.5255704169944925,
"grad_norm": 1.6213131489050836,
"learning_rate": 3.999689379992174e-05,
"loss": 0.6478,
"step": 167
},
{
"epoch": 0.5287175452399685,
"grad_norm": 1.5191822069477583,
"learning_rate": 3.9996068742557065e-05,
"loss": 0.634,
"step": 168
},
{
"epoch": 0.5318646734854445,
"grad_norm": 1.3625533453285046,
"learning_rate": 3.999514663304708e-05,
"loss": 0.6416,
"step": 169
},
{
"epoch": 0.5350118017309206,
"grad_norm": 1.7443314517509239,
"learning_rate": 3.999412747586729e-05,
"loss": 0.6323,
"step": 170
},
{
"epoch": 0.5381589299763966,
"grad_norm": 1.2764075205223595,
"learning_rate": 3.999301127596425e-05,
"loss": 0.6389,
"step": 171
},
{
"epoch": 0.5413060582218725,
"grad_norm": 1.752440457864074,
"learning_rate": 3.9991798038755484e-05,
"loss": 0.6419,
"step": 172
},
{
"epoch": 0.5444531864673485,
"grad_norm": 1.435326730571326,
"learning_rate": 3.999048777012953e-05,
"loss": 0.6285,
"step": 173
},
{
"epoch": 0.5476003147128246,
"grad_norm": 1.9146516602093107,
"learning_rate": 3.998908047644587e-05,
"loss": 0.6471,
"step": 174
},
{
"epoch": 0.5507474429583006,
"grad_norm": 1.6167083359019538,
"learning_rate": 3.998757616453486e-05,
"loss": 0.6403,
"step": 175
},
{
"epoch": 0.5538945712037766,
"grad_norm": 1.4023315753110592,
"learning_rate": 3.998597484169779e-05,
"loss": 0.6396,
"step": 176
},
{
"epoch": 0.5570416994492525,
"grad_norm": 1.3268570022647785,
"learning_rate": 3.9984276515706764e-05,
"loss": 0.6381,
"step": 177
},
{
"epoch": 0.5601888276947286,
"grad_norm": 1.3545177342692951,
"learning_rate": 3.998248119480473e-05,
"loss": 0.6498,
"step": 178
},
{
"epoch": 0.5633359559402046,
"grad_norm": 1.0685787928581592,
"learning_rate": 3.998058888770537e-05,
"loss": 0.6474,
"step": 179
},
{
"epoch": 0.5664830841856806,
"grad_norm": 1.5577581689973945,
"learning_rate": 3.997859960359313e-05,
"loss": 0.6294,
"step": 180
},
{
"epoch": 0.5696302124311565,
"grad_norm": 1.5970031198965817,
"learning_rate": 3.997651335212311e-05,
"loss": 0.6228,
"step": 181
},
{
"epoch": 0.5727773406766326,
"grad_norm": 1.0686347825691989,
"learning_rate": 3.997433014342106e-05,
"loss": 0.6153,
"step": 182
},
{
"epoch": 0.5759244689221086,
"grad_norm": 1.1929803859988213,
"learning_rate": 3.9972049988083323e-05,
"loss": 0.65,
"step": 183
},
{
"epoch": 0.5790715971675846,
"grad_norm": 1.5562626483329678,
"learning_rate": 3.9969672897176764e-05,
"loss": 0.6256,
"step": 184
},
{
"epoch": 0.5822187254130606,
"grad_norm": 1.05136149985961,
"learning_rate": 3.996719888223875e-05,
"loss": 0.6324,
"step": 185
},
{
"epoch": 0.5853658536585366,
"grad_norm": 1.0205341272287771,
"learning_rate": 3.996462795527706e-05,
"loss": 0.6452,
"step": 186
},
{
"epoch": 0.5885129819040126,
"grad_norm": 1.3028420216806813,
"learning_rate": 3.996196012876984e-05,
"loss": 0.6371,
"step": 187
},
{
"epoch": 0.5916601101494886,
"grad_norm": 1.4324067360751134,
"learning_rate": 3.995919541566555e-05,
"loss": 0.6432,
"step": 188
},
{
"epoch": 0.5948072383949646,
"grad_norm": 1.2819517544027286,
"learning_rate": 3.995633382938291e-05,
"loss": 0.6261,
"step": 189
},
{
"epoch": 0.5979543666404405,
"grad_norm": 0.9819032579492132,
"learning_rate": 3.995337538381079e-05,
"loss": 0.6347,
"step": 190
},
{
"epoch": 0.6011014948859166,
"grad_norm": 1.2953786226264292,
"learning_rate": 3.9950320093308185e-05,
"loss": 0.6358,
"step": 191
},
{
"epoch": 0.6042486231313926,
"grad_norm": 1.2056447304755342,
"learning_rate": 3.994716797270414e-05,
"loss": 0.6316,
"step": 192
},
{
"epoch": 0.6073957513768686,
"grad_norm": 1.508482407225396,
"learning_rate": 3.9943919037297674e-05,
"loss": 0.6333,
"step": 193
},
{
"epoch": 0.6105428796223447,
"grad_norm": 1.2574626256613217,
"learning_rate": 3.9940573302857675e-05,
"loss": 0.6382,
"step": 194
},
{
"epoch": 0.6136900078678206,
"grad_norm": 1.3371901111268312,
"learning_rate": 3.993713078562288e-05,
"loss": 0.6326,
"step": 195
},
{
"epoch": 0.6168371361132966,
"grad_norm": 1.8420742844545008,
"learning_rate": 3.993359150230177e-05,
"loss": 0.6378,
"step": 196
},
{
"epoch": 0.6199842643587726,
"grad_norm": 1.0329114512418138,
"learning_rate": 3.992995547007245e-05,
"loss": 0.6264,
"step": 197
},
{
"epoch": 0.6231313926042487,
"grad_norm": 2.5550436716710894,
"learning_rate": 3.992622270658264e-05,
"loss": 0.6284,
"step": 198
},
{
"epoch": 0.6262785208497246,
"grad_norm": 1.6496939828823534,
"learning_rate": 3.992239322994953e-05,
"loss": 0.6328,
"step": 199
},
{
"epoch": 0.6294256490952006,
"grad_norm": 2.1055055233410798,
"learning_rate": 3.991846705875973e-05,
"loss": 0.6387,
"step": 200
},
{
"epoch": 0.6325727773406766,
"grad_norm": 1.8595829111365227,
"learning_rate": 3.9914444212069144e-05,
"loss": 0.6352,
"step": 201
},
{
"epoch": 0.6357199055861527,
"grad_norm": 1.8668994107558041,
"learning_rate": 3.99103247094029e-05,
"loss": 0.637,
"step": 202
},
{
"epoch": 0.6388670338316287,
"grad_norm": 1.7037836647511615,
"learning_rate": 3.990610857075527e-05,
"loss": 0.6343,
"step": 203
},
{
"epoch": 0.6420141620771046,
"grad_norm": 1.38923058159769,
"learning_rate": 3.990179581658953e-05,
"loss": 0.6325,
"step": 204
},
{
"epoch": 0.6451612903225806,
"grad_norm": 1.7563051190358938,
"learning_rate": 3.98973864678379e-05,
"loss": 0.6249,
"step": 205
},
{
"epoch": 0.6483084185680567,
"grad_norm": 1.005439976834508,
"learning_rate": 3.9892880545901436e-05,
"loss": 0.6218,
"step": 206
},
{
"epoch": 0.6514555468135327,
"grad_norm": 2.054893019034046,
"learning_rate": 3.988827807264989e-05,
"loss": 0.6367,
"step": 207
},
{
"epoch": 0.6546026750590087,
"grad_norm": 1.2789629172418557,
"learning_rate": 3.988357907042165e-05,
"loss": 0.6426,
"step": 208
},
{
"epoch": 0.6577498033044846,
"grad_norm": 2.517606786751749,
"learning_rate": 3.9878783562023615e-05,
"loss": 0.644,
"step": 209
},
{
"epoch": 0.6608969315499607,
"grad_norm": 2.1032212837598365,
"learning_rate": 3.987389157073108e-05,
"loss": 0.6498,
"step": 210
},
{
"epoch": 0.6640440597954367,
"grad_norm": 1.6077441283045066,
"learning_rate": 3.986890312028763e-05,
"loss": 0.6318,
"step": 211
},
{
"epoch": 0.6671911880409127,
"grad_norm": 1.3352828378955408,
"learning_rate": 3.9863818234904996e-05,
"loss": 0.6394,
"step": 212
},
{
"epoch": 0.6703383162863886,
"grad_norm": 1.6466476954422824,
"learning_rate": 3.985863693926301e-05,
"loss": 0.6412,
"step": 213
},
{
"epoch": 0.6734854445318647,
"grad_norm": 1.2055185581585135,
"learning_rate": 3.9853359258509375e-05,
"loss": 0.6377,
"step": 214
},
{
"epoch": 0.6766325727773407,
"grad_norm": 1.8223991680747094,
"learning_rate": 3.984798521825966e-05,
"loss": 0.6323,
"step": 215
},
{
"epoch": 0.6797797010228167,
"grad_norm": 1.4244972280249446,
"learning_rate": 3.9842514844597106e-05,
"loss": 0.6411,
"step": 216
},
{
"epoch": 0.6829268292682927,
"grad_norm": 1.810262815503657,
"learning_rate": 3.983694816407248e-05,
"loss": 0.6287,
"step": 217
},
{
"epoch": 0.6860739575137687,
"grad_norm": 1.584327074252924,
"learning_rate": 3.983128520370403e-05,
"loss": 0.6226,
"step": 218
},
{
"epoch": 0.6892210857592447,
"grad_norm": 1.4679159722861652,
"learning_rate": 3.982552599097727e-05,
"loss": 0.629,
"step": 219
},
{
"epoch": 0.6923682140047207,
"grad_norm": 1.4505081690005162,
"learning_rate": 3.9819670553844885e-05,
"loss": 0.6337,
"step": 220
},
{
"epoch": 0.6955153422501967,
"grad_norm": 1.3729339166204286,
"learning_rate": 3.981371892072661e-05,
"loss": 0.6241,
"step": 221
},
{
"epoch": 0.6986624704956726,
"grad_norm": 1.2886236495973369,
"learning_rate": 3.9807671120509074e-05,
"loss": 0.6354,
"step": 222
},
{
"epoch": 0.7018095987411487,
"grad_norm": 1.2379553349269816,
"learning_rate": 3.9801527182545624e-05,
"loss": 0.625,
"step": 223
},
{
"epoch": 0.7049567269866247,
"grad_norm": 1.2249283736735637,
"learning_rate": 3.979528713665624e-05,
"loss": 0.6301,
"step": 224
},
{
"epoch": 0.7081038552321007,
"grad_norm": 1.336253285005203,
"learning_rate": 3.978895101312738e-05,
"loss": 0.6292,
"step": 225
},
{
"epoch": 0.7112509834775768,
"grad_norm": 1.1205953980444752,
"learning_rate": 3.9782518842711795e-05,
"loss": 0.626,
"step": 226
},
{
"epoch": 0.7143981117230527,
"grad_norm": 1.7027386427471345,
"learning_rate": 3.977599065662843e-05,
"loss": 0.6246,
"step": 227
},
{
"epoch": 0.7175452399685287,
"grad_norm": 1.1291882936174378,
"learning_rate": 3.976936648656223e-05,
"loss": 0.6282,
"step": 228
},
{
"epoch": 0.7206923682140047,
"grad_norm": 1.2813572025478552,
"learning_rate": 3.976264636466401e-05,
"loss": 0.6271,
"step": 229
},
{
"epoch": 0.7238394964594808,
"grad_norm": 1.310136883140352,
"learning_rate": 3.97558303235503e-05,
"loss": 0.6299,
"step": 230
},
{
"epoch": 0.7269866247049567,
"grad_norm": 1.234929455560365,
"learning_rate": 3.9748918396303166e-05,
"loss": 0.6273,
"step": 231
},
{
"epoch": 0.7301337529504327,
"grad_norm": 1.152399890155409,
"learning_rate": 3.974191061647007e-05,
"loss": 0.6364,
"step": 232
},
{
"epoch": 0.7332808811959087,
"grad_norm": 0.9820218267166709,
"learning_rate": 3.973480701806371e-05,
"loss": 0.6081,
"step": 233
},
{
"epoch": 0.7364280094413848,
"grad_norm": 1.437527267365468,
"learning_rate": 3.972760763556183e-05,
"loss": 0.6335,
"step": 234
},
{
"epoch": 0.7395751376868608,
"grad_norm": 1.2471735910370214,
"learning_rate": 3.972031250390707e-05,
"loss": 0.6245,
"step": 235
},
{
"epoch": 0.7427222659323367,
"grad_norm": 0.7783741133192488,
"learning_rate": 3.97129216585068e-05,
"loss": 0.6174,
"step": 236
},
{
"epoch": 0.7458693941778127,
"grad_norm": 1.422537426798629,
"learning_rate": 3.9705435135232954e-05,
"loss": 0.6259,
"step": 237
},
{
"epoch": 0.7490165224232888,
"grad_norm": 1.1344073454119308,
"learning_rate": 3.9697852970421816e-05,
"loss": 0.6156,
"step": 238
},
{
"epoch": 0.7521636506687648,
"grad_norm": 0.8130085642709579,
"learning_rate": 3.96901752008739e-05,
"loss": 0.6235,
"step": 239
},
{
"epoch": 0.7553107789142408,
"grad_norm": 1.506021663227266,
"learning_rate": 3.968240186385372e-05,
"loss": 0.6308,
"step": 240
},
{
"epoch": 0.7584579071597167,
"grad_norm": 0.8828709445930414,
"learning_rate": 3.967453299708965e-05,
"loss": 0.6249,
"step": 241
},
{
"epoch": 0.7616050354051928,
"grad_norm": 1.1195490114701467,
"learning_rate": 3.966656863877371e-05,
"loss": 0.6222,
"step": 242
},
{
"epoch": 0.7647521636506688,
"grad_norm": 1.159507235093096,
"learning_rate": 3.965850882756141e-05,
"loss": 0.6117,
"step": 243
},
{
"epoch": 0.7678992918961448,
"grad_norm": 1.3618433823111082,
"learning_rate": 3.9650353602571535e-05,
"loss": 0.6164,
"step": 244
},
{
"epoch": 0.7710464201416207,
"grad_norm": 1.4260909924046845,
"learning_rate": 3.9642103003385976e-05,
"loss": 0.6322,
"step": 245
},
{
"epoch": 0.7741935483870968,
"grad_norm": 0.8960267340901048,
"learning_rate": 3.963375707004951e-05,
"loss": 0.6212,
"step": 246
},
{
"epoch": 0.7773406766325728,
"grad_norm": 1.9505783063886386,
"learning_rate": 3.9625315843069635e-05,
"loss": 0.6272,
"step": 247
},
{
"epoch": 0.7804878048780488,
"grad_norm": 1.060698991962841,
"learning_rate": 3.9616779363416375e-05,
"loss": 0.627,
"step": 248
},
{
"epoch": 0.7836349331235248,
"grad_norm": 2.651158418118449,
"learning_rate": 3.9608147672522056e-05,
"loss": 0.632,
"step": 249
},
{
"epoch": 0.7867820613690008,
"grad_norm": 1.9829172683055232,
"learning_rate": 3.959942081228111e-05,
"loss": 0.6334,
"step": 250
},
{
"epoch": 0.7899291896144768,
"grad_norm": 2.0043712787473855,
"learning_rate": 3.9590598825049896e-05,
"loss": 0.6347,
"step": 251
},
{
"epoch": 0.7930763178599528,
"grad_norm": 1.5914049220753375,
"learning_rate": 3.958168175364646e-05,
"loss": 0.6291,
"step": 252
},
{
"epoch": 0.7962234461054288,
"grad_norm": 2.044617279526593,
"learning_rate": 3.9572669641350366e-05,
"loss": 0.6227,
"step": 253
},
{
"epoch": 0.7993705743509048,
"grad_norm": 1.4921666169771846,
"learning_rate": 3.956356253190245e-05,
"loss": 0.6287,
"step": 254
},
{
"epoch": 0.8025177025963808,
"grad_norm": 2.1859673527222374,
"learning_rate": 3.9554360469504616e-05,
"loss": 0.6349,
"step": 255
},
{
"epoch": 0.8056648308418568,
"grad_norm": 1.8780839152913937,
"learning_rate": 3.9545063498819655e-05,
"loss": 0.6322,
"step": 256
},
{
"epoch": 0.8088119590873328,
"grad_norm": 1.8855386967686778,
"learning_rate": 3.9535671664970976e-05,
"loss": 0.6236,
"step": 257
},
{
"epoch": 0.8119590873328089,
"grad_norm": 1.57211297687127,
"learning_rate": 3.952618501354241e-05,
"loss": 0.6266,
"step": 258
},
{
"epoch": 0.8151062155782848,
"grad_norm": 1.6840002131557807,
"learning_rate": 3.951660359057802e-05,
"loss": 0.6348,
"step": 259
},
{
"epoch": 0.8182533438237608,
"grad_norm": 1.4505249387670598,
"learning_rate": 3.9506927442581816e-05,
"loss": 0.6185,
"step": 260
},
{
"epoch": 0.8214004720692368,
"grad_norm": 1.6387407896735724,
"learning_rate": 3.9497156616517584e-05,
"loss": 0.6275,
"step": 261
},
{
"epoch": 0.8245476003147129,
"grad_norm": 1.4191671751611528,
"learning_rate": 3.948729115980862e-05,
"loss": 0.618,
"step": 262
},
{
"epoch": 0.8276947285601888,
"grad_norm": 1.6165413779568125,
"learning_rate": 3.947733112033753e-05,
"loss": 0.6261,
"step": 263
},
{
"epoch": 0.8308418568056648,
"grad_norm": 1.3413248663670272,
"learning_rate": 3.946727654644597e-05,
"loss": 0.6129,
"step": 264
},
{
"epoch": 0.8339889850511408,
"grad_norm": 1.4534436081435673,
"learning_rate": 3.945712748693443e-05,
"loss": 0.6156,
"step": 265
},
{
"epoch": 0.8371361132966169,
"grad_norm": 1.2334383145263632,
"learning_rate": 3.9446883991062e-05,
"loss": 0.6197,
"step": 266
},
{
"epoch": 0.8402832415420929,
"grad_norm": 1.750270105689965,
"learning_rate": 3.94365461085461e-05,
"loss": 0.6061,
"step": 267
},
{
"epoch": 0.8434303697875688,
"grad_norm": 1.5161125269771183,
"learning_rate": 3.94261138895623e-05,
"loss": 0.6227,
"step": 268
},
{
"epoch": 0.8465774980330448,
"grad_norm": 1.5221806720588753,
"learning_rate": 3.9415587384744e-05,
"loss": 0.6323,
"step": 269
},
{
"epoch": 0.8497246262785209,
"grad_norm": 1.3897427486795066,
"learning_rate": 3.940496664518223e-05,
"loss": 0.6207,
"step": 270
},
{
"epoch": 0.8528717545239969,
"grad_norm": 1.4191581228390167,
"learning_rate": 3.939425172242541e-05,
"loss": 0.6068,
"step": 271
},
{
"epoch": 0.8560188827694729,
"grad_norm": 1.0898432509550877,
"learning_rate": 3.9383442668479074e-05,
"loss": 0.6194,
"step": 272
},
{
"epoch": 0.8591660110149488,
"grad_norm": 1.729898493997823,
"learning_rate": 3.937253953580562e-05,
"loss": 0.621,
"step": 273
},
{
"epoch": 0.8623131392604249,
"grad_norm": 1.3978303034299537,
"learning_rate": 3.936154237732409e-05,
"loss": 0.6143,
"step": 274
},
{
"epoch": 0.8654602675059009,
"grad_norm": 1.6560127678824332,
"learning_rate": 3.935045124640985e-05,
"loss": 0.6128,
"step": 275
},
{
"epoch": 0.8686073957513769,
"grad_norm": 1.5405477384904165,
"learning_rate": 3.933926619689438e-05,
"loss": 0.6227,
"step": 276
},
{
"epoch": 0.8717545239968528,
"grad_norm": 1.232950563952649,
"learning_rate": 3.932798728306502e-05,
"loss": 0.6166,
"step": 277
},
{
"epoch": 0.8749016522423289,
"grad_norm": 1.3226181835548478,
"learning_rate": 3.931661455966465e-05,
"loss": 0.6093,
"step": 278
},
{
"epoch": 0.8780487804878049,
"grad_norm": 1.217634421516471,
"learning_rate": 3.930514808189149e-05,
"loss": 0.6155,
"step": 279
},
{
"epoch": 0.8811959087332809,
"grad_norm": 1.0114325069677168,
"learning_rate": 3.929358790539881e-05,
"loss": 0.6253,
"step": 280
},
{
"epoch": 0.8843430369787569,
"grad_norm": 1.1140247331568867,
"learning_rate": 3.92819340862946e-05,
"loss": 0.6239,
"step": 281
},
{
"epoch": 0.8874901652242329,
"grad_norm": 1.0564363406645432,
"learning_rate": 3.927018668114141e-05,
"loss": 0.6211,
"step": 282
},
{
"epoch": 0.8906372934697089,
"grad_norm": 1.4079391384169502,
"learning_rate": 3.925834574695599e-05,
"loss": 0.6182,
"step": 283
},
{
"epoch": 0.8937844217151849,
"grad_norm": 1.039379605395923,
"learning_rate": 3.924641134120903e-05,
"loss": 0.6221,
"step": 284
},
{
"epoch": 0.8969315499606609,
"grad_norm": 1.2213803291786665,
"learning_rate": 3.9234383521824905e-05,
"loss": 0.6161,
"step": 285
},
{
"epoch": 0.9000786782061369,
"grad_norm": 1.0145611815593831,
"learning_rate": 3.922226234718137e-05,
"loss": 0.6148,
"step": 286
},
{
"epoch": 0.9032258064516129,
"grad_norm": 1.437050058531951,
"learning_rate": 3.92100478761093e-05,
"loss": 0.6138,
"step": 287
},
{
"epoch": 0.9063729346970889,
"grad_norm": 0.9288489831721957,
"learning_rate": 3.919774016789237e-05,
"loss": 0.621,
"step": 288
},
{
"epoch": 0.9095200629425649,
"grad_norm": 1.2266763831925402,
"learning_rate": 3.918533928226679e-05,
"loss": 0.6156,
"step": 289
},
{
"epoch": 0.912667191188041,
"grad_norm": 0.9671793379615254,
"learning_rate": 3.917284527942103e-05,
"loss": 0.6167,
"step": 290
},
{
"epoch": 0.9158143194335169,
"grad_norm": 1.244082215609862,
"learning_rate": 3.91602582199955e-05,
"loss": 0.6161,
"step": 291
},
{
"epoch": 0.9189614476789929,
"grad_norm": 1.1463688074695213,
"learning_rate": 3.914757816508225e-05,
"loss": 0.614,
"step": 292
},
{
"epoch": 0.9221085759244689,
"grad_norm": 1.2261559130019553,
"learning_rate": 3.913480517622472e-05,
"loss": 0.6208,
"step": 293
},
{
"epoch": 0.925255704169945,
"grad_norm": 1.0646827207279455,
"learning_rate": 3.9121939315417386e-05,
"loss": 0.6286,
"step": 294
},
{
"epoch": 0.9284028324154209,
"grad_norm": 1.0539629025664061,
"learning_rate": 3.910898064510549e-05,
"loss": 0.6218,
"step": 295
},
{
"epoch": 0.9315499606608969,
"grad_norm": 0.9722762493322944,
"learning_rate": 3.909592922818474e-05,
"loss": 0.621,
"step": 296
},
{
"epoch": 0.9346970889063729,
"grad_norm": 1.0701286553105696,
"learning_rate": 3.908278512800098e-05,
"loss": 0.6215,
"step": 297
},
{
"epoch": 0.937844217151849,
"grad_norm": 1.118122321373602,
"learning_rate": 3.906954840834991e-05,
"loss": 0.6214,
"step": 298
},
{
"epoch": 0.940991345397325,
"grad_norm": 0.8652483960951545,
"learning_rate": 3.9056219133476766e-05,
"loss": 0.6199,
"step": 299
},
{
"epoch": 0.9441384736428009,
"grad_norm": 1.0343112033921007,
"learning_rate": 3.904279736807599e-05,
"loss": 0.6104,
"step": 300
},
{
"epoch": 0.9472856018882769,
"grad_norm": 0.9496922262871718,
"learning_rate": 3.9029283177290944e-05,
"loss": 0.6081,
"step": 301
},
{
"epoch": 0.950432730133753,
"grad_norm": 0.9371417223191364,
"learning_rate": 3.901567662671359e-05,
"loss": 0.6182,
"step": 302
},
{
"epoch": 0.953579858379229,
"grad_norm": 1.1256246896638376,
"learning_rate": 3.9001977782384154e-05,
"loss": 0.6272,
"step": 303
},
{
"epoch": 0.956726986624705,
"grad_norm": 1.1988543784517776,
"learning_rate": 3.898818671079081e-05,
"loss": 0.6193,
"step": 304
},
{
"epoch": 0.9598741148701809,
"grad_norm": 0.6640888343950058,
"learning_rate": 3.897430347886937e-05,
"loss": 0.623,
"step": 305
},
{
"epoch": 0.963021243115657,
"grad_norm": 1.0259693265720489,
"learning_rate": 3.896032815400295e-05,
"loss": 0.6061,
"step": 306
},
{
"epoch": 0.966168371361133,
"grad_norm": 1.6687489855665556,
"learning_rate": 3.894626080402166e-05,
"loss": 0.6158,
"step": 307
},
{
"epoch": 0.969315499606609,
"grad_norm": 0.7655240494240164,
"learning_rate": 3.893210149720222e-05,
"loss": 0.6135,
"step": 308
},
{
"epoch": 0.9724626278520849,
"grad_norm": 1.6527270765731679,
"learning_rate": 3.8917850302267724e-05,
"loss": 0.6214,
"step": 309
},
{
"epoch": 0.975609756097561,
"grad_norm": 0.9742997284638653,
"learning_rate": 3.890350728838719e-05,
"loss": 0.6098,
"step": 310
},
{
"epoch": 0.978756884343037,
"grad_norm": 1.2188875317692218,
"learning_rate": 3.888907252517534e-05,
"loss": 0.6078,
"step": 311
},
{
"epoch": 0.981904012588513,
"grad_norm": 1.4090784716524412,
"learning_rate": 3.887454608269217e-05,
"loss": 0.6148,
"step": 312
},
{
"epoch": 0.985051140833989,
"grad_norm": 0.8677610975018318,
"learning_rate": 3.885992803144266e-05,
"loss": 0.6283,
"step": 313
},
{
"epoch": 0.988198269079465,
"grad_norm": 1.2008659411405975,
"learning_rate": 3.8845218442376416e-05,
"loss": 0.6179,
"step": 314
},
{
"epoch": 0.991345397324941,
"grad_norm": 1.181118863273738,
"learning_rate": 3.883041738688733e-05,
"loss": 0.6138,
"step": 315
},
{
"epoch": 0.994492525570417,
"grad_norm": 0.8124102237663046,
"learning_rate": 3.8815524936813236e-05,
"loss": 0.6024,
"step": 316
},
{
"epoch": 0.997639653815893,
"grad_norm": 1.0702432156884507,
"learning_rate": 3.880054116443556e-05,
"loss": 0.6101,
"step": 317
},
{
"epoch": 1.002360346184107,
"grad_norm": 2.065753171430859,
"learning_rate": 3.878546614247894e-05,
"loss": 1.027,
"step": 318
},
{
"epoch": 1.005507474429583,
"grad_norm": 0.8860268190442901,
"learning_rate": 3.8770299944110934e-05,
"loss": 0.5977,
"step": 319
},
{
"epoch": 1.008654602675059,
"grad_norm": 0.8640393060499529,
"learning_rate": 3.875504264294161e-05,
"loss": 0.5814,
"step": 320
},
{
"epoch": 1.011801730920535,
"grad_norm": 1.0567074386948476,
"learning_rate": 3.873969431302322e-05,
"loss": 0.5838,
"step": 321
},
{
"epoch": 1.014948859166011,
"grad_norm": 1.2201467147928156,
"learning_rate": 3.872425502884982e-05,
"loss": 0.5616,
"step": 322
},
{
"epoch": 1.018095987411487,
"grad_norm": 1.2298258787473502,
"learning_rate": 3.870872486535692e-05,
"loss": 0.605,
"step": 323
},
{
"epoch": 1.021243115656963,
"grad_norm": 0.9570547295059392,
"learning_rate": 3.869310389792113e-05,
"loss": 0.5571,
"step": 324
},
{
"epoch": 1.024390243902439,
"grad_norm": 0.910399791203508,
"learning_rate": 3.867739220235976e-05,
"loss": 0.5803,
"step": 325
},
{
"epoch": 1.027537372147915,
"grad_norm": 1.1255287832975964,
"learning_rate": 3.866158985493049e-05,
"loss": 0.5735,
"step": 326
},
{
"epoch": 1.030684500393391,
"grad_norm": 1.2218937018260718,
"learning_rate": 3.864569693233099e-05,
"loss": 0.5839,
"step": 327
},
{
"epoch": 1.0338316286388671,
"grad_norm": 0.879516220387298,
"learning_rate": 3.862971351169853e-05,
"loss": 0.5897,
"step": 328
},
{
"epoch": 1.036978756884343,
"grad_norm": 0.9597145092489864,
"learning_rate": 3.8613639670609624e-05,
"loss": 0.5539,
"step": 329
},
{
"epoch": 1.040125885129819,
"grad_norm": 1.6528619139400769,
"learning_rate": 3.859747548707963e-05,
"loss": 0.6214,
"step": 330
},
{
"epoch": 1.043273013375295,
"grad_norm": 0.8058486040606079,
"learning_rate": 3.858122103956242e-05,
"loss": 0.5468,
"step": 331
},
{
"epoch": 1.046420141620771,
"grad_norm": 1.623522624231283,
"learning_rate": 3.8564876406949954e-05,
"loss": 0.6004,
"step": 332
},
{
"epoch": 1.049567269866247,
"grad_norm": 1.088449667069518,
"learning_rate": 3.85484416685719e-05,
"loss": 0.6047,
"step": 333
},
{
"epoch": 1.052714398111723,
"grad_norm": 1.2546950767006624,
"learning_rate": 3.8531916904195264e-05,
"loss": 0.5485,
"step": 334
},
{
"epoch": 1.055861526357199,
"grad_norm": 1.1169301229814428,
"learning_rate": 3.851530219402401e-05,
"loss": 0.6023,
"step": 335
},
{
"epoch": 1.0590086546026751,
"grad_norm": 1.5613063395853726,
"learning_rate": 3.8498597618698644e-05,
"loss": 0.5753,
"step": 336
},
{
"epoch": 1.0621557828481512,
"grad_norm": 1.0603407240619633,
"learning_rate": 3.848180325929586e-05,
"loss": 0.6027,
"step": 337
},
{
"epoch": 1.0653029110936272,
"grad_norm": 1.9784565816301807,
"learning_rate": 3.8464919197328106e-05,
"loss": 0.563,
"step": 338
},
{
"epoch": 1.068450039339103,
"grad_norm": 1.6560420050349316,
"learning_rate": 3.84479455147432e-05,
"loss": 0.6156,
"step": 339
},
{
"epoch": 1.071597167584579,
"grad_norm": 1.3178042643700982,
"learning_rate": 3.8430882293923966e-05,
"loss": 0.5593,
"step": 340
},
{
"epoch": 1.074744295830055,
"grad_norm": 1.3767031608324611,
"learning_rate": 3.8413729617687786e-05,
"loss": 0.6085,
"step": 341
},
{
"epoch": 1.077891424075531,
"grad_norm": 1.0336068597716115,
"learning_rate": 3.839648756928622e-05,
"loss": 0.5806,
"step": 342
},
{
"epoch": 1.081038552321007,
"grad_norm": 1.3254526922251793,
"learning_rate": 3.837915623240462e-05,
"loss": 0.5496,
"step": 343
},
{
"epoch": 1.0841856805664831,
"grad_norm": 0.8088669835406253,
"learning_rate": 3.836173569116167e-05,
"loss": 0.6009,
"step": 344
},
{
"epoch": 1.0873328088119592,
"grad_norm": 1.2752754683882992,
"learning_rate": 3.834422603010906e-05,
"loss": 0.5966,
"step": 345
},
{
"epoch": 1.0904799370574352,
"grad_norm": 0.9879821313465822,
"learning_rate": 3.8326627334230995e-05,
"loss": 0.5685,
"step": 346
},
{
"epoch": 1.0936270653029112,
"grad_norm": 0.957023686339743,
"learning_rate": 3.830893968894382e-05,
"loss": 0.5855,
"step": 347
},
{
"epoch": 1.096774193548387,
"grad_norm": 0.8904028341609422,
"learning_rate": 3.82911631800956e-05,
"loss": 0.582,
"step": 348
},
{
"epoch": 1.099921321793863,
"grad_norm": 0.9097097133530387,
"learning_rate": 3.8273297893965725e-05,
"loss": 0.5885,
"step": 349
},
{
"epoch": 1.103068450039339,
"grad_norm": 0.8599788576525904,
"learning_rate": 3.825534391726444e-05,
"loss": 0.5842,
"step": 350
},
{
"epoch": 1.106215578284815,
"grad_norm": 0.9167821894761861,
"learning_rate": 3.823730133713248e-05,
"loss": 0.5778,
"step": 351
},
{
"epoch": 1.1093627065302911,
"grad_norm": 1.1530087162130414,
"learning_rate": 3.821917024114061e-05,
"loss": 0.5544,
"step": 352
},
{
"epoch": 1.1125098347757671,
"grad_norm": 0.8086357525381911,
"learning_rate": 3.820095071728923e-05,
"loss": 0.5976,
"step": 353
},
{
"epoch": 1.1156569630212432,
"grad_norm": 0.7747888713394967,
"learning_rate": 3.81826428540079e-05,
"loss": 0.6041,
"step": 354
},
{
"epoch": 1.1188040912667192,
"grad_norm": 0.7508946410062841,
"learning_rate": 3.816424674015497e-05,
"loss": 0.5864,
"step": 355
},
{
"epoch": 1.1219512195121952,
"grad_norm": 0.7959382362736754,
"learning_rate": 3.814576246501708e-05,
"loss": 0.5587,
"step": 356
},
{
"epoch": 1.125098347757671,
"grad_norm": 0.685910628438955,
"learning_rate": 3.812719011830882e-05,
"loss": 0.5713,
"step": 357
},
{
"epoch": 1.128245476003147,
"grad_norm": 0.7392010420761511,
"learning_rate": 3.8108529790172204e-05,
"loss": 0.5905,
"step": 358
},
{
"epoch": 1.131392604248623,
"grad_norm": 0.6668167046568578,
"learning_rate": 3.8089781571176275e-05,
"loss": 0.5751,
"step": 359
},
{
"epoch": 1.1345397324940991,
"grad_norm": 0.8000258627134674,
"learning_rate": 3.8070945552316674e-05,
"loss": 0.5621,
"step": 360
},
{
"epoch": 1.1376868607395751,
"grad_norm": 0.7389373221950687,
"learning_rate": 3.805202182501516e-05,
"loss": 0.5529,
"step": 361
},
{
"epoch": 1.1408339889850512,
"grad_norm": 0.769018684280187,
"learning_rate": 3.80330104811192e-05,
"loss": 0.6046,
"step": 362
},
{
"epoch": 1.1439811172305272,
"grad_norm": 0.7301516878023783,
"learning_rate": 3.801391161290153e-05,
"loss": 0.5652,
"step": 363
},
{
"epoch": 1.1471282454760032,
"grad_norm": 0.8229419243343139,
"learning_rate": 3.7994725313059676e-05,
"loss": 0.5792,
"step": 364
},
{
"epoch": 1.1502753737214793,
"grad_norm": 0.6680976797053321,
"learning_rate": 3.797545167471552e-05,
"loss": 0.5741,
"step": 365
},
{
"epoch": 1.153422501966955,
"grad_norm": 0.938538926694488,
"learning_rate": 3.795609079141484e-05,
"loss": 0.568,
"step": 366
},
{
"epoch": 1.156569630212431,
"grad_norm": 1.2798546823891832,
"learning_rate": 3.793664275712687e-05,
"loss": 0.5779,
"step": 367
},
{
"epoch": 1.1597167584579071,
"grad_norm": 0.5005920128387191,
"learning_rate": 3.791710766624384e-05,
"loss": 0.5567,
"step": 368
},
{
"epoch": 1.1628638867033831,
"grad_norm": 1.3328977261494417,
"learning_rate": 3.7897485613580516e-05,
"loss": 0.584,
"step": 369
},
{
"epoch": 1.1660110149488592,
"grad_norm": 0.9543328894815445,
"learning_rate": 3.787777669437373e-05,
"loss": 0.5818,
"step": 370
},
{
"epoch": 1.1691581431943352,
"grad_norm": 0.6789082634257576,
"learning_rate": 3.785798100428193e-05,
"loss": 0.5907,
"step": 371
},
{
"epoch": 1.1723052714398112,
"grad_norm": 0.6597293914224007,
"learning_rate": 3.7838098639384705e-05,
"loss": 0.5325,
"step": 372
},
{
"epoch": 1.1754523996852873,
"grad_norm": 1.06089797604037,
"learning_rate": 3.781812969618234e-05,
"loss": 0.5935,
"step": 373
},
{
"epoch": 1.1785995279307633,
"grad_norm": 1.1481361384627329,
"learning_rate": 3.779807427159531e-05,
"loss": 0.5596,
"step": 374
},
{
"epoch": 1.181746656176239,
"grad_norm": 0.7407771811040466,
"learning_rate": 3.777793246296387e-05,
"loss": 0.5916,
"step": 375
},
{
"epoch": 1.1848937844217151,
"grad_norm": 1.2671109018758089,
"learning_rate": 3.775770436804751e-05,
"loss": 0.5651,
"step": 376
},
{
"epoch": 1.1880409126671911,
"grad_norm": 0.8857172105459614,
"learning_rate": 3.7737390085024525e-05,
"loss": 0.6174,
"step": 377
},
{
"epoch": 1.1911880409126672,
"grad_norm": 0.9521903376602633,
"learning_rate": 3.771698971249154e-05,
"loss": 0.5565,
"step": 378
},
{
"epoch": 1.1943351691581432,
"grad_norm": 1.1244996198067643,
"learning_rate": 3.7696503349463014e-05,
"loss": 0.6109,
"step": 379
},
{
"epoch": 1.1974822974036192,
"grad_norm": 0.9098284775149258,
"learning_rate": 3.767593109537076e-05,
"loss": 0.5728,
"step": 380
},
{
"epoch": 1.2006294256490952,
"grad_norm": 1.0765208853493442,
"learning_rate": 3.765527305006348e-05,
"loss": 0.5669,
"step": 381
},
{
"epoch": 1.2037765538945713,
"grad_norm": 0.7147386911125436,
"learning_rate": 3.763452931380626e-05,
"loss": 0.5776,
"step": 382
},
{
"epoch": 1.2069236821400473,
"grad_norm": 0.7742812162089677,
"learning_rate": 3.7613699987280095e-05,
"loss": 0.5561,
"step": 383
},
{
"epoch": 1.210070810385523,
"grad_norm": 0.9406068378416047,
"learning_rate": 3.75927851715814e-05,
"loss": 0.5965,
"step": 384
},
{
"epoch": 1.2132179386309991,
"grad_norm": 0.9172548442849622,
"learning_rate": 3.7571784968221517e-05,
"loss": 0.553,
"step": 385
},
{
"epoch": 1.2163650668764752,
"grad_norm": 1.0846359511575892,
"learning_rate": 3.75506994791262e-05,
"loss": 0.5735,
"step": 386
},
{
"epoch": 1.2195121951219512,
"grad_norm": 0.64281457660973,
"learning_rate": 3.75295288066352e-05,
"loss": 0.6026,
"step": 387
},
{
"epoch": 1.2226593233674272,
"grad_norm": 1.2918057591143557,
"learning_rate": 3.7508273053501644e-05,
"loss": 0.5803,
"step": 388
},
{
"epoch": 1.2258064516129032,
"grad_norm": 0.8285810992075407,
"learning_rate": 3.7486932322891646e-05,
"loss": 0.559,
"step": 389
},
{
"epoch": 1.2289535798583793,
"grad_norm": 0.7903501432162211,
"learning_rate": 3.7465506718383746e-05,
"loss": 0.6019,
"step": 390
},
{
"epoch": 1.2321007081038553,
"grad_norm": 1.0053744792111636,
"learning_rate": 3.744399634396842e-05,
"loss": 0.5609,
"step": 391
},
{
"epoch": 1.2352478363493313,
"grad_norm": 0.8379114151897634,
"learning_rate": 3.742240130404761e-05,
"loss": 0.5899,
"step": 392
},
{
"epoch": 1.2383949645948071,
"grad_norm": 0.9519548865776732,
"learning_rate": 3.740072170343415e-05,
"loss": 0.5973,
"step": 393
},
{
"epoch": 1.2415420928402832,
"grad_norm": 0.7539686484958599,
"learning_rate": 3.737895764735132e-05,
"loss": 0.5714,
"step": 394
},
{
"epoch": 1.2446892210857592,
"grad_norm": 1.0257885860344795,
"learning_rate": 3.735710924143228e-05,
"loss": 0.5829,
"step": 395
},
{
"epoch": 1.2478363493312352,
"grad_norm": 0.7629141726668706,
"learning_rate": 3.733517659171963e-05,
"loss": 0.5801,
"step": 396
},
{
"epoch": 1.2509834775767112,
"grad_norm": 0.7754556508930379,
"learning_rate": 3.731315980466482e-05,
"loss": 0.5472,
"step": 397
},
{
"epoch": 1.2541306058221873,
"grad_norm": 0.7774464891625856,
"learning_rate": 3.729105898712768e-05,
"loss": 0.5962,
"step": 398
},
{
"epoch": 1.2572777340676633,
"grad_norm": 1.174513316510275,
"learning_rate": 3.726887424637588e-05,
"loss": 0.6022,
"step": 399
},
{
"epoch": 1.2604248623131393,
"grad_norm": 0.7079511116657756,
"learning_rate": 3.7246605690084414e-05,
"loss": 0.5344,
"step": 400
},
{
"epoch": 1.2635719905586154,
"grad_norm": 0.9885418604513543,
"learning_rate": 3.722425342633509e-05,
"loss": 0.5927,
"step": 401
},
{
"epoch": 1.2667191188040912,
"grad_norm": 1.4481157564971159,
"learning_rate": 3.7201817563615994e-05,
"loss": 0.5769,
"step": 402
},
{
"epoch": 1.2698662470495672,
"grad_norm": 0.6732395737301099,
"learning_rate": 3.717929821082095e-05,
"loss": 0.5592,
"step": 403
},
{
"epoch": 1.2730133752950432,
"grad_norm": 1.402245331031854,
"learning_rate": 3.7156695477249034e-05,
"loss": 0.583,
"step": 404
},
{
"epoch": 1.2761605035405192,
"grad_norm": 0.7912573192484376,
"learning_rate": 3.713400947260398e-05,
"loss": 0.5765,
"step": 405
},
{
"epoch": 1.2793076317859953,
"grad_norm": 0.7882115805671188,
"learning_rate": 3.711124030699371e-05,
"loss": 0.5971,
"step": 406
},
{
"epoch": 1.2824547600314713,
"grad_norm": 1.0141187171254817,
"learning_rate": 3.7088388090929776e-05,
"loss": 0.5848,
"step": 407
},
{
"epoch": 1.2856018882769473,
"grad_norm": 0.9391592675093976,
"learning_rate": 3.706545293532679e-05,
"loss": 0.5444,
"step": 408
},
{
"epoch": 1.2887490165224234,
"grad_norm": 0.9095509483468426,
"learning_rate": 3.704243495150195e-05,
"loss": 0.5616,
"step": 409
},
{
"epoch": 1.2918961447678994,
"grad_norm": 0.8507529149564299,
"learning_rate": 3.701933425117444e-05,
"loss": 0.5911,
"step": 410
},
{
"epoch": 1.2950432730133752,
"grad_norm": 0.8232008726505865,
"learning_rate": 3.699615094646494e-05,
"loss": 0.5801,
"step": 411
},
{
"epoch": 1.2981904012588514,
"grad_norm": 0.6912446644196883,
"learning_rate": 3.697288514989502e-05,
"loss": 0.558,
"step": 412
},
{
"epoch": 1.3013375295043272,
"grad_norm": 0.6797610400444395,
"learning_rate": 3.694953697438667e-05,
"loss": 0.5922,
"step": 413
},
{
"epoch": 1.3044846577498033,
"grad_norm": 0.9402207805888517,
"learning_rate": 3.6926106533261676e-05,
"loss": 0.5829,
"step": 414
},
{
"epoch": 1.3076317859952793,
"grad_norm": 0.7820671057814051,
"learning_rate": 3.690259394024113e-05,
"loss": 0.5791,
"step": 415
},
{
"epoch": 1.3107789142407553,
"grad_norm": 0.6624377804484535,
"learning_rate": 3.687899930944484e-05,
"loss": 0.5685,
"step": 416
},
{
"epoch": 1.3139260424862313,
"grad_norm": 0.6610022512491404,
"learning_rate": 3.68553227553908e-05,
"loss": 0.5578,
"step": 417
},
{
"epoch": 1.3170731707317074,
"grad_norm": 0.9462547727748756,
"learning_rate": 3.683156439299459e-05,
"loss": 0.5814,
"step": 418
},
{
"epoch": 1.3202202989771834,
"grad_norm": 0.9348204674000715,
"learning_rate": 3.680772433756892e-05,
"loss": 0.5831,
"step": 419
},
{
"epoch": 1.3233674272226592,
"grad_norm": 0.7963248960523923,
"learning_rate": 3.678380270482292e-05,
"loss": 0.5783,
"step": 420
},
{
"epoch": 1.3265145554681355,
"grad_norm": 0.7393400925757423,
"learning_rate": 3.675979961086173e-05,
"loss": 0.5839,
"step": 421
},
{
"epoch": 1.3296616837136113,
"grad_norm": 0.7023192863980032,
"learning_rate": 3.673571517218582e-05,
"loss": 0.5868,
"step": 422
},
{
"epoch": 1.3328088119590873,
"grad_norm": 0.7626883730805661,
"learning_rate": 3.67115495056905e-05,
"loss": 0.5515,
"step": 423
},
{
"epoch": 1.3359559402045633,
"grad_norm": 0.7190658979185762,
"learning_rate": 3.66873027286653e-05,
"loss": 0.5928,
"step": 424
},
{
"epoch": 1.3391030684500393,
"grad_norm": 0.8697111635772896,
"learning_rate": 3.6662974958793456e-05,
"loss": 0.5825,
"step": 425
},
{
"epoch": 1.3422501966955154,
"grad_norm": 1.048689473107099,
"learning_rate": 3.663856631415128e-05,
"loss": 0.5755,
"step": 426
},
{
"epoch": 1.3453973249409914,
"grad_norm": 0.6268689833713994,
"learning_rate": 3.661407691320763e-05,
"loss": 0.5995,
"step": 427
},
{
"epoch": 1.3485444531864674,
"grad_norm": 0.8771144942564589,
"learning_rate": 3.65895068748233e-05,
"loss": 0.529,
"step": 428
},
{
"epoch": 1.3516915814319432,
"grad_norm": 0.9330631648070546,
"learning_rate": 3.65648563182505e-05,
"loss": 0.5698,
"step": 429
},
{
"epoch": 1.3548387096774195,
"grad_norm": 0.6691205901538837,
"learning_rate": 3.65401253631322e-05,
"loss": 0.6007,
"step": 430
},
{
"epoch": 1.3579858379228953,
"grad_norm": 0.8797201937311443,
"learning_rate": 3.651531412950162e-05,
"loss": 0.5688,
"step": 431
},
{
"epoch": 1.3611329661683713,
"grad_norm": 0.812875798385014,
"learning_rate": 3.6490422737781586e-05,
"loss": 0.5599,
"step": 432
},
{
"epoch": 1.3642800944138473,
"grad_norm": 0.739605090858723,
"learning_rate": 3.646545130878401e-05,
"loss": 0.5717,
"step": 433
},
{
"epoch": 1.3674272226593234,
"grad_norm": 0.7027271799286325,
"learning_rate": 3.6440399963709244e-05,
"loss": 0.5919,
"step": 434
},
{
"epoch": 1.3705743509047994,
"grad_norm": 0.6758853392810176,
"learning_rate": 3.641526882414553e-05,
"loss": 0.5847,
"step": 435
},
{
"epoch": 1.3737214791502754,
"grad_norm": 0.6412837261448728,
"learning_rate": 3.639005801206839e-05,
"loss": 0.5786,
"step": 436
},
{
"epoch": 1.3768686073957515,
"grad_norm": 0.7485694482103437,
"learning_rate": 3.6364767649840056e-05,
"loss": 0.5792,
"step": 437
},
{
"epoch": 1.3800157356412273,
"grad_norm": 0.6846051950269184,
"learning_rate": 3.633939786020884e-05,
"loss": 0.5541,
"step": 438
},
{
"epoch": 1.3831628638867035,
"grad_norm": 0.7297707216903999,
"learning_rate": 3.631394876630858e-05,
"loss": 0.5854,
"step": 439
},
{
"epoch": 1.3863099921321793,
"grad_norm": 0.6692029491831896,
"learning_rate": 3.628842049165801e-05,
"loss": 0.5812,
"step": 440
},
{
"epoch": 1.3894571203776553,
"grad_norm": 0.7299333511772628,
"learning_rate": 3.626281316016017e-05,
"loss": 0.568,
"step": 441
},
{
"epoch": 1.3926042486231314,
"grad_norm": 0.6329118009441329,
"learning_rate": 3.623712689610182e-05,
"loss": 0.5833,
"step": 442
},
{
"epoch": 1.3957513768686074,
"grad_norm": 0.7411202253172915,
"learning_rate": 3.6211361824152816e-05,
"loss": 0.5772,
"step": 443
},
{
"epoch": 1.3988985051140834,
"grad_norm": 0.7471937332509346,
"learning_rate": 3.618551806936551e-05,
"loss": 0.5584,
"step": 444
},
{
"epoch": 1.4020456333595595,
"grad_norm": 0.6697271784752193,
"learning_rate": 3.6159595757174154e-05,
"loss": 0.5565,
"step": 445
},
{
"epoch": 1.4051927616050355,
"grad_norm": 0.7260444858526285,
"learning_rate": 3.6133595013394276e-05,
"loss": 0.6017,
"step": 446
},
{
"epoch": 1.4083398898505113,
"grad_norm": 0.6938476082430989,
"learning_rate": 3.6107515964222084e-05,
"loss": 0.5702,
"step": 447
},
{
"epoch": 1.4114870180959875,
"grad_norm": 0.6902716895190657,
"learning_rate": 3.6081358736233836e-05,
"loss": 0.6014,
"step": 448
},
{
"epoch": 1.4146341463414633,
"grad_norm": 0.5887971251866968,
"learning_rate": 3.605512345638525e-05,
"loss": 0.545,
"step": 449
},
{
"epoch": 1.4177812745869394,
"grad_norm": 0.6468738560521924,
"learning_rate": 3.602881025201086e-05,
"loss": 0.6023,
"step": 450
},
{
"epoch": 1.4209284028324154,
"grad_norm": 0.6460851268856366,
"learning_rate": 3.6002419250823415e-05,
"loss": 0.5788,
"step": 451
},
{
"epoch": 1.4240755310778914,
"grad_norm": 0.6553053911353882,
"learning_rate": 3.597595058091325e-05,
"loss": 0.5705,
"step": 452
},
{
"epoch": 1.4272226593233674,
"grad_norm": 0.7688726651106929,
"learning_rate": 3.594940437074769e-05,
"loss": 0.6054,
"step": 453
},
{
"epoch": 1.4303697875688435,
"grad_norm": 0.8193385892036661,
"learning_rate": 3.592278074917039e-05,
"loss": 0.5821,
"step": 454
},
{
"epoch": 1.4335169158143195,
"grad_norm": 0.5937922521766558,
"learning_rate": 3.589607984540072e-05,
"loss": 0.55,
"step": 455
},
{
"epoch": 1.4366640440597953,
"grad_norm": 0.9231773985131014,
"learning_rate": 3.586930178903317e-05,
"loss": 0.5766,
"step": 456
},
{
"epoch": 1.4398111723052716,
"grad_norm": 1.0322774589585058,
"learning_rate": 3.5842446710036664e-05,
"loss": 0.5943,
"step": 457
},
{
"epoch": 1.4429583005507474,
"grad_norm": 0.5933782616518375,
"learning_rate": 3.581551473875397e-05,
"loss": 0.5535,
"step": 458
},
{
"epoch": 1.4461054287962234,
"grad_norm": 0.7009813777712108,
"learning_rate": 3.578850600590108e-05,
"loss": 0.5783,
"step": 459
},
{
"epoch": 1.4492525570416994,
"grad_norm": 0.515906560455954,
"learning_rate": 3.576142064256652e-05,
"loss": 0.5431,
"step": 460
},
{
"epoch": 1.4523996852871754,
"grad_norm": 0.7183973662906478,
"learning_rate": 3.573425878021077e-05,
"loss": 0.5902,
"step": 461
},
{
"epoch": 1.4555468135326515,
"grad_norm": 0.7722657026633934,
"learning_rate": 3.570702055066558e-05,
"loss": 0.5808,
"step": 462
},
{
"epoch": 1.4586939417781275,
"grad_norm": 0.6269618529944755,
"learning_rate": 3.5679706086133384e-05,
"loss": 0.5895,
"step": 463
},
{
"epoch": 1.4618410700236035,
"grad_norm": 0.5326061244869541,
"learning_rate": 3.5652315519186594e-05,
"loss": 0.5394,
"step": 464
},
{
"epoch": 1.4649881982690793,
"grad_norm": 0.7071744531566102,
"learning_rate": 3.562484898276702e-05,
"loss": 0.5843,
"step": 465
},
{
"epoch": 1.4681353265145556,
"grad_norm": 0.4720130096048939,
"learning_rate": 3.559730661018517e-05,
"loss": 0.5768,
"step": 466
},
{
"epoch": 1.4712824547600314,
"grad_norm": 0.7510254194400818,
"learning_rate": 3.556968853511964e-05,
"loss": 0.5587,
"step": 467
},
{
"epoch": 1.4744295830055074,
"grad_norm": 0.7533116530402437,
"learning_rate": 3.5541994891616455e-05,
"loss": 0.5924,
"step": 468
},
{
"epoch": 1.4775767112509834,
"grad_norm": 0.648313383238362,
"learning_rate": 3.55142258140884e-05,
"loss": 0.5672,
"step": 469
},
{
"epoch": 1.4807238394964595,
"grad_norm": 0.6066187042141705,
"learning_rate": 3.548638143731441e-05,
"loss": 0.5393,
"step": 470
},
{
"epoch": 1.4838709677419355,
"grad_norm": 0.5114501688468103,
"learning_rate": 3.545846189643886e-05,
"loss": 0.5775,
"step": 471
},
{
"epoch": 1.4870180959874115,
"grad_norm": 0.6433306121899048,
"learning_rate": 3.5430467326970956e-05,
"loss": 0.5719,
"step": 472
},
{
"epoch": 1.4901652242328876,
"grad_norm": 0.6315145421662225,
"learning_rate": 3.540239786478407e-05,
"loss": 0.5641,
"step": 473
},
{
"epoch": 1.4933123524783634,
"grad_norm": 0.7411652973460111,
"learning_rate": 3.537425364611504e-05,
"loss": 0.5859,
"step": 474
},
{
"epoch": 1.4964594807238396,
"grad_norm": 0.6753402761493095,
"learning_rate": 3.5346034807563566e-05,
"loss": 0.5664,
"step": 475
},
{
"epoch": 1.4996066089693154,
"grad_norm": 0.6593613942752435,
"learning_rate": 3.5317741486091515e-05,
"loss": 0.5777,
"step": 476
},
{
"epoch": 1.5027537372147917,
"grad_norm": 0.6224692930708144,
"learning_rate": 3.5289373819022246e-05,
"loss": 0.5511,
"step": 477
},
{
"epoch": 1.5059008654602675,
"grad_norm": 0.5435829543716864,
"learning_rate": 3.5260931944039976e-05,
"loss": 0.5951,
"step": 478
},
{
"epoch": 1.5090479937057435,
"grad_norm": 0.7727972521914884,
"learning_rate": 3.523241599918908e-05,
"loss": 0.5809,
"step": 479
},
{
"epoch": 1.5121951219512195,
"grad_norm": 0.8135435570540013,
"learning_rate": 3.520382612287346e-05,
"loss": 0.5508,
"step": 480
},
{
"epoch": 1.5153422501966956,
"grad_norm": 0.6445142709480808,
"learning_rate": 3.517516245385582e-05,
"loss": 0.5711,
"step": 481
},
{
"epoch": 1.5184893784421716,
"grad_norm": 0.6571221471891615,
"learning_rate": 3.514642513125704e-05,
"loss": 0.5816,
"step": 482
},
{
"epoch": 1.5216365066876474,
"grad_norm": 0.778181315537639,
"learning_rate": 3.511761429455548e-05,
"loss": 0.5487,
"step": 483
},
{
"epoch": 1.5247836349331236,
"grad_norm": 0.6612223689047932,
"learning_rate": 3.508873008358629e-05,
"loss": 0.5693,
"step": 484
},
{
"epoch": 1.5279307631785994,
"grad_norm": 0.8677284824093338,
"learning_rate": 3.505977263854077e-05,
"loss": 0.5777,
"step": 485
},
{
"epoch": 1.5310778914240757,
"grad_norm": 0.7946151572521599,
"learning_rate": 3.5030742099965654e-05,
"loss": 0.6006,
"step": 486
},
{
"epoch": 1.5342250196695515,
"grad_norm": 0.6423836766306755,
"learning_rate": 3.5001638608762435e-05,
"loss": 0.5902,
"step": 487
},
{
"epoch": 1.5373721479150275,
"grad_norm": 0.9319785025741317,
"learning_rate": 3.497246230618669e-05,
"loss": 0.5391,
"step": 488
},
{
"epoch": 1.5405192761605035,
"grad_norm": 0.646420731333806,
"learning_rate": 3.49432133338474e-05,
"loss": 0.601,
"step": 489
},
{
"epoch": 1.5436664044059796,
"grad_norm": 0.6190175701198456,
"learning_rate": 3.491389183370624e-05,
"loss": 0.5381,
"step": 490
},
{
"epoch": 1.5468135326514556,
"grad_norm": 0.7395463266615069,
"learning_rate": 3.48844979480769e-05,
"loss": 0.6024,
"step": 491
},
{
"epoch": 1.5499606608969314,
"grad_norm": 0.6246796480797395,
"learning_rate": 3.485503181962442e-05,
"loss": 0.558,
"step": 492
},
{
"epoch": 1.5531077891424077,
"grad_norm": 0.6101829668871169,
"learning_rate": 3.4825493591364456e-05,
"loss": 0.5797,
"step": 493
},
{
"epoch": 1.5562549173878835,
"grad_norm": 0.7173522923458697,
"learning_rate": 3.479588340666261e-05,
"loss": 0.5769,
"step": 494
},
{
"epoch": 1.5594020456333597,
"grad_norm": 0.7526388551403992,
"learning_rate": 3.4766201409233725e-05,
"loss": 0.5415,
"step": 495
},
{
"epoch": 1.5625491738788355,
"grad_norm": 0.8776736439637768,
"learning_rate": 3.4736447743141195e-05,
"loss": 0.597,
"step": 496
},
{
"epoch": 1.5656963021243115,
"grad_norm": 0.7955322141679404,
"learning_rate": 3.470662255279627e-05,
"loss": 0.566,
"step": 497
},
{
"epoch": 1.5688434303697876,
"grad_norm": 0.693146892331598,
"learning_rate": 3.467672598295733e-05,
"loss": 0.5666,
"step": 498
},
{
"epoch": 1.5719905586152636,
"grad_norm": 0.5243219068148551,
"learning_rate": 3.4646758178729204e-05,
"loss": 0.5772,
"step": 499
},
{
"epoch": 1.5751376868607396,
"grad_norm": 0.6658543703419947,
"learning_rate": 3.461671928556248e-05,
"loss": 0.5677,
"step": 500
},
{
"epoch": 1.5782848151062154,
"grad_norm": 0.70750696164214,
"learning_rate": 3.458660944925277e-05,
"loss": 0.5694,
"step": 501
},
{
"epoch": 1.5814319433516917,
"grad_norm": 0.8301587050106674,
"learning_rate": 3.455642881594e-05,
"loss": 0.573,
"step": 502
},
{
"epoch": 1.5845790715971675,
"grad_norm": 0.5989742871105483,
"learning_rate": 3.452617753210772e-05,
"loss": 0.5796,
"step": 503
},
{
"epoch": 1.5877261998426437,
"grad_norm": 0.7607728281640884,
"learning_rate": 3.44958557445824e-05,
"loss": 0.5368,
"step": 504
},
{
"epoch": 1.5908733280881195,
"grad_norm": 0.8923738785753914,
"learning_rate": 3.44654636005327e-05,
"loss": 0.5931,
"step": 505
},
{
"epoch": 1.5940204563335956,
"grad_norm": 0.8841841122243062,
"learning_rate": 3.443500124746873e-05,
"loss": 0.5758,
"step": 506
},
{
"epoch": 1.5971675845790716,
"grad_norm": 0.9060495002085946,
"learning_rate": 3.4404468833241406e-05,
"loss": 0.5762,
"step": 507
},
{
"epoch": 1.6003147128245476,
"grad_norm": 0.7266975845748416,
"learning_rate": 3.437386650604168e-05,
"loss": 0.5887,
"step": 508
},
{
"epoch": 1.6034618410700237,
"grad_norm": 0.7727096713217674,
"learning_rate": 3.43431944143998e-05,
"loss": 0.5511,
"step": 509
},
{
"epoch": 1.6066089693154995,
"grad_norm": 0.9700581732284279,
"learning_rate": 3.431245270718466e-05,
"loss": 0.5864,
"step": 510
},
{
"epoch": 1.6097560975609757,
"grad_norm": 1.0421876454341092,
"learning_rate": 3.4281641533603014e-05,
"loss": 0.5623,
"step": 511
},
{
"epoch": 1.6129032258064515,
"grad_norm": 0.6522589351192359,
"learning_rate": 3.425076104319878e-05,
"loss": 0.5834,
"step": 512
},
{
"epoch": 1.6160503540519278,
"grad_norm": 0.7995769943176967,
"learning_rate": 3.4219811385852324e-05,
"loss": 0.5484,
"step": 513
},
{
"epoch": 1.6191974822974036,
"grad_norm": 0.8339896343295545,
"learning_rate": 3.418879271177968e-05,
"loss": 0.5417,
"step": 514
},
{
"epoch": 1.6223446105428796,
"grad_norm": 0.6298557505945559,
"learning_rate": 3.4157705171531904e-05,
"loss": 0.6214,
"step": 515
},
{
"epoch": 1.6254917387883556,
"grad_norm": 0.6627216090718417,
"learning_rate": 3.4126548915994274e-05,
"loss": 0.5306,
"step": 516
},
{
"epoch": 1.6286388670338316,
"grad_norm": 0.8522248942562364,
"learning_rate": 3.4095324096385584e-05,
"loss": 0.5832,
"step": 517
},
{
"epoch": 1.6317859952793077,
"grad_norm": 0.8071162098711794,
"learning_rate": 3.406403086425741e-05,
"loss": 0.5686,
"step": 518
},
{
"epoch": 1.6349331235247835,
"grad_norm": 0.7731287731654394,
"learning_rate": 3.403266937149337e-05,
"loss": 0.5815,
"step": 519
},
{
"epoch": 1.6380802517702597,
"grad_norm": 0.6533031600568543,
"learning_rate": 3.4001239770308394e-05,
"loss": 0.5915,
"step": 520
},
{
"epoch": 1.6412273800157355,
"grad_norm": 0.569436746864039,
"learning_rate": 3.396974221324798e-05,
"loss": 0.5498,
"step": 521
},
{
"epoch": 1.6443745082612118,
"grad_norm": 0.6632852776648928,
"learning_rate": 3.3938176853187456e-05,
"loss": 0.5456,
"step": 522
},
{
"epoch": 1.6475216365066876,
"grad_norm": 0.5695584986960003,
"learning_rate": 3.390654384333122e-05,
"loss": 0.6047,
"step": 523
},
{
"epoch": 1.6506687647521636,
"grad_norm": 0.5940386873921953,
"learning_rate": 3.387484333721203e-05,
"loss": 0.5619,
"step": 524
},
{
"epoch": 1.6538158929976396,
"grad_norm": 0.6418545904839442,
"learning_rate": 3.384307548869023e-05,
"loss": 0.5559,
"step": 525
},
{
"epoch": 1.6569630212431157,
"grad_norm": 0.7206004903104387,
"learning_rate": 3.381124045195304e-05,
"loss": 0.5975,
"step": 526
},
{
"epoch": 1.6601101494885917,
"grad_norm": 0.7182344615178672,
"learning_rate": 3.377933838151374e-05,
"loss": 0.5509,
"step": 527
},
{
"epoch": 1.6632572777340675,
"grad_norm": 0.49685467022501884,
"learning_rate": 3.3747369432210986e-05,
"loss": 0.5579,
"step": 528
},
{
"epoch": 1.6664044059795438,
"grad_norm": 0.7462370895937678,
"learning_rate": 3.371533375920805e-05,
"loss": 0.584,
"step": 529
},
{
"epoch": 1.6695515342250196,
"grad_norm": 0.5844502731729302,
"learning_rate": 3.368323151799201e-05,
"loss": 0.5502,
"step": 530
},
{
"epoch": 1.6726986624704958,
"grad_norm": 0.7995900924270242,
"learning_rate": 3.365106286437309e-05,
"loss": 0.577,
"step": 531
},
{
"epoch": 1.6758457907159716,
"grad_norm": 0.6980897656965266,
"learning_rate": 3.36188279544838e-05,
"loss": 0.5667,
"step": 532
},
{
"epoch": 1.6789929189614476,
"grad_norm": 0.5272573380584673,
"learning_rate": 3.358652694477825e-05,
"loss": 0.5682,
"step": 533
},
{
"epoch": 1.6821400472069237,
"grad_norm": 0.7386619419487044,
"learning_rate": 3.355415999203139e-05,
"loss": 0.5676,
"step": 534
},
{
"epoch": 1.6852871754523997,
"grad_norm": 0.6234902692887895,
"learning_rate": 3.352172725333819e-05,
"loss": 0.5767,
"step": 535
},
{
"epoch": 1.6884343036978757,
"grad_norm": 0.6168550151283233,
"learning_rate": 3.3489228886112955e-05,
"loss": 0.5414,
"step": 536
},
{
"epoch": 1.6915814319433515,
"grad_norm": 0.6616970915400288,
"learning_rate": 3.345666504808848e-05,
"loss": 0.5958,
"step": 537
},
{
"epoch": 1.6947285601888278,
"grad_norm": 0.5250838225987879,
"learning_rate": 3.342403589731537e-05,
"loss": 0.5574,
"step": 538
},
{
"epoch": 1.6978756884343036,
"grad_norm": 0.4902293386353501,
"learning_rate": 3.3391341592161186e-05,
"loss": 0.5668,
"step": 539
},
{
"epoch": 1.7010228166797798,
"grad_norm": 0.6680470248999262,
"learning_rate": 3.3358582291309755e-05,
"loss": 0.576,
"step": 540
},
{
"epoch": 1.7041699449252556,
"grad_norm": 0.5361498475271551,
"learning_rate": 3.3325758153760357e-05,
"loss": 0.5699,
"step": 541
},
{
"epoch": 1.7073170731707317,
"grad_norm": 0.7242179466038916,
"learning_rate": 3.3292869338826944e-05,
"loss": 0.5651,
"step": 542
},
{
"epoch": 1.7104642014162077,
"grad_norm": 0.5773147674606367,
"learning_rate": 3.3259916006137404e-05,
"loss": 0.5399,
"step": 543
},
{
"epoch": 1.7136113296616837,
"grad_norm": 0.7046072332761888,
"learning_rate": 3.3226898315632747e-05,
"loss": 0.5997,
"step": 544
},
{
"epoch": 1.7167584579071598,
"grad_norm": 0.7085851971891551,
"learning_rate": 3.319381642756637e-05,
"loss": 0.5691,
"step": 545
},
{
"epoch": 1.7199055861526356,
"grad_norm": 0.6071053828245467,
"learning_rate": 3.3160670502503245e-05,
"loss": 0.5605,
"step": 546
},
{
"epoch": 1.7230527143981118,
"grad_norm": 0.6496056037574068,
"learning_rate": 3.312746070131913e-05,
"loss": 0.568,
"step": 547
},
{
"epoch": 1.7261998426435876,
"grad_norm": 0.5889215814684938,
"learning_rate": 3.309418718519985e-05,
"loss": 0.5569,
"step": 548
},
{
"epoch": 1.7293469708890639,
"grad_norm": 0.6871169447975174,
"learning_rate": 3.306085011564045e-05,
"loss": 0.5974,
"step": 549
},
{
"epoch": 1.7324940991345397,
"grad_norm": 0.4861712620248983,
"learning_rate": 3.302744965444445e-05,
"loss": 0.5547,
"step": 550
},
{
"epoch": 1.7356412273800157,
"grad_norm": 0.6213321890297462,
"learning_rate": 3.299398596372301e-05,
"loss": 0.5504,
"step": 551
},
{
"epoch": 1.7387883556254917,
"grad_norm": 0.7451909016073313,
"learning_rate": 3.296045920589423e-05,
"loss": 0.5743,
"step": 552
},
{
"epoch": 1.7419354838709677,
"grad_norm": 0.5451433585609038,
"learning_rate": 3.292686954368228e-05,
"loss": 0.5828,
"step": 553
},
{
"epoch": 1.7450826121164438,
"grad_norm": 0.7140894697077395,
"learning_rate": 3.2893217140116636e-05,
"loss": 0.5451,
"step": 554
},
{
"epoch": 1.7482297403619196,
"grad_norm": 0.7917204702318612,
"learning_rate": 3.28595021585313e-05,
"loss": 0.5749,
"step": 555
},
{
"epoch": 1.7513768686073958,
"grad_norm": 0.736299315372961,
"learning_rate": 3.282572476256401e-05,
"loss": 0.5642,
"step": 556
},
{
"epoch": 1.7545239968528716,
"grad_norm": 0.896182709362604,
"learning_rate": 3.2791885116155416e-05,
"loss": 0.5846,
"step": 557
},
{
"epoch": 1.7576711250983479,
"grad_norm": 0.7630553132584026,
"learning_rate": 3.275798338354833e-05,
"loss": 0.5427,
"step": 558
},
{
"epoch": 1.7608182533438237,
"grad_norm": 0.8175895208469556,
"learning_rate": 3.272401972928688e-05,
"loss": 0.5865,
"step": 559
},
{
"epoch": 1.7639653815892997,
"grad_norm": 1.0859209320779282,
"learning_rate": 3.2689994318215754e-05,
"loss": 0.5655,
"step": 560
},
{
"epoch": 1.7671125098347757,
"grad_norm": 0.5813459553206478,
"learning_rate": 3.265590731547936e-05,
"loss": 0.5778,
"step": 561
},
{
"epoch": 1.7702596380802518,
"grad_norm": 0.932110967880023,
"learning_rate": 3.262175888652106e-05,
"loss": 0.5582,
"step": 562
},
{
"epoch": 1.7734067663257278,
"grad_norm": 1.141526320853197,
"learning_rate": 3.258754919708234e-05,
"loss": 0.5511,
"step": 563
},
{
"epoch": 1.7765538945712038,
"grad_norm": 0.7023132464915953,
"learning_rate": 3.255327841320204e-05,
"loss": 0.5876,
"step": 564
},
{
"epoch": 1.7797010228166799,
"grad_norm": 1.273494297512138,
"learning_rate": 3.251894670121549e-05,
"loss": 0.5589,
"step": 565
},
{
"epoch": 1.7828481510621557,
"grad_norm": 0.7051769266957719,
"learning_rate": 3.2484554227753775e-05,
"loss": 0.5926,
"step": 566
},
{
"epoch": 1.785995279307632,
"grad_norm": 0.7937404532482675,
"learning_rate": 3.2450101159742864e-05,
"loss": 0.552,
"step": 567
},
{
"epoch": 1.7891424075531077,
"grad_norm": 0.6969548206163279,
"learning_rate": 3.241558766440284e-05,
"loss": 0.5778,
"step": 568
},
{
"epoch": 1.7922895357985837,
"grad_norm": 0.5249950117268797,
"learning_rate": 3.238101390924706e-05,
"loss": 0.5522,
"step": 569
},
{
"epoch": 1.7954366640440598,
"grad_norm": 0.5989099837768208,
"learning_rate": 3.234638006208138e-05,
"loss": 0.571,
"step": 570
},
{
"epoch": 1.7985837922895358,
"grad_norm": 0.648877478471487,
"learning_rate": 3.231168629100328e-05,
"loss": 0.5605,
"step": 571
},
{
"epoch": 1.8017309205350118,
"grad_norm": 0.7019920843145531,
"learning_rate": 3.227693276440111e-05,
"loss": 0.5837,
"step": 572
},
{
"epoch": 1.8048780487804879,
"grad_norm": 0.7208863035517205,
"learning_rate": 3.224211965095326e-05,
"loss": 0.5771,
"step": 573
},
{
"epoch": 1.8080251770259639,
"grad_norm": 0.582448115703797,
"learning_rate": 3.220724711962729e-05,
"loss": 0.5769,
"step": 574
},
{
"epoch": 1.8111723052714397,
"grad_norm": 0.6093974952092187,
"learning_rate": 3.217231533967917e-05,
"loss": 0.5375,
"step": 575
},
{
"epoch": 1.814319433516916,
"grad_norm": 0.6885252290210996,
"learning_rate": 3.213732448065244e-05,
"loss": 0.5807,
"step": 576
},
{
"epoch": 1.8174665617623917,
"grad_norm": 0.6107097439844204,
"learning_rate": 3.2102274712377384e-05,
"loss": 0.5778,
"step": 577
},
{
"epoch": 1.8206136900078678,
"grad_norm": 0.581905246193802,
"learning_rate": 3.206716620497019e-05,
"loss": 0.5342,
"step": 578
},
{
"epoch": 1.8237608182533438,
"grad_norm": 0.5218197149224699,
"learning_rate": 3.203199912883216e-05,
"loss": 0.5698,
"step": 579
},
{
"epoch": 1.8269079464988198,
"grad_norm": 0.6188418099817343,
"learning_rate": 3.199677365464884e-05,
"loss": 0.5697,
"step": 580
},
{
"epoch": 1.8300550747442959,
"grad_norm": 0.6904870334281441,
"learning_rate": 3.196148995338924e-05,
"loss": 0.5696,
"step": 581
},
{
"epoch": 1.8332022029897719,
"grad_norm": 0.6072285553149802,
"learning_rate": 3.192614819630497e-05,
"loss": 0.5827,
"step": 582
},
{
"epoch": 1.836349331235248,
"grad_norm": 0.5737961625680729,
"learning_rate": 3.189074855492941e-05,
"loss": 0.5633,
"step": 583
},
{
"epoch": 1.8394964594807237,
"grad_norm": 0.5224516792912552,
"learning_rate": 3.185529120107688e-05,
"loss": 0.5523,
"step": 584
},
{
"epoch": 1.8426435877262,
"grad_norm": 0.7333440316338549,
"learning_rate": 3.181977630684183e-05,
"loss": 0.5873,
"step": 585
},
{
"epoch": 1.8457907159716758,
"grad_norm": 0.6648112330877247,
"learning_rate": 3.1784204044597976e-05,
"loss": 0.5526,
"step": 586
},
{
"epoch": 1.8489378442171518,
"grad_norm": 0.5927824601123387,
"learning_rate": 3.174857458699747e-05,
"loss": 0.5667,
"step": 587
},
{
"epoch": 1.8520849724626278,
"grad_norm": 0.8622116267692795,
"learning_rate": 3.171288810697007e-05,
"loss": 0.5867,
"step": 588
},
{
"epoch": 1.8552321007081038,
"grad_norm": 0.5692350152296256,
"learning_rate": 3.167714477772229e-05,
"loss": 0.5831,
"step": 589
},
{
"epoch": 1.8583792289535799,
"grad_norm": 0.755370477014244,
"learning_rate": 3.164134477273658e-05,
"loss": 0.5744,
"step": 590
},
{
"epoch": 1.861526357199056,
"grad_norm": 0.6666470710363162,
"learning_rate": 3.160548826577046e-05,
"loss": 0.5646,
"step": 591
},
{
"epoch": 1.864673485444532,
"grad_norm": 0.6685593450468315,
"learning_rate": 3.156957543085566e-05,
"loss": 0.546,
"step": 592
},
{
"epoch": 1.8678206136900077,
"grad_norm": 0.8519423765856126,
"learning_rate": 3.153360644229735e-05,
"loss": 0.5773,
"step": 593
},
{
"epoch": 1.870967741935484,
"grad_norm": 0.6858413196623089,
"learning_rate": 3.149758147467321e-05,
"loss": 0.5611,
"step": 594
},
{
"epoch": 1.8741148701809598,
"grad_norm": 0.6424906456379045,
"learning_rate": 3.146150070283263e-05,
"loss": 0.5741,
"step": 595
},
{
"epoch": 1.877261998426436,
"grad_norm": 0.7226105609045684,
"learning_rate": 3.142536430189585e-05,
"loss": 0.5635,
"step": 596
},
{
"epoch": 1.8804091266719118,
"grad_norm": 0.5556670020965504,
"learning_rate": 3.1389172447253106e-05,
"loss": 0.5644,
"step": 597
},
{
"epoch": 1.8835562549173879,
"grad_norm": 0.6676742327342214,
"learning_rate": 3.135292531456378e-05,
"loss": 0.5686,
"step": 598
},
{
"epoch": 1.886703383162864,
"grad_norm": 0.5117507094897004,
"learning_rate": 3.131662307975556e-05,
"loss": 0.569,
"step": 599
},
{
"epoch": 1.88985051140834,
"grad_norm": 0.8463097941158746,
"learning_rate": 3.128026591902356e-05,
"loss": 0.5623,
"step": 600
},
{
"epoch": 1.892997639653816,
"grad_norm": 0.5791294005826809,
"learning_rate": 3.1243854008829493e-05,
"loss": 0.5502,
"step": 601
},
{
"epoch": 1.8961447678992918,
"grad_norm": 0.5549775959821279,
"learning_rate": 3.1207387525900795e-05,
"loss": 0.5428,
"step": 602
},
{
"epoch": 1.899291896144768,
"grad_norm": 0.6916786541488753,
"learning_rate": 3.117086664722977e-05,
"loss": 0.5735,
"step": 603
},
{
"epoch": 1.9024390243902438,
"grad_norm": 0.5153262400722942,
"learning_rate": 3.113429155007276e-05,
"loss": 0.5574,
"step": 604
},
{
"epoch": 1.90558615263572,
"grad_norm": 0.6922452984565507,
"learning_rate": 3.109766241194922e-05,
"loss": 0.5672,
"step": 605
},
{
"epoch": 1.9087332808811959,
"grad_norm": 0.6162773584248936,
"learning_rate": 3.106097941064093e-05,
"loss": 0.5616,
"step": 606
},
{
"epoch": 1.911880409126672,
"grad_norm": 0.6022937951308683,
"learning_rate": 3.1024242724191074e-05,
"loss": 0.5729,
"step": 607
},
{
"epoch": 1.915027537372148,
"grad_norm": 0.5971187647434458,
"learning_rate": 3.098745253090342e-05,
"loss": 0.5575,
"step": 608
},
{
"epoch": 1.918174665617624,
"grad_norm": 0.6007052376113771,
"learning_rate": 3.0950609009341424e-05,
"loss": 0.5853,
"step": 609
},
{
"epoch": 1.9213217938631,
"grad_norm": 0.6798286394330892,
"learning_rate": 3.0913712338327376e-05,
"loss": 0.5652,
"step": 610
},
{
"epoch": 1.9244689221085758,
"grad_norm": 0.4998017682786324,
"learning_rate": 3.087676269694153e-05,
"loss": 0.5437,
"step": 611
},
{
"epoch": 1.927616050354052,
"grad_norm": 0.5900141909915693,
"learning_rate": 3.083976026452125e-05,
"loss": 0.5683,
"step": 612
},
{
"epoch": 1.9307631785995278,
"grad_norm": 0.6801189891795948,
"learning_rate": 3.080270522066008e-05,
"loss": 0.5613,
"step": 613
},
{
"epoch": 1.933910306845004,
"grad_norm": 0.6473158183105817,
"learning_rate": 3.076559774520697e-05,
"loss": 0.6134,
"step": 614
},
{
"epoch": 1.93705743509048,
"grad_norm": 0.6272385353530213,
"learning_rate": 3.072843801826532e-05,
"loss": 0.5474,
"step": 615
},
{
"epoch": 1.940204563335956,
"grad_norm": 0.5345902788380883,
"learning_rate": 3.069122622019214e-05,
"loss": 0.5526,
"step": 616
},
{
"epoch": 1.943351691581432,
"grad_norm": 0.5673813938868798,
"learning_rate": 3.065396253159717e-05,
"loss": 0.555,
"step": 617
},
{
"epoch": 1.946498819826908,
"grad_norm": 0.6523154486055993,
"learning_rate": 3.061664713334202e-05,
"loss": 0.5879,
"step": 618
},
{
"epoch": 1.949645948072384,
"grad_norm": 0.5478136247550162,
"learning_rate": 3.057928020653925e-05,
"loss": 0.5626,
"step": 619
},
{
"epoch": 1.9527930763178598,
"grad_norm": 0.6276196177093876,
"learning_rate": 3.054186193255154e-05,
"loss": 0.5753,
"step": 620
},
{
"epoch": 1.955940204563336,
"grad_norm": 0.5863504244120591,
"learning_rate": 3.0504392492990763e-05,
"loss": 0.5411,
"step": 621
},
{
"epoch": 1.9590873328088119,
"grad_norm": 0.6126483770064565,
"learning_rate": 3.0466872069717152e-05,
"loss": 0.57,
"step": 622
},
{
"epoch": 1.9622344610542881,
"grad_norm": 0.7429812948474108,
"learning_rate": 3.0429300844838373e-05,
"loss": 0.5775,
"step": 623
},
{
"epoch": 1.965381589299764,
"grad_norm": 0.5191951431840917,
"learning_rate": 3.0391679000708673e-05,
"loss": 0.5443,
"step": 624
},
{
"epoch": 1.96852871754524,
"grad_norm": 0.518779230546286,
"learning_rate": 3.0354006719927987e-05,
"loss": 0.5356,
"step": 625
},
{
"epoch": 1.971675845790716,
"grad_norm": 0.6399044762326972,
"learning_rate": 3.0316284185341023e-05,
"loss": 0.6069,
"step": 626
},
{
"epoch": 1.974822974036192,
"grad_norm": 0.6811166693200108,
"learning_rate": 3.0278511580036417e-05,
"loss": 0.5327,
"step": 627
},
{
"epoch": 1.977970102281668,
"grad_norm": 0.5946157024716091,
"learning_rate": 3.0240689087345838e-05,
"loss": 0.5843,
"step": 628
},
{
"epoch": 1.9811172305271438,
"grad_norm": 0.6365089414451961,
"learning_rate": 3.0202816890843062e-05,
"loss": 0.5892,
"step": 629
},
{
"epoch": 1.98426435877262,
"grad_norm": 0.7183274496457972,
"learning_rate": 3.016489517434312e-05,
"loss": 0.5145,
"step": 630
},
{
"epoch": 1.987411487018096,
"grad_norm": 0.6164005756653712,
"learning_rate": 3.012692412190138e-05,
"loss": 0.5784,
"step": 631
},
{
"epoch": 1.9905586152635721,
"grad_norm": 0.4388269705199884,
"learning_rate": 3.008890391781268e-05,
"loss": 0.5773,
"step": 632
},
{
"epoch": 1.993705743509048,
"grad_norm": 0.6502984666030054,
"learning_rate": 3.0050834746610412e-05,
"loss": 0.5544,
"step": 633
},
{
"epoch": 1.996852871754524,
"grad_norm": 0.6922648777230136,
"learning_rate": 3.0012716793065634e-05,
"loss": 0.555,
"step": 634
},
{
"epoch": 2.001573564122738,
"grad_norm": 0.8869807905073702,
"learning_rate": 2.9974550242186156e-05,
"loss": 0.9625,
"step": 635
},
{
"epoch": 2.004720692368214,
"grad_norm": 1.0358494933593285,
"learning_rate": 2.993633527921569e-05,
"loss": 0.5616,
"step": 636
},
{
"epoch": 2.00786782061369,
"grad_norm": 0.9346566129032547,
"learning_rate": 2.9898072089632882e-05,
"loss": 0.476,
"step": 637
},
{
"epoch": 2.011014948859166,
"grad_norm": 0.7853808249371318,
"learning_rate": 2.9859760859150488e-05,
"loss": 0.5242,
"step": 638
},
{
"epoch": 2.014162077104642,
"grad_norm": 0.6808785603976919,
"learning_rate": 2.9821401773714394e-05,
"loss": 0.4915,
"step": 639
},
{
"epoch": 2.017309205350118,
"grad_norm": 1.079169965146025,
"learning_rate": 2.9782995019502787e-05,
"loss": 0.5268,
"step": 640
},
{
"epoch": 2.020456333595594,
"grad_norm": 0.6687486233311583,
"learning_rate": 2.97445407829252e-05,
"loss": 0.4946,
"step": 641
},
{
"epoch": 2.02360346184107,
"grad_norm": 0.7161583063428312,
"learning_rate": 2.9706039250621626e-05,
"loss": 0.5282,
"step": 642
},
{
"epoch": 2.026750590086546,
"grad_norm": 0.8180598647734951,
"learning_rate": 2.9667490609461604e-05,
"loss": 0.5101,
"step": 643
},
{
"epoch": 2.029897718332022,
"grad_norm": 0.7917644525656923,
"learning_rate": 2.9628895046543342e-05,
"loss": 0.5029,
"step": 644
},
{
"epoch": 2.033044846577498,
"grad_norm": 0.6513990862103043,
"learning_rate": 2.9590252749192757e-05,
"loss": 0.5118,
"step": 645
},
{
"epoch": 2.036191974822974,
"grad_norm": 0.7808164985343838,
"learning_rate": 2.955156390496262e-05,
"loss": 0.5195,
"step": 646
},
{
"epoch": 2.0393391030684502,
"grad_norm": 0.7912837002277737,
"learning_rate": 2.9512828701631596e-05,
"loss": 0.5265,
"step": 647
},
{
"epoch": 2.042486231313926,
"grad_norm": 0.5929853901605757,
"learning_rate": 2.9474047327203377e-05,
"loss": 0.4743,
"step": 648
},
{
"epoch": 2.045633359559402,
"grad_norm": 0.5776575088322736,
"learning_rate": 2.943521996990573e-05,
"loss": 0.5317,
"step": 649
},
{
"epoch": 2.048780487804878,
"grad_norm": 0.5820287876548046,
"learning_rate": 2.9396346818189633e-05,
"loss": 0.4907,
"step": 650
},
{
"epoch": 2.051927616050354,
"grad_norm": 0.5401403467336713,
"learning_rate": 2.9357428060728286e-05,
"loss": 0.5108,
"step": 651
},
{
"epoch": 2.05507474429583,
"grad_norm": 0.5681209497944092,
"learning_rate": 2.9318463886416286e-05,
"loss": 0.5115,
"step": 652
},
{
"epoch": 2.058221872541306,
"grad_norm": 0.7457016880475935,
"learning_rate": 2.9279454484368622e-05,
"loss": 0.5245,
"step": 653
},
{
"epoch": 2.061369000786782,
"grad_norm": 0.7100559492936808,
"learning_rate": 2.9240400043919846e-05,
"loss": 0.5066,
"step": 654
},
{
"epoch": 2.064516129032258,
"grad_norm": 0.5562962703615024,
"learning_rate": 2.9201300754623046e-05,
"loss": 0.5197,
"step": 655
},
{
"epoch": 2.0676632572777343,
"grad_norm": 0.7228119547511469,
"learning_rate": 2.916215680624904e-05,
"loss": 0.4895,
"step": 656
},
{
"epoch": 2.07081038552321,
"grad_norm": 0.6104757189458597,
"learning_rate": 2.912296838878537e-05,
"loss": 0.5248,
"step": 657
},
{
"epoch": 2.073957513768686,
"grad_norm": 0.6489476520349108,
"learning_rate": 2.9083735692435428e-05,
"loss": 0.5091,
"step": 658
},
{
"epoch": 2.077104642014162,
"grad_norm": 0.5407920259634255,
"learning_rate": 2.9044458907617515e-05,
"loss": 0.5057,
"step": 659
},
{
"epoch": 2.080251770259638,
"grad_norm": 0.5264195062329879,
"learning_rate": 2.9005138224963906e-05,
"loss": 0.5459,
"step": 660
},
{
"epoch": 2.083398898505114,
"grad_norm": 0.5997049734314334,
"learning_rate": 2.896577383531994e-05,
"loss": 0.5098,
"step": 661
},
{
"epoch": 2.08654602675059,
"grad_norm": 0.6428632064411095,
"learning_rate": 2.8926365929743108e-05,
"loss": 0.4887,
"step": 662
},
{
"epoch": 2.0896931549960662,
"grad_norm": 0.45721246844892666,
"learning_rate": 2.888691469950209e-05,
"loss": 0.5194,
"step": 663
},
{
"epoch": 2.092840283241542,
"grad_norm": 0.6356505848651062,
"learning_rate": 2.8847420336075842e-05,
"loss": 0.4969,
"step": 664
},
{
"epoch": 2.0959874114870183,
"grad_norm": 0.5203943278435145,
"learning_rate": 2.880788303115269e-05,
"loss": 0.512,
"step": 665
},
{
"epoch": 2.099134539732494,
"grad_norm": 0.5416771289936778,
"learning_rate": 2.876830297662936e-05,
"loss": 0.5221,
"step": 666
},
{
"epoch": 2.10228166797797,
"grad_norm": 0.48612729179676883,
"learning_rate": 2.8728680364610072e-05,
"loss": 0.4813,
"step": 667
},
{
"epoch": 2.105428796223446,
"grad_norm": 0.5280735471764217,
"learning_rate": 2.8689015387405606e-05,
"loss": 0.5197,
"step": 668
},
{
"epoch": 2.108575924468922,
"grad_norm": 0.6468916366598001,
"learning_rate": 2.8649308237532357e-05,
"loss": 0.5204,
"step": 669
},
{
"epoch": 2.111723052714398,
"grad_norm": 0.4384358357805341,
"learning_rate": 2.860955910771142e-05,
"loss": 0.4934,
"step": 670
},
{
"epoch": 2.114870180959874,
"grad_norm": 0.5293046040256487,
"learning_rate": 2.856976819086762e-05,
"loss": 0.4993,
"step": 671
},
{
"epoch": 2.1180173092053503,
"grad_norm": 0.5255077696255191,
"learning_rate": 2.8529935680128617e-05,
"loss": 0.5212,
"step": 672
},
{
"epoch": 2.121164437450826,
"grad_norm": 0.5342842320364793,
"learning_rate": 2.849006176882394e-05,
"loss": 0.4941,
"step": 673
},
{
"epoch": 2.1243115656963023,
"grad_norm": 0.4952609066161591,
"learning_rate": 2.8450146650484068e-05,
"loss": 0.5259,
"step": 674
},
{
"epoch": 2.127458693941778,
"grad_norm": 0.5889133681118838,
"learning_rate": 2.8410190518839468e-05,
"loss": 0.5226,
"step": 675
},
{
"epoch": 2.1306058221872544,
"grad_norm": 0.5930345041899265,
"learning_rate": 2.8370193567819675e-05,
"loss": 0.5162,
"step": 676
},
{
"epoch": 2.13375295043273,
"grad_norm": 0.4066191621033764,
"learning_rate": 2.833015599155235e-05,
"loss": 0.5079,
"step": 677
},
{
"epoch": 2.136900078678206,
"grad_norm": 0.5174227837268748,
"learning_rate": 2.829007798436232e-05,
"loss": 0.4783,
"step": 678
},
{
"epoch": 2.140047206923682,
"grad_norm": 0.4906472007347332,
"learning_rate": 2.824995974077064e-05,
"loss": 0.5275,
"step": 679
},
{
"epoch": 2.143194335169158,
"grad_norm": 0.46230506366167495,
"learning_rate": 2.820980145549367e-05,
"loss": 0.503,
"step": 680
},
{
"epoch": 2.1463414634146343,
"grad_norm": 0.5467185655256718,
"learning_rate": 2.816960332344212e-05,
"loss": 0.5192,
"step": 681
},
{
"epoch": 2.14948859166011,
"grad_norm": 0.45235629652152315,
"learning_rate": 2.8129365539720073e-05,
"loss": 0.5113,
"step": 682
},
{
"epoch": 2.1526357199055863,
"grad_norm": 0.5083240071575459,
"learning_rate": 2.8089088299624084e-05,
"loss": 0.4992,
"step": 683
},
{
"epoch": 2.155782848151062,
"grad_norm": 0.5603968704569885,
"learning_rate": 2.8048771798642208e-05,
"loss": 0.5324,
"step": 684
},
{
"epoch": 2.1589299763965384,
"grad_norm": 0.4794631589703544,
"learning_rate": 2.8008416232453056e-05,
"loss": 0.482,
"step": 685
},
{
"epoch": 2.162077104642014,
"grad_norm": 0.4324878828428329,
"learning_rate": 2.7968021796924834e-05,
"loss": 0.505,
"step": 686
},
{
"epoch": 2.16522423288749,
"grad_norm": 0.5405954748624254,
"learning_rate": 2.792758868811442e-05,
"loss": 0.5035,
"step": 687
},
{
"epoch": 2.1683713611329662,
"grad_norm": 0.46018663007159943,
"learning_rate": 2.7887117102266373e-05,
"loss": 0.5513,
"step": 688
},
{
"epoch": 2.171518489378442,
"grad_norm": 0.42047087691548024,
"learning_rate": 2.7846607235812032e-05,
"loss": 0.4948,
"step": 689
},
{
"epoch": 2.1746656176239183,
"grad_norm": 0.5427417159577586,
"learning_rate": 2.7806059285368506e-05,
"loss": 0.514,
"step": 690
},
{
"epoch": 2.177812745869394,
"grad_norm": 0.43986897145724735,
"learning_rate": 2.7765473447737767e-05,
"loss": 0.4673,
"step": 691
},
{
"epoch": 2.1809598741148704,
"grad_norm": 0.4754880772061975,
"learning_rate": 2.772484991990565e-05,
"loss": 0.5417,
"step": 692
},
{
"epoch": 2.184107002360346,
"grad_norm": 0.47502258387933016,
"learning_rate": 2.7684188899040955e-05,
"loss": 0.4796,
"step": 693
},
{
"epoch": 2.1872541306058224,
"grad_norm": 0.40705818520163617,
"learning_rate": 2.764349058249443e-05,
"loss": 0.5157,
"step": 694
},
{
"epoch": 2.190401258851298,
"grad_norm": 0.6538511475739581,
"learning_rate": 2.7602755167797853e-05,
"loss": 0.4961,
"step": 695
},
{
"epoch": 2.193548387096774,
"grad_norm": 0.44699881941017616,
"learning_rate": 2.7561982852663045e-05,
"loss": 0.5207,
"step": 696
},
{
"epoch": 2.1966955153422503,
"grad_norm": 0.6212501071680818,
"learning_rate": 2.752117383498095e-05,
"loss": 0.5439,
"step": 697
},
{
"epoch": 2.199842643587726,
"grad_norm": 0.5759129065235197,
"learning_rate": 2.7480328312820633e-05,
"loss": 0.4595,
"step": 698
},
{
"epoch": 2.2029897718332023,
"grad_norm": 0.6097237129221698,
"learning_rate": 2.7439446484428337e-05,
"loss": 0.5334,
"step": 699
},
{
"epoch": 2.206136900078678,
"grad_norm": 0.6817032082261668,
"learning_rate": 2.7398528548226526e-05,
"loss": 0.5309,
"step": 700
},
{
"epoch": 2.2092840283241544,
"grad_norm": 0.5924111305234294,
"learning_rate": 2.7357574702812902e-05,
"loss": 0.484,
"step": 701
},
{
"epoch": 2.21243115656963,
"grad_norm": 0.5523261349165254,
"learning_rate": 2.731658514695948e-05,
"loss": 0.5364,
"step": 702
},
{
"epoch": 2.2155782848151064,
"grad_norm": 0.5572106085621287,
"learning_rate": 2.7275560079611564e-05,
"loss": 0.53,
"step": 703
},
{
"epoch": 2.2187254130605822,
"grad_norm": 0.5065834566763677,
"learning_rate": 2.7234499699886843e-05,
"loss": 0.4973,
"step": 704
},
{
"epoch": 2.221872541306058,
"grad_norm": 0.5003500662242514,
"learning_rate": 2.719340420707439e-05,
"loss": 0.5014,
"step": 705
},
{
"epoch": 2.2250196695515343,
"grad_norm": 0.3741659698585933,
"learning_rate": 2.715227380063369e-05,
"loss": 0.4975,
"step": 706
},
{
"epoch": 2.22816679779701,
"grad_norm": 0.444161319841057,
"learning_rate": 2.711110868019369e-05,
"loss": 0.5203,
"step": 707
},
{
"epoch": 2.2313139260424864,
"grad_norm": 0.4160861591456116,
"learning_rate": 2.706990904555184e-05,
"loss": 0.489,
"step": 708
},
{
"epoch": 2.234461054287962,
"grad_norm": 0.4599418626187508,
"learning_rate": 2.702867509667308e-05,
"loss": 0.5134,
"step": 709
},
{
"epoch": 2.2376081825334384,
"grad_norm": 0.41866439278740125,
"learning_rate": 2.6987407033688914e-05,
"loss": 0.5248,
"step": 710
},
{
"epoch": 2.240755310778914,
"grad_norm": 0.39041212627496713,
"learning_rate": 2.6946105056896406e-05,
"loss": 0.4978,
"step": 711
},
{
"epoch": 2.2439024390243905,
"grad_norm": 0.417852325813291,
"learning_rate": 2.6904769366757245e-05,
"loss": 0.4977,
"step": 712
},
{
"epoch": 2.2470495672698663,
"grad_norm": 0.4808675566465674,
"learning_rate": 2.6863400163896726e-05,
"loss": 0.5183,
"step": 713
},
{
"epoch": 2.250196695515342,
"grad_norm": 0.4161660973559079,
"learning_rate": 2.6821997649102818e-05,
"loss": 0.5278,
"step": 714
},
{
"epoch": 2.2533438237608183,
"grad_norm": 0.45962709409500624,
"learning_rate": 2.678056202332516e-05,
"loss": 0.52,
"step": 715
},
{
"epoch": 2.256490952006294,
"grad_norm": 0.41242936769273836,
"learning_rate": 2.6739093487674104e-05,
"loss": 0.4999,
"step": 716
},
{
"epoch": 2.2596380802517704,
"grad_norm": 0.5058761449658284,
"learning_rate": 2.6697592243419723e-05,
"loss": 0.5216,
"step": 717
},
{
"epoch": 2.262785208497246,
"grad_norm": 0.3744491655107618,
"learning_rate": 2.6656058491990867e-05,
"loss": 0.4743,
"step": 718
},
{
"epoch": 2.2659323367427224,
"grad_norm": 0.5243504074727018,
"learning_rate": 2.661449243497412e-05,
"loss": 0.525,
"step": 719
},
{
"epoch": 2.2690794649881982,
"grad_norm": 0.43302902154306294,
"learning_rate": 2.657289427411291e-05,
"loss": 0.4997,
"step": 720
},
{
"epoch": 2.2722265932336745,
"grad_norm": 0.4225068744947921,
"learning_rate": 2.6531264211306442e-05,
"loss": 0.5153,
"step": 721
},
{
"epoch": 2.2753737214791503,
"grad_norm": 0.45512676490493853,
"learning_rate": 2.6489602448608795e-05,
"loss": 0.5275,
"step": 722
},
{
"epoch": 2.278520849724626,
"grad_norm": 0.3718800854251584,
"learning_rate": 2.6447909188227874e-05,
"loss": 0.4815,
"step": 723
},
{
"epoch": 2.2816679779701023,
"grad_norm": 0.49964042658507957,
"learning_rate": 2.640618463252448e-05,
"loss": 0.5093,
"step": 724
},
{
"epoch": 2.284815106215578,
"grad_norm": 0.5301891798377791,
"learning_rate": 2.63644289840113e-05,
"loss": 0.5242,
"step": 725
},
{
"epoch": 2.2879622344610544,
"grad_norm": 0.4050871646973943,
"learning_rate": 2.6322642445351942e-05,
"loss": 0.5035,
"step": 726
},
{
"epoch": 2.29110936270653,
"grad_norm": 0.5023627156664985,
"learning_rate": 2.6280825219359916e-05,
"loss": 0.507,
"step": 727
},
{
"epoch": 2.2942564909520065,
"grad_norm": 0.4391094098568942,
"learning_rate": 2.6238977508997705e-05,
"loss": 0.5149,
"step": 728
},
{
"epoch": 2.2974036191974823,
"grad_norm": 0.47537818242731106,
"learning_rate": 2.6197099517375728e-05,
"loss": 0.5014,
"step": 729
},
{
"epoch": 2.3005507474429585,
"grad_norm": 0.46340780511359153,
"learning_rate": 2.61551914477514e-05,
"loss": 0.5378,
"step": 730
},
{
"epoch": 2.3036978756884343,
"grad_norm": 0.4381885150917152,
"learning_rate": 2.611325350352808e-05,
"loss": 0.5302,
"step": 731
},
{
"epoch": 2.30684500393391,
"grad_norm": 0.43651726345372543,
"learning_rate": 2.6071285888254173e-05,
"loss": 0.4827,
"step": 732
},
{
"epoch": 2.3099921321793864,
"grad_norm": 0.42700123656874067,
"learning_rate": 2.602928880562206e-05,
"loss": 0.5209,
"step": 733
},
{
"epoch": 2.313139260424862,
"grad_norm": 0.4936338227402977,
"learning_rate": 2.5987262459467168e-05,
"loss": 0.5234,
"step": 734
},
{
"epoch": 2.3162863886703384,
"grad_norm": 0.6422115819077465,
"learning_rate": 2.5945207053766927e-05,
"loss": 0.4934,
"step": 735
},
{
"epoch": 2.3194335169158142,
"grad_norm": 0.5426504118993586,
"learning_rate": 2.5903122792639835e-05,
"loss": 0.4959,
"step": 736
},
{
"epoch": 2.3225806451612905,
"grad_norm": 0.5610780996366858,
"learning_rate": 2.5861009880344435e-05,
"loss": 0.5072,
"step": 737
},
{
"epoch": 2.3257277734067663,
"grad_norm": 0.6128983248924292,
"learning_rate": 2.5818868521278333e-05,
"loss": 0.5295,
"step": 738
},
{
"epoch": 2.3288749016522425,
"grad_norm": 0.4668022048915129,
"learning_rate": 2.577669891997718e-05,
"loss": 0.4872,
"step": 739
},
{
"epoch": 2.3320220298977183,
"grad_norm": 0.5249195533669971,
"learning_rate": 2.573450128111374e-05,
"loss": 0.5111,
"step": 740
},
{
"epoch": 2.335169158143194,
"grad_norm": 0.501737921275211,
"learning_rate": 2.569227580949683e-05,
"loss": 0.5173,
"step": 741
},
{
"epoch": 2.3383162863886704,
"grad_norm": 0.5066876036326522,
"learning_rate": 2.565002271007037e-05,
"loss": 0.502,
"step": 742
},
{
"epoch": 2.341463414634146,
"grad_norm": 0.4365145077451115,
"learning_rate": 2.5607742187912362e-05,
"loss": 0.5084,
"step": 743
},
{
"epoch": 2.3446105428796225,
"grad_norm": 0.44179872832361533,
"learning_rate": 2.5565434448233915e-05,
"loss": 0.5012,
"step": 744
},
{
"epoch": 2.3477576711250983,
"grad_norm": 0.38725032898097067,
"learning_rate": 2.5523099696378238e-05,
"loss": 0.5246,
"step": 745
},
{
"epoch": 2.3509047993705745,
"grad_norm": 0.4516415072913574,
"learning_rate": 2.5480738137819642e-05,
"loss": 0.4923,
"step": 746
},
{
"epoch": 2.3540519276160503,
"grad_norm": 0.405281408168675,
"learning_rate": 2.5438349978162552e-05,
"loss": 0.5248,
"step": 747
},
{
"epoch": 2.3571990558615266,
"grad_norm": 0.45641439852412824,
"learning_rate": 2.5395935423140487e-05,
"loss": 0.5216,
"step": 748
},
{
"epoch": 2.3603461841070024,
"grad_norm": 0.3726940888910086,
"learning_rate": 2.5353494678615107e-05,
"loss": 0.4828,
"step": 749
},
{
"epoch": 2.363493312352478,
"grad_norm": 0.4749642740007426,
"learning_rate": 2.5311027950575152e-05,
"loss": 0.495,
"step": 750
},
{
"epoch": 2.3666404405979544,
"grad_norm": 0.3717895586016363,
"learning_rate": 2.5268535445135505e-05,
"loss": 0.5197,
"step": 751
},
{
"epoch": 2.3697875688434302,
"grad_norm": 0.5304840030716524,
"learning_rate": 2.522601736853613e-05,
"loss": 0.4806,
"step": 752
},
{
"epoch": 2.3729346970889065,
"grad_norm": 0.46220541231520146,
"learning_rate": 2.5183473927141125e-05,
"loss": 0.5116,
"step": 753
},
{
"epoch": 2.3760818253343823,
"grad_norm": 0.5043416489424374,
"learning_rate": 2.514090532743769e-05,
"loss": 0.5078,
"step": 754
},
{
"epoch": 2.3792289535798585,
"grad_norm": 0.42879087005379485,
"learning_rate": 2.509831177603514e-05,
"loss": 0.4947,
"step": 755
},
{
"epoch": 2.3823760818253343,
"grad_norm": 0.4390099762856732,
"learning_rate": 2.505569347966387e-05,
"loss": 0.5158,
"step": 756
},
{
"epoch": 2.3855232100708106,
"grad_norm": 0.42762212558355356,
"learning_rate": 2.5013050645174414e-05,
"loss": 0.501,
"step": 757
},
{
"epoch": 2.3886703383162864,
"grad_norm": 0.39660686265271283,
"learning_rate": 2.497038347953637e-05,
"loss": 0.5072,
"step": 758
},
{
"epoch": 2.391817466561762,
"grad_norm": 0.44332823895817053,
"learning_rate": 2.4927692189837448e-05,
"loss": 0.5079,
"step": 759
},
{
"epoch": 2.3949645948072384,
"grad_norm": 0.3841557280664548,
"learning_rate": 2.4884976983282426e-05,
"loss": 0.5008,
"step": 760
},
{
"epoch": 2.3981117230527143,
"grad_norm": 0.3783162289122301,
"learning_rate": 2.48422380671922e-05,
"loss": 0.5075,
"step": 761
},
{
"epoch": 2.4012588512981905,
"grad_norm": 0.4626314996814243,
"learning_rate": 2.4799475649002706e-05,
"loss": 0.5179,
"step": 762
},
{
"epoch": 2.4044059795436663,
"grad_norm": 0.35098996182801706,
"learning_rate": 2.4756689936263958e-05,
"loss": 0.5106,
"step": 763
},
{
"epoch": 2.4075531077891426,
"grad_norm": 0.5176720851002166,
"learning_rate": 2.471388113663904e-05,
"loss": 0.5474,
"step": 764
},
{
"epoch": 2.4107002360346184,
"grad_norm": 0.44450352974703067,
"learning_rate": 2.4671049457903086e-05,
"loss": 0.443,
"step": 765
},
{
"epoch": 2.4138473642800946,
"grad_norm": 0.5072356340865095,
"learning_rate": 2.4628195107942273e-05,
"loss": 0.5415,
"step": 766
},
{
"epoch": 2.4169944925255704,
"grad_norm": 0.41936557184075446,
"learning_rate": 2.4585318294752818e-05,
"loss": 0.4955,
"step": 767
},
{
"epoch": 2.420141620771046,
"grad_norm": 0.5116342896251873,
"learning_rate": 2.4542419226439953e-05,
"loss": 0.5317,
"step": 768
},
{
"epoch": 2.4232887490165225,
"grad_norm": 0.4557793409894804,
"learning_rate": 2.449949811121695e-05,
"loss": 0.4922,
"step": 769
},
{
"epoch": 2.4264358772619983,
"grad_norm": 0.5252751532185341,
"learning_rate": 2.445655515740406e-05,
"loss": 0.5344,
"step": 770
},
{
"epoch": 2.4295830055074745,
"grad_norm": 0.47915013961773817,
"learning_rate": 2.441359057342755e-05,
"loss": 0.4823,
"step": 771
},
{
"epoch": 2.4327301337529503,
"grad_norm": 0.47466054181163075,
"learning_rate": 2.437060456781865e-05,
"loss": 0.5184,
"step": 772
},
{
"epoch": 2.4358772619984266,
"grad_norm": 0.5095317232071647,
"learning_rate": 2.432759734921259e-05,
"loss": 0.5309,
"step": 773
},
{
"epoch": 2.4390243902439024,
"grad_norm": 0.412356192481546,
"learning_rate": 2.428456912634753e-05,
"loss": 0.4912,
"step": 774
},
{
"epoch": 2.4421715184893786,
"grad_norm": 0.5168874991069228,
"learning_rate": 2.424152010806359e-05,
"loss": 0.5102,
"step": 775
},
{
"epoch": 2.4453186467348544,
"grad_norm": 0.4218110874274139,
"learning_rate": 2.4198450503301816e-05,
"loss": 0.5119,
"step": 776
},
{
"epoch": 2.4484657749803302,
"grad_norm": 0.46960683665524694,
"learning_rate": 2.415536052110318e-05,
"loss": 0.4883,
"step": 777
},
{
"epoch": 2.4516129032258065,
"grad_norm": 0.38171463482040097,
"learning_rate": 2.4112250370607546e-05,
"loss": 0.5033,
"step": 778
},
{
"epoch": 2.4547600314712823,
"grad_norm": 0.5018899134437422,
"learning_rate": 2.4069120261052682e-05,
"loss": 0.5322,
"step": 779
},
{
"epoch": 2.4579071597167585,
"grad_norm": 0.41906891191421164,
"learning_rate": 2.4025970401773204e-05,
"loss": 0.4813,
"step": 780
},
{
"epoch": 2.4610542879622344,
"grad_norm": 0.48223066804804066,
"learning_rate": 2.3982801002199612e-05,
"loss": 0.5062,
"step": 781
},
{
"epoch": 2.4642014162077106,
"grad_norm": 0.5738880479428738,
"learning_rate": 2.393961227185723e-05,
"loss": 0.5011,
"step": 782
},
{
"epoch": 2.4673485444531864,
"grad_norm": 0.5092728168679931,
"learning_rate": 2.38964044203652e-05,
"loss": 0.5038,
"step": 783
},
{
"epoch": 2.4704956726986627,
"grad_norm": 0.5672892970502025,
"learning_rate": 2.385317765743548e-05,
"loss": 0.512,
"step": 784
},
{
"epoch": 2.4736428009441385,
"grad_norm": 0.48443665358354104,
"learning_rate": 2.3809932192871826e-05,
"loss": 0.5249,
"step": 785
},
{
"epoch": 2.4767899291896143,
"grad_norm": 0.4661612625300044,
"learning_rate": 2.3766668236568736e-05,
"loss": 0.5161,
"step": 786
},
{
"epoch": 2.4799370574350905,
"grad_norm": 0.5142533804492694,
"learning_rate": 2.3723385998510478e-05,
"loss": 0.5235,
"step": 787
},
{
"epoch": 2.4830841856805663,
"grad_norm": 0.3731275373448457,
"learning_rate": 2.3680085688770048e-05,
"loss": 0.489,
"step": 788
},
{
"epoch": 2.4862313139260426,
"grad_norm": 0.4812773095674535,
"learning_rate": 2.3636767517508162e-05,
"loss": 0.5204,
"step": 789
},
{
"epoch": 2.4893784421715184,
"grad_norm": 0.36359194843424747,
"learning_rate": 2.359343169497221e-05,
"loss": 0.514,
"step": 790
},
{
"epoch": 2.4925255704169946,
"grad_norm": 0.4419525373424479,
"learning_rate": 2.355007843149526e-05,
"loss": 0.4946,
"step": 791
},
{
"epoch": 2.4956726986624704,
"grad_norm": 0.42979935733957053,
"learning_rate": 2.3506707937495042e-05,
"loss": 0.5281,
"step": 792
},
{
"epoch": 2.4988198269079467,
"grad_norm": 0.3835614105262742,
"learning_rate": 2.3463320423472903e-05,
"loss": 0.5143,
"step": 793
},
{
"epoch": 2.5019669551534225,
"grad_norm": 0.43444497283325684,
"learning_rate": 2.34199161000128e-05,
"loss": 0.5177,
"step": 794
},
{
"epoch": 2.5051140833988983,
"grad_norm": 0.4134676064163985,
"learning_rate": 2.337649517778028e-05,
"loss": 0.5084,
"step": 795
},
{
"epoch": 2.5082612116443745,
"grad_norm": 0.4192423352575172,
"learning_rate": 2.3333057867521452e-05,
"loss": 0.5109,
"step": 796
},
{
"epoch": 2.5114083398898503,
"grad_norm": 0.47607754776561306,
"learning_rate": 2.3289604380061958e-05,
"loss": 0.514,
"step": 797
},
{
"epoch": 2.5145554681353266,
"grad_norm": 0.43895415692577056,
"learning_rate": 2.3246134926305975e-05,
"loss": 0.4877,
"step": 798
},
{
"epoch": 2.5177025963808024,
"grad_norm": 0.45510815079628525,
"learning_rate": 2.3202649717235148e-05,
"loss": 0.5133,
"step": 799
},
{
"epoch": 2.5208497246262787,
"grad_norm": 0.4427380510366315,
"learning_rate": 2.315914896390761e-05,
"loss": 0.5046,
"step": 800
},
{
"epoch": 2.5239968528717545,
"grad_norm": 0.45212416154203056,
"learning_rate": 2.3115632877456934e-05,
"loss": 0.5093,
"step": 801
},
{
"epoch": 2.5271439811172307,
"grad_norm": 0.4848734648886371,
"learning_rate": 2.3072101669091117e-05,
"loss": 0.5403,
"step": 802
},
{
"epoch": 2.5302911093627065,
"grad_norm": 0.4284645582640883,
"learning_rate": 2.3028555550091536e-05,
"loss": 0.5223,
"step": 803
},
{
"epoch": 2.5334382376081823,
"grad_norm": 0.5065448594186726,
"learning_rate": 2.2984994731811958e-05,
"loss": 0.51,
"step": 804
},
{
"epoch": 2.5365853658536586,
"grad_norm": 0.392577515342448,
"learning_rate": 2.2941419425677484e-05,
"loss": 0.5427,
"step": 805
},
{
"epoch": 2.5397324940991344,
"grad_norm": 0.480807545800186,
"learning_rate": 2.289782984318353e-05,
"loss": 0.5121,
"step": 806
},
{
"epoch": 2.5428796223446106,
"grad_norm": 0.4237156745350743,
"learning_rate": 2.2854226195894804e-05,
"loss": 0.4915,
"step": 807
},
{
"epoch": 2.5460267505900864,
"grad_norm": 0.40940864808502503,
"learning_rate": 2.2810608695444292e-05,
"loss": 0.504,
"step": 808
},
{
"epoch": 2.5491738788355627,
"grad_norm": 0.4119851823926001,
"learning_rate": 2.2766977553532187e-05,
"loss": 0.5299,
"step": 809
},
{
"epoch": 2.5523210070810385,
"grad_norm": 0.4130762816212127,
"learning_rate": 2.2723332981924937e-05,
"loss": 0.4969,
"step": 810
},
{
"epoch": 2.5554681353265147,
"grad_norm": 0.41074305886180484,
"learning_rate": 2.2679675192454123e-05,
"loss": 0.4796,
"step": 811
},
{
"epoch": 2.5586152635719905,
"grad_norm": 0.4539999707531148,
"learning_rate": 2.2636004397015512e-05,
"loss": 0.5463,
"step": 812
},
{
"epoch": 2.5617623918174663,
"grad_norm": 0.4264741420489925,
"learning_rate": 2.2592320807567977e-05,
"loss": 0.4768,
"step": 813
},
{
"epoch": 2.5649095200629426,
"grad_norm": 0.4286763576549067,
"learning_rate": 2.2548624636132516e-05,
"loss": 0.5112,
"step": 814
},
{
"epoch": 2.568056648308419,
"grad_norm": 0.44492687170314177,
"learning_rate": 2.2504916094791155e-05,
"loss": 0.508,
"step": 815
},
{
"epoch": 2.5712037765538946,
"grad_norm": 0.37885862660162245,
"learning_rate": 2.2461195395685994e-05,
"loss": 0.5072,
"step": 816
},
{
"epoch": 2.5743509047993705,
"grad_norm": 0.5106741265250734,
"learning_rate": 2.2417462751018113e-05,
"loss": 0.5493,
"step": 817
},
{
"epoch": 2.5774980330448467,
"grad_norm": 0.3632295363346517,
"learning_rate": 2.23737183730466e-05,
"loss": 0.4631,
"step": 818
},
{
"epoch": 2.5806451612903225,
"grad_norm": 0.5365749079329822,
"learning_rate": 2.2329962474087455e-05,
"loss": 0.5158,
"step": 819
},
{
"epoch": 2.5837922895357988,
"grad_norm": 0.4654117353997409,
"learning_rate": 2.2286195266512635e-05,
"loss": 0.5182,
"step": 820
},
{
"epoch": 2.5869394177812746,
"grad_norm": 0.4088610719508472,
"learning_rate": 2.224241696274894e-05,
"loss": 0.5151,
"step": 821
},
{
"epoch": 2.5900865460267504,
"grad_norm": 0.45576138898697316,
"learning_rate": 2.2198627775277078e-05,
"loss": 0.529,
"step": 822
},
{
"epoch": 2.5932336742722266,
"grad_norm": 0.4131002917308441,
"learning_rate": 2.215482791663054e-05,
"loss": 0.4844,
"step": 823
},
{
"epoch": 2.596380802517703,
"grad_norm": 0.4606000370803025,
"learning_rate": 2.2111017599394627e-05,
"loss": 0.5081,
"step": 824
},
{
"epoch": 2.5995279307631787,
"grad_norm": 0.502321230958265,
"learning_rate": 2.20671970362054e-05,
"loss": 0.4879,
"step": 825
},
{
"epoch": 2.6026750590086545,
"grad_norm": 0.42224326757300795,
"learning_rate": 2.2023366439748647e-05,
"loss": 0.4969,
"step": 826
},
{
"epoch": 2.6058221872541307,
"grad_norm": 0.5269049401237401,
"learning_rate": 2.1979526022758857e-05,
"loss": 0.5288,
"step": 827
},
{
"epoch": 2.6089693154996065,
"grad_norm": 0.46230735850829585,
"learning_rate": 2.1935675998018175e-05,
"loss": 0.5053,
"step": 828
},
{
"epoch": 2.612116443745083,
"grad_norm": 0.4727993429576673,
"learning_rate": 2.1891816578355387e-05,
"loss": 0.5197,
"step": 829
},
{
"epoch": 2.6152635719905586,
"grad_norm": 0.44264214533923524,
"learning_rate": 2.1847947976644882e-05,
"loss": 0.5018,
"step": 830
},
{
"epoch": 2.6184107002360344,
"grad_norm": 0.5055955604260659,
"learning_rate": 2.18040704058056e-05,
"loss": 0.5169,
"step": 831
},
{
"epoch": 2.6215578284815106,
"grad_norm": 0.43677389530029376,
"learning_rate": 2.1760184078800016e-05,
"loss": 0.5045,
"step": 832
},
{
"epoch": 2.624704956726987,
"grad_norm": 0.5768947879521789,
"learning_rate": 2.1716289208633116e-05,
"loss": 0.5246,
"step": 833
},
{
"epoch": 2.6278520849724627,
"grad_norm": 0.4712226478373909,
"learning_rate": 2.1672386008351346e-05,
"loss": 0.4935,
"step": 834
},
{
"epoch": 2.6309992132179385,
"grad_norm": 0.5337996631076506,
"learning_rate": 2.162847469104157e-05,
"loss": 0.5203,
"step": 835
},
{
"epoch": 2.6341463414634148,
"grad_norm": 0.4367835591952792,
"learning_rate": 2.1584555469830067e-05,
"loss": 0.4775,
"step": 836
},
{
"epoch": 2.6372934697088906,
"grad_norm": 0.513188259032418,
"learning_rate": 2.154062855788146e-05,
"loss": 0.5008,
"step": 837
},
{
"epoch": 2.640440597954367,
"grad_norm": 0.45447000330227766,
"learning_rate": 2.1496694168397716e-05,
"loss": 0.5321,
"step": 838
},
{
"epoch": 2.6435877261998426,
"grad_norm": 0.47125461614152103,
"learning_rate": 2.1452752514617087e-05,
"loss": 0.4861,
"step": 839
},
{
"epoch": 2.6467348544453184,
"grad_norm": 0.4409432391062593,
"learning_rate": 2.140880380981307e-05,
"loss": 0.5233,
"step": 840
},
{
"epoch": 2.6498819826907947,
"grad_norm": 0.5309669026394624,
"learning_rate": 2.1364848267293424e-05,
"loss": 0.5338,
"step": 841
},
{
"epoch": 2.653029110936271,
"grad_norm": 0.40586300361985456,
"learning_rate": 2.1320886100399045e-05,
"loss": 0.5037,
"step": 842
},
{
"epoch": 2.6561762391817467,
"grad_norm": 0.540051626366727,
"learning_rate": 2.1276917522503017e-05,
"loss": 0.528,
"step": 843
},
{
"epoch": 2.6593233674272225,
"grad_norm": 0.4006507834778298,
"learning_rate": 2.1232942747009516e-05,
"loss": 0.4866,
"step": 844
},
{
"epoch": 2.662470495672699,
"grad_norm": 0.6074678023209782,
"learning_rate": 2.1188961987352825e-05,
"loss": 0.4863,
"step": 845
},
{
"epoch": 2.6656176239181746,
"grad_norm": 0.524924268220035,
"learning_rate": 2.1144975456996254e-05,
"loss": 0.5609,
"step": 846
},
{
"epoch": 2.668764752163651,
"grad_norm": 0.48996765687142846,
"learning_rate": 2.1100983369431113e-05,
"loss": 0.4959,
"step": 847
},
{
"epoch": 2.6719118804091266,
"grad_norm": 0.4249594980122473,
"learning_rate": 2.1056985938175702e-05,
"loss": 0.5125,
"step": 848
},
{
"epoch": 2.6750590086546024,
"grad_norm": 0.49834813656701993,
"learning_rate": 2.1012983376774255e-05,
"loss": 0.5047,
"step": 849
},
{
"epoch": 2.6782061369000787,
"grad_norm": 0.4168575322233032,
"learning_rate": 2.09689758987959e-05,
"loss": 0.5106,
"step": 850
},
{
"epoch": 2.681353265145555,
"grad_norm": 0.46898762750512035,
"learning_rate": 2.0924963717833625e-05,
"loss": 0.4981,
"step": 851
},
{
"epoch": 2.6845003933910307,
"grad_norm": 0.45576203072274146,
"learning_rate": 2.0880947047503245e-05,
"loss": 0.4946,
"step": 852
},
{
"epoch": 2.6876475216365066,
"grad_norm": 0.4738564187252312,
"learning_rate": 2.0836926101442375e-05,
"loss": 0.5331,
"step": 853
},
{
"epoch": 2.690794649881983,
"grad_norm": 0.4302791146012412,
"learning_rate": 2.079290109330937e-05,
"loss": 0.4961,
"step": 854
},
{
"epoch": 2.6939417781274586,
"grad_norm": 0.47679421081246964,
"learning_rate": 2.074887223678231e-05,
"loss": 0.5207,
"step": 855
},
{
"epoch": 2.697088906372935,
"grad_norm": 0.4209024643198245,
"learning_rate": 2.070483974555794e-05,
"loss": 0.5094,
"step": 856
},
{
"epoch": 2.7002360346184107,
"grad_norm": 0.40617414636341587,
"learning_rate": 2.066080383335067e-05,
"loss": 0.5175,
"step": 857
},
{
"epoch": 2.7033831628638865,
"grad_norm": 0.42113681289201627,
"learning_rate": 2.061676471389149e-05,
"loss": 0.4998,
"step": 858
},
{
"epoch": 2.7065302911093627,
"grad_norm": 0.39058660308978405,
"learning_rate": 2.0572722600926963e-05,
"loss": 0.5258,
"step": 859
},
{
"epoch": 2.709677419354839,
"grad_norm": 0.3853478587959275,
"learning_rate": 2.0528677708218182e-05,
"loss": 0.5054,
"step": 860
},
{
"epoch": 2.7128245476003148,
"grad_norm": 0.3661149507703519,
"learning_rate": 2.0484630249539744e-05,
"loss": 0.5043,
"step": 861
},
{
"epoch": 2.7159716758457906,
"grad_norm": 0.40711054029867527,
"learning_rate": 2.0440580438678683e-05,
"loss": 0.5244,
"step": 862
},
{
"epoch": 2.719118804091267,
"grad_norm": 0.3652482622197375,
"learning_rate": 2.0396528489433462e-05,
"loss": 0.4899,
"step": 863
},
{
"epoch": 2.7222659323367426,
"grad_norm": 0.3720015216795421,
"learning_rate": 2.03524746156129e-05,
"loss": 0.4915,
"step": 864
},
{
"epoch": 2.725413060582219,
"grad_norm": 0.44032053432840434,
"learning_rate": 2.0308419031035198e-05,
"loss": 0.5275,
"step": 865
},
{
"epoch": 2.7285601888276947,
"grad_norm": 0.3737368381561298,
"learning_rate": 2.026436194952682e-05,
"loss": 0.5182,
"step": 866
},
{
"epoch": 2.7317073170731705,
"grad_norm": 0.4122697478594307,
"learning_rate": 2.0220303584921517e-05,
"loss": 0.5256,
"step": 867
},
{
"epoch": 2.7348544453186467,
"grad_norm": 0.36665473236295937,
"learning_rate": 2.0176244151059255e-05,
"loss": 0.5173,
"step": 868
},
{
"epoch": 2.738001573564123,
"grad_norm": 0.3335486591184441,
"learning_rate": 2.0132183861785212e-05,
"loss": 0.4572,
"step": 869
},
{
"epoch": 2.741148701809599,
"grad_norm": 0.38036549112647583,
"learning_rate": 2.008812293094869e-05,
"loss": 0.5377,
"step": 870
},
{
"epoch": 2.7442958300550746,
"grad_norm": 0.33037565691132226,
"learning_rate": 2.0044061572402115e-05,
"loss": 0.491,
"step": 871
},
{
"epoch": 2.747442958300551,
"grad_norm": 0.39753101673110847,
"learning_rate": 2e-05,
"loss": 0.5177,
"step": 872
},
{
"epoch": 2.7505900865460267,
"grad_norm": 0.37934933259758424,
"learning_rate": 1.9955938427597892e-05,
"loss": 0.523,
"step": 873
},
{
"epoch": 2.753737214791503,
"grad_norm": 0.4113073150527321,
"learning_rate": 1.9911877069051317e-05,
"loss": 0.501,
"step": 874
},
{
"epoch": 2.7568843430369787,
"grad_norm": 0.36383957044919724,
"learning_rate": 1.9867816138214794e-05,
"loss": 0.4869,
"step": 875
},
{
"epoch": 2.7600314712824545,
"grad_norm": 0.40346499233864774,
"learning_rate": 1.9823755848940745e-05,
"loss": 0.5307,
"step": 876
},
{
"epoch": 2.7631785995279308,
"grad_norm": 0.40358164566370097,
"learning_rate": 1.9779696415078493e-05,
"loss": 0.5199,
"step": 877
},
{
"epoch": 2.766325727773407,
"grad_norm": 0.3387651787698826,
"learning_rate": 1.973563805047319e-05,
"loss": 0.4933,
"step": 878
},
{
"epoch": 2.769472856018883,
"grad_norm": 0.4119862988912321,
"learning_rate": 1.969158096896481e-05,
"loss": 0.505,
"step": 879
},
{
"epoch": 2.7726199842643586,
"grad_norm": 0.398916067627908,
"learning_rate": 1.9647525384387102e-05,
"loss": 0.5274,
"step": 880
},
{
"epoch": 2.775767112509835,
"grad_norm": 0.3925892974223439,
"learning_rate": 1.9603471510566545e-05,
"loss": 0.5213,
"step": 881
},
{
"epoch": 2.7789142407553107,
"grad_norm": 0.4005281636322755,
"learning_rate": 1.955941956132132e-05,
"loss": 0.5029,
"step": 882
},
{
"epoch": 2.782061369000787,
"grad_norm": 0.33952223831530753,
"learning_rate": 1.951536975046026e-05,
"loss": 0.5198,
"step": 883
},
{
"epoch": 2.7852084972462627,
"grad_norm": 0.38608198397536103,
"learning_rate": 1.9471322291781818e-05,
"loss": 0.4766,
"step": 884
},
{
"epoch": 2.7883556254917385,
"grad_norm": 0.365009789979654,
"learning_rate": 1.9427277399073047e-05,
"loss": 0.5072,
"step": 885
},
{
"epoch": 2.791502753737215,
"grad_norm": 0.35042837456840825,
"learning_rate": 1.9383235286108522e-05,
"loss": 0.5237,
"step": 886
},
{
"epoch": 2.794649881982691,
"grad_norm": 0.36211165712974785,
"learning_rate": 1.9339196166649333e-05,
"loss": 0.5082,
"step": 887
},
{
"epoch": 2.797797010228167,
"grad_norm": 0.329398288687774,
"learning_rate": 1.9295160254442062e-05,
"loss": 0.4907,
"step": 888
},
{
"epoch": 2.8009441384736427,
"grad_norm": 0.3921121599600544,
"learning_rate": 1.9251127763217695e-05,
"loss": 0.5087,
"step": 889
},
{
"epoch": 2.804091266719119,
"grad_norm": 0.39222971203676776,
"learning_rate": 1.9207098906690632e-05,
"loss": 0.5155,
"step": 890
},
{
"epoch": 2.8072383949645947,
"grad_norm": 0.3869489751568991,
"learning_rate": 1.916307389855763e-05,
"loss": 0.4923,
"step": 891
},
{
"epoch": 2.810385523210071,
"grad_norm": 0.36534142344404397,
"learning_rate": 1.911905295249676e-05,
"loss": 0.5173,
"step": 892
},
{
"epoch": 2.8135326514555468,
"grad_norm": 0.4096373953978077,
"learning_rate": 1.9075036282166385e-05,
"loss": 0.4787,
"step": 893
},
{
"epoch": 2.8166797797010226,
"grad_norm": 0.42907981958111935,
"learning_rate": 1.903102410120411e-05,
"loss": 0.4999,
"step": 894
},
{
"epoch": 2.819826907946499,
"grad_norm": 0.44754981578982994,
"learning_rate": 1.8987016623225748e-05,
"loss": 0.5223,
"step": 895
},
{
"epoch": 2.822974036191975,
"grad_norm": 0.4503356738206301,
"learning_rate": 1.8943014061824304e-05,
"loss": 0.483,
"step": 896
},
{
"epoch": 2.826121164437451,
"grad_norm": 0.4462637475879935,
"learning_rate": 1.889901663056889e-05,
"loss": 0.5146,
"step": 897
},
{
"epoch": 2.8292682926829267,
"grad_norm": 0.4758553248752999,
"learning_rate": 1.8855024543003756e-05,
"loss": 0.4987,
"step": 898
},
{
"epoch": 2.832415420928403,
"grad_norm": 0.3818873003232349,
"learning_rate": 1.8811038012647175e-05,
"loss": 0.5232,
"step": 899
},
{
"epoch": 2.8355625491738787,
"grad_norm": 0.4459217250813105,
"learning_rate": 1.8767057252990484e-05,
"loss": 0.4975,
"step": 900
},
{
"epoch": 2.838709677419355,
"grad_norm": 0.45703475054822507,
"learning_rate": 1.8723082477496993e-05,
"loss": 0.5226,
"step": 901
},
{
"epoch": 2.841856805664831,
"grad_norm": 0.49145431991499045,
"learning_rate": 1.8679113899600965e-05,
"loss": 0.5098,
"step": 902
},
{
"epoch": 2.8450039339103066,
"grad_norm": 0.3858560750864749,
"learning_rate": 1.8635151732706586e-05,
"loss": 0.512,
"step": 903
},
{
"epoch": 2.848151062155783,
"grad_norm": 0.3978300138355392,
"learning_rate": 1.859119619018693e-05,
"loss": 0.4671,
"step": 904
},
{
"epoch": 2.851298190401259,
"grad_norm": 0.40494272314425184,
"learning_rate": 1.854724748538292e-05,
"loss": 0.5072,
"step": 905
},
{
"epoch": 2.854445318646735,
"grad_norm": 0.4247470229056908,
"learning_rate": 1.850330583160229e-05,
"loss": 0.5315,
"step": 906
},
{
"epoch": 2.8575924468922107,
"grad_norm": 0.4169321625496455,
"learning_rate": 1.8459371442118542e-05,
"loss": 0.5246,
"step": 907
},
{
"epoch": 2.860739575137687,
"grad_norm": 0.3901805457711373,
"learning_rate": 1.8415444530169936e-05,
"loss": 0.5006,
"step": 908
},
{
"epoch": 2.8638867033831628,
"grad_norm": 0.4121320993719406,
"learning_rate": 1.837152530895844e-05,
"loss": 0.507,
"step": 909
},
{
"epoch": 2.867033831628639,
"grad_norm": 0.36255560340653853,
"learning_rate": 1.8327613991648657e-05,
"loss": 0.501,
"step": 910
},
{
"epoch": 2.870180959874115,
"grad_norm": 0.48965568319475056,
"learning_rate": 1.8283710791366887e-05,
"loss": 0.4897,
"step": 911
},
{
"epoch": 2.8733280881195906,
"grad_norm": 0.388852081772549,
"learning_rate": 1.823981592119999e-05,
"loss": 0.5012,
"step": 912
},
{
"epoch": 2.876475216365067,
"grad_norm": 0.42848323102553554,
"learning_rate": 1.8195929594194404e-05,
"loss": 0.4902,
"step": 913
},
{
"epoch": 2.879622344610543,
"grad_norm": 0.47008473589718597,
"learning_rate": 1.8152052023355125e-05,
"loss": 0.5589,
"step": 914
},
{
"epoch": 2.882769472856019,
"grad_norm": 0.30454578165861085,
"learning_rate": 1.8108183421644613e-05,
"loss": 0.4709,
"step": 915
},
{
"epoch": 2.8859166011014947,
"grad_norm": 0.4002304069091809,
"learning_rate": 1.806432400198183e-05,
"loss": 0.5426,
"step": 916
},
{
"epoch": 2.889063729346971,
"grad_norm": 0.3787420784015964,
"learning_rate": 1.8020473977241157e-05,
"loss": 0.4764,
"step": 917
},
{
"epoch": 2.892210857592447,
"grad_norm": 0.4353640762286048,
"learning_rate": 1.797663356025136e-05,
"loss": 0.5275,
"step": 918
},
{
"epoch": 2.895357985837923,
"grad_norm": 0.5073628969017314,
"learning_rate": 1.7932802963794607e-05,
"loss": 0.5153,
"step": 919
},
{
"epoch": 2.898505114083399,
"grad_norm": 0.5041788326127704,
"learning_rate": 1.7888982400605376e-05,
"loss": 0.4932,
"step": 920
},
{
"epoch": 2.9016522423288746,
"grad_norm": 0.460386672312264,
"learning_rate": 1.7845172083369465e-05,
"loss": 0.5119,
"step": 921
},
{
"epoch": 2.904799370574351,
"grad_norm": 0.4781432319925081,
"learning_rate": 1.7801372224722925e-05,
"loss": 0.5004,
"step": 922
},
{
"epoch": 2.907946498819827,
"grad_norm": 0.40584542775039373,
"learning_rate": 1.775758303725106e-05,
"loss": 0.5166,
"step": 923
},
{
"epoch": 2.911093627065303,
"grad_norm": 0.42543709277206193,
"learning_rate": 1.7713804733487375e-05,
"loss": 0.508,
"step": 924
},
{
"epoch": 2.9142407553107788,
"grad_norm": 0.42925628769889695,
"learning_rate": 1.7670037525912555e-05,
"loss": 0.506,
"step": 925
},
{
"epoch": 2.917387883556255,
"grad_norm": 0.3766803868387005,
"learning_rate": 1.762628162695341e-05,
"loss": 0.5205,
"step": 926
},
{
"epoch": 2.920535011801731,
"grad_norm": 0.45357831441605395,
"learning_rate": 1.7582537248981894e-05,
"loss": 0.4778,
"step": 927
},
{
"epoch": 2.923682140047207,
"grad_norm": 0.42810065630689936,
"learning_rate": 1.753880460431401e-05,
"loss": 0.5002,
"step": 928
},
{
"epoch": 2.926829268292683,
"grad_norm": 0.4755212300543142,
"learning_rate": 1.749508390520885e-05,
"loss": 0.5384,
"step": 929
},
{
"epoch": 2.9299763965381587,
"grad_norm": 0.4293108713905954,
"learning_rate": 1.7451375363867487e-05,
"loss": 0.5221,
"step": 930
},
{
"epoch": 2.933123524783635,
"grad_norm": 0.42645458489471416,
"learning_rate": 1.7407679192432023e-05,
"loss": 0.4949,
"step": 931
},
{
"epoch": 2.936270653029111,
"grad_norm": 0.45544102637961936,
"learning_rate": 1.736399560298449e-05,
"loss": 0.5314,
"step": 932
},
{
"epoch": 2.939417781274587,
"grad_norm": 0.4357317492875381,
"learning_rate": 1.732032480754589e-05,
"loss": 0.4868,
"step": 933
},
{
"epoch": 2.9425649095200628,
"grad_norm": 0.5080522514835767,
"learning_rate": 1.7276667018075073e-05,
"loss": 0.5223,
"step": 934
},
{
"epoch": 2.945712037765539,
"grad_norm": 0.4545220250229832,
"learning_rate": 1.7233022446467817e-05,
"loss": 0.4655,
"step": 935
},
{
"epoch": 2.948859166011015,
"grad_norm": 0.5764080721118271,
"learning_rate": 1.7189391304555715e-05,
"loss": 0.5433,
"step": 936
},
{
"epoch": 2.952006294256491,
"grad_norm": 0.5051853544982342,
"learning_rate": 1.71457738041052e-05,
"loss": 0.4882,
"step": 937
},
{
"epoch": 2.955153422501967,
"grad_norm": 0.541977388868933,
"learning_rate": 1.7102170156816473e-05,
"loss": 0.5092,
"step": 938
},
{
"epoch": 2.9583005507474427,
"grad_norm": 0.4421182710596497,
"learning_rate": 1.705858057432252e-05,
"loss": 0.512,
"step": 939
},
{
"epoch": 2.961447678992919,
"grad_norm": 0.4291759732245689,
"learning_rate": 1.7015005268188042e-05,
"loss": 0.5096,
"step": 940
},
{
"epoch": 2.964594807238395,
"grad_norm": 0.42281901679826367,
"learning_rate": 1.6971444449908474e-05,
"loss": 0.496,
"step": 941
},
{
"epoch": 2.967741935483871,
"grad_norm": 0.36550645510303176,
"learning_rate": 1.6927898330908893e-05,
"loss": 0.5298,
"step": 942
},
{
"epoch": 2.970889063729347,
"grad_norm": 0.4439902202892732,
"learning_rate": 1.6884367122543072e-05,
"loss": 0.4919,
"step": 943
},
{
"epoch": 2.974036191974823,
"grad_norm": 0.4145819697915999,
"learning_rate": 1.6840851036092395e-05,
"loss": 0.5012,
"step": 944
},
{
"epoch": 2.977183320220299,
"grad_norm": 0.4024149607221758,
"learning_rate": 1.6797350282764856e-05,
"loss": 0.535,
"step": 945
},
{
"epoch": 2.980330448465775,
"grad_norm": 0.4088679260911552,
"learning_rate": 1.6753865073694028e-05,
"loss": 0.5207,
"step": 946
},
{
"epoch": 2.983477576711251,
"grad_norm": 0.4722988645871877,
"learning_rate": 1.6710395619938042e-05,
"loss": 0.499,
"step": 947
},
{
"epoch": 2.9866247049567267,
"grad_norm": 0.33656951076586683,
"learning_rate": 1.666694213247855e-05,
"loss": 0.5037,
"step": 948
},
{
"epoch": 2.989771833202203,
"grad_norm": 0.4038311667361262,
"learning_rate": 1.6623504822219726e-05,
"loss": 0.5221,
"step": 949
},
{
"epoch": 2.992918961447679,
"grad_norm": 0.3656238746602209,
"learning_rate": 1.658008389998721e-05,
"loss": 0.4887,
"step": 950
},
{
"epoch": 2.996066089693155,
"grad_norm": 0.3597010342945152,
"learning_rate": 1.6536679576527104e-05,
"loss": 0.5313,
"step": 951
},
{
"epoch": 3.000786782061369,
"grad_norm": 0.8437011095934643,
"learning_rate": 1.6493292062504965e-05,
"loss": 0.873,
"step": 952
},
{
"epoch": 3.003933910306845,
"grad_norm": 0.4894489456737364,
"learning_rate": 1.6449921568504747e-05,
"loss": 0.4624,
"step": 953
},
{
"epoch": 3.0070810385523212,
"grad_norm": 0.5196796511583233,
"learning_rate": 1.6406568305027798e-05,
"loss": 0.4646,
"step": 954
},
{
"epoch": 3.010228166797797,
"grad_norm": 0.6421523874225777,
"learning_rate": 1.6363232482491844e-05,
"loss": 0.4526,
"step": 955
},
{
"epoch": 3.013375295043273,
"grad_norm": 0.47988438785390297,
"learning_rate": 1.631991431122995e-05,
"loss": 0.4308,
"step": 956
},
{
"epoch": 3.016522423288749,
"grad_norm": 0.607462624981888,
"learning_rate": 1.627661400148953e-05,
"loss": 0.4859,
"step": 957
},
{
"epoch": 3.019669551534225,
"grad_norm": 0.5030069793452936,
"learning_rate": 1.6233331763431274e-05,
"loss": 0.4513,
"step": 958
},
{
"epoch": 3.022816679779701,
"grad_norm": 0.5778067720286014,
"learning_rate": 1.6190067807128184e-05,
"loss": 0.4531,
"step": 959
},
{
"epoch": 3.025963808025177,
"grad_norm": 0.39106884195501385,
"learning_rate": 1.6146822342564525e-05,
"loss": 0.4201,
"step": 960
},
{
"epoch": 3.029110936270653,
"grad_norm": 0.6507261847344724,
"learning_rate": 1.6103595579634806e-05,
"loss": 0.4846,
"step": 961
},
{
"epoch": 3.032258064516129,
"grad_norm": 0.38933170208126056,
"learning_rate": 1.606038772814278e-05,
"loss": 0.4406,
"step": 962
},
{
"epoch": 3.0354051927616053,
"grad_norm": 0.5285339792483837,
"learning_rate": 1.6017198997800395e-05,
"loss": 0.4531,
"step": 963
},
{
"epoch": 3.038552321007081,
"grad_norm": 0.4647220568403595,
"learning_rate": 1.5974029598226796e-05,
"loss": 0.4513,
"step": 964
},
{
"epoch": 3.041699449252557,
"grad_norm": 0.39006146074109477,
"learning_rate": 1.5930879738947328e-05,
"loss": 0.4525,
"step": 965
},
{
"epoch": 3.044846577498033,
"grad_norm": 0.5111098953041673,
"learning_rate": 1.588774962939246e-05,
"loss": 0.4518,
"step": 966
},
{
"epoch": 3.047993705743509,
"grad_norm": 0.40886909787213843,
"learning_rate": 1.5844639478896827e-05,
"loss": 0.457,
"step": 967
},
{
"epoch": 3.051140833988985,
"grad_norm": 0.41611581011446236,
"learning_rate": 1.580154949669819e-05,
"loss": 0.4295,
"step": 968
},
{
"epoch": 3.054287962234461,
"grad_norm": 0.3846880322150094,
"learning_rate": 1.5758479891936418e-05,
"loss": 0.4377,
"step": 969
},
{
"epoch": 3.0574350904799372,
"grad_norm": 0.39308263395636117,
"learning_rate": 1.5715430873652476e-05,
"loss": 0.449,
"step": 970
},
{
"epoch": 3.060582218725413,
"grad_norm": 0.3664501208137125,
"learning_rate": 1.5672402650787412e-05,
"loss": 0.4367,
"step": 971
},
{
"epoch": 3.0637293469708893,
"grad_norm": 0.39215891536789105,
"learning_rate": 1.5629395432181352e-05,
"loss": 0.4569,
"step": 972
},
{
"epoch": 3.066876475216365,
"grad_norm": 0.37791276065723745,
"learning_rate": 1.5586409426572462e-05,
"loss": 0.4553,
"step": 973
},
{
"epoch": 3.070023603461841,
"grad_norm": 0.34823341311056166,
"learning_rate": 1.554344484259595e-05,
"loss": 0.4417,
"step": 974
},
{
"epoch": 3.073170731707317,
"grad_norm": 0.42528363706310823,
"learning_rate": 1.5500501888783057e-05,
"loss": 0.4421,
"step": 975
},
{
"epoch": 3.076317859952793,
"grad_norm": 0.35592182294418906,
"learning_rate": 1.545758077356005e-05,
"loss": 0.4596,
"step": 976
},
{
"epoch": 3.079464988198269,
"grad_norm": 0.41726134174420965,
"learning_rate": 1.541468170524719e-05,
"loss": 0.4256,
"step": 977
},
{
"epoch": 3.082612116443745,
"grad_norm": 0.3579509357206158,
"learning_rate": 1.537180489205773e-05,
"loss": 0.4766,
"step": 978
},
{
"epoch": 3.0857592446892212,
"grad_norm": 0.34613018627695974,
"learning_rate": 1.5328950542096917e-05,
"loss": 0.43,
"step": 979
},
{
"epoch": 3.088906372934697,
"grad_norm": 0.37351932093944284,
"learning_rate": 1.5286118863360963e-05,
"loss": 0.4533,
"step": 980
},
{
"epoch": 3.0920535011801733,
"grad_norm": 0.3788549663096128,
"learning_rate": 1.5243310063736052e-05,
"loss": 0.4673,
"step": 981
},
{
"epoch": 3.095200629425649,
"grad_norm": 0.37077490540161007,
"learning_rate": 1.5200524350997306e-05,
"loss": 0.4488,
"step": 982
},
{
"epoch": 3.098347757671125,
"grad_norm": 0.3667195042366108,
"learning_rate": 1.5157761932807806e-05,
"loss": 0.4555,
"step": 983
},
{
"epoch": 3.101494885916601,
"grad_norm": 0.3756336351908015,
"learning_rate": 1.5115023016717576e-05,
"loss": 0.455,
"step": 984
},
{
"epoch": 3.104642014162077,
"grad_norm": 0.3467248778353456,
"learning_rate": 1.5072307810162559e-05,
"loss": 0.4429,
"step": 985
},
{
"epoch": 3.107789142407553,
"grad_norm": 0.4074814463065111,
"learning_rate": 1.5029616520463636e-05,
"loss": 0.457,
"step": 986
},
{
"epoch": 3.110936270653029,
"grad_norm": 0.3735336842092654,
"learning_rate": 1.498694935482559e-05,
"loss": 0.4599,
"step": 987
},
{
"epoch": 3.1140833988985053,
"grad_norm": 0.3645860933631925,
"learning_rate": 1.4944306520336129e-05,
"loss": 0.4395,
"step": 988
},
{
"epoch": 3.117230527143981,
"grad_norm": 0.3954959214109424,
"learning_rate": 1.4901688223964871e-05,
"loss": 0.4217,
"step": 989
},
{
"epoch": 3.1203776553894573,
"grad_norm": 0.3285213122250933,
"learning_rate": 1.4859094672562314e-05,
"loss": 0.4578,
"step": 990
},
{
"epoch": 3.123524783634933,
"grad_norm": 0.45616500289860873,
"learning_rate": 1.4816526072858881e-05,
"loss": 0.4545,
"step": 991
},
{
"epoch": 3.126671911880409,
"grad_norm": 0.35479473494648844,
"learning_rate": 1.4773982631463879e-05,
"loss": 0.4504,
"step": 992
},
{
"epoch": 3.129819040125885,
"grad_norm": 0.40625773248569635,
"learning_rate": 1.4731464554864503e-05,
"loss": 0.4594,
"step": 993
},
{
"epoch": 3.132966168371361,
"grad_norm": 0.3417485916649555,
"learning_rate": 1.468897204942485e-05,
"loss": 0.4064,
"step": 994
},
{
"epoch": 3.1361132966168372,
"grad_norm": 0.44443922220128873,
"learning_rate": 1.4646505321384896e-05,
"loss": 0.481,
"step": 995
},
{
"epoch": 3.139260424862313,
"grad_norm": 0.33497076522190666,
"learning_rate": 1.4604064576859513e-05,
"loss": 0.4439,
"step": 996
},
{
"epoch": 3.1424075531077893,
"grad_norm": 0.40460214907111397,
"learning_rate": 1.4561650021837461e-05,
"loss": 0.4535,
"step": 997
},
{
"epoch": 3.145554681353265,
"grad_norm": 0.3548757014421584,
"learning_rate": 1.4519261862180365e-05,
"loss": 0.4305,
"step": 998
},
{
"epoch": 3.1487018095987414,
"grad_norm": 0.39006944874130967,
"learning_rate": 1.447690030362177e-05,
"loss": 0.4605,
"step": 999
},
{
"epoch": 3.151848937844217,
"grad_norm": 0.2947807614670378,
"learning_rate": 1.4434565551766091e-05,
"loss": 0.4375,
"step": 1000
},
{
"epoch": 3.154996066089693,
"grad_norm": 0.37151093567185317,
"learning_rate": 1.4392257812087644e-05,
"loss": 0.4437,
"step": 1001
},
{
"epoch": 3.158143194335169,
"grad_norm": 0.3043566527179534,
"learning_rate": 1.4349977289929639e-05,
"loss": 0.4434,
"step": 1002
},
{
"epoch": 3.161290322580645,
"grad_norm": 0.3239420861001395,
"learning_rate": 1.4307724190503174e-05,
"loss": 0.4602,
"step": 1003
},
{
"epoch": 3.1644374508261213,
"grad_norm": 0.3022915343958959,
"learning_rate": 1.4265498718886263e-05,
"loss": 0.4432,
"step": 1004
},
{
"epoch": 3.167584579071597,
"grad_norm": 0.32662353296688856,
"learning_rate": 1.4223301080022829e-05,
"loss": 0.4655,
"step": 1005
},
{
"epoch": 3.1707317073170733,
"grad_norm": 0.30216819737098594,
"learning_rate": 1.4181131478721679e-05,
"loss": 0.4488,
"step": 1006
},
{
"epoch": 3.173878835562549,
"grad_norm": 0.3194738978026303,
"learning_rate": 1.4138990119655573e-05,
"loss": 0.4664,
"step": 1007
},
{
"epoch": 3.1770259638080254,
"grad_norm": 0.3238992239390259,
"learning_rate": 1.4096877207360172e-05,
"loss": 0.4247,
"step": 1008
},
{
"epoch": 3.180173092053501,
"grad_norm": 0.38746919789270406,
"learning_rate": 1.4054792946233082e-05,
"loss": 0.4439,
"step": 1009
},
{
"epoch": 3.183320220298977,
"grad_norm": 0.31269921601318473,
"learning_rate": 1.4012737540532842e-05,
"loss": 0.4728,
"step": 1010
},
{
"epoch": 3.1864673485444532,
"grad_norm": 0.4583973254397915,
"learning_rate": 1.3970711194377944e-05,
"loss": 0.4425,
"step": 1011
},
{
"epoch": 3.189614476789929,
"grad_norm": 0.3107178198172973,
"learning_rate": 1.3928714111745834e-05,
"loss": 0.4473,
"step": 1012
},
{
"epoch": 3.1927616050354053,
"grad_norm": 0.42357819649857154,
"learning_rate": 1.3886746496471927e-05,
"loss": 0.4447,
"step": 1013
},
{
"epoch": 3.195908733280881,
"grad_norm": 0.337426375904954,
"learning_rate": 1.3844808552248612e-05,
"loss": 0.4678,
"step": 1014
},
{
"epoch": 3.1990558615263573,
"grad_norm": 0.42860966362762376,
"learning_rate": 1.3802900482624275e-05,
"loss": 0.4345,
"step": 1015
},
{
"epoch": 3.202202989771833,
"grad_norm": 0.3879836551122409,
"learning_rate": 1.3761022491002298e-05,
"loss": 0.4367,
"step": 1016
},
{
"epoch": 3.2053501180173094,
"grad_norm": 0.4089210534095531,
"learning_rate": 1.3719174780640087e-05,
"loss": 0.4589,
"step": 1017
},
{
"epoch": 3.208497246262785,
"grad_norm": 0.3593147408746166,
"learning_rate": 1.3677357554648061e-05,
"loss": 0.4503,
"step": 1018
},
{
"epoch": 3.211644374508261,
"grad_norm": 0.31622524795270396,
"learning_rate": 1.36355710159887e-05,
"loss": 0.4558,
"step": 1019
},
{
"epoch": 3.2147915027537373,
"grad_norm": 0.4132536639954306,
"learning_rate": 1.3593815367475518e-05,
"loss": 0.4373,
"step": 1020
},
{
"epoch": 3.217938630999213,
"grad_norm": 0.3606321436614752,
"learning_rate": 1.3552090811772134e-05,
"loss": 0.4504,
"step": 1021
},
{
"epoch": 3.2210857592446893,
"grad_norm": 0.36333999573846615,
"learning_rate": 1.3510397551391212e-05,
"loss": 0.4623,
"step": 1022
},
{
"epoch": 3.224232887490165,
"grad_norm": 0.29550053984309993,
"learning_rate": 1.3468735788693563e-05,
"loss": 0.4287,
"step": 1023
},
{
"epoch": 3.2273800157356414,
"grad_norm": 0.43844592374705915,
"learning_rate": 1.3427105725887098e-05,
"loss": 0.4675,
"step": 1024
},
{
"epoch": 3.230527143981117,
"grad_norm": 0.34132693908368855,
"learning_rate": 1.3385507565025884e-05,
"loss": 0.4495,
"step": 1025
},
{
"epoch": 3.2336742722265934,
"grad_norm": 0.435235912189035,
"learning_rate": 1.334394150800914e-05,
"loss": 0.4518,
"step": 1026
},
{
"epoch": 3.2368214004720692,
"grad_norm": 0.35002633272777456,
"learning_rate": 1.3302407756580278e-05,
"loss": 0.4442,
"step": 1027
},
{
"epoch": 3.239968528717545,
"grad_norm": 0.5022707232854282,
"learning_rate": 1.32609065123259e-05,
"loss": 0.4543,
"step": 1028
},
{
"epoch": 3.2431156569630213,
"grad_norm": 0.3200367426841411,
"learning_rate": 1.3219437976674847e-05,
"loss": 0.4368,
"step": 1029
},
{
"epoch": 3.246262785208497,
"grad_norm": 0.5138494915176968,
"learning_rate": 1.317800235089719e-05,
"loss": 0.464,
"step": 1030
},
{
"epoch": 3.2494099134539733,
"grad_norm": 0.3687799468130162,
"learning_rate": 1.313659983610328e-05,
"loss": 0.4695,
"step": 1031
},
{
"epoch": 3.252557041699449,
"grad_norm": 0.47713155015281183,
"learning_rate": 1.3095230633242761e-05,
"loss": 0.4455,
"step": 1032
},
{
"epoch": 3.2557041699449254,
"grad_norm": 0.37374667859633126,
"learning_rate": 1.3053894943103598e-05,
"loss": 0.4389,
"step": 1033
},
{
"epoch": 3.258851298190401,
"grad_norm": 0.3842402902889691,
"learning_rate": 1.3012592966311091e-05,
"loss": 0.4884,
"step": 1034
},
{
"epoch": 3.2619984264358775,
"grad_norm": 0.3547447427297586,
"learning_rate": 1.2971324903326923e-05,
"loss": 0.4232,
"step": 1035
},
{
"epoch": 3.2651455546813533,
"grad_norm": 0.32157783163039644,
"learning_rate": 1.293009095444816e-05,
"loss": 0.4414,
"step": 1036
},
{
"epoch": 3.2682926829268295,
"grad_norm": 0.3546515803316329,
"learning_rate": 1.2888891319806312e-05,
"loss": 0.4535,
"step": 1037
},
{
"epoch": 3.2714398111723053,
"grad_norm": 0.33391924791649685,
"learning_rate": 1.284772619936632e-05,
"loss": 0.4627,
"step": 1038
},
{
"epoch": 3.274586939417781,
"grad_norm": 0.34842719790074683,
"learning_rate": 1.2806595792925616e-05,
"loss": 0.4348,
"step": 1039
},
{
"epoch": 3.2777340676632574,
"grad_norm": 0.31720262229680574,
"learning_rate": 1.2765500300113163e-05,
"loss": 0.4465,
"step": 1040
},
{
"epoch": 3.280881195908733,
"grad_norm": 0.37969959728816255,
"learning_rate": 1.2724439920388445e-05,
"loss": 0.4726,
"step": 1041
},
{
"epoch": 3.2840283241542094,
"grad_norm": 0.3205555271128335,
"learning_rate": 1.268341485304053e-05,
"loss": 0.4179,
"step": 1042
},
{
"epoch": 3.2871754523996852,
"grad_norm": 0.38765876310575315,
"learning_rate": 1.2642425297187101e-05,
"loss": 0.4647,
"step": 1043
},
{
"epoch": 3.2903225806451615,
"grad_norm": 0.3079512941344033,
"learning_rate": 1.260147145177348e-05,
"loss": 0.4545,
"step": 1044
},
{
"epoch": 3.2934697088906373,
"grad_norm": 0.4183847497019012,
"learning_rate": 1.256055351557167e-05,
"loss": 0.4565,
"step": 1045
},
{
"epoch": 3.2966168371361135,
"grad_norm": 0.3070683731820513,
"learning_rate": 1.2519671687179375e-05,
"loss": 0.4438,
"step": 1046
},
{
"epoch": 3.2997639653815893,
"grad_norm": 0.399817426938315,
"learning_rate": 1.2478826165019053e-05,
"loss": 0.4262,
"step": 1047
},
{
"epoch": 3.302911093627065,
"grad_norm": 0.3547125394607051,
"learning_rate": 1.243801714733696e-05,
"loss": 0.4658,
"step": 1048
},
{
"epoch": 3.3060582218725414,
"grad_norm": 0.3436731206723249,
"learning_rate": 1.2397244832202153e-05,
"loss": 0.4504,
"step": 1049
},
{
"epoch": 3.309205350118017,
"grad_norm": 0.38057310070493267,
"learning_rate": 1.2356509417505573e-05,
"loss": 0.4501,
"step": 1050
},
{
"epoch": 3.3123524783634934,
"grad_norm": 0.27293264885045054,
"learning_rate": 1.231581110095905e-05,
"loss": 0.4407,
"step": 1051
},
{
"epoch": 3.3154996066089693,
"grad_norm": 0.40319367595969435,
"learning_rate": 1.2275150080094348e-05,
"loss": 0.4636,
"step": 1052
},
{
"epoch": 3.3186467348544455,
"grad_norm": 0.31214404066637935,
"learning_rate": 1.2234526552262243e-05,
"loss": 0.4537,
"step": 1053
},
{
"epoch": 3.3217938630999213,
"grad_norm": 0.3196911749183487,
"learning_rate": 1.21939407146315e-05,
"loss": 0.4464,
"step": 1054
},
{
"epoch": 3.3249409913453976,
"grad_norm": 0.43833460528156976,
"learning_rate": 1.2153392764187974e-05,
"loss": 0.4575,
"step": 1055
},
{
"epoch": 3.3280881195908734,
"grad_norm": 0.3320927741275933,
"learning_rate": 1.2112882897733634e-05,
"loss": 0.4548,
"step": 1056
},
{
"epoch": 3.331235247836349,
"grad_norm": 0.38130010599050557,
"learning_rate": 1.2072411311885588e-05,
"loss": 0.4433,
"step": 1057
},
{
"epoch": 3.3343823760818254,
"grad_norm": 0.32380680107010973,
"learning_rate": 1.2031978203075172e-05,
"loss": 0.4331,
"step": 1058
},
{
"epoch": 3.337529504327301,
"grad_norm": 0.36742670403821587,
"learning_rate": 1.1991583767546948e-05,
"loss": 0.4616,
"step": 1059
},
{
"epoch": 3.3406766325727775,
"grad_norm": 0.36279961379407005,
"learning_rate": 1.1951228201357794e-05,
"loss": 0.4571,
"step": 1060
},
{
"epoch": 3.3438237608182533,
"grad_norm": 0.27086173597488794,
"learning_rate": 1.1910911700375924e-05,
"loss": 0.4171,
"step": 1061
},
{
"epoch": 3.3469708890637295,
"grad_norm": 0.40509428003250936,
"learning_rate": 1.1870634460279937e-05,
"loss": 0.4959,
"step": 1062
},
{
"epoch": 3.3501180173092053,
"grad_norm": 0.2788427389494092,
"learning_rate": 1.1830396676557889e-05,
"loss": 0.4259,
"step": 1063
},
{
"epoch": 3.3532651455546816,
"grad_norm": 0.30665073285225036,
"learning_rate": 1.1790198544506333e-05,
"loss": 0.4427,
"step": 1064
},
{
"epoch": 3.3564122738001574,
"grad_norm": 0.32614144159278,
"learning_rate": 1.1750040259229365e-05,
"loss": 0.4537,
"step": 1065
},
{
"epoch": 3.359559402045633,
"grad_norm": 0.3266438015263831,
"learning_rate": 1.170992201563769e-05,
"loss": 0.4392,
"step": 1066
},
{
"epoch": 3.3627065302911094,
"grad_norm": 0.2993872663278503,
"learning_rate": 1.1669844008447654e-05,
"loss": 0.4505,
"step": 1067
},
{
"epoch": 3.3658536585365852,
"grad_norm": 0.2878451066832284,
"learning_rate": 1.1629806432180324e-05,
"loss": 0.4244,
"step": 1068
},
{
"epoch": 3.3690007867820615,
"grad_norm": 0.33355834908904675,
"learning_rate": 1.1589809481160539e-05,
"loss": 0.4556,
"step": 1069
},
{
"epoch": 3.3721479150275373,
"grad_norm": 0.27525931720756097,
"learning_rate": 1.1549853349515939e-05,
"loss": 0.4165,
"step": 1070
},
{
"epoch": 3.3752950432730136,
"grad_norm": 0.31458748223848476,
"learning_rate": 1.1509938231176068e-05,
"loss": 0.4807,
"step": 1071
},
{
"epoch": 3.3784421715184894,
"grad_norm": 0.3033894833023403,
"learning_rate": 1.1470064319871393e-05,
"loss": 0.4384,
"step": 1072
},
{
"epoch": 3.3815892997639656,
"grad_norm": 0.29732541952252484,
"learning_rate": 1.1430231809132386e-05,
"loss": 0.4831,
"step": 1073
},
{
"epoch": 3.3847364280094414,
"grad_norm": 0.29115088294335617,
"learning_rate": 1.139044089228858e-05,
"loss": 0.4428,
"step": 1074
},
{
"epoch": 3.387883556254917,
"grad_norm": 0.296430263712168,
"learning_rate": 1.1350691762467644e-05,
"loss": 0.4515,
"step": 1075
},
{
"epoch": 3.3910306845003935,
"grad_norm": 0.28782588365398537,
"learning_rate": 1.1310984612594394e-05,
"loss": 0.4184,
"step": 1076
},
{
"epoch": 3.3941778127458693,
"grad_norm": 0.3067545371689205,
"learning_rate": 1.1271319635389934e-05,
"loss": 0.4626,
"step": 1077
},
{
"epoch": 3.3973249409913455,
"grad_norm": 0.3161311488832454,
"learning_rate": 1.1231697023370645e-05,
"loss": 0.4504,
"step": 1078
},
{
"epoch": 3.4004720692368213,
"grad_norm": 0.31991819718734327,
"learning_rate": 1.1192116968847313e-05,
"loss": 0.4573,
"step": 1079
},
{
"epoch": 3.4036191974822976,
"grad_norm": 0.2902807527048948,
"learning_rate": 1.1152579663924167e-05,
"loss": 0.4562,
"step": 1080
},
{
"epoch": 3.4067663257277734,
"grad_norm": 0.345254009382499,
"learning_rate": 1.1113085300497919e-05,
"loss": 0.4496,
"step": 1081
},
{
"epoch": 3.4099134539732496,
"grad_norm": 0.27394920734180683,
"learning_rate": 1.1073634070256895e-05,
"loss": 0.4494,
"step": 1082
},
{
"epoch": 3.4130605822187254,
"grad_norm": 0.3148362640218033,
"learning_rate": 1.1034226164680066e-05,
"loss": 0.4688,
"step": 1083
},
{
"epoch": 3.4162077104642012,
"grad_norm": 0.2943354574773677,
"learning_rate": 1.0994861775036101e-05,
"loss": 0.44,
"step": 1084
},
{
"epoch": 3.4193548387096775,
"grad_norm": 0.29129387101628995,
"learning_rate": 1.0955541092382496e-05,
"loss": 0.4464,
"step": 1085
},
{
"epoch": 3.4225019669551533,
"grad_norm": 0.31501719097176634,
"learning_rate": 1.0916264307564574e-05,
"loss": 0.4651,
"step": 1086
},
{
"epoch": 3.4256490952006295,
"grad_norm": 0.33225178849500403,
"learning_rate": 1.0877031611214632e-05,
"loss": 0.4587,
"step": 1087
},
{
"epoch": 3.4287962234461054,
"grad_norm": 0.3345423444622172,
"learning_rate": 1.083784319375097e-05,
"loss": 0.4447,
"step": 1088
},
{
"epoch": 3.4319433516915816,
"grad_norm": 0.27316577067100334,
"learning_rate": 1.0798699245376959e-05,
"loss": 0.4346,
"step": 1089
},
{
"epoch": 3.4350904799370574,
"grad_norm": 0.30011292227050673,
"learning_rate": 1.0759599956080162e-05,
"loss": 0.4725,
"step": 1090
},
{
"epoch": 3.4382376081825337,
"grad_norm": 0.29665774985859605,
"learning_rate": 1.072054551563138e-05,
"loss": 0.4387,
"step": 1091
},
{
"epoch": 3.4413847364280095,
"grad_norm": 0.3039564711038381,
"learning_rate": 1.068153611358372e-05,
"loss": 0.4556,
"step": 1092
},
{
"epoch": 3.4445318646734853,
"grad_norm": 0.30191917389875206,
"learning_rate": 1.0642571939271723e-05,
"loss": 0.4451,
"step": 1093
},
{
"epoch": 3.4476789929189615,
"grad_norm": 0.29199496459557056,
"learning_rate": 1.0603653181810379e-05,
"loss": 0.4725,
"step": 1094
},
{
"epoch": 3.4508261211644373,
"grad_norm": 0.3271566573252477,
"learning_rate": 1.0564780030094272e-05,
"loss": 0.4431,
"step": 1095
},
{
"epoch": 3.4539732494099136,
"grad_norm": 0.27957614050990437,
"learning_rate": 1.0525952672796636e-05,
"loss": 0.4489,
"step": 1096
},
{
"epoch": 3.4571203776553894,
"grad_norm": 0.2911856670468552,
"learning_rate": 1.0487171298368412e-05,
"loss": 0.4611,
"step": 1097
},
{
"epoch": 3.4602675059008656,
"grad_norm": 0.32032706145968165,
"learning_rate": 1.0448436095037385e-05,
"loss": 0.4481,
"step": 1098
},
{
"epoch": 3.4634146341463414,
"grad_norm": 0.32197178012548605,
"learning_rate": 1.040974725080724e-05,
"loss": 0.473,
"step": 1099
},
{
"epoch": 3.4665617623918177,
"grad_norm": 0.3105584033001555,
"learning_rate": 1.0371104953456663e-05,
"loss": 0.468,
"step": 1100
},
{
"epoch": 3.4697088906372935,
"grad_norm": 0.30202604130512095,
"learning_rate": 1.0332509390538404e-05,
"loss": 0.4344,
"step": 1101
},
{
"epoch": 3.4728560188827693,
"grad_norm": 0.30955812132114763,
"learning_rate": 1.0293960749378384e-05,
"loss": 0.4636,
"step": 1102
},
{
"epoch": 3.4760031471282455,
"grad_norm": 0.30252588064347674,
"learning_rate": 1.0255459217074803e-05,
"loss": 0.4607,
"step": 1103
},
{
"epoch": 3.4791502753737213,
"grad_norm": 0.30519810066545155,
"learning_rate": 1.021700498049722e-05,
"loss": 0.4227,
"step": 1104
},
{
"epoch": 3.4822974036191976,
"grad_norm": 0.28348020669441537,
"learning_rate": 1.017859822628561e-05,
"loss": 0.4605,
"step": 1105
},
{
"epoch": 3.4854445318646734,
"grad_norm": 0.28732215547896256,
"learning_rate": 1.0140239140849519e-05,
"loss": 0.4494,
"step": 1106
},
{
"epoch": 3.4885916601101497,
"grad_norm": 0.3173927002942517,
"learning_rate": 1.0101927910367118e-05,
"loss": 0.4524,
"step": 1107
},
{
"epoch": 3.4917387883556255,
"grad_norm": 0.2670843455004391,
"learning_rate": 1.006366472078432e-05,
"loss": 0.4494,
"step": 1108
},
{
"epoch": 3.4948859166011017,
"grad_norm": 0.28811825068841074,
"learning_rate": 1.0025449757813852e-05,
"loss": 0.4524,
"step": 1109
},
{
"epoch": 3.4980330448465775,
"grad_norm": 0.29011073031991363,
"learning_rate": 9.987283206934374e-06,
"loss": 0.4568,
"step": 1110
},
{
"epoch": 3.5011801730920533,
"grad_norm": 0.2651530625627191,
"learning_rate": 9.949165253389588e-06,
"loss": 0.4492,
"step": 1111
},
{
"epoch": 3.5043273013375296,
"grad_norm": 0.28366453872298675,
"learning_rate": 9.911096082187324e-06,
"loss": 0.4428,
"step": 1112
},
{
"epoch": 3.5074744295830054,
"grad_norm": 0.31212089217996963,
"learning_rate": 9.873075878098623e-06,
"loss": 0.4498,
"step": 1113
},
{
"epoch": 3.5106215578284816,
"grad_norm": 0.2839621070001912,
"learning_rate": 9.835104825656884e-06,
"loss": 0.4508,
"step": 1114
},
{
"epoch": 3.5137686860739574,
"grad_norm": 0.3356792889009391,
"learning_rate": 9.797183109156938e-06,
"loss": 0.473,
"step": 1115
},
{
"epoch": 3.5169158143194337,
"grad_norm": 0.2583265687099093,
"learning_rate": 9.759310912654167e-06,
"loss": 0.4527,
"step": 1116
},
{
"epoch": 3.5200629425649095,
"grad_norm": 0.3088294300859714,
"learning_rate": 9.72148841996359e-06,
"loss": 0.4491,
"step": 1117
},
{
"epoch": 3.5232100708103857,
"grad_norm": 0.3100355304231426,
"learning_rate": 9.683715814658987e-06,
"loss": 0.4532,
"step": 1118
},
{
"epoch": 3.5263571990558615,
"grad_norm": 0.28230509785485414,
"learning_rate": 9.645993280072021e-06,
"loss": 0.442,
"step": 1119
},
{
"epoch": 3.5295043273013373,
"grad_norm": 0.302124749981287,
"learning_rate": 9.608320999291333e-06,
"loss": 0.4408,
"step": 1120
},
{
"epoch": 3.5326514555468136,
"grad_norm": 0.31719529084230913,
"learning_rate": 9.570699155161633e-06,
"loss": 0.4501,
"step": 1121
},
{
"epoch": 3.5357985837922894,
"grad_norm": 0.33943970273737095,
"learning_rate": 9.533127930282855e-06,
"loss": 0.4703,
"step": 1122
},
{
"epoch": 3.5389457120377656,
"grad_norm": 0.37639643321932154,
"learning_rate": 9.49560750700924e-06,
"loss": 0.4608,
"step": 1123
},
{
"epoch": 3.5420928402832415,
"grad_norm": 0.26960896872512685,
"learning_rate": 9.458138067448469e-06,
"loss": 0.4265,
"step": 1124
},
{
"epoch": 3.5452399685287177,
"grad_norm": 0.32993014271365495,
"learning_rate": 9.420719793460758e-06,
"loss": 0.4337,
"step": 1125
},
{
"epoch": 3.5483870967741935,
"grad_norm": 0.34626577482812293,
"learning_rate": 9.383352866657987e-06,
"loss": 0.4766,
"step": 1126
},
{
"epoch": 3.5515342250196698,
"grad_norm": 0.30347167116503154,
"learning_rate": 9.346037468402831e-06,
"loss": 0.4318,
"step": 1127
},
{
"epoch": 3.5546813532651456,
"grad_norm": 0.338884524041918,
"learning_rate": 9.308773779807863e-06,
"loss": 0.4561,
"step": 1128
},
{
"epoch": 3.5578284815106214,
"grad_norm": 0.2816135680779377,
"learning_rate": 9.271561981734687e-06,
"loss": 0.4462,
"step": 1129
},
{
"epoch": 3.5609756097560976,
"grad_norm": 0.3152958510112679,
"learning_rate": 9.234402254793033e-06,
"loss": 0.4359,
"step": 1130
},
{
"epoch": 3.5641227380015734,
"grad_norm": 0.28881402590713195,
"learning_rate": 9.19729477933992e-06,
"loss": 0.4623,
"step": 1131
},
{
"epoch": 3.5672698662470497,
"grad_norm": 0.2654482197647601,
"learning_rate": 9.16023973547876e-06,
"loss": 0.4351,
"step": 1132
},
{
"epoch": 3.5704169944925255,
"grad_norm": 0.2793158707050921,
"learning_rate": 9.123237303058474e-06,
"loss": 0.4537,
"step": 1133
},
{
"epoch": 3.5735641227380017,
"grad_norm": 0.32371965320338886,
"learning_rate": 9.086287661672629e-06,
"loss": 0.4293,
"step": 1134
},
{
"epoch": 3.5767112509834775,
"grad_norm": 0.3088971938054911,
"learning_rate": 9.049390990658579e-06,
"loss": 0.4696,
"step": 1135
},
{
"epoch": 3.579858379228954,
"grad_norm": 0.2754420783162646,
"learning_rate": 9.012547469096584e-06,
"loss": 0.4503,
"step": 1136
},
{
"epoch": 3.5830055074744296,
"grad_norm": 0.2897270638504705,
"learning_rate": 8.975757275808936e-06,
"loss": 0.4504,
"step": 1137
},
{
"epoch": 3.5861526357199054,
"grad_norm": 0.32953833717635256,
"learning_rate": 8.93902058935908e-06,
"loss": 0.4516,
"step": 1138
},
{
"epoch": 3.5892997639653816,
"grad_norm": 0.293563094341784,
"learning_rate": 8.902337588050783e-06,
"loss": 0.4605,
"step": 1139
},
{
"epoch": 3.5924468922108574,
"grad_norm": 0.28541037221224175,
"learning_rate": 8.865708449927241e-06,
"loss": 0.4385,
"step": 1140
},
{
"epoch": 3.5955940204563337,
"grad_norm": 0.2769667048762866,
"learning_rate": 8.829133352770236e-06,
"loss": 0.4528,
"step": 1141
},
{
"epoch": 3.5987411487018095,
"grad_norm": 0.3063282915245064,
"learning_rate": 8.792612474099213e-06,
"loss": 0.4595,
"step": 1142
},
{
"epoch": 3.6018882769472857,
"grad_norm": 0.27297998759691816,
"learning_rate": 8.756145991170513e-06,
"loss": 0.4387,
"step": 1143
},
{
"epoch": 3.6050354051927616,
"grad_norm": 0.29288749403302455,
"learning_rate": 8.719734080976441e-06,
"loss": 0.4522,
"step": 1144
},
{
"epoch": 3.608182533438238,
"grad_norm": 0.26484230856498,
"learning_rate": 8.683376920244446e-06,
"loss": 0.4269,
"step": 1145
},
{
"epoch": 3.6113296616837136,
"grad_norm": 0.2594230460233364,
"learning_rate": 8.647074685436223e-06,
"loss": 0.4692,
"step": 1146
},
{
"epoch": 3.6144767899291894,
"grad_norm": 0.3145876246673129,
"learning_rate": 8.610827552746897e-06,
"loss": 0.4552,
"step": 1147
},
{
"epoch": 3.6176239181746657,
"grad_norm": 0.2953504368805096,
"learning_rate": 8.57463569810415e-06,
"loss": 0.4543,
"step": 1148
},
{
"epoch": 3.6207710464201415,
"grad_norm": 0.2903789889032914,
"learning_rate": 8.538499297167385e-06,
"loss": 0.4324,
"step": 1149
},
{
"epoch": 3.6239181746656177,
"grad_norm": 0.2951350855679963,
"learning_rate": 8.502418525326801e-06,
"loss": 0.4382,
"step": 1150
},
{
"epoch": 3.6270653029110935,
"grad_norm": 0.28051588971875474,
"learning_rate": 8.466393557702659e-06,
"loss": 0.4487,
"step": 1151
},
{
"epoch": 3.6302124311565698,
"grad_norm": 0.32378806547935485,
"learning_rate": 8.430424569144345e-06,
"loss": 0.4541,
"step": 1152
},
{
"epoch": 3.6333595594020456,
"grad_norm": 0.33107209363046897,
"learning_rate": 8.394511734229556e-06,
"loss": 0.465,
"step": 1153
},
{
"epoch": 3.636506687647522,
"grad_norm": 0.31121756126159067,
"learning_rate": 8.358655227263424e-06,
"loss": 0.4587,
"step": 1154
},
{
"epoch": 3.6396538158929976,
"grad_norm": 0.3283802721771546,
"learning_rate": 8.322855222277708e-06,
"loss": 0.4456,
"step": 1155
},
{
"epoch": 3.6428009441384734,
"grad_norm": 0.2377363192142234,
"learning_rate": 8.287111893029929e-06,
"loss": 0.4455,
"step": 1156
},
{
"epoch": 3.6459480723839497,
"grad_norm": 0.2932521033552982,
"learning_rate": 8.251425413002534e-06,
"loss": 0.4541,
"step": 1157
},
{
"epoch": 3.6490952006294255,
"grad_norm": 0.2805611062166459,
"learning_rate": 8.215795955402032e-06,
"loss": 0.4489,
"step": 1158
},
{
"epoch": 3.6522423288749017,
"grad_norm": 0.2684630930767302,
"learning_rate": 8.180223693158175e-06,
"loss": 0.4418,
"step": 1159
},
{
"epoch": 3.6553894571203775,
"grad_norm": 0.2833671321964946,
"learning_rate": 8.144708798923125e-06,
"loss": 0.4673,
"step": 1160
},
{
"epoch": 3.658536585365854,
"grad_norm": 0.2587422980629999,
"learning_rate": 8.109251445070602e-06,
"loss": 0.4253,
"step": 1161
},
{
"epoch": 3.6616837136113296,
"grad_norm": 0.2886779130244593,
"learning_rate": 8.073851803695033e-06,
"loss": 0.4686,
"step": 1162
},
{
"epoch": 3.664830841856806,
"grad_norm": 0.2677528808996215,
"learning_rate": 8.03851004661076e-06,
"loss": 0.4403,
"step": 1163
},
{
"epoch": 3.6679779701022817,
"grad_norm": 0.25135806823271106,
"learning_rate": 8.003226345351161e-06,
"loss": 0.4565,
"step": 1164
},
{
"epoch": 3.6711250983477575,
"grad_norm": 0.26013779523809955,
"learning_rate": 7.968000871167849e-06,
"loss": 0.4469,
"step": 1165
},
{
"epoch": 3.6742722265932337,
"grad_norm": 0.2602422338025341,
"learning_rate": 7.93283379502982e-06,
"loss": 0.4506,
"step": 1166
},
{
"epoch": 3.6774193548387095,
"grad_norm": 0.2526902487256982,
"learning_rate": 7.897725287622625e-06,
"loss": 0.4476,
"step": 1167
},
{
"epoch": 3.6805664830841858,
"grad_norm": 0.2788501883728098,
"learning_rate": 7.862675519347562e-06,
"loss": 0.4648,
"step": 1168
},
{
"epoch": 3.6837136113296616,
"grad_norm": 0.26375904220072866,
"learning_rate": 7.827684660320832e-06,
"loss": 0.4482,
"step": 1169
},
{
"epoch": 3.686860739575138,
"grad_norm": 0.27321586538638337,
"learning_rate": 7.792752880372718e-06,
"loss": 0.456,
"step": 1170
},
{
"epoch": 3.6900078678206136,
"grad_norm": 0.2605976271837837,
"learning_rate": 7.757880349046742e-06,
"loss": 0.4167,
"step": 1171
},
{
"epoch": 3.69315499606609,
"grad_norm": 0.3033078670365782,
"learning_rate": 7.723067235598882e-06,
"loss": 0.4889,
"step": 1172
},
{
"epoch": 3.6963021243115657,
"grad_norm": 0.285808272646413,
"learning_rate": 7.688313708996724e-06,
"loss": 0.4731,
"step": 1173
},
{
"epoch": 3.6994492525570415,
"grad_norm": 0.25978835534209727,
"learning_rate": 7.653619937918633e-06,
"loss": 0.4417,
"step": 1174
},
{
"epoch": 3.7025963808025177,
"grad_norm": 0.291667100739017,
"learning_rate": 7.618986090752944e-06,
"loss": 0.4479,
"step": 1175
},
{
"epoch": 3.7057435090479935,
"grad_norm": 0.27599608659380026,
"learning_rate": 7.584412335597164e-06,
"loss": 0.4521,
"step": 1176
},
{
"epoch": 3.70889063729347,
"grad_norm": 0.28827831234418355,
"learning_rate": 7.5498988402571375e-06,
"loss": 0.4696,
"step": 1177
},
{
"epoch": 3.7120377655389456,
"grad_norm": 0.28088824035080084,
"learning_rate": 7.515445772246233e-06,
"loss": 0.4429,
"step": 1178
},
{
"epoch": 3.715184893784422,
"grad_norm": 0.27258796059875257,
"learning_rate": 7.481053298784513e-06,
"loss": 0.4414,
"step": 1179
},
{
"epoch": 3.7183320220298977,
"grad_norm": 0.2633113146422214,
"learning_rate": 7.446721586797965e-06,
"loss": 0.4539,
"step": 1180
},
{
"epoch": 3.721479150275374,
"grad_norm": 0.28264104854960853,
"learning_rate": 7.4124508029176634e-06,
"loss": 0.4691,
"step": 1181
},
{
"epoch": 3.7246262785208497,
"grad_norm": 0.26255314972538524,
"learning_rate": 7.37824111347895e-06,
"loss": 0.4478,
"step": 1182
},
{
"epoch": 3.7277734067663255,
"grad_norm": 0.25393959149162687,
"learning_rate": 7.344092684520647e-06,
"loss": 0.4294,
"step": 1183
},
{
"epoch": 3.7309205350118018,
"grad_norm": 0.26457063732023456,
"learning_rate": 7.310005681784249e-06,
"loss": 0.4511,
"step": 1184
},
{
"epoch": 3.7340676632572776,
"grad_norm": 0.2980208241476433,
"learning_rate": 7.275980270713116e-06,
"loss": 0.45,
"step": 1185
},
{
"epoch": 3.737214791502754,
"grad_norm": 0.2619386826593012,
"learning_rate": 7.242016616451675e-06,
"loss": 0.4706,
"step": 1186
},
{
"epoch": 3.7403619197482296,
"grad_norm": 0.25726156936586797,
"learning_rate": 7.208114883844585e-06,
"loss": 0.4346,
"step": 1187
},
{
"epoch": 3.743509047993706,
"grad_norm": 0.30187535227273843,
"learning_rate": 7.174275237435995e-06,
"loss": 0.4524,
"step": 1188
},
{
"epoch": 3.7466561762391817,
"grad_norm": 0.24968257054319973,
"learning_rate": 7.140497841468708e-06,
"loss": 0.4377,
"step": 1189
},
{
"epoch": 3.749803304484658,
"grad_norm": 0.2898224004095643,
"learning_rate": 7.106782859883377e-06,
"loss": 0.463,
"step": 1190
},
{
"epoch": 3.7529504327301337,
"grad_norm": 0.2684756971084031,
"learning_rate": 7.073130456317728e-06,
"loss": 0.4401,
"step": 1191
},
{
"epoch": 3.7560975609756095,
"grad_norm": 0.2782685168228126,
"learning_rate": 7.03954079410577e-06,
"loss": 0.4541,
"step": 1192
},
{
"epoch": 3.759244689221086,
"grad_norm": 0.2685406178964316,
"learning_rate": 7.0060140362769866e-06,
"loss": 0.4622,
"step": 1193
},
{
"epoch": 3.762391817466562,
"grad_norm": 0.2734876298756259,
"learning_rate": 6.97255034555556e-06,
"loss": 0.4446,
"step": 1194
},
{
"epoch": 3.765538945712038,
"grad_norm": 0.2361780988310677,
"learning_rate": 6.939149884359548e-06,
"loss": 0.4466,
"step": 1195
},
{
"epoch": 3.7686860739575136,
"grad_norm": 0.2392787383755442,
"learning_rate": 6.905812814800148e-06,
"loss": 0.459,
"step": 1196
},
{
"epoch": 3.77183320220299,
"grad_norm": 0.2649448309081467,
"learning_rate": 6.872539298680874e-06,
"loss": 0.4605,
"step": 1197
},
{
"epoch": 3.7749803304484657,
"grad_norm": 0.25909568260718674,
"learning_rate": 6.8393294974967624e-06,
"loss": 0.4326,
"step": 1198
},
{
"epoch": 3.778127458693942,
"grad_norm": 0.2469264004719789,
"learning_rate": 6.806183572433634e-06,
"loss": 0.4558,
"step": 1199
},
{
"epoch": 3.7812745869394178,
"grad_norm": 0.2659864809268254,
"learning_rate": 6.773101684367253e-06,
"loss": 0.4501,
"step": 1200
},
{
"epoch": 3.7844217151848936,
"grad_norm": 0.22605570719334842,
"learning_rate": 6.740083993862599e-06,
"loss": 0.4285,
"step": 1201
},
{
"epoch": 3.78756884343037,
"grad_norm": 0.2568671201381285,
"learning_rate": 6.7071306611730605e-06,
"loss": 0.4559,
"step": 1202
},
{
"epoch": 3.790715971675846,
"grad_norm": 0.2717829883222361,
"learning_rate": 6.674241846239647e-06,
"loss": 0.4506,
"step": 1203
},
{
"epoch": 3.793863099921322,
"grad_norm": 0.24909097103686217,
"learning_rate": 6.641417708690243e-06,
"loss": 0.4466,
"step": 1204
},
{
"epoch": 3.7970102281667977,
"grad_norm": 0.2745372511795683,
"learning_rate": 6.60865840783882e-06,
"loss": 0.4577,
"step": 1205
},
{
"epoch": 3.800157356412274,
"grad_norm": 0.24699266487226804,
"learning_rate": 6.575964102684638e-06,
"loss": 0.4268,
"step": 1206
},
{
"epoch": 3.8033044846577497,
"grad_norm": 0.2809825663762849,
"learning_rate": 6.543334951911524e-06,
"loss": 0.4724,
"step": 1207
},
{
"epoch": 3.806451612903226,
"grad_norm": 0.23191080827372706,
"learning_rate": 6.510771113887051e-06,
"loss": 0.426,
"step": 1208
},
{
"epoch": 3.809598741148702,
"grad_norm": 0.2548225394219185,
"learning_rate": 6.478272746661807e-06,
"loss": 0.4324,
"step": 1209
},
{
"epoch": 3.8127458693941776,
"grad_norm": 0.25442728582670243,
"learning_rate": 6.445840007968615e-06,
"loss": 0.4581,
"step": 1210
},
{
"epoch": 3.815892997639654,
"grad_norm": 0.23164636026813668,
"learning_rate": 6.4134730552217505e-06,
"loss": 0.4377,
"step": 1211
},
{
"epoch": 3.81904012588513,
"grad_norm": 0.2506709647270749,
"learning_rate": 6.3811720455162066e-06,
"loss": 0.4449,
"step": 1212
},
{
"epoch": 3.822187254130606,
"grad_norm": 0.24540007687497106,
"learning_rate": 6.348937135626922e-06,
"loss": 0.4375,
"step": 1213
},
{
"epoch": 3.8253343823760817,
"grad_norm": 0.25688702433945887,
"learning_rate": 6.3167684820079935e-06,
"loss": 0.4532,
"step": 1214
},
{
"epoch": 3.828481510621558,
"grad_norm": 0.25389579407575735,
"learning_rate": 6.284666240791964e-06,
"loss": 0.462,
"step": 1215
},
{
"epoch": 3.8316286388670338,
"grad_norm": 0.2518855884586944,
"learning_rate": 6.25263056778902e-06,
"loss": 0.4383,
"step": 1216
},
{
"epoch": 3.83477576711251,
"grad_norm": 0.24076981553815266,
"learning_rate": 6.220661618486268e-06,
"loss": 0.4448,
"step": 1217
},
{
"epoch": 3.837922895357986,
"grad_norm": 0.28662980317279635,
"learning_rate": 6.188759548046966e-06,
"loss": 0.475,
"step": 1218
},
{
"epoch": 3.8410700236034616,
"grad_norm": 0.273283179790291,
"learning_rate": 6.156924511309772e-06,
"loss": 0.454,
"step": 1219
},
{
"epoch": 3.844217151848938,
"grad_norm": 0.23538840471581582,
"learning_rate": 6.125156662787974e-06,
"loss": 0.4304,
"step": 1220
},
{
"epoch": 3.847364280094414,
"grad_norm": 0.2696502178915839,
"learning_rate": 6.093456156668789e-06,
"loss": 0.4327,
"step": 1221
},
{
"epoch": 3.85051140833989,
"grad_norm": 0.24916492807660676,
"learning_rate": 6.061823146812551e-06,
"loss": 0.4632,
"step": 1222
},
{
"epoch": 3.8536585365853657,
"grad_norm": 0.26232676880028327,
"learning_rate": 6.030257786752025e-06,
"loss": 0.4574,
"step": 1223
},
{
"epoch": 3.856805664830842,
"grad_norm": 0.26236306850412155,
"learning_rate": 5.998760229691609e-06,
"loss": 0.4518,
"step": 1224
},
{
"epoch": 3.859952793076318,
"grad_norm": 0.2532097902831472,
"learning_rate": 5.9673306285066334e-06,
"loss": 0.4417,
"step": 1225
},
{
"epoch": 3.863099921321794,
"grad_norm": 0.25795699902561314,
"learning_rate": 5.935969135742594e-06,
"loss": 0.4697,
"step": 1226
},
{
"epoch": 3.86624704956727,
"grad_norm": 0.25127140846223195,
"learning_rate": 5.904675903614423e-06,
"loss": 0.4405,
"step": 1227
},
{
"epoch": 3.8693941778127456,
"grad_norm": 0.2639369068470998,
"learning_rate": 5.87345108400573e-06,
"loss": 0.4501,
"step": 1228
},
{
"epoch": 3.872541306058222,
"grad_norm": 0.26587113576108584,
"learning_rate": 5.842294828468103e-06,
"loss": 0.4563,
"step": 1229
},
{
"epoch": 3.875688434303698,
"grad_norm": 0.25849684097964204,
"learning_rate": 5.811207288220324e-06,
"loss": 0.4448,
"step": 1230
},
{
"epoch": 3.878835562549174,
"grad_norm": 0.2452714319381085,
"learning_rate": 5.78018861414769e-06,
"loss": 0.4528,
"step": 1231
},
{
"epoch": 3.8819826907946497,
"grad_norm": 0.2979932615250963,
"learning_rate": 5.749238956801224e-06,
"loss": 0.4604,
"step": 1232
},
{
"epoch": 3.885129819040126,
"grad_norm": 0.2878800183330725,
"learning_rate": 5.718358466396989e-06,
"loss": 0.4737,
"step": 1233
},
{
"epoch": 3.888276947285602,
"grad_norm": 0.2706243109140571,
"learning_rate": 5.6875472928153406e-06,
"loss": 0.4477,
"step": 1234
},
{
"epoch": 3.891424075531078,
"grad_norm": 0.2851474406358009,
"learning_rate": 5.656805585600205e-06,
"loss": 0.4393,
"step": 1235
},
{
"epoch": 3.894571203776554,
"grad_norm": 0.25037111117408634,
"learning_rate": 5.626133493958326e-06,
"loss": 0.4413,
"step": 1236
},
{
"epoch": 3.8977183320220297,
"grad_norm": 0.251141291032625,
"learning_rate": 5.595531166758597e-06,
"loss": 0.4342,
"step": 1237
},
{
"epoch": 3.900865460267506,
"grad_norm": 0.28369489479084636,
"learning_rate": 5.564998752531274e-06,
"loss": 0.4535,
"step": 1238
},
{
"epoch": 3.904012588512982,
"grad_norm": 0.2642259062646044,
"learning_rate": 5.534536399467314e-06,
"loss": 0.4641,
"step": 1239
},
{
"epoch": 3.907159716758458,
"grad_norm": 0.2662150854932771,
"learning_rate": 5.504144255417605e-06,
"loss": 0.4522,
"step": 1240
},
{
"epoch": 3.9103068450039338,
"grad_norm": 0.2562572069358635,
"learning_rate": 5.473822467892283e-06,
"loss": 0.4521,
"step": 1241
},
{
"epoch": 3.91345397324941,
"grad_norm": 0.25491920921375993,
"learning_rate": 5.443571184060003e-06,
"loss": 0.4441,
"step": 1242
},
{
"epoch": 3.916601101494886,
"grad_norm": 0.27267676966274623,
"learning_rate": 5.413390550747235e-06,
"loss": 0.455,
"step": 1243
},
{
"epoch": 3.919748229740362,
"grad_norm": 0.25033486517798026,
"learning_rate": 5.383280714437518e-06,
"loss": 0.4448,
"step": 1244
},
{
"epoch": 3.922895357985838,
"grad_norm": 0.2538139344130827,
"learning_rate": 5.3532418212708005e-06,
"loss": 0.4566,
"step": 1245
},
{
"epoch": 3.9260424862313137,
"grad_norm": 0.2393368548565171,
"learning_rate": 5.323274017042679e-06,
"loss": 0.4422,
"step": 1246
},
{
"epoch": 3.92918961447679,
"grad_norm": 0.2641138860959141,
"learning_rate": 5.293377447203736e-06,
"loss": 0.4545,
"step": 1247
},
{
"epoch": 3.932336742722266,
"grad_norm": 0.28051370043311474,
"learning_rate": 5.2635522568588104e-06,
"loss": 0.4439,
"step": 1248
},
{
"epoch": 3.935483870967742,
"grad_norm": 0.2668627514694695,
"learning_rate": 5.233798590766279e-06,
"loss": 0.4441,
"step": 1249
},
{
"epoch": 3.938630999213218,
"grad_norm": 0.27446406089755127,
"learning_rate": 5.204116593337391e-06,
"loss": 0.4736,
"step": 1250
},
{
"epoch": 3.941778127458694,
"grad_norm": 0.24657020997355675,
"learning_rate": 5.174506408635549e-06,
"loss": 0.4403,
"step": 1251
},
{
"epoch": 3.94492525570417,
"grad_norm": 0.2539033223879348,
"learning_rate": 5.144968180375582e-06,
"loss": 0.454,
"step": 1252
},
{
"epoch": 3.948072383949646,
"grad_norm": 0.2688344127945621,
"learning_rate": 5.115502051923107e-06,
"loss": 0.4385,
"step": 1253
},
{
"epoch": 3.951219512195122,
"grad_norm": 0.24967312836049135,
"learning_rate": 5.08610816629377e-06,
"loss": 0.4593,
"step": 1254
},
{
"epoch": 3.9543666404405977,
"grad_norm": 0.2503341374401903,
"learning_rate": 5.056786666152607e-06,
"loss": 0.4561,
"step": 1255
},
{
"epoch": 3.957513768686074,
"grad_norm": 0.24358898694391512,
"learning_rate": 5.027537693813318e-06,
"loss": 0.4299,
"step": 1256
},
{
"epoch": 3.96066089693155,
"grad_norm": 0.25400102773666905,
"learning_rate": 4.998361391237572e-06,
"loss": 0.4567,
"step": 1257
},
{
"epoch": 3.963808025177026,
"grad_norm": 0.25044577158164033,
"learning_rate": 4.96925790003435e-06,
"loss": 0.4395,
"step": 1258
},
{
"epoch": 3.966955153422502,
"grad_norm": 0.2851929917597238,
"learning_rate": 4.940227361459235e-06,
"loss": 0.4441,
"step": 1259
},
{
"epoch": 3.970102281667978,
"grad_norm": 0.2760376789615726,
"learning_rate": 4.9112699164137125e-06,
"loss": 0.4564,
"step": 1260
},
{
"epoch": 3.973249409913454,
"grad_norm": 0.24822387612918478,
"learning_rate": 4.88238570544453e-06,
"loss": 0.4367,
"step": 1261
},
{
"epoch": 3.97639653815893,
"grad_norm": 0.26813541769717697,
"learning_rate": 4.8535748687429626e-06,
"loss": 0.4568,
"step": 1262
},
{
"epoch": 3.979543666404406,
"grad_norm": 0.2741691569090349,
"learning_rate": 4.824837546144183e-06,
"loss": 0.4307,
"step": 1263
},
{
"epoch": 3.9826907946498817,
"grad_norm": 0.2638749468978124,
"learning_rate": 4.796173877126547e-06,
"loss": 0.4514,
"step": 1264
},
{
"epoch": 3.985837922895358,
"grad_norm": 0.2698153680403995,
"learning_rate": 4.767584000810923e-06,
"loss": 0.447,
"step": 1265
},
{
"epoch": 3.9889850511408342,
"grad_norm": 0.25760541199532266,
"learning_rate": 4.73906805596003e-06,
"loss": 0.4569,
"step": 1266
},
{
"epoch": 3.99213217938631,
"grad_norm": 0.27869142570317007,
"learning_rate": 4.7106261809777555e-06,
"loss": 0.4505,
"step": 1267
},
{
"epoch": 3.995279307631786,
"grad_norm": 0.2668849867730668,
"learning_rate": 4.682258513908491e-06,
"loss": 0.4303,
"step": 1268
},
{
"epoch": 3.998426435877262,
"grad_norm": 0.5231563669429266,
"learning_rate": 4.6539651924364386e-06,
"loss": 0.7924,
"step": 1269
},
{
"epoch": 4.003147128245476,
"grad_norm": 0.39985400842560725,
"learning_rate": 4.6257463538849634e-06,
"loss": 0.4139,
"step": 1270
},
{
"epoch": 4.006294256490952,
"grad_norm": 0.34856593564315125,
"learning_rate": 4.5976021352159354e-06,
"loss": 0.4066,
"step": 1271
},
{
"epoch": 4.009441384736428,
"grad_norm": 0.2697848208316017,
"learning_rate": 4.569532673029049e-06,
"loss": 0.405,
"step": 1272
},
{
"epoch": 4.012588512981904,
"grad_norm": 0.36536846636389503,
"learning_rate": 4.541538103561147e-06,
"loss": 0.4099,
"step": 1273
},
{
"epoch": 4.01573564122738,
"grad_norm": 0.3822880389205967,
"learning_rate": 4.5136185626855974e-06,
"loss": 0.4115,
"step": 1274
},
{
"epoch": 4.018882769472856,
"grad_norm": 0.30382860641169,
"learning_rate": 4.4857741859116024e-06,
"loss": 0.406,
"step": 1275
},
{
"epoch": 4.022029897718332,
"grad_norm": 0.3476521357529064,
"learning_rate": 4.458005108383554e-06,
"loss": 0.3933,
"step": 1276
},
{
"epoch": 4.025177025963808,
"grad_norm": 0.38110304808948026,
"learning_rate": 4.430311464880368e-06,
"loss": 0.4128,
"step": 1277
},
{
"epoch": 4.028324154209284,
"grad_norm": 0.3408670039383935,
"learning_rate": 4.402693389814838e-06,
"loss": 0.4046,
"step": 1278
},
{
"epoch": 4.03147128245476,
"grad_norm": 0.28165306615311936,
"learning_rate": 4.3751510172329854e-06,
"loss": 0.4132,
"step": 1279
},
{
"epoch": 4.034618410700236,
"grad_norm": 0.3035311504325217,
"learning_rate": 4.347684480813412e-06,
"loss": 0.4183,
"step": 1280
},
{
"epoch": 4.037765538945712,
"grad_norm": 0.3209565209551781,
"learning_rate": 4.3202939138666225e-06,
"loss": 0.4101,
"step": 1281
},
{
"epoch": 4.040912667191188,
"grad_norm": 0.3182494739242946,
"learning_rate": 4.292979449334423e-06,
"loss": 0.4003,
"step": 1282
},
{
"epoch": 4.044059795436664,
"grad_norm": 0.28165910027825686,
"learning_rate": 4.265741219789234e-06,
"loss": 0.4075,
"step": 1283
},
{
"epoch": 4.04720692368214,
"grad_norm": 0.2984326713469533,
"learning_rate": 4.2385793574334834e-06,
"loss": 0.3958,
"step": 1284
},
{
"epoch": 4.050354051927616,
"grad_norm": 0.2800971603115525,
"learning_rate": 4.211493994098928e-06,
"loss": 0.4103,
"step": 1285
},
{
"epoch": 4.053501180173092,
"grad_norm": 0.28810828941236666,
"learning_rate": 4.184485261246032e-06,
"loss": 0.4067,
"step": 1286
},
{
"epoch": 4.056648308418568,
"grad_norm": 0.27485623439753526,
"learning_rate": 4.157553289963343e-06,
"loss": 0.408,
"step": 1287
},
{
"epoch": 4.059795436664044,
"grad_norm": 0.27989550341259173,
"learning_rate": 4.130698210966839e-06,
"loss": 0.3983,
"step": 1288
},
{
"epoch": 4.06294256490952,
"grad_norm": 0.28334021515417757,
"learning_rate": 4.103920154599282e-06,
"loss": 0.414,
"step": 1289
},
{
"epoch": 4.066089693154996,
"grad_norm": 0.26098813310869456,
"learning_rate": 4.0772192508296136e-06,
"loss": 0.3966,
"step": 1290
},
{
"epoch": 4.069236821400472,
"grad_norm": 0.28050338178042683,
"learning_rate": 4.0505956292523116e-06,
"loss": 0.412,
"step": 1291
},
{
"epoch": 4.072383949645948,
"grad_norm": 0.2505929913804565,
"learning_rate": 4.024049419086755e-06,
"loss": 0.4124,
"step": 1292
},
{
"epoch": 4.075531077891424,
"grad_norm": 0.2761708626349122,
"learning_rate": 3.997580749176597e-06,
"loss": 0.4075,
"step": 1293
},
{
"epoch": 4.0786782061369005,
"grad_norm": 0.27161068055765497,
"learning_rate": 3.9711897479891485e-06,
"loss": 0.4008,
"step": 1294
},
{
"epoch": 4.081825334382376,
"grad_norm": 0.2858129658437941,
"learning_rate": 3.944876543614753e-06,
"loss": 0.4007,
"step": 1295
},
{
"epoch": 4.084972462627852,
"grad_norm": 0.29589833337185484,
"learning_rate": 3.918641263766163e-06,
"loss": 0.4125,
"step": 1296
},
{
"epoch": 4.088119590873328,
"grad_norm": 0.2588524865322138,
"learning_rate": 3.892484035777921e-06,
"loss": 0.4089,
"step": 1297
},
{
"epoch": 4.091266719118804,
"grad_norm": 0.26760010167593534,
"learning_rate": 3.866404986605728e-06,
"loss": 0.4146,
"step": 1298
},
{
"epoch": 4.09441384736428,
"grad_norm": 0.29495649911223,
"learning_rate": 3.840404242825848e-06,
"loss": 0.406,
"step": 1299
},
{
"epoch": 4.097560975609756,
"grad_norm": 0.26662575099279967,
"learning_rate": 3.814481930634497e-06,
"loss": 0.4031,
"step": 1300
},
{
"epoch": 4.100708103855232,
"grad_norm": 0.2583645865911736,
"learning_rate": 3.7886381758471944e-06,
"loss": 0.4059,
"step": 1301
},
{
"epoch": 4.103855232100708,
"grad_norm": 0.25557494028108535,
"learning_rate": 3.7628731038981856e-06,
"loss": 0.4055,
"step": 1302
},
{
"epoch": 4.1070023603461845,
"grad_norm": 0.2837872643694464,
"learning_rate": 3.7371868398398346e-06,
"loss": 0.413,
"step": 1303
},
{
"epoch": 4.11014948859166,
"grad_norm": 0.2513483558936818,
"learning_rate": 3.7115795083419937e-06,
"loss": 0.4096,
"step": 1304
},
{
"epoch": 4.113296616837136,
"grad_norm": 0.2512125129099929,
"learning_rate": 3.6860512336914256e-06,
"loss": 0.409,
"step": 1305
},
{
"epoch": 4.116443745082612,
"grad_norm": 0.2576029104676369,
"learning_rate": 3.6606021397911605e-06,
"loss": 0.3965,
"step": 1306
},
{
"epoch": 4.119590873328088,
"grad_norm": 0.3077600696852511,
"learning_rate": 3.635232350159945e-06,
"loss": 0.4017,
"step": 1307
},
{
"epoch": 4.122738001573564,
"grad_norm": 0.30096423158758295,
"learning_rate": 3.6099419879316065e-06,
"loss": 0.4059,
"step": 1308
},
{
"epoch": 4.12588512981904,
"grad_norm": 0.241419047105453,
"learning_rate": 3.584731175854479e-06,
"loss": 0.4057,
"step": 1309
},
{
"epoch": 4.129032258064516,
"grad_norm": 0.3067465629753672,
"learning_rate": 3.559600036290762e-06,
"loss": 0.4057,
"step": 1310
},
{
"epoch": 4.132179386309992,
"grad_norm": 0.28376888195603434,
"learning_rate": 3.5345486912159954e-06,
"loss": 0.4033,
"step": 1311
},
{
"epoch": 4.1353265145554685,
"grad_norm": 0.23183080156041488,
"learning_rate": 3.5095772622184177e-06,
"loss": 0.4062,
"step": 1312
},
{
"epoch": 4.138473642800944,
"grad_norm": 0.24058407221906955,
"learning_rate": 3.48468587049839e-06,
"loss": 0.4064,
"step": 1313
},
{
"epoch": 4.14162077104642,
"grad_norm": 0.2489883247653874,
"learning_rate": 3.459874636867804e-06,
"loss": 0.4071,
"step": 1314
},
{
"epoch": 4.144767899291896,
"grad_norm": 0.2606905433022103,
"learning_rate": 3.435143681749504e-06,
"loss": 0.408,
"step": 1315
},
{
"epoch": 4.147915027537372,
"grad_norm": 0.2705046247273454,
"learning_rate": 3.4104931251766993e-06,
"loss": 0.3972,
"step": 1316
},
{
"epoch": 4.151062155782848,
"grad_norm": 0.24481800219341115,
"learning_rate": 3.3859230867923842e-06,
"loss": 0.4071,
"step": 1317
},
{
"epoch": 4.154209284028324,
"grad_norm": 0.2300331920081832,
"learning_rate": 3.3614336858487294e-06,
"loss": 0.406,
"step": 1318
},
{
"epoch": 4.1573564122738,
"grad_norm": 0.2622648565386252,
"learning_rate": 3.337025041206552e-06,
"loss": 0.4069,
"step": 1319
},
{
"epoch": 4.160503540519276,
"grad_norm": 0.24755732541806952,
"learning_rate": 3.3126972713347017e-06,
"loss": 0.3964,
"step": 1320
},
{
"epoch": 4.1636506687647525,
"grad_norm": 0.24521010796980297,
"learning_rate": 3.288450494309512e-06,
"loss": 0.4087,
"step": 1321
},
{
"epoch": 4.166797797010228,
"grad_norm": 0.243073517258642,
"learning_rate": 3.264284827814186e-06,
"loss": 0.3996,
"step": 1322
},
{
"epoch": 4.169944925255704,
"grad_norm": 0.2578493124544258,
"learning_rate": 3.240200389138275e-06,
"loss": 0.413,
"step": 1323
},
{
"epoch": 4.17309205350118,
"grad_norm": 0.23083384176711083,
"learning_rate": 3.2161972951770793e-06,
"loss": 0.409,
"step": 1324
},
{
"epoch": 4.176239181746656,
"grad_norm": 0.26371050841393034,
"learning_rate": 3.192275662431088e-06,
"loss": 0.4149,
"step": 1325
},
{
"epoch": 4.1793863099921325,
"grad_norm": 0.21635862942599873,
"learning_rate": 3.168435607005409e-06,
"loss": 0.4152,
"step": 1326
},
{
"epoch": 4.182533438237608,
"grad_norm": 0.25038919966086376,
"learning_rate": 3.1446772446092087e-06,
"loss": 0.3989,
"step": 1327
},
{
"epoch": 4.185680566483084,
"grad_norm": 0.2538265944471299,
"learning_rate": 3.1210006905551602e-06,
"loss": 0.4111,
"step": 1328
},
{
"epoch": 4.18882769472856,
"grad_norm": 0.2523419884908278,
"learning_rate": 3.097406059758874e-06,
"loss": 0.4232,
"step": 1329
},
{
"epoch": 4.191974822974037,
"grad_norm": 0.24458106176738695,
"learning_rate": 3.073893466738325e-06,
"loss": 0.4071,
"step": 1330
},
{
"epoch": 4.195121951219512,
"grad_norm": 0.24334039749426561,
"learning_rate": 3.0504630256133326e-06,
"loss": 0.4061,
"step": 1331
},
{
"epoch": 4.198269079464988,
"grad_norm": 0.2712419815481148,
"learning_rate": 3.0271148501049796e-06,
"loss": 0.4025,
"step": 1332
},
{
"epoch": 4.201416207710464,
"grad_norm": 0.24319759647270833,
"learning_rate": 3.0038490535350685e-06,
"loss": 0.4016,
"step": 1333
},
{
"epoch": 4.20456333595594,
"grad_norm": 0.22524563871518907,
"learning_rate": 2.9806657488255665e-06,
"loss": 0.407,
"step": 1334
},
{
"epoch": 4.2077104642014165,
"grad_norm": 0.24679305363109214,
"learning_rate": 2.9575650484980568e-06,
"loss": 0.3904,
"step": 1335
},
{
"epoch": 4.210857592446892,
"grad_norm": 0.2649918330668857,
"learning_rate": 2.934547064673212e-06,
"loss": 0.4028,
"step": 1336
},
{
"epoch": 4.214004720692368,
"grad_norm": 0.24510719686404595,
"learning_rate": 2.911611909070229e-06,
"loss": 0.3997,
"step": 1337
},
{
"epoch": 4.217151848937844,
"grad_norm": 0.238894620688521,
"learning_rate": 2.888759693006291e-06,
"loss": 0.4008,
"step": 1338
},
{
"epoch": 4.220298977183321,
"grad_norm": 0.21793391307390142,
"learning_rate": 2.8659905273960233e-06,
"loss": 0.4015,
"step": 1339
},
{
"epoch": 4.223446105428796,
"grad_norm": 0.22998316847358136,
"learning_rate": 2.8433045227509693e-06,
"loss": 0.4047,
"step": 1340
},
{
"epoch": 4.226593233674272,
"grad_norm": 0.23351985066501862,
"learning_rate": 2.82070178917905e-06,
"loss": 0.4054,
"step": 1341
},
{
"epoch": 4.229740361919748,
"grad_norm": 0.27760093628492105,
"learning_rate": 2.798182436384014e-06,
"loss": 0.399,
"step": 1342
},
{
"epoch": 4.232887490165224,
"grad_norm": 0.23788980231020992,
"learning_rate": 2.7757465736649147e-06,
"loss": 0.4104,
"step": 1343
},
{
"epoch": 4.2360346184107005,
"grad_norm": 0.24654087188596363,
"learning_rate": 2.753394309915589e-06,
"loss": 0.4029,
"step": 1344
},
{
"epoch": 4.239181746656176,
"grad_norm": 0.22498193770029656,
"learning_rate": 2.731125753624124e-06,
"loss": 0.4177,
"step": 1345
},
{
"epoch": 4.242328874901652,
"grad_norm": 0.24467879466713818,
"learning_rate": 2.708941012872326e-06,
"loss": 0.4037,
"step": 1346
},
{
"epoch": 4.245476003147128,
"grad_norm": 0.23216696901315087,
"learning_rate": 2.6868401953351807e-06,
"loss": 0.4084,
"step": 1347
},
{
"epoch": 4.248623131392605,
"grad_norm": 0.22267899491966328,
"learning_rate": 2.6648234082803705e-06,
"loss": 0.4016,
"step": 1348
},
{
"epoch": 4.25177025963808,
"grad_norm": 0.22320373263218757,
"learning_rate": 2.642890758567722e-06,
"loss": 0.4014,
"step": 1349
},
{
"epoch": 4.254917387883556,
"grad_norm": 0.2282911262813021,
"learning_rate": 2.621042352648693e-06,
"loss": 0.4019,
"step": 1350
},
{
"epoch": 4.258064516129032,
"grad_norm": 0.23150782371834555,
"learning_rate": 2.5992782965658547e-06,
"loss": 0.4047,
"step": 1351
},
{
"epoch": 4.261211644374509,
"grad_norm": 0.22978744608702004,
"learning_rate": 2.5775986959523925e-06,
"loss": 0.4098,
"step": 1352
},
{
"epoch": 4.2643587726199845,
"grad_norm": 0.237921148030588,
"learning_rate": 2.556003656031576e-06,
"loss": 0.4071,
"step": 1353
},
{
"epoch": 4.26750590086546,
"grad_norm": 0.23356194839626504,
"learning_rate": 2.5344932816162615e-06,
"loss": 0.4126,
"step": 1354
},
{
"epoch": 4.270653029110936,
"grad_norm": 0.24938990083349305,
"learning_rate": 2.5130676771083585e-06,
"loss": 0.3942,
"step": 1355
},
{
"epoch": 4.273800157356412,
"grad_norm": 0.2222933450703462,
"learning_rate": 2.4917269464983564e-06,
"loss": 0.407,
"step": 1356
},
{
"epoch": 4.276947285601889,
"grad_norm": 0.23038561355673803,
"learning_rate": 2.470471193364805e-06,
"loss": 0.3991,
"step": 1357
},
{
"epoch": 4.280094413847364,
"grad_norm": 0.24048036243021406,
"learning_rate": 2.4493005208738006e-06,
"loss": 0.4103,
"step": 1358
},
{
"epoch": 4.28324154209284,
"grad_norm": 0.24150535433511164,
"learning_rate": 2.428215031778496e-06,
"loss": 0.4026,
"step": 1359
},
{
"epoch": 4.286388670338316,
"grad_norm": 0.24484793703016558,
"learning_rate": 2.407214828418607e-06,
"loss": 0.3997,
"step": 1360
},
{
"epoch": 4.289535798583792,
"grad_norm": 0.2410295427955201,
"learning_rate": 2.386300012719909e-06,
"loss": 0.4107,
"step": 1361
},
{
"epoch": 4.2926829268292686,
"grad_norm": 0.23834846513449068,
"learning_rate": 2.365470686193745e-06,
"loss": 0.4023,
"step": 1362
},
{
"epoch": 4.295830055074744,
"grad_norm": 0.23615554155542462,
"learning_rate": 2.3447269499365245e-06,
"loss": 0.4108,
"step": 1363
},
{
"epoch": 4.29897718332022,
"grad_norm": 0.2235992225457463,
"learning_rate": 2.3240689046292398e-06,
"loss": 0.4026,
"step": 1364
},
{
"epoch": 4.302124311565696,
"grad_norm": 0.22355351647845262,
"learning_rate": 2.3034966505369918e-06,
"loss": 0.4111,
"step": 1365
},
{
"epoch": 4.305271439811173,
"grad_norm": 0.22346679840500028,
"learning_rate": 2.2830102875084605e-06,
"loss": 0.4071,
"step": 1366
},
{
"epoch": 4.3084185680566485,
"grad_norm": 0.204481079618315,
"learning_rate": 2.26260991497548e-06,
"loss": 0.4174,
"step": 1367
},
{
"epoch": 4.311565696302124,
"grad_norm": 0.23428268541426928,
"learning_rate": 2.242295631952496e-06,
"loss": 0.3975,
"step": 1368
},
{
"epoch": 4.3147128245476,
"grad_norm": 0.2529300915882862,
"learning_rate": 2.222067537036132e-06,
"loss": 0.3963,
"step": 1369
},
{
"epoch": 4.317859952793077,
"grad_norm": 0.23794648521514886,
"learning_rate": 2.2019257284046926e-06,
"loss": 0.4064,
"step": 1370
},
{
"epoch": 4.321007081038553,
"grad_norm": 0.23514607936361892,
"learning_rate": 2.1818703038176703e-06,
"loss": 0.4125,
"step": 1371
},
{
"epoch": 4.324154209284028,
"grad_norm": 0.23494478364751686,
"learning_rate": 2.1619013606152994e-06,
"loss": 0.4045,
"step": 1372
},
{
"epoch": 4.327301337529504,
"grad_norm": 0.22970284055952198,
"learning_rate": 2.142018995718078e-06,
"loss": 0.4011,
"step": 1373
},
{
"epoch": 4.33044846577498,
"grad_norm": 0.22353700147820285,
"learning_rate": 2.122223305626272e-06,
"loss": 0.3961,
"step": 1374
},
{
"epoch": 4.333595594020457,
"grad_norm": 0.23004457401194028,
"learning_rate": 2.1025143864194876e-06,
"loss": 0.4006,
"step": 1375
},
{
"epoch": 4.3367427222659325,
"grad_norm": 0.22271838804401656,
"learning_rate": 2.082892333756161e-06,
"loss": 0.4107,
"step": 1376
},
{
"epoch": 4.339889850511408,
"grad_norm": 0.2536096244783794,
"learning_rate": 2.063357242873134e-06,
"loss": 0.4143,
"step": 1377
},
{
"epoch": 4.343036978756884,
"grad_norm": 0.23437814886577593,
"learning_rate": 2.0439092085851685e-06,
"loss": 0.4137,
"step": 1378
},
{
"epoch": 4.34618410700236,
"grad_norm": 0.2271814604599019,
"learning_rate": 2.0245483252844876e-06,
"loss": 0.4087,
"step": 1379
},
{
"epoch": 4.349331235247837,
"grad_norm": 0.22962388160368896,
"learning_rate": 2.005274686940326e-06,
"loss": 0.4027,
"step": 1380
},
{
"epoch": 4.352478363493312,
"grad_norm": 0.22693012118130107,
"learning_rate": 1.9860883870984725e-06,
"loss": 0.4142,
"step": 1381
},
{
"epoch": 4.355625491738788,
"grad_norm": 0.23266421196656764,
"learning_rate": 1.966989518880802e-06,
"loss": 0.3987,
"step": 1382
},
{
"epoch": 4.358772619984264,
"grad_norm": 0.23705401380270869,
"learning_rate": 1.9479781749848503e-06,
"loss": 0.4116,
"step": 1383
},
{
"epoch": 4.361919748229741,
"grad_norm": 0.23435894430026766,
"learning_rate": 1.929054447683334e-06,
"loss": 0.4022,
"step": 1384
},
{
"epoch": 4.3650668764752165,
"grad_norm": 0.23802959991267317,
"learning_rate": 1.9102184288237246e-06,
"loss": 0.4176,
"step": 1385
},
{
"epoch": 4.368214004720692,
"grad_norm": 0.22275801058423705,
"learning_rate": 1.8914702098277948e-06,
"loss": 0.4059,
"step": 1386
},
{
"epoch": 4.371361132966168,
"grad_norm": 0.22939658749057806,
"learning_rate": 1.87280988169118e-06,
"loss": 0.4094,
"step": 1387
},
{
"epoch": 4.374508261211645,
"grad_norm": 0.25039433961364527,
"learning_rate": 1.854237534982919e-06,
"loss": 0.407,
"step": 1388
},
{
"epoch": 4.377655389457121,
"grad_norm": 0.2212864081241604,
"learning_rate": 1.8357532598450412e-06,
"loss": 0.3965,
"step": 1389
},
{
"epoch": 4.380802517702596,
"grad_norm": 0.22895286253748667,
"learning_rate": 1.8173571459921024e-06,
"loss": 0.3999,
"step": 1390
},
{
"epoch": 4.383949645948072,
"grad_norm": 0.2143332455962388,
"learning_rate": 1.7990492827107742e-06,
"loss": 0.4143,
"step": 1391
},
{
"epoch": 4.387096774193548,
"grad_norm": 0.21916080732094737,
"learning_rate": 1.780829758859388e-06,
"loss": 0.4053,
"step": 1392
},
{
"epoch": 4.390243902439025,
"grad_norm": 0.22708688040625066,
"learning_rate": 1.7626986628675236e-06,
"loss": 0.3959,
"step": 1393
},
{
"epoch": 4.3933910306845005,
"grad_norm": 0.2410409889887991,
"learning_rate": 1.7446560827355652e-06,
"loss": 0.402,
"step": 1394
},
{
"epoch": 4.396538158929976,
"grad_norm": 0.23099349556256393,
"learning_rate": 1.726702106034286e-06,
"loss": 0.4148,
"step": 1395
},
{
"epoch": 4.399685287175452,
"grad_norm": 0.22073791862182493,
"learning_rate": 1.7088368199044047e-06,
"loss": 0.4178,
"step": 1396
},
{
"epoch": 4.402832415420928,
"grad_norm": 0.22316395047578508,
"learning_rate": 1.6910603110561895e-06,
"loss": 0.4119,
"step": 1397
},
{
"epoch": 4.405979543666405,
"grad_norm": 0.2189088160487407,
"learning_rate": 1.6733726657690108e-06,
"loss": 0.4025,
"step": 1398
},
{
"epoch": 4.4091266719118805,
"grad_norm": 0.21750370355701165,
"learning_rate": 1.6557739698909436e-06,
"loss": 0.4111,
"step": 1399
},
{
"epoch": 4.412273800157356,
"grad_norm": 0.23427229679001665,
"learning_rate": 1.6382643088383288e-06,
"loss": 0.3983,
"step": 1400
},
{
"epoch": 4.415420928402832,
"grad_norm": 0.214624345546693,
"learning_rate": 1.620843767595388e-06,
"loss": 0.4142,
"step": 1401
},
{
"epoch": 4.418568056648309,
"grad_norm": 0.22190509237137726,
"learning_rate": 1.6035124307137828e-06,
"loss": 0.4018,
"step": 1402
},
{
"epoch": 4.421715184893785,
"grad_norm": 0.22933036447627164,
"learning_rate": 1.5862703823122205e-06,
"loss": 0.4041,
"step": 1403
},
{
"epoch": 4.42486231313926,
"grad_norm": 0.22856794413059017,
"learning_rate": 1.5691177060760377e-06,
"loss": 0.4078,
"step": 1404
},
{
"epoch": 4.428009441384736,
"grad_norm": 0.227335410882579,
"learning_rate": 1.5520544852568042e-06,
"loss": 0.3982,
"step": 1405
},
{
"epoch": 4.431156569630213,
"grad_norm": 0.21207415831235102,
"learning_rate": 1.5350808026719023e-06,
"loss": 0.4071,
"step": 1406
},
{
"epoch": 4.434303697875689,
"grad_norm": 0.22335589734890615,
"learning_rate": 1.5181967407041454e-06,
"loss": 0.3971,
"step": 1407
},
{
"epoch": 4.4374508261211645,
"grad_norm": 0.22022397625368526,
"learning_rate": 1.5014023813013578e-06,
"loss": 0.4105,
"step": 1408
},
{
"epoch": 4.44059795436664,
"grad_norm": 0.22294865873844572,
"learning_rate": 1.4846978059759964e-06,
"loss": 0.4152,
"step": 1409
},
{
"epoch": 4.443745082612116,
"grad_norm": 0.23078813748776092,
"learning_rate": 1.46808309580474e-06,
"loss": 0.4012,
"step": 1410
},
{
"epoch": 4.446892210857593,
"grad_norm": 0.2162240274521999,
"learning_rate": 1.451558331428109e-06,
"loss": 0.3941,
"step": 1411
},
{
"epoch": 4.450039339103069,
"grad_norm": 0.23037478782989287,
"learning_rate": 1.4351235930500497e-06,
"loss": 0.3952,
"step": 1412
},
{
"epoch": 4.453186467348544,
"grad_norm": 0.22007340136864595,
"learning_rate": 1.4187789604375812e-06,
"loss": 0.4118,
"step": 1413
},
{
"epoch": 4.45633359559402,
"grad_norm": 0.23576271215221423,
"learning_rate": 1.4025245129203736e-06,
"loss": 0.4059,
"step": 1414
},
{
"epoch": 4.459480723839496,
"grad_norm": 0.22441915779719693,
"learning_rate": 1.3863603293903839e-06,
"loss": 0.4015,
"step": 1415
},
{
"epoch": 4.462627852084973,
"grad_norm": 0.2284380804154695,
"learning_rate": 1.3702864883014755e-06,
"loss": 0.4128,
"step": 1416
},
{
"epoch": 4.4657749803304485,
"grad_norm": 0.22050257816998334,
"learning_rate": 1.3543030676690138e-06,
"loss": 0.4102,
"step": 1417
},
{
"epoch": 4.468922108575924,
"grad_norm": 0.22331735465975805,
"learning_rate": 1.338410145069511e-06,
"loss": 0.4139,
"step": 1418
},
{
"epoch": 4.4720692368214,
"grad_norm": 0.2196984071690986,
"learning_rate": 1.3226077976402473e-06,
"loss": 0.404,
"step": 1419
},
{
"epoch": 4.475216365066877,
"grad_norm": 0.2109055906141469,
"learning_rate": 1.3068961020788761e-06,
"loss": 0.4115,
"step": 1420
},
{
"epoch": 4.478363493312353,
"grad_norm": 0.21755200155816148,
"learning_rate": 1.2912751346430818e-06,
"loss": 0.391,
"step": 1421
},
{
"epoch": 4.481510621557828,
"grad_norm": 0.2132260845561604,
"learning_rate": 1.2757449711501835e-06,
"loss": 0.4082,
"step": 1422
},
{
"epoch": 4.484657749803304,
"grad_norm": 0.2239953728568239,
"learning_rate": 1.2603056869767816e-06,
"loss": 0.4102,
"step": 1423
},
{
"epoch": 4.487804878048781,
"grad_norm": 0.21787966690564375,
"learning_rate": 1.244957357058394e-06,
"loss": 0.4115,
"step": 1424
},
{
"epoch": 4.490952006294257,
"grad_norm": 0.21981977423701385,
"learning_rate": 1.2297000558890716e-06,
"loss": 0.4033,
"step": 1425
},
{
"epoch": 4.4940991345397325,
"grad_norm": 0.21317215521752114,
"learning_rate": 1.2145338575210653e-06,
"loss": 0.4047,
"step": 1426
},
{
"epoch": 4.497246262785208,
"grad_norm": 0.22312130217927098,
"learning_rate": 1.1994588355644509e-06,
"loss": 0.4053,
"step": 1427
},
{
"epoch": 4.500393391030684,
"grad_norm": 0.22447203330251364,
"learning_rate": 1.1844750631867675e-06,
"loss": 0.4119,
"step": 1428
},
{
"epoch": 4.503540519276161,
"grad_norm": 0.22046801333751878,
"learning_rate": 1.1695826131126765e-06,
"loss": 0.4191,
"step": 1429
},
{
"epoch": 4.506687647521637,
"grad_norm": 0.22112092363500188,
"learning_rate": 1.154781557623592e-06,
"loss": 0.4048,
"step": 1430
},
{
"epoch": 4.5098347757671124,
"grad_norm": 0.2200529030796755,
"learning_rate": 1.1400719685573458e-06,
"loss": 0.4097,
"step": 1431
},
{
"epoch": 4.512981904012588,
"grad_norm": 0.22276401643430727,
"learning_rate": 1.125453917307837e-06,
"loss": 0.4073,
"step": 1432
},
{
"epoch": 4.516129032258064,
"grad_norm": 0.22154029972838743,
"learning_rate": 1.1109274748246634e-06,
"loss": 0.4043,
"step": 1433
},
{
"epoch": 4.519276160503541,
"grad_norm": 0.21625363180786117,
"learning_rate": 1.0964927116128088e-06,
"loss": 0.4038,
"step": 1434
},
{
"epoch": 4.522423288749017,
"grad_norm": 0.22587542625356785,
"learning_rate": 1.0821496977322822e-06,
"loss": 0.4083,
"step": 1435
},
{
"epoch": 4.525570416994492,
"grad_norm": 0.2298925317259965,
"learning_rate": 1.0678985027977795e-06,
"loss": 0.4097,
"step": 1436
},
{
"epoch": 4.528717545239968,
"grad_norm": 0.21553526604171172,
"learning_rate": 1.0537391959783495e-06,
"loss": 0.4169,
"step": 1437
},
{
"epoch": 4.531864673485445,
"grad_norm": 0.2176940353070853,
"learning_rate": 1.0396718459970522e-06,
"loss": 0.4075,
"step": 1438
},
{
"epoch": 4.535011801730921,
"grad_norm": 0.22147515126537134,
"learning_rate": 1.0256965211306347e-06,
"loss": 0.4082,
"step": 1439
},
{
"epoch": 4.5381589299763965,
"grad_norm": 0.22956285830976184,
"learning_rate": 1.011813289209198e-06,
"loss": 0.4068,
"step": 1440
},
{
"epoch": 4.541306058221872,
"grad_norm": 0.20714083744900075,
"learning_rate": 9.980222176158527e-07,
"loss": 0.4088,
"step": 1441
},
{
"epoch": 4.544453186467349,
"grad_norm": 0.22430766033401095,
"learning_rate": 9.84323373286411e-07,
"loss": 0.4004,
"step": 1442
},
{
"epoch": 4.547600314712825,
"grad_norm": 0.21407640131514646,
"learning_rate": 9.707168227090546e-07,
"loss": 0.4037,
"step": 1443
},
{
"epoch": 4.550747442958301,
"grad_norm": 0.2186295462567785,
"learning_rate": 9.572026319240147e-07,
"loss": 0.4194,
"step": 1444
},
{
"epoch": 4.553894571203776,
"grad_norm": 0.21975559225432,
"learning_rate": 9.437808665232429e-07,
"loss": 0.4044,
"step": 1445
},
{
"epoch": 4.557041699449252,
"grad_norm": 0.22000522416273766,
"learning_rate": 9.304515916500923e-07,
"loss": 0.4137,
"step": 1446
},
{
"epoch": 4.560188827694729,
"grad_norm": 0.21013017876360932,
"learning_rate": 9.172148719990237e-07,
"loss": 0.4073,
"step": 1447
},
{
"epoch": 4.563335955940205,
"grad_norm": 0.21581120172406837,
"learning_rate": 9.040707718152664e-07,
"loss": 0.4047,
"step": 1448
},
{
"epoch": 4.5664830841856805,
"grad_norm": 0.21451324203213717,
"learning_rate": 8.910193548945134e-07,
"loss": 0.4151,
"step": 1449
},
{
"epoch": 4.569630212431156,
"grad_norm": 0.22318798632431602,
"learning_rate": 8.780606845826179e-07,
"loss": 0.4004,
"step": 1450
},
{
"epoch": 4.572777340676632,
"grad_norm": 0.1991294427029985,
"learning_rate": 8.651948237752816e-07,
"loss": 0.4096,
"step": 1451
},
{
"epoch": 4.575924468922109,
"grad_norm": 0.21945769936920279,
"learning_rate": 8.524218349177515e-07,
"loss": 0.4058,
"step": 1452
},
{
"epoch": 4.579071597167585,
"grad_norm": 0.21294148461496837,
"learning_rate": 8.397417800045083e-07,
"loss": 0.4104,
"step": 1453
},
{
"epoch": 4.58221872541306,
"grad_norm": 0.2289947471169245,
"learning_rate": 8.271547205789731e-07,
"loss": 0.3924,
"step": 1454
},
{
"epoch": 4.585365853658536,
"grad_norm": 0.22297476089583868,
"learning_rate": 8.146607177332111e-07,
"loss": 0.3915,
"step": 1455
},
{
"epoch": 4.588512981904013,
"grad_norm": 0.2162476472768259,
"learning_rate": 8.022598321076369e-07,
"loss": 0.3994,
"step": 1456
},
{
"epoch": 4.591660110149489,
"grad_norm": 0.2232886473508073,
"learning_rate": 7.899521238907048e-07,
"loss": 0.4009,
"step": 1457
},
{
"epoch": 4.5948072383949645,
"grad_norm": 0.23138631812380756,
"learning_rate": 7.777376528186286e-07,
"loss": 0.4112,
"step": 1458
},
{
"epoch": 4.59795436664044,
"grad_norm": 0.21409653415620175,
"learning_rate": 7.656164781750975e-07,
"loss": 0.4181,
"step": 1459
},
{
"epoch": 4.601101494885917,
"grad_norm": 0.22193142902845509,
"learning_rate": 7.535886587909736e-07,
"loss": 0.404,
"step": 1460
},
{
"epoch": 4.604248623131393,
"grad_norm": 0.21959675902623732,
"learning_rate": 7.416542530440174e-07,
"loss": 0.4057,
"step": 1461
},
{
"epoch": 4.607395751376869,
"grad_norm": 0.21867188925414008,
"learning_rate": 7.298133188585921e-07,
"loss": 0.4006,
"step": 1462
},
{
"epoch": 4.610542879622344,
"grad_norm": 0.23145111441732844,
"learning_rate": 7.180659137054014e-07,
"loss": 0.3977,
"step": 1463
},
{
"epoch": 4.61369000786782,
"grad_norm": 0.20970090083827234,
"learning_rate": 7.064120946011987e-07,
"loss": 0.4074,
"step": 1464
},
{
"epoch": 4.616837136113297,
"grad_norm": 0.2179204076829795,
"learning_rate": 6.948519181085101e-07,
"loss": 0.4027,
"step": 1465
},
{
"epoch": 4.619984264358773,
"grad_norm": 0.21581183670826473,
"learning_rate": 6.833854403353535e-07,
"loss": 0.4148,
"step": 1466
},
{
"epoch": 4.6231313926042485,
"grad_norm": 0.21450682726152265,
"learning_rate": 6.720127169349888e-07,
"loss": 0.412,
"step": 1467
},
{
"epoch": 4.626278520849724,
"grad_norm": 0.2250984916103483,
"learning_rate": 6.607338031056243e-07,
"loss": 0.4041,
"step": 1468
},
{
"epoch": 4.6294256490952,
"grad_norm": 0.21695604933835538,
"learning_rate": 6.49548753590159e-07,
"loss": 0.3942,
"step": 1469
},
{
"epoch": 4.632572777340677,
"grad_norm": 0.20819532381718653,
"learning_rate": 6.384576226759165e-07,
"loss": 0.4117,
"step": 1470
},
{
"epoch": 4.635719905586153,
"grad_norm": 0.22286060436310867,
"learning_rate": 6.274604641943783e-07,
"loss": 0.4033,
"step": 1471
},
{
"epoch": 4.6388670338316285,
"grad_norm": 0.21036765805313895,
"learning_rate": 6.165573315209283e-07,
"loss": 0.4036,
"step": 1472
},
{
"epoch": 4.642014162077104,
"grad_norm": 0.21227522537219817,
"learning_rate": 6.057482775745938e-07,
"loss": 0.4082,
"step": 1473
},
{
"epoch": 4.645161290322581,
"grad_norm": 0.21892049964637353,
"learning_rate": 5.950333548177734e-07,
"loss": 0.4096,
"step": 1474
},
{
"epoch": 4.648308418568057,
"grad_norm": 0.2176553643569443,
"learning_rate": 5.84412615256007e-07,
"loss": 0.4014,
"step": 1475
},
{
"epoch": 4.651455546813533,
"grad_norm": 0.2199131756115072,
"learning_rate": 5.738861104377003e-07,
"loss": 0.4144,
"step": 1476
},
{
"epoch": 4.654602675059008,
"grad_norm": 0.21639846694910558,
"learning_rate": 5.634538914539001e-07,
"loss": 0.4158,
"step": 1477
},
{
"epoch": 4.657749803304485,
"grad_norm": 0.21197788045529345,
"learning_rate": 5.531160089380061e-07,
"loss": 0.4098,
"step": 1478
},
{
"epoch": 4.660896931549961,
"grad_norm": 0.2171708383066465,
"learning_rate": 5.428725130655732e-07,
"loss": 0.4075,
"step": 1479
},
{
"epoch": 4.664044059795437,
"grad_norm": 0.2102587081469045,
"learning_rate": 5.327234535540337e-07,
"loss": 0.4078,
"step": 1480
},
{
"epoch": 4.6671911880409125,
"grad_norm": 0.2246748030537084,
"learning_rate": 5.226688796624757e-07,
"loss": 0.4073,
"step": 1481
},
{
"epoch": 4.670338316286388,
"grad_norm": 0.21599194135810337,
"learning_rate": 5.127088401913827e-07,
"loss": 0.409,
"step": 1482
},
{
"epoch": 4.673485444531865,
"grad_norm": 0.222001456808214,
"learning_rate": 5.028433834824186e-07,
"loss": 0.407,
"step": 1483
},
{
"epoch": 4.676632572777341,
"grad_norm": 0.22556759982109623,
"learning_rate": 4.930725574181838e-07,
"loss": 0.4029,
"step": 1484
},
{
"epoch": 4.679779701022817,
"grad_norm": 0.22675132381159563,
"learning_rate": 4.833964094219834e-07,
"loss": 0.4081,
"step": 1485
},
{
"epoch": 4.682926829268292,
"grad_norm": 0.22311156946152774,
"learning_rate": 4.738149864575925e-07,
"loss": 0.4072,
"step": 1486
},
{
"epoch": 4.686073957513768,
"grad_norm": 0.21164988462618337,
"learning_rate": 4.6432833502903176e-07,
"loss": 0.3928,
"step": 1487
},
{
"epoch": 4.689221085759245,
"grad_norm": 0.21501199816999694,
"learning_rate": 4.549365011803475e-07,
"loss": 0.4077,
"step": 1488
},
{
"epoch": 4.692368214004721,
"grad_norm": 0.20616052059898535,
"learning_rate": 4.456395304953853e-07,
"loss": 0.3986,
"step": 1489
},
{
"epoch": 4.6955153422501965,
"grad_norm": 0.21548324024487242,
"learning_rate": 4.3643746809755695e-07,
"loss": 0.4098,
"step": 1490
},
{
"epoch": 4.698662470495672,
"grad_norm": 0.2231505323697405,
"learning_rate": 4.27330358649638e-07,
"loss": 0.4042,
"step": 1491
},
{
"epoch": 4.701809598741149,
"grad_norm": 0.22843117462309953,
"learning_rate": 4.183182463535418e-07,
"loss": 0.4096,
"step": 1492
},
{
"epoch": 4.704956726986625,
"grad_norm": 0.20462231224872493,
"learning_rate": 4.094011749501103e-07,
"loss": 0.4065,
"step": 1493
},
{
"epoch": 4.708103855232101,
"grad_norm": 0.21804695686103368,
"learning_rate": 4.005791877188947e-07,
"loss": 0.4143,
"step": 1494
},
{
"epoch": 4.711250983477576,
"grad_norm": 0.2086192038711514,
"learning_rate": 3.9185232747794843e-07,
"loss": 0.3912,
"step": 1495
},
{
"epoch": 4.714398111723053,
"grad_norm": 0.211022943339496,
"learning_rate": 3.832206365836255e-07,
"loss": 0.4026,
"step": 1496
},
{
"epoch": 4.717545239968529,
"grad_norm": 0.2088349486284656,
"learning_rate": 3.7468415693036495e-07,
"loss": 0.4075,
"step": 1497
},
{
"epoch": 4.720692368214005,
"grad_norm": 0.2089695668261833,
"learning_rate": 3.662429299504955e-07,
"loss": 0.4174,
"step": 1498
},
{
"epoch": 4.7238394964594805,
"grad_norm": 0.20713627934049905,
"learning_rate": 3.578969966140289e-07,
"loss": 0.4091,
"step": 1499
},
{
"epoch": 4.726986624704956,
"grad_norm": 0.2173964909886165,
"learning_rate": 3.4964639742846253e-07,
"loss": 0.4136,
"step": 1500
},
{
"epoch": 4.730133752950433,
"grad_norm": 0.21628797092246888,
"learning_rate": 3.414911724385905e-07,
"loss": 0.4062,
"step": 1501
},
{
"epoch": 4.733280881195909,
"grad_norm": 0.20929765491064145,
"learning_rate": 3.3343136122629295e-07,
"loss": 0.4037,
"step": 1502
},
{
"epoch": 4.736428009441385,
"grad_norm": 0.2225887429417968,
"learning_rate": 3.254670029103579e-07,
"loss": 0.398,
"step": 1503
},
{
"epoch": 4.7395751376868605,
"grad_norm": 0.24208779005585707,
"learning_rate": 3.1759813614628656e-07,
"loss": 0.4057,
"step": 1504
},
{
"epoch": 4.742722265932336,
"grad_norm": 0.22395770260059794,
"learning_rate": 3.098247991261061e-07,
"loss": 0.4049,
"step": 1505
},
{
"epoch": 4.745869394177813,
"grad_norm": 0.2047845376910192,
"learning_rate": 3.0214702957818587e-07,
"loss": 0.4126,
"step": 1506
},
{
"epoch": 4.749016522423289,
"grad_norm": 0.2133448799717391,
"learning_rate": 2.945648647670485e-07,
"loss": 0.4091,
"step": 1507
},
{
"epoch": 4.752163650668765,
"grad_norm": 0.2173251110200107,
"learning_rate": 2.870783414931988e-07,
"loss": 0.396,
"step": 1508
},
{
"epoch": 4.755310778914241,
"grad_norm": 0.2187167201560873,
"learning_rate": 2.796874960929352e-07,
"loss": 0.4124,
"step": 1509
},
{
"epoch": 4.758457907159717,
"grad_norm": 0.2208294332041325,
"learning_rate": 2.723923644381765e-07,
"loss": 0.4002,
"step": 1510
},
{
"epoch": 4.761605035405193,
"grad_norm": 0.21817133183867887,
"learning_rate": 2.6519298193629526e-07,
"loss": 0.4144,
"step": 1511
},
{
"epoch": 4.764752163650669,
"grad_norm": 0.21498445279173387,
"learning_rate": 2.580893835299314e-07,
"loss": 0.4029,
"step": 1512
},
{
"epoch": 4.7678992918961445,
"grad_norm": 0.21297319478984542,
"learning_rate": 2.5108160369683663e-07,
"loss": 0.4071,
"step": 1513
},
{
"epoch": 4.771046420141621,
"grad_norm": 0.21291427119757153,
"learning_rate": 2.441696764497037e-07,
"loss": 0.407,
"step": 1514
},
{
"epoch": 4.774193548387097,
"grad_norm": 0.2159769244693944,
"learning_rate": 2.3735363533599065e-07,
"loss": 0.3867,
"step": 1515
},
{
"epoch": 4.777340676632573,
"grad_norm": 0.218700499287757,
"learning_rate": 2.3063351343777241e-07,
"loss": 0.4028,
"step": 1516
},
{
"epoch": 4.780487804878049,
"grad_norm": 0.20727391837895415,
"learning_rate": 2.2400934337157176e-07,
"loss": 0.407,
"step": 1517
},
{
"epoch": 4.783634933123524,
"grad_norm": 0.22594037043531556,
"learning_rate": 2.1748115728820852e-07,
"loss": 0.4058,
"step": 1518
},
{
"epoch": 4.786782061369001,
"grad_norm": 0.21081874655691793,
"learning_rate": 2.110489868726262e-07,
"loss": 0.4177,
"step": 1519
},
{
"epoch": 4.789929189614477,
"grad_norm": 0.22151547664122262,
"learning_rate": 2.047128633437634e-07,
"loss": 0.4057,
"step": 1520
},
{
"epoch": 4.793076317859953,
"grad_norm": 0.22101898922423646,
"learning_rate": 1.9847281745438262e-07,
"loss": 0.415,
"step": 1521
},
{
"epoch": 4.7962234461054285,
"grad_norm": 0.2193144165610068,
"learning_rate": 1.923288794909306e-07,
"loss": 0.4104,
"step": 1522
},
{
"epoch": 4.799370574350904,
"grad_norm": 0.21581312267270572,
"learning_rate": 1.862810792733849e-07,
"loss": 0.3954,
"step": 1523
},
{
"epoch": 4.802517702596381,
"grad_norm": 0.21917215039327326,
"learning_rate": 1.8032944615511638e-07,
"loss": 0.4151,
"step": 1524
},
{
"epoch": 4.805664830841857,
"grad_norm": 0.20125184352963307,
"learning_rate": 1.7447400902273813e-07,
"loss": 0.4118,
"step": 1525
},
{
"epoch": 4.808811959087333,
"grad_norm": 0.21956751608662672,
"learning_rate": 1.6871479629597897e-07,
"loss": 0.4098,
"step": 1526
},
{
"epoch": 4.811959087332809,
"grad_norm": 0.21515776124286765,
"learning_rate": 1.630518359275235e-07,
"loss": 0.4022,
"step": 1527
},
{
"epoch": 4.815106215578285,
"grad_norm": 0.21441421698457916,
"learning_rate": 1.574851554029011e-07,
"loss": 0.4095,
"step": 1528
},
{
"epoch": 4.818253343823761,
"grad_norm": 0.21770150209148334,
"learning_rate": 1.5201478174033724e-07,
"loss": 0.4007,
"step": 1529
},
{
"epoch": 4.821400472069237,
"grad_norm": 0.21413660508932328,
"learning_rate": 1.4664074149062457e-07,
"loss": 0.4102,
"step": 1530
},
{
"epoch": 4.8245476003147125,
"grad_norm": 0.2076863584844004,
"learning_rate": 1.4136306073699868e-07,
"loss": 0.4144,
"step": 1531
},
{
"epoch": 4.827694728560189,
"grad_norm": 0.22291240712551658,
"learning_rate": 1.3618176509500257e-07,
"loss": 0.4014,
"step": 1532
},
{
"epoch": 4.830841856805665,
"grad_norm": 0.21671960130680187,
"learning_rate": 1.3109687971237793e-07,
"loss": 0.3931,
"step": 1533
},
{
"epoch": 4.833988985051141,
"grad_norm": 0.203265956592449,
"learning_rate": 1.2610842926892076e-07,
"loss": 0.4173,
"step": 1534
},
{
"epoch": 4.837136113296617,
"grad_norm": 0.20458987302893475,
"learning_rate": 1.2121643797638805e-07,
"loss": 0.4206,
"step": 1535
},
{
"epoch": 4.840283241542092,
"grad_norm": 0.21532940470424342,
"learning_rate": 1.1642092957835582e-07,
"loss": 0.4078,
"step": 1536
},
{
"epoch": 4.843430369787569,
"grad_norm": 0.2052403167798288,
"learning_rate": 1.1172192735011689e-07,
"loss": 0.4073,
"step": 1537
},
{
"epoch": 4.846577498033045,
"grad_norm": 0.21654113378965234,
"learning_rate": 1.0711945409856983e-07,
"loss": 0.4066,
"step": 1538
},
{
"epoch": 4.849724626278521,
"grad_norm": 0.20462899076304167,
"learning_rate": 1.0261353216209691e-07,
"loss": 0.4111,
"step": 1539
},
{
"epoch": 4.8528717545239966,
"grad_norm": 0.2081899249002058,
"learning_rate": 9.820418341047078e-08,
"loss": 0.4142,
"step": 1540
},
{
"epoch": 4.856018882769473,
"grad_norm": 0.21737945180393162,
"learning_rate": 9.389142924473238e-08,
"loss": 0.4068,
"step": 1541
},
{
"epoch": 4.859166011014949,
"grad_norm": 0.21645245039465824,
"learning_rate": 8.96752905970999e-08,
"loss": 0.4037,
"step": 1542
},
{
"epoch": 4.862313139260425,
"grad_norm": 0.21851150478427397,
"learning_rate": 8.555578793085994e-08,
"loss": 0.4036,
"step": 1543
},
{
"epoch": 4.865460267505901,
"grad_norm": 0.2120811042181584,
"learning_rate": 8.153294124027211e-08,
"loss": 0.4121,
"step": 1544
},
{
"epoch": 4.868607395751377,
"grad_norm": 0.20575624884205296,
"learning_rate": 7.760677005046902e-08,
"loss": 0.4069,
"step": 1545
},
{
"epoch": 4.871754523996853,
"grad_norm": 0.21391937617578313,
"learning_rate": 7.377729341736306e-08,
"loss": 0.4041,
"step": 1546
},
{
"epoch": 4.874901652242329,
"grad_norm": 0.21156486072971747,
"learning_rate": 7.004452992755317e-08,
"loss": 0.3964,
"step": 1547
},
{
"epoch": 4.878048780487805,
"grad_norm": 0.20671844316636703,
"learning_rate": 6.640849769823599e-08,
"loss": 0.4057,
"step": 1548
},
{
"epoch": 4.881195908733281,
"grad_norm": 0.21259600584021066,
"learning_rate": 6.286921437711924e-08,
"loss": 0.3976,
"step": 1549
},
{
"epoch": 4.884343036978757,
"grad_norm": 0.2058375833275837,
"learning_rate": 5.9426697142328514e-08,
"loss": 0.4107,
"step": 1550
},
{
"epoch": 4.887490165224233,
"grad_norm": 0.20744913477051533,
"learning_rate": 5.6080962702333983e-08,
"loss": 0.4079,
"step": 1551
},
{
"epoch": 4.890637293469709,
"grad_norm": 0.22577938836544023,
"learning_rate": 5.2832027295861566e-08,
"loss": 0.3983,
"step": 1552
},
{
"epoch": 4.893784421715185,
"grad_norm": 0.20294764681156627,
"learning_rate": 4.967990669181744e-08,
"loss": 0.4063,
"step": 1553
},
{
"epoch": 4.8969315499606605,
"grad_norm": 0.20869502528574135,
"learning_rate": 4.6624616189214765e-08,
"loss": 0.4044,
"step": 1554
},
{
"epoch": 4.900078678206137,
"grad_norm": 0.20440330759726497,
"learning_rate": 4.3666170617093774e-08,
"loss": 0.4137,
"step": 1555
},
{
"epoch": 4.903225806451613,
"grad_norm": 0.2181494330788591,
"learning_rate": 4.080458433444845e-08,
"loss": 0.404,
"step": 1556
},
{
"epoch": 4.906372934697089,
"grad_norm": 0.21141190073150423,
"learning_rate": 3.803987123016439e-08,
"loss": 0.4121,
"step": 1557
},
{
"epoch": 4.909520062942565,
"grad_norm": 0.20552978785557943,
"learning_rate": 3.5372044722945534e-08,
"loss": 0.4035,
"step": 1558
},
{
"epoch": 4.912667191188041,
"grad_norm": 0.21497441633582837,
"learning_rate": 3.2801117761254163e-08,
"loss": 0.4148,
"step": 1559
},
{
"epoch": 4.915814319433517,
"grad_norm": 0.2128972917412232,
"learning_rate": 3.03271028232377e-08,
"loss": 0.3996,
"step": 1560
},
{
"epoch": 4.918961447678993,
"grad_norm": 0.21833114877923207,
"learning_rate": 2.7950011916682007e-08,
"loss": 0.4065,
"step": 1561
},
{
"epoch": 4.922108575924469,
"grad_norm": 0.2158153544540106,
"learning_rate": 2.566985657894483e-08,
"loss": 0.4102,
"step": 1562
},
{
"epoch": 4.925255704169945,
"grad_norm": 0.21773560895208463,
"learning_rate": 2.3486647876895806e-08,
"loss": 0.3977,
"step": 1563
},
{
"epoch": 4.928402832415421,
"grad_norm": 0.21076258848562276,
"learning_rate": 2.14003964068743e-08,
"loss": 0.3986,
"step": 1564
},
{
"epoch": 4.931549960660897,
"grad_norm": 0.21467255291492235,
"learning_rate": 1.9411112294629442e-08,
"loss": 0.4124,
"step": 1565
},
{
"epoch": 4.934697088906373,
"grad_norm": 0.20923975584514357,
"learning_rate": 1.751880519527571e-08,
"loss": 0.4023,
"step": 1566
},
{
"epoch": 4.937844217151849,
"grad_norm": 0.20904400841214246,
"learning_rate": 1.5723484293237446e-08,
"loss": 0.4082,
"step": 1567
},
{
"epoch": 4.940991345397325,
"grad_norm": 0.20840331694698017,
"learning_rate": 1.4025158302217735e-08,
"loss": 0.4053,
"step": 1568
},
{
"epoch": 4.944138473642801,
"grad_norm": 0.21324068817526423,
"learning_rate": 1.242383546514736e-08,
"loss": 0.4033,
"step": 1569
},
{
"epoch": 4.947285601888277,
"grad_norm": 0.21410240816558584,
"learning_rate": 1.0919523554140387e-08,
"loss": 0.4034,
"step": 1570
},
{
"epoch": 4.950432730133753,
"grad_norm": 0.20995826331255832,
"learning_rate": 9.512229870469736e-09,
"loss": 0.4041,
"step": 1571
},
{
"epoch": 4.9535798583792285,
"grad_norm": 0.22014255278213607,
"learning_rate": 8.201961244520552e-09,
"loss": 0.4013,
"step": 1572
},
{
"epoch": 4.956726986624705,
"grad_norm": 0.20658086287862656,
"learning_rate": 6.988724035761341e-09,
"loss": 0.4055,
"step": 1573
},
{
"epoch": 4.959874114870181,
"grad_norm": 0.21532440781117343,
"learning_rate": 5.872524132715107e-09,
"loss": 0.407,
"step": 1574
},
{
"epoch": 4.963021243115657,
"grad_norm": 0.22086256851698965,
"learning_rate": 4.853366952928262e-09,
"loss": 0.4036,
"step": 1575
},
{
"epoch": 4.966168371361133,
"grad_norm": 0.22229005085819714,
"learning_rate": 3.931257442937319e-09,
"loss": 0.4138,
"step": 1576
},
{
"epoch": 4.969315499606609,
"grad_norm": 0.20761601002360436,
"learning_rate": 3.1062000782600134e-09,
"loss": 0.397,
"step": 1577
},
{
"epoch": 4.972462627852085,
"grad_norm": 0.2268431980032955,
"learning_rate": 2.3781988633619914e-09,
"loss": 0.4074,
"step": 1578
},
{
"epoch": 4.975609756097561,
"grad_norm": 0.21131761190647141,
"learning_rate": 1.7472573316457132e-09,
"loss": 0.4057,
"step": 1579
},
{
"epoch": 4.978756884343037,
"grad_norm": 0.21908993904462645,
"learning_rate": 1.2133785454193636e-09,
"loss": 0.4021,
"step": 1580
},
{
"epoch": 4.9819040125885135,
"grad_norm": 0.21296216958733533,
"learning_rate": 7.765650958990734e-10,
"loss": 0.4024,
"step": 1581
},
{
"epoch": 4.985051140833989,
"grad_norm": 0.2097355620791621,
"learning_rate": 4.368191031844937e-10,
"loss": 0.4038,
"step": 1582
},
{
"epoch": 4.988198269079465,
"grad_norm": 0.2119478059627475,
"learning_rate": 1.9414221625435602e-10,
"loss": 0.4153,
"step": 1583
},
{
"epoch": 4.991345397324941,
"grad_norm": 0.21027656821222138,
"learning_rate": 4.853561295536935e-11,
"loss": 0.3986,
"step": 1584
},
{
"epoch": 4.994492525570417,
"grad_norm": 0.20504931593427467,
"learning_rate": 0.0,
"loss": 0.3896,
"step": 1585
},
{
"epoch": 4.994492525570417,
"step": 1585,
"total_flos": 5.954272714247897e+18,
"train_loss": 0.5234080795045907,
"train_runtime": 72319.3947,
"train_samples_per_second": 2.811,
"train_steps_per_second": 0.022
}
],
"logging_steps": 1.0,
"max_steps": 1585,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.954272714247897e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}