{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9971862689926843,
"eval_steps": 500,
"global_step": 2664,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0011254924029262803,
"grad_norm": 52.80103639717517,
"learning_rate": 1.8726591760299626e-07,
"loss": 11.1109,
"step": 1
},
{
"epoch": 0.0022509848058525606,
"grad_norm": 54.14343134228205,
"learning_rate": 3.7453183520599253e-07,
"loss": 11.1518,
"step": 2
},
{
"epoch": 0.0033764772087788407,
"grad_norm": 55.956204989803545,
"learning_rate": 5.617977528089887e-07,
"loss": 11.0032,
"step": 3
},
{
"epoch": 0.004501969611705121,
"grad_norm": 52.83455219346365,
"learning_rate": 7.490636704119851e-07,
"loss": 10.9916,
"step": 4
},
{
"epoch": 0.005627462014631401,
"grad_norm": 55.14267600803135,
"learning_rate": 9.363295880149814e-07,
"loss": 11.0876,
"step": 5
},
{
"epoch": 0.006752954417557681,
"grad_norm": 54.398858725586614,
"learning_rate": 1.1235955056179775e-06,
"loss": 11.1606,
"step": 6
},
{
"epoch": 0.007878446820483961,
"grad_norm": 54.32209978082585,
"learning_rate": 1.310861423220974e-06,
"loss": 11.0376,
"step": 7
},
{
"epoch": 0.009003939223410242,
"grad_norm": 58.95322793439154,
"learning_rate": 1.4981273408239701e-06,
"loss": 10.7967,
"step": 8
},
{
"epoch": 0.010129431626336522,
"grad_norm": 61.239998437535036,
"learning_rate": 1.6853932584269663e-06,
"loss": 10.6462,
"step": 9
},
{
"epoch": 0.011254924029262802,
"grad_norm": 61.97431021564583,
"learning_rate": 1.8726591760299627e-06,
"loss": 10.7333,
"step": 10
},
{
"epoch": 0.012380416432189083,
"grad_norm": 81.14048682798334,
"learning_rate": 2.0599250936329587e-06,
"loss": 9.5671,
"step": 11
},
{
"epoch": 0.013505908835115363,
"grad_norm": 86.6613892998916,
"learning_rate": 2.247191011235955e-06,
"loss": 9.3288,
"step": 12
},
{
"epoch": 0.014631401238041642,
"grad_norm": 97.56062675082107,
"learning_rate": 2.4344569288389516e-06,
"loss": 8.8569,
"step": 13
},
{
"epoch": 0.015756893640967922,
"grad_norm": 99.72668638976701,
"learning_rate": 2.621722846441948e-06,
"loss": 8.8229,
"step": 14
},
{
"epoch": 0.016882386043894203,
"grad_norm": 68.62441737325906,
"learning_rate": 2.808988764044944e-06,
"loss": 4.106,
"step": 15
},
{
"epoch": 0.018007878446820485,
"grad_norm": 60.37331191202543,
"learning_rate": 2.9962546816479402e-06,
"loss": 3.608,
"step": 16
},
{
"epoch": 0.019133370849746763,
"grad_norm": 48.17406200608835,
"learning_rate": 3.1835205992509364e-06,
"loss": 3.0803,
"step": 17
},
{
"epoch": 0.020258863252673044,
"grad_norm": 36.4342554407753,
"learning_rate": 3.3707865168539327e-06,
"loss": 2.5875,
"step": 18
},
{
"epoch": 0.021384355655599326,
"grad_norm": 31.524091079255932,
"learning_rate": 3.558052434456929e-06,
"loss": 2.33,
"step": 19
},
{
"epoch": 0.022509848058525603,
"grad_norm": 6.74346485385345,
"learning_rate": 3.7453183520599255e-06,
"loss": 1.3777,
"step": 20
},
{
"epoch": 0.023635340461451885,
"grad_norm": 5.1275756648686786,
"learning_rate": 3.932584269662922e-06,
"loss": 1.3449,
"step": 21
},
{
"epoch": 0.024760832864378166,
"grad_norm": 4.059089851221617,
"learning_rate": 4.1198501872659175e-06,
"loss": 1.248,
"step": 22
},
{
"epoch": 0.025886325267304444,
"grad_norm": 3.461699822238443,
"learning_rate": 4.307116104868914e-06,
"loss": 1.2294,
"step": 23
},
{
"epoch": 0.027011817670230726,
"grad_norm": 2.6194967883079197,
"learning_rate": 4.49438202247191e-06,
"loss": 1.1185,
"step": 24
},
{
"epoch": 0.028137310073157007,
"grad_norm": 2.1602870716248015,
"learning_rate": 4.6816479400749066e-06,
"loss": 1.1193,
"step": 25
},
{
"epoch": 0.029262802476083285,
"grad_norm": 1.8135748254982695,
"learning_rate": 4.868913857677903e-06,
"loss": 1.0589,
"step": 26
},
{
"epoch": 0.030388294879009566,
"grad_norm": 1.321133306398482,
"learning_rate": 5.056179775280899e-06,
"loss": 0.9617,
"step": 27
},
{
"epoch": 0.031513787281935844,
"grad_norm": 31.284838936554156,
"learning_rate": 5.243445692883896e-06,
"loss": 0.9937,
"step": 28
},
{
"epoch": 0.032639279684862126,
"grad_norm": 1.8832749187420972,
"learning_rate": 5.430711610486891e-06,
"loss": 0.9144,
"step": 29
},
{
"epoch": 0.03376477208778841,
"grad_norm": 1.4607030575064903,
"learning_rate": 5.617977528089888e-06,
"loss": 0.8985,
"step": 30
},
{
"epoch": 0.03489026449071469,
"grad_norm": 1.0370555540895343,
"learning_rate": 5.805243445692885e-06,
"loss": 0.8404,
"step": 31
},
{
"epoch": 0.03601575689364097,
"grad_norm": 0.9098468238765742,
"learning_rate": 5.9925093632958805e-06,
"loss": 0.8352,
"step": 32
},
{
"epoch": 0.03714124929656725,
"grad_norm": 0.8661052907885602,
"learning_rate": 6.179775280898876e-06,
"loss": 0.8258,
"step": 33
},
{
"epoch": 0.038266741699493526,
"grad_norm": 0.7740808609488935,
"learning_rate": 6.367041198501873e-06,
"loss": 0.8324,
"step": 34
},
{
"epoch": 0.03939223410241981,
"grad_norm": 0.7782713659204045,
"learning_rate": 6.554307116104869e-06,
"loss": 0.7588,
"step": 35
},
{
"epoch": 0.04051772650534609,
"grad_norm": 0.6841020945645767,
"learning_rate": 6.741573033707865e-06,
"loss": 0.7682,
"step": 36
},
{
"epoch": 0.04164321890827237,
"grad_norm": 0.6004121622938939,
"learning_rate": 6.928838951310862e-06,
"loss": 0.7549,
"step": 37
},
{
"epoch": 0.04276871131119865,
"grad_norm": 0.6229597097596257,
"learning_rate": 7.116104868913858e-06,
"loss": 0.7376,
"step": 38
},
{
"epoch": 0.04389420371412493,
"grad_norm": 0.7141033532392286,
"learning_rate": 7.303370786516854e-06,
"loss": 0.7535,
"step": 39
},
{
"epoch": 0.04501969611705121,
"grad_norm": 0.5725153155935927,
"learning_rate": 7.490636704119851e-06,
"loss": 0.7185,
"step": 40
},
{
"epoch": 0.04614518851997749,
"grad_norm": 0.5549438958370185,
"learning_rate": 7.677902621722846e-06,
"loss": 0.7518,
"step": 41
},
{
"epoch": 0.04727068092290377,
"grad_norm": 0.4660101265627369,
"learning_rate": 7.865168539325843e-06,
"loss": 0.6787,
"step": 42
},
{
"epoch": 0.04839617332583005,
"grad_norm": 0.4908539170309294,
"learning_rate": 8.05243445692884e-06,
"loss": 0.7032,
"step": 43
},
{
"epoch": 0.04952166572875633,
"grad_norm": 0.48924522260651016,
"learning_rate": 8.239700374531835e-06,
"loss": 0.6803,
"step": 44
},
{
"epoch": 0.050647158131682614,
"grad_norm": 0.475140896111031,
"learning_rate": 8.426966292134832e-06,
"loss": 0.6475,
"step": 45
},
{
"epoch": 0.05177265053460889,
"grad_norm": 0.4644093059355716,
"learning_rate": 8.614232209737828e-06,
"loss": 0.7013,
"step": 46
},
{
"epoch": 0.05289814293753517,
"grad_norm": 0.40301032630352857,
"learning_rate": 8.801498127340826e-06,
"loss": 0.6463,
"step": 47
},
{
"epoch": 0.05402363534046145,
"grad_norm": 0.43480363638927505,
"learning_rate": 8.98876404494382e-06,
"loss": 0.6775,
"step": 48
},
{
"epoch": 0.05514912774338773,
"grad_norm": 0.43971181451177166,
"learning_rate": 9.176029962546817e-06,
"loss": 0.7007,
"step": 49
},
{
"epoch": 0.056274620146314014,
"grad_norm": 0.41896418177510275,
"learning_rate": 9.363295880149813e-06,
"loss": 0.6468,
"step": 50
},
{
"epoch": 0.057400112549240295,
"grad_norm": 0.4149971177588748,
"learning_rate": 9.550561797752809e-06,
"loss": 0.628,
"step": 51
},
{
"epoch": 0.05852560495216657,
"grad_norm": 0.37242192155253623,
"learning_rate": 9.737827715355806e-06,
"loss": 0.652,
"step": 52
},
{
"epoch": 0.05965109735509285,
"grad_norm": 0.327485240758468,
"learning_rate": 9.925093632958802e-06,
"loss": 0.6334,
"step": 53
},
{
"epoch": 0.06077658975801913,
"grad_norm": 0.36141343502753065,
"learning_rate": 1.0112359550561798e-05,
"loss": 0.6259,
"step": 54
},
{
"epoch": 0.061902082160945414,
"grad_norm": 0.38897211704559004,
"learning_rate": 1.0299625468164795e-05,
"loss": 0.6226,
"step": 55
},
{
"epoch": 0.06302757456387169,
"grad_norm": 0.36207952707026725,
"learning_rate": 1.0486891385767791e-05,
"loss": 0.6289,
"step": 56
},
{
"epoch": 0.06415306696679797,
"grad_norm": 0.28595916020001694,
"learning_rate": 1.0674157303370787e-05,
"loss": 0.6149,
"step": 57
},
{
"epoch": 0.06527855936972425,
"grad_norm": 0.29359683815567633,
"learning_rate": 1.0861423220973783e-05,
"loss": 0.5888,
"step": 58
},
{
"epoch": 0.06640405177265053,
"grad_norm": 0.3228509053817298,
"learning_rate": 1.104868913857678e-05,
"loss": 0.6328,
"step": 59
},
{
"epoch": 0.06752954417557681,
"grad_norm": 0.3068303518794903,
"learning_rate": 1.1235955056179776e-05,
"loss": 0.571,
"step": 60
},
{
"epoch": 0.0686550365785031,
"grad_norm": 0.3231501093567655,
"learning_rate": 1.1423220973782772e-05,
"loss": 0.5728,
"step": 61
},
{
"epoch": 0.06978052898142938,
"grad_norm": 0.2827526067701556,
"learning_rate": 1.161048689138577e-05,
"loss": 0.5919,
"step": 62
},
{
"epoch": 0.07090602138435566,
"grad_norm": 0.3490733036925077,
"learning_rate": 1.1797752808988765e-05,
"loss": 0.6319,
"step": 63
},
{
"epoch": 0.07203151378728194,
"grad_norm": 0.36049201575238243,
"learning_rate": 1.1985018726591761e-05,
"loss": 0.6065,
"step": 64
},
{
"epoch": 0.07315700619020822,
"grad_norm": 0.2817612900392732,
"learning_rate": 1.2172284644194758e-05,
"loss": 0.6022,
"step": 65
},
{
"epoch": 0.0742824985931345,
"grad_norm": 0.27300283931060443,
"learning_rate": 1.2359550561797752e-05,
"loss": 0.5783,
"step": 66
},
{
"epoch": 0.07540799099606077,
"grad_norm": 0.3421112627990278,
"learning_rate": 1.254681647940075e-05,
"loss": 0.576,
"step": 67
},
{
"epoch": 0.07653348339898705,
"grad_norm": 0.33598705329341366,
"learning_rate": 1.2734082397003746e-05,
"loss": 0.5835,
"step": 68
},
{
"epoch": 0.07765897580191333,
"grad_norm": 0.27960476280957486,
"learning_rate": 1.2921348314606743e-05,
"loss": 0.5987,
"step": 69
},
{
"epoch": 0.07878446820483961,
"grad_norm": 0.2965350125129841,
"learning_rate": 1.3108614232209737e-05,
"loss": 0.6026,
"step": 70
},
{
"epoch": 0.0799099606077659,
"grad_norm": 0.3122772390396813,
"learning_rate": 1.3295880149812733e-05,
"loss": 0.574,
"step": 71
},
{
"epoch": 0.08103545301069218,
"grad_norm": 0.3021816040434541,
"learning_rate": 1.348314606741573e-05,
"loss": 0.5771,
"step": 72
},
{
"epoch": 0.08216094541361846,
"grad_norm": 0.2831578746374877,
"learning_rate": 1.3670411985018728e-05,
"loss": 0.5675,
"step": 73
},
{
"epoch": 0.08328643781654474,
"grad_norm": 0.32441513984635667,
"learning_rate": 1.3857677902621724e-05,
"loss": 0.5652,
"step": 74
},
{
"epoch": 0.08441193021947102,
"grad_norm": 0.31509832756589373,
"learning_rate": 1.4044943820224721e-05,
"loss": 0.5725,
"step": 75
},
{
"epoch": 0.0855374226223973,
"grad_norm": 0.3068003737845105,
"learning_rate": 1.4232209737827715e-05,
"loss": 0.5921,
"step": 76
},
{
"epoch": 0.08666291502532358,
"grad_norm": 0.28569121242288503,
"learning_rate": 1.4419475655430711e-05,
"loss": 0.5517,
"step": 77
},
{
"epoch": 0.08778840742824986,
"grad_norm": 0.30318713510099926,
"learning_rate": 1.4606741573033709e-05,
"loss": 0.5786,
"step": 78
},
{
"epoch": 0.08891389983117615,
"grad_norm": 0.32791686866753017,
"learning_rate": 1.4794007490636705e-05,
"loss": 0.5835,
"step": 79
},
{
"epoch": 0.09003939223410241,
"grad_norm": 0.34541735995694495,
"learning_rate": 1.4981273408239702e-05,
"loss": 0.6003,
"step": 80
},
{
"epoch": 0.0911648846370287,
"grad_norm": 0.24219057822403553,
"learning_rate": 1.5168539325842698e-05,
"loss": 0.5634,
"step": 81
},
{
"epoch": 0.09229037703995498,
"grad_norm": 0.3066124460269189,
"learning_rate": 1.5355805243445692e-05,
"loss": 0.5385,
"step": 82
},
{
"epoch": 0.09341586944288126,
"grad_norm": 0.36004311246679105,
"learning_rate": 1.554307116104869e-05,
"loss": 0.542,
"step": 83
},
{
"epoch": 0.09454136184580754,
"grad_norm": 0.277294813524559,
"learning_rate": 1.5730337078651687e-05,
"loss": 0.5467,
"step": 84
},
{
"epoch": 0.09566685424873382,
"grad_norm": 0.2742529403377881,
"learning_rate": 1.591760299625468e-05,
"loss": 0.5337,
"step": 85
},
{
"epoch": 0.0967923466516601,
"grad_norm": 0.37776459034853405,
"learning_rate": 1.610486891385768e-05,
"loss": 0.5392,
"step": 86
},
{
"epoch": 0.09791783905458638,
"grad_norm": 0.29713498839858976,
"learning_rate": 1.6292134831460676e-05,
"loss": 0.5513,
"step": 87
},
{
"epoch": 0.09904333145751266,
"grad_norm": 0.2677802103514856,
"learning_rate": 1.647940074906367e-05,
"loss": 0.5435,
"step": 88
},
{
"epoch": 0.10016882386043895,
"grad_norm": 0.3282651538789268,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.5556,
"step": 89
},
{
"epoch": 0.10129431626336523,
"grad_norm": 0.2903898300830952,
"learning_rate": 1.6853932584269665e-05,
"loss": 0.5182,
"step": 90
},
{
"epoch": 0.10241980866629151,
"grad_norm": 0.32940772248776146,
"learning_rate": 1.704119850187266e-05,
"loss": 0.5651,
"step": 91
},
{
"epoch": 0.10354530106921778,
"grad_norm": 0.29877064796568714,
"learning_rate": 1.7228464419475657e-05,
"loss": 0.5232,
"step": 92
},
{
"epoch": 0.10467079347214406,
"grad_norm": 0.3033306544759112,
"learning_rate": 1.7415730337078654e-05,
"loss": 0.5415,
"step": 93
},
{
"epoch": 0.10579628587507034,
"grad_norm": 0.298699393195244,
"learning_rate": 1.760299625468165e-05,
"loss": 0.5351,
"step": 94
},
{
"epoch": 0.10692177827799662,
"grad_norm": 0.27344653088956217,
"learning_rate": 1.7790262172284646e-05,
"loss": 0.5401,
"step": 95
},
{
"epoch": 0.1080472706809229,
"grad_norm": 0.2901283593528549,
"learning_rate": 1.797752808988764e-05,
"loss": 0.5548,
"step": 96
},
{
"epoch": 0.10917276308384918,
"grad_norm": 0.2955399438690073,
"learning_rate": 1.8164794007490637e-05,
"loss": 0.5336,
"step": 97
},
{
"epoch": 0.11029825548677546,
"grad_norm": 0.3044365394746884,
"learning_rate": 1.8352059925093635e-05,
"loss": 0.5095,
"step": 98
},
{
"epoch": 0.11142374788970175,
"grad_norm": 0.26929920330702195,
"learning_rate": 1.8539325842696632e-05,
"loss": 0.5569,
"step": 99
},
{
"epoch": 0.11254924029262803,
"grad_norm": 0.36727845819131605,
"learning_rate": 1.8726591760299626e-05,
"loss": 0.5818,
"step": 100
},
{
"epoch": 0.11367473269555431,
"grad_norm": 0.2836581651837986,
"learning_rate": 1.891385767790262e-05,
"loss": 0.5373,
"step": 101
},
{
"epoch": 0.11480022509848059,
"grad_norm": 0.29593257115280464,
"learning_rate": 1.9101123595505618e-05,
"loss": 0.5131,
"step": 102
},
{
"epoch": 0.11592571750140687,
"grad_norm": 0.29964886160890525,
"learning_rate": 1.9288389513108615e-05,
"loss": 0.5044,
"step": 103
},
{
"epoch": 0.11705120990433314,
"grad_norm": 0.30009105696644967,
"learning_rate": 1.9475655430711613e-05,
"loss": 0.536,
"step": 104
},
{
"epoch": 0.11817670230725942,
"grad_norm": 0.29291717707624504,
"learning_rate": 1.9662921348314607e-05,
"loss": 0.5505,
"step": 105
},
{
"epoch": 0.1193021947101857,
"grad_norm": 0.3294836067555843,
"learning_rate": 1.9850187265917604e-05,
"loss": 0.5505,
"step": 106
},
{
"epoch": 0.12042768711311198,
"grad_norm": 0.29401137621422074,
"learning_rate": 2.00374531835206e-05,
"loss": 0.528,
"step": 107
},
{
"epoch": 0.12155317951603826,
"grad_norm": 0.3030811720009754,
"learning_rate": 2.0224719101123596e-05,
"loss": 0.538,
"step": 108
},
{
"epoch": 0.12267867191896455,
"grad_norm": 0.32674282662604665,
"learning_rate": 2.0411985018726593e-05,
"loss": 0.541,
"step": 109
},
{
"epoch": 0.12380416432189083,
"grad_norm": 0.30319983351235286,
"learning_rate": 2.059925093632959e-05,
"loss": 0.504,
"step": 110
},
{
"epoch": 0.12492965672481711,
"grad_norm": 0.3402565154469349,
"learning_rate": 2.0786516853932585e-05,
"loss": 0.5251,
"step": 111
},
{
"epoch": 0.12605514912774338,
"grad_norm": 0.31872601282001034,
"learning_rate": 2.0973782771535582e-05,
"loss": 0.5286,
"step": 112
},
{
"epoch": 0.12718064153066966,
"grad_norm": 0.34754536732763297,
"learning_rate": 2.1161048689138577e-05,
"loss": 0.5235,
"step": 113
},
{
"epoch": 0.12830613393359594,
"grad_norm": 0.30998860710868686,
"learning_rate": 2.1348314606741574e-05,
"loss": 0.525,
"step": 114
},
{
"epoch": 0.12943162633652222,
"grad_norm": 0.32990918540472725,
"learning_rate": 2.153558052434457e-05,
"loss": 0.5265,
"step": 115
},
{
"epoch": 0.1305571187394485,
"grad_norm": 0.3423710738146026,
"learning_rate": 2.1722846441947566e-05,
"loss": 0.5338,
"step": 116
},
{
"epoch": 0.13168261114237478,
"grad_norm": 0.2872199647047314,
"learning_rate": 2.1910112359550563e-05,
"loss": 0.5299,
"step": 117
},
{
"epoch": 0.13280810354530106,
"grad_norm": 0.3317448545714626,
"learning_rate": 2.209737827715356e-05,
"loss": 0.4959,
"step": 118
},
{
"epoch": 0.13393359594822735,
"grad_norm": 0.31417498563521173,
"learning_rate": 2.2284644194756555e-05,
"loss": 0.52,
"step": 119
},
{
"epoch": 0.13505908835115363,
"grad_norm": 0.3645759776734259,
"learning_rate": 2.2471910112359552e-05,
"loss": 0.5296,
"step": 120
},
{
"epoch": 0.1361845807540799,
"grad_norm": 0.3180662213331512,
"learning_rate": 2.2659176029962546e-05,
"loss": 0.5063,
"step": 121
},
{
"epoch": 0.1373100731570062,
"grad_norm": 0.3716923342200342,
"learning_rate": 2.2846441947565544e-05,
"loss": 0.5046,
"step": 122
},
{
"epoch": 0.13843556555993247,
"grad_norm": 0.39150702044794555,
"learning_rate": 2.303370786516854e-05,
"loss": 0.4959,
"step": 123
},
{
"epoch": 0.13956105796285875,
"grad_norm": 0.3713739740316023,
"learning_rate": 2.322097378277154e-05,
"loss": 0.5015,
"step": 124
},
{
"epoch": 0.14068655036578503,
"grad_norm": 0.355150041365192,
"learning_rate": 2.3408239700374533e-05,
"loss": 0.5029,
"step": 125
},
{
"epoch": 0.14181204276871132,
"grad_norm": 0.47357406433732624,
"learning_rate": 2.359550561797753e-05,
"loss": 0.519,
"step": 126
},
{
"epoch": 0.1429375351716376,
"grad_norm": 0.35841513558308474,
"learning_rate": 2.3782771535580524e-05,
"loss": 0.517,
"step": 127
},
{
"epoch": 0.14406302757456388,
"grad_norm": 0.32127121635614614,
"learning_rate": 2.3970037453183522e-05,
"loss": 0.5068,
"step": 128
},
{
"epoch": 0.14518851997749016,
"grad_norm": 0.41380038534756397,
"learning_rate": 2.415730337078652e-05,
"loss": 0.53,
"step": 129
},
{
"epoch": 0.14631401238041644,
"grad_norm": 0.3342860962607464,
"learning_rate": 2.4344569288389517e-05,
"loss": 0.5131,
"step": 130
},
{
"epoch": 0.14743950478334272,
"grad_norm": 0.328086226882181,
"learning_rate": 2.453183520599251e-05,
"loss": 0.5359,
"step": 131
},
{
"epoch": 0.148564997186269,
"grad_norm": 0.3980527154392636,
"learning_rate": 2.4719101123595505e-05,
"loss": 0.4915,
"step": 132
},
{
"epoch": 0.14969048958919529,
"grad_norm": 0.3664150255854856,
"learning_rate": 2.4906367041198502e-05,
"loss": 0.5239,
"step": 133
},
{
"epoch": 0.15081598199212154,
"grad_norm": 0.36032405515932203,
"learning_rate": 2.50936329588015e-05,
"loss": 0.5085,
"step": 134
},
{
"epoch": 0.15194147439504782,
"grad_norm": 0.4406027959320581,
"learning_rate": 2.5280898876404497e-05,
"loss": 0.5126,
"step": 135
},
{
"epoch": 0.1530669667979741,
"grad_norm": 0.344695754779841,
"learning_rate": 2.546816479400749e-05,
"loss": 0.5122,
"step": 136
},
{
"epoch": 0.15419245920090038,
"grad_norm": 0.3726483933183008,
"learning_rate": 2.565543071161049e-05,
"loss": 0.4905,
"step": 137
},
{
"epoch": 0.15531795160382666,
"grad_norm": 0.3449312763960655,
"learning_rate": 2.5842696629213486e-05,
"loss": 0.4987,
"step": 138
},
{
"epoch": 0.15644344400675295,
"grad_norm": 0.35328970504291957,
"learning_rate": 2.6029962546816484e-05,
"loss": 0.5054,
"step": 139
},
{
"epoch": 0.15756893640967923,
"grad_norm": 0.3700337092111675,
"learning_rate": 2.6217228464419475e-05,
"loss": 0.509,
"step": 140
},
{
"epoch": 0.1586944288126055,
"grad_norm": 0.301320056673764,
"learning_rate": 2.6404494382022472e-05,
"loss": 0.4958,
"step": 141
},
{
"epoch": 0.1598199212155318,
"grad_norm": 0.4191378953980472,
"learning_rate": 2.6591760299625466e-05,
"loss": 0.5387,
"step": 142
},
{
"epoch": 0.16094541361845807,
"grad_norm": 0.3880541184543602,
"learning_rate": 2.6779026217228464e-05,
"loss": 0.5227,
"step": 143
},
{
"epoch": 0.16207090602138435,
"grad_norm": 0.39927231059272483,
"learning_rate": 2.696629213483146e-05,
"loss": 0.5237,
"step": 144
},
{
"epoch": 0.16319639842431063,
"grad_norm": 0.3961271339819255,
"learning_rate": 2.715355805243446e-05,
"loss": 0.5194,
"step": 145
},
{
"epoch": 0.16432189082723692,
"grad_norm": 0.4376696251019293,
"learning_rate": 2.7340823970037456e-05,
"loss": 0.5178,
"step": 146
},
{
"epoch": 0.1654473832301632,
"grad_norm": 0.44058938921182966,
"learning_rate": 2.752808988764045e-05,
"loss": 0.4998,
"step": 147
},
{
"epoch": 0.16657287563308948,
"grad_norm": 0.35261095257281155,
"learning_rate": 2.7715355805243448e-05,
"loss": 0.499,
"step": 148
},
{
"epoch": 0.16769836803601576,
"grad_norm": 0.5218533410763981,
"learning_rate": 2.7902621722846445e-05,
"loss": 0.5273,
"step": 149
},
{
"epoch": 0.16882386043894204,
"grad_norm": 0.4737891842741366,
"learning_rate": 2.8089887640449443e-05,
"loss": 0.5003,
"step": 150
},
{
"epoch": 0.16994935284186832,
"grad_norm": 0.392922001496729,
"learning_rate": 2.8277153558052437e-05,
"loss": 0.5016,
"step": 151
},
{
"epoch": 0.1710748452447946,
"grad_norm": 0.5302514501231146,
"learning_rate": 2.846441947565543e-05,
"loss": 0.5172,
"step": 152
},
{
"epoch": 0.17220033764772089,
"grad_norm": 0.49803115823639127,
"learning_rate": 2.8651685393258425e-05,
"loss": 0.4946,
"step": 153
},
{
"epoch": 0.17332583005064717,
"grad_norm": 0.4128451133760804,
"learning_rate": 2.8838951310861422e-05,
"loss": 0.5232,
"step": 154
},
{
"epoch": 0.17445132245357345,
"grad_norm": 0.6316627266885098,
"learning_rate": 2.902621722846442e-05,
"loss": 0.5059,
"step": 155
},
{
"epoch": 0.17557681485649973,
"grad_norm": 0.5295204042861669,
"learning_rate": 2.9213483146067417e-05,
"loss": 0.5243,
"step": 156
},
{
"epoch": 0.176702307259426,
"grad_norm": 0.45607245497823934,
"learning_rate": 2.940074906367041e-05,
"loss": 0.4821,
"step": 157
},
{
"epoch": 0.1778277996623523,
"grad_norm": 0.6021144875229769,
"learning_rate": 2.958801498127341e-05,
"loss": 0.5103,
"step": 158
},
{
"epoch": 0.17895329206527855,
"grad_norm": 0.48529780373586173,
"learning_rate": 2.9775280898876406e-05,
"loss": 0.4922,
"step": 159
},
{
"epoch": 0.18007878446820483,
"grad_norm": 0.4250055623471545,
"learning_rate": 2.9962546816479404e-05,
"loss": 0.4904,
"step": 160
},
{
"epoch": 0.1812042768711311,
"grad_norm": 0.6512919492582171,
"learning_rate": 3.01498127340824e-05,
"loss": 0.5145,
"step": 161
},
{
"epoch": 0.1823297692740574,
"grad_norm": 0.45356537836570343,
"learning_rate": 3.0337078651685396e-05,
"loss": 0.489,
"step": 162
},
{
"epoch": 0.18345526167698367,
"grad_norm": 0.4587778769232854,
"learning_rate": 3.052434456928839e-05,
"loss": 0.5031,
"step": 163
},
{
"epoch": 0.18458075407990995,
"grad_norm": 0.5209259547751122,
"learning_rate": 3.0711610486891384e-05,
"loss": 0.5122,
"step": 164
},
{
"epoch": 0.18570624648283623,
"grad_norm": 0.3205075873383086,
"learning_rate": 3.089887640449438e-05,
"loss": 0.484,
"step": 165
},
{
"epoch": 0.18683173888576252,
"grad_norm": 0.44421922243323253,
"learning_rate": 3.108614232209738e-05,
"loss": 0.4885,
"step": 166
},
{
"epoch": 0.1879572312886888,
"grad_norm": 0.38376560722257824,
"learning_rate": 3.1273408239700376e-05,
"loss": 0.5137,
"step": 167
},
{
"epoch": 0.18908272369161508,
"grad_norm": 0.33178548545336345,
"learning_rate": 3.1460674157303374e-05,
"loss": 0.5214,
"step": 168
},
{
"epoch": 0.19020821609454136,
"grad_norm": 0.3543354285220051,
"learning_rate": 3.164794007490637e-05,
"loss": 0.4652,
"step": 169
},
{
"epoch": 0.19133370849746764,
"grad_norm": 0.34821873695435235,
"learning_rate": 3.183520599250936e-05,
"loss": 0.4695,
"step": 170
},
{
"epoch": 0.19245920090039392,
"grad_norm": 0.346452239854666,
"learning_rate": 3.202247191011236e-05,
"loss": 0.4891,
"step": 171
},
{
"epoch": 0.1935846933033202,
"grad_norm": 0.4398933317388218,
"learning_rate": 3.220973782771536e-05,
"loss": 0.4911,
"step": 172
},
{
"epoch": 0.19471018570624649,
"grad_norm": 0.3624677233826849,
"learning_rate": 3.2397003745318354e-05,
"loss": 0.4912,
"step": 173
},
{
"epoch": 0.19583567810917277,
"grad_norm": 0.3699640798637125,
"learning_rate": 3.258426966292135e-05,
"loss": 0.5004,
"step": 174
},
{
"epoch": 0.19696117051209905,
"grad_norm": 0.41958584077529965,
"learning_rate": 3.277153558052435e-05,
"loss": 0.4752,
"step": 175
},
{
"epoch": 0.19808666291502533,
"grad_norm": 0.42502324118725465,
"learning_rate": 3.295880149812734e-05,
"loss": 0.4922,
"step": 176
},
{
"epoch": 0.1992121553179516,
"grad_norm": 0.36517865445954867,
"learning_rate": 3.314606741573034e-05,
"loss": 0.5048,
"step": 177
},
{
"epoch": 0.2003376477208779,
"grad_norm": 0.41574946856579004,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.4821,
"step": 178
},
{
"epoch": 0.20146314012380417,
"grad_norm": 0.4378349227779501,
"learning_rate": 3.352059925093633e-05,
"loss": 0.4933,
"step": 179
},
{
"epoch": 0.20258863252673046,
"grad_norm": 0.4776232193190497,
"learning_rate": 3.370786516853933e-05,
"loss": 0.4751,
"step": 180
},
{
"epoch": 0.20371412492965674,
"grad_norm": 0.43848154415790724,
"learning_rate": 3.389513108614232e-05,
"loss": 0.4807,
"step": 181
},
{
"epoch": 0.20483961733258302,
"grad_norm": 0.5845165253893854,
"learning_rate": 3.408239700374532e-05,
"loss": 0.5009,
"step": 182
},
{
"epoch": 0.20596510973550927,
"grad_norm": 0.4082870415882244,
"learning_rate": 3.4269662921348316e-05,
"loss": 0.4979,
"step": 183
},
{
"epoch": 0.20709060213843555,
"grad_norm": 0.4718495231442118,
"learning_rate": 3.445692883895131e-05,
"loss": 0.5038,
"step": 184
},
{
"epoch": 0.20821609454136183,
"grad_norm": 0.473455230356082,
"learning_rate": 3.464419475655431e-05,
"loss": 0.4833,
"step": 185
},
{
"epoch": 0.20934158694428812,
"grad_norm": 0.46125301819415737,
"learning_rate": 3.483146067415731e-05,
"loss": 0.4889,
"step": 186
},
{
"epoch": 0.2104670793472144,
"grad_norm": 0.4432990112364735,
"learning_rate": 3.5018726591760305e-05,
"loss": 0.4807,
"step": 187
},
{
"epoch": 0.21159257175014068,
"grad_norm": 0.5322159815450351,
"learning_rate": 3.52059925093633e-05,
"loss": 0.5219,
"step": 188
},
{
"epoch": 0.21271806415306696,
"grad_norm": 0.3936101857237881,
"learning_rate": 3.5393258426966294e-05,
"loss": 0.4922,
"step": 189
},
{
"epoch": 0.21384355655599324,
"grad_norm": 0.53345431181708,
"learning_rate": 3.558052434456929e-05,
"loss": 0.4897,
"step": 190
},
{
"epoch": 0.21496904895891952,
"grad_norm": 0.6142180117371377,
"learning_rate": 3.576779026217228e-05,
"loss": 0.493,
"step": 191
},
{
"epoch": 0.2160945413618458,
"grad_norm": 0.41631870227046885,
"learning_rate": 3.595505617977528e-05,
"loss": 0.4893,
"step": 192
},
{
"epoch": 0.21722003376477209,
"grad_norm": 0.4305104245523169,
"learning_rate": 3.614232209737828e-05,
"loss": 0.4866,
"step": 193
},
{
"epoch": 0.21834552616769837,
"grad_norm": 0.5169903617970245,
"learning_rate": 3.6329588014981274e-05,
"loss": 0.4901,
"step": 194
},
{
"epoch": 0.21947101857062465,
"grad_norm": 0.3860200825591215,
"learning_rate": 3.651685393258427e-05,
"loss": 0.4549,
"step": 195
},
{
"epoch": 0.22059651097355093,
"grad_norm": 0.5230520554579277,
"learning_rate": 3.670411985018727e-05,
"loss": 0.4724,
"step": 196
},
{
"epoch": 0.2217220033764772,
"grad_norm": 0.39548431249473126,
"learning_rate": 3.689138576779027e-05,
"loss": 0.4911,
"step": 197
},
{
"epoch": 0.2228474957794035,
"grad_norm": 0.48800271319592975,
"learning_rate": 3.7078651685393264e-05,
"loss": 0.4572,
"step": 198
},
{
"epoch": 0.22397298818232977,
"grad_norm": 0.41978987611240903,
"learning_rate": 3.726591760299626e-05,
"loss": 0.4829,
"step": 199
},
{
"epoch": 0.22509848058525606,
"grad_norm": 0.5469472170008755,
"learning_rate": 3.745318352059925e-05,
"loss": 0.5274,
"step": 200
},
{
"epoch": 0.22622397298818234,
"grad_norm": 0.3918679709299485,
"learning_rate": 3.764044943820225e-05,
"loss": 0.4766,
"step": 201
},
{
"epoch": 0.22734946539110862,
"grad_norm": 0.4611366578168398,
"learning_rate": 3.782771535580524e-05,
"loss": 0.4533,
"step": 202
},
{
"epoch": 0.2284749577940349,
"grad_norm": 0.35525042824778863,
"learning_rate": 3.801498127340824e-05,
"loss": 0.4801,
"step": 203
},
{
"epoch": 0.22960045019696118,
"grad_norm": 0.39795327337608555,
"learning_rate": 3.8202247191011236e-05,
"loss": 0.4796,
"step": 204
},
{
"epoch": 0.23072594259988746,
"grad_norm": 0.40314796565206873,
"learning_rate": 3.838951310861423e-05,
"loss": 0.4746,
"step": 205
},
{
"epoch": 0.23185143500281374,
"grad_norm": 0.6186856651894296,
"learning_rate": 3.857677902621723e-05,
"loss": 0.483,
"step": 206
},
{
"epoch": 0.23297692740574,
"grad_norm": 0.6262230101782875,
"learning_rate": 3.876404494382023e-05,
"loss": 0.5026,
"step": 207
},
{
"epoch": 0.23410241980866628,
"grad_norm": 0.6885063622065476,
"learning_rate": 3.8951310861423226e-05,
"loss": 0.4961,
"step": 208
},
{
"epoch": 0.23522791221159256,
"grad_norm": 0.46434262483818484,
"learning_rate": 3.913857677902622e-05,
"loss": 0.4824,
"step": 209
},
{
"epoch": 0.23635340461451884,
"grad_norm": 0.5876521011749303,
"learning_rate": 3.9325842696629214e-05,
"loss": 0.4799,
"step": 210
},
{
"epoch": 0.23747889701744512,
"grad_norm": 0.5679577524617186,
"learning_rate": 3.951310861423221e-05,
"loss": 0.4976,
"step": 211
},
{
"epoch": 0.2386043894203714,
"grad_norm": 0.4948818608996542,
"learning_rate": 3.970037453183521e-05,
"loss": 0.4689,
"step": 212
},
{
"epoch": 0.23972988182329769,
"grad_norm": 0.5366944392366912,
"learning_rate": 3.98876404494382e-05,
"loss": 0.5052,
"step": 213
},
{
"epoch": 0.24085537422622397,
"grad_norm": 0.46091893449282645,
"learning_rate": 4.00749063670412e-05,
"loss": 0.504,
"step": 214
},
{
"epoch": 0.24198086662915025,
"grad_norm": 0.6227113043840353,
"learning_rate": 4.0262172284644194e-05,
"loss": 0.4947,
"step": 215
},
{
"epoch": 0.24310635903207653,
"grad_norm": 0.37975248168226977,
"learning_rate": 4.044943820224719e-05,
"loss": 0.4681,
"step": 216
},
{
"epoch": 0.2442318514350028,
"grad_norm": 0.6602796166859184,
"learning_rate": 4.063670411985019e-05,
"loss": 0.4809,
"step": 217
},
{
"epoch": 0.2453573438379291,
"grad_norm": 0.46707379726848597,
"learning_rate": 4.082397003745319e-05,
"loss": 0.4586,
"step": 218
},
{
"epoch": 0.24648283624085537,
"grad_norm": 0.58153678436466,
"learning_rate": 4.1011235955056184e-05,
"loss": 0.508,
"step": 219
},
{
"epoch": 0.24760832864378166,
"grad_norm": 0.6402167998756934,
"learning_rate": 4.119850187265918e-05,
"loss": 0.5039,
"step": 220
},
{
"epoch": 0.24873382104670794,
"grad_norm": 0.5794603595581886,
"learning_rate": 4.138576779026217e-05,
"loss": 0.4653,
"step": 221
},
{
"epoch": 0.24985931344963422,
"grad_norm": 0.5230659913502629,
"learning_rate": 4.157303370786517e-05,
"loss": 0.5038,
"step": 222
},
{
"epoch": 0.2509848058525605,
"grad_norm": 0.6799656883961334,
"learning_rate": 4.176029962546817e-05,
"loss": 0.4898,
"step": 223
},
{
"epoch": 0.25211029825548675,
"grad_norm": 0.6621661466046944,
"learning_rate": 4.1947565543071165e-05,
"loss": 0.4844,
"step": 224
},
{
"epoch": 0.25323579065841306,
"grad_norm": 0.5616167950816823,
"learning_rate": 4.2134831460674156e-05,
"loss": 0.488,
"step": 225
},
{
"epoch": 0.2543612830613393,
"grad_norm": 0.6575771702191943,
"learning_rate": 4.232209737827715e-05,
"loss": 0.4791,
"step": 226
},
{
"epoch": 0.2554867754642656,
"grad_norm": 0.5488369093936996,
"learning_rate": 4.250936329588015e-05,
"loss": 0.5005,
"step": 227
},
{
"epoch": 0.2566122678671919,
"grad_norm": 0.6144786136277036,
"learning_rate": 4.269662921348315e-05,
"loss": 0.4663,
"step": 228
},
{
"epoch": 0.2577377602701182,
"grad_norm": 0.600777544617447,
"learning_rate": 4.2883895131086146e-05,
"loss": 0.4786,
"step": 229
},
{
"epoch": 0.25886325267304444,
"grad_norm": 0.8023306491917337,
"learning_rate": 4.307116104868914e-05,
"loss": 0.4861,
"step": 230
},
{
"epoch": 0.25998874507597075,
"grad_norm": 0.5046450430408941,
"learning_rate": 4.3258426966292134e-05,
"loss": 0.4727,
"step": 231
},
{
"epoch": 0.261114237478897,
"grad_norm": 0.7089229481882726,
"learning_rate": 4.344569288389513e-05,
"loss": 0.4907,
"step": 232
},
{
"epoch": 0.2622397298818233,
"grad_norm": 0.7578774630778147,
"learning_rate": 4.363295880149813e-05,
"loss": 0.4568,
"step": 233
},
{
"epoch": 0.26336522228474957,
"grad_norm": 0.43469225376554593,
"learning_rate": 4.3820224719101126e-05,
"loss": 0.4882,
"step": 234
},
{
"epoch": 0.2644907146876759,
"grad_norm": 0.7064451875198454,
"learning_rate": 4.4007490636704124e-05,
"loss": 0.4777,
"step": 235
},
{
"epoch": 0.26561620709060213,
"grad_norm": 0.5236443057755517,
"learning_rate": 4.419475655430712e-05,
"loss": 0.4541,
"step": 236
},
{
"epoch": 0.26674169949352844,
"grad_norm": 0.49987222413307647,
"learning_rate": 4.438202247191011e-05,
"loss": 0.4813,
"step": 237
},
{
"epoch": 0.2678671918964547,
"grad_norm": 0.4887524110978959,
"learning_rate": 4.456928838951311e-05,
"loss": 0.4904,
"step": 238
},
{
"epoch": 0.268992684299381,
"grad_norm": 0.43081504689209343,
"learning_rate": 4.475655430711611e-05,
"loss": 0.4828,
"step": 239
},
{
"epoch": 0.27011817670230726,
"grad_norm": 0.4927726763052006,
"learning_rate": 4.4943820224719104e-05,
"loss": 0.4661,
"step": 240
},
{
"epoch": 0.27124366910523356,
"grad_norm": 0.48347879754332407,
"learning_rate": 4.51310861423221e-05,
"loss": 0.4698,
"step": 241
},
{
"epoch": 0.2723691615081598,
"grad_norm": 0.4867564616696551,
"learning_rate": 4.531835205992509e-05,
"loss": 0.49,
"step": 242
},
{
"epoch": 0.2734946539110861,
"grad_norm": 0.5374635091852026,
"learning_rate": 4.550561797752809e-05,
"loss": 0.4559,
"step": 243
},
{
"epoch": 0.2746201463140124,
"grad_norm": 0.4772645954109197,
"learning_rate": 4.569288389513109e-05,
"loss": 0.4717,
"step": 244
},
{
"epoch": 0.27574563871693863,
"grad_norm": 0.4347548915476938,
"learning_rate": 4.5880149812734085e-05,
"loss": 0.4532,
"step": 245
},
{
"epoch": 0.27687113111986494,
"grad_norm": 0.4759288672038084,
"learning_rate": 4.606741573033708e-05,
"loss": 0.4945,
"step": 246
},
{
"epoch": 0.2779966235227912,
"grad_norm": 0.5792894788506712,
"learning_rate": 4.625468164794008e-05,
"loss": 0.4983,
"step": 247
},
{
"epoch": 0.2791221159257175,
"grad_norm": 0.3980178738338264,
"learning_rate": 4.644194756554308e-05,
"loss": 0.4827,
"step": 248
},
{
"epoch": 0.28024760832864376,
"grad_norm": 0.5408869118408165,
"learning_rate": 4.662921348314607e-05,
"loss": 0.4528,
"step": 249
},
{
"epoch": 0.28137310073157007,
"grad_norm": 0.5689847204055498,
"learning_rate": 4.6816479400749066e-05,
"loss": 0.4598,
"step": 250
},
{
"epoch": 0.2824985931344963,
"grad_norm": 0.5617698646408457,
"learning_rate": 4.700374531835206e-05,
"loss": 0.5022,
"step": 251
},
{
"epoch": 0.28362408553742263,
"grad_norm": 0.45230255865587565,
"learning_rate": 4.719101123595506e-05,
"loss": 0.47,
"step": 252
},
{
"epoch": 0.2847495779403489,
"grad_norm": 0.42374590684633967,
"learning_rate": 4.737827715355805e-05,
"loss": 0.466,
"step": 253
},
{
"epoch": 0.2858750703432752,
"grad_norm": 0.4180084231861174,
"learning_rate": 4.756554307116105e-05,
"loss": 0.4747,
"step": 254
},
{
"epoch": 0.28700056274620145,
"grad_norm": 0.4787386782007702,
"learning_rate": 4.7752808988764046e-05,
"loss": 0.4611,
"step": 255
},
{
"epoch": 0.28812605514912776,
"grad_norm": 0.40084601864669134,
"learning_rate": 4.7940074906367044e-05,
"loss": 0.4577,
"step": 256
},
{
"epoch": 0.289251547552054,
"grad_norm": 0.5597034903024327,
"learning_rate": 4.812734082397004e-05,
"loss": 0.4843,
"step": 257
},
{
"epoch": 0.2903770399549803,
"grad_norm": 0.4389515417232518,
"learning_rate": 4.831460674157304e-05,
"loss": 0.4667,
"step": 258
},
{
"epoch": 0.2915025323579066,
"grad_norm": 0.5153267551952044,
"learning_rate": 4.8501872659176036e-05,
"loss": 0.4543,
"step": 259
},
{
"epoch": 0.2926280247608329,
"grad_norm": 0.4414645886637002,
"learning_rate": 4.8689138576779034e-05,
"loss": 0.4687,
"step": 260
},
{
"epoch": 0.29375351716375914,
"grad_norm": 0.5441323388581608,
"learning_rate": 4.8876404494382024e-05,
"loss": 0.4904,
"step": 261
},
{
"epoch": 0.29487900956668545,
"grad_norm": 0.47357188882841866,
"learning_rate": 4.906367041198502e-05,
"loss": 0.4646,
"step": 262
},
{
"epoch": 0.2960045019696117,
"grad_norm": 0.43582942744547837,
"learning_rate": 4.925093632958801e-05,
"loss": 0.4747,
"step": 263
},
{
"epoch": 0.297129994372538,
"grad_norm": 0.4659511261837298,
"learning_rate": 4.943820224719101e-05,
"loss": 0.4613,
"step": 264
},
{
"epoch": 0.29825548677546426,
"grad_norm": 0.4561502948161637,
"learning_rate": 4.962546816479401e-05,
"loss": 0.4691,
"step": 265
},
{
"epoch": 0.29938097917839057,
"grad_norm": 0.4541481932977169,
"learning_rate": 4.9812734082397005e-05,
"loss": 0.4661,
"step": 266
},
{
"epoch": 0.3005064715813168,
"grad_norm": 0.47547350717861037,
"learning_rate": 5e-05,
"loss": 0.4678,
"step": 267
},
{
"epoch": 0.3016319639842431,
"grad_norm": 0.406819603933647,
"learning_rate": 4.997914059240717e-05,
"loss": 0.4381,
"step": 268
},
{
"epoch": 0.3027574563871694,
"grad_norm": 0.4460099057965906,
"learning_rate": 4.9958281184814356e-05,
"loss": 0.468,
"step": 269
},
{
"epoch": 0.30388294879009564,
"grad_norm": 0.5142189061381415,
"learning_rate": 4.9937421777221527e-05,
"loss": 0.4714,
"step": 270
},
{
"epoch": 0.30500844119302195,
"grad_norm": 0.5665935978369622,
"learning_rate": 4.9916562369628704e-05,
"loss": 0.4818,
"step": 271
},
{
"epoch": 0.3061339335959482,
"grad_norm": 0.47186870063336683,
"learning_rate": 4.989570296203588e-05,
"loss": 0.4793,
"step": 272
},
{
"epoch": 0.3072594259988745,
"grad_norm": 0.457167782453592,
"learning_rate": 4.987484355444306e-05,
"loss": 0.4404,
"step": 273
},
{
"epoch": 0.30838491840180077,
"grad_norm": 0.5127134764884651,
"learning_rate": 4.985398414685023e-05,
"loss": 0.465,
"step": 274
},
{
"epoch": 0.3095104108047271,
"grad_norm": 0.6532795598287193,
"learning_rate": 4.983312473925741e-05,
"loss": 0.518,
"step": 275
},
{
"epoch": 0.31063590320765333,
"grad_norm": 0.6930677368699536,
"learning_rate": 4.981226533166458e-05,
"loss": 0.4866,
"step": 276
},
{
"epoch": 0.31176139561057964,
"grad_norm": 0.5061640947852638,
"learning_rate": 4.979140592407176e-05,
"loss": 0.4609,
"step": 277
},
{
"epoch": 0.3128868880135059,
"grad_norm": 0.5257849117764034,
"learning_rate": 4.9770546516478936e-05,
"loss": 0.457,
"step": 278
},
{
"epoch": 0.3140123804164322,
"grad_norm": 0.4321326701858845,
"learning_rate": 4.974968710888611e-05,
"loss": 0.4454,
"step": 279
},
{
"epoch": 0.31513787281935846,
"grad_norm": 0.38723391299657206,
"learning_rate": 4.972882770129328e-05,
"loss": 0.4643,
"step": 280
},
{
"epoch": 0.31626336522228476,
"grad_norm": 0.5298180060499573,
"learning_rate": 4.970796829370046e-05,
"loss": 0.4746,
"step": 281
},
{
"epoch": 0.317388857625211,
"grad_norm": 0.4061129177315826,
"learning_rate": 4.968710888610764e-05,
"loss": 0.4483,
"step": 282
},
{
"epoch": 0.3185143500281373,
"grad_norm": 0.44266904421962466,
"learning_rate": 4.9666249478514814e-05,
"loss": 0.4462,
"step": 283
},
{
"epoch": 0.3196398424310636,
"grad_norm": 0.5723516835538393,
"learning_rate": 4.964539007092199e-05,
"loss": 0.4912,
"step": 284
},
{
"epoch": 0.3207653348339899,
"grad_norm": 0.4268190071809671,
"learning_rate": 4.962453066332917e-05,
"loss": 0.4747,
"step": 285
},
{
"epoch": 0.32189082723691614,
"grad_norm": 0.5119510744401135,
"learning_rate": 4.960367125573634e-05,
"loss": 0.4822,
"step": 286
},
{
"epoch": 0.32301631963984245,
"grad_norm": 0.6845433911954164,
"learning_rate": 4.9582811848143515e-05,
"loss": 0.4784,
"step": 287
},
{
"epoch": 0.3241418120427687,
"grad_norm": 0.5512193560031954,
"learning_rate": 4.956195244055069e-05,
"loss": 0.4652,
"step": 288
},
{
"epoch": 0.325267304445695,
"grad_norm": 0.44176750937437276,
"learning_rate": 4.954109303295786e-05,
"loss": 0.4591,
"step": 289
},
{
"epoch": 0.32639279684862127,
"grad_norm": 0.4802436465499188,
"learning_rate": 4.952023362536504e-05,
"loss": 0.4646,
"step": 290
},
{
"epoch": 0.3275182892515476,
"grad_norm": 0.5130069834143126,
"learning_rate": 4.9499374217772216e-05,
"loss": 0.4632,
"step": 291
},
{
"epoch": 0.32864378165447383,
"grad_norm": 0.3492902519652929,
"learning_rate": 4.947851481017939e-05,
"loss": 0.4479,
"step": 292
},
{
"epoch": 0.3297692740574001,
"grad_norm": 0.44213997370659125,
"learning_rate": 4.945765540258657e-05,
"loss": 0.5066,
"step": 293
},
{
"epoch": 0.3308947664603264,
"grad_norm": 0.42377286032567374,
"learning_rate": 4.943679599499375e-05,
"loss": 0.4302,
"step": 294
},
{
"epoch": 0.33202025886325265,
"grad_norm": 0.394814830036649,
"learning_rate": 4.941593658740092e-05,
"loss": 0.4767,
"step": 295
},
{
"epoch": 0.33314575126617896,
"grad_norm": 0.4318002819787604,
"learning_rate": 4.9395077179808094e-05,
"loss": 0.4672,
"step": 296
},
{
"epoch": 0.3342712436691052,
"grad_norm": 0.4312528420970919,
"learning_rate": 4.937421777221527e-05,
"loss": 0.4301,
"step": 297
},
{
"epoch": 0.3353967360720315,
"grad_norm": 0.4506557238069578,
"learning_rate": 4.935335836462245e-05,
"loss": 0.4799,
"step": 298
},
{
"epoch": 0.3365222284749578,
"grad_norm": 0.4035767985622815,
"learning_rate": 4.933249895702962e-05,
"loss": 0.4685,
"step": 299
},
{
"epoch": 0.3376477208778841,
"grad_norm": 0.3974620013501897,
"learning_rate": 4.93116395494368e-05,
"loss": 0.4597,
"step": 300
},
{
"epoch": 0.33877321328081034,
"grad_norm": 0.4262423662726608,
"learning_rate": 4.929078014184397e-05,
"loss": 0.5115,
"step": 301
},
{
"epoch": 0.33989870568373665,
"grad_norm": 0.46574622767948337,
"learning_rate": 4.926992073425115e-05,
"loss": 0.4633,
"step": 302
},
{
"epoch": 0.3410241980866629,
"grad_norm": 0.3662096934434811,
"learning_rate": 4.9249061326658326e-05,
"loss": 0.459,
"step": 303
},
{
"epoch": 0.3421496904895892,
"grad_norm": 0.4345723771900289,
"learning_rate": 4.92282019190655e-05,
"loss": 0.4912,
"step": 304
},
{
"epoch": 0.34327518289251546,
"grad_norm": 0.43786190381782847,
"learning_rate": 4.9207342511472674e-05,
"loss": 0.4433,
"step": 305
},
{
"epoch": 0.34440067529544177,
"grad_norm": 0.502359362326431,
"learning_rate": 4.918648310387986e-05,
"loss": 0.4656,
"step": 306
},
{
"epoch": 0.345526167698368,
"grad_norm": 0.36663337654610195,
"learning_rate": 4.916562369628703e-05,
"loss": 0.4902,
"step": 307
},
{
"epoch": 0.34665166010129433,
"grad_norm": 0.5430348378224088,
"learning_rate": 4.9144764288694205e-05,
"loss": 0.4762,
"step": 308
},
{
"epoch": 0.3477771525042206,
"grad_norm": 0.5126322468885988,
"learning_rate": 4.912390488110138e-05,
"loss": 0.4537,
"step": 309
},
{
"epoch": 0.3489026449071469,
"grad_norm": 0.5095172872961993,
"learning_rate": 4.910304547350855e-05,
"loss": 0.4741,
"step": 310
},
{
"epoch": 0.35002813731007315,
"grad_norm": 0.48284683021884783,
"learning_rate": 4.908218606591573e-05,
"loss": 0.4726,
"step": 311
},
{
"epoch": 0.35115362971299946,
"grad_norm": 0.489802898549214,
"learning_rate": 4.9061326658322906e-05,
"loss": 0.4686,
"step": 312
},
{
"epoch": 0.3522791221159257,
"grad_norm": 0.39249156296212534,
"learning_rate": 4.904046725073008e-05,
"loss": 0.4695,
"step": 313
},
{
"epoch": 0.353404614518852,
"grad_norm": 0.5323916362107045,
"learning_rate": 4.901960784313725e-05,
"loss": 0.4965,
"step": 314
},
{
"epoch": 0.3545301069217783,
"grad_norm": 0.4533239040835354,
"learning_rate": 4.899874843554444e-05,
"loss": 0.4707,
"step": 315
},
{
"epoch": 0.3556555993247046,
"grad_norm": 0.4206387502485126,
"learning_rate": 4.897788902795161e-05,
"loss": 0.4579,
"step": 316
},
{
"epoch": 0.35678109172763084,
"grad_norm": 0.6269190751003352,
"learning_rate": 4.8957029620358784e-05,
"loss": 0.4693,
"step": 317
},
{
"epoch": 0.3579065841305571,
"grad_norm": 0.5737738257462925,
"learning_rate": 4.893617021276596e-05,
"loss": 0.4618,
"step": 318
},
{
"epoch": 0.3590320765334834,
"grad_norm": 0.4441915442589318,
"learning_rate": 4.891531080517314e-05,
"loss": 0.4873,
"step": 319
},
{
"epoch": 0.36015756893640966,
"grad_norm": 0.4262860610042779,
"learning_rate": 4.889445139758031e-05,
"loss": 0.4878,
"step": 320
},
{
"epoch": 0.36128306133933596,
"grad_norm": 0.4858584345693206,
"learning_rate": 4.8873591989987485e-05,
"loss": 0.4669,
"step": 321
},
{
"epoch": 0.3624085537422622,
"grad_norm": 0.3534387563975198,
"learning_rate": 4.885273258239466e-05,
"loss": 0.4811,
"step": 322
},
{
"epoch": 0.3635340461451885,
"grad_norm": 0.5271950734928447,
"learning_rate": 4.883187317480184e-05,
"loss": 0.4711,
"step": 323
},
{
"epoch": 0.3646595385481148,
"grad_norm": 0.409281575876073,
"learning_rate": 4.8811013767209016e-05,
"loss": 0.4659,
"step": 324
},
{
"epoch": 0.3657850309510411,
"grad_norm": 0.539894161794808,
"learning_rate": 4.879015435961619e-05,
"loss": 0.4617,
"step": 325
},
{
"epoch": 0.36691052335396734,
"grad_norm": 0.5540340872215955,
"learning_rate": 4.876929495202336e-05,
"loss": 0.4488,
"step": 326
},
{
"epoch": 0.36803601575689365,
"grad_norm": 0.5450024514176801,
"learning_rate": 4.874843554443054e-05,
"loss": 0.4443,
"step": 327
},
{
"epoch": 0.3691615081598199,
"grad_norm": 0.4022020669388415,
"learning_rate": 4.872757613683772e-05,
"loss": 0.4447,
"step": 328
},
{
"epoch": 0.3702870005627462,
"grad_norm": 0.5015493685184448,
"learning_rate": 4.8706716729244894e-05,
"loss": 0.4805,
"step": 329
},
{
"epoch": 0.37141249296567247,
"grad_norm": 0.5205564906682338,
"learning_rate": 4.8685857321652064e-05,
"loss": 0.4727,
"step": 330
},
{
"epoch": 0.3725379853685988,
"grad_norm": 0.48440539535018856,
"learning_rate": 4.866499791405924e-05,
"loss": 0.4789,
"step": 331
},
{
"epoch": 0.37366347777152503,
"grad_norm": 0.5749114846228484,
"learning_rate": 4.864413850646642e-05,
"loss": 0.4452,
"step": 332
},
{
"epoch": 0.37478897017445134,
"grad_norm": 0.5347894114064871,
"learning_rate": 4.8623279098873595e-05,
"loss": 0.466,
"step": 333
},
{
"epoch": 0.3759144625773776,
"grad_norm": 0.5219184880333937,
"learning_rate": 4.860241969128077e-05,
"loss": 0.4297,
"step": 334
},
{
"epoch": 0.3770399549803039,
"grad_norm": 0.4540713981584441,
"learning_rate": 4.858156028368794e-05,
"loss": 0.4716,
"step": 335
},
{
"epoch": 0.37816544738323016,
"grad_norm": 0.65356051165203,
"learning_rate": 4.856070087609512e-05,
"loss": 0.4709,
"step": 336
},
{
"epoch": 0.37929093978615647,
"grad_norm": 0.4310409303487337,
"learning_rate": 4.8539841468502296e-05,
"loss": 0.4931,
"step": 337
},
{
"epoch": 0.3804164321890827,
"grad_norm": 0.7081733344034011,
"learning_rate": 4.851898206090947e-05,
"loss": 0.4787,
"step": 338
},
{
"epoch": 0.38154192459200903,
"grad_norm": 0.4155006654801174,
"learning_rate": 4.8498122653316644e-05,
"loss": 0.4547,
"step": 339
},
{
"epoch": 0.3826674169949353,
"grad_norm": 0.6789073369172818,
"learning_rate": 4.847726324572383e-05,
"loss": 0.4526,
"step": 340
},
{
"epoch": 0.38379290939786154,
"grad_norm": 0.4700941252868989,
"learning_rate": 4.8456403838131e-05,
"loss": 0.4585,
"step": 341
},
{
"epoch": 0.38491840180078785,
"grad_norm": 0.5721349237605509,
"learning_rate": 4.8435544430538175e-05,
"loss": 0.4305,
"step": 342
},
{
"epoch": 0.3860438942037141,
"grad_norm": 0.6679337639014323,
"learning_rate": 4.841468502294535e-05,
"loss": 0.4727,
"step": 343
},
{
"epoch": 0.3871693866066404,
"grad_norm": 0.6136310708197658,
"learning_rate": 4.839382561535253e-05,
"loss": 0.4632,
"step": 344
},
{
"epoch": 0.38829487900956666,
"grad_norm": 0.560884620312814,
"learning_rate": 4.83729662077597e-05,
"loss": 0.4438,
"step": 345
},
{
"epoch": 0.38942037141249297,
"grad_norm": 0.6098279474363337,
"learning_rate": 4.835210680016688e-05,
"loss": 0.4407,
"step": 346
},
{
"epoch": 0.3905458638154192,
"grad_norm": 0.5154661466104475,
"learning_rate": 4.833124739257405e-05,
"loss": 0.446,
"step": 347
},
{
"epoch": 0.39167135621834553,
"grad_norm": 0.7038644519573083,
"learning_rate": 4.831038798498123e-05,
"loss": 0.4646,
"step": 348
},
{
"epoch": 0.3927968486212718,
"grad_norm": 0.5207244024602116,
"learning_rate": 4.828952857738841e-05,
"loss": 0.4476,
"step": 349
},
{
"epoch": 0.3939223410241981,
"grad_norm": 0.5084794430652734,
"learning_rate": 4.8268669169795584e-05,
"loss": 0.4478,
"step": 350
},
{
"epoch": 0.39504783342712435,
"grad_norm": 0.5120708559114392,
"learning_rate": 4.8247809762202754e-05,
"loss": 0.4622,
"step": 351
},
{
"epoch": 0.39617332583005066,
"grad_norm": 0.6265345182325885,
"learning_rate": 4.822695035460993e-05,
"loss": 0.4706,
"step": 352
},
{
"epoch": 0.3972988182329769,
"grad_norm": 0.5750184584832099,
"learning_rate": 4.820609094701711e-05,
"loss": 0.4882,
"step": 353
},
{
"epoch": 0.3984243106359032,
"grad_norm": 0.5490268536187386,
"learning_rate": 4.818523153942428e-05,
"loss": 0.4601,
"step": 354
},
{
"epoch": 0.3995498030388295,
"grad_norm": 0.5317594975111523,
"learning_rate": 4.816437213183146e-05,
"loss": 0.4587,
"step": 355
},
{
"epoch": 0.4006752954417558,
"grad_norm": 0.49454469908724474,
"learning_rate": 4.814351272423863e-05,
"loss": 0.4559,
"step": 356
},
{
"epoch": 0.40180078784468204,
"grad_norm": 0.5764930655203424,
"learning_rate": 4.812265331664581e-05,
"loss": 0.4677,
"step": 357
},
{
"epoch": 0.40292628024760835,
"grad_norm": 0.4254928567571142,
"learning_rate": 4.8101793909052986e-05,
"loss": 0.4468,
"step": 358
},
{
"epoch": 0.4040517726505346,
"grad_norm": 0.5563885643276592,
"learning_rate": 4.808093450146016e-05,
"loss": 0.4333,
"step": 359
},
{
"epoch": 0.4051772650534609,
"grad_norm": 0.4150167393933345,
"learning_rate": 4.806007509386733e-05,
"loss": 0.4515,
"step": 360
},
{
"epoch": 0.40630275745638716,
"grad_norm": 0.4751390092870927,
"learning_rate": 4.803921568627452e-05,
"loss": 0.4776,
"step": 361
},
{
"epoch": 0.4074282498593135,
"grad_norm": 0.43506496230734293,
"learning_rate": 4.801835627868169e-05,
"loss": 0.4533,
"step": 362
},
{
"epoch": 0.4085537422622397,
"grad_norm": 0.4488704280500811,
"learning_rate": 4.7997496871088864e-05,
"loss": 0.4555,
"step": 363
},
{
"epoch": 0.40967923466516604,
"grad_norm": 0.4888805568606264,
"learning_rate": 4.797663746349604e-05,
"loss": 0.4529,
"step": 364
},
{
"epoch": 0.4108047270680923,
"grad_norm": 0.47826665202586255,
"learning_rate": 4.795577805590322e-05,
"loss": 0.4565,
"step": 365
},
{
"epoch": 0.41193021947101854,
"grad_norm": 0.427518205471488,
"learning_rate": 4.793491864831039e-05,
"loss": 0.4622,
"step": 366
},
{
"epoch": 0.41305571187394485,
"grad_norm": 0.4657645019409657,
"learning_rate": 4.7914059240717565e-05,
"loss": 0.4731,
"step": 367
},
{
"epoch": 0.4141812042768711,
"grad_norm": 0.4980798737202691,
"learning_rate": 4.789319983312474e-05,
"loss": 0.4496,
"step": 368
},
{
"epoch": 0.4153066966797974,
"grad_norm": 0.4106122018359277,
"learning_rate": 4.787234042553192e-05,
"loss": 0.4537,
"step": 369
},
{
"epoch": 0.41643218908272367,
"grad_norm": 0.46050479994884624,
"learning_rate": 4.785148101793909e-05,
"loss": 0.4434,
"step": 370
},
{
"epoch": 0.41755768148565,
"grad_norm": 0.5221350337150601,
"learning_rate": 4.783062161034627e-05,
"loss": 0.458,
"step": 371
},
{
"epoch": 0.41868317388857623,
"grad_norm": 0.44101630868691777,
"learning_rate": 4.780976220275344e-05,
"loss": 0.4614,
"step": 372
},
{
"epoch": 0.41980866629150254,
"grad_norm": 0.5767546396305836,
"learning_rate": 4.778890279516062e-05,
"loss": 0.4829,
"step": 373
},
{
"epoch": 0.4209341586944288,
"grad_norm": 0.47996101159798066,
"learning_rate": 4.77680433875678e-05,
"loss": 0.4569,
"step": 374
},
{
"epoch": 0.4220596510973551,
"grad_norm": 0.5033820590275159,
"learning_rate": 4.774718397997497e-05,
"loss": 0.453,
"step": 375
},
{
"epoch": 0.42318514350028136,
"grad_norm": 0.4218904797929267,
"learning_rate": 4.7726324572382145e-05,
"loss": 0.4936,
"step": 376
},
{
"epoch": 0.42431063590320767,
"grad_norm": 0.41845341895601695,
"learning_rate": 4.770546516478932e-05,
"loss": 0.4371,
"step": 377
},
{
"epoch": 0.4254361283061339,
"grad_norm": 0.38945322748226296,
"learning_rate": 4.76846057571965e-05,
"loss": 0.4391,
"step": 378
},
{
"epoch": 0.42656162070906023,
"grad_norm": 0.4161185110299941,
"learning_rate": 4.766374634960367e-05,
"loss": 0.4485,
"step": 379
},
{
"epoch": 0.4276871131119865,
"grad_norm": 0.3864477310593941,
"learning_rate": 4.764288694201085e-05,
"loss": 0.4543,
"step": 380
},
{
"epoch": 0.4288126055149128,
"grad_norm": 0.48714053443872946,
"learning_rate": 4.762202753441802e-05,
"loss": 0.4809,
"step": 381
},
{
"epoch": 0.42993809791783905,
"grad_norm": 0.44027599652018634,
"learning_rate": 4.76011681268252e-05,
"loss": 0.4664,
"step": 382
},
{
"epoch": 0.43106359032076536,
"grad_norm": 0.46260149439173776,
"learning_rate": 4.758030871923238e-05,
"loss": 0.4358,
"step": 383
},
{
"epoch": 0.4321890827236916,
"grad_norm": 0.37340303441017136,
"learning_rate": 4.7559449311639554e-05,
"loss": 0.4444,
"step": 384
},
{
"epoch": 0.4333145751266179,
"grad_norm": 0.4414988549473453,
"learning_rate": 4.7538589904046724e-05,
"loss": 0.4338,
"step": 385
},
{
"epoch": 0.43444006752954417,
"grad_norm": 0.4002550060272223,
"learning_rate": 4.751773049645391e-05,
"loss": 0.4494,
"step": 386
},
{
"epoch": 0.4355655599324705,
"grad_norm": 0.4158146887262931,
"learning_rate": 4.749687108886108e-05,
"loss": 0.4454,
"step": 387
},
{
"epoch": 0.43669105233539673,
"grad_norm": 0.35977608941282263,
"learning_rate": 4.7476011681268255e-05,
"loss": 0.4533,
"step": 388
},
{
"epoch": 0.43781654473832304,
"grad_norm": 0.4764697673218214,
"learning_rate": 4.745515227367543e-05,
"loss": 0.4835,
"step": 389
},
{
"epoch": 0.4389420371412493,
"grad_norm": 0.35081968018481574,
"learning_rate": 4.743429286608261e-05,
"loss": 0.4579,
"step": 390
},
{
"epoch": 0.44006752954417555,
"grad_norm": 0.4170219497175011,
"learning_rate": 4.741343345848978e-05,
"loss": 0.4398,
"step": 391
},
{
"epoch": 0.44119302194710186,
"grad_norm": 0.44456892241843987,
"learning_rate": 4.739257405089696e-05,
"loss": 0.4805,
"step": 392
},
{
"epoch": 0.4423185143500281,
"grad_norm": 0.3810686961824828,
"learning_rate": 4.737171464330413e-05,
"loss": 0.4687,
"step": 393
},
{
"epoch": 0.4434440067529544,
"grad_norm": 0.4459516182156416,
"learning_rate": 4.73508552357113e-05,
"loss": 0.4441,
"step": 394
},
{
"epoch": 0.4445694991558807,
"grad_norm": 0.36574072948327524,
"learning_rate": 4.732999582811849e-05,
"loss": 0.4759,
"step": 395
},
{
"epoch": 0.445694991558807,
"grad_norm": 0.46519670122225776,
"learning_rate": 4.730913642052566e-05,
"loss": 0.4622,
"step": 396
},
{
"epoch": 0.44682048396173324,
"grad_norm": 0.3782284810519757,
"learning_rate": 4.7288277012932834e-05,
"loss": 0.4518,
"step": 397
},
{
"epoch": 0.44794597636465955,
"grad_norm": 0.4321697226823169,
"learning_rate": 4.726741760534001e-05,
"loss": 0.4213,
"step": 398
},
{
"epoch": 0.4490714687675858,
"grad_norm": 0.3846389059595841,
"learning_rate": 4.724655819774719e-05,
"loss": 0.4473,
"step": 399
},
{
"epoch": 0.4501969611705121,
"grad_norm": 0.4148349323542458,
"learning_rate": 4.722569879015436e-05,
"loss": 0.447,
"step": 400
},
{
"epoch": 0.45132245357343836,
"grad_norm": 0.3987423433461808,
"learning_rate": 4.720483938256154e-05,
"loss": 0.4428,
"step": 401
},
{
"epoch": 0.4524479459763647,
"grad_norm": 0.42246987876628445,
"learning_rate": 4.718397997496871e-05,
"loss": 0.4456,
"step": 402
},
{
"epoch": 0.4535734383792909,
"grad_norm": 0.4060448399568812,
"learning_rate": 4.716312056737589e-05,
"loss": 0.4734,
"step": 403
},
{
"epoch": 0.45469893078221724,
"grad_norm": 0.38939419691921573,
"learning_rate": 4.7142261159783066e-05,
"loss": 0.4506,
"step": 404
},
{
"epoch": 0.4558244231851435,
"grad_norm": 0.39441558158161155,
"learning_rate": 4.712140175219024e-05,
"loss": 0.4611,
"step": 405
},
{
"epoch": 0.4569499155880698,
"grad_norm": 0.37043790127930454,
"learning_rate": 4.7100542344597413e-05,
"loss": 0.4446,
"step": 406
},
{
"epoch": 0.45807540799099605,
"grad_norm": 0.39081323070794516,
"learning_rate": 4.707968293700459e-05,
"loss": 0.4586,
"step": 407
},
{
"epoch": 0.45920090039392236,
"grad_norm": 0.38815613346341743,
"learning_rate": 4.705882352941177e-05,
"loss": 0.4731,
"step": 408
},
{
"epoch": 0.4603263927968486,
"grad_norm": 0.4081757852974463,
"learning_rate": 4.7037964121818944e-05,
"loss": 0.4392,
"step": 409
},
{
"epoch": 0.4614518851997749,
"grad_norm": 0.3789626983185206,
"learning_rate": 4.7017104714226115e-05,
"loss": 0.4561,
"step": 410
},
{
"epoch": 0.4625773776027012,
"grad_norm": 0.5000455667230893,
"learning_rate": 4.69962453066333e-05,
"loss": 0.4642,
"step": 411
},
{
"epoch": 0.4637028700056275,
"grad_norm": 0.3422337438589666,
"learning_rate": 4.697538589904047e-05,
"loss": 0.4592,
"step": 412
},
{
"epoch": 0.46482836240855374,
"grad_norm": 0.5638947084171662,
"learning_rate": 4.6954526491447646e-05,
"loss": 0.451,
"step": 413
},
{
"epoch": 0.46595385481148,
"grad_norm": 0.38536737227105394,
"learning_rate": 4.693366708385482e-05,
"loss": 0.4386,
"step": 414
},
{
"epoch": 0.4670793472144063,
"grad_norm": 0.46615900085704925,
"learning_rate": 4.691280767626199e-05,
"loss": 0.4572,
"step": 415
},
{
"epoch": 0.46820483961733256,
"grad_norm": 0.45954601145736806,
"learning_rate": 4.689194826866917e-05,
"loss": 0.4745,
"step": 416
},
{
"epoch": 0.46933033202025887,
"grad_norm": 0.3925870159696147,
"learning_rate": 4.687108886107635e-05,
"loss": 0.4383,
"step": 417
},
{
"epoch": 0.4704558244231851,
"grad_norm": 0.4232172013685177,
"learning_rate": 4.6850229453483524e-05,
"loss": 0.4455,
"step": 418
},
{
"epoch": 0.47158131682611143,
"grad_norm": 0.4709258500108095,
"learning_rate": 4.6829370045890694e-05,
"loss": 0.4329,
"step": 419
},
{
"epoch": 0.4727068092290377,
"grad_norm": 0.5478084541084817,
"learning_rate": 4.680851063829788e-05,
"loss": 0.4778,
"step": 420
},
{
"epoch": 0.473832301631964,
"grad_norm": 0.39060968027365583,
"learning_rate": 4.678765123070505e-05,
"loss": 0.446,
"step": 421
},
{
"epoch": 0.47495779403489025,
"grad_norm": 0.43252322301543766,
"learning_rate": 4.6766791823112225e-05,
"loss": 0.4606,
"step": 422
},
{
"epoch": 0.47608328643781656,
"grad_norm": 0.48537861169690405,
"learning_rate": 4.67459324155194e-05,
"loss": 0.4845,
"step": 423
},
{
"epoch": 0.4772087788407428,
"grad_norm": 0.34601404275255593,
"learning_rate": 4.672507300792658e-05,
"loss": 0.4357,
"step": 424
},
{
"epoch": 0.4783342712436691,
"grad_norm": 0.42339913946057167,
"learning_rate": 4.670421360033375e-05,
"loss": 0.4421,
"step": 425
},
{
"epoch": 0.47945976364659537,
"grad_norm": 0.39857659754496044,
"learning_rate": 4.668335419274093e-05,
"loss": 0.4448,
"step": 426
},
{
"epoch": 0.4805852560495217,
"grad_norm": 0.38982322860737306,
"learning_rate": 4.66624947851481e-05,
"loss": 0.4449,
"step": 427
},
{
"epoch": 0.48171074845244793,
"grad_norm": 0.4167533082716713,
"learning_rate": 4.664163537755528e-05,
"loss": 0.4538,
"step": 428
},
{
"epoch": 0.48283624085537424,
"grad_norm": 0.38396785885791673,
"learning_rate": 4.662077596996246e-05,
"loss": 0.4665,
"step": 429
},
{
"epoch": 0.4839617332583005,
"grad_norm": 0.4460443959564988,
"learning_rate": 4.6599916562369634e-05,
"loss": 0.4695,
"step": 430
},
{
"epoch": 0.4850872256612268,
"grad_norm": 0.4307496077176856,
"learning_rate": 4.6579057154776804e-05,
"loss": 0.479,
"step": 431
},
{
"epoch": 0.48621271806415306,
"grad_norm": 0.4703944597323029,
"learning_rate": 4.655819774718399e-05,
"loss": 0.4616,
"step": 432
},
{
"epoch": 0.48733821046707937,
"grad_norm": 0.4532939669627873,
"learning_rate": 4.653733833959116e-05,
"loss": 0.4386,
"step": 433
},
{
"epoch": 0.4884637028700056,
"grad_norm": 0.38992923384312006,
"learning_rate": 4.651647893199833e-05,
"loss": 0.4403,
"step": 434
},
{
"epoch": 0.48958919527293193,
"grad_norm": 0.41316331078388,
"learning_rate": 4.649561952440551e-05,
"loss": 0.431,
"step": 435
},
{
"epoch": 0.4907146876758582,
"grad_norm": 0.36589748301197256,
"learning_rate": 4.647476011681268e-05,
"loss": 0.4487,
"step": 436
},
{
"epoch": 0.4918401800787845,
"grad_norm": 0.4790306414346754,
"learning_rate": 4.645390070921986e-05,
"loss": 0.465,
"step": 437
},
{
"epoch": 0.49296567248171075,
"grad_norm": 0.3884802942940033,
"learning_rate": 4.6433041301627036e-05,
"loss": 0.4488,
"step": 438
},
{
"epoch": 0.494091164884637,
"grad_norm": 0.43931802766911515,
"learning_rate": 4.641218189403421e-05,
"loss": 0.4844,
"step": 439
},
{
"epoch": 0.4952166572875633,
"grad_norm": 0.35209964530255544,
"learning_rate": 4.6391322486441383e-05,
"loss": 0.4442,
"step": 440
},
{
"epoch": 0.49634214969048956,
"grad_norm": 0.38004709563408534,
"learning_rate": 4.637046307884857e-05,
"loss": 0.4753,
"step": 441
},
{
"epoch": 0.4974676420934159,
"grad_norm": 0.3409798351543027,
"learning_rate": 4.634960367125574e-05,
"loss": 0.4342,
"step": 442
},
{
"epoch": 0.4985931344963421,
"grad_norm": 0.39326837683822974,
"learning_rate": 4.6328744263662914e-05,
"loss": 0.4539,
"step": 443
},
{
"epoch": 0.49971862689926844,
"grad_norm": 0.34187980768631865,
"learning_rate": 4.630788485607009e-05,
"loss": 0.4551,
"step": 444
},
{
"epoch": 0.5008441193021947,
"grad_norm": 0.3788406979843315,
"learning_rate": 4.628702544847727e-05,
"loss": 0.4536,
"step": 445
},
{
"epoch": 0.501969611705121,
"grad_norm": 0.37725539074157804,
"learning_rate": 4.626616604088444e-05,
"loss": 0.4391,
"step": 446
},
{
"epoch": 0.5030951041080473,
"grad_norm": 0.3294085025027009,
"learning_rate": 4.6245306633291616e-05,
"loss": 0.4702,
"step": 447
},
{
"epoch": 0.5042205965109735,
"grad_norm": 0.33534954747479645,
"learning_rate": 4.622444722569879e-05,
"loss": 0.437,
"step": 448
},
{
"epoch": 0.5053460889138999,
"grad_norm": 0.4077362372846078,
"learning_rate": 4.620358781810597e-05,
"loss": 0.4335,
"step": 449
},
{
"epoch": 0.5064715813168261,
"grad_norm": 0.39599735165417416,
"learning_rate": 4.618272841051314e-05,
"loss": 0.4428,
"step": 450
},
{
"epoch": 0.5075970737197524,
"grad_norm": 0.3481976377046397,
"learning_rate": 4.6161869002920323e-05,
"loss": 0.4344,
"step": 451
},
{
"epoch": 0.5087225661226786,
"grad_norm": 0.35576918023033427,
"learning_rate": 4.6141009595327494e-05,
"loss": 0.4343,
"step": 452
},
{
"epoch": 0.509848058525605,
"grad_norm": 0.4458349270928396,
"learning_rate": 4.612015018773467e-05,
"loss": 0.4418,
"step": 453
},
{
"epoch": 0.5109735509285313,
"grad_norm": 0.39940538094885114,
"learning_rate": 4.609929078014185e-05,
"loss": 0.4466,
"step": 454
},
{
"epoch": 0.5120990433314575,
"grad_norm": 0.401792941347298,
"learning_rate": 4.607843137254902e-05,
"loss": 0.4317,
"step": 455
},
{
"epoch": 0.5132245357343838,
"grad_norm": 0.3570336540962956,
"learning_rate": 4.6057571964956195e-05,
"loss": 0.4116,
"step": 456
},
{
"epoch": 0.5143500281373101,
"grad_norm": 0.4132726931400482,
"learning_rate": 4.603671255736337e-05,
"loss": 0.4451,
"step": 457
},
{
"epoch": 0.5154755205402364,
"grad_norm": 0.3450781738437834,
"learning_rate": 4.601585314977055e-05,
"loss": 0.4245,
"step": 458
},
{
"epoch": 0.5166010129431626,
"grad_norm": 0.4044295769667828,
"learning_rate": 4.599499374217772e-05,
"loss": 0.4261,
"step": 459
},
{
"epoch": 0.5177265053460889,
"grad_norm": 0.4460134360979799,
"learning_rate": 4.59741343345849e-05,
"loss": 0.4569,
"step": 460
},
{
"epoch": 0.5188519977490152,
"grad_norm": 0.35709408492200145,
"learning_rate": 4.595327492699207e-05,
"loss": 0.447,
"step": 461
},
{
"epoch": 0.5199774901519415,
"grad_norm": 0.49622852545171614,
"learning_rate": 4.593241551939925e-05,
"loss": 0.4482,
"step": 462
},
{
"epoch": 0.5211029825548678,
"grad_norm": 0.43774205674931815,
"learning_rate": 4.591155611180643e-05,
"loss": 0.4447,
"step": 463
},
{
"epoch": 0.522228474957794,
"grad_norm": 0.4071440395299347,
"learning_rate": 4.5890696704213604e-05,
"loss": 0.428,
"step": 464
},
{
"epoch": 0.5233539673607203,
"grad_norm": 0.42816990064501337,
"learning_rate": 4.5869837296620774e-05,
"loss": 0.4264,
"step": 465
},
{
"epoch": 0.5244794597636466,
"grad_norm": 0.4003821811209746,
"learning_rate": 4.584897788902796e-05,
"loss": 0.4502,
"step": 466
},
{
"epoch": 0.5256049521665729,
"grad_norm": 0.4306225774416482,
"learning_rate": 4.582811848143513e-05,
"loss": 0.448,
"step": 467
},
{
"epoch": 0.5267304445694991,
"grad_norm": 0.438000243330178,
"learning_rate": 4.5807259073842305e-05,
"loss": 0.4461,
"step": 468
},
{
"epoch": 0.5278559369724254,
"grad_norm": 0.5459912564891531,
"learning_rate": 4.578639966624948e-05,
"loss": 0.4454,
"step": 469
},
{
"epoch": 0.5289814293753518,
"grad_norm": 0.3937540607846447,
"learning_rate": 4.576554025865666e-05,
"loss": 0.4511,
"step": 470
},
{
"epoch": 0.530106921778278,
"grad_norm": 0.5255901368328048,
"learning_rate": 4.574468085106383e-05,
"loss": 0.4656,
"step": 471
},
{
"epoch": 0.5312324141812043,
"grad_norm": 0.37290888598540667,
"learning_rate": 4.572382144347101e-05,
"loss": 0.4346,
"step": 472
},
{
"epoch": 0.5323579065841305,
"grad_norm": 0.5151271720318875,
"learning_rate": 4.570296203587818e-05,
"loss": 0.4386,
"step": 473
},
{
"epoch": 0.5334833989870569,
"grad_norm": 0.5196455814853196,
"learning_rate": 4.568210262828536e-05,
"loss": 0.4281,
"step": 474
},
{
"epoch": 0.5346088913899831,
"grad_norm": 0.5516694216088329,
"learning_rate": 4.566124322069254e-05,
"loss": 0.4678,
"step": 475
},
{
"epoch": 0.5357343837929094,
"grad_norm": 0.40935239231865317,
"learning_rate": 4.564038381309971e-05,
"loss": 0.4392,
"step": 476
},
{
"epoch": 0.5368598761958356,
"grad_norm": 0.4232251188780467,
"learning_rate": 4.5619524405506884e-05,
"loss": 0.4541,
"step": 477
},
{
"epoch": 0.537985368598762,
"grad_norm": 0.47065714592515695,
"learning_rate": 4.559866499791406e-05,
"loss": 0.4573,
"step": 478
},
{
"epoch": 0.5391108610016883,
"grad_norm": 0.45139662325934604,
"learning_rate": 4.557780559032124e-05,
"loss": 0.466,
"step": 479
},
{
"epoch": 0.5402363534046145,
"grad_norm": 0.43277954798040297,
"learning_rate": 4.555694618272841e-05,
"loss": 0.4395,
"step": 480
},
{
"epoch": 0.5413618458075408,
"grad_norm": 0.3937402339467187,
"learning_rate": 4.553608677513559e-05,
"loss": 0.4598,
"step": 481
},
{
"epoch": 0.5424873382104671,
"grad_norm": 0.43149504891962365,
"learning_rate": 4.551522736754276e-05,
"loss": 0.4292,
"step": 482
},
{
"epoch": 0.5436128306133934,
"grad_norm": 0.3833426447527127,
"learning_rate": 4.549436795994994e-05,
"loss": 0.4462,
"step": 483
},
{
"epoch": 0.5447383230163196,
"grad_norm": 0.5753891830767618,
"learning_rate": 4.5473508552357116e-05,
"loss": 0.4674,
"step": 484
},
{
"epoch": 0.5458638154192459,
"grad_norm": 0.37095342775133894,
"learning_rate": 4.5452649144764293e-05,
"loss": 0.4502,
"step": 485
},
{
"epoch": 0.5469893078221723,
"grad_norm": 0.44452090514956777,
"learning_rate": 4.5431789737171464e-05,
"loss": 0.4195,
"step": 486
},
{
"epoch": 0.5481148002250985,
"grad_norm": 0.39266318915026655,
"learning_rate": 4.541093032957864e-05,
"loss": 0.4308,
"step": 487
},
{
"epoch": 0.5492402926280248,
"grad_norm": 0.38579575811998595,
"learning_rate": 4.539007092198582e-05,
"loss": 0.4732,
"step": 488
},
{
"epoch": 0.550365785030951,
"grad_norm": 0.3927716846528752,
"learning_rate": 4.5369211514392995e-05,
"loss": 0.4554,
"step": 489
},
{
"epoch": 0.5514912774338773,
"grad_norm": 0.3518677438969378,
"learning_rate": 4.5348352106800165e-05,
"loss": 0.435,
"step": 490
},
{
"epoch": 0.5526167698368036,
"grad_norm": 0.3989470078182982,
"learning_rate": 4.532749269920735e-05,
"loss": 0.4581,
"step": 491
},
{
"epoch": 0.5537422622397299,
"grad_norm": 0.31132596342015495,
"learning_rate": 4.530663329161452e-05,
"loss": 0.4166,
"step": 492
},
{
"epoch": 0.5548677546426561,
"grad_norm": 0.42773832735938333,
"learning_rate": 4.5285773884021696e-05,
"loss": 0.4498,
"step": 493
},
{
"epoch": 0.5559932470455824,
"grad_norm": 0.3337455720428052,
"learning_rate": 4.526491447642887e-05,
"loss": 0.4959,
"step": 494
},
{
"epoch": 0.5571187394485088,
"grad_norm": 0.3784028479466481,
"learning_rate": 4.524405506883605e-05,
"loss": 0.4528,
"step": 495
},
{
"epoch": 0.558244231851435,
"grad_norm": 0.3649036934635355,
"learning_rate": 4.522319566124322e-05,
"loss": 0.4382,
"step": 496
},
{
"epoch": 0.5593697242543613,
"grad_norm": 0.37624738124672374,
"learning_rate": 4.52023362536504e-05,
"loss": 0.444,
"step": 497
},
{
"epoch": 0.5604952166572875,
"grad_norm": 0.41375280657115326,
"learning_rate": 4.5181476846057574e-05,
"loss": 0.445,
"step": 498
},
{
"epoch": 0.5616207090602139,
"grad_norm": 0.4473059694404265,
"learning_rate": 4.5160617438464744e-05,
"loss": 0.425,
"step": 499
},
{
"epoch": 0.5627462014631401,
"grad_norm": 0.37225084914483775,
"learning_rate": 4.513975803087193e-05,
"loss": 0.441,
"step": 500
},
{
"epoch": 0.5638716938660664,
"grad_norm": 0.3940588853331884,
"learning_rate": 4.51188986232791e-05,
"loss": 0.4466,
"step": 501
},
{
"epoch": 0.5649971862689926,
"grad_norm": 0.3470104737718654,
"learning_rate": 4.5098039215686275e-05,
"loss": 0.4474,
"step": 502
},
{
"epoch": 0.566122678671919,
"grad_norm": 0.4164834773144051,
"learning_rate": 4.507717980809345e-05,
"loss": 0.4442,
"step": 503
},
{
"epoch": 0.5672481710748453,
"grad_norm": 0.3652420299854053,
"learning_rate": 4.505632040050063e-05,
"loss": 0.4436,
"step": 504
},
{
"epoch": 0.5683736634777715,
"grad_norm": 0.4103075119748004,
"learning_rate": 4.50354609929078e-05,
"loss": 0.4459,
"step": 505
},
{
"epoch": 0.5694991558806978,
"grad_norm": 0.39102170524673335,
"learning_rate": 4.501460158531498e-05,
"loss": 0.4268,
"step": 506
},
{
"epoch": 0.5706246482836241,
"grad_norm": 0.4942727267066722,
"learning_rate": 4.499374217772215e-05,
"loss": 0.4722,
"step": 507
},
{
"epoch": 0.5717501406865504,
"grad_norm": 0.3465319015459766,
"learning_rate": 4.497288277012933e-05,
"loss": 0.4408,
"step": 508
},
{
"epoch": 0.5728756330894766,
"grad_norm": 0.4074806411985911,
"learning_rate": 4.495202336253651e-05,
"loss": 0.4212,
"step": 509
},
{
"epoch": 0.5740011254924029,
"grad_norm": 0.38192085376045243,
"learning_rate": 4.4931163954943684e-05,
"loss": 0.41,
"step": 510
},
{
"epoch": 0.5751266178953293,
"grad_norm": 0.3702590158057979,
"learning_rate": 4.4910304547350854e-05,
"loss": 0.4255,
"step": 511
},
{
"epoch": 0.5762521102982555,
"grad_norm": 0.3436403538534127,
"learning_rate": 4.488944513975804e-05,
"loss": 0.4538,
"step": 512
},
{
"epoch": 0.5773776027011818,
"grad_norm": 0.3877342893162592,
"learning_rate": 4.486858573216521e-05,
"loss": 0.4182,
"step": 513
},
{
"epoch": 0.578503095104108,
"grad_norm": 0.3460201187876074,
"learning_rate": 4.4847726324572385e-05,
"loss": 0.4432,
"step": 514
},
{
"epoch": 0.5796285875070343,
"grad_norm": 0.34511398785310915,
"learning_rate": 4.482686691697956e-05,
"loss": 0.4469,
"step": 515
},
{
"epoch": 0.5807540799099606,
"grad_norm": 0.4258487344474797,
"learning_rate": 4.480600750938674e-05,
"loss": 0.4583,
"step": 516
},
{
"epoch": 0.5818795723128869,
"grad_norm": 0.36803297271961477,
"learning_rate": 4.478514810179391e-05,
"loss": 0.4428,
"step": 517
},
{
"epoch": 0.5830050647158131,
"grad_norm": 0.46401852203645827,
"learning_rate": 4.4764288694201086e-05,
"loss": 0.4456,
"step": 518
},
{
"epoch": 0.5841305571187394,
"grad_norm": 0.39205048802946624,
"learning_rate": 4.4743429286608263e-05,
"loss": 0.441,
"step": 519
},
{
"epoch": 0.5852560495216658,
"grad_norm": 0.39757611365031714,
"learning_rate": 4.4722569879015434e-05,
"loss": 0.4428,
"step": 520
},
{
"epoch": 0.586381541924592,
"grad_norm": 0.3647536671953435,
"learning_rate": 4.470171047142262e-05,
"loss": 0.4513,
"step": 521
},
{
"epoch": 0.5875070343275183,
"grad_norm": 0.39429072510874175,
"learning_rate": 4.468085106382979e-05,
"loss": 0.4132,
"step": 522
},
{
"epoch": 0.5886325267304445,
"grad_norm": 0.40901642747342404,
"learning_rate": 4.4659991656236965e-05,
"loss": 0.4481,
"step": 523
},
{
"epoch": 0.5897580191333709,
"grad_norm": 0.3992749524524198,
"learning_rate": 4.463913224864414e-05,
"loss": 0.4468,
"step": 524
},
{
"epoch": 0.5908835115362971,
"grad_norm": 0.4722275927889856,
"learning_rate": 4.461827284105132e-05,
"loss": 0.4428,
"step": 525
},
{
"epoch": 0.5920090039392234,
"grad_norm": 0.42866183958875864,
"learning_rate": 4.459741343345849e-05,
"loss": 0.4194,
"step": 526
},
{
"epoch": 0.5931344963421497,
"grad_norm": 0.38204868156886707,
"learning_rate": 4.4576554025865666e-05,
"loss": 0.4402,
"step": 527
},
{
"epoch": 0.594259988745076,
"grad_norm": 0.35148215802167393,
"learning_rate": 4.455569461827284e-05,
"loss": 0.4542,
"step": 528
},
{
"epoch": 0.5953854811480023,
"grad_norm": 0.40153400690617524,
"learning_rate": 4.453483521068002e-05,
"loss": 0.4102,
"step": 529
},
{
"epoch": 0.5965109735509285,
"grad_norm": 0.46986899886821576,
"learning_rate": 4.45139758030872e-05,
"loss": 0.4436,
"step": 530
},
{
"epoch": 0.5976364659538548,
"grad_norm": 0.35390475462960685,
"learning_rate": 4.4493116395494374e-05,
"loss": 0.4398,
"step": 531
},
{
"epoch": 0.5987619583567811,
"grad_norm": 0.4482185977258061,
"learning_rate": 4.4472256987901544e-05,
"loss": 0.4326,
"step": 532
},
{
"epoch": 0.5998874507597074,
"grad_norm": 0.44232865264761434,
"learning_rate": 4.445139758030872e-05,
"loss": 0.4325,
"step": 533
},
{
"epoch": 0.6010129431626337,
"grad_norm": 0.4183843016810463,
"learning_rate": 4.44305381727159e-05,
"loss": 0.4553,
"step": 534
},
{
"epoch": 0.6021384355655599,
"grad_norm": 0.4242250812536985,
"learning_rate": 4.4409678765123075e-05,
"loss": 0.4232,
"step": 535
},
{
"epoch": 0.6032639279684862,
"grad_norm": 0.3888142076123292,
"learning_rate": 4.4388819357530245e-05,
"loss": 0.4241,
"step": 536
},
{
"epoch": 0.6043894203714125,
"grad_norm": 0.40486855004609845,
"learning_rate": 4.436795994993743e-05,
"loss": 0.4191,
"step": 537
},
{
"epoch": 0.6055149127743388,
"grad_norm": 0.47154131963320084,
"learning_rate": 4.43471005423446e-05,
"loss": 0.4595,
"step": 538
},
{
"epoch": 0.606640405177265,
"grad_norm": 0.38490507840256083,
"learning_rate": 4.432624113475177e-05,
"loss": 0.4199,
"step": 539
},
{
"epoch": 0.6077658975801913,
"grad_norm": 0.46096486506497264,
"learning_rate": 4.430538172715895e-05,
"loss": 0.4448,
"step": 540
},
{
"epoch": 0.6088913899831176,
"grad_norm": 0.4947895759240074,
"learning_rate": 4.428452231956612e-05,
"loss": 0.4342,
"step": 541
},
{
"epoch": 0.6100168823860439,
"grad_norm": 0.3829854963511767,
"learning_rate": 4.42636629119733e-05,
"loss": 0.4186,
"step": 542
},
{
"epoch": 0.6111423747889702,
"grad_norm": 0.6245507343869451,
"learning_rate": 4.424280350438048e-05,
"loss": 0.441,
"step": 543
},
{
"epoch": 0.6122678671918964,
"grad_norm": 0.5300235385565563,
"learning_rate": 4.4221944096787654e-05,
"loss": 0.4375,
"step": 544
},
{
"epoch": 0.6133933595948228,
"grad_norm": 0.4930881980261961,
"learning_rate": 4.4201084689194824e-05,
"loss": 0.4621,
"step": 545
},
{
"epoch": 0.614518851997749,
"grad_norm": 0.5638424830870375,
"learning_rate": 4.418022528160201e-05,
"loss": 0.4411,
"step": 546
},
{
"epoch": 0.6156443444006753,
"grad_norm": 0.3716115037856444,
"learning_rate": 4.415936587400918e-05,
"loss": 0.4528,
"step": 547
},
{
"epoch": 0.6167698368036015,
"grad_norm": 0.5223401927324024,
"learning_rate": 4.4138506466416355e-05,
"loss": 0.4327,
"step": 548
},
{
"epoch": 0.6178953292065279,
"grad_norm": 0.37311721165933265,
"learning_rate": 4.411764705882353e-05,
"loss": 0.4058,
"step": 549
},
{
"epoch": 0.6190208216094542,
"grad_norm": 0.532332931429002,
"learning_rate": 4.409678765123071e-05,
"loss": 0.4445,
"step": 550
},
{
"epoch": 0.6201463140123804,
"grad_norm": 0.5059754011866813,
"learning_rate": 4.407592824363788e-05,
"loss": 0.4257,
"step": 551
},
{
"epoch": 0.6212718064153067,
"grad_norm": 0.4904818838015066,
"learning_rate": 4.405506883604506e-05,
"loss": 0.4353,
"step": 552
},
{
"epoch": 0.622397298818233,
"grad_norm": 0.6200335434273374,
"learning_rate": 4.4034209428452233e-05,
"loss": 0.4416,
"step": 553
},
{
"epoch": 0.6235227912211593,
"grad_norm": 0.3199203022808196,
"learning_rate": 4.401335002085941e-05,
"loss": 0.4355,
"step": 554
},
{
"epoch": 0.6246482836240855,
"grad_norm": 0.5681807784529108,
"learning_rate": 4.399249061326659e-05,
"loss": 0.431,
"step": 555
},
{
"epoch": 0.6257737760270118,
"grad_norm": 0.3995337627796738,
"learning_rate": 4.3971631205673764e-05,
"loss": 0.4312,
"step": 556
},
{
"epoch": 0.6268992684299382,
"grad_norm": 0.5466993132659691,
"learning_rate": 4.3950771798080935e-05,
"loss": 0.4311,
"step": 557
},
{
"epoch": 0.6280247608328644,
"grad_norm": 0.5670240814298136,
"learning_rate": 4.392991239048811e-05,
"loss": 0.4564,
"step": 558
},
{
"epoch": 0.6291502532357907,
"grad_norm": 0.47107566738859724,
"learning_rate": 4.390905298289529e-05,
"loss": 0.4436,
"step": 559
},
{
"epoch": 0.6302757456387169,
"grad_norm": 0.5380491861675493,
"learning_rate": 4.388819357530246e-05,
"loss": 0.4024,
"step": 560
},
{
"epoch": 0.6314012380416432,
"grad_norm": 0.37407644137036594,
"learning_rate": 4.386733416770964e-05,
"loss": 0.4276,
"step": 561
},
{
"epoch": 0.6325267304445695,
"grad_norm": 0.5179459476960132,
"learning_rate": 4.384647476011681e-05,
"loss": 0.4231,
"step": 562
},
{
"epoch": 0.6336522228474958,
"grad_norm": 0.3832305989594554,
"learning_rate": 4.382561535252399e-05,
"loss": 0.4254,
"step": 563
},
{
"epoch": 0.634777715250422,
"grad_norm": 0.48824132268901227,
"learning_rate": 4.380475594493117e-05,
"loss": 0.4414,
"step": 564
},
{
"epoch": 0.6359032076533483,
"grad_norm": 0.45846104242587143,
"learning_rate": 4.3783896537338344e-05,
"loss": 0.4374,
"step": 565
},
{
"epoch": 0.6370287000562747,
"grad_norm": 0.5017380646906237,
"learning_rate": 4.3763037129745514e-05,
"loss": 0.4478,
"step": 566
},
{
"epoch": 0.6381541924592009,
"grad_norm": 0.4706523687823463,
"learning_rate": 4.374217772215269e-05,
"loss": 0.4393,
"step": 567
},
{
"epoch": 0.6392796848621272,
"grad_norm": 0.43746034371341663,
"learning_rate": 4.372131831455987e-05,
"loss": 0.4289,
"step": 568
},
{
"epoch": 0.6404051772650534,
"grad_norm": 0.4971311348473273,
"learning_rate": 4.3700458906967045e-05,
"loss": 0.4632,
"step": 569
},
{
"epoch": 0.6415306696679798,
"grad_norm": 0.32424868625443787,
"learning_rate": 4.367959949937422e-05,
"loss": 0.4439,
"step": 570
},
{
"epoch": 0.642656162070906,
"grad_norm": 0.5530000470387829,
"learning_rate": 4.36587400917814e-05,
"loss": 0.4438,
"step": 571
},
{
"epoch": 0.6437816544738323,
"grad_norm": 0.3619983421314401,
"learning_rate": 4.363788068418857e-05,
"loss": 0.4193,
"step": 572
},
{
"epoch": 0.6449071468767585,
"grad_norm": 0.46202193194933755,
"learning_rate": 4.3617021276595746e-05,
"loss": 0.4308,
"step": 573
},
{
"epoch": 0.6460326392796849,
"grad_norm": 0.4798799400653708,
"learning_rate": 4.359616186900292e-05,
"loss": 0.4072,
"step": 574
},
{
"epoch": 0.6471581316826112,
"grad_norm": 0.42761886423074474,
"learning_rate": 4.35753024614101e-05,
"loss": 0.4357,
"step": 575
},
{
"epoch": 0.6482836240855374,
"grad_norm": 0.4906300910854437,
"learning_rate": 4.355444305381727e-05,
"loss": 0.441,
"step": 576
},
{
"epoch": 0.6494091164884637,
"grad_norm": 0.4312074811449326,
"learning_rate": 4.3533583646224454e-05,
"loss": 0.468,
"step": 577
},
{
"epoch": 0.65053460889139,
"grad_norm": 0.4999437976070137,
"learning_rate": 4.3512724238631624e-05,
"loss": 0.4442,
"step": 578
},
{
"epoch": 0.6516601012943163,
"grad_norm": 0.45200142374904256,
"learning_rate": 4.34918648310388e-05,
"loss": 0.4351,
"step": 579
},
{
"epoch": 0.6527855936972425,
"grad_norm": 0.4481417460480344,
"learning_rate": 4.347100542344598e-05,
"loss": 0.4351,
"step": 580
},
{
"epoch": 0.6539110861001688,
"grad_norm": 0.416680484799885,
"learning_rate": 4.345014601585315e-05,
"loss": 0.4726,
"step": 581
},
{
"epoch": 0.6550365785030952,
"grad_norm": 0.45466741269285743,
"learning_rate": 4.3429286608260325e-05,
"loss": 0.4445,
"step": 582
},
{
"epoch": 0.6561620709060214,
"grad_norm": 0.3767132482639794,
"learning_rate": 4.34084272006675e-05,
"loss": 0.4494,
"step": 583
},
{
"epoch": 0.6572875633089477,
"grad_norm": 0.4045713565741537,
"learning_rate": 4.338756779307468e-05,
"loss": 0.4478,
"step": 584
},
{
"epoch": 0.6584130557118739,
"grad_norm": 0.41406546702832436,
"learning_rate": 4.336670838548185e-05,
"loss": 0.4296,
"step": 585
},
{
"epoch": 0.6595385481148002,
"grad_norm": 0.45192122020443987,
"learning_rate": 4.334584897788903e-05,
"loss": 0.4548,
"step": 586
},
{
"epoch": 0.6606640405177265,
"grad_norm": 0.42522165235824544,
"learning_rate": 4.3324989570296203e-05,
"loss": 0.4545,
"step": 587
},
{
"epoch": 0.6617895329206528,
"grad_norm": 0.4025019554306989,
"learning_rate": 4.330413016270338e-05,
"loss": 0.427,
"step": 588
},
{
"epoch": 0.662915025323579,
"grad_norm": 0.40092550396367915,
"learning_rate": 4.328327075511056e-05,
"loss": 0.4357,
"step": 589
},
{
"epoch": 0.6640405177265053,
"grad_norm": 0.4029073566780126,
"learning_rate": 4.3262411347517734e-05,
"loss": 0.4437,
"step": 590
},
{
"epoch": 0.6651660101294317,
"grad_norm": 0.3754421567640776,
"learning_rate": 4.3241551939924905e-05,
"loss": 0.4488,
"step": 591
},
{
"epoch": 0.6662915025323579,
"grad_norm": 0.4093131149759515,
"learning_rate": 4.322069253233209e-05,
"loss": 0.4296,
"step": 592
},
{
"epoch": 0.6674169949352842,
"grad_norm": 0.37396980661829454,
"learning_rate": 4.319983312473926e-05,
"loss": 0.4135,
"step": 593
},
{
"epoch": 0.6685424873382104,
"grad_norm": 0.39676170583430237,
"learning_rate": 4.3178973717146436e-05,
"loss": 0.4407,
"step": 594
},
{
"epoch": 0.6696679797411368,
"grad_norm": 0.3324304115520877,
"learning_rate": 4.315811430955361e-05,
"loss": 0.4272,
"step": 595
},
{
"epoch": 0.670793472144063,
"grad_norm": 0.41321744590045745,
"learning_rate": 4.313725490196079e-05,
"loss": 0.4535,
"step": 596
},
{
"epoch": 0.6719189645469893,
"grad_norm": 0.37423186701221084,
"learning_rate": 4.311639549436796e-05,
"loss": 0.4243,
"step": 597
},
{
"epoch": 0.6730444569499155,
"grad_norm": 0.34707644350816663,
"learning_rate": 4.309553608677514e-05,
"loss": 0.4224,
"step": 598
},
{
"epoch": 0.6741699493528419,
"grad_norm": 0.39162388441219653,
"learning_rate": 4.3074676679182314e-05,
"loss": 0.4117,
"step": 599
},
{
"epoch": 0.6752954417557682,
"grad_norm": 0.3757134091896751,
"learning_rate": 4.305381727158949e-05,
"loss": 0.4372,
"step": 600
},
{
"epoch": 0.6764209341586944,
"grad_norm": 0.486157183762819,
"learning_rate": 4.303295786399667e-05,
"loss": 0.4487,
"step": 601
},
{
"epoch": 0.6775464265616207,
"grad_norm": 0.34615222028756854,
"learning_rate": 4.301209845640384e-05,
"loss": 0.438,
"step": 602
},
{
"epoch": 0.678671918964547,
"grad_norm": 0.4148015924613456,
"learning_rate": 4.2991239048811015e-05,
"loss": 0.4545,
"step": 603
},
{
"epoch": 0.6797974113674733,
"grad_norm": 0.3870669252002002,
"learning_rate": 4.297037964121819e-05,
"loss": 0.4078,
"step": 604
},
{
"epoch": 0.6809229037703995,
"grad_norm": 0.31630147919989027,
"learning_rate": 4.294952023362537e-05,
"loss": 0.4179,
"step": 605
},
{
"epoch": 0.6820483961733258,
"grad_norm": 0.4078672238404797,
"learning_rate": 4.292866082603254e-05,
"loss": 0.4363,
"step": 606
},
{
"epoch": 0.6831738885762522,
"grad_norm": 0.38181818903469905,
"learning_rate": 4.2907801418439716e-05,
"loss": 0.4387,
"step": 607
},
{
"epoch": 0.6842993809791784,
"grad_norm": 0.40887483819289494,
"learning_rate": 4.288694201084689e-05,
"loss": 0.4279,
"step": 608
},
{
"epoch": 0.6854248733821047,
"grad_norm": 0.45835023477255316,
"learning_rate": 4.286608260325407e-05,
"loss": 0.4553,
"step": 609
},
{
"epoch": 0.6865503657850309,
"grad_norm": 0.4496240755238681,
"learning_rate": 4.284522319566125e-05,
"loss": 0.4511,
"step": 610
},
{
"epoch": 0.6876758581879572,
"grad_norm": 0.47923459811565877,
"learning_rate": 4.2824363788068424e-05,
"loss": 0.4494,
"step": 611
},
{
"epoch": 0.6888013505908835,
"grad_norm": 0.4563499971704832,
"learning_rate": 4.2803504380475594e-05,
"loss": 0.4498,
"step": 612
},
{
"epoch": 0.6899268429938098,
"grad_norm": 0.4658484510143094,
"learning_rate": 4.278264497288277e-05,
"loss": 0.446,
"step": 613
},
{
"epoch": 0.691052335396736,
"grad_norm": 0.40099697936257683,
"learning_rate": 4.276178556528995e-05,
"loss": 0.4138,
"step": 614
},
{
"epoch": 0.6921778277996623,
"grad_norm": 0.40681610293383885,
"learning_rate": 4.2740926157697125e-05,
"loss": 0.4428,
"step": 615
},
{
"epoch": 0.6933033202025887,
"grad_norm": 0.492856289321406,
"learning_rate": 4.2720066750104295e-05,
"loss": 0.429,
"step": 616
},
{
"epoch": 0.6944288126055149,
"grad_norm": 0.40198116454411964,
"learning_rate": 4.269920734251148e-05,
"loss": 0.4319,
"step": 617
},
{
"epoch": 0.6955543050084412,
"grad_norm": 0.4049661414838683,
"learning_rate": 4.267834793491865e-05,
"loss": 0.4371,
"step": 618
},
{
"epoch": 0.6966797974113674,
"grad_norm": 0.4200912676835283,
"learning_rate": 4.2657488527325826e-05,
"loss": 0.4273,
"step": 619
},
{
"epoch": 0.6978052898142938,
"grad_norm": 0.3579260644405867,
"learning_rate": 4.2636629119733e-05,
"loss": 0.436,
"step": 620
},
{
"epoch": 0.69893078221722,
"grad_norm": 0.41261145773033614,
"learning_rate": 4.261576971214018e-05,
"loss": 0.4355,
"step": 621
},
{
"epoch": 0.7000562746201463,
"grad_norm": 0.38195673870959623,
"learning_rate": 4.259491030454735e-05,
"loss": 0.4407,
"step": 622
},
{
"epoch": 0.7011817670230726,
"grad_norm": 0.47251318617526117,
"learning_rate": 4.257405089695453e-05,
"loss": 0.4434,
"step": 623
},
{
"epoch": 0.7023072594259989,
"grad_norm": 0.413024502756469,
"learning_rate": 4.2553191489361704e-05,
"loss": 0.4228,
"step": 624
},
{
"epoch": 0.7034327518289252,
"grad_norm": 0.4129659836054336,
"learning_rate": 4.2532332081768875e-05,
"loss": 0.4298,
"step": 625
},
{
"epoch": 0.7045582442318514,
"grad_norm": 0.4371192692750543,
"learning_rate": 4.251147267417606e-05,
"loss": 0.422,
"step": 626
},
{
"epoch": 0.7056837366347777,
"grad_norm": 0.3209464880480147,
"learning_rate": 4.249061326658323e-05,
"loss": 0.4159,
"step": 627
},
{
"epoch": 0.706809229037704,
"grad_norm": 0.38213551742408286,
"learning_rate": 4.2469753858990406e-05,
"loss": 0.4651,
"step": 628
},
{
"epoch": 0.7079347214406303,
"grad_norm": 0.37077014672780895,
"learning_rate": 4.244889445139758e-05,
"loss": 0.428,
"step": 629
},
{
"epoch": 0.7090602138435566,
"grad_norm": 0.37388919361570394,
"learning_rate": 4.242803504380476e-05,
"loss": 0.4487,
"step": 630
},
{
"epoch": 0.7101857062464828,
"grad_norm": 0.355919224811824,
"learning_rate": 4.240717563621193e-05,
"loss": 0.4273,
"step": 631
},
{
"epoch": 0.7113111986494092,
"grad_norm": 0.3479874917806637,
"learning_rate": 4.2386316228619114e-05,
"loss": 0.4298,
"step": 632
},
{
"epoch": 0.7124366910523354,
"grad_norm": 0.39097161117850043,
"learning_rate": 4.2365456821026284e-05,
"loss": 0.4251,
"step": 633
},
{
"epoch": 0.7135621834552617,
"grad_norm": 0.39131656322095426,
"learning_rate": 4.234459741343346e-05,
"loss": 0.4261,
"step": 634
},
{
"epoch": 0.7146876758581879,
"grad_norm": 1.9337556498338822,
"learning_rate": 4.232373800584064e-05,
"loss": 0.4553,
"step": 635
},
{
"epoch": 0.7158131682611142,
"grad_norm": 0.7715880476594418,
"learning_rate": 4.2302878598247815e-05,
"loss": 0.4349,
"step": 636
},
{
"epoch": 0.7169386606640406,
"grad_norm": 0.4198490504250616,
"learning_rate": 4.2282019190654985e-05,
"loss": 0.4427,
"step": 637
},
{
"epoch": 0.7180641530669668,
"grad_norm": 0.6436591462942758,
"learning_rate": 4.226115978306216e-05,
"loss": 0.4428,
"step": 638
},
{
"epoch": 0.7191896454698931,
"grad_norm": 0.46958357266306217,
"learning_rate": 4.224030037546934e-05,
"loss": 0.4096,
"step": 639
},
{
"epoch": 0.7203151378728193,
"grad_norm": 0.5409557375822074,
"learning_rate": 4.2219440967876516e-05,
"loss": 0.4165,
"step": 640
},
{
"epoch": 0.7214406302757457,
"grad_norm": 0.505386305383113,
"learning_rate": 4.219858156028369e-05,
"loss": 0.4232,
"step": 641
},
{
"epoch": 0.7225661226786719,
"grad_norm": 0.47036754544713516,
"learning_rate": 4.217772215269087e-05,
"loss": 0.4187,
"step": 642
},
{
"epoch": 0.7236916150815982,
"grad_norm": 0.5935180204625328,
"learning_rate": 4.215686274509804e-05,
"loss": 0.4326,
"step": 643
},
{
"epoch": 0.7248171074845244,
"grad_norm": 0.37111793924942255,
"learning_rate": 4.213600333750522e-05,
"loss": 0.4235,
"step": 644
},
{
"epoch": 0.7259425998874508,
"grad_norm": 0.6111195959607152,
"learning_rate": 4.2115143929912394e-05,
"loss": 0.4254,
"step": 645
},
{
"epoch": 0.727068092290377,
"grad_norm": 0.35910288955770575,
"learning_rate": 4.2094284522319564e-05,
"loss": 0.4244,
"step": 646
},
{
"epoch": 0.7281935846933033,
"grad_norm": 0.4804262191052388,
"learning_rate": 4.207342511472674e-05,
"loss": 0.4357,
"step": 647
},
{
"epoch": 0.7293190770962296,
"grad_norm": 0.43546853881795533,
"learning_rate": 4.205256570713392e-05,
"loss": 0.4471,
"step": 648
},
{
"epoch": 0.7304445694991559,
"grad_norm": 0.36651215549293115,
"learning_rate": 4.2031706299541095e-05,
"loss": 0.4359,
"step": 649
},
{
"epoch": 0.7315700619020822,
"grad_norm": 0.5416106337436408,
"learning_rate": 4.201084689194827e-05,
"loss": 0.4514,
"step": 650
},
{
"epoch": 0.7326955543050084,
"grad_norm": 0.37666903051702594,
"learning_rate": 4.198998748435545e-05,
"loss": 0.4423,
"step": 651
},
{
"epoch": 0.7338210467079347,
"grad_norm": 0.44989927473315283,
"learning_rate": 4.196912807676262e-05,
"loss": 0.4268,
"step": 652
},
{
"epoch": 0.734946539110861,
"grad_norm": 0.3864335324626091,
"learning_rate": 4.1948268669169796e-05,
"loss": 0.4494,
"step": 653
},
{
"epoch": 0.7360720315137873,
"grad_norm": 0.4000593109156678,
"learning_rate": 4.192740926157697e-05,
"loss": 0.4434,
"step": 654
},
{
"epoch": 0.7371975239167136,
"grad_norm": 0.423242298419072,
"learning_rate": 4.190654985398415e-05,
"loss": 0.4328,
"step": 655
},
{
"epoch": 0.7383230163196398,
"grad_norm": 0.44706912801056875,
"learning_rate": 4.188569044639132e-05,
"loss": 0.4254,
"step": 656
},
{
"epoch": 0.7394485087225662,
"grad_norm": 0.5086338570156853,
"learning_rate": 4.1864831038798504e-05,
"loss": 0.4425,
"step": 657
},
{
"epoch": 0.7405740011254924,
"grad_norm": 0.4676027167307538,
"learning_rate": 4.1843971631205674e-05,
"loss": 0.4491,
"step": 658
},
{
"epoch": 0.7416994935284187,
"grad_norm": 0.46458396727329027,
"learning_rate": 4.182311222361285e-05,
"loss": 0.4015,
"step": 659
},
{
"epoch": 0.7428249859313449,
"grad_norm": 0.390783744931949,
"learning_rate": 4.180225281602003e-05,
"loss": 0.4184,
"step": 660
},
{
"epoch": 0.7439504783342712,
"grad_norm": 0.44526805252316143,
"learning_rate": 4.1781393408427205e-05,
"loss": 0.4035,
"step": 661
},
{
"epoch": 0.7450759707371976,
"grad_norm": 0.4217385671488669,
"learning_rate": 4.1760534000834376e-05,
"loss": 0.4332,
"step": 662
},
{
"epoch": 0.7462014631401238,
"grad_norm": 0.44487860783732935,
"learning_rate": 4.173967459324156e-05,
"loss": 0.4266,
"step": 663
},
{
"epoch": 0.7473269555430501,
"grad_norm": 0.4296879305918086,
"learning_rate": 4.171881518564873e-05,
"loss": 0.4205,
"step": 664
},
{
"epoch": 0.7484524479459763,
"grad_norm": 0.4948881491751457,
"learning_rate": 4.16979557780559e-05,
"loss": 0.4447,
"step": 665
},
{
"epoch": 0.7495779403489027,
"grad_norm": 0.41381310448412767,
"learning_rate": 4.1677096370463084e-05,
"loss": 0.435,
"step": 666
},
{
"epoch": 0.7507034327518289,
"grad_norm": 0.4138662471855155,
"learning_rate": 4.1656236962870254e-05,
"loss": 0.4351,
"step": 667
},
{
"epoch": 0.7518289251547552,
"grad_norm": 0.3869476402415003,
"learning_rate": 4.163537755527743e-05,
"loss": 0.4319,
"step": 668
},
{
"epoch": 0.7529544175576814,
"grad_norm": 0.4882528682989917,
"learning_rate": 4.161451814768461e-05,
"loss": 0.4123,
"step": 669
},
{
"epoch": 0.7540799099606078,
"grad_norm": 0.3739890771080639,
"learning_rate": 4.1593658740091785e-05,
"loss": 0.4268,
"step": 670
},
{
"epoch": 0.7552054023635341,
"grad_norm": 0.5032273771625602,
"learning_rate": 4.1572799332498955e-05,
"loss": 0.4404,
"step": 671
},
{
"epoch": 0.7563308947664603,
"grad_norm": 0.38387128180956526,
"learning_rate": 4.155193992490614e-05,
"loss": 0.4505,
"step": 672
},
{
"epoch": 0.7574563871693866,
"grad_norm": 0.4995032503495298,
"learning_rate": 4.153108051731331e-05,
"loss": 0.4211,
"step": 673
},
{
"epoch": 0.7585818795723129,
"grad_norm": 0.46352751067691306,
"learning_rate": 4.1510221109720486e-05,
"loss": 0.4253,
"step": 674
},
{
"epoch": 0.7597073719752392,
"grad_norm": 0.4661239773263893,
"learning_rate": 4.148936170212766e-05,
"loss": 0.4533,
"step": 675
},
{
"epoch": 0.7608328643781654,
"grad_norm": 0.42916960855475605,
"learning_rate": 4.146850229453484e-05,
"loss": 0.4333,
"step": 676
},
{
"epoch": 0.7619583567810917,
"grad_norm": 0.40989406943220275,
"learning_rate": 4.144764288694201e-05,
"loss": 0.4413,
"step": 677
},
{
"epoch": 0.7630838491840181,
"grad_norm": 0.7522787094637527,
"learning_rate": 4.1426783479349194e-05,
"loss": 0.4522,
"step": 678
},
{
"epoch": 0.7642093415869443,
"grad_norm": 0.4277705459587538,
"learning_rate": 4.1405924071756364e-05,
"loss": 0.4348,
"step": 679
},
{
"epoch": 0.7653348339898706,
"grad_norm": 0.4684118417529332,
"learning_rate": 4.138506466416354e-05,
"loss": 0.422,
"step": 680
},
{
"epoch": 0.7664603263927968,
"grad_norm": 0.5197963821538139,
"learning_rate": 4.136420525657072e-05,
"loss": 0.4299,
"step": 681
},
{
"epoch": 0.7675858187957231,
"grad_norm": 0.5235576475586984,
"learning_rate": 4.1343345848977895e-05,
"loss": 0.438,
"step": 682
},
{
"epoch": 0.7687113111986494,
"grad_norm": 0.46712550772065836,
"learning_rate": 4.1322486441385065e-05,
"loss": 0.4344,
"step": 683
},
{
"epoch": 0.7698368036015757,
"grad_norm": 0.3222703692853798,
"learning_rate": 4.130162703379224e-05,
"loss": 0.4263,
"step": 684
},
{
"epoch": 0.770962296004502,
"grad_norm": 0.5188367404561216,
"learning_rate": 4.128076762619942e-05,
"loss": 0.425,
"step": 685
},
{
"epoch": 0.7720877884074282,
"grad_norm": 0.5386961427613608,
"learning_rate": 4.125990821860659e-05,
"loss": 0.4276,
"step": 686
},
{
"epoch": 0.7732132808103546,
"grad_norm": 0.42911439453279915,
"learning_rate": 4.1239048811013766e-05,
"loss": 0.4309,
"step": 687
},
{
"epoch": 0.7743387732132808,
"grad_norm": 0.5088405648165022,
"learning_rate": 4.121818940342094e-05,
"loss": 0.4493,
"step": 688
},
{
"epoch": 0.7754642656162071,
"grad_norm": 0.3815644681020926,
"learning_rate": 4.119732999582812e-05,
"loss": 0.4077,
"step": 689
},
{
"epoch": 0.7765897580191333,
"grad_norm": 0.4840279343366164,
"learning_rate": 4.11764705882353e-05,
"loss": 0.4172,
"step": 690
},
{
"epoch": 0.7777152504220597,
"grad_norm": 0.333716982007624,
"learning_rate": 4.1155611180642474e-05,
"loss": 0.42,
"step": 691
},
{
"epoch": 0.7788407428249859,
"grad_norm": 0.5086503847022227,
"learning_rate": 4.1134751773049644e-05,
"loss": 0.4388,
"step": 692
},
{
"epoch": 0.7799662352279122,
"grad_norm": 0.5138077690790301,
"learning_rate": 4.111389236545682e-05,
"loss": 0.4472,
"step": 693
},
{
"epoch": 0.7810917276308385,
"grad_norm": 0.5073604041295958,
"learning_rate": 4.1093032957864e-05,
"loss": 0.4319,
"step": 694
},
{
"epoch": 0.7822172200337648,
"grad_norm": 0.5070487690193936,
"learning_rate": 4.1072173550271175e-05,
"loss": 0.4406,
"step": 695
},
{
"epoch": 0.7833427124366911,
"grad_norm": 0.39744464693598625,
"learning_rate": 4.1051314142678346e-05,
"loss": 0.434,
"step": 696
},
{
"epoch": 0.7844682048396173,
"grad_norm": 0.4541031658226192,
"learning_rate": 4.103045473508553e-05,
"loss": 0.4454,
"step": 697
},
{
"epoch": 0.7855936972425436,
"grad_norm": 0.3491750229319607,
"learning_rate": 4.10095953274927e-05,
"loss": 0.4332,
"step": 698
},
{
"epoch": 0.7867191896454699,
"grad_norm": 0.4022760008208042,
"learning_rate": 4.0988735919899877e-05,
"loss": 0.4296,
"step": 699
},
{
"epoch": 0.7878446820483962,
"grad_norm": 0.34684627806001544,
"learning_rate": 4.0967876512307054e-05,
"loss": 0.4331,
"step": 700
},
{
"epoch": 0.7889701744513224,
"grad_norm": 0.4050405845879203,
"learning_rate": 4.094701710471423e-05,
"loss": 0.4464,
"step": 701
},
{
"epoch": 0.7900956668542487,
"grad_norm": 0.36395612381945763,
"learning_rate": 4.09261576971214e-05,
"loss": 0.4444,
"step": 702
},
{
"epoch": 0.7912211592571751,
"grad_norm": 0.398848237592344,
"learning_rate": 4.0905298289528585e-05,
"loss": 0.4288,
"step": 703
},
{
"epoch": 0.7923466516601013,
"grad_norm": 0.40745644685078164,
"learning_rate": 4.0884438881935755e-05,
"loss": 0.4329,
"step": 704
},
{
"epoch": 0.7934721440630276,
"grad_norm": 0.3547156716364725,
"learning_rate": 4.0863579474342925e-05,
"loss": 0.4191,
"step": 705
},
{
"epoch": 0.7945976364659538,
"grad_norm": 0.377680056161795,
"learning_rate": 4.084272006675011e-05,
"loss": 0.4376,
"step": 706
},
{
"epoch": 0.7957231288688801,
"grad_norm": 0.4073180644016936,
"learning_rate": 4.082186065915728e-05,
"loss": 0.4559,
"step": 707
},
{
"epoch": 0.7968486212718064,
"grad_norm": 0.45186446852813356,
"learning_rate": 4.0801001251564456e-05,
"loss": 0.4277,
"step": 708
},
{
"epoch": 0.7979741136747327,
"grad_norm": 0.36933911451661233,
"learning_rate": 4.078014184397163e-05,
"loss": 0.45,
"step": 709
},
{
"epoch": 0.799099606077659,
"grad_norm": 0.35833391487238614,
"learning_rate": 4.075928243637881e-05,
"loss": 0.4403,
"step": 710
},
{
"epoch": 0.8002250984805852,
"grad_norm": 0.3901982149558614,
"learning_rate": 4.073842302878598e-05,
"loss": 0.4216,
"step": 711
},
{
"epoch": 0.8013505908835116,
"grad_norm": 0.40940384251834244,
"learning_rate": 4.0717563621193164e-05,
"loss": 0.4021,
"step": 712
},
{
"epoch": 0.8024760832864378,
"grad_norm": 0.42919683308516116,
"learning_rate": 4.0696704213600334e-05,
"loss": 0.4376,
"step": 713
},
{
"epoch": 0.8036015756893641,
"grad_norm": 0.4073165345943137,
"learning_rate": 4.067584480600751e-05,
"loss": 0.4153,
"step": 714
},
{
"epoch": 0.8047270680922903,
"grad_norm": 0.4178501498334503,
"learning_rate": 4.065498539841469e-05,
"loss": 0.414,
"step": 715
},
{
"epoch": 0.8058525604952167,
"grad_norm": 0.4403993787350139,
"learning_rate": 4.0634125990821865e-05,
"loss": 0.4399,
"step": 716
},
{
"epoch": 0.806978052898143,
"grad_norm": 0.4114972670954794,
"learning_rate": 4.0613266583229035e-05,
"loss": 0.439,
"step": 717
},
{
"epoch": 0.8081035453010692,
"grad_norm": 0.407394844667869,
"learning_rate": 4.059240717563622e-05,
"loss": 0.4123,
"step": 718
},
{
"epoch": 0.8092290377039955,
"grad_norm": 0.39800729593005324,
"learning_rate": 4.057154776804339e-05,
"loss": 0.4236,
"step": 719
},
{
"epoch": 0.8103545301069218,
"grad_norm": 0.4287708410386054,
"learning_rate": 4.0550688360450566e-05,
"loss": 0.4256,
"step": 720
},
{
"epoch": 0.8114800225098481,
"grad_norm": 0.4016484816358628,
"learning_rate": 4.052982895285774e-05,
"loss": 0.4281,
"step": 721
},
{
"epoch": 0.8126055149127743,
"grad_norm": 0.3719724351542615,
"learning_rate": 4.050896954526492e-05,
"loss": 0.4077,
"step": 722
},
{
"epoch": 0.8137310073157006,
"grad_norm": 0.4023100055568255,
"learning_rate": 4.048811013767209e-05,
"loss": 0.4461,
"step": 723
},
{
"epoch": 0.814856499718627,
"grad_norm": 0.4117093051704328,
"learning_rate": 4.046725073007927e-05,
"loss": 0.4175,
"step": 724
},
{
"epoch": 0.8159819921215532,
"grad_norm": 0.34286385689334,
"learning_rate": 4.0446391322486444e-05,
"loss": 0.4356,
"step": 725
},
{
"epoch": 0.8171074845244795,
"grad_norm": 0.35591813739094097,
"learning_rate": 4.0425531914893614e-05,
"loss": 0.4434,
"step": 726
},
{
"epoch": 0.8182329769274057,
"grad_norm": 0.43567208149763015,
"learning_rate": 4.040467250730079e-05,
"loss": 0.437,
"step": 727
},
{
"epoch": 0.8193584693303321,
"grad_norm": 0.3799825439351934,
"learning_rate": 4.038381309970797e-05,
"loss": 0.4248,
"step": 728
},
{
"epoch": 0.8204839617332583,
"grad_norm": 0.38216998051723755,
"learning_rate": 4.0362953692115145e-05,
"loss": 0.4253,
"step": 729
},
{
"epoch": 0.8216094541361846,
"grad_norm": 0.39231774228135824,
"learning_rate": 4.034209428452232e-05,
"loss": 0.4223,
"step": 730
},
{
"epoch": 0.8227349465391108,
"grad_norm": 0.4102144130938295,
"learning_rate": 4.03212348769295e-05,
"loss": 0.4348,
"step": 731
},
{
"epoch": 0.8238604389420371,
"grad_norm": 0.37115430835787877,
"learning_rate": 4.030037546933667e-05,
"loss": 0.409,
"step": 732
},
{
"epoch": 0.8249859313449635,
"grad_norm": 0.40499256266698164,
"learning_rate": 4.0279516061743847e-05,
"loss": 0.4089,
"step": 733
},
{
"epoch": 0.8261114237478897,
"grad_norm": 0.4916550738089128,
"learning_rate": 4.0258656654151024e-05,
"loss": 0.4272,
"step": 734
},
{
"epoch": 0.827236916150816,
"grad_norm": 0.3681620907401364,
"learning_rate": 4.02377972465582e-05,
"loss": 0.4446,
"step": 735
},
{
"epoch": 0.8283624085537422,
"grad_norm": 0.4795384990908562,
"learning_rate": 4.021693783896537e-05,
"loss": 0.4357,
"step": 736
},
{
"epoch": 0.8294879009566686,
"grad_norm": 0.3684736183097587,
"learning_rate": 4.0196078431372555e-05,
"loss": 0.4119,
"step": 737
},
{
"epoch": 0.8306133933595948,
"grad_norm": 0.43877380657382786,
"learning_rate": 4.0175219023779725e-05,
"loss": 0.4403,
"step": 738
},
{
"epoch": 0.8317388857625211,
"grad_norm": 0.37814204050253025,
"learning_rate": 4.01543596161869e-05,
"loss": 0.434,
"step": 739
},
{
"epoch": 0.8328643781654473,
"grad_norm": 0.45099287248352765,
"learning_rate": 4.013350020859408e-05,
"loss": 0.4149,
"step": 740
},
{
"epoch": 0.8339898705683737,
"grad_norm": 0.34915848381393966,
"learning_rate": 4.0112640801001256e-05,
"loss": 0.4136,
"step": 741
},
{
"epoch": 0.8351153629713,
"grad_norm": 0.5037598255538088,
"learning_rate": 4.0091781393408426e-05,
"loss": 0.4398,
"step": 742
},
{
"epoch": 0.8362408553742262,
"grad_norm": 0.3612809802844246,
"learning_rate": 4.007092198581561e-05,
"loss": 0.4162,
"step": 743
},
{
"epoch": 0.8373663477771525,
"grad_norm": 0.3979488796549818,
"learning_rate": 4.005006257822278e-05,
"loss": 0.3981,
"step": 744
},
{
"epoch": 0.8384918401800788,
"grad_norm": 0.4440135625243805,
"learning_rate": 4.002920317062996e-05,
"loss": 0.429,
"step": 745
},
{
"epoch": 0.8396173325830051,
"grad_norm": 0.3448234757480279,
"learning_rate": 4.0008343763037134e-05,
"loss": 0.4324,
"step": 746
},
{
"epoch": 0.8407428249859313,
"grad_norm": 0.4775835287224156,
"learning_rate": 3.9987484355444304e-05,
"loss": 0.4249,
"step": 747
},
{
"epoch": 0.8418683173888576,
"grad_norm": 0.3566220202478078,
"learning_rate": 3.996662494785148e-05,
"loss": 0.4211,
"step": 748
},
{
"epoch": 0.842993809791784,
"grad_norm": 0.5285144169481172,
"learning_rate": 3.994576554025866e-05,
"loss": 0.4168,
"step": 749
},
{
"epoch": 0.8441193021947102,
"grad_norm": 0.33354278924631714,
"learning_rate": 3.9924906132665835e-05,
"loss": 0.4261,
"step": 750
},
{
"epoch": 0.8452447945976365,
"grad_norm": 0.3372581050524173,
"learning_rate": 3.9904046725073005e-05,
"loss": 0.4269,
"step": 751
},
{
"epoch": 0.8463702870005627,
"grad_norm": 0.3146244454332402,
"learning_rate": 3.988318731748019e-05,
"loss": 0.4204,
"step": 752
},
{
"epoch": 0.8474957794034891,
"grad_norm": 0.3706478564537626,
"learning_rate": 3.986232790988736e-05,
"loss": 0.4521,
"step": 753
},
{
"epoch": 0.8486212718064153,
"grad_norm": 0.34630012288221157,
"learning_rate": 3.9841468502294536e-05,
"loss": 0.4142,
"step": 754
},
{
"epoch": 0.8497467642093416,
"grad_norm": 0.36373433568245944,
"learning_rate": 3.982060909470171e-05,
"loss": 0.4252,
"step": 755
},
{
"epoch": 0.8508722566122678,
"grad_norm": 0.3554211790643752,
"learning_rate": 3.979974968710889e-05,
"loss": 0.4474,
"step": 756
},
{
"epoch": 0.8519977490151941,
"grad_norm": 0.30960141279598913,
"learning_rate": 3.977889027951606e-05,
"loss": 0.4167,
"step": 757
},
{
"epoch": 0.8531232414181205,
"grad_norm": 0.37614788680975125,
"learning_rate": 3.9758030871923244e-05,
"loss": 0.4505,
"step": 758
},
{
"epoch": 0.8542487338210467,
"grad_norm": 0.3938651785575828,
"learning_rate": 3.9737171464330414e-05,
"loss": 0.4349,
"step": 759
},
{
"epoch": 0.855374226223973,
"grad_norm": 0.3460524380953148,
"learning_rate": 3.971631205673759e-05,
"loss": 0.4396,
"step": 760
},
{
"epoch": 0.8564997186268992,
"grad_norm": 0.430535629585179,
"learning_rate": 3.969545264914477e-05,
"loss": 0.4154,
"step": 761
},
{
"epoch": 0.8576252110298256,
"grad_norm": 0.34446139273212933,
"learning_rate": 3.9674593241551945e-05,
"loss": 0.3931,
"step": 762
},
{
"epoch": 0.8587507034327518,
"grad_norm": 0.42192087717244775,
"learning_rate": 3.9653733833959115e-05,
"loss": 0.4261,
"step": 763
},
{
"epoch": 0.8598761958356781,
"grad_norm": 0.40550449281201056,
"learning_rate": 3.963287442636629e-05,
"loss": 0.4569,
"step": 764
},
{
"epoch": 0.8610016882386043,
"grad_norm": 0.3566914781532168,
"learning_rate": 3.961201501877347e-05,
"loss": 0.4141,
"step": 765
},
{
"epoch": 0.8621271806415307,
"grad_norm": 0.3843475406384751,
"learning_rate": 3.9591155611180646e-05,
"loss": 0.4332,
"step": 766
},
{
"epoch": 0.863252673044457,
"grad_norm": 0.3366748222918633,
"learning_rate": 3.9570296203587817e-05,
"loss": 0.4123,
"step": 767
},
{
"epoch": 0.8643781654473832,
"grad_norm": 0.41416075386046697,
"learning_rate": 3.9549436795994994e-05,
"loss": 0.426,
"step": 768
},
{
"epoch": 0.8655036578503095,
"grad_norm": 0.3752366359688814,
"learning_rate": 3.952857738840217e-05,
"loss": 0.4402,
"step": 769
},
{
"epoch": 0.8666291502532358,
"grad_norm": 0.37688991154499113,
"learning_rate": 3.950771798080935e-05,
"loss": 0.4244,
"step": 770
},
{
"epoch": 0.8677546426561621,
"grad_norm": 0.42637480636595876,
"learning_rate": 3.9486858573216525e-05,
"loss": 0.438,
"step": 771
},
{
"epoch": 0.8688801350590883,
"grad_norm": 0.3568635983835573,
"learning_rate": 3.9465999165623695e-05,
"loss": 0.424,
"step": 772
},
{
"epoch": 0.8700056274620146,
"grad_norm": 0.38797711927011286,
"learning_rate": 3.944513975803087e-05,
"loss": 0.4321,
"step": 773
},
{
"epoch": 0.871131119864941,
"grad_norm": 0.3904824359653345,
"learning_rate": 3.942428035043805e-05,
"loss": 0.4161,
"step": 774
},
{
"epoch": 0.8722566122678672,
"grad_norm": 0.47345678909928446,
"learning_rate": 3.9403420942845226e-05,
"loss": 0.4503,
"step": 775
},
{
"epoch": 0.8733821046707935,
"grad_norm": 0.37497244213020403,
"learning_rate": 3.9382561535252396e-05,
"loss": 0.4255,
"step": 776
},
{
"epoch": 0.8745075970737197,
"grad_norm": 0.4047268746098847,
"learning_rate": 3.936170212765958e-05,
"loss": 0.4327,
"step": 777
},
{
"epoch": 0.8756330894766461,
"grad_norm": 0.3834914449330313,
"learning_rate": 3.934084272006675e-05,
"loss": 0.4079,
"step": 778
},
{
"epoch": 0.8767585818795723,
"grad_norm": 0.43021072579406455,
"learning_rate": 3.931998331247393e-05,
"loss": 0.4143,
"step": 779
},
{
"epoch": 0.8778840742824986,
"grad_norm": 0.3793510230856374,
"learning_rate": 3.9299123904881104e-05,
"loss": 0.431,
"step": 780
},
{
"epoch": 0.8790095666854248,
"grad_norm": 0.37164807483969337,
"learning_rate": 3.927826449728828e-05,
"loss": 0.4341,
"step": 781
},
{
"epoch": 0.8801350590883511,
"grad_norm": 0.3807695648271021,
"learning_rate": 3.925740508969545e-05,
"loss": 0.4096,
"step": 782
},
{
"epoch": 0.8812605514912775,
"grad_norm": 0.3502384590348891,
"learning_rate": 3.9236545682102635e-05,
"loss": 0.4117,
"step": 783
},
{
"epoch": 0.8823860438942037,
"grad_norm": 0.41955082958695283,
"learning_rate": 3.9215686274509805e-05,
"loss": 0.4179,
"step": 784
},
{
"epoch": 0.88351153629713,
"grad_norm": 0.3435394433133878,
"learning_rate": 3.919482686691698e-05,
"loss": 0.4215,
"step": 785
},
{
"epoch": 0.8846370287000562,
"grad_norm": 0.44230838156044133,
"learning_rate": 3.917396745932416e-05,
"loss": 0.436,
"step": 786
},
{
"epoch": 0.8857625211029826,
"grad_norm": 0.3248857597519066,
"learning_rate": 3.9153108051731336e-05,
"loss": 0.41,
"step": 787
},
{
"epoch": 0.8868880135059088,
"grad_norm": 0.48949666561437843,
"learning_rate": 3.9132248644138506e-05,
"loss": 0.4348,
"step": 788
},
{
"epoch": 0.8880135059088351,
"grad_norm": 0.32922044316292487,
"learning_rate": 3.911138923654568e-05,
"loss": 0.4084,
"step": 789
},
{
"epoch": 0.8891389983117614,
"grad_norm": 0.4097616554209572,
"learning_rate": 3.909052982895286e-05,
"loss": 0.4127,
"step": 790
},
{
"epoch": 0.8902644907146877,
"grad_norm": 0.3847502404740843,
"learning_rate": 3.906967042136003e-05,
"loss": 0.4322,
"step": 791
},
{
"epoch": 0.891389983117614,
"grad_norm": 0.39373480839252734,
"learning_rate": 3.9048811013767214e-05,
"loss": 0.4085,
"step": 792
},
{
"epoch": 0.8925154755205402,
"grad_norm": 0.4665639076471283,
"learning_rate": 3.9027951606174384e-05,
"loss": 0.4281,
"step": 793
},
{
"epoch": 0.8936409679234665,
"grad_norm": 0.32986547499650304,
"learning_rate": 3.900709219858156e-05,
"loss": 0.4261,
"step": 794
},
{
"epoch": 0.8947664603263928,
"grad_norm": 0.484689789318943,
"learning_rate": 3.898623279098874e-05,
"loss": 0.4236,
"step": 795
},
{
"epoch": 0.8958919527293191,
"grad_norm": 0.3386378151140954,
"learning_rate": 3.8965373383395915e-05,
"loss": 0.4117,
"step": 796
},
{
"epoch": 0.8970174451322454,
"grad_norm": 0.4810985090257936,
"learning_rate": 3.8944513975803085e-05,
"loss": 0.4199,
"step": 797
},
{
"epoch": 0.8981429375351716,
"grad_norm": 0.34069553000131625,
"learning_rate": 3.892365456821027e-05,
"loss": 0.4279,
"step": 798
},
{
"epoch": 0.899268429938098,
"grad_norm": 0.39752219752724677,
"learning_rate": 3.890279516061744e-05,
"loss": 0.4172,
"step": 799
},
{
"epoch": 0.9003939223410242,
"grad_norm": 0.39022425914879927,
"learning_rate": 3.8881935753024616e-05,
"loss": 0.3978,
"step": 800
},
{
"epoch": 0.9015194147439505,
"grad_norm": 0.3458579209805956,
"learning_rate": 3.8861076345431793e-05,
"loss": 0.4135,
"step": 801
},
{
"epoch": 0.9026449071468767,
"grad_norm": 0.4407202352189913,
"learning_rate": 3.884021693783897e-05,
"loss": 0.4154,
"step": 802
},
{
"epoch": 0.9037703995498031,
"grad_norm": 0.47173587942543654,
"learning_rate": 3.881935753024614e-05,
"loss": 0.4572,
"step": 803
},
{
"epoch": 0.9048958919527293,
"grad_norm": 0.5188592329216469,
"learning_rate": 3.879849812265332e-05,
"loss": 0.42,
"step": 804
},
{
"epoch": 0.9060213843556556,
"grad_norm": 0.35403721006820305,
"learning_rate": 3.8777638715060495e-05,
"loss": 0.3938,
"step": 805
},
{
"epoch": 0.9071468767585819,
"grad_norm": 0.4545974955129778,
"learning_rate": 3.875677930746767e-05,
"loss": 0.4285,
"step": 806
},
{
"epoch": 0.9082723691615081,
"grad_norm": 0.38332622486859297,
"learning_rate": 3.873591989987485e-05,
"loss": 0.4324,
"step": 807
},
{
"epoch": 0.9093978615644345,
"grad_norm": 0.4520007540189171,
"learning_rate": 3.8715060492282026e-05,
"loss": 0.4305,
"step": 808
},
{
"epoch": 0.9105233539673607,
"grad_norm": 0.4148933007482292,
"learning_rate": 3.8694201084689196e-05,
"loss": 0.4218,
"step": 809
},
{
"epoch": 0.911648846370287,
"grad_norm": 0.35859066112911336,
"learning_rate": 3.867334167709637e-05,
"loss": 0.4395,
"step": 810
},
{
"epoch": 0.9127743387732132,
"grad_norm": 0.3384384473732276,
"learning_rate": 3.865248226950355e-05,
"loss": 0.4331,
"step": 811
},
{
"epoch": 0.9138998311761396,
"grad_norm": 0.3212850763014723,
"learning_rate": 3.863162286191072e-05,
"loss": 0.4371,
"step": 812
},
{
"epoch": 0.9150253235790659,
"grad_norm": 0.34283786993488946,
"learning_rate": 3.86107634543179e-05,
"loss": 0.4213,
"step": 813
},
{
"epoch": 0.9161508159819921,
"grad_norm": 0.3578410656828841,
"learning_rate": 3.8589904046725074e-05,
"loss": 0.4264,
"step": 814
},
{
"epoch": 0.9172763083849184,
"grad_norm": 0.33865929644502085,
"learning_rate": 3.856904463913225e-05,
"loss": 0.4242,
"step": 815
},
{
"epoch": 0.9184018007878447,
"grad_norm": 0.3392167511998851,
"learning_rate": 3.854818523153942e-05,
"loss": 0.435,
"step": 816
},
{
"epoch": 0.919527293190771,
"grad_norm": 0.4361222901548229,
"learning_rate": 3.8527325823946605e-05,
"loss": 0.4351,
"step": 817
},
{
"epoch": 0.9206527855936972,
"grad_norm": 0.38626109347018045,
"learning_rate": 3.8506466416353775e-05,
"loss": 0.4413,
"step": 818
},
{
"epoch": 0.9217782779966235,
"grad_norm": 0.376739528222001,
"learning_rate": 3.848560700876095e-05,
"loss": 0.4162,
"step": 819
},
{
"epoch": 0.9229037703995498,
"grad_norm": 0.38666458978007023,
"learning_rate": 3.846474760116813e-05,
"loss": 0.4308,
"step": 820
},
{
"epoch": 0.9240292628024761,
"grad_norm": 0.49211116299516156,
"learning_rate": 3.8443888193575306e-05,
"loss": 0.4319,
"step": 821
},
{
"epoch": 0.9251547552054024,
"grad_norm": 0.35408915013798653,
"learning_rate": 3.8423028785982476e-05,
"loss": 0.4095,
"step": 822
},
{
"epoch": 0.9262802476083286,
"grad_norm": 0.4801831963166499,
"learning_rate": 3.840216937838966e-05,
"loss": 0.4357,
"step": 823
},
{
"epoch": 0.927405740011255,
"grad_norm": 0.355137877995065,
"learning_rate": 3.838130997079683e-05,
"loss": 0.4089,
"step": 824
},
{
"epoch": 0.9285312324141812,
"grad_norm": 0.39619886118735476,
"learning_rate": 3.836045056320401e-05,
"loss": 0.4275,
"step": 825
},
{
"epoch": 0.9296567248171075,
"grad_norm": 0.4149029728443111,
"learning_rate": 3.8339591155611184e-05,
"loss": 0.4271,
"step": 826
},
{
"epoch": 0.9307822172200337,
"grad_norm": 0.3576650599906339,
"learning_rate": 3.831873174801836e-05,
"loss": 0.4132,
"step": 827
},
{
"epoch": 0.93190770962296,
"grad_norm": 0.3906733425105834,
"learning_rate": 3.829787234042553e-05,
"loss": 0.4344,
"step": 828
},
{
"epoch": 0.9330332020258864,
"grad_norm": 0.3593657860758568,
"learning_rate": 3.8277012932832715e-05,
"loss": 0.4149,
"step": 829
},
{
"epoch": 0.9341586944288126,
"grad_norm": 0.3817439606842503,
"learning_rate": 3.8256153525239885e-05,
"loss": 0.4069,
"step": 830
},
{
"epoch": 0.9352841868317389,
"grad_norm": 0.3973105618276613,
"learning_rate": 3.8235294117647055e-05,
"loss": 0.4324,
"step": 831
},
{
"epoch": 0.9364096792346651,
"grad_norm": 0.366999411331023,
"learning_rate": 3.821443471005424e-05,
"loss": 0.4222,
"step": 832
},
{
"epoch": 0.9375351716375915,
"grad_norm": 0.3464567606261278,
"learning_rate": 3.819357530246141e-05,
"loss": 0.4246,
"step": 833
},
{
"epoch": 0.9386606640405177,
"grad_norm": 0.4438404878074898,
"learning_rate": 3.8172715894868586e-05,
"loss": 0.4143,
"step": 834
},
{
"epoch": 0.939786156443444,
"grad_norm": 0.3129033931624516,
"learning_rate": 3.8151856487275763e-05,
"loss": 0.4307,
"step": 835
},
{
"epoch": 0.9409116488463702,
"grad_norm": 0.4970325181275813,
"learning_rate": 3.813099707968294e-05,
"loss": 0.4212,
"step": 836
},
{
"epoch": 0.9420371412492966,
"grad_norm": 0.3998089884639407,
"learning_rate": 3.811013767209011e-05,
"loss": 0.4208,
"step": 837
},
{
"epoch": 0.9431626336522229,
"grad_norm": 0.40099412686189473,
"learning_rate": 3.8089278264497294e-05,
"loss": 0.422,
"step": 838
},
{
"epoch": 0.9442881260551491,
"grad_norm": 0.5330625499056586,
"learning_rate": 3.8068418856904465e-05,
"loss": 0.4132,
"step": 839
},
{
"epoch": 0.9454136184580754,
"grad_norm": 0.3036089387603486,
"learning_rate": 3.804755944931164e-05,
"loss": 0.3962,
"step": 840
},
{
"epoch": 0.9465391108610017,
"grad_norm": 0.5044966493127429,
"learning_rate": 3.802670004171882e-05,
"loss": 0.4228,
"step": 841
},
{
"epoch": 0.947664603263928,
"grad_norm": 0.3885120765788415,
"learning_rate": 3.8005840634125996e-05,
"loss": 0.4228,
"step": 842
},
{
"epoch": 0.9487900956668542,
"grad_norm": 0.3285021365606724,
"learning_rate": 3.7984981226533166e-05,
"loss": 0.4287,
"step": 843
},
{
"epoch": 0.9499155880697805,
"grad_norm": 0.3980934273222264,
"learning_rate": 3.796412181894034e-05,
"loss": 0.409,
"step": 844
},
{
"epoch": 0.9510410804727069,
"grad_norm": 0.31490077533714694,
"learning_rate": 3.794326241134752e-05,
"loss": 0.4364,
"step": 845
},
{
"epoch": 0.9521665728756331,
"grad_norm": 0.35959204795616695,
"learning_rate": 3.79224030037547e-05,
"loss": 0.4283,
"step": 846
},
{
"epoch": 0.9532920652785594,
"grad_norm": 0.4126210300706387,
"learning_rate": 3.7901543596161874e-05,
"loss": 0.4197,
"step": 847
},
{
"epoch": 0.9544175576814856,
"grad_norm": 0.3222558281528205,
"learning_rate": 3.788068418856905e-05,
"loss": 0.4277,
"step": 848
},
{
"epoch": 0.955543050084412,
"grad_norm": 0.41145117521139,
"learning_rate": 3.785982478097622e-05,
"loss": 0.4257,
"step": 849
},
{
"epoch": 0.9566685424873382,
"grad_norm": 0.351379058945545,
"learning_rate": 3.78389653733834e-05,
"loss": 0.4181,
"step": 850
},
{
"epoch": 0.9577940348902645,
"grad_norm": 0.31029125166165955,
"learning_rate": 3.7818105965790575e-05,
"loss": 0.4297,
"step": 851
},
{
"epoch": 0.9589195272931907,
"grad_norm": 0.3285912637236667,
"learning_rate": 3.7797246558197745e-05,
"loss": 0.4138,
"step": 852
},
{
"epoch": 0.960045019696117,
"grad_norm": 0.4036547080190634,
"learning_rate": 3.777638715060492e-05,
"loss": 0.4366,
"step": 853
},
{
"epoch": 0.9611705120990434,
"grad_norm": 0.3686324816741614,
"learning_rate": 3.77555277430121e-05,
"loss": 0.4337,
"step": 854
},
{
"epoch": 0.9622960045019696,
"grad_norm": 0.48923825142344834,
"learning_rate": 3.7734668335419276e-05,
"loss": 0.4343,
"step": 855
},
{
"epoch": 0.9634214969048959,
"grad_norm": 0.3013034390380091,
"learning_rate": 3.7713808927826446e-05,
"loss": 0.4216,
"step": 856
},
{
"epoch": 0.9645469893078221,
"grad_norm": 0.4352224768520518,
"learning_rate": 3.769294952023363e-05,
"loss": 0.3941,
"step": 857
},
{
"epoch": 0.9656724817107485,
"grad_norm": 0.3513506338023819,
"learning_rate": 3.76720901126408e-05,
"loss": 0.4166,
"step": 858
},
{
"epoch": 0.9667979741136747,
"grad_norm": 0.40350199029637573,
"learning_rate": 3.765123070504798e-05,
"loss": 0.413,
"step": 859
},
{
"epoch": 0.967923466516601,
"grad_norm": 0.362061939817286,
"learning_rate": 3.7630371297455154e-05,
"loss": 0.411,
"step": 860
},
{
"epoch": 0.9690489589195272,
"grad_norm": 0.3399440337787816,
"learning_rate": 3.760951188986233e-05,
"loss": 0.4392,
"step": 861
},
{
"epoch": 0.9701744513224536,
"grad_norm": 0.37743255775219053,
"learning_rate": 3.75886524822695e-05,
"loss": 0.4433,
"step": 862
},
{
"epoch": 0.9712999437253799,
"grad_norm": 0.3638973275713123,
"learning_rate": 3.7567793074676685e-05,
"loss": 0.4203,
"step": 863
},
{
"epoch": 0.9724254361283061,
"grad_norm": 0.3277233424581398,
"learning_rate": 3.7546933667083855e-05,
"loss": 0.398,
"step": 864
},
{
"epoch": 0.9735509285312324,
"grad_norm": 0.3141565988423171,
"learning_rate": 3.752607425949103e-05,
"loss": 0.3959,
"step": 865
},
{
"epoch": 0.9746764209341587,
"grad_norm": 0.35936283889585385,
"learning_rate": 3.750521485189821e-05,
"loss": 0.432,
"step": 866
},
{
"epoch": 0.975801913337085,
"grad_norm": 0.31770357894398493,
"learning_rate": 3.7484355444305386e-05,
"loss": 0.431,
"step": 867
},
{
"epoch": 0.9769274057400112,
"grad_norm": 0.3123167816580969,
"learning_rate": 3.7463496036712556e-05,
"loss": 0.4025,
"step": 868
},
{
"epoch": 0.9780528981429375,
"grad_norm": 0.3692934723238402,
"learning_rate": 3.744263662911974e-05,
"loss": 0.4221,
"step": 869
},
{
"epoch": 0.9791783905458639,
"grad_norm": 0.354161129420181,
"learning_rate": 3.742177722152691e-05,
"loss": 0.4172,
"step": 870
},
{
"epoch": 0.9803038829487901,
"grad_norm": 0.36175776122206693,
"learning_rate": 3.740091781393409e-05,
"loss": 0.4247,
"step": 871
},
{
"epoch": 0.9814293753517164,
"grad_norm": 0.33883677517413535,
"learning_rate": 3.7380058406341264e-05,
"loss": 0.4068,
"step": 872
},
{
"epoch": 0.9825548677546426,
"grad_norm": 0.42954345350848233,
"learning_rate": 3.7359198998748435e-05,
"loss": 0.422,
"step": 873
},
{
"epoch": 0.983680360157569,
"grad_norm": 0.3555531618337076,
"learning_rate": 3.733833959115561e-05,
"loss": 0.4117,
"step": 874
},
{
"epoch": 0.9848058525604952,
"grad_norm": 0.3137468970852999,
"learning_rate": 3.731748018356279e-05,
"loss": 0.4044,
"step": 875
},
{
"epoch": 0.9859313449634215,
"grad_norm": 0.32456521220251544,
"learning_rate": 3.7296620775969966e-05,
"loss": 0.4133,
"step": 876
},
{
"epoch": 0.9870568373663478,
"grad_norm": 0.31014819015532874,
"learning_rate": 3.7275761368377136e-05,
"loss": 0.4141,
"step": 877
},
{
"epoch": 0.988182329769274,
"grad_norm": 0.32436938468507787,
"learning_rate": 3.725490196078432e-05,
"loss": 0.4095,
"step": 878
},
{
"epoch": 0.9893078221722004,
"grad_norm": 0.33188432959790465,
"learning_rate": 3.723404255319149e-05,
"loss": 0.4029,
"step": 879
},
{
"epoch": 0.9904333145751266,
"grad_norm": 0.3654774295033461,
"learning_rate": 3.721318314559867e-05,
"loss": 0.4255,
"step": 880
},
{
"epoch": 0.9915588069780529,
"grad_norm": 0.366785518306503,
"learning_rate": 3.7192323738005844e-05,
"loss": 0.4449,
"step": 881
},
{
"epoch": 0.9926842993809791,
"grad_norm": 0.29826436924819194,
"learning_rate": 3.717146433041302e-05,
"loss": 0.4048,
"step": 882
},
{
"epoch": 0.9938097917839055,
"grad_norm": 0.3152150195685499,
"learning_rate": 3.715060492282019e-05,
"loss": 0.4156,
"step": 883
},
{
"epoch": 0.9949352841868317,
"grad_norm": 0.3790269660933605,
"learning_rate": 3.712974551522737e-05,
"loss": 0.4258,
"step": 884
},
{
"epoch": 0.996060776589758,
"grad_norm": 0.3234490218718985,
"learning_rate": 3.7108886107634545e-05,
"loss": 0.4225,
"step": 885
},
{
"epoch": 0.9971862689926843,
"grad_norm": 0.3692387676152948,
"learning_rate": 3.708802670004172e-05,
"loss": 0.4193,
"step": 886
},
{
"epoch": 0.9983117613956106,
"grad_norm": 0.3548734847141469,
"learning_rate": 3.70671672924489e-05,
"loss": 0.4223,
"step": 887
},
{
"epoch": 0.9994372537985369,
"grad_norm": 0.41390225631408695,
"learning_rate": 3.7046307884856076e-05,
"loss": 0.4223,
"step": 888
},
{
"epoch": 1.0,
"grad_norm": 0.41390225631408695,
"learning_rate": 3.7025448477263246e-05,
"loss": 0.43,
"step": 889
},
{
"epoch": 1.0011254924029263,
"grad_norm": 0.5580430959772991,
"learning_rate": 3.700458906967042e-05,
"loss": 0.3606,
"step": 890
},
{
"epoch": 1.0022509848058525,
"grad_norm": 0.38786233021722444,
"learning_rate": 3.69837296620776e-05,
"loss": 0.3516,
"step": 891
},
{
"epoch": 1.0033764772087788,
"grad_norm": 0.3607061049938279,
"learning_rate": 3.696287025448478e-05,
"loss": 0.3586,
"step": 892
},
{
"epoch": 1.004501969611705,
"grad_norm": 0.30722252549464857,
"learning_rate": 3.694201084689195e-05,
"loss": 0.3566,
"step": 893
},
{
"epoch": 1.0056274620146315,
"grad_norm": 0.4162073977345431,
"learning_rate": 3.6921151439299124e-05,
"loss": 0.3517,
"step": 894
},
{
"epoch": 1.0067529544175577,
"grad_norm": 0.3477012359425953,
"learning_rate": 3.69002920317063e-05,
"loss": 0.356,
"step": 895
},
{
"epoch": 1.007878446820484,
"grad_norm": 0.34334848103470345,
"learning_rate": 3.687943262411347e-05,
"loss": 0.3529,
"step": 896
},
{
"epoch": 1.0090039392234103,
"grad_norm": 0.3303966213040783,
"learning_rate": 3.6858573216520655e-05,
"loss": 0.3775,
"step": 897
},
{
"epoch": 1.0101294316263365,
"grad_norm": 0.41641276263804705,
"learning_rate": 3.6837713808927825e-05,
"loss": 0.3482,
"step": 898
},
{
"epoch": 1.0112549240292628,
"grad_norm": 0.3475211971953784,
"learning_rate": 3.6816854401335e-05,
"loss": 0.3469,
"step": 899
},
{
"epoch": 1.012380416432189,
"grad_norm": 0.3630650367930452,
"learning_rate": 3.679599499374218e-05,
"loss": 0.3733,
"step": 900
},
{
"epoch": 1.0135059088351153,
"grad_norm": 0.33917879336611284,
"learning_rate": 3.6775135586149356e-05,
"loss": 0.3613,
"step": 901
},
{
"epoch": 1.0146314012380417,
"grad_norm": 0.3916615454670656,
"learning_rate": 3.6754276178556526e-05,
"loss": 0.3642,
"step": 902
},
{
"epoch": 1.015756893640968,
"grad_norm": 0.387700709428207,
"learning_rate": 3.673341677096371e-05,
"loss": 0.337,
"step": 903
},
{
"epoch": 1.0168823860438942,
"grad_norm": 0.311008874794384,
"learning_rate": 3.671255736337088e-05,
"loss": 0.3464,
"step": 904
},
{
"epoch": 1.0180078784468205,
"grad_norm": 0.34204508328431077,
"learning_rate": 3.669169795577806e-05,
"loss": 0.3493,
"step": 905
},
{
"epoch": 1.0191333708497468,
"grad_norm": 0.35056912513693533,
"learning_rate": 3.6670838548185234e-05,
"loss": 0.3847,
"step": 906
},
{
"epoch": 1.020258863252673,
"grad_norm": 0.3603063090886555,
"learning_rate": 3.664997914059241e-05,
"loss": 0.3696,
"step": 907
},
{
"epoch": 1.0213843556555993,
"grad_norm": 0.3406429440812445,
"learning_rate": 3.662911973299958e-05,
"loss": 0.3585,
"step": 908
},
{
"epoch": 1.0225098480585255,
"grad_norm": 0.4146200559571759,
"learning_rate": 3.6608260325406765e-05,
"loss": 0.3617,
"step": 909
},
{
"epoch": 1.023635340461452,
"grad_norm": 0.30025908743312035,
"learning_rate": 3.6587400917813936e-05,
"loss": 0.3416,
"step": 910
},
{
"epoch": 1.0247608328643782,
"grad_norm": 0.4720811356812383,
"learning_rate": 3.656654151022111e-05,
"loss": 0.3533,
"step": 911
},
{
"epoch": 1.0258863252673045,
"grad_norm": 0.29184795311941897,
"learning_rate": 3.654568210262829e-05,
"loss": 0.3493,
"step": 912
},
{
"epoch": 1.0270118176702308,
"grad_norm": 0.385289462825186,
"learning_rate": 3.6524822695035466e-05,
"loss": 0.393,
"step": 913
},
{
"epoch": 1.028137310073157,
"grad_norm": 0.3107082501520784,
"learning_rate": 3.650396328744264e-05,
"loss": 0.3677,
"step": 914
},
{
"epoch": 1.0292628024760833,
"grad_norm": 0.2892635060197107,
"learning_rate": 3.6483103879849814e-05,
"loss": 0.3484,
"step": 915
},
{
"epoch": 1.0303882948790095,
"grad_norm": 0.37383301152112214,
"learning_rate": 3.646224447225699e-05,
"loss": 0.3741,
"step": 916
},
{
"epoch": 1.0315137872819358,
"grad_norm": 0.32042127431190587,
"learning_rate": 3.644138506466416e-05,
"loss": 0.3453,
"step": 917
},
{
"epoch": 1.032639279684862,
"grad_norm": 0.3227805251806716,
"learning_rate": 3.6420525657071345e-05,
"loss": 0.347,
"step": 918
},
{
"epoch": 1.0337647720877885,
"grad_norm": 0.33975552005827825,
"learning_rate": 3.6399666249478515e-05,
"loss": 0.342,
"step": 919
},
{
"epoch": 1.0348902644907148,
"grad_norm": 0.3053184721102955,
"learning_rate": 3.637880684188569e-05,
"loss": 0.368,
"step": 920
},
{
"epoch": 1.036015756893641,
"grad_norm": 0.4171758873578538,
"learning_rate": 3.635794743429287e-05,
"loss": 0.3506,
"step": 921
},
{
"epoch": 1.0371412492965673,
"grad_norm": 0.35788110167643483,
"learning_rate": 3.6337088026700046e-05,
"loss": 0.3678,
"step": 922
},
{
"epoch": 1.0382667416994935,
"grad_norm": 0.40422162482455976,
"learning_rate": 3.6316228619107216e-05,
"loss": 0.3841,
"step": 923
},
{
"epoch": 1.0393922341024198,
"grad_norm": 0.42302051382729106,
"learning_rate": 3.629536921151439e-05,
"loss": 0.3609,
"step": 924
},
{
"epoch": 1.040517726505346,
"grad_norm": 0.3002900676912074,
"learning_rate": 3.627450980392157e-05,
"loss": 0.3764,
"step": 925
},
{
"epoch": 1.0416432189082723,
"grad_norm": 0.4216178632940728,
"learning_rate": 3.625365039632875e-05,
"loss": 0.3525,
"step": 926
},
{
"epoch": 1.0427687113111987,
"grad_norm": 0.36722403261101394,
"learning_rate": 3.6232790988735924e-05,
"loss": 0.3651,
"step": 927
},
{
"epoch": 1.043894203714125,
"grad_norm": 0.37487765396444256,
"learning_rate": 3.62119315811431e-05,
"loss": 0.3732,
"step": 928
},
{
"epoch": 1.0450196961170513,
"grad_norm": 0.40248279158053446,
"learning_rate": 3.619107217355027e-05,
"loss": 0.3514,
"step": 929
},
{
"epoch": 1.0461451885199775,
"grad_norm": 0.34487298402942634,
"learning_rate": 3.617021276595745e-05,
"loss": 0.3453,
"step": 930
},
{
"epoch": 1.0472706809229038,
"grad_norm": 0.35894348708147356,
"learning_rate": 3.6149353358364625e-05,
"loss": 0.3445,
"step": 931
},
{
"epoch": 1.04839617332583,
"grad_norm": 0.46543989700724425,
"learning_rate": 3.61284939507718e-05,
"loss": 0.3554,
"step": 932
},
{
"epoch": 1.0495216657287563,
"grad_norm": 0.32251577447042856,
"learning_rate": 3.610763454317897e-05,
"loss": 0.3571,
"step": 933
},
{
"epoch": 1.0506471581316825,
"grad_norm": 0.3539766683535758,
"learning_rate": 3.608677513558615e-05,
"loss": 0.3291,
"step": 934
},
{
"epoch": 1.051772650534609,
"grad_norm": 0.34471085249350447,
"learning_rate": 3.6065915727993326e-05,
"loss": 0.3764,
"step": 935
},
{
"epoch": 1.0528981429375353,
"grad_norm": 0.33468302525089494,
"learning_rate": 3.6045056320400496e-05,
"loss": 0.3479,
"step": 936
},
{
"epoch": 1.0540236353404615,
"grad_norm": 0.36538591134232934,
"learning_rate": 3.602419691280768e-05,
"loss": 0.3642,
"step": 937
},
{
"epoch": 1.0551491277433878,
"grad_norm": 0.35282922968280045,
"learning_rate": 3.600333750521485e-05,
"loss": 0.3446,
"step": 938
},
{
"epoch": 1.056274620146314,
"grad_norm": 0.35478764255979334,
"learning_rate": 3.598247809762203e-05,
"loss": 0.3752,
"step": 939
},
{
"epoch": 1.0574001125492403,
"grad_norm": 0.3565613451966995,
"learning_rate": 3.5961618690029204e-05,
"loss": 0.362,
"step": 940
},
{
"epoch": 1.0585256049521665,
"grad_norm": 0.33132722259601055,
"learning_rate": 3.594075928243638e-05,
"loss": 0.3559,
"step": 941
},
{
"epoch": 1.0596510973550928,
"grad_norm": 0.34347700089780575,
"learning_rate": 3.591989987484355e-05,
"loss": 0.3641,
"step": 942
},
{
"epoch": 1.060776589758019,
"grad_norm": 0.2772476546624268,
"learning_rate": 3.5899040467250735e-05,
"loss": 0.3433,
"step": 943
},
{
"epoch": 1.0619020821609455,
"grad_norm": 0.36078868188752466,
"learning_rate": 3.5878181059657906e-05,
"loss": 0.36,
"step": 944
},
{
"epoch": 1.0630275745638718,
"grad_norm": 0.2927763816273808,
"learning_rate": 3.585732165206508e-05,
"loss": 0.3757,
"step": 945
},
{
"epoch": 1.064153066966798,
"grad_norm": 0.31067799008966573,
"learning_rate": 3.583646224447226e-05,
"loss": 0.3375,
"step": 946
},
{
"epoch": 1.0652785593697243,
"grad_norm": 0.30786259543828726,
"learning_rate": 3.5815602836879437e-05,
"loss": 0.3691,
"step": 947
},
{
"epoch": 1.0664040517726505,
"grad_norm": 0.34927488285962766,
"learning_rate": 3.579474342928661e-05,
"loss": 0.3512,
"step": 948
},
{
"epoch": 1.0675295441755768,
"grad_norm": 0.3134128528998366,
"learning_rate": 3.577388402169379e-05,
"loss": 0.3684,
"step": 949
},
{
"epoch": 1.068655036578503,
"grad_norm": 0.3684381541500359,
"learning_rate": 3.575302461410096e-05,
"loss": 0.3635,
"step": 950
},
{
"epoch": 1.0697805289814293,
"grad_norm": 0.3071501276127385,
"learning_rate": 3.573216520650814e-05,
"loss": 0.3629,
"step": 951
},
{
"epoch": 1.0709060213843558,
"grad_norm": 0.3650935121688607,
"learning_rate": 3.5711305798915315e-05,
"loss": 0.352,
"step": 952
},
{
"epoch": 1.072031513787282,
"grad_norm": 0.3004157301630184,
"learning_rate": 3.569044639132249e-05,
"loss": 0.3627,
"step": 953
},
{
"epoch": 1.0731570061902083,
"grad_norm": 0.3588467213474463,
"learning_rate": 3.566958698372966e-05,
"loss": 0.378,
"step": 954
},
{
"epoch": 1.0742824985931345,
"grad_norm": 0.38695693104692636,
"learning_rate": 3.564872757613684e-05,
"loss": 0.3558,
"step": 955
},
{
"epoch": 1.0754079909960608,
"grad_norm": 0.30329694533620805,
"learning_rate": 3.5627868168544016e-05,
"loss": 0.3841,
"step": 956
},
{
"epoch": 1.076533483398987,
"grad_norm": 0.34905611952609783,
"learning_rate": 3.5607008760951186e-05,
"loss": 0.3689,
"step": 957
},
{
"epoch": 1.0776589758019133,
"grad_norm": 0.28800778538826344,
"learning_rate": 3.558614935335837e-05,
"loss": 0.3543,
"step": 958
},
{
"epoch": 1.0787844682048395,
"grad_norm": 0.3746527261236155,
"learning_rate": 3.556528994576554e-05,
"loss": 0.366,
"step": 959
},
{
"epoch": 1.079909960607766,
"grad_norm": 0.32663591501026235,
"learning_rate": 3.554443053817272e-05,
"loss": 0.3499,
"step": 960
},
{
"epoch": 1.0810354530106923,
"grad_norm": 0.3328189109583666,
"learning_rate": 3.5523571130579894e-05,
"loss": 0.353,
"step": 961
},
{
"epoch": 1.0821609454136185,
"grad_norm": 0.31964664375303664,
"learning_rate": 3.550271172298707e-05,
"loss": 0.3672,
"step": 962
},
{
"epoch": 1.0832864378165448,
"grad_norm": 0.36918332363958006,
"learning_rate": 3.548185231539424e-05,
"loss": 0.3798,
"step": 963
},
{
"epoch": 1.084411930219471,
"grad_norm": 0.3254223917013834,
"learning_rate": 3.546099290780142e-05,
"loss": 0.3559,
"step": 964
},
{
"epoch": 1.0855374226223973,
"grad_norm": 0.3008814703536633,
"learning_rate": 3.5440133500208595e-05,
"loss": 0.3609,
"step": 965
},
{
"epoch": 1.0866629150253235,
"grad_norm": 0.35240736109329646,
"learning_rate": 3.541927409261577e-05,
"loss": 0.3777,
"step": 966
},
{
"epoch": 1.0877884074282498,
"grad_norm": 0.3869312281732699,
"learning_rate": 3.539841468502295e-05,
"loss": 0.3724,
"step": 967
},
{
"epoch": 1.088913899831176,
"grad_norm": 0.30726021570614737,
"learning_rate": 3.5377555277430126e-05,
"loss": 0.3531,
"step": 968
},
{
"epoch": 1.0900393922341025,
"grad_norm": 0.34236583353183286,
"learning_rate": 3.5356695869837296e-05,
"loss": 0.3608,
"step": 969
},
{
"epoch": 1.0911648846370288,
"grad_norm": 0.2916866803109591,
"learning_rate": 3.533583646224447e-05,
"loss": 0.3624,
"step": 970
},
{
"epoch": 1.092290377039955,
"grad_norm": 0.3145203080926422,
"learning_rate": 3.531497705465165e-05,
"loss": 0.3684,
"step": 971
},
{
"epoch": 1.0934158694428813,
"grad_norm": 0.2873541218671502,
"learning_rate": 3.529411764705883e-05,
"loss": 0.3617,
"step": 972
},
{
"epoch": 1.0945413618458075,
"grad_norm": 0.3506652103429166,
"learning_rate": 3.5273258239466e-05,
"loss": 0.3583,
"step": 973
},
{
"epoch": 1.0956668542487338,
"grad_norm": 0.3025123158669694,
"learning_rate": 3.525239883187318e-05,
"loss": 0.3472,
"step": 974
},
{
"epoch": 1.09679234665166,
"grad_norm": 0.2899074126357094,
"learning_rate": 3.523153942428035e-05,
"loss": 0.3675,
"step": 975
},
{
"epoch": 1.0979178390545863,
"grad_norm": 0.3150990472406033,
"learning_rate": 3.521068001668753e-05,
"loss": 0.3636,
"step": 976
},
{
"epoch": 1.0990433314575128,
"grad_norm": 0.35489391655027186,
"learning_rate": 3.5189820609094705e-05,
"loss": 0.3384,
"step": 977
},
{
"epoch": 1.100168823860439,
"grad_norm": 0.3041199542435297,
"learning_rate": 3.5168961201501876e-05,
"loss": 0.3571,
"step": 978
},
{
"epoch": 1.1012943162633653,
"grad_norm": 0.31637443077212757,
"learning_rate": 3.514810179390905e-05,
"loss": 0.3703,
"step": 979
},
{
"epoch": 1.1024198086662915,
"grad_norm": 0.33113581691565325,
"learning_rate": 3.512724238631623e-05,
"loss": 0.35,
"step": 980
},
{
"epoch": 1.1035453010692178,
"grad_norm": 0.3300457711599469,
"learning_rate": 3.5106382978723407e-05,
"loss": 0.3485,
"step": 981
},
{
"epoch": 1.104670793472144,
"grad_norm": 0.37342013448224476,
"learning_rate": 3.508552357113058e-05,
"loss": 0.3543,
"step": 982
},
{
"epoch": 1.1057962858750703,
"grad_norm": 0.36084265787497494,
"learning_rate": 3.506466416353776e-05,
"loss": 0.3499,
"step": 983
},
{
"epoch": 1.1069217782779965,
"grad_norm": 0.36650053348727774,
"learning_rate": 3.504380475594493e-05,
"loss": 0.3727,
"step": 984
},
{
"epoch": 1.108047270680923,
"grad_norm": 0.38335191540233127,
"learning_rate": 3.502294534835211e-05,
"loss": 0.3557,
"step": 985
},
{
"epoch": 1.1091727630838493,
"grad_norm": 0.36320976195356514,
"learning_rate": 3.5002085940759285e-05,
"loss": 0.382,
"step": 986
},
{
"epoch": 1.1102982554867755,
"grad_norm": 0.38636958474248506,
"learning_rate": 3.498122653316646e-05,
"loss": 0.3402,
"step": 987
},
{
"epoch": 1.1114237478897018,
"grad_norm": 0.38017701551768956,
"learning_rate": 3.496036712557363e-05,
"loss": 0.3742,
"step": 988
},
{
"epoch": 1.112549240292628,
"grad_norm": 0.3198258149962093,
"learning_rate": 3.4939507717980816e-05,
"loss": 0.3432,
"step": 989
},
{
"epoch": 1.1136747326955543,
"grad_norm": 0.38060186204014107,
"learning_rate": 3.4918648310387986e-05,
"loss": 0.364,
"step": 990
},
{
"epoch": 1.1148002250984805,
"grad_norm": 0.3522538503310745,
"learning_rate": 3.489778890279516e-05,
"loss": 0.3862,
"step": 991
},
{
"epoch": 1.1159257175014068,
"grad_norm": 0.34893950721299544,
"learning_rate": 3.487692949520234e-05,
"loss": 0.3674,
"step": 992
},
{
"epoch": 1.117051209904333,
"grad_norm": 0.3145664530999275,
"learning_rate": 3.485607008760952e-05,
"loss": 0.3623,
"step": 993
},
{
"epoch": 1.1181767023072595,
"grad_norm": 0.38231007603706296,
"learning_rate": 3.483521068001669e-05,
"loss": 0.3513,
"step": 994
},
{
"epoch": 1.1193021947101858,
"grad_norm": 0.29574406471189,
"learning_rate": 3.481435127242387e-05,
"loss": 0.3686,
"step": 995
},
{
"epoch": 1.120427687113112,
"grad_norm": 0.3786384191919254,
"learning_rate": 3.479349186483104e-05,
"loss": 0.3496,
"step": 996
},
{
"epoch": 1.1215531795160383,
"grad_norm": 0.27933782961377807,
"learning_rate": 3.477263245723821e-05,
"loss": 0.3865,
"step": 997
},
{
"epoch": 1.1226786719189645,
"grad_norm": 0.3796958540762593,
"learning_rate": 3.4751773049645395e-05,
"loss": 0.3701,
"step": 998
},
{
"epoch": 1.1238041643218908,
"grad_norm": 0.31019085193512064,
"learning_rate": 3.4730913642052565e-05,
"loss": 0.3544,
"step": 999
},
{
"epoch": 1.124929656724817,
"grad_norm": 0.3894747761447629,
"learning_rate": 3.471005423445974e-05,
"loss": 0.3613,
"step": 1000
},
{
"epoch": 1.1260551491277433,
"grad_norm": 0.3848999285142024,
"learning_rate": 3.468919482686692e-05,
"loss": 0.3572,
"step": 1001
},
{
"epoch": 1.1271806415306695,
"grad_norm": 0.4075083886945119,
"learning_rate": 3.4668335419274096e-05,
"loss": 0.3534,
"step": 1002
},
{
"epoch": 1.128306133933596,
"grad_norm": 0.4244922841249029,
"learning_rate": 3.4647476011681266e-05,
"loss": 0.3857,
"step": 1003
},
{
"epoch": 1.1294316263365223,
"grad_norm": 0.3575947287049676,
"learning_rate": 3.462661660408844e-05,
"loss": 0.3494,
"step": 1004
},
{
"epoch": 1.1305571187394485,
"grad_norm": 0.3920246518678635,
"learning_rate": 3.460575719649562e-05,
"loss": 0.3693,
"step": 1005
},
{
"epoch": 1.1316826111423748,
"grad_norm": 0.3065280136400847,
"learning_rate": 3.45848977889028e-05,
"loss": 0.3352,
"step": 1006
},
{
"epoch": 1.132808103545301,
"grad_norm": 0.38525744406438595,
"learning_rate": 3.4564038381309974e-05,
"loss": 0.353,
"step": 1007
},
{
"epoch": 1.1339335959482273,
"grad_norm": 0.47272322177864035,
"learning_rate": 3.454317897371715e-05,
"loss": 0.3673,
"step": 1008
},
{
"epoch": 1.1350590883511535,
"grad_norm": 0.3327944075995892,
"learning_rate": 3.452231956612432e-05,
"loss": 0.3523,
"step": 1009
},
{
"epoch": 1.13618458075408,
"grad_norm": 0.42906579303424525,
"learning_rate": 3.45014601585315e-05,
"loss": 0.3577,
"step": 1010
},
{
"epoch": 1.1373100731570063,
"grad_norm": 0.31630743768076713,
"learning_rate": 3.4480600750938675e-05,
"loss": 0.3571,
"step": 1011
},
{
"epoch": 1.1384355655599325,
"grad_norm": 0.41005007736044136,
"learning_rate": 3.445974134334585e-05,
"loss": 0.362,
"step": 1012
},
{
"epoch": 1.1395610579628588,
"grad_norm": 0.3846148750924408,
"learning_rate": 3.443888193575302e-05,
"loss": 0.3554,
"step": 1013
},
{
"epoch": 1.140686550365785,
"grad_norm": 0.39499988480138304,
"learning_rate": 3.4418022528160206e-05,
"loss": 0.367,
"step": 1014
},
{
"epoch": 1.1418120427687113,
"grad_norm": 0.35657946077097175,
"learning_rate": 3.4397163120567377e-05,
"loss": 0.3694,
"step": 1015
},
{
"epoch": 1.1429375351716375,
"grad_norm": 0.3728438143327632,
"learning_rate": 3.4376303712974554e-05,
"loss": 0.3713,
"step": 1016
},
{
"epoch": 1.1440630275745638,
"grad_norm": 0.34659822653002426,
"learning_rate": 3.435544430538173e-05,
"loss": 0.3584,
"step": 1017
},
{
"epoch": 1.14518851997749,
"grad_norm": 0.3828982028856398,
"learning_rate": 3.43345848977889e-05,
"loss": 0.357,
"step": 1018
},
{
"epoch": 1.1463140123804165,
"grad_norm": 0.35840428604352054,
"learning_rate": 3.431372549019608e-05,
"loss": 0.3658,
"step": 1019
},
{
"epoch": 1.1474395047833428,
"grad_norm": 0.3642341763560189,
"learning_rate": 3.4292866082603255e-05,
"loss": 0.3768,
"step": 1020
},
{
"epoch": 1.148564997186269,
"grad_norm": 0.47028026081900165,
"learning_rate": 3.427200667501043e-05,
"loss": 0.3448,
"step": 1021
},
{
"epoch": 1.1496904895891953,
"grad_norm": 0.33137638092807364,
"learning_rate": 3.42511472674176e-05,
"loss": 0.3855,
"step": 1022
},
{
"epoch": 1.1508159819921215,
"grad_norm": 0.4049631157313659,
"learning_rate": 3.4230287859824786e-05,
"loss": 0.3801,
"step": 1023
},
{
"epoch": 1.1519414743950478,
"grad_norm": 0.3829633936239526,
"learning_rate": 3.4209428452231956e-05,
"loss": 0.3791,
"step": 1024
},
{
"epoch": 1.153066966797974,
"grad_norm": 0.42759635786809663,
"learning_rate": 3.418856904463913e-05,
"loss": 0.3676,
"step": 1025
},
{
"epoch": 1.1541924592009003,
"grad_norm": 0.3728776125817692,
"learning_rate": 3.416770963704631e-05,
"loss": 0.3622,
"step": 1026
},
{
"epoch": 1.1553179516038266,
"grad_norm": 0.39380341402257635,
"learning_rate": 3.414685022945349e-05,
"loss": 0.3785,
"step": 1027
},
{
"epoch": 1.156443444006753,
"grad_norm": 0.32076593702973827,
"learning_rate": 3.412599082186066e-05,
"loss": 0.3745,
"step": 1028
},
{
"epoch": 1.1575689364096793,
"grad_norm": 0.32908758752319733,
"learning_rate": 3.410513141426784e-05,
"loss": 0.3496,
"step": 1029
},
{
"epoch": 1.1586944288126055,
"grad_norm": 0.41768970871312155,
"learning_rate": 3.408427200667501e-05,
"loss": 0.3575,
"step": 1030
},
{
"epoch": 1.1598199212155318,
"grad_norm": 0.3106359891104045,
"learning_rate": 3.406341259908219e-05,
"loss": 0.3516,
"step": 1031
},
{
"epoch": 1.160945413618458,
"grad_norm": 0.3870701068020313,
"learning_rate": 3.4042553191489365e-05,
"loss": 0.3356,
"step": 1032
},
{
"epoch": 1.1620709060213843,
"grad_norm": 0.38611106269123546,
"learning_rate": 3.402169378389654e-05,
"loss": 0.3469,
"step": 1033
},
{
"epoch": 1.1631963984243106,
"grad_norm": 0.3255124156021805,
"learning_rate": 3.400083437630371e-05,
"loss": 0.3722,
"step": 1034
},
{
"epoch": 1.164321890827237,
"grad_norm": 0.32836642792719567,
"learning_rate": 3.3979974968710896e-05,
"loss": 0.3544,
"step": 1035
},
{
"epoch": 1.1654473832301633,
"grad_norm": 0.3805911934596958,
"learning_rate": 3.3959115561118066e-05,
"loss": 0.3982,
"step": 1036
},
{
"epoch": 1.1665728756330895,
"grad_norm": 0.3368162160417577,
"learning_rate": 3.393825615352524e-05,
"loss": 0.3679,
"step": 1037
},
{
"epoch": 1.1676983680360158,
"grad_norm": 0.31363563073754847,
"learning_rate": 3.391739674593242e-05,
"loss": 0.3529,
"step": 1038
},
{
"epoch": 1.168823860438942,
"grad_norm": 0.34006739877010494,
"learning_rate": 3.389653733833959e-05,
"loss": 0.3463,
"step": 1039
},
{
"epoch": 1.1699493528418683,
"grad_norm": 0.3100061821836274,
"learning_rate": 3.387567793074677e-05,
"loss": 0.3381,
"step": 1040
},
{
"epoch": 1.1710748452447945,
"grad_norm": 0.3065807803890228,
"learning_rate": 3.3854818523153944e-05,
"loss": 0.3651,
"step": 1041
},
{
"epoch": 1.1722003376477208,
"grad_norm": 0.32611882573130585,
"learning_rate": 3.383395911556112e-05,
"loss": 0.3529,
"step": 1042
},
{
"epoch": 1.173325830050647,
"grad_norm": 0.28895452201759864,
"learning_rate": 3.381309970796829e-05,
"loss": 0.3307,
"step": 1043
},
{
"epoch": 1.1744513224535735,
"grad_norm": 0.31616663311663623,
"learning_rate": 3.379224030037547e-05,
"loss": 0.3615,
"step": 1044
},
{
"epoch": 1.1755768148564998,
"grad_norm": 0.2999011173077538,
"learning_rate": 3.3771380892782645e-05,
"loss": 0.3527,
"step": 1045
},
{
"epoch": 1.176702307259426,
"grad_norm": 0.28604936736274933,
"learning_rate": 3.375052148518982e-05,
"loss": 0.361,
"step": 1046
},
{
"epoch": 1.1778277996623523,
"grad_norm": 0.3028269137775988,
"learning_rate": 3.3729662077597e-05,
"loss": 0.3668,
"step": 1047
},
{
"epoch": 1.1789532920652785,
"grad_norm": 0.36698195409495143,
"learning_rate": 3.3708802670004176e-05,
"loss": 0.352,
"step": 1048
},
{
"epoch": 1.1800787844682048,
"grad_norm": 0.2951939270230831,
"learning_rate": 3.3687943262411347e-05,
"loss": 0.3533,
"step": 1049
},
{
"epoch": 1.181204276871131,
"grad_norm": 0.4064761843327334,
"learning_rate": 3.3667083854818524e-05,
"loss": 0.3601,
"step": 1050
},
{
"epoch": 1.1823297692740573,
"grad_norm": 0.325934767924338,
"learning_rate": 3.36462244472257e-05,
"loss": 0.366,
"step": 1051
},
{
"epoch": 1.1834552616769836,
"grad_norm": 0.3444374492643726,
"learning_rate": 3.362536503963288e-05,
"loss": 0.3591,
"step": 1052
},
{
"epoch": 1.18458075407991,
"grad_norm": 0.3902013079098464,
"learning_rate": 3.360450563204005e-05,
"loss": 0.3609,
"step": 1053
},
{
"epoch": 1.1857062464828363,
"grad_norm": 0.3552567977795283,
"learning_rate": 3.358364622444723e-05,
"loss": 0.3824,
"step": 1054
},
{
"epoch": 1.1868317388857625,
"grad_norm": 0.5473634143542325,
"learning_rate": 3.35627868168544e-05,
"loss": 0.344,
"step": 1055
},
{
"epoch": 1.1879572312886888,
"grad_norm": 0.31822857141954713,
"learning_rate": 3.354192740926158e-05,
"loss": 0.34,
"step": 1056
},
{
"epoch": 1.189082723691615,
"grad_norm": 0.35648383062484057,
"learning_rate": 3.3521068001668756e-05,
"loss": 0.3664,
"step": 1057
},
{
"epoch": 1.1902082160945413,
"grad_norm": 0.3533726981414865,
"learning_rate": 3.350020859407593e-05,
"loss": 0.3643,
"step": 1058
},
{
"epoch": 1.1913337084974676,
"grad_norm": 0.38846901904691766,
"learning_rate": 3.34793491864831e-05,
"loss": 0.364,
"step": 1059
},
{
"epoch": 1.192459200900394,
"grad_norm": 0.32829805282614477,
"learning_rate": 3.345848977889028e-05,
"loss": 0.3505,
"step": 1060
},
{
"epoch": 1.1935846933033203,
"grad_norm": 0.3371243132688832,
"learning_rate": 3.343763037129746e-05,
"loss": 0.3706,
"step": 1061
},
{
"epoch": 1.1947101857062465,
"grad_norm": 0.29390329610439453,
"learning_rate": 3.341677096370463e-05,
"loss": 0.3513,
"step": 1062
},
{
"epoch": 1.1958356781091728,
"grad_norm": 0.3589333659631211,
"learning_rate": 3.339591155611181e-05,
"loss": 0.364,
"step": 1063
},
{
"epoch": 1.196961170512099,
"grad_norm": 0.3025901807833534,
"learning_rate": 3.337505214851898e-05,
"loss": 0.3716,
"step": 1064
},
{
"epoch": 1.1980866629150253,
"grad_norm": 0.2990903113895738,
"learning_rate": 3.335419274092616e-05,
"loss": 0.3703,
"step": 1065
},
{
"epoch": 1.1992121553179516,
"grad_norm": 0.3084522992492389,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.3423,
"step": 1066
},
{
"epoch": 1.2003376477208778,
"grad_norm": 0.2833543979726358,
"learning_rate": 3.331247392574051e-05,
"loss": 0.3794,
"step": 1067
},
{
"epoch": 1.201463140123804,
"grad_norm": 0.3502254927161911,
"learning_rate": 3.329161451814768e-05,
"loss": 0.3588,
"step": 1068
},
{
"epoch": 1.2025886325267305,
"grad_norm": 0.2824861573083505,
"learning_rate": 3.3270755110554866e-05,
"loss": 0.3375,
"step": 1069
},
{
"epoch": 1.2037141249296568,
"grad_norm": 0.32275485870283527,
"learning_rate": 3.3249895702962036e-05,
"loss": 0.3659,
"step": 1070
},
{
"epoch": 1.204839617332583,
"grad_norm": 0.2831771258197277,
"learning_rate": 3.322903629536921e-05,
"loss": 0.3608,
"step": 1071
},
{
"epoch": 1.2059651097355093,
"grad_norm": 0.4099026461053303,
"learning_rate": 3.320817688777639e-05,
"loss": 0.3657,
"step": 1072
},
{
"epoch": 1.2070906021384356,
"grad_norm": 0.2988459528156424,
"learning_rate": 3.318731748018357e-05,
"loss": 0.3612,
"step": 1073
},
{
"epoch": 1.2082160945413618,
"grad_norm": 0.3285103143387034,
"learning_rate": 3.316645807259074e-05,
"loss": 0.3243,
"step": 1074
},
{
"epoch": 1.209341586944288,
"grad_norm": 0.3140866233456728,
"learning_rate": 3.314559866499792e-05,
"loss": 0.3605,
"step": 1075
},
{
"epoch": 1.2104670793472143,
"grad_norm": 0.3136304362130377,
"learning_rate": 3.312473925740509e-05,
"loss": 0.3518,
"step": 1076
},
{
"epoch": 1.2115925717501406,
"grad_norm": 0.37704920383565566,
"learning_rate": 3.310387984981227e-05,
"loss": 0.3505,
"step": 1077
},
{
"epoch": 1.212718064153067,
"grad_norm": 0.3386484501892276,
"learning_rate": 3.3083020442219445e-05,
"loss": 0.3581,
"step": 1078
},
{
"epoch": 1.2138435565559933,
"grad_norm": 0.2756287445653643,
"learning_rate": 3.306216103462662e-05,
"loss": 0.3508,
"step": 1079
},
{
"epoch": 1.2149690489589196,
"grad_norm": 0.3727446701400803,
"learning_rate": 3.304130162703379e-05,
"loss": 0.3637,
"step": 1080
},
{
"epoch": 1.2160945413618458,
"grad_norm": 0.34205106067470487,
"learning_rate": 3.302044221944097e-05,
"loss": 0.3719,
"step": 1081
},
{
"epoch": 1.217220033764772,
"grad_norm": 0.277943785807938,
"learning_rate": 3.2999582811848146e-05,
"loss": 0.3663,
"step": 1082
},
{
"epoch": 1.2183455261676983,
"grad_norm": 0.32778887865165535,
"learning_rate": 3.2978723404255317e-05,
"loss": 0.3788,
"step": 1083
},
{
"epoch": 1.2194710185706246,
"grad_norm": 0.35850613973050943,
"learning_rate": 3.2957863996662494e-05,
"loss": 0.3516,
"step": 1084
},
{
"epoch": 1.220596510973551,
"grad_norm": 0.32265446214334986,
"learning_rate": 3.293700458906967e-05,
"loss": 0.3637,
"step": 1085
},
{
"epoch": 1.2217220033764773,
"grad_norm": 0.32337082624436203,
"learning_rate": 3.291614518147685e-05,
"loss": 0.353,
"step": 1086
},
{
"epoch": 1.2228474957794035,
"grad_norm": 0.3566976026538077,
"learning_rate": 3.2895285773884024e-05,
"loss": 0.3702,
"step": 1087
},
{
"epoch": 1.2239729881823298,
"grad_norm": 0.3602820801303339,
"learning_rate": 3.28744263662912e-05,
"loss": 0.3478,
"step": 1088
},
{
"epoch": 1.225098480585256,
"grad_norm": 0.3167430696040855,
"learning_rate": 3.285356695869837e-05,
"loss": 0.3659,
"step": 1089
},
{
"epoch": 1.2262239729881823,
"grad_norm": 0.3018055329469023,
"learning_rate": 3.283270755110555e-05,
"loss": 0.3588,
"step": 1090
},
{
"epoch": 1.2273494653911086,
"grad_norm": 0.3230175863991661,
"learning_rate": 3.2811848143512726e-05,
"loss": 0.3843,
"step": 1091
},
{
"epoch": 1.2284749577940348,
"grad_norm": 0.33437635395778137,
"learning_rate": 3.27909887359199e-05,
"loss": 0.3767,
"step": 1092
},
{
"epoch": 1.229600450196961,
"grad_norm": 0.2981951069574055,
"learning_rate": 3.277012932832707e-05,
"loss": 0.3595,
"step": 1093
},
{
"epoch": 1.2307259425998875,
"grad_norm": 0.3380111260459277,
"learning_rate": 3.2749269920734257e-05,
"loss": 0.3709,
"step": 1094
},
{
"epoch": 1.2318514350028138,
"grad_norm": 0.3105746941727841,
"learning_rate": 3.272841051314143e-05,
"loss": 0.3683,
"step": 1095
},
{
"epoch": 1.23297692740574,
"grad_norm": 0.30933018515689015,
"learning_rate": 3.2707551105548604e-05,
"loss": 0.3592,
"step": 1096
},
{
"epoch": 1.2341024198086663,
"grad_norm": 0.3281758704709362,
"learning_rate": 3.268669169795578e-05,
"loss": 0.3606,
"step": 1097
},
{
"epoch": 1.2352279122115926,
"grad_norm": 0.27019773248184603,
"learning_rate": 3.266583229036296e-05,
"loss": 0.3761,
"step": 1098
},
{
"epoch": 1.2363534046145188,
"grad_norm": 0.3336597777910512,
"learning_rate": 3.264497288277013e-05,
"loss": 0.3959,
"step": 1099
},
{
"epoch": 1.237478897017445,
"grad_norm": 0.31907940795687006,
"learning_rate": 3.262411347517731e-05,
"loss": 0.3615,
"step": 1100
},
{
"epoch": 1.2386043894203713,
"grad_norm": 0.3307050654215005,
"learning_rate": 3.260325406758448e-05,
"loss": 0.3705,
"step": 1101
},
{
"epoch": 1.2397298818232976,
"grad_norm": 0.31539392998297533,
"learning_rate": 3.258239465999165e-05,
"loss": 0.3495,
"step": 1102
},
{
"epoch": 1.240855374226224,
"grad_norm": 0.33795761226174353,
"learning_rate": 3.2561535252398836e-05,
"loss": 0.3268,
"step": 1103
},
{
"epoch": 1.2419808666291503,
"grad_norm": 0.3027820655749601,
"learning_rate": 3.2540675844806006e-05,
"loss": 0.3408,
"step": 1104
},
{
"epoch": 1.2431063590320766,
"grad_norm": 0.3679583282318403,
"learning_rate": 3.251981643721318e-05,
"loss": 0.3677,
"step": 1105
},
{
"epoch": 1.2442318514350028,
"grad_norm": 0.3613263755121883,
"learning_rate": 3.249895702962036e-05,
"loss": 0.3829,
"step": 1106
},
{
"epoch": 1.245357343837929,
"grad_norm": 0.24750292423243272,
"learning_rate": 3.247809762202754e-05,
"loss": 0.3482,
"step": 1107
},
{
"epoch": 1.2464828362408553,
"grad_norm": 0.3522480210531916,
"learning_rate": 3.245723821443471e-05,
"loss": 0.3625,
"step": 1108
},
{
"epoch": 1.2476083286437816,
"grad_norm": 0.31652713432842655,
"learning_rate": 3.243637880684189e-05,
"loss": 0.3671,
"step": 1109
},
{
"epoch": 1.248733821046708,
"grad_norm": 0.2933496822923214,
"learning_rate": 3.241551939924906e-05,
"loss": 0.3497,
"step": 1110
},
{
"epoch": 1.2498593134496343,
"grad_norm": 0.32050895251241057,
"learning_rate": 3.239465999165624e-05,
"loss": 0.3716,
"step": 1111
},
{
"epoch": 1.2509848058525606,
"grad_norm": 0.3121935413715743,
"learning_rate": 3.2373800584063415e-05,
"loss": 0.3771,
"step": 1112
},
{
"epoch": 1.2521102982554868,
"grad_norm": 0.32265588805772627,
"learning_rate": 3.235294117647059e-05,
"loss": 0.3439,
"step": 1113
},
{
"epoch": 1.253235790658413,
"grad_norm": 0.3064712619565091,
"learning_rate": 3.233208176887776e-05,
"loss": 0.3546,
"step": 1114
},
{
"epoch": 1.2543612830613393,
"grad_norm": 0.2996084699077036,
"learning_rate": 3.2311222361284946e-05,
"loss": 0.3833,
"step": 1115
},
{
"epoch": 1.2554867754642656,
"grad_norm": 0.3107489193677045,
"learning_rate": 3.2290362953692116e-05,
"loss": 0.364,
"step": 1116
},
{
"epoch": 1.2566122678671918,
"grad_norm": 0.29187918061969403,
"learning_rate": 3.226950354609929e-05,
"loss": 0.3504,
"step": 1117
},
{
"epoch": 1.257737760270118,
"grad_norm": 0.2941379965996245,
"learning_rate": 3.224864413850647e-05,
"loss": 0.3392,
"step": 1118
},
{
"epoch": 1.2588632526730446,
"grad_norm": 0.25115923486308955,
"learning_rate": 3.222778473091365e-05,
"loss": 0.3876,
"step": 1119
},
{
"epoch": 1.2599887450759708,
"grad_norm": 0.3252010811279875,
"learning_rate": 3.220692532332082e-05,
"loss": 0.3375,
"step": 1120
},
{
"epoch": 1.261114237478897,
"grad_norm": 0.29814156629055977,
"learning_rate": 3.2186065915727994e-05,
"loss": 0.3401,
"step": 1121
},
{
"epoch": 1.2622397298818233,
"grad_norm": 0.31902570430326976,
"learning_rate": 3.216520650813517e-05,
"loss": 0.3732,
"step": 1122
},
{
"epoch": 1.2633652222847496,
"grad_norm": 0.3010703802720578,
"learning_rate": 3.214434710054234e-05,
"loss": 0.358,
"step": 1123
},
{
"epoch": 1.2644907146876758,
"grad_norm": 0.32852710550779146,
"learning_rate": 3.2123487692949525e-05,
"loss": 0.3525,
"step": 1124
},
{
"epoch": 1.265616207090602,
"grad_norm": 0.32212180119638056,
"learning_rate": 3.2102628285356696e-05,
"loss": 0.3749,
"step": 1125
},
{
"epoch": 1.2667416994935286,
"grad_norm": 0.378384113691716,
"learning_rate": 3.208176887776387e-05,
"loss": 0.3669,
"step": 1126
},
{
"epoch": 1.2678671918964546,
"grad_norm": 0.31165403587755924,
"learning_rate": 3.206090947017105e-05,
"loss": 0.3559,
"step": 1127
},
{
"epoch": 1.268992684299381,
"grad_norm": 0.3679615615830758,
"learning_rate": 3.2040050062578227e-05,
"loss": 0.3623,
"step": 1128
},
{
"epoch": 1.2701181767023073,
"grad_norm": 0.3467806488910905,
"learning_rate": 3.20191906549854e-05,
"loss": 0.3771,
"step": 1129
},
{
"epoch": 1.2712436691052336,
"grad_norm": 1.9333645104311041,
"learning_rate": 3.1998331247392574e-05,
"loss": 0.3809,
"step": 1130
},
{
"epoch": 1.2723691615081598,
"grad_norm": 0.4215069325465578,
"learning_rate": 3.197747183979975e-05,
"loss": 0.3489,
"step": 1131
},
{
"epoch": 1.273494653911086,
"grad_norm": 0.2879811482225369,
"learning_rate": 3.195661243220693e-05,
"loss": 0.3627,
"step": 1132
},
{
"epoch": 1.2746201463140123,
"grad_norm": 0.4477759704739148,
"learning_rate": 3.19357530246141e-05,
"loss": 0.3623,
"step": 1133
},
{
"epoch": 1.2757456387169386,
"grad_norm": 0.3424164269682256,
"learning_rate": 3.191489361702128e-05,
"loss": 0.3476,
"step": 1134
},
{
"epoch": 1.276871131119865,
"grad_norm": 0.32862691867356353,
"learning_rate": 3.189403420942845e-05,
"loss": 0.3649,
"step": 1135
},
{
"epoch": 1.277996623522791,
"grad_norm": 0.3209270264744574,
"learning_rate": 3.187317480183563e-05,
"loss": 0.3535,
"step": 1136
},
{
"epoch": 1.2791221159257176,
"grad_norm": 0.3565891148820592,
"learning_rate": 3.1852315394242806e-05,
"loss": 0.3443,
"step": 1137
},
{
"epoch": 1.2802476083286438,
"grad_norm": 0.28408074419058515,
"learning_rate": 3.183145598664998e-05,
"loss": 0.369,
"step": 1138
},
{
"epoch": 1.28137310073157,
"grad_norm": 0.3637840011075196,
"learning_rate": 3.181059657905715e-05,
"loss": 0.3608,
"step": 1139
},
{
"epoch": 1.2824985931344963,
"grad_norm": 0.3595209908718878,
"learning_rate": 3.178973717146434e-05,
"loss": 0.3493,
"step": 1140
},
{
"epoch": 1.2836240855374226,
"grad_norm": 0.26496883846043384,
"learning_rate": 3.176887776387151e-05,
"loss": 0.3633,
"step": 1141
},
{
"epoch": 1.2847495779403488,
"grad_norm": 0.3336179309407727,
"learning_rate": 3.1748018356278684e-05,
"loss": 0.3662,
"step": 1142
},
{
"epoch": 1.285875070343275,
"grad_norm": 0.32668676414933834,
"learning_rate": 3.172715894868586e-05,
"loss": 0.3671,
"step": 1143
},
{
"epoch": 1.2870005627462016,
"grad_norm": 0.31252062188747054,
"learning_rate": 3.170629954109303e-05,
"loss": 0.3647,
"step": 1144
},
{
"epoch": 1.2881260551491278,
"grad_norm": 0.31744497936057164,
"learning_rate": 3.168544013350021e-05,
"loss": 0.3622,
"step": 1145
},
{
"epoch": 1.289251547552054,
"grad_norm": 0.2862050055745393,
"learning_rate": 3.1664580725907385e-05,
"loss": 0.3883,
"step": 1146
},
{
"epoch": 1.2903770399549803,
"grad_norm": 0.30021118499678395,
"learning_rate": 3.164372131831456e-05,
"loss": 0.3579,
"step": 1147
},
{
"epoch": 1.2915025323579066,
"grad_norm": 0.2910467656286127,
"learning_rate": 3.162286191072173e-05,
"loss": 0.3534,
"step": 1148
},
{
"epoch": 1.2926280247608328,
"grad_norm": 0.28678455388133556,
"learning_rate": 3.1602002503128916e-05,
"loss": 0.3497,
"step": 1149
},
{
"epoch": 1.293753517163759,
"grad_norm": 0.27836486011517614,
"learning_rate": 3.1581143095536086e-05,
"loss": 0.3443,
"step": 1150
},
{
"epoch": 1.2948790095666856,
"grad_norm": 0.30812952315806486,
"learning_rate": 3.156028368794326e-05,
"loss": 0.3893,
"step": 1151
},
{
"epoch": 1.2960045019696116,
"grad_norm": 0.2874885069684301,
"learning_rate": 3.153942428035044e-05,
"loss": 0.3685,
"step": 1152
},
{
"epoch": 1.297129994372538,
"grad_norm": 0.3347706854010768,
"learning_rate": 3.151856487275762e-05,
"loss": 0.3927,
"step": 1153
},
{
"epoch": 1.2982554867754643,
"grad_norm": 0.32176469835749927,
"learning_rate": 3.149770546516479e-05,
"loss": 0.38,
"step": 1154
},
{
"epoch": 1.2993809791783906,
"grad_norm": 0.2898256632538439,
"learning_rate": 3.147684605757197e-05,
"loss": 0.3616,
"step": 1155
},
{
"epoch": 1.3005064715813168,
"grad_norm": 0.33352792276895776,
"learning_rate": 3.145598664997914e-05,
"loss": 0.3748,
"step": 1156
},
{
"epoch": 1.301631963984243,
"grad_norm": 0.3099488701941323,
"learning_rate": 3.143512724238632e-05,
"loss": 0.372,
"step": 1157
},
{
"epoch": 1.3027574563871693,
"grad_norm": 0.30363398725151736,
"learning_rate": 3.1414267834793495e-05,
"loss": 0.3701,
"step": 1158
},
{
"epoch": 1.3038829487900956,
"grad_norm": 0.28447681088914367,
"learning_rate": 3.139340842720067e-05,
"loss": 0.3774,
"step": 1159
},
{
"epoch": 1.305008441193022,
"grad_norm": 0.3003448198337203,
"learning_rate": 3.137254901960784e-05,
"loss": 0.3598,
"step": 1160
},
{
"epoch": 1.306133933595948,
"grad_norm": 0.3379084483923677,
"learning_rate": 3.135168961201502e-05,
"loss": 0.3708,
"step": 1161
},
{
"epoch": 1.3072594259988746,
"grad_norm": 0.28091894310377574,
"learning_rate": 3.13308302044222e-05,
"loss": 0.3718,
"step": 1162
},
{
"epoch": 1.3083849184018008,
"grad_norm": 0.33666696805419777,
"learning_rate": 3.1309970796829374e-05,
"loss": 0.3521,
"step": 1163
},
{
"epoch": 1.309510410804727,
"grad_norm": 0.2784271381389026,
"learning_rate": 3.128911138923655e-05,
"loss": 0.365,
"step": 1164
},
{
"epoch": 1.3106359032076533,
"grad_norm": 0.32996125555463496,
"learning_rate": 3.126825198164372e-05,
"loss": 0.3663,
"step": 1165
},
{
"epoch": 1.3117613956105796,
"grad_norm": 0.29339874231665497,
"learning_rate": 3.12473925740509e-05,
"loss": 0.3579,
"step": 1166
},
{
"epoch": 1.3128868880135058,
"grad_norm": 0.27539689603551204,
"learning_rate": 3.1226533166458075e-05,
"loss": 0.3428,
"step": 1167
},
{
"epoch": 1.314012380416432,
"grad_norm": 0.30586051604779685,
"learning_rate": 3.120567375886525e-05,
"loss": 0.3742,
"step": 1168
},
{
"epoch": 1.3151378728193586,
"grad_norm": 0.318506663490862,
"learning_rate": 3.118481435127242e-05,
"loss": 0.3555,
"step": 1169
},
{
"epoch": 1.3162633652222848,
"grad_norm": 0.32193633165736774,
"learning_rate": 3.11639549436796e-05,
"loss": 0.3798,
"step": 1170
},
{
"epoch": 1.317388857625211,
"grad_norm": 0.26006223637970205,
"learning_rate": 3.1143095536086776e-05,
"loss": 0.3688,
"step": 1171
},
{
"epoch": 1.3185143500281373,
"grad_norm": 0.3091644393869938,
"learning_rate": 3.112223612849395e-05,
"loss": 0.344,
"step": 1172
},
{
"epoch": 1.3196398424310636,
"grad_norm": 0.2950119209666744,
"learning_rate": 3.110137672090112e-05,
"loss": 0.3807,
"step": 1173
},
{
"epoch": 1.3207653348339898,
"grad_norm": 0.2975497460062189,
"learning_rate": 3.108051731330831e-05,
"loss": 0.3581,
"step": 1174
},
{
"epoch": 1.321890827236916,
"grad_norm": 0.3248317526622501,
"learning_rate": 3.105965790571548e-05,
"loss": 0.3455,
"step": 1175
},
{
"epoch": 1.3230163196398426,
"grad_norm": 0.28753503994393237,
"learning_rate": 3.1038798498122654e-05,
"loss": 0.3526,
"step": 1176
},
{
"epoch": 1.3241418120427686,
"grad_norm": 0.309321520103074,
"learning_rate": 3.101793909052983e-05,
"loss": 0.3671,
"step": 1177
},
{
"epoch": 1.325267304445695,
"grad_norm": 0.31093843252643993,
"learning_rate": 3.099707968293701e-05,
"loss": 0.3797,
"step": 1178
},
{
"epoch": 1.3263927968486213,
"grad_norm": 0.2941320554481767,
"learning_rate": 3.097622027534418e-05,
"loss": 0.3656,
"step": 1179
},
{
"epoch": 1.3275182892515476,
"grad_norm": 0.33353760439258306,
"learning_rate": 3.095536086775136e-05,
"loss": 0.3659,
"step": 1180
},
{
"epoch": 1.3286437816544738,
"grad_norm": 0.2569769588199655,
"learning_rate": 3.093450146015853e-05,
"loss": 0.3842,
"step": 1181
},
{
"epoch": 1.3297692740574,
"grad_norm": 0.3330169931726158,
"learning_rate": 3.091364205256571e-05,
"loss": 0.3643,
"step": 1182
},
{
"epoch": 1.3308947664603263,
"grad_norm": 0.2876950022849873,
"learning_rate": 3.0892782644972886e-05,
"loss": 0.357,
"step": 1183
},
{
"epoch": 1.3320202588632526,
"grad_norm": 0.31915621662192034,
"learning_rate": 3.087192323738006e-05,
"loss": 0.3817,
"step": 1184
},
{
"epoch": 1.333145751266179,
"grad_norm": 0.31039603557721346,
"learning_rate": 3.085106382978723e-05,
"loss": 0.3705,
"step": 1185
},
{
"epoch": 1.334271243669105,
"grad_norm": 0.35493105743167,
"learning_rate": 3.083020442219441e-05,
"loss": 0.375,
"step": 1186
},
{
"epoch": 1.3353967360720316,
"grad_norm": 0.2887229895411605,
"learning_rate": 3.080934501460159e-05,
"loss": 0.3705,
"step": 1187
},
{
"epoch": 1.3365222284749578,
"grad_norm": 0.33105501034844814,
"learning_rate": 3.078848560700876e-05,
"loss": 0.3517,
"step": 1188
},
{
"epoch": 1.337647720877884,
"grad_norm": 0.296938807012652,
"learning_rate": 3.076762619941594e-05,
"loss": 0.3569,
"step": 1189
},
{
"epoch": 1.3387732132808103,
"grad_norm": 0.2631825890999598,
"learning_rate": 3.074676679182311e-05,
"loss": 0.3719,
"step": 1190
},
{
"epoch": 1.3398987056837366,
"grad_norm": 0.3302060927278823,
"learning_rate": 3.072590738423029e-05,
"loss": 0.357,
"step": 1191
},
{
"epoch": 1.3410241980866628,
"grad_norm": 0.284964127664028,
"learning_rate": 3.0705047976637465e-05,
"loss": 0.3637,
"step": 1192
},
{
"epoch": 1.342149690489589,
"grad_norm": 0.3450486482153124,
"learning_rate": 3.068418856904464e-05,
"loss": 0.3545,
"step": 1193
},
{
"epoch": 1.3432751828925156,
"grad_norm": 0.3097272198664404,
"learning_rate": 3.066332916145181e-05,
"loss": 0.3615,
"step": 1194
},
{
"epoch": 1.3444006752954418,
"grad_norm": 0.31428949130893125,
"learning_rate": 3.0642469753858996e-05,
"loss": 0.3577,
"step": 1195
},
{
"epoch": 1.345526167698368,
"grad_norm": 0.3459630485656923,
"learning_rate": 3.062161034626617e-05,
"loss": 0.38,
"step": 1196
},
{
"epoch": 1.3466516601012943,
"grad_norm": 0.34840227455144135,
"learning_rate": 3.0600750938673344e-05,
"loss": 0.3731,
"step": 1197
},
{
"epoch": 1.3477771525042206,
"grad_norm": 0.36198967619880806,
"learning_rate": 3.057989153108052e-05,
"loss": 0.3637,
"step": 1198
},
{
"epoch": 1.3489026449071468,
"grad_norm": 0.36980762481338214,
"learning_rate": 3.05590321234877e-05,
"loss": 0.3545,
"step": 1199
},
{
"epoch": 1.350028137310073,
"grad_norm": 0.34738553940185973,
"learning_rate": 3.053817271589487e-05,
"loss": 0.358,
"step": 1200
},
{
"epoch": 1.3511536297129996,
"grad_norm": 0.28899221690961746,
"learning_rate": 3.0517313308302048e-05,
"loss": 0.3692,
"step": 1201
},
{
"epoch": 1.3522791221159256,
"grad_norm": 0.348414484092682,
"learning_rate": 3.0496453900709222e-05,
"loss": 0.3562,
"step": 1202
},
{
"epoch": 1.353404614518852,
"grad_norm": 0.28821763744716605,
"learning_rate": 3.04755944931164e-05,
"loss": 0.3623,
"step": 1203
},
{
"epoch": 1.3545301069217783,
"grad_norm": 0.34701713808150375,
"learning_rate": 3.0454735085523572e-05,
"loss": 0.3667,
"step": 1204
},
{
"epoch": 1.3556555993247046,
"grad_norm": 0.34731368442368854,
"learning_rate": 3.0433875677930746e-05,
"loss": 0.3767,
"step": 1205
},
{
"epoch": 1.3567810917276308,
"grad_norm": 0.2873367344993991,
"learning_rate": 3.0413016270337923e-05,
"loss": 0.3594,
"step": 1206
},
{
"epoch": 1.357906584130557,
"grad_norm": 0.33455871764963324,
"learning_rate": 3.0392156862745097e-05,
"loss": 0.3649,
"step": 1207
},
{
"epoch": 1.3590320765334833,
"grad_norm": 0.3444401979121362,
"learning_rate": 3.0371297455152277e-05,
"loss": 0.3656,
"step": 1208
},
{
"epoch": 1.3601575689364096,
"grad_norm": 0.2899964778052406,
"learning_rate": 3.0350438047559447e-05,
"loss": 0.3457,
"step": 1209
},
{
"epoch": 1.361283061339336,
"grad_norm": 0.33907151317470086,
"learning_rate": 3.0329578639966627e-05,
"loss": 0.3627,
"step": 1210
},
{
"epoch": 1.362408553742262,
"grad_norm": 0.3044719983267248,
"learning_rate": 3.03087192323738e-05,
"loss": 0.3654,
"step": 1211
},
{
"epoch": 1.3635340461451886,
"grad_norm": 0.3161562860694256,
"learning_rate": 3.0287859824780978e-05,
"loss": 0.342,
"step": 1212
},
{
"epoch": 1.3646595385481148,
"grad_norm": 0.3068022604762919,
"learning_rate": 3.026700041718815e-05,
"loss": 0.3539,
"step": 1213
},
{
"epoch": 1.365785030951041,
"grad_norm": 0.2850677267934718,
"learning_rate": 3.024614100959533e-05,
"loss": 0.3725,
"step": 1214
},
{
"epoch": 1.3669105233539673,
"grad_norm": 0.3160554970609396,
"learning_rate": 3.0225281602002502e-05,
"loss": 0.3868,
"step": 1215
},
{
"epoch": 1.3680360157568936,
"grad_norm": 0.3373572652606873,
"learning_rate": 3.0204422194409683e-05,
"loss": 0.3711,
"step": 1216
},
{
"epoch": 1.3691615081598199,
"grad_norm": 0.27352074329674897,
"learning_rate": 3.0183562786816856e-05,
"loss": 0.3593,
"step": 1217
},
{
"epoch": 1.370287000562746,
"grad_norm": 0.32866686300985715,
"learning_rate": 3.0162703379224033e-05,
"loss": 0.3547,
"step": 1218
},
{
"epoch": 1.3714124929656726,
"grad_norm": 0.28785334578687116,
"learning_rate": 3.0141843971631207e-05,
"loss": 0.3703,
"step": 1219
},
{
"epoch": 1.3725379853685988,
"grad_norm": 0.3336610653280944,
"learning_rate": 3.0120984564038384e-05,
"loss": 0.3594,
"step": 1220
},
{
"epoch": 1.373663477771525,
"grad_norm": 0.3252899566864213,
"learning_rate": 3.0100125156445557e-05,
"loss": 0.3666,
"step": 1221
},
{
"epoch": 1.3747889701744513,
"grad_norm": 0.34272700759634595,
"learning_rate": 3.0079265748852738e-05,
"loss": 0.3673,
"step": 1222
},
{
"epoch": 1.3759144625773776,
"grad_norm": 0.2839456991740852,
"learning_rate": 3.0058406341259908e-05,
"loss": 0.3731,
"step": 1223
},
{
"epoch": 1.3770399549803038,
"grad_norm": 0.34144486456169987,
"learning_rate": 3.0037546933667088e-05,
"loss": 0.3535,
"step": 1224
},
{
"epoch": 1.37816544738323,
"grad_norm": 0.3510452096605386,
"learning_rate": 3.0016687526074262e-05,
"loss": 0.3783,
"step": 1225
},
{
"epoch": 1.3792909397861566,
"grad_norm": 0.3219709429564443,
"learning_rate": 2.9995828118481435e-05,
"loss": 0.3653,
"step": 1226
},
{
"epoch": 1.3804164321890826,
"grad_norm": 0.2922536824083754,
"learning_rate": 2.9974968710888612e-05,
"loss": 0.3487,
"step": 1227
},
{
"epoch": 1.381541924592009,
"grad_norm": 0.3146465080311366,
"learning_rate": 2.9954109303295786e-05,
"loss": 0.3582,
"step": 1228
},
{
"epoch": 1.3826674169949353,
"grad_norm": 0.3202141542926466,
"learning_rate": 2.9933249895702963e-05,
"loss": 0.3767,
"step": 1229
},
{
"epoch": 1.3837929093978616,
"grad_norm": 0.3126664207698992,
"learning_rate": 2.9912390488110137e-05,
"loss": 0.3861,
"step": 1230
},
{
"epoch": 1.3849184018007878,
"grad_norm": 0.3006754510665695,
"learning_rate": 2.9891531080517317e-05,
"loss": 0.3826,
"step": 1231
},
{
"epoch": 1.386043894203714,
"grad_norm": 0.34183231562741717,
"learning_rate": 2.9870671672924487e-05,
"loss": 0.3384,
"step": 1232
},
{
"epoch": 1.3871693866066404,
"grad_norm": 0.2981637621431096,
"learning_rate": 2.9849812265331668e-05,
"loss": 0.3644,
"step": 1233
},
{
"epoch": 1.3882948790095666,
"grad_norm": 0.32927113911951866,
"learning_rate": 2.982895285773884e-05,
"loss": 0.37,
"step": 1234
},
{
"epoch": 1.389420371412493,
"grad_norm": 0.3516964621170918,
"learning_rate": 2.9808093450146018e-05,
"loss": 0.3734,
"step": 1235
},
{
"epoch": 1.3905458638154191,
"grad_norm": 0.28294383540669815,
"learning_rate": 2.9787234042553192e-05,
"loss": 0.3545,
"step": 1236
},
{
"epoch": 1.3916713562183456,
"grad_norm": 0.36437808290704293,
"learning_rate": 2.976637463496037e-05,
"loss": 0.3626,
"step": 1237
},
{
"epoch": 1.3927968486212718,
"grad_norm": 0.31704230935830585,
"learning_rate": 2.9745515227367542e-05,
"loss": 0.3536,
"step": 1238
},
{
"epoch": 1.393922341024198,
"grad_norm": 0.3234586646771036,
"learning_rate": 2.9724655819774723e-05,
"loss": 0.3373,
"step": 1239
},
{
"epoch": 1.3950478334271244,
"grad_norm": 0.3108185575862165,
"learning_rate": 2.9703796412181893e-05,
"loss": 0.3685,
"step": 1240
},
{
"epoch": 1.3961733258300506,
"grad_norm": 0.3752154178147501,
"learning_rate": 2.9682937004589073e-05,
"loss": 0.3676,
"step": 1241
},
{
"epoch": 1.3972988182329769,
"grad_norm": 0.2884249248162915,
"learning_rate": 2.9662077596996247e-05,
"loss": 0.3806,
"step": 1242
},
{
"epoch": 1.3984243106359031,
"grad_norm": 0.34853899665658195,
"learning_rate": 2.9641218189403424e-05,
"loss": 0.35,
"step": 1243
},
{
"epoch": 1.3995498030388296,
"grad_norm": 0.38250560263603123,
"learning_rate": 2.9620358781810597e-05,
"loss": 0.3671,
"step": 1244
},
{
"epoch": 1.4006752954417558,
"grad_norm": 0.31599487532627424,
"learning_rate": 2.9599499374217778e-05,
"loss": 0.3833,
"step": 1245
},
{
"epoch": 1.401800787844682,
"grad_norm": 0.3500438918178945,
"learning_rate": 2.9578639966624948e-05,
"loss": 0.365,
"step": 1246
},
{
"epoch": 1.4029262802476083,
"grad_norm": 0.41585464581458353,
"learning_rate": 2.955778055903212e-05,
"loss": 0.3918,
"step": 1247
},
{
"epoch": 1.4040517726505346,
"grad_norm": 0.314777082319376,
"learning_rate": 2.9536921151439302e-05,
"loss": 0.3657,
"step": 1248
},
{
"epoch": 1.4051772650534609,
"grad_norm": 0.3941954750192581,
"learning_rate": 2.9516061743846472e-05,
"loss": 0.3472,
"step": 1249
},
{
"epoch": 1.406302757456387,
"grad_norm": 0.40917669615827007,
"learning_rate": 2.9495202336253653e-05,
"loss": 0.3646,
"step": 1250
},
{
"epoch": 1.4074282498593136,
"grad_norm": 0.32821878108438296,
"learning_rate": 2.9474342928660826e-05,
"loss": 0.3636,
"step": 1251
},
{
"epoch": 1.4085537422622396,
"grad_norm": 0.4247196049076011,
"learning_rate": 2.9453483521068003e-05,
"loss": 0.3412,
"step": 1252
},
{
"epoch": 1.409679234665166,
"grad_norm": 0.3851171427422802,
"learning_rate": 2.9432624113475177e-05,
"loss": 0.3819,
"step": 1253
},
{
"epoch": 1.4108047270680923,
"grad_norm": 0.3469070969772743,
"learning_rate": 2.9411764705882354e-05,
"loss": 0.3616,
"step": 1254
},
{
"epoch": 1.4119302194710186,
"grad_norm": 0.444050908424801,
"learning_rate": 2.9390905298289527e-05,
"loss": 0.3565,
"step": 1255
},
{
"epoch": 1.4130557118739449,
"grad_norm": 0.2879248830093118,
"learning_rate": 2.9370045890696708e-05,
"loss": 0.3601,
"step": 1256
},
{
"epoch": 1.414181204276871,
"grad_norm": 0.4483867224495224,
"learning_rate": 2.934918648310388e-05,
"loss": 0.3682,
"step": 1257
},
{
"epoch": 1.4153066966797974,
"grad_norm": 0.3000179145995747,
"learning_rate": 2.9328327075511058e-05,
"loss": 0.3636,
"step": 1258
},
{
"epoch": 1.4164321890827236,
"grad_norm": 0.38516280700639255,
"learning_rate": 2.9307467667918232e-05,
"loss": 0.3641,
"step": 1259
},
{
"epoch": 1.41755768148565,
"grad_norm": 0.3233722496864056,
"learning_rate": 2.928660826032541e-05,
"loss": 0.353,
"step": 1260
},
{
"epoch": 1.4186831738885761,
"grad_norm": 0.35206171266037123,
"learning_rate": 2.9265748852732582e-05,
"loss": 0.3715,
"step": 1261
},
{
"epoch": 1.4198086662915026,
"grad_norm": 0.3416243699378654,
"learning_rate": 2.9244889445139763e-05,
"loss": 0.374,
"step": 1262
},
{
"epoch": 1.4209341586944289,
"grad_norm": 0.354667895796559,
"learning_rate": 2.9224030037546933e-05,
"loss": 0.3587,
"step": 1263
},
{
"epoch": 1.422059651097355,
"grad_norm": 0.35090199403773553,
"learning_rate": 2.9203170629954113e-05,
"loss": 0.3679,
"step": 1264
},
{
"epoch": 1.4231851435002814,
"grad_norm": 0.37870594240912114,
"learning_rate": 2.9182311222361287e-05,
"loss": 0.3497,
"step": 1265
},
{
"epoch": 1.4243106359032076,
"grad_norm": 0.3281629547838438,
"learning_rate": 2.9161451814768464e-05,
"loss": 0.3457,
"step": 1266
},
{
"epoch": 1.4254361283061339,
"grad_norm": 0.35899900559464903,
"learning_rate": 2.9140592407175638e-05,
"loss": 0.3627,
"step": 1267
},
{
"epoch": 1.4265616207090601,
"grad_norm": 0.30995421196825085,
"learning_rate": 2.911973299958281e-05,
"loss": 0.3939,
"step": 1268
},
{
"epoch": 1.4276871131119866,
"grad_norm": 0.3707647704012859,
"learning_rate": 2.9098873591989988e-05,
"loss": 0.3683,
"step": 1269
},
{
"epoch": 1.4288126055149128,
"grad_norm": 0.3493957956050667,
"learning_rate": 2.9078014184397162e-05,
"loss": 0.3922,
"step": 1270
},
{
"epoch": 1.429938097917839,
"grad_norm": 0.3392405036440076,
"learning_rate": 2.9057154776804342e-05,
"loss": 0.3675,
"step": 1271
},
{
"epoch": 1.4310635903207654,
"grad_norm": 0.42908164729503306,
"learning_rate": 2.9036295369211512e-05,
"loss": 0.3497,
"step": 1272
},
{
"epoch": 1.4321890827236916,
"grad_norm": 0.3217604733902604,
"learning_rate": 2.9015435961618693e-05,
"loss": 0.3419,
"step": 1273
},
{
"epoch": 1.4333145751266179,
"grad_norm": 0.285513498828739,
"learning_rate": 2.8994576554025866e-05,
"loss": 0.3525,
"step": 1274
},
{
"epoch": 1.4344400675295441,
"grad_norm": 0.31722207811402275,
"learning_rate": 2.8973717146433043e-05,
"loss": 0.3537,
"step": 1275
},
{
"epoch": 1.4355655599324706,
"grad_norm": 0.3425240574013163,
"learning_rate": 2.8952857738840217e-05,
"loss": 0.3536,
"step": 1276
},
{
"epoch": 1.4366910523353966,
"grad_norm": 0.37106807604623343,
"learning_rate": 2.8931998331247394e-05,
"loss": 0.3495,
"step": 1277
},
{
"epoch": 1.437816544738323,
"grad_norm": 0.28250215963205166,
"learning_rate": 2.8911138923654567e-05,
"loss": 0.3481,
"step": 1278
},
{
"epoch": 1.4389420371412494,
"grad_norm": 0.3549294685688371,
"learning_rate": 2.8890279516061748e-05,
"loss": 0.376,
"step": 1279
},
{
"epoch": 1.4400675295441756,
"grad_norm": 0.30171989887856726,
"learning_rate": 2.886942010846892e-05,
"loss": 0.3755,
"step": 1280
},
{
"epoch": 1.4411930219471019,
"grad_norm": 0.3139221398315422,
"learning_rate": 2.88485607008761e-05,
"loss": 0.3513,
"step": 1281
},
{
"epoch": 1.4423185143500281,
"grad_norm": 0.3174768026324268,
"learning_rate": 2.8827701293283272e-05,
"loss": 0.3569,
"step": 1282
},
{
"epoch": 1.4434440067529544,
"grad_norm": 0.31543813617403643,
"learning_rate": 2.880684188569045e-05,
"loss": 0.3551,
"step": 1283
},
{
"epoch": 1.4445694991558806,
"grad_norm": 0.34405513448928665,
"learning_rate": 2.8785982478097623e-05,
"loss": 0.3615,
"step": 1284
},
{
"epoch": 1.445694991558807,
"grad_norm": 0.3314207471763474,
"learning_rate": 2.8765123070504803e-05,
"loss": 0.364,
"step": 1285
},
{
"epoch": 1.4468204839617331,
"grad_norm": 0.30011672219405916,
"learning_rate": 2.8744263662911973e-05,
"loss": 0.3541,
"step": 1286
},
{
"epoch": 1.4479459763646596,
"grad_norm": 0.36939176440073757,
"learning_rate": 2.8723404255319154e-05,
"loss": 0.3466,
"step": 1287
},
{
"epoch": 1.4490714687675859,
"grad_norm": 0.38877177781745204,
"learning_rate": 2.8702544847726327e-05,
"loss": 0.3786,
"step": 1288
},
{
"epoch": 1.4501969611705121,
"grad_norm": 0.3409728207807626,
"learning_rate": 2.8681685440133497e-05,
"loss": 0.3791,
"step": 1289
},
{
"epoch": 1.4513224535734384,
"grad_norm": 0.42817414825924877,
"learning_rate": 2.8660826032540678e-05,
"loss": 0.3491,
"step": 1290
},
{
"epoch": 1.4524479459763646,
"grad_norm": 0.35198856950809654,
"learning_rate": 2.863996662494785e-05,
"loss": 0.3679,
"step": 1291
},
{
"epoch": 1.4535734383792909,
"grad_norm": 0.32113222683338927,
"learning_rate": 2.861910721735503e-05,
"loss": 0.3467,
"step": 1292
},
{
"epoch": 1.4546989307822171,
"grad_norm": 0.321401722942131,
"learning_rate": 2.8598247809762202e-05,
"loss": 0.3565,
"step": 1293
},
{
"epoch": 1.4558244231851436,
"grad_norm": 0.3506970283170861,
"learning_rate": 2.857738840216938e-05,
"loss": 0.3691,
"step": 1294
},
{
"epoch": 1.4569499155880699,
"grad_norm": 0.3366249220650027,
"learning_rate": 2.8556528994576552e-05,
"loss": 0.3679,
"step": 1295
},
{
"epoch": 1.458075407990996,
"grad_norm": 0.3863825955582127,
"learning_rate": 2.8535669586983733e-05,
"loss": 0.3656,
"step": 1296
},
{
"epoch": 1.4592009003939224,
"grad_norm": 0.3294265653501134,
"learning_rate": 2.8514810179390906e-05,
"loss": 0.363,
"step": 1297
},
{
"epoch": 1.4603263927968486,
"grad_norm": 0.3400753421715317,
"learning_rate": 2.8493950771798083e-05,
"loss": 0.3677,
"step": 1298
},
{
"epoch": 1.4614518851997749,
"grad_norm": 0.34166651813969356,
"learning_rate": 2.8473091364205257e-05,
"loss": 0.3975,
"step": 1299
},
{
"epoch": 1.4625773776027011,
"grad_norm": 0.32157145849641844,
"learning_rate": 2.8452231956612434e-05,
"loss": 0.3599,
"step": 1300
},
{
"epoch": 1.4637028700056276,
"grad_norm": 0.29318800149996266,
"learning_rate": 2.8431372549019608e-05,
"loss": 0.3639,
"step": 1301
},
{
"epoch": 1.4648283624085536,
"grad_norm": 0.3275747590035082,
"learning_rate": 2.8410513141426788e-05,
"loss": 0.3405,
"step": 1302
},
{
"epoch": 1.46595385481148,
"grad_norm": 0.2669444641392893,
"learning_rate": 2.8389653733833958e-05,
"loss": 0.3556,
"step": 1303
},
{
"epoch": 1.4670793472144064,
"grad_norm": 0.2984242508636962,
"learning_rate": 2.836879432624114e-05,
"loss": 0.3475,
"step": 1304
},
{
"epoch": 1.4682048396173326,
"grad_norm": 0.34309575853919444,
"learning_rate": 2.8347934918648312e-05,
"loss": 0.3236,
"step": 1305
},
{
"epoch": 1.4693303320202589,
"grad_norm": 0.3386670126038393,
"learning_rate": 2.832707551105549e-05,
"loss": 0.3692,
"step": 1306
},
{
"epoch": 1.4704558244231851,
"grad_norm": 0.3041272096720939,
"learning_rate": 2.8306216103462663e-05,
"loss": 0.3672,
"step": 1307
},
{
"epoch": 1.4715813168261114,
"grad_norm": 0.4280504900617411,
"learning_rate": 2.828535669586984e-05,
"loss": 0.3532,
"step": 1308
},
{
"epoch": 1.4727068092290376,
"grad_norm": 0.28299915352373894,
"learning_rate": 2.8264497288277013e-05,
"loss": 0.3389,
"step": 1309
},
{
"epoch": 1.473832301631964,
"grad_norm": 0.33312026594342037,
"learning_rate": 2.8243637880684187e-05,
"loss": 0.3711,
"step": 1310
},
{
"epoch": 1.4749577940348901,
"grad_norm": 0.3324677079496402,
"learning_rate": 2.8222778473091367e-05,
"loss": 0.3637,
"step": 1311
},
{
"epoch": 1.4760832864378166,
"grad_norm": 0.3180122895020907,
"learning_rate": 2.8201919065498537e-05,
"loss": 0.3681,
"step": 1312
},
{
"epoch": 1.4772087788407429,
"grad_norm": 0.3454913512736821,
"learning_rate": 2.8181059657905718e-05,
"loss": 0.3788,
"step": 1313
},
{
"epoch": 1.4783342712436691,
"grad_norm": 0.2954651043434191,
"learning_rate": 2.816020025031289e-05,
"loss": 0.3629,
"step": 1314
},
{
"epoch": 1.4794597636465954,
"grad_norm": 0.31225437993089217,
"learning_rate": 2.813934084272007e-05,
"loss": 0.3483,
"step": 1315
},
{
"epoch": 1.4805852560495216,
"grad_norm": 0.2931420257748457,
"learning_rate": 2.8118481435127242e-05,
"loss": 0.3443,
"step": 1316
},
{
"epoch": 1.4817107484524479,
"grad_norm": 0.3077463608704642,
"learning_rate": 2.809762202753442e-05,
"loss": 0.3562,
"step": 1317
},
{
"epoch": 1.4828362408553741,
"grad_norm": 0.2868052518006215,
"learning_rate": 2.8076762619941593e-05,
"loss": 0.3532,
"step": 1318
},
{
"epoch": 1.4839617332583006,
"grad_norm": 0.28223423866564457,
"learning_rate": 2.8055903212348773e-05,
"loss": 0.3334,
"step": 1319
},
{
"epoch": 1.4850872256612269,
"grad_norm": 0.2934968437108151,
"learning_rate": 2.8035043804755947e-05,
"loss": 0.3609,
"step": 1320
},
{
"epoch": 1.4862127180641531,
"grad_norm": 0.3726867856164999,
"learning_rate": 2.8014184397163124e-05,
"loss": 0.3658,
"step": 1321
},
{
"epoch": 1.4873382104670794,
"grad_norm": 0.31940065928357514,
"learning_rate": 2.7993324989570297e-05,
"loss": 0.3752,
"step": 1322
},
{
"epoch": 1.4884637028700056,
"grad_norm": 0.343528935258811,
"learning_rate": 2.7972465581977474e-05,
"loss": 0.3689,
"step": 1323
},
{
"epoch": 1.4895891952729319,
"grad_norm": 0.29324201562634045,
"learning_rate": 2.7951606174384648e-05,
"loss": 0.3701,
"step": 1324
},
{
"epoch": 1.4907146876758581,
"grad_norm": 0.307447149562183,
"learning_rate": 2.7930746766791828e-05,
"loss": 0.3623,
"step": 1325
},
{
"epoch": 1.4918401800787846,
"grad_norm": 0.3370769636245937,
"learning_rate": 2.7909887359199e-05,
"loss": 0.3599,
"step": 1326
},
{
"epoch": 1.4929656724817106,
"grad_norm": 0.2871673492029565,
"learning_rate": 2.788902795160618e-05,
"loss": 0.3719,
"step": 1327
},
{
"epoch": 1.4940911648846371,
"grad_norm": 0.36895913560450455,
"learning_rate": 2.7868168544013352e-05,
"loss": 0.3678,
"step": 1328
},
{
"epoch": 1.4952166572875634,
"grad_norm": 0.30425325809005394,
"learning_rate": 2.784730913642053e-05,
"loss": 0.3654,
"step": 1329
},
{
"epoch": 1.4963421496904896,
"grad_norm": 0.3331261517980334,
"learning_rate": 2.7826449728827703e-05,
"loss": 0.3693,
"step": 1330
},
{
"epoch": 1.4974676420934159,
"grad_norm": 0.2798679646502201,
"learning_rate": 2.7805590321234876e-05,
"loss": 0.3709,
"step": 1331
},
{
"epoch": 1.4985931344963421,
"grad_norm": 0.37466709997642333,
"learning_rate": 2.7784730913642053e-05,
"loss": 0.363,
"step": 1332
},
{
"epoch": 1.4997186268992684,
"grad_norm": 0.35357276036020097,
"learning_rate": 2.7763871506049227e-05,
"loss": 0.3683,
"step": 1333
},
{
"epoch": 1.5008441193021946,
"grad_norm": 0.3354334941577856,
"learning_rate": 2.7743012098456404e-05,
"loss": 0.3438,
"step": 1334
},
{
"epoch": 1.501969611705121,
"grad_norm": 0.4101041564365979,
"learning_rate": 2.7722152690863578e-05,
"loss": 0.3835,
"step": 1335
},
{
"epoch": 1.5030951041080471,
"grad_norm": 0.36255802157025624,
"learning_rate": 2.7701293283270758e-05,
"loss": 0.3477,
"step": 1336
},
{
"epoch": 1.5042205965109736,
"grad_norm": 0.4061869558301693,
"learning_rate": 2.768043387567793e-05,
"loss": 0.3688,
"step": 1337
},
{
"epoch": 1.5053460889138999,
"grad_norm": 0.41388849066334216,
"learning_rate": 2.765957446808511e-05,
"loss": 0.3672,
"step": 1338
},
{
"epoch": 1.5064715813168261,
"grad_norm": 0.330932817498522,
"learning_rate": 2.7638715060492282e-05,
"loss": 0.3643,
"step": 1339
},
{
"epoch": 1.5075970737197524,
"grad_norm": 0.3399783880240679,
"learning_rate": 2.761785565289946e-05,
"loss": 0.3529,
"step": 1340
},
{
"epoch": 1.5087225661226786,
"grad_norm": 0.28341696113530734,
"learning_rate": 2.7596996245306633e-05,
"loss": 0.3496,
"step": 1341
},
{
"epoch": 1.509848058525605,
"grad_norm": 0.334967765352759,
"learning_rate": 2.7576136837713813e-05,
"loss": 0.3415,
"step": 1342
},
{
"epoch": 1.5109735509285311,
"grad_norm": 0.2668604468971969,
"learning_rate": 2.7555277430120983e-05,
"loss": 0.3634,
"step": 1343
},
{
"epoch": 1.5120990433314576,
"grad_norm": 0.31156721053710673,
"learning_rate": 2.7534418022528164e-05,
"loss": 0.3641,
"step": 1344
},
{
"epoch": 1.5132245357343836,
"grad_norm": 0.29754957914675184,
"learning_rate": 2.7513558614935337e-05,
"loss": 0.3785,
"step": 1345
},
{
"epoch": 1.5143500281373101,
"grad_norm": 0.2872566093068787,
"learning_rate": 2.7492699207342514e-05,
"loss": 0.3623,
"step": 1346
},
{
"epoch": 1.5154755205402364,
"grad_norm": 0.3526852777204813,
"learning_rate": 2.7471839799749688e-05,
"loss": 0.3723,
"step": 1347
},
{
"epoch": 1.5166010129431626,
"grad_norm": 0.31241125025784733,
"learning_rate": 2.7450980392156865e-05,
"loss": 0.3507,
"step": 1348
},
{
"epoch": 1.5177265053460889,
"grad_norm": 0.3508625079587985,
"learning_rate": 2.743012098456404e-05,
"loss": 0.3608,
"step": 1349
},
{
"epoch": 1.5188519977490151,
"grad_norm": 0.32157619105794166,
"learning_rate": 2.740926157697122e-05,
"loss": 0.3581,
"step": 1350
},
{
"epoch": 1.5199774901519416,
"grad_norm": 0.3494380418250329,
"learning_rate": 2.7388402169378392e-05,
"loss": 0.3609,
"step": 1351
},
{
"epoch": 1.5211029825548676,
"grad_norm": 0.3055065567786005,
"learning_rate": 2.7367542761785563e-05,
"loss": 0.3672,
"step": 1352
},
{
"epoch": 1.5222284749577941,
"grad_norm": 0.3950982220672214,
"learning_rate": 2.7346683354192743e-05,
"loss": 0.376,
"step": 1353
},
{
"epoch": 1.5233539673607202,
"grad_norm": 0.27852848240062467,
"learning_rate": 2.7325823946599917e-05,
"loss": 0.3485,
"step": 1354
},
{
"epoch": 1.5244794597636466,
"grad_norm": 0.3737867565000807,
"learning_rate": 2.7304964539007094e-05,
"loss": 0.3664,
"step": 1355
},
{
"epoch": 1.5256049521665729,
"grad_norm": 0.3119606266731146,
"learning_rate": 2.7284105131414267e-05,
"loss": 0.3619,
"step": 1356
},
{
"epoch": 1.5267304445694991,
"grad_norm": 0.33933519597699924,
"learning_rate": 2.7263245723821444e-05,
"loss": 0.4008,
"step": 1357
},
{
"epoch": 1.5278559369724254,
"grad_norm": 0.3275255812573412,
"learning_rate": 2.7242386316228618e-05,
"loss": 0.3702,
"step": 1358
},
{
"epoch": 1.5289814293753516,
"grad_norm": 0.3747569415524062,
"learning_rate": 2.7221526908635798e-05,
"loss": 0.3553,
"step": 1359
},
{
"epoch": 1.5301069217782781,
"grad_norm": 0.30992658499062065,
"learning_rate": 2.7200667501042972e-05,
"loss": 0.3556,
"step": 1360
},
{
"epoch": 1.5312324141812041,
"grad_norm": 0.36837860346575607,
"learning_rate": 2.717980809345015e-05,
"loss": 0.3536,
"step": 1361
},
{
"epoch": 1.5323579065841306,
"grad_norm": 0.3422637051978812,
"learning_rate": 2.7158948685857322e-05,
"loss": 0.3676,
"step": 1362
},
{
"epoch": 1.5334833989870569,
"grad_norm": 0.2882475832928599,
"learning_rate": 2.71380892782645e-05,
"loss": 0.3714,
"step": 1363
},
{
"epoch": 1.5346088913899831,
"grad_norm": 0.4680385431354928,
"learning_rate": 2.7117229870671673e-05,
"loss": 0.3728,
"step": 1364
},
{
"epoch": 1.5357343837929094,
"grad_norm": 0.28687340173500175,
"learning_rate": 2.7096370463078853e-05,
"loss": 0.3677,
"step": 1365
},
{
"epoch": 1.5368598761958356,
"grad_norm": 0.3168437934125687,
"learning_rate": 2.7075511055486023e-05,
"loss": 0.3516,
"step": 1366
},
{
"epoch": 1.5379853685987621,
"grad_norm": 0.34254703558592353,
"learning_rate": 2.7054651647893204e-05,
"loss": 0.3733,
"step": 1367
},
{
"epoch": 1.5391108610016881,
"grad_norm": 0.32210383347863225,
"learning_rate": 2.7033792240300377e-05,
"loss": 0.3657,
"step": 1368
},
{
"epoch": 1.5402363534046146,
"grad_norm": 0.2951642244458056,
"learning_rate": 2.7012932832707554e-05,
"loss": 0.3624,
"step": 1369
},
{
"epoch": 1.5413618458075407,
"grad_norm": 0.32973184204270484,
"learning_rate": 2.6992073425114728e-05,
"loss": 0.3466,
"step": 1370
},
{
"epoch": 1.5424873382104671,
"grad_norm": 0.32937201569972335,
"learning_rate": 2.6971214017521905e-05,
"loss": 0.3609,
"step": 1371
},
{
"epoch": 1.5436128306133934,
"grad_norm": 0.294240889891016,
"learning_rate": 2.695035460992908e-05,
"loss": 0.3528,
"step": 1372
},
{
"epoch": 1.5447383230163196,
"grad_norm": 0.38730632898384704,
"learning_rate": 2.6929495202336252e-05,
"loss": 0.3592,
"step": 1373
},
{
"epoch": 1.545863815419246,
"grad_norm": 0.265405748658469,
"learning_rate": 2.6908635794743433e-05,
"loss": 0.3523,
"step": 1374
},
{
"epoch": 1.5469893078221721,
"grad_norm": 0.3090293159321234,
"learning_rate": 2.6887776387150603e-05,
"loss": 0.373,
"step": 1375
},
{
"epoch": 1.5481148002250986,
"grad_norm": 0.33125373511524786,
"learning_rate": 2.6866916979557783e-05,
"loss": 0.3376,
"step": 1376
},
{
"epoch": 1.5492402926280247,
"grad_norm": 0.3859675477375762,
"learning_rate": 2.6846057571964957e-05,
"loss": 0.3595,
"step": 1377
},
{
"epoch": 1.5503657850309511,
"grad_norm": 0.2702204865287381,
"learning_rate": 2.6825198164372134e-05,
"loss": 0.3526,
"step": 1378
},
{
"epoch": 1.5514912774338772,
"grad_norm": 0.4216493180934553,
"learning_rate": 2.6804338756779307e-05,
"loss": 0.3634,
"step": 1379
},
{
"epoch": 1.5526167698368036,
"grad_norm": 0.3402054598291514,
"learning_rate": 2.6783479349186484e-05,
"loss": 0.3814,
"step": 1380
},
{
"epoch": 1.5537422622397299,
"grad_norm": 0.3634322127130347,
"learning_rate": 2.6762619941593658e-05,
"loss": 0.3739,
"step": 1381
},
{
"epoch": 1.5548677546426561,
"grad_norm": 0.4033902015465824,
"learning_rate": 2.6741760534000838e-05,
"loss": 0.3731,
"step": 1382
},
{
"epoch": 1.5559932470455824,
"grad_norm": 0.40808104649969373,
"learning_rate": 2.672090112640801e-05,
"loss": 0.3522,
"step": 1383
},
{
"epoch": 1.5571187394485086,
"grad_norm": 0.39575554889808146,
"learning_rate": 2.670004171881519e-05,
"loss": 0.3466,
"step": 1384
},
{
"epoch": 1.5582442318514351,
"grad_norm": 0.33725542522344404,
"learning_rate": 2.6679182311222362e-05,
"loss": 0.3594,
"step": 1385
},
{
"epoch": 1.5593697242543612,
"grad_norm": 0.3562002474404248,
"learning_rate": 2.665832290362954e-05,
"loss": 0.3766,
"step": 1386
},
{
"epoch": 1.5604952166572876,
"grad_norm": 0.2792679981992388,
"learning_rate": 2.6637463496036713e-05,
"loss": 0.354,
"step": 1387
},
{
"epoch": 1.5616207090602139,
"grad_norm": 0.3631975807941906,
"learning_rate": 2.661660408844389e-05,
"loss": 0.3628,
"step": 1388
},
{
"epoch": 1.5627462014631401,
"grad_norm": 0.2922697632757867,
"learning_rate": 2.6595744680851064e-05,
"loss": 0.3786,
"step": 1389
},
{
"epoch": 1.5638716938660664,
"grad_norm": 0.3930094259783832,
"learning_rate": 2.6574885273258244e-05,
"loss": 0.349,
"step": 1390
},
{
"epoch": 1.5649971862689926,
"grad_norm": 0.2753952015092564,
"learning_rate": 2.6554025865665418e-05,
"loss": 0.3606,
"step": 1391
},
{
"epoch": 1.5661226786719191,
"grad_norm": 0.323233762383296,
"learning_rate": 2.6533166458072595e-05,
"loss": 0.3584,
"step": 1392
},
{
"epoch": 1.5672481710748452,
"grad_norm": 0.3065899573190829,
"learning_rate": 2.6512307050479768e-05,
"loss": 0.3569,
"step": 1393
},
{
"epoch": 1.5683736634777716,
"grad_norm": 0.29359629957776534,
"learning_rate": 2.6491447642886942e-05,
"loss": 0.3721,
"step": 1394
},
{
"epoch": 1.5694991558806977,
"grad_norm": 0.3453639950913077,
"learning_rate": 2.647058823529412e-05,
"loss": 0.3674,
"step": 1395
},
{
"epoch": 1.5706246482836241,
"grad_norm": 0.29618728974968406,
"learning_rate": 2.6449728827701292e-05,
"loss": 0.3728,
"step": 1396
},
{
"epoch": 1.5717501406865504,
"grad_norm": 0.4022340400841394,
"learning_rate": 2.642886942010847e-05,
"loss": 0.3599,
"step": 1397
},
{
"epoch": 1.5728756330894766,
"grad_norm": 0.34040909178052503,
"learning_rate": 2.6408010012515643e-05,
"loss": 0.3452,
"step": 1398
},
{
"epoch": 1.574001125492403,
"grad_norm": 0.39633565400793064,
"learning_rate": 2.6387150604922823e-05,
"loss": 0.3638,
"step": 1399
},
{
"epoch": 1.5751266178953292,
"grad_norm": 0.3469815814003443,
"learning_rate": 2.6366291197329997e-05,
"loss": 0.3617,
"step": 1400
},
{
"epoch": 1.5762521102982556,
"grad_norm": 0.3858237301262129,
"learning_rate": 2.6345431789737174e-05,
"loss": 0.3592,
"step": 1401
},
{
"epoch": 1.5773776027011817,
"grad_norm": 0.36968305499637627,
"learning_rate": 2.6324572382144347e-05,
"loss": 0.3506,
"step": 1402
},
{
"epoch": 1.5785030951041081,
"grad_norm": 0.3505404658131974,
"learning_rate": 2.6303712974551524e-05,
"loss": 0.3686,
"step": 1403
},
{
"epoch": 1.5796285875070342,
"grad_norm": 0.33758728331020843,
"learning_rate": 2.6282853566958698e-05,
"loss": 0.3527,
"step": 1404
},
{
"epoch": 1.5807540799099606,
"grad_norm": 0.3435492065868497,
"learning_rate": 2.626199415936588e-05,
"loss": 0.3475,
"step": 1405
},
{
"epoch": 1.581879572312887,
"grad_norm": 0.3490084416143491,
"learning_rate": 2.624113475177305e-05,
"loss": 0.3607,
"step": 1406
},
{
"epoch": 1.5830050647158131,
"grad_norm": 0.31414180653905893,
"learning_rate": 2.622027534418023e-05,
"loss": 0.3504,
"step": 1407
},
{
"epoch": 1.5841305571187394,
"grad_norm": 0.3599821696826535,
"learning_rate": 2.6199415936587403e-05,
"loss": 0.3615,
"step": 1408
},
{
"epoch": 1.5852560495216657,
"grad_norm": 0.42310764019699615,
"learning_rate": 2.617855652899458e-05,
"loss": 0.3724,
"step": 1409
},
{
"epoch": 1.5863815419245921,
"grad_norm": 0.2833525199592301,
"learning_rate": 2.6157697121401753e-05,
"loss": 0.3617,
"step": 1410
},
{
"epoch": 1.5875070343275182,
"grad_norm": 0.3619653752728842,
"learning_rate": 2.613683771380893e-05,
"loss": 0.3535,
"step": 1411
},
{
"epoch": 1.5886325267304446,
"grad_norm": 0.31893555877641494,
"learning_rate": 2.6115978306216104e-05,
"loss": 0.3739,
"step": 1412
},
{
"epoch": 1.589758019133371,
"grad_norm": 0.367002811604332,
"learning_rate": 2.6095118898623284e-05,
"loss": 0.3553,
"step": 1413
},
{
"epoch": 1.5908835115362971,
"grad_norm": 0.27151097727860346,
"learning_rate": 2.6074259491030458e-05,
"loss": 0.3347,
"step": 1414
},
{
"epoch": 1.5920090039392234,
"grad_norm": 0.3131896896726996,
"learning_rate": 2.6053400083437628e-05,
"loss": 0.3546,
"step": 1415
},
{
"epoch": 1.5931344963421497,
"grad_norm": 0.36676987492115576,
"learning_rate": 2.6032540675844808e-05,
"loss": 0.3675,
"step": 1416
},
{
"epoch": 1.5942599887450761,
"grad_norm": 0.2950227483896426,
"learning_rate": 2.6011681268251982e-05,
"loss": 0.3648,
"step": 1417
},
{
"epoch": 1.5953854811480022,
"grad_norm": 0.34344487884738795,
"learning_rate": 2.599082186065916e-05,
"loss": 0.3597,
"step": 1418
},
{
"epoch": 1.5965109735509286,
"grad_norm": 0.320230789996618,
"learning_rate": 2.5969962453066332e-05,
"loss": 0.3524,
"step": 1419
},
{
"epoch": 1.5976364659538547,
"grad_norm": 0.32035648740276107,
"learning_rate": 2.594910304547351e-05,
"loss": 0.3595,
"step": 1420
},
{
"epoch": 1.5987619583567811,
"grad_norm": 0.2888199453121108,
"learning_rate": 2.5928243637880683e-05,
"loss": 0.3862,
"step": 1421
},
{
"epoch": 1.5998874507597074,
"grad_norm": 0.32236255339509834,
"learning_rate": 2.5907384230287863e-05,
"loss": 0.3476,
"step": 1422
},
{
"epoch": 1.6010129431626337,
"grad_norm": 0.3203989927659959,
"learning_rate": 2.5886524822695034e-05,
"loss": 0.3702,
"step": 1423
},
{
"epoch": 1.60213843556556,
"grad_norm": 0.2911113101367755,
"learning_rate": 2.5865665415102214e-05,
"loss": 0.3688,
"step": 1424
},
{
"epoch": 1.6032639279684862,
"grad_norm": 0.35071227735634586,
"learning_rate": 2.5844806007509388e-05,
"loss": 0.3808,
"step": 1425
},
{
"epoch": 1.6043894203714126,
"grad_norm": 1.359117007518664,
"learning_rate": 2.5823946599916565e-05,
"loss": 0.3551,
"step": 1426
},
{
"epoch": 1.6055149127743387,
"grad_norm": 0.33498969187479993,
"learning_rate": 2.5803087192323738e-05,
"loss": 0.3593,
"step": 1427
},
{
"epoch": 1.6066404051772651,
"grad_norm": 0.30337597464705507,
"learning_rate": 2.5782227784730915e-05,
"loss": 0.3674,
"step": 1428
},
{
"epoch": 1.6077658975801912,
"grad_norm": 0.3207844519265783,
"learning_rate": 2.576136837713809e-05,
"loss": 0.3557,
"step": 1429
},
{
"epoch": 1.6088913899831176,
"grad_norm": 0.3185723538525886,
"learning_rate": 2.574050896954527e-05,
"loss": 0.3633,
"step": 1430
},
{
"epoch": 1.610016882386044,
"grad_norm": 0.3110802343229136,
"learning_rate": 2.5719649561952443e-05,
"loss": 0.3621,
"step": 1431
},
{
"epoch": 1.6111423747889702,
"grad_norm": 0.39120392030901746,
"learning_rate": 2.569879015435962e-05,
"loss": 0.3718,
"step": 1432
},
{
"epoch": 1.6122678671918964,
"grad_norm": 0.3044483498179327,
"learning_rate": 2.5677930746766793e-05,
"loss": 0.3525,
"step": 1433
},
{
"epoch": 1.6133933595948227,
"grad_norm": 0.36593260259263516,
"learning_rate": 2.565707133917397e-05,
"loss": 0.3724,
"step": 1434
},
{
"epoch": 1.6145188519977491,
"grad_norm": 0.34991456432334755,
"learning_rate": 2.5636211931581144e-05,
"loss": 0.3682,
"step": 1435
},
{
"epoch": 1.6156443444006752,
"grad_norm": 0.32304123149901537,
"learning_rate": 2.5615352523988317e-05,
"loss": 0.3496,
"step": 1436
},
{
"epoch": 1.6167698368036016,
"grad_norm": 0.34708749419764806,
"learning_rate": 2.5594493116395494e-05,
"loss": 0.3913,
"step": 1437
},
{
"epoch": 1.617895329206528,
"grad_norm": 0.32488187134050506,
"learning_rate": 2.5573633708802668e-05,
"loss": 0.3469,
"step": 1438
},
{
"epoch": 1.6190208216094542,
"grad_norm": 0.31694764933224345,
"learning_rate": 2.555277430120985e-05,
"loss": 0.3903,
"step": 1439
},
{
"epoch": 1.6201463140123804,
"grad_norm": 0.2966648293508749,
"learning_rate": 2.5531914893617022e-05,
"loss": 0.3434,
"step": 1440
},
{
"epoch": 1.6212718064153067,
"grad_norm": 0.3130351777750274,
"learning_rate": 2.55110554860242e-05,
"loss": 0.3642,
"step": 1441
},
{
"epoch": 1.6223972988182331,
"grad_norm": 0.288157295810494,
"learning_rate": 2.5490196078431373e-05,
"loss": 0.3515,
"step": 1442
},
{
"epoch": 1.6235227912211592,
"grad_norm": 0.34698217632629985,
"learning_rate": 2.546933667083855e-05,
"loss": 0.3733,
"step": 1443
},
{
"epoch": 1.6246482836240856,
"grad_norm": 0.2724092253095966,
"learning_rate": 2.5448477263245723e-05,
"loss": 0.3497,
"step": 1444
},
{
"epoch": 1.6257737760270117,
"grad_norm": 0.24953001796720836,
"learning_rate": 2.5427617855652904e-05,
"loss": 0.3573,
"step": 1445
},
{
"epoch": 1.6268992684299382,
"grad_norm": 0.299260486745094,
"learning_rate": 2.5406758448060074e-05,
"loss": 0.3873,
"step": 1446
},
{
"epoch": 1.6280247608328644,
"grad_norm": 0.26925589680552175,
"learning_rate": 2.5385899040467254e-05,
"loss": 0.3508,
"step": 1447
},
{
"epoch": 1.6291502532357907,
"grad_norm": 0.29454604423730374,
"learning_rate": 2.5365039632874428e-05,
"loss": 0.3591,
"step": 1448
},
{
"epoch": 1.630275745638717,
"grad_norm": 0.27324874812018735,
"learning_rate": 2.5344180225281605e-05,
"loss": 0.3625,
"step": 1449
},
{
"epoch": 1.6314012380416432,
"grad_norm": 0.27258225073759196,
"learning_rate": 2.5323320817688778e-05,
"loss": 0.3554,
"step": 1450
},
{
"epoch": 1.6325267304445696,
"grad_norm": 0.3035610321463261,
"learning_rate": 2.5302461410095955e-05,
"loss": 0.3535,
"step": 1451
},
{
"epoch": 1.6336522228474957,
"grad_norm": 0.3628567082505913,
"learning_rate": 2.528160200250313e-05,
"loss": 0.358,
"step": 1452
},
{
"epoch": 1.6347777152504221,
"grad_norm": 0.26138414055253223,
"learning_rate": 2.526074259491031e-05,
"loss": 0.3664,
"step": 1453
},
{
"epoch": 1.6359032076533482,
"grad_norm": 0.3503328861643792,
"learning_rate": 2.5239883187317483e-05,
"loss": 0.3377,
"step": 1454
},
{
"epoch": 1.6370287000562747,
"grad_norm": 0.2673845892434079,
"learning_rate": 2.521902377972466e-05,
"loss": 0.3474,
"step": 1455
},
{
"epoch": 1.638154192459201,
"grad_norm": 0.27470271868463625,
"learning_rate": 2.5198164372131833e-05,
"loss": 0.3639,
"step": 1456
},
{
"epoch": 1.6392796848621272,
"grad_norm": 0.3112867744755204,
"learning_rate": 2.5177304964539007e-05,
"loss": 0.3662,
"step": 1457
},
{
"epoch": 1.6404051772650534,
"grad_norm": 0.29872249045203997,
"learning_rate": 2.5156445556946184e-05,
"loss": 0.3569,
"step": 1458
},
{
"epoch": 1.6415306696679797,
"grad_norm": 0.2950030580824579,
"learning_rate": 2.5135586149353358e-05,
"loss": 0.3877,
"step": 1459
},
{
"epoch": 1.6426561620709061,
"grad_norm": 0.30740378724405815,
"learning_rate": 2.5114726741760535e-05,
"loss": 0.368,
"step": 1460
},
{
"epoch": 1.6437816544738322,
"grad_norm": 0.43735074719358724,
"learning_rate": 2.5093867334167708e-05,
"loss": 0.3862,
"step": 1461
},
{
"epoch": 1.6449071468767587,
"grad_norm": 0.344358904604338,
"learning_rate": 2.507300792657489e-05,
"loss": 0.3556,
"step": 1462
},
{
"epoch": 1.646032639279685,
"grad_norm": 0.3606518532796079,
"learning_rate": 2.505214851898206e-05,
"loss": 0.3506,
"step": 1463
},
{
"epoch": 1.6471581316826112,
"grad_norm": 0.26793935225288906,
"learning_rate": 2.503128911138924e-05,
"loss": 0.3644,
"step": 1464
},
{
"epoch": 1.6482836240855374,
"grad_norm": 0.36553458630391006,
"learning_rate": 2.5010429703796413e-05,
"loss": 0.3786,
"step": 1465
},
{
"epoch": 1.6494091164884637,
"grad_norm": 0.3032742387012001,
"learning_rate": 2.4989570296203586e-05,
"loss": 0.3606,
"step": 1466
},
{
"epoch": 1.6505346088913901,
"grad_norm": 0.2573644911193979,
"learning_rate": 2.4968710888610763e-05,
"loss": 0.3758,
"step": 1467
},
{
"epoch": 1.6516601012943162,
"grad_norm": 0.3260439897844004,
"learning_rate": 2.494785148101794e-05,
"loss": 0.3701,
"step": 1468
},
{
"epoch": 1.6527855936972426,
"grad_norm": 0.2791366230994869,
"learning_rate": 2.4926992073425114e-05,
"loss": 0.3608,
"step": 1469
},
{
"epoch": 1.6539110861001687,
"grad_norm": 0.28073773442639216,
"learning_rate": 2.490613266583229e-05,
"loss": 0.3552,
"step": 1470
},
{
"epoch": 1.6550365785030952,
"grad_norm": 0.2751936808067321,
"learning_rate": 2.4885273258239468e-05,
"loss": 0.3551,
"step": 1471
},
{
"epoch": 1.6561620709060214,
"grad_norm": 0.31105318511449315,
"learning_rate": 2.486441385064664e-05,
"loss": 0.3846,
"step": 1472
},
{
"epoch": 1.6572875633089477,
"grad_norm": 0.2779436567942526,
"learning_rate": 2.484355444305382e-05,
"loss": 0.342,
"step": 1473
},
{
"epoch": 1.658413055711874,
"grad_norm": 0.260118994793512,
"learning_rate": 2.4822695035460995e-05,
"loss": 0.3416,
"step": 1474
},
{
"epoch": 1.6595385481148002,
"grad_norm": 0.30797304765243294,
"learning_rate": 2.480183562786817e-05,
"loss": 0.3649,
"step": 1475
},
{
"epoch": 1.6606640405177266,
"grad_norm": 0.27879300341701935,
"learning_rate": 2.4780976220275346e-05,
"loss": 0.3577,
"step": 1476
},
{
"epoch": 1.6617895329206527,
"grad_norm": 0.2618302523010228,
"learning_rate": 2.476011681268252e-05,
"loss": 0.373,
"step": 1477
},
{
"epoch": 1.6629150253235792,
"grad_norm": 0.2691572921484226,
"learning_rate": 2.4739257405089697e-05,
"loss": 0.3382,
"step": 1478
},
{
"epoch": 1.6640405177265052,
"grad_norm": 0.3021887597303646,
"learning_rate": 2.4718397997496874e-05,
"loss": 0.3561,
"step": 1479
},
{
"epoch": 1.6651660101294317,
"grad_norm": 0.29571070245395525,
"learning_rate": 2.4697538589904047e-05,
"loss": 0.3666,
"step": 1480
},
{
"epoch": 1.666291502532358,
"grad_norm": 0.3060388532862541,
"learning_rate": 2.4676679182311224e-05,
"loss": 0.3574,
"step": 1481
},
{
"epoch": 1.6674169949352842,
"grad_norm": 0.262863158327581,
"learning_rate": 2.46558197747184e-05,
"loss": 0.3515,
"step": 1482
},
{
"epoch": 1.6685424873382104,
"grad_norm": 0.26211725924142215,
"learning_rate": 2.4634960367125575e-05,
"loss": 0.3673,
"step": 1483
},
{
"epoch": 1.6696679797411367,
"grad_norm": 0.27559909119280296,
"learning_rate": 2.461410095953275e-05,
"loss": 0.3699,
"step": 1484
},
{
"epoch": 1.6707934721440632,
"grad_norm": 0.3286258665699544,
"learning_rate": 2.459324155193993e-05,
"loss": 0.3547,
"step": 1485
},
{
"epoch": 1.6719189645469892,
"grad_norm": 0.28394671282033973,
"learning_rate": 2.4572382144347102e-05,
"loss": 0.3699,
"step": 1486
},
{
"epoch": 1.6730444569499157,
"grad_norm": 0.28904710622589413,
"learning_rate": 2.4551522736754276e-05,
"loss": 0.3498,
"step": 1487
},
{
"epoch": 1.674169949352842,
"grad_norm": 0.3427205931479807,
"learning_rate": 2.4530663329161453e-05,
"loss": 0.3501,
"step": 1488
},
{
"epoch": 1.6752954417557682,
"grad_norm": 0.29275848448510483,
"learning_rate": 2.4509803921568626e-05,
"loss": 0.3436,
"step": 1489
},
{
"epoch": 1.6764209341586944,
"grad_norm": 0.281092401245526,
"learning_rate": 2.4488944513975803e-05,
"loss": 0.3618,
"step": 1490
},
{
"epoch": 1.6775464265616207,
"grad_norm": 0.3028349680860752,
"learning_rate": 2.446808510638298e-05,
"loss": 0.36,
"step": 1491
},
{
"epoch": 1.6786719189645471,
"grad_norm": 0.2740774488090583,
"learning_rate": 2.4447225698790154e-05,
"loss": 0.3761,
"step": 1492
},
{
"epoch": 1.6797974113674732,
"grad_norm": 0.3565931288895132,
"learning_rate": 2.442636629119733e-05,
"loss": 0.3656,
"step": 1493
},
{
"epoch": 1.6809229037703997,
"grad_norm": 0.30994295777900555,
"learning_rate": 2.4405506883604508e-05,
"loss": 0.3709,
"step": 1494
},
{
"epoch": 1.6820483961733257,
"grad_norm": 0.2770746159082683,
"learning_rate": 2.438464747601168e-05,
"loss": 0.3668,
"step": 1495
},
{
"epoch": 1.6831738885762522,
"grad_norm": 0.28851250362528635,
"learning_rate": 2.436378806841886e-05,
"loss": 0.3375,
"step": 1496
},
{
"epoch": 1.6842993809791784,
"grad_norm": 0.30502905628031945,
"learning_rate": 2.4342928660826032e-05,
"loss": 0.372,
"step": 1497
},
{
"epoch": 1.6854248733821047,
"grad_norm": 0.27606414842777804,
"learning_rate": 2.432206925323321e-05,
"loss": 0.3667,
"step": 1498
},
{
"epoch": 1.686550365785031,
"grad_norm": 0.2807248595539354,
"learning_rate": 2.4301209845640386e-05,
"loss": 0.3372,
"step": 1499
},
{
"epoch": 1.6876758581879572,
"grad_norm": 0.3189944644623768,
"learning_rate": 2.428035043804756e-05,
"loss": 0.3556,
"step": 1500
},
{
"epoch": 1.6888013505908837,
"grad_norm": 0.3542996839432631,
"learning_rate": 2.4259491030454737e-05,
"loss": 0.3657,
"step": 1501
},
{
"epoch": 1.6899268429938097,
"grad_norm": 0.26759487532851395,
"learning_rate": 2.4238631622861914e-05,
"loss": 0.3537,
"step": 1502
},
{
"epoch": 1.6910523353967362,
"grad_norm": 0.32892071648122306,
"learning_rate": 2.4217772215269087e-05,
"loss": 0.3679,
"step": 1503
},
{
"epoch": 1.6921778277996622,
"grad_norm": 0.27325117871239496,
"learning_rate": 2.4196912807676264e-05,
"loss": 0.3636,
"step": 1504
},
{
"epoch": 1.6933033202025887,
"grad_norm": 0.31473981377419813,
"learning_rate": 2.417605340008344e-05,
"loss": 0.3546,
"step": 1505
},
{
"epoch": 1.694428812605515,
"grad_norm": 0.6213973467005295,
"learning_rate": 2.4155193992490615e-05,
"loss": 0.3569,
"step": 1506
},
{
"epoch": 1.6955543050084412,
"grad_norm": 0.29664784115736215,
"learning_rate": 2.4134334584897792e-05,
"loss": 0.3541,
"step": 1507
},
{
"epoch": 1.6966797974113674,
"grad_norm": 0.30075562806982764,
"learning_rate": 2.4113475177304965e-05,
"loss": 0.3442,
"step": 1508
},
{
"epoch": 1.6978052898142937,
"grad_norm": 0.2798816842619607,
"learning_rate": 2.409261576971214e-05,
"loss": 0.3723,
"step": 1509
},
{
"epoch": 1.6989307822172202,
"grad_norm": 0.3125716574597028,
"learning_rate": 2.4071756362119316e-05,
"loss": 0.3534,
"step": 1510
},
{
"epoch": 1.7000562746201462,
"grad_norm": 0.2695382685076537,
"learning_rate": 2.4050896954526493e-05,
"loss": 0.3501,
"step": 1511
},
{
"epoch": 1.7011817670230727,
"grad_norm": 0.30428973956664224,
"learning_rate": 2.4030037546933667e-05,
"loss": 0.361,
"step": 1512
},
{
"epoch": 1.702307259425999,
"grad_norm": 0.2954859753709326,
"learning_rate": 2.4009178139340844e-05,
"loss": 0.348,
"step": 1513
},
{
"epoch": 1.7034327518289252,
"grad_norm": 0.2535522065599469,
"learning_rate": 2.398831873174802e-05,
"loss": 0.3448,
"step": 1514
},
{
"epoch": 1.7045582442318514,
"grad_norm": 0.2877878849798194,
"learning_rate": 2.3967459324155194e-05,
"loss": 0.3615,
"step": 1515
},
{
"epoch": 1.7056837366347777,
"grad_norm": 0.2679693175700858,
"learning_rate": 2.394659991656237e-05,
"loss": 0.3575,
"step": 1516
},
{
"epoch": 1.7068092290377042,
"grad_norm": 0.270042339489181,
"learning_rate": 2.3925740508969545e-05,
"loss": 0.3612,
"step": 1517
},
{
"epoch": 1.7079347214406302,
"grad_norm": 0.3277570559960174,
"learning_rate": 2.390488110137672e-05,
"loss": 0.3539,
"step": 1518
},
{
"epoch": 1.7090602138435567,
"grad_norm": 0.273010908537002,
"learning_rate": 2.38840216937839e-05,
"loss": 0.3813,
"step": 1519
},
{
"epoch": 1.7101857062464827,
"grad_norm": 0.3163418829636289,
"learning_rate": 2.3863162286191072e-05,
"loss": 0.367,
"step": 1520
},
{
"epoch": 1.7113111986494092,
"grad_norm": 0.2790546740132572,
"learning_rate": 2.384230287859825e-05,
"loss": 0.3616,
"step": 1521
},
{
"epoch": 1.7124366910523354,
"grad_norm": 0.3794647847000264,
"learning_rate": 2.3821443471005426e-05,
"loss": 0.3264,
"step": 1522
},
{
"epoch": 1.7135621834552617,
"grad_norm": 0.27180490681435693,
"learning_rate": 2.38005840634126e-05,
"loss": 0.3667,
"step": 1523
},
{
"epoch": 1.714687675858188,
"grad_norm": 0.3192761006046379,
"learning_rate": 2.3779724655819777e-05,
"loss": 0.3516,
"step": 1524
},
{
"epoch": 1.7158131682611142,
"grad_norm": 0.29770507590073336,
"learning_rate": 2.3758865248226954e-05,
"loss": 0.3616,
"step": 1525
},
{
"epoch": 1.7169386606640407,
"grad_norm": 0.3198828879152863,
"learning_rate": 2.3738005840634127e-05,
"loss": 0.3529,
"step": 1526
},
{
"epoch": 1.7180641530669667,
"grad_norm": 0.3090153579359256,
"learning_rate": 2.3717146433041304e-05,
"loss": 0.3526,
"step": 1527
},
{
"epoch": 1.7191896454698932,
"grad_norm": 0.3212232642206978,
"learning_rate": 2.369628702544848e-05,
"loss": 0.3607,
"step": 1528
},
{
"epoch": 1.7203151378728192,
"grad_norm": 0.30043128684782044,
"learning_rate": 2.367542761785565e-05,
"loss": 0.352,
"step": 1529
},
{
"epoch": 1.7214406302757457,
"grad_norm": 0.29295625581516677,
"learning_rate": 2.365456821026283e-05,
"loss": 0.3523,
"step": 1530
},
{
"epoch": 1.722566122678672,
"grad_norm": 0.3148385769404437,
"learning_rate": 2.3633708802670006e-05,
"loss": 0.3428,
"step": 1531
},
{
"epoch": 1.7236916150815982,
"grad_norm": 0.2809729795961225,
"learning_rate": 2.361284939507718e-05,
"loss": 0.3501,
"step": 1532
},
{
"epoch": 1.7248171074845244,
"grad_norm": 0.26779520094077724,
"learning_rate": 2.3591989987484356e-05,
"loss": 0.3692,
"step": 1533
},
{
"epoch": 1.7259425998874507,
"grad_norm": 0.34366805506707354,
"learning_rate": 2.3571130579891533e-05,
"loss": 0.3487,
"step": 1534
},
{
"epoch": 1.7270680922903772,
"grad_norm": 0.31386821776914015,
"learning_rate": 2.3550271172298707e-05,
"loss": 0.3738,
"step": 1535
},
{
"epoch": 1.7281935846933032,
"grad_norm": 0.27129248750888196,
"learning_rate": 2.3529411764705884e-05,
"loss": 0.347,
"step": 1536
},
{
"epoch": 1.7293190770962297,
"grad_norm": 0.30115138922829,
"learning_rate": 2.3508552357113057e-05,
"loss": 0.3644,
"step": 1537
},
{
"epoch": 1.730444569499156,
"grad_norm": 0.33400000355343035,
"learning_rate": 2.3487692949520234e-05,
"loss": 0.3735,
"step": 1538
},
{
"epoch": 1.7315700619020822,
"grad_norm": 0.2817713843812286,
"learning_rate": 2.346683354192741e-05,
"loss": 0.3536,
"step": 1539
},
{
"epoch": 1.7326955543050084,
"grad_norm": 0.2536336062497862,
"learning_rate": 2.3445974134334585e-05,
"loss": 0.3531,
"step": 1540
},
{
"epoch": 1.7338210467079347,
"grad_norm": 0.292119578282618,
"learning_rate": 2.3425114726741762e-05,
"loss": 0.3917,
"step": 1541
},
{
"epoch": 1.7349465391108612,
"grad_norm": 0.301736936214816,
"learning_rate": 2.340425531914894e-05,
"loss": 0.3368,
"step": 1542
},
{
"epoch": 1.7360720315137872,
"grad_norm": 0.2834782995265929,
"learning_rate": 2.3383395911556112e-05,
"loss": 0.3627,
"step": 1543
},
{
"epoch": 1.7371975239167137,
"grad_norm": 0.3472332663859999,
"learning_rate": 2.336253650396329e-05,
"loss": 0.3563,
"step": 1544
},
{
"epoch": 1.7383230163196397,
"grad_norm": 0.2770080632091572,
"learning_rate": 2.3341677096370466e-05,
"loss": 0.3638,
"step": 1545
},
{
"epoch": 1.7394485087225662,
"grad_norm": 0.28038474675505726,
"learning_rate": 2.332081768877764e-05,
"loss": 0.3578,
"step": 1546
},
{
"epoch": 1.7405740011254924,
"grad_norm": 0.29413387062574675,
"learning_rate": 2.3299958281184817e-05,
"loss": 0.3581,
"step": 1547
},
{
"epoch": 1.7416994935284187,
"grad_norm": 0.250154894365378,
"learning_rate": 2.3279098873591994e-05,
"loss": 0.3524,
"step": 1548
},
{
"epoch": 1.742824985931345,
"grad_norm": 0.27004730168507385,
"learning_rate": 2.3258239465999164e-05,
"loss": 0.3592,
"step": 1549
},
{
"epoch": 1.7439504783342712,
"grad_norm": 0.30931998115710535,
"learning_rate": 2.323738005840634e-05,
"loss": 0.3633,
"step": 1550
},
{
"epoch": 1.7450759707371977,
"grad_norm": 0.260094920014104,
"learning_rate": 2.3216520650813518e-05,
"loss": 0.36,
"step": 1551
},
{
"epoch": 1.7462014631401237,
"grad_norm": 0.28020792072208933,
"learning_rate": 2.3195661243220692e-05,
"loss": 0.3619,
"step": 1552
},
{
"epoch": 1.7473269555430502,
"grad_norm": 0.29150594274575353,
"learning_rate": 2.317480183562787e-05,
"loss": 0.3752,
"step": 1553
},
{
"epoch": 1.7484524479459762,
"grad_norm": 0.2780077227889234,
"learning_rate": 2.3153942428035046e-05,
"loss": 0.3404,
"step": 1554
},
{
"epoch": 1.7495779403489027,
"grad_norm": 0.26577200333767786,
"learning_rate": 2.313308302044222e-05,
"loss": 0.3457,
"step": 1555
},
{
"epoch": 1.750703432751829,
"grad_norm": 0.297363502447975,
"learning_rate": 2.3112223612849396e-05,
"loss": 0.3473,
"step": 1556
},
{
"epoch": 1.7518289251547552,
"grad_norm": 0.26278420558469534,
"learning_rate": 2.309136420525657e-05,
"loss": 0.3474,
"step": 1557
},
{
"epoch": 1.7529544175576814,
"grad_norm": 0.26900103936760594,
"learning_rate": 2.3070504797663747e-05,
"loss": 0.3531,
"step": 1558
},
{
"epoch": 1.7540799099606077,
"grad_norm": 0.32212325532836394,
"learning_rate": 2.3049645390070924e-05,
"loss": 0.3671,
"step": 1559
},
{
"epoch": 1.7552054023635342,
"grad_norm": 0.2970498028319336,
"learning_rate": 2.3028785982478097e-05,
"loss": 0.3488,
"step": 1560
},
{
"epoch": 1.7563308947664602,
"grad_norm": 0.3127346620449437,
"learning_rate": 2.3007926574885274e-05,
"loss": 0.3585,
"step": 1561
},
{
"epoch": 1.7574563871693867,
"grad_norm": 0.2765174597912786,
"learning_rate": 2.298706716729245e-05,
"loss": 0.3394,
"step": 1562
},
{
"epoch": 1.758581879572313,
"grad_norm": 0.3009783707148293,
"learning_rate": 2.2966207759699625e-05,
"loss": 0.3629,
"step": 1563
},
{
"epoch": 1.7597073719752392,
"grad_norm": 0.30115522865418154,
"learning_rate": 2.2945348352106802e-05,
"loss": 0.3725,
"step": 1564
},
{
"epoch": 1.7608328643781654,
"grad_norm": 0.2930519854916032,
"learning_rate": 2.292448894451398e-05,
"loss": 0.3569,
"step": 1565
},
{
"epoch": 1.7619583567810917,
"grad_norm": 0.3047405558309698,
"learning_rate": 2.2903629536921153e-05,
"loss": 0.3714,
"step": 1566
},
{
"epoch": 1.7630838491840182,
"grad_norm": 0.2590925307869418,
"learning_rate": 2.288277012932833e-05,
"loss": 0.3429,
"step": 1567
},
{
"epoch": 1.7642093415869442,
"grad_norm": 0.26569956950346013,
"learning_rate": 2.2861910721735507e-05,
"loss": 0.3563,
"step": 1568
},
{
"epoch": 1.7653348339898707,
"grad_norm": 0.3212454261162196,
"learning_rate": 2.284105131414268e-05,
"loss": 0.3628,
"step": 1569
},
{
"epoch": 1.7664603263927967,
"grad_norm": 0.25121736290737545,
"learning_rate": 2.2820191906549854e-05,
"loss": 0.3407,
"step": 1570
},
{
"epoch": 1.7675858187957232,
"grad_norm": 0.2568310971026976,
"learning_rate": 2.279933249895703e-05,
"loss": 0.3556,
"step": 1571
},
{
"epoch": 1.7687113111986494,
"grad_norm": 0.2766368973128219,
"learning_rate": 2.2778473091364204e-05,
"loss": 0.3488,
"step": 1572
},
{
"epoch": 1.7698368036015757,
"grad_norm": 0.2830150867726597,
"learning_rate": 2.275761368377138e-05,
"loss": 0.3563,
"step": 1573
},
{
"epoch": 1.770962296004502,
"grad_norm": 0.2704782966697743,
"learning_rate": 2.2736754276178558e-05,
"loss": 0.3785,
"step": 1574
},
{
"epoch": 1.7720877884074282,
"grad_norm": 0.28693834596503254,
"learning_rate": 2.2715894868585732e-05,
"loss": 0.3568,
"step": 1575
},
{
"epoch": 1.7732132808103547,
"grad_norm": 0.300591423339274,
"learning_rate": 2.269503546099291e-05,
"loss": 0.3401,
"step": 1576
},
{
"epoch": 1.7743387732132807,
"grad_norm": 0.25477856684248135,
"learning_rate": 2.2674176053400082e-05,
"loss": 0.3485,
"step": 1577
},
{
"epoch": 1.7754642656162072,
"grad_norm": 0.27289380951433195,
"learning_rate": 2.265331664580726e-05,
"loss": 0.3607,
"step": 1578
},
{
"epoch": 1.7765897580191332,
"grad_norm": 0.28248295859121,
"learning_rate": 2.2632457238214436e-05,
"loss": 0.3511,
"step": 1579
},
{
"epoch": 1.7777152504220597,
"grad_norm": 0.2658629597762577,
"learning_rate": 2.261159783062161e-05,
"loss": 0.377,
"step": 1580
},
{
"epoch": 1.778840742824986,
"grad_norm": 0.27220952413476507,
"learning_rate": 2.2590738423028787e-05,
"loss": 0.3557,
"step": 1581
},
{
"epoch": 1.7799662352279122,
"grad_norm": 0.2328823780748166,
"learning_rate": 2.2569879015435964e-05,
"loss": 0.3437,
"step": 1582
},
{
"epoch": 1.7810917276308385,
"grad_norm": 0.27552976720286626,
"learning_rate": 2.2549019607843138e-05,
"loss": 0.3499,
"step": 1583
},
{
"epoch": 1.7822172200337647,
"grad_norm": 0.26988928502984605,
"learning_rate": 2.2528160200250315e-05,
"loss": 0.3739,
"step": 1584
},
{
"epoch": 1.7833427124366912,
"grad_norm": 0.28360069868054577,
"learning_rate": 2.250730079265749e-05,
"loss": 0.3586,
"step": 1585
},
{
"epoch": 1.7844682048396172,
"grad_norm": 0.30703362231564924,
"learning_rate": 2.2486441385064665e-05,
"loss": 0.3402,
"step": 1586
},
{
"epoch": 1.7855936972425437,
"grad_norm": 0.24229886888660893,
"learning_rate": 2.2465581977471842e-05,
"loss": 0.3749,
"step": 1587
},
{
"epoch": 1.78671918964547,
"grad_norm": 0.3052949362012416,
"learning_rate": 2.244472256987902e-05,
"loss": 0.3432,
"step": 1588
},
{
"epoch": 1.7878446820483962,
"grad_norm": 0.30727845879387705,
"learning_rate": 2.2423863162286193e-05,
"loss": 0.352,
"step": 1589
},
{
"epoch": 1.7889701744513224,
"grad_norm": 0.2871867401825979,
"learning_rate": 2.240300375469337e-05,
"loss": 0.3654,
"step": 1590
},
{
"epoch": 1.7900956668542487,
"grad_norm": 0.29179973425408784,
"learning_rate": 2.2382144347100543e-05,
"loss": 0.3616,
"step": 1591
},
{
"epoch": 1.7912211592571752,
"grad_norm": 0.3214829947161843,
"learning_rate": 2.2361284939507717e-05,
"loss": 0.3512,
"step": 1592
},
{
"epoch": 1.7923466516601012,
"grad_norm": 0.24147795550593532,
"learning_rate": 2.2340425531914894e-05,
"loss": 0.3462,
"step": 1593
},
{
"epoch": 1.7934721440630277,
"grad_norm": 0.27689467276611157,
"learning_rate": 2.231956612432207e-05,
"loss": 0.36,
"step": 1594
},
{
"epoch": 1.7945976364659537,
"grad_norm": 0.28150686561848237,
"learning_rate": 2.2298706716729244e-05,
"loss": 0.3532,
"step": 1595
},
{
"epoch": 1.7957231288688802,
"grad_norm": 0.2581225749795623,
"learning_rate": 2.227784730913642e-05,
"loss": 0.3559,
"step": 1596
},
{
"epoch": 1.7968486212718064,
"grad_norm": 0.3039816245853392,
"learning_rate": 2.22569879015436e-05,
"loss": 0.3538,
"step": 1597
},
{
"epoch": 1.7979741136747327,
"grad_norm": 0.25714237851869526,
"learning_rate": 2.2236128493950772e-05,
"loss": 0.3442,
"step": 1598
},
{
"epoch": 1.799099606077659,
"grad_norm": 0.24074871831754024,
"learning_rate": 2.221526908635795e-05,
"loss": 0.349,
"step": 1599
},
{
"epoch": 1.8002250984805852,
"grad_norm": 0.28044366274540433,
"learning_rate": 2.2194409678765123e-05,
"loss": 0.3476,
"step": 1600
},
{
"epoch": 1.8013505908835117,
"grad_norm": 0.3055206598860284,
"learning_rate": 2.21735502711723e-05,
"loss": 0.329,
"step": 1601
},
{
"epoch": 1.8024760832864377,
"grad_norm": 0.32077315537142925,
"learning_rate": 2.2152690863579477e-05,
"loss": 0.3632,
"step": 1602
},
{
"epoch": 1.8036015756893642,
"grad_norm": 0.3191853556743268,
"learning_rate": 2.213183145598665e-05,
"loss": 0.3724,
"step": 1603
},
{
"epoch": 1.8047270680922902,
"grad_norm": 0.309559125522351,
"learning_rate": 2.2110972048393827e-05,
"loss": 0.36,
"step": 1604
},
{
"epoch": 1.8058525604952167,
"grad_norm": 0.31187663837864876,
"learning_rate": 2.2090112640801004e-05,
"loss": 0.344,
"step": 1605
},
{
"epoch": 1.806978052898143,
"grad_norm": 0.3041182224443529,
"learning_rate": 2.2069253233208178e-05,
"loss": 0.353,
"step": 1606
},
{
"epoch": 1.8081035453010692,
"grad_norm": 0.29282481275876526,
"learning_rate": 2.2048393825615355e-05,
"loss": 0.3486,
"step": 1607
},
{
"epoch": 1.8092290377039955,
"grad_norm": 0.29147172423218604,
"learning_rate": 2.202753441802253e-05,
"loss": 0.3672,
"step": 1608
},
{
"epoch": 1.8103545301069217,
"grad_norm": 0.28412350307097217,
"learning_rate": 2.2006675010429705e-05,
"loss": 0.3353,
"step": 1609
},
{
"epoch": 1.8114800225098482,
"grad_norm": 0.3067481559384326,
"learning_rate": 2.1985815602836882e-05,
"loss": 0.3768,
"step": 1610
},
{
"epoch": 1.8126055149127742,
"grad_norm": 0.28038315675437364,
"learning_rate": 2.1964956195244056e-05,
"loss": 0.3654,
"step": 1611
},
{
"epoch": 1.8137310073157007,
"grad_norm": 0.35539158393187636,
"learning_rate": 2.194409678765123e-05,
"loss": 0.3538,
"step": 1612
},
{
"epoch": 1.814856499718627,
"grad_norm": 0.25885136003612047,
"learning_rate": 2.1923237380058406e-05,
"loss": 0.3644,
"step": 1613
},
{
"epoch": 1.8159819921215532,
"grad_norm": 0.26093239365043436,
"learning_rate": 2.1902377972465583e-05,
"loss": 0.3708,
"step": 1614
},
{
"epoch": 1.8171074845244795,
"grad_norm": 0.2961872877279803,
"learning_rate": 2.1881518564872757e-05,
"loss": 0.3596,
"step": 1615
},
{
"epoch": 1.8182329769274057,
"grad_norm": 0.25680902610020434,
"learning_rate": 2.1860659157279934e-05,
"loss": 0.3577,
"step": 1616
},
{
"epoch": 1.8193584693303322,
"grad_norm": 0.25991234140815395,
"learning_rate": 2.183979974968711e-05,
"loss": 0.3633,
"step": 1617
},
{
"epoch": 1.8204839617332582,
"grad_norm": 0.2613869530348512,
"learning_rate": 2.1818940342094285e-05,
"loss": 0.3418,
"step": 1618
},
{
"epoch": 1.8216094541361847,
"grad_norm": 0.24790904990988194,
"learning_rate": 2.179808093450146e-05,
"loss": 0.3632,
"step": 1619
},
{
"epoch": 1.8227349465391107,
"grad_norm": 0.28799463478351406,
"learning_rate": 2.1777221526908635e-05,
"loss": 0.3699,
"step": 1620
},
{
"epoch": 1.8238604389420372,
"grad_norm": 0.25548160538250764,
"learning_rate": 2.1756362119315812e-05,
"loss": 0.3442,
"step": 1621
},
{
"epoch": 1.8249859313449635,
"grad_norm": 0.2985142619761683,
"learning_rate": 2.173550271172299e-05,
"loss": 0.3486,
"step": 1622
},
{
"epoch": 1.8261114237478897,
"grad_norm": 0.2972946035959545,
"learning_rate": 2.1714643304130163e-05,
"loss": 0.3414,
"step": 1623
},
{
"epoch": 1.827236916150816,
"grad_norm": 0.26170651498968683,
"learning_rate": 2.169378389653734e-05,
"loss": 0.3284,
"step": 1624
},
{
"epoch": 1.8283624085537422,
"grad_norm": 0.2524407858115918,
"learning_rate": 2.1672924488944517e-05,
"loss": 0.3599,
"step": 1625
},
{
"epoch": 1.8294879009566687,
"grad_norm": 0.3335691621333924,
"learning_rate": 2.165206508135169e-05,
"loss": 0.3603,
"step": 1626
},
{
"epoch": 1.8306133933595947,
"grad_norm": 0.2768913167073537,
"learning_rate": 2.1631205673758867e-05,
"loss": 0.3588,
"step": 1627
},
{
"epoch": 1.8317388857625212,
"grad_norm": 0.30050684042922793,
"learning_rate": 2.1610346266166044e-05,
"loss": 0.372,
"step": 1628
},
{
"epoch": 1.8328643781654472,
"grad_norm": 0.2901843574196796,
"learning_rate": 2.1589486858573218e-05,
"loss": 0.3639,
"step": 1629
},
{
"epoch": 1.8339898705683737,
"grad_norm": 0.29902669217912486,
"learning_rate": 2.1568627450980395e-05,
"loss": 0.3735,
"step": 1630
},
{
"epoch": 1.8351153629713,
"grad_norm": 0.30980781618970216,
"learning_rate": 2.154776804338757e-05,
"loss": 0.3572,
"step": 1631
},
{
"epoch": 1.8362408553742262,
"grad_norm": 0.26616420601594276,
"learning_rate": 2.1526908635794745e-05,
"loss": 0.3582,
"step": 1632
},
{
"epoch": 1.8373663477771525,
"grad_norm": 0.29096782812841715,
"learning_rate": 2.150604922820192e-05,
"loss": 0.3533,
"step": 1633
},
{
"epoch": 1.8384918401800787,
"grad_norm": 0.29936454913412547,
"learning_rate": 2.1485189820609096e-05,
"loss": 0.3441,
"step": 1634
},
{
"epoch": 1.8396173325830052,
"grad_norm": 0.34946000879087785,
"learning_rate": 2.146433041301627e-05,
"loss": 0.3628,
"step": 1635
},
{
"epoch": 1.8407428249859312,
"grad_norm": 0.2623712677205065,
"learning_rate": 2.1443471005423447e-05,
"loss": 0.3545,
"step": 1636
},
{
"epoch": 1.8418683173888577,
"grad_norm": 0.2753735634753566,
"learning_rate": 2.1422611597830624e-05,
"loss": 0.3528,
"step": 1637
},
{
"epoch": 1.842993809791784,
"grad_norm": 0.31812525886192866,
"learning_rate": 2.1401752190237797e-05,
"loss": 0.3697,
"step": 1638
},
{
"epoch": 1.8441193021947102,
"grad_norm": 0.29105961621045684,
"learning_rate": 2.1380892782644974e-05,
"loss": 0.3546,
"step": 1639
},
{
"epoch": 1.8452447945976365,
"grad_norm": 0.2691984264982239,
"learning_rate": 2.1360033375052148e-05,
"loss": 0.3536,
"step": 1640
},
{
"epoch": 1.8463702870005627,
"grad_norm": 0.2993538772854178,
"learning_rate": 2.1339173967459325e-05,
"loss": 0.363,
"step": 1641
},
{
"epoch": 1.8474957794034892,
"grad_norm": 0.29783181788963287,
"learning_rate": 2.13183145598665e-05,
"loss": 0.3592,
"step": 1642
},
{
"epoch": 1.8486212718064152,
"grad_norm": 0.2775688059074239,
"learning_rate": 2.1297455152273675e-05,
"loss": 0.3581,
"step": 1643
},
{
"epoch": 1.8497467642093417,
"grad_norm": 0.3133614801924746,
"learning_rate": 2.1276595744680852e-05,
"loss": 0.3498,
"step": 1644
},
{
"epoch": 1.8508722566122677,
"grad_norm": 0.2772230818911116,
"learning_rate": 2.125573633708803e-05,
"loss": 0.3558,
"step": 1645
},
{
"epoch": 1.8519977490151942,
"grad_norm": 0.30827283116401644,
"learning_rate": 2.1234876929495203e-05,
"loss": 0.3614,
"step": 1646
},
{
"epoch": 1.8531232414181205,
"grad_norm": 0.24090218764810817,
"learning_rate": 2.121401752190238e-05,
"loss": 0.3662,
"step": 1647
},
{
"epoch": 1.8542487338210467,
"grad_norm": 0.28761910481188807,
"learning_rate": 2.1193158114309557e-05,
"loss": 0.3441,
"step": 1648
},
{
"epoch": 1.855374226223973,
"grad_norm": 0.2560509442786654,
"learning_rate": 2.117229870671673e-05,
"loss": 0.3547,
"step": 1649
},
{
"epoch": 1.8564997186268992,
"grad_norm": 0.30034883076743724,
"learning_rate": 2.1151439299123907e-05,
"loss": 0.3449,
"step": 1650
},
{
"epoch": 1.8576252110298257,
"grad_norm": 0.34444462233589906,
"learning_rate": 2.113057989153108e-05,
"loss": 0.3837,
"step": 1651
},
{
"epoch": 1.8587507034327517,
"grad_norm": 0.27692690682489174,
"learning_rate": 2.1109720483938258e-05,
"loss": 0.3607,
"step": 1652
},
{
"epoch": 1.8598761958356782,
"grad_norm": 0.26001142796077303,
"learning_rate": 2.1088861076345435e-05,
"loss": 0.3398,
"step": 1653
},
{
"epoch": 1.8610016882386042,
"grad_norm": 0.25366060360784753,
"learning_rate": 2.106800166875261e-05,
"loss": 0.3656,
"step": 1654
},
{
"epoch": 1.8621271806415307,
"grad_norm": 0.25058815872177637,
"learning_rate": 2.1047142261159782e-05,
"loss": 0.3439,
"step": 1655
},
{
"epoch": 1.863252673044457,
"grad_norm": 0.28664975028041284,
"learning_rate": 2.102628285356696e-05,
"loss": 0.3583,
"step": 1656
},
{
"epoch": 1.8643781654473832,
"grad_norm": 0.2732549675529288,
"learning_rate": 2.1005423445974136e-05,
"loss": 0.3305,
"step": 1657
},
{
"epoch": 1.8655036578503095,
"grad_norm": 0.2773666490469463,
"learning_rate": 2.098456403838131e-05,
"loss": 0.3591,
"step": 1658
},
{
"epoch": 1.8666291502532357,
"grad_norm": 0.2690002427002813,
"learning_rate": 2.0963704630788487e-05,
"loss": 0.3684,
"step": 1659
},
{
"epoch": 1.8677546426561622,
"grad_norm": 0.27085097978896006,
"learning_rate": 2.094284522319566e-05,
"loss": 0.3384,
"step": 1660
},
{
"epoch": 1.8688801350590882,
"grad_norm": 0.24697707069643743,
"learning_rate": 2.0921985815602837e-05,
"loss": 0.3572,
"step": 1661
},
{
"epoch": 1.8700056274620147,
"grad_norm": 0.2764605247602527,
"learning_rate": 2.0901126408010014e-05,
"loss": 0.3552,
"step": 1662
},
{
"epoch": 1.871131119864941,
"grad_norm": 0.2902550139143697,
"learning_rate": 2.0880267000417188e-05,
"loss": 0.3581,
"step": 1663
},
{
"epoch": 1.8722566122678672,
"grad_norm": 0.25734658506325125,
"learning_rate": 2.0859407592824365e-05,
"loss": 0.3509,
"step": 1664
},
{
"epoch": 1.8733821046707935,
"grad_norm": 0.29290615718137913,
"learning_rate": 2.0838548185231542e-05,
"loss": 0.3448,
"step": 1665
},
{
"epoch": 1.8745075970737197,
"grad_norm": 0.2633403418767797,
"learning_rate": 2.0817688777638715e-05,
"loss": 0.3556,
"step": 1666
},
{
"epoch": 1.8756330894766462,
"grad_norm": 0.3044255909775045,
"learning_rate": 2.0796829370045892e-05,
"loss": 0.3451,
"step": 1667
},
{
"epoch": 1.8767585818795722,
"grad_norm": 0.2932864685525451,
"learning_rate": 2.077596996245307e-05,
"loss": 0.3657,
"step": 1668
},
{
"epoch": 1.8778840742824987,
"grad_norm": 0.31135509455954635,
"learning_rate": 2.0755110554860243e-05,
"loss": 0.3734,
"step": 1669
},
{
"epoch": 1.8790095666854247,
"grad_norm": 0.2664061935893102,
"learning_rate": 2.073425114726742e-05,
"loss": 0.3629,
"step": 1670
},
{
"epoch": 1.8801350590883512,
"grad_norm": 0.2707969930148503,
"learning_rate": 2.0713391739674597e-05,
"loss": 0.3483,
"step": 1671
},
{
"epoch": 1.8812605514912775,
"grad_norm": 0.2582761473461036,
"learning_rate": 2.069253233208177e-05,
"loss": 0.366,
"step": 1672
},
{
"epoch": 1.8823860438942037,
"grad_norm": 0.2818191859830275,
"learning_rate": 2.0671672924488947e-05,
"loss": 0.3606,
"step": 1673
},
{
"epoch": 1.88351153629713,
"grad_norm": 0.274907626023918,
"learning_rate": 2.065081351689612e-05,
"loss": 0.3733,
"step": 1674
},
{
"epoch": 1.8846370287000562,
"grad_norm": 0.25302448281459705,
"learning_rate": 2.0629954109303295e-05,
"loss": 0.344,
"step": 1675
},
{
"epoch": 1.8857625211029827,
"grad_norm": 0.2601145397643824,
"learning_rate": 2.060909470171047e-05,
"loss": 0.3655,
"step": 1676
},
{
"epoch": 1.8868880135059087,
"grad_norm": 0.2598011168749623,
"learning_rate": 2.058823529411765e-05,
"loss": 0.3583,
"step": 1677
},
{
"epoch": 1.8880135059088352,
"grad_norm": 0.2764045861628215,
"learning_rate": 2.0567375886524822e-05,
"loss": 0.3358,
"step": 1678
},
{
"epoch": 1.8891389983117612,
"grad_norm": 0.2505563945259788,
"learning_rate": 2.0546516478932e-05,
"loss": 0.3326,
"step": 1679
},
{
"epoch": 1.8902644907146877,
"grad_norm": 0.2593385914562438,
"learning_rate": 2.0525657071339173e-05,
"loss": 0.344,
"step": 1680
},
{
"epoch": 1.891389983117614,
"grad_norm": 0.32013539903668187,
"learning_rate": 2.050479766374635e-05,
"loss": 0.3678,
"step": 1681
},
{
"epoch": 1.8925154755205402,
"grad_norm": 0.2850992099914004,
"learning_rate": 2.0483938256153527e-05,
"loss": 0.397,
"step": 1682
},
{
"epoch": 1.8936409679234665,
"grad_norm": 0.3016034620250037,
"learning_rate": 2.04630788485607e-05,
"loss": 0.3358,
"step": 1683
},
{
"epoch": 1.8947664603263927,
"grad_norm": 0.322626269426066,
"learning_rate": 2.0442219440967877e-05,
"loss": 0.3493,
"step": 1684
},
{
"epoch": 1.8958919527293192,
"grad_norm": 0.27415129738901345,
"learning_rate": 2.0421360033375054e-05,
"loss": 0.3612,
"step": 1685
},
{
"epoch": 1.8970174451322452,
"grad_norm": 0.3202508460747489,
"learning_rate": 2.0400500625782228e-05,
"loss": 0.3449,
"step": 1686
},
{
"epoch": 1.8981429375351717,
"grad_norm": 0.2610128644172156,
"learning_rate": 2.0379641218189405e-05,
"loss": 0.334,
"step": 1687
},
{
"epoch": 1.899268429938098,
"grad_norm": 0.26431886989489495,
"learning_rate": 2.0358781810596582e-05,
"loss": 0.3701,
"step": 1688
},
{
"epoch": 1.9003939223410242,
"grad_norm": 0.32289025222752066,
"learning_rate": 2.0337922403003756e-05,
"loss": 0.3772,
"step": 1689
},
{
"epoch": 1.9015194147439505,
"grad_norm": 0.27620099175466095,
"learning_rate": 2.0317062995410932e-05,
"loss": 0.3634,
"step": 1690
},
{
"epoch": 1.9026449071468767,
"grad_norm": 0.30452855448211125,
"learning_rate": 2.029620358781811e-05,
"loss": 0.3619,
"step": 1691
},
{
"epoch": 1.9037703995498032,
"grad_norm": 0.30999319017283444,
"learning_rate": 2.0275344180225283e-05,
"loss": 0.3472,
"step": 1692
},
{
"epoch": 1.9048958919527292,
"grad_norm": 0.34073549354424293,
"learning_rate": 2.025448477263246e-05,
"loss": 0.3417,
"step": 1693
},
{
"epoch": 1.9060213843556557,
"grad_norm": 0.28162550986145274,
"learning_rate": 2.0233625365039634e-05,
"loss": 0.3536,
"step": 1694
},
{
"epoch": 1.9071468767585817,
"grad_norm": 0.3215339598711887,
"learning_rate": 2.0212765957446807e-05,
"loss": 0.3682,
"step": 1695
},
{
"epoch": 1.9082723691615082,
"grad_norm": 0.34154514944007364,
"learning_rate": 2.0191906549853984e-05,
"loss": 0.3573,
"step": 1696
},
{
"epoch": 1.9093978615644345,
"grad_norm": 0.27450876997174517,
"learning_rate": 2.017104714226116e-05,
"loss": 0.3656,
"step": 1697
},
{
"epoch": 1.9105233539673607,
"grad_norm": 0.32973694211143484,
"learning_rate": 2.0150187734668335e-05,
"loss": 0.3729,
"step": 1698
},
{
"epoch": 1.911648846370287,
"grad_norm": 0.33057591238589434,
"learning_rate": 2.0129328327075512e-05,
"loss": 0.371,
"step": 1699
},
{
"epoch": 1.9127743387732132,
"grad_norm": 0.28948186161364625,
"learning_rate": 2.0108468919482685e-05,
"loss": 0.3397,
"step": 1700
},
{
"epoch": 1.9138998311761397,
"grad_norm": 0.3007970569880779,
"learning_rate": 2.0087609511889862e-05,
"loss": 0.3643,
"step": 1701
},
{
"epoch": 1.9150253235790657,
"grad_norm": 0.2612518404162693,
"learning_rate": 2.006675010429704e-05,
"loss": 0.3532,
"step": 1702
},
{
"epoch": 1.9161508159819922,
"grad_norm": 0.31521980587085163,
"learning_rate": 2.0045890696704213e-05,
"loss": 0.3572,
"step": 1703
},
{
"epoch": 1.9172763083849182,
"grad_norm": 0.32716978204799535,
"learning_rate": 2.002503128911139e-05,
"loss": 0.3655,
"step": 1704
},
{
"epoch": 1.9184018007878447,
"grad_norm": 0.2848312721456602,
"learning_rate": 2.0004171881518567e-05,
"loss": 0.3293,
"step": 1705
},
{
"epoch": 1.919527293190771,
"grad_norm": 0.2849516222624094,
"learning_rate": 1.998331247392574e-05,
"loss": 0.3493,
"step": 1706
},
{
"epoch": 1.9206527855936972,
"grad_norm": 0.2748223750385321,
"learning_rate": 1.9962453066332917e-05,
"loss": 0.3361,
"step": 1707
},
{
"epoch": 1.9217782779966235,
"grad_norm": 0.3052533145067581,
"learning_rate": 1.9941593658740094e-05,
"loss": 0.3697,
"step": 1708
},
{
"epoch": 1.9229037703995497,
"grad_norm": 0.2819225673013518,
"learning_rate": 1.9920734251147268e-05,
"loss": 0.3598,
"step": 1709
},
{
"epoch": 1.9240292628024762,
"grad_norm": 0.28297852832083414,
"learning_rate": 1.9899874843554445e-05,
"loss": 0.3421,
"step": 1710
},
{
"epoch": 1.9251547552054022,
"grad_norm": 0.32135792331365465,
"learning_rate": 1.9879015435961622e-05,
"loss": 0.3728,
"step": 1711
},
{
"epoch": 1.9262802476083287,
"grad_norm": 0.2485116486993189,
"learning_rate": 1.9858156028368796e-05,
"loss": 0.3494,
"step": 1712
},
{
"epoch": 1.927405740011255,
"grad_norm": 0.2749683711636245,
"learning_rate": 1.9837296620775973e-05,
"loss": 0.346,
"step": 1713
},
{
"epoch": 1.9285312324141812,
"grad_norm": 0.2642179410888402,
"learning_rate": 1.9816437213183146e-05,
"loss": 0.3548,
"step": 1714
},
{
"epoch": 1.9296567248171075,
"grad_norm": 0.25158261695078715,
"learning_rate": 1.9795577805590323e-05,
"loss": 0.359,
"step": 1715
},
{
"epoch": 1.9307822172200337,
"grad_norm": 0.27223176041458313,
"learning_rate": 1.9774718397997497e-05,
"loss": 0.3414,
"step": 1716
},
{
"epoch": 1.93190770962296,
"grad_norm": 0.2782144577617854,
"learning_rate": 1.9753858990404674e-05,
"loss": 0.3561,
"step": 1717
},
{
"epoch": 1.9330332020258862,
"grad_norm": 0.27538099734788146,
"learning_rate": 1.9732999582811847e-05,
"loss": 0.3472,
"step": 1718
},
{
"epoch": 1.9341586944288127,
"grad_norm": 0.2960828119915571,
"learning_rate": 1.9712140175219024e-05,
"loss": 0.3496,
"step": 1719
},
{
"epoch": 1.9352841868317388,
"grad_norm": 0.258095045594745,
"learning_rate": 1.9691280767626198e-05,
"loss": 0.3517,
"step": 1720
},
{
"epoch": 1.9364096792346652,
"grad_norm": 0.3024256600541793,
"learning_rate": 1.9670421360033375e-05,
"loss": 0.3586,
"step": 1721
},
{
"epoch": 1.9375351716375915,
"grad_norm": 0.29098939153442666,
"learning_rate": 1.9649561952440552e-05,
"loss": 0.3643,
"step": 1722
},
{
"epoch": 1.9386606640405177,
"grad_norm": 0.25782898610022725,
"learning_rate": 1.9628702544847726e-05,
"loss": 0.367,
"step": 1723
},
{
"epoch": 1.939786156443444,
"grad_norm": 0.3495526740430891,
"learning_rate": 1.9607843137254903e-05,
"loss": 0.3577,
"step": 1724
},
{
"epoch": 1.9409116488463702,
"grad_norm": 0.2728973828660973,
"learning_rate": 1.958698372966208e-05,
"loss": 0.3554,
"step": 1725
},
{
"epoch": 1.9420371412492967,
"grad_norm": 0.2901290142358023,
"learning_rate": 1.9566124322069253e-05,
"loss": 0.37,
"step": 1726
},
{
"epoch": 1.9431626336522227,
"grad_norm": 0.3031752356222974,
"learning_rate": 1.954526491447643e-05,
"loss": 0.3638,
"step": 1727
},
{
"epoch": 1.9442881260551492,
"grad_norm": 0.260909753207997,
"learning_rate": 1.9524405506883607e-05,
"loss": 0.3618,
"step": 1728
},
{
"epoch": 1.9454136184580753,
"grad_norm": 0.28948350014768964,
"learning_rate": 1.950354609929078e-05,
"loss": 0.3401,
"step": 1729
},
{
"epoch": 1.9465391108610017,
"grad_norm": 0.2623446580726307,
"learning_rate": 1.9482686691697958e-05,
"loss": 0.3618,
"step": 1730
},
{
"epoch": 1.947664603263928,
"grad_norm": 0.2666588748626957,
"learning_rate": 1.9461827284105135e-05,
"loss": 0.3424,
"step": 1731
},
{
"epoch": 1.9487900956668542,
"grad_norm": 0.23758227129892492,
"learning_rate": 1.9440967876512308e-05,
"loss": 0.3647,
"step": 1732
},
{
"epoch": 1.9499155880697805,
"grad_norm": 1.0235070433552647,
"learning_rate": 1.9420108468919485e-05,
"loss": 0.362,
"step": 1733
},
{
"epoch": 1.9510410804727067,
"grad_norm": 0.28481161066229677,
"learning_rate": 1.939924906132666e-05,
"loss": 0.3631,
"step": 1734
},
{
"epoch": 1.9521665728756332,
"grad_norm": 0.2848122389618838,
"learning_rate": 1.9378389653733836e-05,
"loss": 0.3469,
"step": 1735
},
{
"epoch": 1.9532920652785593,
"grad_norm": 0.2759014719515173,
"learning_rate": 1.9357530246141013e-05,
"loss": 0.3425,
"step": 1736
},
{
"epoch": 1.9544175576814857,
"grad_norm": 0.27874949300316715,
"learning_rate": 1.9336670838548186e-05,
"loss": 0.3855,
"step": 1737
},
{
"epoch": 1.955543050084412,
"grad_norm": 0.31363642679753656,
"learning_rate": 1.931581143095536e-05,
"loss": 0.3536,
"step": 1738
},
{
"epoch": 1.9566685424873382,
"grad_norm": 0.2556224324207228,
"learning_rate": 1.9294952023362537e-05,
"loss": 0.3432,
"step": 1739
},
{
"epoch": 1.9577940348902645,
"grad_norm": 0.2670888092453423,
"learning_rate": 1.927409261576971e-05,
"loss": 0.3509,
"step": 1740
},
{
"epoch": 1.9589195272931907,
"grad_norm": 0.25001267900165874,
"learning_rate": 1.9253233208176888e-05,
"loss": 0.3323,
"step": 1741
},
{
"epoch": 1.960045019696117,
"grad_norm": 0.2974207872384669,
"learning_rate": 1.9232373800584064e-05,
"loss": 0.3544,
"step": 1742
},
{
"epoch": 1.9611705120990433,
"grad_norm": 0.27472747483190185,
"learning_rate": 1.9211514392991238e-05,
"loss": 0.3521,
"step": 1743
},
{
"epoch": 1.9622960045019697,
"grad_norm": 0.2683475797146492,
"learning_rate": 1.9190654985398415e-05,
"loss": 0.3682,
"step": 1744
},
{
"epoch": 1.9634214969048958,
"grad_norm": 0.3822465905741808,
"learning_rate": 1.9169795577805592e-05,
"loss": 0.3535,
"step": 1745
},
{
"epoch": 1.9645469893078222,
"grad_norm": 0.29811948702966473,
"learning_rate": 1.9148936170212766e-05,
"loss": 0.3732,
"step": 1746
},
{
"epoch": 1.9656724817107485,
"grad_norm": 0.30142259657958986,
"learning_rate": 1.9128076762619943e-05,
"loss": 0.3432,
"step": 1747
},
{
"epoch": 1.9667979741136747,
"grad_norm": 0.3786818892770981,
"learning_rate": 1.910721735502712e-05,
"loss": 0.3654,
"step": 1748
},
{
"epoch": 1.967923466516601,
"grad_norm": 0.27029496791481233,
"learning_rate": 1.9086357947434293e-05,
"loss": 0.3682,
"step": 1749
},
{
"epoch": 1.9690489589195272,
"grad_norm": 0.262798379544428,
"learning_rate": 1.906549853984147e-05,
"loss": 0.3568,
"step": 1750
},
{
"epoch": 1.9701744513224537,
"grad_norm": 0.3135712581670641,
"learning_rate": 1.9044639132248647e-05,
"loss": 0.3511,
"step": 1751
},
{
"epoch": 1.9712999437253798,
"grad_norm": 0.3158145619580369,
"learning_rate": 1.902377972465582e-05,
"loss": 0.36,
"step": 1752
},
{
"epoch": 1.9724254361283062,
"grad_norm": 0.318706113946463,
"learning_rate": 1.9002920317062998e-05,
"loss": 0.3562,
"step": 1753
},
{
"epoch": 1.9735509285312323,
"grad_norm": 0.310806681735437,
"learning_rate": 1.898206090947017e-05,
"loss": 0.3514,
"step": 1754
},
{
"epoch": 1.9746764209341587,
"grad_norm": 0.2849866940009224,
"learning_rate": 1.896120150187735e-05,
"loss": 0.3477,
"step": 1755
},
{
"epoch": 1.975801913337085,
"grad_norm": 0.2810634482745697,
"learning_rate": 1.8940342094284525e-05,
"loss": 0.3644,
"step": 1756
},
{
"epoch": 1.9769274057400112,
"grad_norm": 0.2878137639733897,
"learning_rate": 1.89194826866917e-05,
"loss": 0.3594,
"step": 1757
},
{
"epoch": 1.9780528981429375,
"grad_norm": 0.26057909113445293,
"learning_rate": 1.8898623279098873e-05,
"loss": 0.3755,
"step": 1758
},
{
"epoch": 1.9791783905458638,
"grad_norm": 0.27092989925442396,
"learning_rate": 1.887776387150605e-05,
"loss": 0.3648,
"step": 1759
},
{
"epoch": 1.9803038829487902,
"grad_norm": 0.2845108154959281,
"learning_rate": 1.8856904463913223e-05,
"loss": 0.3449,
"step": 1760
},
{
"epoch": 1.9814293753517163,
"grad_norm": 0.24467445189735315,
"learning_rate": 1.88360450563204e-05,
"loss": 0.3558,
"step": 1761
},
{
"epoch": 1.9825548677546427,
"grad_norm": 0.2715643743977259,
"learning_rate": 1.8815185648727577e-05,
"loss": 0.3567,
"step": 1762
},
{
"epoch": 1.983680360157569,
"grad_norm": 0.2613996036084293,
"learning_rate": 1.879432624113475e-05,
"loss": 0.3467,
"step": 1763
},
{
"epoch": 1.9848058525604952,
"grad_norm": 0.2816357872833296,
"learning_rate": 1.8773466833541928e-05,
"loss": 0.3417,
"step": 1764
},
{
"epoch": 1.9859313449634215,
"grad_norm": 0.29529698315579805,
"learning_rate": 1.8752607425949105e-05,
"loss": 0.3527,
"step": 1765
},
{
"epoch": 1.9870568373663478,
"grad_norm": 0.27238727861070106,
"learning_rate": 1.8731748018356278e-05,
"loss": 0.3717,
"step": 1766
},
{
"epoch": 1.988182329769274,
"grad_norm": 0.27577156414013015,
"learning_rate": 1.8710888610763455e-05,
"loss": 0.3632,
"step": 1767
},
{
"epoch": 1.9893078221722003,
"grad_norm": 0.31287278872365587,
"learning_rate": 1.8690029203170632e-05,
"loss": 0.3678,
"step": 1768
},
{
"epoch": 1.9904333145751267,
"grad_norm": 0.2910024485455243,
"learning_rate": 1.8669169795577806e-05,
"loss": 0.3661,
"step": 1769
},
{
"epoch": 1.9915588069780528,
"grad_norm": 0.29522751930001573,
"learning_rate": 1.8648310387984983e-05,
"loss": 0.3733,
"step": 1770
},
{
"epoch": 1.9926842993809792,
"grad_norm": 0.2931943333929543,
"learning_rate": 1.862745098039216e-05,
"loss": 0.3554,
"step": 1771
},
{
"epoch": 1.9938097917839055,
"grad_norm": 0.29961502454826516,
"learning_rate": 1.8606591572799333e-05,
"loss": 0.3534,
"step": 1772
},
{
"epoch": 1.9949352841868317,
"grad_norm": 0.3016308875068367,
"learning_rate": 1.858573216520651e-05,
"loss": 0.3868,
"step": 1773
},
{
"epoch": 1.996060776589758,
"grad_norm": 0.3051815491365933,
"learning_rate": 1.8564872757613684e-05,
"loss": 0.3573,
"step": 1774
},
{
"epoch": 1.9971862689926843,
"grad_norm": 0.3463472368237023,
"learning_rate": 1.854401335002086e-05,
"loss": 0.3568,
"step": 1775
},
{
"epoch": 1.9983117613956107,
"grad_norm": 0.30250184431483823,
"learning_rate": 1.8523153942428038e-05,
"loss": 0.3679,
"step": 1776
},
{
"epoch": 1.9994372537985368,
"grad_norm": 0.27076812267453526,
"learning_rate": 1.850229453483521e-05,
"loss": 0.359,
"step": 1777
},
{
"epoch": 2.0,
"grad_norm": 0.41375343672066456,
"learning_rate": 1.848143512724239e-05,
"loss": 0.3219,
"step": 1778
},
{
"epoch": 2.0011254924029265,
"grad_norm": 0.30690460412174675,
"learning_rate": 1.8460575719649562e-05,
"loss": 0.2847,
"step": 1779
},
{
"epoch": 2.0022509848058525,
"grad_norm": 0.29252851536455965,
"learning_rate": 1.8439716312056736e-05,
"loss": 0.2916,
"step": 1780
},
{
"epoch": 2.003376477208779,
"grad_norm": 0.2867585999652241,
"learning_rate": 1.8418856904463913e-05,
"loss": 0.304,
"step": 1781
},
{
"epoch": 2.004501969611705,
"grad_norm": 0.3147976773039966,
"learning_rate": 1.839799749687109e-05,
"loss": 0.2891,
"step": 1782
},
{
"epoch": 2.0056274620146315,
"grad_norm": 0.2441846828289504,
"learning_rate": 1.8377138089278263e-05,
"loss": 0.2909,
"step": 1783
},
{
"epoch": 2.0067529544175575,
"grad_norm": 0.2593896216365388,
"learning_rate": 1.835627868168544e-05,
"loss": 0.2753,
"step": 1784
},
{
"epoch": 2.007878446820484,
"grad_norm": 0.2893905461877493,
"learning_rate": 1.8335419274092617e-05,
"loss": 0.2818,
"step": 1785
},
{
"epoch": 2.00900393922341,
"grad_norm": 0.2846562929483248,
"learning_rate": 1.831455986649979e-05,
"loss": 0.2907,
"step": 1786
},
{
"epoch": 2.0101294316263365,
"grad_norm": 0.2566724532797832,
"learning_rate": 1.8293700458906968e-05,
"loss": 0.2865,
"step": 1787
},
{
"epoch": 2.011254924029263,
"grad_norm": 0.30986557763389416,
"learning_rate": 1.8272841051314145e-05,
"loss": 0.299,
"step": 1788
},
{
"epoch": 2.012380416432189,
"grad_norm": 0.2790346837879426,
"learning_rate": 1.825198164372132e-05,
"loss": 0.2872,
"step": 1789
},
{
"epoch": 2.0135059088351155,
"grad_norm": 0.28965248515971675,
"learning_rate": 1.8231122236128495e-05,
"loss": 0.282,
"step": 1790
},
{
"epoch": 2.0146314012380415,
"grad_norm": 0.26758447999158064,
"learning_rate": 1.8210262828535672e-05,
"loss": 0.2854,
"step": 1791
},
{
"epoch": 2.015756893640968,
"grad_norm": 0.25752829835015667,
"learning_rate": 1.8189403420942846e-05,
"loss": 0.2875,
"step": 1792
},
{
"epoch": 2.016882386043894,
"grad_norm": 0.26237094621373575,
"learning_rate": 1.8168544013350023e-05,
"loss": 0.2861,
"step": 1793
},
{
"epoch": 2.0180078784468205,
"grad_norm": 0.25324822624548066,
"learning_rate": 1.8147684605757196e-05,
"loss": 0.2804,
"step": 1794
},
{
"epoch": 2.019133370849747,
"grad_norm": 0.27650509711437854,
"learning_rate": 1.8126825198164373e-05,
"loss": 0.298,
"step": 1795
},
{
"epoch": 2.020258863252673,
"grad_norm": 0.271607108362916,
"learning_rate": 1.810596579057155e-05,
"loss": 0.2793,
"step": 1796
},
{
"epoch": 2.0213843556555995,
"grad_norm": 0.2763902863245182,
"learning_rate": 1.8085106382978724e-05,
"loss": 0.2824,
"step": 1797
},
{
"epoch": 2.0225098480585255,
"grad_norm": 0.29074430245042243,
"learning_rate": 1.80642469753859e-05,
"loss": 0.2847,
"step": 1798
},
{
"epoch": 2.023635340461452,
"grad_norm": 0.252760394282513,
"learning_rate": 1.8043387567793075e-05,
"loss": 0.2729,
"step": 1799
},
{
"epoch": 2.024760832864378,
"grad_norm": 0.25115826895976634,
"learning_rate": 1.8022528160200248e-05,
"loss": 0.2903,
"step": 1800
},
{
"epoch": 2.0258863252673045,
"grad_norm": 0.31665556656306054,
"learning_rate": 1.8001668752607425e-05,
"loss": 0.2806,
"step": 1801
},
{
"epoch": 2.0270118176702305,
"grad_norm": 0.27565102328032076,
"learning_rate": 1.7980809345014602e-05,
"loss": 0.2781,
"step": 1802
},
{
"epoch": 2.028137310073157,
"grad_norm": 0.26334129144996565,
"learning_rate": 1.7959949937421776e-05,
"loss": 0.2865,
"step": 1803
},
{
"epoch": 2.0292628024760835,
"grad_norm": 0.29084203177119927,
"learning_rate": 1.7939090529828953e-05,
"loss": 0.2915,
"step": 1804
},
{
"epoch": 2.0303882948790095,
"grad_norm": 0.24821063662817036,
"learning_rate": 1.791823112223613e-05,
"loss": 0.2784,
"step": 1805
},
{
"epoch": 2.031513787281936,
"grad_norm": 0.2550931735301453,
"learning_rate": 1.7897371714643303e-05,
"loss": 0.2836,
"step": 1806
},
{
"epoch": 2.032639279684862,
"grad_norm": 0.27634727649104684,
"learning_rate": 1.787651230705048e-05,
"loss": 0.3069,
"step": 1807
},
{
"epoch": 2.0337647720877885,
"grad_norm": 0.24014034990048097,
"learning_rate": 1.7855652899457657e-05,
"loss": 0.2858,
"step": 1808
},
{
"epoch": 2.0348902644907145,
"grad_norm": 0.23529224395747875,
"learning_rate": 1.783479349186483e-05,
"loss": 0.292,
"step": 1809
},
{
"epoch": 2.036015756893641,
"grad_norm": 0.2226918871531934,
"learning_rate": 1.7813934084272008e-05,
"loss": 0.289,
"step": 1810
},
{
"epoch": 2.037141249296567,
"grad_norm": 0.24875514083553227,
"learning_rate": 1.7793074676679185e-05,
"loss": 0.2879,
"step": 1811
},
{
"epoch": 2.0382667416994935,
"grad_norm": 0.22101380287283037,
"learning_rate": 1.777221526908636e-05,
"loss": 0.2785,
"step": 1812
},
{
"epoch": 2.03939223410242,
"grad_norm": 0.24344041835452335,
"learning_rate": 1.7751355861493535e-05,
"loss": 0.2768,
"step": 1813
},
{
"epoch": 2.040517726505346,
"grad_norm": 0.24709305555007302,
"learning_rate": 1.773049645390071e-05,
"loss": 0.2785,
"step": 1814
},
{
"epoch": 2.0416432189082725,
"grad_norm": 0.23036508957897686,
"learning_rate": 1.7709637046307886e-05,
"loss": 0.2829,
"step": 1815
},
{
"epoch": 2.0427687113111985,
"grad_norm": 0.304777104086667,
"learning_rate": 1.7688777638715063e-05,
"loss": 0.2842,
"step": 1816
},
{
"epoch": 2.043894203714125,
"grad_norm": 0.24563307593084063,
"learning_rate": 1.7667918231122237e-05,
"loss": 0.2842,
"step": 1817
},
{
"epoch": 2.045019696117051,
"grad_norm": 0.24049156495572827,
"learning_rate": 1.7647058823529414e-05,
"loss": 0.29,
"step": 1818
},
{
"epoch": 2.0461451885199775,
"grad_norm": 0.26152397114334225,
"learning_rate": 1.762619941593659e-05,
"loss": 0.2943,
"step": 1819
},
{
"epoch": 2.047270680922904,
"grad_norm": 0.24701566468961217,
"learning_rate": 1.7605340008343764e-05,
"loss": 0.28,
"step": 1820
},
{
"epoch": 2.04839617332583,
"grad_norm": 0.22113320376779072,
"learning_rate": 1.7584480600750938e-05,
"loss": 0.2824,
"step": 1821
},
{
"epoch": 2.0495216657287565,
"grad_norm": 0.2498303273769485,
"learning_rate": 1.7563621193158115e-05,
"loss": 0.2764,
"step": 1822
},
{
"epoch": 2.0506471581316825,
"grad_norm": 0.2613079367123678,
"learning_rate": 1.754276178556529e-05,
"loss": 0.3029,
"step": 1823
},
{
"epoch": 2.051772650534609,
"grad_norm": 0.2533549657170249,
"learning_rate": 1.7521902377972465e-05,
"loss": 0.2941,
"step": 1824
},
{
"epoch": 2.052898142937535,
"grad_norm": 0.24525113996522538,
"learning_rate": 1.7501042970379642e-05,
"loss": 0.2791,
"step": 1825
},
{
"epoch": 2.0540236353404615,
"grad_norm": 0.22636672236346222,
"learning_rate": 1.7480183562786816e-05,
"loss": 0.2708,
"step": 1826
},
{
"epoch": 2.0551491277433875,
"grad_norm": 0.2318404892918077,
"learning_rate": 1.7459324155193993e-05,
"loss": 0.2831,
"step": 1827
},
{
"epoch": 2.056274620146314,
"grad_norm": 0.22908482292345286,
"learning_rate": 1.743846474760117e-05,
"loss": 0.2791,
"step": 1828
},
{
"epoch": 2.0574001125492405,
"grad_norm": 0.23199016490767796,
"learning_rate": 1.7417605340008343e-05,
"loss": 0.2899,
"step": 1829
},
{
"epoch": 2.0585256049521665,
"grad_norm": 0.22679432927238993,
"learning_rate": 1.739674593241552e-05,
"loss": 0.2705,
"step": 1830
},
{
"epoch": 2.059651097355093,
"grad_norm": 0.240936280203786,
"learning_rate": 1.7375886524822697e-05,
"loss": 0.2796,
"step": 1831
},
{
"epoch": 2.060776589758019,
"grad_norm": 0.23052791316981805,
"learning_rate": 1.735502711722987e-05,
"loss": 0.298,
"step": 1832
},
{
"epoch": 2.0619020821609455,
"grad_norm": 0.22399826316835342,
"learning_rate": 1.7334167709637048e-05,
"loss": 0.2768,
"step": 1833
},
{
"epoch": 2.0630275745638715,
"grad_norm": 0.24389711598920422,
"learning_rate": 1.731330830204422e-05,
"loss": 0.2789,
"step": 1834
},
{
"epoch": 2.064153066966798,
"grad_norm": 0.24531794065173357,
"learning_rate": 1.72924488944514e-05,
"loss": 0.2841,
"step": 1835
},
{
"epoch": 2.065278559369724,
"grad_norm": 0.2857308138585535,
"learning_rate": 1.7271589486858576e-05,
"loss": 0.2746,
"step": 1836
},
{
"epoch": 2.0664040517726505,
"grad_norm": 0.2331548964731216,
"learning_rate": 1.725073007926575e-05,
"loss": 0.2779,
"step": 1837
},
{
"epoch": 2.067529544175577,
"grad_norm": 0.23649426513105923,
"learning_rate": 1.7229870671672926e-05,
"loss": 0.2913,
"step": 1838
},
{
"epoch": 2.068655036578503,
"grad_norm": 0.2777841981435879,
"learning_rate": 1.7209011264080103e-05,
"loss": 0.2826,
"step": 1839
},
{
"epoch": 2.0697805289814295,
"grad_norm": 0.27066327686914066,
"learning_rate": 1.7188151856487277e-05,
"loss": 0.2893,
"step": 1840
},
{
"epoch": 2.0709060213843555,
"grad_norm": 0.23134899380353294,
"learning_rate": 1.716729244889445e-05,
"loss": 0.2804,
"step": 1841
},
{
"epoch": 2.072031513787282,
"grad_norm": 0.29223852513335047,
"learning_rate": 1.7146433041301627e-05,
"loss": 0.2916,
"step": 1842
},
{
"epoch": 2.073157006190208,
"grad_norm": 0.2735960908953659,
"learning_rate": 1.71255736337088e-05,
"loss": 0.2852,
"step": 1843
},
{
"epoch": 2.0742824985931345,
"grad_norm": 0.26821528502754455,
"learning_rate": 1.7104714226115978e-05,
"loss": 0.2891,
"step": 1844
},
{
"epoch": 2.0754079909960605,
"grad_norm": 0.26154260311021144,
"learning_rate": 1.7083854818523155e-05,
"loss": 0.2875,
"step": 1845
},
{
"epoch": 2.076533483398987,
"grad_norm": 0.31021830521974225,
"learning_rate": 1.706299541093033e-05,
"loss": 0.2776,
"step": 1846
},
{
"epoch": 2.0776589758019135,
"grad_norm": 0.2788988641156972,
"learning_rate": 1.7042136003337505e-05,
"loss": 0.2886,
"step": 1847
},
{
"epoch": 2.0787844682048395,
"grad_norm": 0.2907858072020635,
"learning_rate": 1.7021276595744682e-05,
"loss": 0.2895,
"step": 1848
},
{
"epoch": 2.079909960607766,
"grad_norm": 0.2542410475178318,
"learning_rate": 1.7000417188151856e-05,
"loss": 0.2856,
"step": 1849
},
{
"epoch": 2.081035453010692,
"grad_norm": 0.24197984345301113,
"learning_rate": 1.6979557780559033e-05,
"loss": 0.2824,
"step": 1850
},
{
"epoch": 2.0821609454136185,
"grad_norm": 0.2557692899387776,
"learning_rate": 1.695869837296621e-05,
"loss": 0.2909,
"step": 1851
},
{
"epoch": 2.0832864378165445,
"grad_norm": 0.23793678801447735,
"learning_rate": 1.6937838965373384e-05,
"loss": 0.2689,
"step": 1852
},
{
"epoch": 2.084411930219471,
"grad_norm": 0.29107842473943085,
"learning_rate": 1.691697955778056e-05,
"loss": 0.284,
"step": 1853
},
{
"epoch": 2.0855374226223975,
"grad_norm": 0.24607914318213508,
"learning_rate": 1.6896120150187734e-05,
"loss": 0.2957,
"step": 1854
},
{
"epoch": 2.0866629150253235,
"grad_norm": 0.21651709890692455,
"learning_rate": 1.687526074259491e-05,
"loss": 0.2677,
"step": 1855
},
{
"epoch": 2.08778840742825,
"grad_norm": 0.22707602957596063,
"learning_rate": 1.6854401335002088e-05,
"loss": 0.2854,
"step": 1856
},
{
"epoch": 2.088913899831176,
"grad_norm": 0.24846772345755247,
"learning_rate": 1.6833541927409262e-05,
"loss": 0.2679,
"step": 1857
},
{
"epoch": 2.0900393922341025,
"grad_norm": 0.27573122817807033,
"learning_rate": 1.681268251981644e-05,
"loss": 0.3007,
"step": 1858
},
{
"epoch": 2.0911648846370285,
"grad_norm": 0.23927598344656173,
"learning_rate": 1.6791823112223616e-05,
"loss": 0.2754,
"step": 1859
},
{
"epoch": 2.092290377039955,
"grad_norm": 0.23518656387715997,
"learning_rate": 1.677096370463079e-05,
"loss": 0.2766,
"step": 1860
},
{
"epoch": 2.093415869442881,
"grad_norm": 0.24448942505615562,
"learning_rate": 1.6750104297037966e-05,
"loss": 0.2913,
"step": 1861
},
{
"epoch": 2.0945413618458075,
"grad_norm": 0.2336572648593039,
"learning_rate": 1.672924488944514e-05,
"loss": 0.2942,
"step": 1862
},
{
"epoch": 2.095666854248734,
"grad_norm": 0.22716116914003923,
"learning_rate": 1.6708385481852313e-05,
"loss": 0.2881,
"step": 1863
},
{
"epoch": 2.09679234665166,
"grad_norm": 0.2849566981299875,
"learning_rate": 1.668752607425949e-05,
"loss": 0.2805,
"step": 1864
},
{
"epoch": 2.0979178390545865,
"grad_norm": 0.21858945358126292,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.2807,
"step": 1865
},
{
"epoch": 2.0990433314575125,
"grad_norm": 0.23697889851760048,
"learning_rate": 1.664580725907384e-05,
"loss": 0.2791,
"step": 1866
},
{
"epoch": 2.100168823860439,
"grad_norm": 0.23126363606688877,
"learning_rate": 1.6624947851481018e-05,
"loss": 0.2878,
"step": 1867
},
{
"epoch": 2.101294316263365,
"grad_norm": 0.2651888938021785,
"learning_rate": 1.6604088443888195e-05,
"loss": 0.2859,
"step": 1868
},
{
"epoch": 2.1024198086662915,
"grad_norm": 0.23270730320241492,
"learning_rate": 1.658322903629537e-05,
"loss": 0.296,
"step": 1869
},
{
"epoch": 2.103545301069218,
"grad_norm": 0.3042844752542814,
"learning_rate": 1.6562369628702546e-05,
"loss": 0.3132,
"step": 1870
},
{
"epoch": 2.104670793472144,
"grad_norm": 0.24339368794150576,
"learning_rate": 1.6541510221109723e-05,
"loss": 0.2766,
"step": 1871
},
{
"epoch": 2.1057962858750705,
"grad_norm": 0.26285249575918274,
"learning_rate": 1.6520650813516896e-05,
"loss": 0.285,
"step": 1872
},
{
"epoch": 2.1069217782779965,
"grad_norm": 0.25652733532840694,
"learning_rate": 1.6499791405924073e-05,
"loss": 0.294,
"step": 1873
},
{
"epoch": 2.108047270680923,
"grad_norm": 0.2543828171392836,
"learning_rate": 1.6478931998331247e-05,
"loss": 0.2746,
"step": 1874
},
{
"epoch": 2.109172763083849,
"grad_norm": 0.2352973105601401,
"learning_rate": 1.6458072590738424e-05,
"loss": 0.2743,
"step": 1875
},
{
"epoch": 2.1102982554867755,
"grad_norm": 0.25036103571955426,
"learning_rate": 1.64372131831456e-05,
"loss": 0.2909,
"step": 1876
},
{
"epoch": 2.1114237478897016,
"grad_norm": 0.30189436640559725,
"learning_rate": 1.6416353775552774e-05,
"loss": 0.2969,
"step": 1877
},
{
"epoch": 2.112549240292628,
"grad_norm": 0.24769249438614963,
"learning_rate": 1.639549436795995e-05,
"loss": 0.282,
"step": 1878
},
{
"epoch": 2.1136747326955545,
"grad_norm": 0.25738252269107714,
"learning_rate": 1.6374634960367128e-05,
"loss": 0.2987,
"step": 1879
},
{
"epoch": 2.1148002250984805,
"grad_norm": 0.3340073534477609,
"learning_rate": 1.6353775552774302e-05,
"loss": 0.2962,
"step": 1880
},
{
"epoch": 2.115925717501407,
"grad_norm": 0.2641953651128952,
"learning_rate": 1.633291614518148e-05,
"loss": 0.2827,
"step": 1881
},
{
"epoch": 2.117051209904333,
"grad_norm": 0.24068009657515393,
"learning_rate": 1.6312056737588656e-05,
"loss": 0.2795,
"step": 1882
},
{
"epoch": 2.1181767023072595,
"grad_norm": 0.2507429416561855,
"learning_rate": 1.6291197329995826e-05,
"loss": 0.285,
"step": 1883
},
{
"epoch": 2.1193021947101855,
"grad_norm": 0.2533889963610055,
"learning_rate": 1.6270337922403003e-05,
"loss": 0.2941,
"step": 1884
},
{
"epoch": 2.120427687113112,
"grad_norm": 0.24688775417233455,
"learning_rate": 1.624947851481018e-05,
"loss": 0.3012,
"step": 1885
},
{
"epoch": 2.121553179516038,
"grad_norm": 0.24915605314972172,
"learning_rate": 1.6228619107217354e-05,
"loss": 0.2993,
"step": 1886
},
{
"epoch": 2.1226786719189645,
"grad_norm": 0.279903262549959,
"learning_rate": 1.620775969962453e-05,
"loss": 0.2834,
"step": 1887
},
{
"epoch": 2.123804164321891,
"grad_norm": 0.23182376171306077,
"learning_rate": 1.6186900292031708e-05,
"loss": 0.2916,
"step": 1888
},
{
"epoch": 2.124929656724817,
"grad_norm": 0.23000197495504152,
"learning_rate": 1.616604088443888e-05,
"loss": 0.2747,
"step": 1889
},
{
"epoch": 2.1260551491277435,
"grad_norm": 0.23823286526237422,
"learning_rate": 1.6145181476846058e-05,
"loss": 0.2781,
"step": 1890
},
{
"epoch": 2.1271806415306695,
"grad_norm": 0.21627846743690535,
"learning_rate": 1.6124322069253235e-05,
"loss": 0.2706,
"step": 1891
},
{
"epoch": 2.128306133933596,
"grad_norm": 0.23892946673013887,
"learning_rate": 1.610346266166041e-05,
"loss": 0.2914,
"step": 1892
},
{
"epoch": 2.129431626336522,
"grad_norm": 0.2656649096475775,
"learning_rate": 1.6082603254067586e-05,
"loss": 0.2866,
"step": 1893
},
{
"epoch": 2.1305571187394485,
"grad_norm": 0.2227237824020542,
"learning_rate": 1.6061743846474763e-05,
"loss": 0.2625,
"step": 1894
},
{
"epoch": 2.1316826111423746,
"grad_norm": 0.23235111483198348,
"learning_rate": 1.6040884438881936e-05,
"loss": 0.2859,
"step": 1895
},
{
"epoch": 2.132808103545301,
"grad_norm": 0.25920239106869064,
"learning_rate": 1.6020025031289113e-05,
"loss": 0.2959,
"step": 1896
},
{
"epoch": 2.1339335959482275,
"grad_norm": 0.23719185530213213,
"learning_rate": 1.5999165623696287e-05,
"loss": 0.2897,
"step": 1897
},
{
"epoch": 2.1350590883511535,
"grad_norm": 0.22876937310915393,
"learning_rate": 1.5978306216103464e-05,
"loss": 0.2788,
"step": 1898
},
{
"epoch": 2.13618458075408,
"grad_norm": 0.26616238576961354,
"learning_rate": 1.595744680851064e-05,
"loss": 0.2889,
"step": 1899
},
{
"epoch": 2.137310073157006,
"grad_norm": 0.2166404539813475,
"learning_rate": 1.5936587400917814e-05,
"loss": 0.282,
"step": 1900
},
{
"epoch": 2.1384355655599325,
"grad_norm": 0.23700101129905038,
"learning_rate": 1.591572799332499e-05,
"loss": 0.2968,
"step": 1901
},
{
"epoch": 2.1395610579628586,
"grad_norm": 0.2285745331225241,
"learning_rate": 1.589486858573217e-05,
"loss": 0.2841,
"step": 1902
},
{
"epoch": 2.140686550365785,
"grad_norm": 0.23783838496188303,
"learning_rate": 1.5874009178139342e-05,
"loss": 0.2909,
"step": 1903
},
{
"epoch": 2.1418120427687115,
"grad_norm": 0.23082103720915573,
"learning_rate": 1.5853149770546516e-05,
"loss": 0.2824,
"step": 1904
},
{
"epoch": 2.1429375351716375,
"grad_norm": 0.25094828821607146,
"learning_rate": 1.5832290362953693e-05,
"loss": 0.285,
"step": 1905
},
{
"epoch": 2.144063027574564,
"grad_norm": 0.22431109979899386,
"learning_rate": 1.5811430955360866e-05,
"loss": 0.2737,
"step": 1906
},
{
"epoch": 2.14518851997749,
"grad_norm": 0.22492379294000237,
"learning_rate": 1.5790571547768043e-05,
"loss": 0.2726,
"step": 1907
},
{
"epoch": 2.1463140123804165,
"grad_norm": 0.2314053442523269,
"learning_rate": 1.576971214017522e-05,
"loss": 0.2754,
"step": 1908
},
{
"epoch": 2.1474395047833426,
"grad_norm": 0.24673230264182605,
"learning_rate": 1.5748852732582394e-05,
"loss": 0.2921,
"step": 1909
},
{
"epoch": 2.148564997186269,
"grad_norm": 0.23606707383444092,
"learning_rate": 1.572799332498957e-05,
"loss": 0.2804,
"step": 1910
},
{
"epoch": 2.1496904895891955,
"grad_norm": 0.235724127482375,
"learning_rate": 1.5707133917396748e-05,
"loss": 0.2861,
"step": 1911
},
{
"epoch": 2.1508159819921215,
"grad_norm": 0.24483607505245927,
"learning_rate": 1.568627450980392e-05,
"loss": 0.2878,
"step": 1912
},
{
"epoch": 2.151941474395048,
"grad_norm": 0.2552535556772291,
"learning_rate": 1.56654151022111e-05,
"loss": 0.2857,
"step": 1913
},
{
"epoch": 2.153066966797974,
"grad_norm": 0.22983484882907804,
"learning_rate": 1.5644555694618275e-05,
"loss": 0.2872,
"step": 1914
},
{
"epoch": 2.1541924592009005,
"grad_norm": 0.25772716013553465,
"learning_rate": 1.562369628702545e-05,
"loss": 0.2959,
"step": 1915
},
{
"epoch": 2.1553179516038266,
"grad_norm": 0.29415752414459184,
"learning_rate": 1.5602836879432626e-05,
"loss": 0.2909,
"step": 1916
},
{
"epoch": 2.156443444006753,
"grad_norm": 0.2564449243204837,
"learning_rate": 1.55819774718398e-05,
"loss": 0.3047,
"step": 1917
},
{
"epoch": 2.157568936409679,
"grad_norm": 0.2357261136445965,
"learning_rate": 1.5561118064246976e-05,
"loss": 0.2891,
"step": 1918
},
{
"epoch": 2.1586944288126055,
"grad_norm": 0.23236268840383198,
"learning_rate": 1.5540258656654153e-05,
"loss": 0.2936,
"step": 1919
},
{
"epoch": 2.159819921215532,
"grad_norm": 0.2526992651991741,
"learning_rate": 1.5519399249061327e-05,
"loss": 0.279,
"step": 1920
},
{
"epoch": 2.160945413618458,
"grad_norm": 0.26601612523224494,
"learning_rate": 1.5498539841468504e-05,
"loss": 0.2805,
"step": 1921
},
{
"epoch": 2.1620709060213845,
"grad_norm": 0.23000298824921414,
"learning_rate": 1.547768043387568e-05,
"loss": 0.2837,
"step": 1922
},
{
"epoch": 2.1631963984243106,
"grad_norm": 0.24154706706349996,
"learning_rate": 1.5456821026282855e-05,
"loss": 0.2846,
"step": 1923
},
{
"epoch": 2.164321890827237,
"grad_norm": 0.2360397761989054,
"learning_rate": 1.543596161869003e-05,
"loss": 0.283,
"step": 1924
},
{
"epoch": 2.165447383230163,
"grad_norm": 0.2218092507800359,
"learning_rate": 1.5415102211097205e-05,
"loss": 0.2771,
"step": 1925
},
{
"epoch": 2.1665728756330895,
"grad_norm": 0.2386052630849636,
"learning_rate": 1.539424280350438e-05,
"loss": 0.2924,
"step": 1926
},
{
"epoch": 2.1676983680360156,
"grad_norm": 0.24947895655376598,
"learning_rate": 1.5373383395911556e-05,
"loss": 0.2925,
"step": 1927
},
{
"epoch": 2.168823860438942,
"grad_norm": 0.2800300772115473,
"learning_rate": 1.5352523988318733e-05,
"loss": 0.2931,
"step": 1928
},
{
"epoch": 2.1699493528418685,
"grad_norm": 0.22636221415787847,
"learning_rate": 1.5331664580725906e-05,
"loss": 0.2875,
"step": 1929
},
{
"epoch": 2.1710748452447945,
"grad_norm": 0.24386646248262941,
"learning_rate": 1.5310805173133083e-05,
"loss": 0.2905,
"step": 1930
},
{
"epoch": 2.172200337647721,
"grad_norm": 0.22365055654311475,
"learning_rate": 1.528994576554026e-05,
"loss": 0.2802,
"step": 1931
},
{
"epoch": 2.173325830050647,
"grad_norm": 0.25602509803802304,
"learning_rate": 1.5269086357947434e-05,
"loss": 0.2724,
"step": 1932
},
{
"epoch": 2.1744513224535735,
"grad_norm": 0.20551321646228457,
"learning_rate": 1.5248226950354611e-05,
"loss": 0.2721,
"step": 1933
},
{
"epoch": 2.1755768148564996,
"grad_norm": 0.22807897179549413,
"learning_rate": 1.5227367542761786e-05,
"loss": 0.2892,
"step": 1934
},
{
"epoch": 2.176702307259426,
"grad_norm": 0.25852860308404757,
"learning_rate": 1.5206508135168961e-05,
"loss": 0.3027,
"step": 1935
},
{
"epoch": 2.177827799662352,
"grad_norm": 0.2279934128116949,
"learning_rate": 1.5185648727576138e-05,
"loss": 0.2801,
"step": 1936
},
{
"epoch": 2.1789532920652785,
"grad_norm": 0.2351411289469844,
"learning_rate": 1.5164789319983314e-05,
"loss": 0.28,
"step": 1937
},
{
"epoch": 2.180078784468205,
"grad_norm": 0.21552640522315936,
"learning_rate": 1.5143929912390489e-05,
"loss": 0.2947,
"step": 1938
},
{
"epoch": 2.181204276871131,
"grad_norm": 0.24626801791538866,
"learning_rate": 1.5123070504797664e-05,
"loss": 0.2783,
"step": 1939
},
{
"epoch": 2.1823297692740575,
"grad_norm": 0.23012350879449098,
"learning_rate": 1.5102211097204841e-05,
"loss": 0.2774,
"step": 1940
},
{
"epoch": 2.1834552616769836,
"grad_norm": 0.23081070838683507,
"learning_rate": 1.5081351689612017e-05,
"loss": 0.2799,
"step": 1941
},
{
"epoch": 2.18458075407991,
"grad_norm": 0.2490023091368887,
"learning_rate": 1.5060492282019192e-05,
"loss": 0.2916,
"step": 1942
},
{
"epoch": 2.185706246482836,
"grad_norm": 0.23226830279863933,
"learning_rate": 1.5039632874426369e-05,
"loss": 0.274,
"step": 1943
},
{
"epoch": 2.1868317388857625,
"grad_norm": 0.23814945426894574,
"learning_rate": 1.5018773466833544e-05,
"loss": 0.2987,
"step": 1944
},
{
"epoch": 2.1879572312886886,
"grad_norm": 0.22888208424137457,
"learning_rate": 1.4997914059240718e-05,
"loss": 0.2809,
"step": 1945
},
{
"epoch": 2.189082723691615,
"grad_norm": 0.22117598909045155,
"learning_rate": 1.4977054651647893e-05,
"loss": 0.2869,
"step": 1946
},
{
"epoch": 2.1902082160945415,
"grad_norm": 0.2635412507153887,
"learning_rate": 1.4956195244055068e-05,
"loss": 0.3102,
"step": 1947
},
{
"epoch": 2.1913337084974676,
"grad_norm": 0.21434697577713013,
"learning_rate": 1.4935335836462244e-05,
"loss": 0.2748,
"step": 1948
},
{
"epoch": 2.192459200900394,
"grad_norm": 0.23605470994586675,
"learning_rate": 1.491447642886942e-05,
"loss": 0.2859,
"step": 1949
},
{
"epoch": 2.19358469330332,
"grad_norm": 0.2405759189766832,
"learning_rate": 1.4893617021276596e-05,
"loss": 0.2942,
"step": 1950
},
{
"epoch": 2.1947101857062465,
"grad_norm": 0.22131821842232993,
"learning_rate": 1.4872757613683771e-05,
"loss": 0.2884,
"step": 1951
},
{
"epoch": 2.1958356781091726,
"grad_norm": 0.23216071326486187,
"learning_rate": 1.4851898206090946e-05,
"loss": 0.2741,
"step": 1952
},
{
"epoch": 2.196961170512099,
"grad_norm": 0.2261133526570407,
"learning_rate": 1.4831038798498123e-05,
"loss": 0.2963,
"step": 1953
},
{
"epoch": 2.1980866629150255,
"grad_norm": 0.2302291451269135,
"learning_rate": 1.4810179390905299e-05,
"loss": 0.2828,
"step": 1954
},
{
"epoch": 2.1992121553179516,
"grad_norm": 0.2535578449757302,
"learning_rate": 1.4789319983312474e-05,
"loss": 0.3054,
"step": 1955
},
{
"epoch": 2.200337647720878,
"grad_norm": 0.2353316415549731,
"learning_rate": 1.4768460575719651e-05,
"loss": 0.2851,
"step": 1956
},
{
"epoch": 2.201463140123804,
"grad_norm": 0.22300891391695027,
"learning_rate": 1.4747601168126826e-05,
"loss": 0.2685,
"step": 1957
},
{
"epoch": 2.2025886325267305,
"grad_norm": 0.24986486980542502,
"learning_rate": 1.4726741760534002e-05,
"loss": 0.2956,
"step": 1958
},
{
"epoch": 2.2037141249296566,
"grad_norm": 0.2180771271590878,
"learning_rate": 1.4705882352941177e-05,
"loss": 0.2922,
"step": 1959
},
{
"epoch": 2.204839617332583,
"grad_norm": 0.22680565869396152,
"learning_rate": 1.4685022945348354e-05,
"loss": 0.2909,
"step": 1960
},
{
"epoch": 2.205965109735509,
"grad_norm": 0.23513680764714112,
"learning_rate": 1.4664163537755529e-05,
"loss": 0.2786,
"step": 1961
},
{
"epoch": 2.2070906021384356,
"grad_norm": 0.24973876085692792,
"learning_rate": 1.4643304130162704e-05,
"loss": 0.2853,
"step": 1962
},
{
"epoch": 2.208216094541362,
"grad_norm": 0.22544610054019418,
"learning_rate": 1.4622444722569881e-05,
"loss": 0.2831,
"step": 1963
},
{
"epoch": 2.209341586944288,
"grad_norm": 0.22330625417293162,
"learning_rate": 1.4601585314977057e-05,
"loss": 0.2867,
"step": 1964
},
{
"epoch": 2.2104670793472145,
"grad_norm": 0.22525152317015681,
"learning_rate": 1.4580725907384232e-05,
"loss": 0.29,
"step": 1965
},
{
"epoch": 2.2115925717501406,
"grad_norm": 0.22249714982600474,
"learning_rate": 1.4559866499791406e-05,
"loss": 0.299,
"step": 1966
},
{
"epoch": 2.212718064153067,
"grad_norm": 0.24092275848280195,
"learning_rate": 1.4539007092198581e-05,
"loss": 0.2896,
"step": 1967
},
{
"epoch": 2.213843556555993,
"grad_norm": 0.22252299992217103,
"learning_rate": 1.4518147684605756e-05,
"loss": 0.258,
"step": 1968
},
{
"epoch": 2.2149690489589196,
"grad_norm": 0.23636046190697863,
"learning_rate": 1.4497288277012933e-05,
"loss": 0.2908,
"step": 1969
},
{
"epoch": 2.216094541361846,
"grad_norm": 0.2596597997389332,
"learning_rate": 1.4476428869420108e-05,
"loss": 0.27,
"step": 1970
},
{
"epoch": 2.217220033764772,
"grad_norm": 0.2531683961826357,
"learning_rate": 1.4455569461827284e-05,
"loss": 0.2832,
"step": 1971
},
{
"epoch": 2.2183455261676985,
"grad_norm": 0.2593605264440698,
"learning_rate": 1.443471005423446e-05,
"loss": 0.2841,
"step": 1972
},
{
"epoch": 2.2194710185706246,
"grad_norm": 0.26699737148304314,
"learning_rate": 1.4413850646641636e-05,
"loss": 0.2799,
"step": 1973
},
{
"epoch": 2.220596510973551,
"grad_norm": 0.2294951675397686,
"learning_rate": 1.4392991239048811e-05,
"loss": 0.2909,
"step": 1974
},
{
"epoch": 2.221722003376477,
"grad_norm": 0.2245538365625567,
"learning_rate": 1.4372131831455987e-05,
"loss": 0.2799,
"step": 1975
},
{
"epoch": 2.2228474957794035,
"grad_norm": 0.2646561800422373,
"learning_rate": 1.4351272423863164e-05,
"loss": 0.2765,
"step": 1976
},
{
"epoch": 2.2239729881823296,
"grad_norm": 0.21416819505340884,
"learning_rate": 1.4330413016270339e-05,
"loss": 0.2801,
"step": 1977
},
{
"epoch": 2.225098480585256,
"grad_norm": 0.21948393418095735,
"learning_rate": 1.4309553608677514e-05,
"loss": 0.2866,
"step": 1978
},
{
"epoch": 2.2262239729881825,
"grad_norm": 0.22822972297920066,
"learning_rate": 1.428869420108469e-05,
"loss": 0.287,
"step": 1979
},
{
"epoch": 2.2273494653911086,
"grad_norm": 0.2160982046115744,
"learning_rate": 1.4267834793491866e-05,
"loss": 0.2925,
"step": 1980
},
{
"epoch": 2.228474957794035,
"grad_norm": 0.23144554832269953,
"learning_rate": 1.4246975385899042e-05,
"loss": 0.3105,
"step": 1981
},
{
"epoch": 2.229600450196961,
"grad_norm": 0.2419669093281673,
"learning_rate": 1.4226115978306217e-05,
"loss": 0.2795,
"step": 1982
},
{
"epoch": 2.2307259425998875,
"grad_norm": 0.2333075873767841,
"learning_rate": 1.4205256570713394e-05,
"loss": 0.2879,
"step": 1983
},
{
"epoch": 2.2318514350028136,
"grad_norm": 0.2381375140609149,
"learning_rate": 1.418439716312057e-05,
"loss": 0.2969,
"step": 1984
},
{
"epoch": 2.23297692740574,
"grad_norm": 0.22837890307254083,
"learning_rate": 1.4163537755527745e-05,
"loss": 0.283,
"step": 1985
},
{
"epoch": 2.234102419808666,
"grad_norm": 0.24448338514717682,
"learning_rate": 1.414267834793492e-05,
"loss": 0.3013,
"step": 1986
},
{
"epoch": 2.2352279122115926,
"grad_norm": 0.23420776111487138,
"learning_rate": 1.4121818940342093e-05,
"loss": 0.2878,
"step": 1987
},
{
"epoch": 2.236353404614519,
"grad_norm": 0.23579942650757943,
"learning_rate": 1.4100959532749269e-05,
"loss": 0.288,
"step": 1988
},
{
"epoch": 2.237478897017445,
"grad_norm": 0.23043040793992384,
"learning_rate": 1.4080100125156446e-05,
"loss": 0.305,
"step": 1989
},
{
"epoch": 2.2386043894203715,
"grad_norm": 0.24659768389490117,
"learning_rate": 1.4059240717563621e-05,
"loss": 0.2778,
"step": 1990
},
{
"epoch": 2.2397298818232976,
"grad_norm": 0.2525101439952681,
"learning_rate": 1.4038381309970796e-05,
"loss": 0.2797,
"step": 1991
},
{
"epoch": 2.240855374226224,
"grad_norm": 0.2180718742006463,
"learning_rate": 1.4017521902377973e-05,
"loss": 0.2824,
"step": 1992
},
{
"epoch": 2.24198086662915,
"grad_norm": 0.23039632064460322,
"learning_rate": 1.3996662494785149e-05,
"loss": 0.2732,
"step": 1993
},
{
"epoch": 2.2431063590320766,
"grad_norm": 0.24390939737808814,
"learning_rate": 1.3975803087192324e-05,
"loss": 0.2942,
"step": 1994
},
{
"epoch": 2.2442318514350026,
"grad_norm": 0.22495659632157705,
"learning_rate": 1.39549436795995e-05,
"loss": 0.2786,
"step": 1995
},
{
"epoch": 2.245357343837929,
"grad_norm": 0.2220671184762533,
"learning_rate": 1.3934084272006676e-05,
"loss": 0.2902,
"step": 1996
},
{
"epoch": 2.2464828362408555,
"grad_norm": 0.2063740174423525,
"learning_rate": 1.3913224864413851e-05,
"loss": 0.2794,
"step": 1997
},
{
"epoch": 2.2476083286437816,
"grad_norm": 0.22864397206918258,
"learning_rate": 1.3892365456821027e-05,
"loss": 0.2899,
"step": 1998
},
{
"epoch": 2.248733821046708,
"grad_norm": 0.22641553859678237,
"learning_rate": 1.3871506049228202e-05,
"loss": 0.2913,
"step": 1999
},
{
"epoch": 2.249859313449634,
"grad_norm": 0.23273883384894037,
"learning_rate": 1.3850646641635379e-05,
"loss": 0.2896,
"step": 2000
},
{
"epoch": 2.2509848058525606,
"grad_norm": 0.25883856114515486,
"learning_rate": 1.3829787234042554e-05,
"loss": 0.2853,
"step": 2001
},
{
"epoch": 2.2521102982554866,
"grad_norm": 0.24543011458191846,
"learning_rate": 1.380892782644973e-05,
"loss": 0.2921,
"step": 2002
},
{
"epoch": 2.253235790658413,
"grad_norm": 0.2573780345794268,
"learning_rate": 1.3788068418856907e-05,
"loss": 0.2987,
"step": 2003
},
{
"epoch": 2.254361283061339,
"grad_norm": 0.2344713538028616,
"learning_rate": 1.3767209011264082e-05,
"loss": 0.2788,
"step": 2004
},
{
"epoch": 2.2554867754642656,
"grad_norm": 0.2655728653325266,
"learning_rate": 1.3746349603671257e-05,
"loss": 0.2891,
"step": 2005
},
{
"epoch": 2.256612267867192,
"grad_norm": 0.24696226562693468,
"learning_rate": 1.3725490196078432e-05,
"loss": 0.2832,
"step": 2006
},
{
"epoch": 2.257737760270118,
"grad_norm": 0.27074526345994904,
"learning_rate": 1.370463078848561e-05,
"loss": 0.2991,
"step": 2007
},
{
"epoch": 2.2588632526730446,
"grad_norm": 0.21307871627731073,
"learning_rate": 1.3683771380892781e-05,
"loss": 0.2802,
"step": 2008
},
{
"epoch": 2.2599887450759706,
"grad_norm": 0.6066306419781285,
"learning_rate": 1.3662911973299958e-05,
"loss": 0.3119,
"step": 2009
},
{
"epoch": 2.261114237478897,
"grad_norm": 0.2354767198892578,
"learning_rate": 1.3642052565707134e-05,
"loss": 0.2931,
"step": 2010
},
{
"epoch": 2.2622397298818235,
"grad_norm": 0.3113759768538715,
"learning_rate": 1.3621193158114309e-05,
"loss": 0.3097,
"step": 2011
},
{
"epoch": 2.2633652222847496,
"grad_norm": 0.23868520954039024,
"learning_rate": 1.3600333750521486e-05,
"loss": 0.295,
"step": 2012
},
{
"epoch": 2.264490714687676,
"grad_norm": 0.25599462430091524,
"learning_rate": 1.3579474342928661e-05,
"loss": 0.2934,
"step": 2013
},
{
"epoch": 2.265616207090602,
"grad_norm": 0.2378852312729475,
"learning_rate": 1.3558614935335836e-05,
"loss": 0.2905,
"step": 2014
},
{
"epoch": 2.2667416994935286,
"grad_norm": 0.23537606322412846,
"learning_rate": 1.3537755527743012e-05,
"loss": 0.2933,
"step": 2015
},
{
"epoch": 2.2678671918964546,
"grad_norm": 0.24999030117110338,
"learning_rate": 1.3516896120150189e-05,
"loss": 0.2907,
"step": 2016
},
{
"epoch": 2.268992684299381,
"grad_norm": 0.2189977535068501,
"learning_rate": 1.3496036712557364e-05,
"loss": 0.2768,
"step": 2017
},
{
"epoch": 2.270118176702307,
"grad_norm": 0.21605143200933585,
"learning_rate": 1.347517730496454e-05,
"loss": 0.2825,
"step": 2018
},
{
"epoch": 2.2712436691052336,
"grad_norm": 0.2190715820513759,
"learning_rate": 1.3454317897371716e-05,
"loss": 0.2985,
"step": 2019
},
{
"epoch": 2.27236916150816,
"grad_norm": 0.2419287070815025,
"learning_rate": 1.3433458489778892e-05,
"loss": 0.2812,
"step": 2020
},
{
"epoch": 2.273494653911086,
"grad_norm": 0.23856366222450073,
"learning_rate": 1.3412599082186067e-05,
"loss": 0.2754,
"step": 2021
},
{
"epoch": 2.2746201463140125,
"grad_norm": 0.23128552323354076,
"learning_rate": 1.3391739674593242e-05,
"loss": 0.2967,
"step": 2022
},
{
"epoch": 2.2757456387169386,
"grad_norm": 0.23110186859812204,
"learning_rate": 1.3370880267000419e-05,
"loss": 0.2905,
"step": 2023
},
{
"epoch": 2.276871131119865,
"grad_norm": 0.23791496512553711,
"learning_rate": 1.3350020859407594e-05,
"loss": 0.2956,
"step": 2024
},
{
"epoch": 2.277996623522791,
"grad_norm": 0.270895607021542,
"learning_rate": 1.332916145181477e-05,
"loss": 0.2979,
"step": 2025
},
{
"epoch": 2.2791221159257176,
"grad_norm": 0.2622847660820458,
"learning_rate": 1.3308302044221945e-05,
"loss": 0.2805,
"step": 2026
},
{
"epoch": 2.2802476083286436,
"grad_norm": 0.2451853343226485,
"learning_rate": 1.3287442636629122e-05,
"loss": 0.2849,
"step": 2027
},
{
"epoch": 2.28137310073157,
"grad_norm": 0.2181534341062286,
"learning_rate": 1.3266583229036297e-05,
"loss": 0.2843,
"step": 2028
},
{
"epoch": 2.2824985931344965,
"grad_norm": 0.2350791322319216,
"learning_rate": 1.3245723821443471e-05,
"loss": 0.2804,
"step": 2029
},
{
"epoch": 2.2836240855374226,
"grad_norm": 0.24384265303411898,
"learning_rate": 1.3224864413850646e-05,
"loss": 0.2844,
"step": 2030
},
{
"epoch": 2.284749577940349,
"grad_norm": 0.21471389480099612,
"learning_rate": 1.3204005006257821e-05,
"loss": 0.2766,
"step": 2031
},
{
"epoch": 2.285875070343275,
"grad_norm": 0.2558686689697758,
"learning_rate": 1.3183145598664998e-05,
"loss": 0.3006,
"step": 2032
},
{
"epoch": 2.2870005627462016,
"grad_norm": 0.24596519958308774,
"learning_rate": 1.3162286191072174e-05,
"loss": 0.2791,
"step": 2033
},
{
"epoch": 2.2881260551491276,
"grad_norm": 0.22178993068098377,
"learning_rate": 1.3141426783479349e-05,
"loss": 0.3006,
"step": 2034
},
{
"epoch": 2.289251547552054,
"grad_norm": 0.21211849808178426,
"learning_rate": 1.3120567375886524e-05,
"loss": 0.2879,
"step": 2035
},
{
"epoch": 2.29037703995498,
"grad_norm": 0.26189329024450775,
"learning_rate": 1.3099707968293701e-05,
"loss": 0.2919,
"step": 2036
},
{
"epoch": 2.2915025323579066,
"grad_norm": 0.24020801441451947,
"learning_rate": 1.3078848560700877e-05,
"loss": 0.2936,
"step": 2037
},
{
"epoch": 2.292628024760833,
"grad_norm": 0.2444872387207359,
"learning_rate": 1.3057989153108052e-05,
"loss": 0.3098,
"step": 2038
},
{
"epoch": 2.293753517163759,
"grad_norm": 0.21895214125433066,
"learning_rate": 1.3037129745515229e-05,
"loss": 0.2743,
"step": 2039
},
{
"epoch": 2.2948790095666856,
"grad_norm": 0.2496911198777528,
"learning_rate": 1.3016270337922404e-05,
"loss": 0.2918,
"step": 2040
},
{
"epoch": 2.2960045019696116,
"grad_norm": 2.8468163932596022,
"learning_rate": 1.299541093032958e-05,
"loss": 0.2845,
"step": 2041
},
{
"epoch": 2.297129994372538,
"grad_norm": 0.2414636263089686,
"learning_rate": 1.2974551522736755e-05,
"loss": 0.2854,
"step": 2042
},
{
"epoch": 2.298255486775464,
"grad_norm": 0.22863532322963662,
"learning_rate": 1.2953692115143932e-05,
"loss": 0.2769,
"step": 2043
},
{
"epoch": 2.2993809791783906,
"grad_norm": 0.20797566641270696,
"learning_rate": 1.2932832707551107e-05,
"loss": 0.2738,
"step": 2044
},
{
"epoch": 2.3005064715813166,
"grad_norm": 0.2813082678198765,
"learning_rate": 1.2911973299958282e-05,
"loss": 0.2919,
"step": 2045
},
{
"epoch": 2.301631963984243,
"grad_norm": 0.21880645593009593,
"learning_rate": 1.2891113892365458e-05,
"loss": 0.2845,
"step": 2046
},
{
"epoch": 2.3027574563871696,
"grad_norm": 0.21662277253245404,
"learning_rate": 1.2870254484772635e-05,
"loss": 0.2999,
"step": 2047
},
{
"epoch": 2.3038829487900956,
"grad_norm": 0.23410910980013766,
"learning_rate": 1.284939507717981e-05,
"loss": 0.2803,
"step": 2048
},
{
"epoch": 2.305008441193022,
"grad_norm": 0.24807871237848997,
"learning_rate": 1.2828535669586985e-05,
"loss": 0.2887,
"step": 2049
},
{
"epoch": 2.306133933595948,
"grad_norm": 0.23958912163692958,
"learning_rate": 1.2807676261994159e-05,
"loss": 0.2806,
"step": 2050
},
{
"epoch": 2.3072594259988746,
"grad_norm": 0.24170572287325667,
"learning_rate": 1.2786816854401334e-05,
"loss": 0.2911,
"step": 2051
},
{
"epoch": 2.3083849184018006,
"grad_norm": 0.2071987326770734,
"learning_rate": 1.2765957446808511e-05,
"loss": 0.281,
"step": 2052
},
{
"epoch": 2.309510410804727,
"grad_norm": 0.2685294387603238,
"learning_rate": 1.2745098039215686e-05,
"loss": 0.2988,
"step": 2053
},
{
"epoch": 2.310635903207653,
"grad_norm": 0.24356419885452857,
"learning_rate": 1.2724238631622862e-05,
"loss": 0.2918,
"step": 2054
},
{
"epoch": 2.3117613956105796,
"grad_norm": 0.22854669119255341,
"learning_rate": 1.2703379224030037e-05,
"loss": 0.2906,
"step": 2055
},
{
"epoch": 2.312886888013506,
"grad_norm": 0.24689465925397477,
"learning_rate": 1.2682519816437214e-05,
"loss": 0.2935,
"step": 2056
},
{
"epoch": 2.314012380416432,
"grad_norm": 0.21811055770500665,
"learning_rate": 1.2661660408844389e-05,
"loss": 0.3016,
"step": 2057
},
{
"epoch": 2.3151378728193586,
"grad_norm": 0.2493408748518838,
"learning_rate": 1.2640801001251564e-05,
"loss": 0.2834,
"step": 2058
},
{
"epoch": 2.3162633652222846,
"grad_norm": 0.25721873798899103,
"learning_rate": 1.2619941593658741e-05,
"loss": 0.2983,
"step": 2059
},
{
"epoch": 2.317388857625211,
"grad_norm": 0.22179194109950803,
"learning_rate": 1.2599082186065917e-05,
"loss": 0.2753,
"step": 2060
},
{
"epoch": 2.3185143500281375,
"grad_norm": 0.266595262773116,
"learning_rate": 1.2578222778473092e-05,
"loss": 0.2839,
"step": 2061
},
{
"epoch": 2.3196398424310636,
"grad_norm": 0.24206558428702046,
"learning_rate": 1.2557363370880267e-05,
"loss": 0.2853,
"step": 2062
},
{
"epoch": 2.32076533483399,
"grad_norm": 0.2454398984492763,
"learning_rate": 1.2536503963287444e-05,
"loss": 0.3028,
"step": 2063
},
{
"epoch": 2.321890827236916,
"grad_norm": 0.2321058588488482,
"learning_rate": 1.251564455569462e-05,
"loss": 0.269,
"step": 2064
},
{
"epoch": 2.3230163196398426,
"grad_norm": 0.27267795334721745,
"learning_rate": 1.2494785148101793e-05,
"loss": 0.2792,
"step": 2065
},
{
"epoch": 2.3241418120427686,
"grad_norm": 0.23234175584418776,
"learning_rate": 1.247392574050897e-05,
"loss": 0.28,
"step": 2066
},
{
"epoch": 2.325267304445695,
"grad_norm": 0.2063643654191112,
"learning_rate": 1.2453066332916145e-05,
"loss": 0.2742,
"step": 2067
},
{
"epoch": 2.326392796848621,
"grad_norm": 0.22497515405636748,
"learning_rate": 1.243220692532332e-05,
"loss": 0.2904,
"step": 2068
},
{
"epoch": 2.3275182892515476,
"grad_norm": 0.20800896572524227,
"learning_rate": 1.2411347517730498e-05,
"loss": 0.2974,
"step": 2069
},
{
"epoch": 2.328643781654474,
"grad_norm": 0.22460235366838985,
"learning_rate": 1.2390488110137673e-05,
"loss": 0.2777,
"step": 2070
},
{
"epoch": 2.3297692740574,
"grad_norm": 0.23776076812455357,
"learning_rate": 1.2369628702544848e-05,
"loss": 0.2829,
"step": 2071
},
{
"epoch": 2.3308947664603266,
"grad_norm": 0.2570845084981786,
"learning_rate": 1.2348769294952024e-05,
"loss": 0.2945,
"step": 2072
},
{
"epoch": 2.3320202588632526,
"grad_norm": 0.2385004836723248,
"learning_rate": 1.23279098873592e-05,
"loss": 0.2867,
"step": 2073
},
{
"epoch": 2.333145751266179,
"grad_norm": 0.24982697079123078,
"learning_rate": 1.2307050479766376e-05,
"loss": 0.2857,
"step": 2074
},
{
"epoch": 2.334271243669105,
"grad_norm": 0.24642888230370272,
"learning_rate": 1.2286191072173551e-05,
"loss": 0.3053,
"step": 2075
},
{
"epoch": 2.3353967360720316,
"grad_norm": 0.26130363264507717,
"learning_rate": 1.2265331664580726e-05,
"loss": 0.2916,
"step": 2076
},
{
"epoch": 2.3365222284749576,
"grad_norm": 0.2124033043327759,
"learning_rate": 1.2244472256987902e-05,
"loss": 0.2764,
"step": 2077
},
{
"epoch": 2.337647720877884,
"grad_norm": 0.2440455128961208,
"learning_rate": 1.2223612849395077e-05,
"loss": 0.3075,
"step": 2078
},
{
"epoch": 2.3387732132808106,
"grad_norm": 0.245304116279532,
"learning_rate": 1.2202753441802254e-05,
"loss": 0.2895,
"step": 2079
},
{
"epoch": 2.3398987056837366,
"grad_norm": 0.2372202784047367,
"learning_rate": 1.218189403420943e-05,
"loss": 0.2785,
"step": 2080
},
{
"epoch": 2.341024198086663,
"grad_norm": 0.23688709955054182,
"learning_rate": 1.2161034626616605e-05,
"loss": 0.2777,
"step": 2081
},
{
"epoch": 2.342149690489589,
"grad_norm": 0.2482625923726943,
"learning_rate": 1.214017521902378e-05,
"loss": 0.2833,
"step": 2082
},
{
"epoch": 2.3432751828925156,
"grad_norm": 0.22738968926541633,
"learning_rate": 1.2119315811430957e-05,
"loss": 0.2709,
"step": 2083
},
{
"epoch": 2.3444006752954416,
"grad_norm": 0.25147592555620085,
"learning_rate": 1.2098456403838132e-05,
"loss": 0.3008,
"step": 2084
},
{
"epoch": 2.345526167698368,
"grad_norm": 0.22363924741115862,
"learning_rate": 1.2077596996245307e-05,
"loss": 0.2872,
"step": 2085
},
{
"epoch": 2.346651660101294,
"grad_norm": 0.23011558044098404,
"learning_rate": 1.2056737588652483e-05,
"loss": 0.2901,
"step": 2086
},
{
"epoch": 2.3477771525042206,
"grad_norm": 0.232959308790496,
"learning_rate": 1.2035878181059658e-05,
"loss": 0.2859,
"step": 2087
},
{
"epoch": 2.348902644907147,
"grad_norm": 0.24124633231018813,
"learning_rate": 1.2015018773466833e-05,
"loss": 0.2946,
"step": 2088
},
{
"epoch": 2.350028137310073,
"grad_norm": 0.23315070454396,
"learning_rate": 1.199415936587401e-05,
"loss": 0.2719,
"step": 2089
},
{
"epoch": 2.3511536297129996,
"grad_norm": 0.21247783763819528,
"learning_rate": 1.1973299958281186e-05,
"loss": 0.28,
"step": 2090
},
{
"epoch": 2.3522791221159256,
"grad_norm": 0.23387492479149327,
"learning_rate": 1.195244055068836e-05,
"loss": 0.2792,
"step": 2091
},
{
"epoch": 2.353404614518852,
"grad_norm": 0.22205981665359048,
"learning_rate": 1.1931581143095536e-05,
"loss": 0.2869,
"step": 2092
},
{
"epoch": 2.354530106921778,
"grad_norm": 0.2407814917985092,
"learning_rate": 1.1910721735502713e-05,
"loss": 0.2837,
"step": 2093
},
{
"epoch": 2.3556555993247046,
"grad_norm": 0.22636696159410108,
"learning_rate": 1.1889862327909888e-05,
"loss": 0.2717,
"step": 2094
},
{
"epoch": 2.3567810917276306,
"grad_norm": 0.20010784358214667,
"learning_rate": 1.1869002920317064e-05,
"loss": 0.2638,
"step": 2095
},
{
"epoch": 2.357906584130557,
"grad_norm": 0.2302624047508185,
"learning_rate": 1.184814351272424e-05,
"loss": 0.2889,
"step": 2096
},
{
"epoch": 2.3590320765334836,
"grad_norm": 0.21841431009246395,
"learning_rate": 1.1827284105131414e-05,
"loss": 0.2836,
"step": 2097
},
{
"epoch": 2.3601575689364096,
"grad_norm": 0.21162560341411857,
"learning_rate": 1.180642469753859e-05,
"loss": 0.2744,
"step": 2098
},
{
"epoch": 2.361283061339336,
"grad_norm": 0.23437617625703946,
"learning_rate": 1.1785565289945767e-05,
"loss": 0.2897,
"step": 2099
},
{
"epoch": 2.362408553742262,
"grad_norm": 0.2443861444498022,
"learning_rate": 1.1764705882352942e-05,
"loss": 0.2773,
"step": 2100
},
{
"epoch": 2.3635340461451886,
"grad_norm": 0.20195447071682132,
"learning_rate": 1.1743846474760117e-05,
"loss": 0.2852,
"step": 2101
},
{
"epoch": 2.3646595385481146,
"grad_norm": 0.22050201926310495,
"learning_rate": 1.1722987067167292e-05,
"loss": 0.2811,
"step": 2102
},
{
"epoch": 2.365785030951041,
"grad_norm": 0.24700086215612232,
"learning_rate": 1.170212765957447e-05,
"loss": 0.2944,
"step": 2103
},
{
"epoch": 2.366910523353967,
"grad_norm": 0.21968358349344858,
"learning_rate": 1.1681268251981645e-05,
"loss": 0.2876,
"step": 2104
},
{
"epoch": 2.3680360157568936,
"grad_norm": 0.22004217949350546,
"learning_rate": 1.166040884438882e-05,
"loss": 0.298,
"step": 2105
},
{
"epoch": 2.36916150815982,
"grad_norm": 0.25166613532562226,
"learning_rate": 1.1639549436795997e-05,
"loss": 0.3039,
"step": 2106
},
{
"epoch": 2.370287000562746,
"grad_norm": 0.2470279532584483,
"learning_rate": 1.161869002920317e-05,
"loss": 0.2905,
"step": 2107
},
{
"epoch": 2.3714124929656726,
"grad_norm": 0.22645956564251657,
"learning_rate": 1.1597830621610346e-05,
"loss": 0.2857,
"step": 2108
},
{
"epoch": 2.3725379853685986,
"grad_norm": 0.236072091244946,
"learning_rate": 1.1576971214017523e-05,
"loss": 0.2876,
"step": 2109
},
{
"epoch": 2.373663477771525,
"grad_norm": 0.22344125624637598,
"learning_rate": 1.1556111806424698e-05,
"loss": 0.2871,
"step": 2110
},
{
"epoch": 2.3747889701744516,
"grad_norm": 0.2740686740181796,
"learning_rate": 1.1535252398831873e-05,
"loss": 0.3108,
"step": 2111
},
{
"epoch": 2.3759144625773776,
"grad_norm": 0.2633526483901452,
"learning_rate": 1.1514392991239049e-05,
"loss": 0.2776,
"step": 2112
},
{
"epoch": 2.377039954980304,
"grad_norm": 0.23782573779211985,
"learning_rate": 1.1493533583646226e-05,
"loss": 0.2882,
"step": 2113
},
{
"epoch": 2.37816544738323,
"grad_norm": 0.20792260803969095,
"learning_rate": 1.1472674176053401e-05,
"loss": 0.2711,
"step": 2114
},
{
"epoch": 2.3792909397861566,
"grad_norm": 0.27657171997481406,
"learning_rate": 1.1451814768460576e-05,
"loss": 0.2866,
"step": 2115
},
{
"epoch": 2.3804164321890826,
"grad_norm": 0.23403563768428892,
"learning_rate": 1.1430955360867753e-05,
"loss": 0.2936,
"step": 2116
},
{
"epoch": 2.381541924592009,
"grad_norm": 0.2268435928373139,
"learning_rate": 1.1410095953274927e-05,
"loss": 0.2856,
"step": 2117
},
{
"epoch": 2.382667416994935,
"grad_norm": 0.2547588426300782,
"learning_rate": 1.1389236545682102e-05,
"loss": 0.2825,
"step": 2118
},
{
"epoch": 2.3837929093978616,
"grad_norm": 0.2251742508238687,
"learning_rate": 1.1368377138089279e-05,
"loss": 0.27,
"step": 2119
},
{
"epoch": 2.384918401800788,
"grad_norm": 0.23999465674072157,
"learning_rate": 1.1347517730496454e-05,
"loss": 0.3047,
"step": 2120
},
{
"epoch": 2.386043894203714,
"grad_norm": 0.23662614777077606,
"learning_rate": 1.132665832290363e-05,
"loss": 0.2988,
"step": 2121
},
{
"epoch": 2.3871693866066406,
"grad_norm": 0.2186602406031677,
"learning_rate": 1.1305798915310805e-05,
"loss": 0.2728,
"step": 2122
},
{
"epoch": 2.3882948790095666,
"grad_norm": 0.23328788430189215,
"learning_rate": 1.1284939507717982e-05,
"loss": 0.2899,
"step": 2123
},
{
"epoch": 2.389420371412493,
"grad_norm": 0.22192987090662689,
"learning_rate": 1.1264080100125157e-05,
"loss": 0.2988,
"step": 2124
},
{
"epoch": 2.390545863815419,
"grad_norm": 0.2250119571074059,
"learning_rate": 1.1243220692532333e-05,
"loss": 0.2956,
"step": 2125
},
{
"epoch": 2.3916713562183456,
"grad_norm": 0.23070732043801884,
"learning_rate": 1.122236128493951e-05,
"loss": 0.2902,
"step": 2126
},
{
"epoch": 2.3927968486212716,
"grad_norm": 0.22518386141677244,
"learning_rate": 1.1201501877346685e-05,
"loss": 0.2659,
"step": 2127
},
{
"epoch": 2.393922341024198,
"grad_norm": 0.21584313221933796,
"learning_rate": 1.1180642469753858e-05,
"loss": 0.288,
"step": 2128
},
{
"epoch": 2.3950478334271246,
"grad_norm": 0.24985482700142989,
"learning_rate": 1.1159783062161035e-05,
"loss": 0.2874,
"step": 2129
},
{
"epoch": 2.3961733258300506,
"grad_norm": 0.23472182953400522,
"learning_rate": 1.113892365456821e-05,
"loss": 0.2797,
"step": 2130
},
{
"epoch": 2.397298818232977,
"grad_norm": 0.23998025758676889,
"learning_rate": 1.1118064246975386e-05,
"loss": 0.2919,
"step": 2131
},
{
"epoch": 2.398424310635903,
"grad_norm": 0.21809256274475072,
"learning_rate": 1.1097204839382561e-05,
"loss": 0.2758,
"step": 2132
},
{
"epoch": 2.3995498030388296,
"grad_norm": 0.23454882483715764,
"learning_rate": 1.1076345431789738e-05,
"loss": 0.2952,
"step": 2133
},
{
"epoch": 2.4006752954417556,
"grad_norm": 0.2246557652335286,
"learning_rate": 1.1055486024196914e-05,
"loss": 0.2981,
"step": 2134
},
{
"epoch": 2.401800787844682,
"grad_norm": 0.25315434629928985,
"learning_rate": 1.1034626616604089e-05,
"loss": 0.3066,
"step": 2135
},
{
"epoch": 2.402926280247608,
"grad_norm": 0.22665010188162998,
"learning_rate": 1.1013767209011266e-05,
"loss": 0.2923,
"step": 2136
},
{
"epoch": 2.4040517726505346,
"grad_norm": 0.2501297143991106,
"learning_rate": 1.0992907801418441e-05,
"loss": 0.2782,
"step": 2137
},
{
"epoch": 2.405177265053461,
"grad_norm": 0.23355470062481642,
"learning_rate": 1.0972048393825615e-05,
"loss": 0.2958,
"step": 2138
},
{
"epoch": 2.406302757456387,
"grad_norm": 0.22524426706184972,
"learning_rate": 1.0951188986232792e-05,
"loss": 0.2971,
"step": 2139
},
{
"epoch": 2.4074282498593136,
"grad_norm": 0.2652706460468126,
"learning_rate": 1.0930329578639967e-05,
"loss": 0.3063,
"step": 2140
},
{
"epoch": 2.4085537422622396,
"grad_norm": 0.27147074305958385,
"learning_rate": 1.0909470171047142e-05,
"loss": 0.2835,
"step": 2141
},
{
"epoch": 2.409679234665166,
"grad_norm": 0.21263510668944327,
"learning_rate": 1.0888610763454318e-05,
"loss": 0.2759,
"step": 2142
},
{
"epoch": 2.410804727068092,
"grad_norm": 0.2414280469462777,
"learning_rate": 1.0867751355861495e-05,
"loss": 0.3011,
"step": 2143
},
{
"epoch": 2.4119302194710186,
"grad_norm": 0.2563072903091181,
"learning_rate": 1.084689194826867e-05,
"loss": 0.3011,
"step": 2144
},
{
"epoch": 2.4130557118739446,
"grad_norm": 0.26752154229648717,
"learning_rate": 1.0826032540675845e-05,
"loss": 0.3025,
"step": 2145
},
{
"epoch": 2.414181204276871,
"grad_norm": 0.2191490869328681,
"learning_rate": 1.0805173133083022e-05,
"loss": 0.2838,
"step": 2146
},
{
"epoch": 2.4153066966797976,
"grad_norm": 0.2754295487570369,
"learning_rate": 1.0784313725490197e-05,
"loss": 0.2745,
"step": 2147
},
{
"epoch": 2.4164321890827236,
"grad_norm": 0.27496282852437165,
"learning_rate": 1.0763454317897373e-05,
"loss": 0.2949,
"step": 2148
},
{
"epoch": 2.41755768148565,
"grad_norm": 0.200617793351321,
"learning_rate": 1.0742594910304548e-05,
"loss": 0.2844,
"step": 2149
},
{
"epoch": 2.418683173888576,
"grad_norm": 0.2261161966006784,
"learning_rate": 1.0721735502711723e-05,
"loss": 0.3074,
"step": 2150
},
{
"epoch": 2.4198086662915026,
"grad_norm": 0.2721691004576223,
"learning_rate": 1.0700876095118899e-05,
"loss": 0.2887,
"step": 2151
},
{
"epoch": 2.4209341586944286,
"grad_norm": 0.2468891901297125,
"learning_rate": 1.0680016687526074e-05,
"loss": 0.2826,
"step": 2152
},
{
"epoch": 2.422059651097355,
"grad_norm": 0.22981129965663172,
"learning_rate": 1.065915727993325e-05,
"loss": 0.2982,
"step": 2153
},
{
"epoch": 2.423185143500281,
"grad_norm": 0.23778012142265284,
"learning_rate": 1.0638297872340426e-05,
"loss": 0.2803,
"step": 2154
},
{
"epoch": 2.4243106359032076,
"grad_norm": 0.2502133834738445,
"learning_rate": 1.0617438464747601e-05,
"loss": 0.2894,
"step": 2155
},
{
"epoch": 2.425436128306134,
"grad_norm": 0.24021651572443242,
"learning_rate": 1.0596579057154778e-05,
"loss": 0.2792,
"step": 2156
},
{
"epoch": 2.42656162070906,
"grad_norm": 0.21287174233579118,
"learning_rate": 1.0575719649561954e-05,
"loss": 0.2653,
"step": 2157
},
{
"epoch": 2.4276871131119866,
"grad_norm": 0.2596480129053586,
"learning_rate": 1.0554860241969129e-05,
"loss": 0.2812,
"step": 2158
},
{
"epoch": 2.4288126055149126,
"grad_norm": 0.22839461536852768,
"learning_rate": 1.0534000834376304e-05,
"loss": 0.2887,
"step": 2159
},
{
"epoch": 2.429938097917839,
"grad_norm": 0.25082900774514266,
"learning_rate": 1.051314142678348e-05,
"loss": 0.2848,
"step": 2160
},
{
"epoch": 2.4310635903207656,
"grad_norm": 0.21582263533702323,
"learning_rate": 1.0492282019190655e-05,
"loss": 0.2848,
"step": 2161
},
{
"epoch": 2.4321890827236916,
"grad_norm": 0.20981491769940364,
"learning_rate": 1.047142261159783e-05,
"loss": 0.2771,
"step": 2162
},
{
"epoch": 2.433314575126618,
"grad_norm": 0.2478690946929455,
"learning_rate": 1.0450563204005007e-05,
"loss": 0.2978,
"step": 2163
},
{
"epoch": 2.434440067529544,
"grad_norm": 0.23623868199579823,
"learning_rate": 1.0429703796412182e-05,
"loss": 0.2868,
"step": 2164
},
{
"epoch": 2.4355655599324706,
"grad_norm": 0.22479499127056093,
"learning_rate": 1.0408844388819358e-05,
"loss": 0.2864,
"step": 2165
},
{
"epoch": 2.4366910523353966,
"grad_norm": 0.22778228885333549,
"learning_rate": 1.0387984981226535e-05,
"loss": 0.2703,
"step": 2166
},
{
"epoch": 2.437816544738323,
"grad_norm": 0.24955225194107739,
"learning_rate": 1.036712557363371e-05,
"loss": 0.2881,
"step": 2167
},
{
"epoch": 2.438942037141249,
"grad_norm": 0.24688296808661256,
"learning_rate": 1.0346266166040885e-05,
"loss": 0.2892,
"step": 2168
},
{
"epoch": 2.4400675295441756,
"grad_norm": 0.21066675955547629,
"learning_rate": 1.032540675844806e-05,
"loss": 0.2665,
"step": 2169
},
{
"epoch": 2.441193021947102,
"grad_norm": 0.23152630032898566,
"learning_rate": 1.0304547350855236e-05,
"loss": 0.2879,
"step": 2170
},
{
"epoch": 2.442318514350028,
"grad_norm": 0.23881733868242846,
"learning_rate": 1.0283687943262411e-05,
"loss": 0.284,
"step": 2171
},
{
"epoch": 2.4434440067529546,
"grad_norm": 0.24727876228693577,
"learning_rate": 1.0262828535669586e-05,
"loss": 0.2799,
"step": 2172
},
{
"epoch": 2.4445694991558806,
"grad_norm": 0.2237223246325913,
"learning_rate": 1.0241969128076763e-05,
"loss": 0.2839,
"step": 2173
},
{
"epoch": 2.445694991558807,
"grad_norm": 0.2255880979174184,
"learning_rate": 1.0221109720483939e-05,
"loss": 0.3039,
"step": 2174
},
{
"epoch": 2.446820483961733,
"grad_norm": 0.2170555923070572,
"learning_rate": 1.0200250312891114e-05,
"loss": 0.2768,
"step": 2175
},
{
"epoch": 2.4479459763646596,
"grad_norm": 0.20774037005388524,
"learning_rate": 1.0179390905298291e-05,
"loss": 0.2834,
"step": 2176
},
{
"epoch": 2.4490714687675856,
"grad_norm": 0.2265465766383895,
"learning_rate": 1.0158531497705466e-05,
"loss": 0.3058,
"step": 2177
},
{
"epoch": 2.450196961170512,
"grad_norm": 0.20568423154158125,
"learning_rate": 1.0137672090112642e-05,
"loss": 0.2692,
"step": 2178
},
{
"epoch": 2.4513224535734386,
"grad_norm": 0.22834881557663556,
"learning_rate": 1.0116812682519817e-05,
"loss": 0.2892,
"step": 2179
},
{
"epoch": 2.4524479459763646,
"grad_norm": 0.22028619420108753,
"learning_rate": 1.0095953274926992e-05,
"loss": 0.2919,
"step": 2180
},
{
"epoch": 2.453573438379291,
"grad_norm": 0.2425115217082142,
"learning_rate": 1.0075093867334167e-05,
"loss": 0.2764,
"step": 2181
},
{
"epoch": 2.454698930782217,
"grad_norm": 0.22791631771168733,
"learning_rate": 1.0054234459741343e-05,
"loss": 0.267,
"step": 2182
},
{
"epoch": 2.4558244231851436,
"grad_norm": 0.1926906774199996,
"learning_rate": 1.003337505214852e-05,
"loss": 0.2683,
"step": 2183
},
{
"epoch": 2.4569499155880696,
"grad_norm": 0.23290868443818466,
"learning_rate": 1.0012515644555695e-05,
"loss": 0.2791,
"step": 2184
},
{
"epoch": 2.458075407990996,
"grad_norm": 0.2968317001207595,
"learning_rate": 9.99165623696287e-06,
"loss": 0.2959,
"step": 2185
},
{
"epoch": 2.459200900393922,
"grad_norm": 0.250149366010179,
"learning_rate": 9.970796829370047e-06,
"loss": 0.2968,
"step": 2186
},
{
"epoch": 2.4603263927968486,
"grad_norm": 0.23676392349962846,
"learning_rate": 9.949937421777223e-06,
"loss": 0.2766,
"step": 2187
},
{
"epoch": 2.461451885199775,
"grad_norm": 0.2968220951755795,
"learning_rate": 9.929078014184398e-06,
"loss": 0.2858,
"step": 2188
},
{
"epoch": 2.462577377602701,
"grad_norm": 0.24526431357390857,
"learning_rate": 9.908218606591573e-06,
"loss": 0.2776,
"step": 2189
},
{
"epoch": 2.4637028700056276,
"grad_norm": 0.2072075588604563,
"learning_rate": 9.887359198998748e-06,
"loss": 0.2786,
"step": 2190
},
{
"epoch": 2.4648283624085536,
"grad_norm": 0.24560787407943072,
"learning_rate": 9.866499791405924e-06,
"loss": 0.3076,
"step": 2191
},
{
"epoch": 2.46595385481148,
"grad_norm": 0.2807855048371902,
"learning_rate": 9.845640383813099e-06,
"loss": 0.3059,
"step": 2192
},
{
"epoch": 2.467079347214406,
"grad_norm": 0.21339940056568182,
"learning_rate": 9.824780976220276e-06,
"loss": 0.2835,
"step": 2193
},
{
"epoch": 2.4682048396173326,
"grad_norm": 0.23237003408073176,
"learning_rate": 9.803921568627451e-06,
"loss": 0.2774,
"step": 2194
},
{
"epoch": 2.4693303320202586,
"grad_norm": 0.22471960654724552,
"learning_rate": 9.783062161034627e-06,
"loss": 0.3059,
"step": 2195
},
{
"epoch": 2.470455824423185,
"grad_norm": 0.21133212055331363,
"learning_rate": 9.762202753441804e-06,
"loss": 0.2804,
"step": 2196
},
{
"epoch": 2.4715813168261116,
"grad_norm": 0.22866555875952663,
"learning_rate": 9.741343345848979e-06,
"loss": 0.2668,
"step": 2197
},
{
"epoch": 2.4727068092290376,
"grad_norm": 0.23022775610838142,
"learning_rate": 9.720483938256154e-06,
"loss": 0.2941,
"step": 2198
},
{
"epoch": 2.473832301631964,
"grad_norm": 0.24916655338248048,
"learning_rate": 9.69962453066333e-06,
"loss": 0.2875,
"step": 2199
},
{
"epoch": 2.47495779403489,
"grad_norm": 0.22598137001947038,
"learning_rate": 9.678765123070506e-06,
"loss": 0.287,
"step": 2200
},
{
"epoch": 2.4760832864378166,
"grad_norm": 0.19562266451832722,
"learning_rate": 9.65790571547768e-06,
"loss": 0.2726,
"step": 2201
},
{
"epoch": 2.4772087788407426,
"grad_norm": 0.22330052278775112,
"learning_rate": 9.637046307884855e-06,
"loss": 0.2862,
"step": 2202
},
{
"epoch": 2.478334271243669,
"grad_norm": 0.22895521592496432,
"learning_rate": 9.616186900292032e-06,
"loss": 0.2858,
"step": 2203
},
{
"epoch": 2.479459763646595,
"grad_norm": 0.22023179481636448,
"learning_rate": 9.595327492699208e-06,
"loss": 0.2768,
"step": 2204
},
{
"epoch": 2.4805852560495216,
"grad_norm": 0.23642223233900708,
"learning_rate": 9.574468085106383e-06,
"loss": 0.2878,
"step": 2205
},
{
"epoch": 2.481710748452448,
"grad_norm": 0.2391107708431571,
"learning_rate": 9.55360867751356e-06,
"loss": 0.2879,
"step": 2206
},
{
"epoch": 2.482836240855374,
"grad_norm": 0.24152975198499732,
"learning_rate": 9.532749269920735e-06,
"loss": 0.2955,
"step": 2207
},
{
"epoch": 2.4839617332583006,
"grad_norm": 0.23299532148669774,
"learning_rate": 9.51188986232791e-06,
"loss": 0.2962,
"step": 2208
},
{
"epoch": 2.4850872256612266,
"grad_norm": 0.20896130963456966,
"learning_rate": 9.491030454735086e-06,
"loss": 0.284,
"step": 2209
},
{
"epoch": 2.486212718064153,
"grad_norm": 0.2105385871507124,
"learning_rate": 9.470171047142263e-06,
"loss": 0.2729,
"step": 2210
},
{
"epoch": 2.4873382104670796,
"grad_norm": 0.21484947461149867,
"learning_rate": 9.449311639549436e-06,
"loss": 0.2926,
"step": 2211
},
{
"epoch": 2.4884637028700056,
"grad_norm": 0.2190385482446419,
"learning_rate": 9.428452231956612e-06,
"loss": 0.2825,
"step": 2212
},
{
"epoch": 2.489589195272932,
"grad_norm": 0.2142534982080354,
"learning_rate": 9.407592824363789e-06,
"loss": 0.2751,
"step": 2213
},
{
"epoch": 2.490714687675858,
"grad_norm": 0.21708738862041638,
"learning_rate": 9.386733416770964e-06,
"loss": 0.2786,
"step": 2214
},
{
"epoch": 2.4918401800787846,
"grad_norm": 0.2181940682924344,
"learning_rate": 9.365874009178139e-06,
"loss": 0.2891,
"step": 2215
},
{
"epoch": 2.4929656724817106,
"grad_norm": 0.24361785849546538,
"learning_rate": 9.345014601585316e-06,
"loss": 0.2862,
"step": 2216
},
{
"epoch": 2.494091164884637,
"grad_norm": 0.2074874339468701,
"learning_rate": 9.324155193992491e-06,
"loss": 0.2779,
"step": 2217
},
{
"epoch": 2.495216657287563,
"grad_norm": 0.22232685525965187,
"learning_rate": 9.303295786399667e-06,
"loss": 0.2872,
"step": 2218
},
{
"epoch": 2.4963421496904896,
"grad_norm": 0.22940288362612324,
"learning_rate": 9.282436378806842e-06,
"loss": 0.2675,
"step": 2219
},
{
"epoch": 2.497467642093416,
"grad_norm": 0.22467443084840247,
"learning_rate": 9.261576971214019e-06,
"loss": 0.2801,
"step": 2220
},
{
"epoch": 2.498593134496342,
"grad_norm": 0.23139305058585594,
"learning_rate": 9.240717563621194e-06,
"loss": 0.2743,
"step": 2221
},
{
"epoch": 2.4997186268992686,
"grad_norm": 0.2360404033010022,
"learning_rate": 9.219858156028368e-06,
"loss": 0.286,
"step": 2222
},
{
"epoch": 2.5008441193021946,
"grad_norm": 0.20605066820343487,
"learning_rate": 9.198998748435545e-06,
"loss": 0.2823,
"step": 2223
},
{
"epoch": 2.501969611705121,
"grad_norm": 0.2564735378536905,
"learning_rate": 9.17813934084272e-06,
"loss": 0.2947,
"step": 2224
},
{
"epoch": 2.503095104108047,
"grad_norm": 0.2320837293470589,
"learning_rate": 9.157279933249895e-06,
"loss": 0.2768,
"step": 2225
},
{
"epoch": 2.5042205965109736,
"grad_norm": 0.208589920793005,
"learning_rate": 9.136420525657072e-06,
"loss": 0.29,
"step": 2226
},
{
"epoch": 2.5053460889138996,
"grad_norm": 0.21389293826499295,
"learning_rate": 9.115561118064248e-06,
"loss": 0.2798,
"step": 2227
},
{
"epoch": 2.506471581316826,
"grad_norm": 0.22046720544274087,
"learning_rate": 9.094701710471423e-06,
"loss": 0.2937,
"step": 2228
},
{
"epoch": 2.5075970737197526,
"grad_norm": 0.22495729889410385,
"learning_rate": 9.073842302878598e-06,
"loss": 0.2879,
"step": 2229
},
{
"epoch": 2.5087225661226786,
"grad_norm": 0.20269539252904967,
"learning_rate": 9.052982895285775e-06,
"loss": 0.2774,
"step": 2230
},
{
"epoch": 2.509848058525605,
"grad_norm": 0.1980840443630393,
"learning_rate": 9.03212348769295e-06,
"loss": 0.281,
"step": 2231
},
{
"epoch": 2.510973550928531,
"grad_norm": 0.22695316930947035,
"learning_rate": 9.011264080100124e-06,
"loss": 0.2922,
"step": 2232
},
{
"epoch": 2.5120990433314576,
"grad_norm": 0.20934803359715298,
"learning_rate": 8.990404672507301e-06,
"loss": 0.2794,
"step": 2233
},
{
"epoch": 2.5132245357343836,
"grad_norm": 0.21112109217582253,
"learning_rate": 8.969545264914476e-06,
"loss": 0.2907,
"step": 2234
},
{
"epoch": 2.51435002813731,
"grad_norm": 0.22573731590530483,
"learning_rate": 8.948685857321652e-06,
"loss": 0.291,
"step": 2235
},
{
"epoch": 2.515475520540236,
"grad_norm": 0.23892740697159065,
"learning_rate": 8.927826449728829e-06,
"loss": 0.273,
"step": 2236
},
{
"epoch": 2.5166010129431626,
"grad_norm": 0.22535585510058634,
"learning_rate": 8.906967042136004e-06,
"loss": 0.2837,
"step": 2237
},
{
"epoch": 2.517726505346089,
"grad_norm": 0.2194038445722204,
"learning_rate": 8.88610763454318e-06,
"loss": 0.2927,
"step": 2238
},
{
"epoch": 2.518851997749015,
"grad_norm": 0.22905667951438685,
"learning_rate": 8.865248226950355e-06,
"loss": 0.2789,
"step": 2239
},
{
"epoch": 2.5199774901519416,
"grad_norm": 0.22625912351056832,
"learning_rate": 8.844388819357532e-06,
"loss": 0.2755,
"step": 2240
},
{
"epoch": 2.5211029825548676,
"grad_norm": 0.23804689181224994,
"learning_rate": 8.823529411764707e-06,
"loss": 0.2792,
"step": 2241
},
{
"epoch": 2.522228474957794,
"grad_norm": 0.2105408688549035,
"learning_rate": 8.802670004171882e-06,
"loss": 0.2972,
"step": 2242
},
{
"epoch": 2.52335396736072,
"grad_norm": 0.22340033958156802,
"learning_rate": 8.781810596579057e-06,
"loss": 0.2946,
"step": 2243
},
{
"epoch": 2.5244794597636466,
"grad_norm": 0.2297895889368776,
"learning_rate": 8.760951188986233e-06,
"loss": 0.2938,
"step": 2244
},
{
"epoch": 2.5256049521665727,
"grad_norm": 0.2259147810494066,
"learning_rate": 8.740091781393408e-06,
"loss": 0.2813,
"step": 2245
},
{
"epoch": 2.526730444569499,
"grad_norm": 0.23547288393006746,
"learning_rate": 8.719232373800585e-06,
"loss": 0.2994,
"step": 2246
},
{
"epoch": 2.5278559369724256,
"grad_norm": 0.22543446780315715,
"learning_rate": 8.69837296620776e-06,
"loss": 0.2846,
"step": 2247
},
{
"epoch": 2.5289814293753516,
"grad_norm": 0.2154532957908738,
"learning_rate": 8.677513558614936e-06,
"loss": 0.2879,
"step": 2248
},
{
"epoch": 2.530106921778278,
"grad_norm": 0.2351801079174597,
"learning_rate": 8.65665415102211e-06,
"loss": 0.2765,
"step": 2249
},
{
"epoch": 2.531232414181204,
"grad_norm": 0.21366786894791512,
"learning_rate": 8.635794743429288e-06,
"loss": 0.2734,
"step": 2250
},
{
"epoch": 2.5323579065841306,
"grad_norm": 0.23645349161640047,
"learning_rate": 8.614935335836463e-06,
"loss": 0.2984,
"step": 2251
},
{
"epoch": 2.533483398987057,
"grad_norm": 0.23434820101602807,
"learning_rate": 8.594075928243638e-06,
"loss": 0.2968,
"step": 2252
},
{
"epoch": 2.534608891389983,
"grad_norm": 0.23800902126311332,
"learning_rate": 8.573216520650814e-06,
"loss": 0.2828,
"step": 2253
},
{
"epoch": 2.535734383792909,
"grad_norm": 0.2538132352809376,
"learning_rate": 8.552357113057989e-06,
"loss": 0.2843,
"step": 2254
},
{
"epoch": 2.5368598761958356,
"grad_norm": 0.21371163966598017,
"learning_rate": 8.531497705465164e-06,
"loss": 0.2751,
"step": 2255
},
{
"epoch": 2.537985368598762,
"grad_norm": 0.21482253029817228,
"learning_rate": 8.510638297872341e-06,
"loss": 0.3006,
"step": 2256
},
{
"epoch": 2.539110861001688,
"grad_norm": 0.21834391394392152,
"learning_rate": 8.489778890279517e-06,
"loss": 0.288,
"step": 2257
},
{
"epoch": 2.5402363534046146,
"grad_norm": 0.2385102842630092,
"learning_rate": 8.468919482686692e-06,
"loss": 0.2873,
"step": 2258
},
{
"epoch": 2.5413618458075407,
"grad_norm": 0.2496691464287376,
"learning_rate": 8.448060075093867e-06,
"loss": 0.2868,
"step": 2259
},
{
"epoch": 2.542487338210467,
"grad_norm": 0.2014232955964171,
"learning_rate": 8.427200667501044e-06,
"loss": 0.2851,
"step": 2260
},
{
"epoch": 2.5436128306133936,
"grad_norm": 0.23384968447549695,
"learning_rate": 8.40634125990822e-06,
"loss": 0.2842,
"step": 2261
},
{
"epoch": 2.5447383230163196,
"grad_norm": 0.21977668018953103,
"learning_rate": 8.385481852315395e-06,
"loss": 0.2883,
"step": 2262
},
{
"epoch": 2.5458638154192457,
"grad_norm": 0.21776563647468017,
"learning_rate": 8.36462244472257e-06,
"loss": 0.2856,
"step": 2263
},
{
"epoch": 2.546989307822172,
"grad_norm": 0.2027944392061715,
"learning_rate": 8.343763037129745e-06,
"loss": 0.2765,
"step": 2264
},
{
"epoch": 2.5481148002250986,
"grad_norm": 0.21029054091165603,
"learning_rate": 8.32290362953692e-06,
"loss": 0.2767,
"step": 2265
},
{
"epoch": 2.5492402926280247,
"grad_norm": 0.21418622748856342,
"learning_rate": 8.302044221944098e-06,
"loss": 0.2948,
"step": 2266
},
{
"epoch": 2.550365785030951,
"grad_norm": 0.21907388139154874,
"learning_rate": 8.281184814351273e-06,
"loss": 0.2736,
"step": 2267
},
{
"epoch": 2.551491277433877,
"grad_norm": 0.21904845452521604,
"learning_rate": 8.260325406758448e-06,
"loss": 0.3056,
"step": 2268
},
{
"epoch": 2.5526167698368036,
"grad_norm": 0.20459659904962244,
"learning_rate": 8.239465999165623e-06,
"loss": 0.2807,
"step": 2269
},
{
"epoch": 2.55374226223973,
"grad_norm": 0.20176624512330674,
"learning_rate": 8.2186065915728e-06,
"loss": 0.282,
"step": 2270
},
{
"epoch": 2.554867754642656,
"grad_norm": 0.2171053854970344,
"learning_rate": 8.197747183979976e-06,
"loss": 0.2867,
"step": 2271
},
{
"epoch": 2.555993247045582,
"grad_norm": 0.21608909264471945,
"learning_rate": 8.176887776387151e-06,
"loss": 0.2843,
"step": 2272
},
{
"epoch": 2.5571187394485086,
"grad_norm": 0.22363745774157207,
"learning_rate": 8.156028368794328e-06,
"loss": 0.292,
"step": 2273
},
{
"epoch": 2.558244231851435,
"grad_norm": 0.1967157457122503,
"learning_rate": 8.135168961201502e-06,
"loss": 0.2785,
"step": 2274
},
{
"epoch": 2.559369724254361,
"grad_norm": 0.21025592482731642,
"learning_rate": 8.114309553608677e-06,
"loss": 0.2758,
"step": 2275
},
{
"epoch": 2.5604952166572876,
"grad_norm": 0.21847518826316134,
"learning_rate": 8.093450146015854e-06,
"loss": 0.2969,
"step": 2276
},
{
"epoch": 2.5616207090602137,
"grad_norm": 0.22602160924202305,
"learning_rate": 8.072590738423029e-06,
"loss": 0.2883,
"step": 2277
},
{
"epoch": 2.56274620146314,
"grad_norm": 0.20256712231044452,
"learning_rate": 8.051731330830204e-06,
"loss": 0.2695,
"step": 2278
},
{
"epoch": 2.5638716938660666,
"grad_norm": 0.20681151204540096,
"learning_rate": 8.030871923237381e-06,
"loss": 0.2654,
"step": 2279
},
{
"epoch": 2.5649971862689926,
"grad_norm": 0.23344582380587078,
"learning_rate": 8.010012515644557e-06,
"loss": 0.2986,
"step": 2280
},
{
"epoch": 2.566122678671919,
"grad_norm": 0.22256302367590555,
"learning_rate": 7.989153108051732e-06,
"loss": 0.2833,
"step": 2281
},
{
"epoch": 2.567248171074845,
"grad_norm": 0.20447744073654678,
"learning_rate": 7.968293700458907e-06,
"loss": 0.2859,
"step": 2282
},
{
"epoch": 2.5683736634777716,
"grad_norm": 0.20565529180207448,
"learning_rate": 7.947434292866084e-06,
"loss": 0.2742,
"step": 2283
},
{
"epoch": 2.5694991558806977,
"grad_norm": 0.21066765721313158,
"learning_rate": 7.926574885273258e-06,
"loss": 0.2944,
"step": 2284
},
{
"epoch": 2.570624648283624,
"grad_norm": 0.21517637060390432,
"learning_rate": 7.905715477680433e-06,
"loss": 0.2875,
"step": 2285
},
{
"epoch": 2.57175014068655,
"grad_norm": 0.21947956446898098,
"learning_rate": 7.88485607008761e-06,
"loss": 0.3004,
"step": 2286
},
{
"epoch": 2.5728756330894766,
"grad_norm": 0.22114557622949502,
"learning_rate": 7.863996662494785e-06,
"loss": 0.2976,
"step": 2287
},
{
"epoch": 2.574001125492403,
"grad_norm": 0.22379469537623312,
"learning_rate": 7.84313725490196e-06,
"loss": 0.2878,
"step": 2288
},
{
"epoch": 2.575126617895329,
"grad_norm": 0.2071839477449149,
"learning_rate": 7.822277847309138e-06,
"loss": 0.2795,
"step": 2289
},
{
"epoch": 2.5762521102982556,
"grad_norm": 0.2237931852947739,
"learning_rate": 7.801418439716313e-06,
"loss": 0.2971,
"step": 2290
},
{
"epoch": 2.5773776027011817,
"grad_norm": 0.21266520141625195,
"learning_rate": 7.780559032123488e-06,
"loss": 0.2853,
"step": 2291
},
{
"epoch": 2.578503095104108,
"grad_norm": 0.2486160020515366,
"learning_rate": 7.759699624530664e-06,
"loss": 0.2961,
"step": 2292
},
{
"epoch": 2.579628587507034,
"grad_norm": 0.2068308805691666,
"learning_rate": 7.73884021693784e-06,
"loss": 0.2691,
"step": 2293
},
{
"epoch": 2.5807540799099606,
"grad_norm": 0.2230851463060974,
"learning_rate": 7.717980809345016e-06,
"loss": 0.2995,
"step": 2294
},
{
"epoch": 2.5818795723128867,
"grad_norm": 0.2374977031933618,
"learning_rate": 7.69712140175219e-06,
"loss": 0.283,
"step": 2295
},
{
"epoch": 2.583005064715813,
"grad_norm": 0.24062860705542086,
"learning_rate": 7.676261994159366e-06,
"loss": 0.2957,
"step": 2296
},
{
"epoch": 2.5841305571187396,
"grad_norm": 0.20537260389777368,
"learning_rate": 7.655402586566542e-06,
"loss": 0.2921,
"step": 2297
},
{
"epoch": 2.5852560495216657,
"grad_norm": 0.21853998967769137,
"learning_rate": 7.634543178973717e-06,
"loss": 0.2894,
"step": 2298
},
{
"epoch": 2.586381541924592,
"grad_norm": 0.21880735610653707,
"learning_rate": 7.613683771380893e-06,
"loss": 0.2791,
"step": 2299
},
{
"epoch": 2.587507034327518,
"grad_norm": 0.22402757654717384,
"learning_rate": 7.592824363788069e-06,
"loss": 0.289,
"step": 2300
},
{
"epoch": 2.5886325267304446,
"grad_norm": 0.2147892961394563,
"learning_rate": 7.5719649561952445e-06,
"loss": 0.2812,
"step": 2301
},
{
"epoch": 2.589758019133371,
"grad_norm": 0.22876144329979556,
"learning_rate": 7.551105548602421e-06,
"loss": 0.2933,
"step": 2302
},
{
"epoch": 2.590883511536297,
"grad_norm": 0.23442708833949216,
"learning_rate": 7.530246141009596e-06,
"loss": 0.2814,
"step": 2303
},
{
"epoch": 2.592009003939223,
"grad_norm": 0.23484614963727998,
"learning_rate": 7.509386733416772e-06,
"loss": 0.3144,
"step": 2304
},
{
"epoch": 2.5931344963421497,
"grad_norm": 0.20921707796315442,
"learning_rate": 7.4885273258239465e-06,
"loss": 0.286,
"step": 2305
},
{
"epoch": 2.594259988745076,
"grad_norm": 0.22775240735379326,
"learning_rate": 7.467667918231122e-06,
"loss": 0.2788,
"step": 2306
},
{
"epoch": 2.595385481148002,
"grad_norm": 0.22230000059940203,
"learning_rate": 7.446808510638298e-06,
"loss": 0.2815,
"step": 2307
},
{
"epoch": 2.5965109735509286,
"grad_norm": 0.24545298078462746,
"learning_rate": 7.425949103045473e-06,
"loss": 0.2735,
"step": 2308
},
{
"epoch": 2.5976364659538547,
"grad_norm": 0.19625632990047406,
"learning_rate": 7.405089695452649e-06,
"loss": 0.2731,
"step": 2309
},
{
"epoch": 2.598761958356781,
"grad_norm": 0.20900090173879718,
"learning_rate": 7.3842302878598255e-06,
"loss": 0.2721,
"step": 2310
},
{
"epoch": 2.5998874507597076,
"grad_norm": 0.2123621289927944,
"learning_rate": 7.363370880267001e-06,
"loss": 0.2698,
"step": 2311
},
{
"epoch": 2.6010129431626337,
"grad_norm": 0.21369756285267333,
"learning_rate": 7.342511472674177e-06,
"loss": 0.2834,
"step": 2312
},
{
"epoch": 2.6021384355655597,
"grad_norm": 0.22793059698710658,
"learning_rate": 7.321652065081352e-06,
"loss": 0.2851,
"step": 2313
},
{
"epoch": 2.603263927968486,
"grad_norm": 0.2134184284204459,
"learning_rate": 7.300792657488528e-06,
"loss": 0.2811,
"step": 2314
},
{
"epoch": 2.6043894203714126,
"grad_norm": 0.21325834093200643,
"learning_rate": 7.279933249895703e-06,
"loss": 0.2936,
"step": 2315
},
{
"epoch": 2.6055149127743387,
"grad_norm": 0.1991068712411994,
"learning_rate": 7.259073842302878e-06,
"loss": 0.2834,
"step": 2316
},
{
"epoch": 2.606640405177265,
"grad_norm": 0.22901278666536629,
"learning_rate": 7.238214434710054e-06,
"loss": 0.3015,
"step": 2317
},
{
"epoch": 2.607765897580191,
"grad_norm": 0.21881134986820416,
"learning_rate": 7.21735502711723e-06,
"loss": 0.2876,
"step": 2318
},
{
"epoch": 2.6088913899831176,
"grad_norm": 0.22029025156059676,
"learning_rate": 7.196495619524406e-06,
"loss": 0.2988,
"step": 2319
},
{
"epoch": 2.610016882386044,
"grad_norm": 0.21007112294863065,
"learning_rate": 7.175636211931582e-06,
"loss": 0.2763,
"step": 2320
},
{
"epoch": 2.61114237478897,
"grad_norm": 0.2126401817051627,
"learning_rate": 7.154776804338757e-06,
"loss": 0.2805,
"step": 2321
},
{
"epoch": 2.612267867191896,
"grad_norm": 0.20852511391858303,
"learning_rate": 7.133917396745933e-06,
"loss": 0.2936,
"step": 2322
},
{
"epoch": 2.6133933595948227,
"grad_norm": 0.21781244059761962,
"learning_rate": 7.1130579891531085e-06,
"loss": 0.2892,
"step": 2323
},
{
"epoch": 2.614518851997749,
"grad_norm": 0.22501438470662116,
"learning_rate": 7.092198581560285e-06,
"loss": 0.2839,
"step": 2324
},
{
"epoch": 2.615644344400675,
"grad_norm": 0.20568012937631386,
"learning_rate": 7.07133917396746e-06,
"loss": 0.2927,
"step": 2325
},
{
"epoch": 2.6167698368036016,
"grad_norm": 0.21222804494470973,
"learning_rate": 7.050479766374634e-06,
"loss": 0.281,
"step": 2326
},
{
"epoch": 2.6178953292065277,
"grad_norm": 0.20938841222313492,
"learning_rate": 7.0296203587818105e-06,
"loss": 0.2845,
"step": 2327
},
{
"epoch": 2.619020821609454,
"grad_norm": 0.21620523521239354,
"learning_rate": 7.008760951188987e-06,
"loss": 0.2801,
"step": 2328
},
{
"epoch": 2.6201463140123806,
"grad_norm": 0.2506118426158015,
"learning_rate": 6.987901543596162e-06,
"loss": 0.2954,
"step": 2329
},
{
"epoch": 2.6212718064153067,
"grad_norm": 0.1973550955823624,
"learning_rate": 6.967042136003338e-06,
"loss": 0.2686,
"step": 2330
},
{
"epoch": 2.622397298818233,
"grad_norm": 0.2066937107804017,
"learning_rate": 6.946182728410513e-06,
"loss": 0.2774,
"step": 2331
},
{
"epoch": 2.623522791221159,
"grad_norm": 0.2202250524273311,
"learning_rate": 6.9253233208176895e-06,
"loss": 0.3068,
"step": 2332
},
{
"epoch": 2.6246482836240856,
"grad_norm": 0.21755861209547087,
"learning_rate": 6.904463913224865e-06,
"loss": 0.2723,
"step": 2333
},
{
"epoch": 2.6257737760270117,
"grad_norm": 0.21926633058957373,
"learning_rate": 6.883604505632041e-06,
"loss": 0.2903,
"step": 2334
},
{
"epoch": 2.626899268429938,
"grad_norm": 0.2130377637928427,
"learning_rate": 6.862745098039216e-06,
"loss": 0.2804,
"step": 2335
},
{
"epoch": 2.628024760832864,
"grad_norm": 0.19225627811370669,
"learning_rate": 6.841885690446391e-06,
"loss": 0.2887,
"step": 2336
},
{
"epoch": 2.6291502532357907,
"grad_norm": 0.20057254466754687,
"learning_rate": 6.821026282853567e-06,
"loss": 0.2837,
"step": 2337
},
{
"epoch": 2.630275745638717,
"grad_norm": 0.23011758988414296,
"learning_rate": 6.800166875260743e-06,
"loss": 0.2931,
"step": 2338
},
{
"epoch": 2.631401238041643,
"grad_norm": 0.23666344897934585,
"learning_rate": 6.779307467667918e-06,
"loss": 0.2782,
"step": 2339
},
{
"epoch": 2.6325267304445696,
"grad_norm": 0.2147014731643971,
"learning_rate": 6.758448060075094e-06,
"loss": 0.2912,
"step": 2340
},
{
"epoch": 2.6336522228474957,
"grad_norm": 0.21234094073131748,
"learning_rate": 6.73758865248227e-06,
"loss": 0.2914,
"step": 2341
},
{
"epoch": 2.634777715250422,
"grad_norm": 0.20060222937545014,
"learning_rate": 6.716729244889446e-06,
"loss": 0.2766,
"step": 2342
},
{
"epoch": 2.635903207653348,
"grad_norm": 0.21917036852395436,
"learning_rate": 6.695869837296621e-06,
"loss": 0.3013,
"step": 2343
},
{
"epoch": 2.6370287000562747,
"grad_norm": 0.20864615144591028,
"learning_rate": 6.675010429703797e-06,
"loss": 0.2792,
"step": 2344
},
{
"epoch": 2.6381541924592007,
"grad_norm": 0.2192912221143167,
"learning_rate": 6.6541510221109725e-06,
"loss": 0.284,
"step": 2345
},
{
"epoch": 2.639279684862127,
"grad_norm": 0.2039309149630558,
"learning_rate": 6.633291614518149e-06,
"loss": 0.2948,
"step": 2346
},
{
"epoch": 2.6404051772650536,
"grad_norm": 0.22259191163490286,
"learning_rate": 6.612432206925323e-06,
"loss": 0.2737,
"step": 2347
},
{
"epoch": 2.6415306696679797,
"grad_norm": 0.2419705952514684,
"learning_rate": 6.591572799332499e-06,
"loss": 0.2894,
"step": 2348
},
{
"epoch": 2.642656162070906,
"grad_norm": 0.20985587856472956,
"learning_rate": 6.5707133917396745e-06,
"loss": 0.2947,
"step": 2349
},
{
"epoch": 2.643781654473832,
"grad_norm": 0.20042601124344012,
"learning_rate": 6.549853984146851e-06,
"loss": 0.271,
"step": 2350
},
{
"epoch": 2.6449071468767587,
"grad_norm": 0.20416712565695233,
"learning_rate": 6.528994576554026e-06,
"loss": 0.2743,
"step": 2351
},
{
"epoch": 2.646032639279685,
"grad_norm": 0.2184086174145368,
"learning_rate": 6.508135168961202e-06,
"loss": 0.3015,
"step": 2352
},
{
"epoch": 2.647158131682611,
"grad_norm": 0.24131101578961572,
"learning_rate": 6.487275761368377e-06,
"loss": 0.2775,
"step": 2353
},
{
"epoch": 2.648283624085537,
"grad_norm": 0.22701304755475593,
"learning_rate": 6.4664163537755535e-06,
"loss": 0.2857,
"step": 2354
},
{
"epoch": 2.6494091164884637,
"grad_norm": 0.20712379248467747,
"learning_rate": 6.445556946182729e-06,
"loss": 0.2758,
"step": 2355
},
{
"epoch": 2.65053460889139,
"grad_norm": 0.23341164321770264,
"learning_rate": 6.424697538589905e-06,
"loss": 0.268,
"step": 2356
},
{
"epoch": 2.651660101294316,
"grad_norm": 0.24310923952152994,
"learning_rate": 6.403838130997079e-06,
"loss": 0.2833,
"step": 2357
},
{
"epoch": 2.6527855936972426,
"grad_norm": 0.229839416220484,
"learning_rate": 6.3829787234042555e-06,
"loss": 0.2889,
"step": 2358
},
{
"epoch": 2.6539110861001687,
"grad_norm": 0.23215272147883,
"learning_rate": 6.362119315811431e-06,
"loss": 0.2975,
"step": 2359
},
{
"epoch": 2.655036578503095,
"grad_norm": 0.24954803338960216,
"learning_rate": 6.341259908218607e-06,
"loss": 0.2946,
"step": 2360
},
{
"epoch": 2.6561620709060216,
"grad_norm": 0.23026522507576283,
"learning_rate": 6.320400500625782e-06,
"loss": 0.268,
"step": 2361
},
{
"epoch": 2.6572875633089477,
"grad_norm": 0.23021270773997743,
"learning_rate": 6.299541093032958e-06,
"loss": 0.261,
"step": 2362
},
{
"epoch": 2.6584130557118737,
"grad_norm": 0.21115861014586346,
"learning_rate": 6.278681685440134e-06,
"loss": 0.2771,
"step": 2363
},
{
"epoch": 2.6595385481148,
"grad_norm": 0.2405585243153947,
"learning_rate": 6.25782227784731e-06,
"loss": 0.2806,
"step": 2364
},
{
"epoch": 2.6606640405177266,
"grad_norm": 0.2497609269658003,
"learning_rate": 6.236962870254485e-06,
"loss": 0.2874,
"step": 2365
},
{
"epoch": 2.6617895329206527,
"grad_norm": 0.22645791008309762,
"learning_rate": 6.21610346266166e-06,
"loss": 0.3004,
"step": 2366
},
{
"epoch": 2.662915025323579,
"grad_norm": 0.2197914591989606,
"learning_rate": 6.1952440550688365e-06,
"loss": 0.2794,
"step": 2367
},
{
"epoch": 2.664040517726505,
"grad_norm": 0.22234883908095063,
"learning_rate": 6.174384647476012e-06,
"loss": 0.3004,
"step": 2368
},
{
"epoch": 2.6651660101294317,
"grad_norm": 0.24165293762861514,
"learning_rate": 6.153525239883188e-06,
"loss": 0.2822,
"step": 2369
},
{
"epoch": 2.666291502532358,
"grad_norm": 0.2552276571924829,
"learning_rate": 6.132665832290363e-06,
"loss": 0.2909,
"step": 2370
},
{
"epoch": 2.667416994935284,
"grad_norm": 0.21066092081346655,
"learning_rate": 6.1118064246975385e-06,
"loss": 0.2798,
"step": 2371
},
{
"epoch": 2.66854248733821,
"grad_norm": 0.2142596843222076,
"learning_rate": 6.090947017104715e-06,
"loss": 0.2776,
"step": 2372
},
{
"epoch": 2.6696679797411367,
"grad_norm": 0.24551341038876937,
"learning_rate": 6.07008760951189e-06,
"loss": 0.2865,
"step": 2373
},
{
"epoch": 2.670793472144063,
"grad_norm": 0.2340361094417635,
"learning_rate": 6.049228201919066e-06,
"loss": 0.2904,
"step": 2374
},
{
"epoch": 2.671918964546989,
"grad_norm": 0.21307302351051388,
"learning_rate": 6.028368794326241e-06,
"loss": 0.2937,
"step": 2375
},
{
"epoch": 2.6730444569499157,
"grad_norm": 0.2512900946420438,
"learning_rate": 6.007509386733417e-06,
"loss": 0.2842,
"step": 2376
},
{
"epoch": 2.6741699493528417,
"grad_norm": 0.20979466873445987,
"learning_rate": 5.986649979140593e-06,
"loss": 0.2931,
"step": 2377
},
{
"epoch": 2.675295441755768,
"grad_norm": 0.21119960362679138,
"learning_rate": 5.965790571547768e-06,
"loss": 0.2842,
"step": 2378
},
{
"epoch": 2.6764209341586946,
"grad_norm": 0.19883138313973867,
"learning_rate": 5.944931163954944e-06,
"loss": 0.2771,
"step": 2379
},
{
"epoch": 2.6775464265616207,
"grad_norm": 0.19968752507295803,
"learning_rate": 5.92407175636212e-06,
"loss": 0.2751,
"step": 2380
},
{
"epoch": 2.678671918964547,
"grad_norm": 0.22015736000540867,
"learning_rate": 5.903212348769295e-06,
"loss": 0.2799,
"step": 2381
},
{
"epoch": 2.679797411367473,
"grad_norm": 0.21339223053410786,
"learning_rate": 5.882352941176471e-06,
"loss": 0.2869,
"step": 2382
},
{
"epoch": 2.6809229037703997,
"grad_norm": 0.19762335590197813,
"learning_rate": 5.861493533583646e-06,
"loss": 0.2829,
"step": 2383
},
{
"epoch": 2.6820483961733257,
"grad_norm": 0.2074037589352283,
"learning_rate": 5.840634125990822e-06,
"loss": 0.2909,
"step": 2384
},
{
"epoch": 2.683173888576252,
"grad_norm": 0.2117788165142603,
"learning_rate": 5.8197747183979985e-06,
"loss": 0.2732,
"step": 2385
},
{
"epoch": 2.684299380979178,
"grad_norm": 0.23282240403764579,
"learning_rate": 5.798915310805173e-06,
"loss": 0.2939,
"step": 2386
},
{
"epoch": 2.6854248733821047,
"grad_norm": 0.21921536525716026,
"learning_rate": 5.778055903212349e-06,
"loss": 0.2844,
"step": 2387
},
{
"epoch": 2.686550365785031,
"grad_norm": 0.2104234762422923,
"learning_rate": 5.757196495619524e-06,
"loss": 0.2893,
"step": 2388
},
{
"epoch": 2.687675858187957,
"grad_norm": 0.20466965592113787,
"learning_rate": 5.7363370880267005e-06,
"loss": 0.2839,
"step": 2389
},
{
"epoch": 2.6888013505908837,
"grad_norm": 0.21641871937130808,
"learning_rate": 5.715477680433877e-06,
"loss": 0.2813,
"step": 2390
},
{
"epoch": 2.6899268429938097,
"grad_norm": 0.19783335996013016,
"learning_rate": 5.694618272841051e-06,
"loss": 0.2681,
"step": 2391
},
{
"epoch": 2.691052335396736,
"grad_norm": 0.23211934348643096,
"learning_rate": 5.673758865248227e-06,
"loss": 0.2737,
"step": 2392
},
{
"epoch": 2.692177827799662,
"grad_norm": 0.21211198195733516,
"learning_rate": 5.6528994576554025e-06,
"loss": 0.293,
"step": 2393
},
{
"epoch": 2.6933033202025887,
"grad_norm": 0.19763398145044694,
"learning_rate": 5.632040050062579e-06,
"loss": 0.2669,
"step": 2394
},
{
"epoch": 2.6944288126055147,
"grad_norm": 0.2084177545225768,
"learning_rate": 5.611180642469755e-06,
"loss": 0.2727,
"step": 2395
},
{
"epoch": 2.695554305008441,
"grad_norm": 0.23874699170463168,
"learning_rate": 5.590321234876929e-06,
"loss": 0.2871,
"step": 2396
},
{
"epoch": 2.6966797974113677,
"grad_norm": 0.22847881193172173,
"learning_rate": 5.569461827284105e-06,
"loss": 0.2704,
"step": 2397
},
{
"epoch": 2.6978052898142937,
"grad_norm": 0.21260192114331203,
"learning_rate": 5.548602419691281e-06,
"loss": 0.2765,
"step": 2398
},
{
"epoch": 2.69893078221722,
"grad_norm": 0.2481199298540293,
"learning_rate": 5.527743012098457e-06,
"loss": 0.2978,
"step": 2399
},
{
"epoch": 2.700056274620146,
"grad_norm": 0.22837599699280178,
"learning_rate": 5.506883604505633e-06,
"loss": 0.2778,
"step": 2400
},
{
"epoch": 2.7011817670230727,
"grad_norm": 0.20862397407681496,
"learning_rate": 5.486024196912807e-06,
"loss": 0.282,
"step": 2401
},
{
"epoch": 2.702307259425999,
"grad_norm": 0.21499220301713443,
"learning_rate": 5.4651647893199835e-06,
"loss": 0.2946,
"step": 2402
},
{
"epoch": 2.703432751828925,
"grad_norm": 0.21876537448943154,
"learning_rate": 5.444305381727159e-06,
"loss": 0.2999,
"step": 2403
},
{
"epoch": 2.704558244231851,
"grad_norm": 0.19584715347664308,
"learning_rate": 5.423445974134335e-06,
"loss": 0.2811,
"step": 2404
},
{
"epoch": 2.7056837366347777,
"grad_norm": 0.199638455026977,
"learning_rate": 5.402586566541511e-06,
"loss": 0.2922,
"step": 2405
},
{
"epoch": 2.706809229037704,
"grad_norm": 0.1959593267413218,
"learning_rate": 5.381727158948686e-06,
"loss": 0.2803,
"step": 2406
},
{
"epoch": 2.70793472144063,
"grad_norm": 0.2172940746080715,
"learning_rate": 5.360867751355862e-06,
"loss": 0.2998,
"step": 2407
},
{
"epoch": 2.7090602138435567,
"grad_norm": 0.21226736116643366,
"learning_rate": 5.340008343763037e-06,
"loss": 0.2793,
"step": 2408
},
{
"epoch": 2.7101857062464827,
"grad_norm": 0.2292349157751999,
"learning_rate": 5.319148936170213e-06,
"loss": 0.2925,
"step": 2409
},
{
"epoch": 2.711311198649409,
"grad_norm": 0.2065757335406282,
"learning_rate": 5.298289528577389e-06,
"loss": 0.2852,
"step": 2410
},
{
"epoch": 2.7124366910523356,
"grad_norm": 0.21318268769213233,
"learning_rate": 5.2774301209845645e-06,
"loss": 0.3101,
"step": 2411
},
{
"epoch": 2.7135621834552617,
"grad_norm": 0.19490619907298698,
"learning_rate": 5.25657071339174e-06,
"loss": 0.2845,
"step": 2412
},
{
"epoch": 2.7146876758581877,
"grad_norm": 0.20387752435372739,
"learning_rate": 5.235711305798915e-06,
"loss": 0.2891,
"step": 2413
},
{
"epoch": 2.715813168261114,
"grad_norm": 0.21901749886212424,
"learning_rate": 5.214851898206091e-06,
"loss": 0.2829,
"step": 2414
},
{
"epoch": 2.7169386606640407,
"grad_norm": 0.21269033641949117,
"learning_rate": 5.193992490613267e-06,
"loss": 0.2866,
"step": 2415
},
{
"epoch": 2.7180641530669667,
"grad_norm": 0.204672643305428,
"learning_rate": 5.173133083020443e-06,
"loss": 0.2928,
"step": 2416
},
{
"epoch": 2.719189645469893,
"grad_norm": 0.22695422804231133,
"learning_rate": 5.152273675427618e-06,
"loss": 0.3034,
"step": 2417
},
{
"epoch": 2.720315137872819,
"grad_norm": 0.20992539451243944,
"learning_rate": 5.131414267834793e-06,
"loss": 0.2756,
"step": 2418
},
{
"epoch": 2.7214406302757457,
"grad_norm": 0.23946364092165132,
"learning_rate": 5.110554860241969e-06,
"loss": 0.2619,
"step": 2419
},
{
"epoch": 2.722566122678672,
"grad_norm": 0.22133441280710264,
"learning_rate": 5.0896954526491455e-06,
"loss": 0.3024,
"step": 2420
},
{
"epoch": 2.723691615081598,
"grad_norm": 0.19132609073407808,
"learning_rate": 5.068836045056321e-06,
"loss": 0.2809,
"step": 2421
},
{
"epoch": 2.724817107484524,
"grad_norm": 0.20537504751422384,
"learning_rate": 5.047976637463496e-06,
"loss": 0.2981,
"step": 2422
},
{
"epoch": 2.7259425998874507,
"grad_norm": 0.2026641684698212,
"learning_rate": 5.027117229870671e-06,
"loss": 0.2831,
"step": 2423
},
{
"epoch": 2.727068092290377,
"grad_norm": 0.220392207872778,
"learning_rate": 5.0062578222778475e-06,
"loss": 0.2974,
"step": 2424
},
{
"epoch": 2.728193584693303,
"grad_norm": 0.20374793230025023,
"learning_rate": 4.985398414685024e-06,
"loss": 0.2843,
"step": 2425
},
{
"epoch": 2.7293190770962297,
"grad_norm": 0.2182187646308083,
"learning_rate": 4.964539007092199e-06,
"loss": 0.2827,
"step": 2426
},
{
"epoch": 2.7304445694991557,
"grad_norm": 0.20515095934912667,
"learning_rate": 4.943679599499374e-06,
"loss": 0.2823,
"step": 2427
},
{
"epoch": 2.731570061902082,
"grad_norm": 0.2274911617803538,
"learning_rate": 4.9228201919065495e-06,
"loss": 0.2874,
"step": 2428
},
{
"epoch": 2.7326955543050087,
"grad_norm": 0.20240468754950888,
"learning_rate": 4.901960784313726e-06,
"loss": 0.2901,
"step": 2429
},
{
"epoch": 2.7338210467079347,
"grad_norm": 0.21135908550005916,
"learning_rate": 4.881101376720902e-06,
"loss": 0.2792,
"step": 2430
},
{
"epoch": 2.734946539110861,
"grad_norm": 0.21034155921896855,
"learning_rate": 4.860241969128077e-06,
"loss": 0.2911,
"step": 2431
},
{
"epoch": 2.736072031513787,
"grad_norm": 0.2073170761162975,
"learning_rate": 4.839382561535253e-06,
"loss": 0.2768,
"step": 2432
},
{
"epoch": 2.7371975239167137,
"grad_norm": 0.20922141980047607,
"learning_rate": 4.818523153942428e-06,
"loss": 0.2818,
"step": 2433
},
{
"epoch": 2.7383230163196397,
"grad_norm": 0.20817722346637343,
"learning_rate": 4.797663746349604e-06,
"loss": 0.2799,
"step": 2434
},
{
"epoch": 2.739448508722566,
"grad_norm": 0.20367161604931538,
"learning_rate": 4.77680433875678e-06,
"loss": 0.3002,
"step": 2435
},
{
"epoch": 2.740574001125492,
"grad_norm": 0.20301772018260328,
"learning_rate": 4.755944931163955e-06,
"loss": 0.2734,
"step": 2436
},
{
"epoch": 2.7416994935284187,
"grad_norm": 0.19056871488718577,
"learning_rate": 4.735085523571131e-06,
"loss": 0.2753,
"step": 2437
},
{
"epoch": 2.742824985931345,
"grad_norm": 0.25569233009986714,
"learning_rate": 4.714226115978306e-06,
"loss": 0.2991,
"step": 2438
},
{
"epoch": 2.743950478334271,
"grad_norm": 0.20649414202471206,
"learning_rate": 4.693366708385482e-06,
"loss": 0.2754,
"step": 2439
},
{
"epoch": 2.7450759707371977,
"grad_norm": 0.20526360074859962,
"learning_rate": 4.672507300792658e-06,
"loss": 0.2846,
"step": 2440
},
{
"epoch": 2.7462014631401237,
"grad_norm": 0.19992362065471733,
"learning_rate": 4.651647893199833e-06,
"loss": 0.2883,
"step": 2441
},
{
"epoch": 2.74732695554305,
"grad_norm": 0.20760115590523082,
"learning_rate": 4.6307884856070095e-06,
"loss": 0.2734,
"step": 2442
},
{
"epoch": 2.748452447945976,
"grad_norm": 0.20022172726588305,
"learning_rate": 4.609929078014184e-06,
"loss": 0.2746,
"step": 2443
},
{
"epoch": 2.7495779403489027,
"grad_norm": 0.21742210771505113,
"learning_rate": 4.58906967042136e-06,
"loss": 0.2712,
"step": 2444
},
{
"epoch": 2.7507034327518287,
"grad_norm": 0.1894221520336593,
"learning_rate": 4.568210262828536e-06,
"loss": 0.269,
"step": 2445
},
{
"epoch": 2.751828925154755,
"grad_norm": 0.2014338015112663,
"learning_rate": 4.5473508552357115e-06,
"loss": 0.2754,
"step": 2446
},
{
"epoch": 2.7529544175576817,
"grad_norm": 0.2767717819280576,
"learning_rate": 4.526491447642888e-06,
"loss": 0.2704,
"step": 2447
},
{
"epoch": 2.7540799099606077,
"grad_norm": 0.2380476325461093,
"learning_rate": 4.505632040050062e-06,
"loss": 0.2861,
"step": 2448
},
{
"epoch": 2.755205402363534,
"grad_norm": 0.21164580868587568,
"learning_rate": 4.484772632457238e-06,
"loss": 0.2768,
"step": 2449
},
{
"epoch": 2.75633089476646,
"grad_norm": 0.2008040936085221,
"learning_rate": 4.463913224864414e-06,
"loss": 0.275,
"step": 2450
},
{
"epoch": 2.7574563871693867,
"grad_norm": 0.227682709177952,
"learning_rate": 4.44305381727159e-06,
"loss": 0.3032,
"step": 2451
},
{
"epoch": 2.758581879572313,
"grad_norm": 0.24775815808302365,
"learning_rate": 4.422194409678766e-06,
"loss": 0.2952,
"step": 2452
},
{
"epoch": 2.759707371975239,
"grad_norm": 0.23280742706720892,
"learning_rate": 4.401335002085941e-06,
"loss": 0.2773,
"step": 2453
},
{
"epoch": 2.760832864378165,
"grad_norm": 0.19451290711254568,
"learning_rate": 4.380475594493116e-06,
"loss": 0.2693,
"step": 2454
},
{
"epoch": 2.7619583567810917,
"grad_norm": 0.20939579388836216,
"learning_rate": 4.3596161869002925e-06,
"loss": 0.2866,
"step": 2455
},
{
"epoch": 2.763083849184018,
"grad_norm": 0.20672270738688484,
"learning_rate": 4.338756779307468e-06,
"loss": 0.2904,
"step": 2456
},
{
"epoch": 2.764209341586944,
"grad_norm": 0.19033409152598357,
"learning_rate": 4.317897371714644e-06,
"loss": 0.2718,
"step": 2457
},
{
"epoch": 2.7653348339898707,
"grad_norm": 0.2166789978324445,
"learning_rate": 4.297037964121819e-06,
"loss": 0.2983,
"step": 2458
},
{
"epoch": 2.7664603263927967,
"grad_norm": 0.216447417269072,
"learning_rate": 4.2761785565289945e-06,
"loss": 0.2858,
"step": 2459
},
{
"epoch": 2.767585818795723,
"grad_norm": 0.362675866181273,
"learning_rate": 4.255319148936171e-06,
"loss": 0.2881,
"step": 2460
},
{
"epoch": 2.7687113111986497,
"grad_norm": 0.19521272525143493,
"learning_rate": 4.234459741343346e-06,
"loss": 0.2724,
"step": 2461
},
{
"epoch": 2.7698368036015757,
"grad_norm": 0.19682316401858674,
"learning_rate": 4.213600333750522e-06,
"loss": 0.2859,
"step": 2462
},
{
"epoch": 2.7709622960045017,
"grad_norm": 0.21277271670047132,
"learning_rate": 4.192740926157697e-06,
"loss": 0.2974,
"step": 2463
},
{
"epoch": 2.772087788407428,
"grad_norm": 0.21323098001422092,
"learning_rate": 4.171881518564873e-06,
"loss": 0.2858,
"step": 2464
},
{
"epoch": 2.7732132808103547,
"grad_norm": 0.2443899119261561,
"learning_rate": 4.151022110972049e-06,
"loss": 0.2851,
"step": 2465
},
{
"epoch": 2.7743387732132807,
"grad_norm": 0.2139564808006101,
"learning_rate": 4.130162703379224e-06,
"loss": 0.2949,
"step": 2466
},
{
"epoch": 2.775464265616207,
"grad_norm": 0.2212119000303061,
"learning_rate": 4.1093032957864e-06,
"loss": 0.2751,
"step": 2467
},
{
"epoch": 2.776589758019133,
"grad_norm": 0.20484020499228098,
"learning_rate": 4.0884438881935755e-06,
"loss": 0.2771,
"step": 2468
},
{
"epoch": 2.7777152504220597,
"grad_norm": 0.20123952910830462,
"learning_rate": 4.067584480600751e-06,
"loss": 0.3006,
"step": 2469
},
{
"epoch": 2.778840742824986,
"grad_norm": 0.21577294384729115,
"learning_rate": 4.046725073007927e-06,
"loss": 0.2996,
"step": 2470
},
{
"epoch": 2.779966235227912,
"grad_norm": 0.22582357217712795,
"learning_rate": 4.025865665415102e-06,
"loss": 0.2717,
"step": 2471
},
{
"epoch": 2.7810917276308382,
"grad_norm": 0.2134397866045082,
"learning_rate": 4.005006257822278e-06,
"loss": 0.2825,
"step": 2472
},
{
"epoch": 2.7822172200337647,
"grad_norm": 0.20324964622528435,
"learning_rate": 3.984146850229454e-06,
"loss": 0.2799,
"step": 2473
},
{
"epoch": 2.783342712436691,
"grad_norm": 0.20795174662693527,
"learning_rate": 3.963287442636629e-06,
"loss": 0.2854,
"step": 2474
},
{
"epoch": 2.784468204839617,
"grad_norm": 0.1997012004956189,
"learning_rate": 3.942428035043805e-06,
"loss": 0.2764,
"step": 2475
},
{
"epoch": 2.7855936972425437,
"grad_norm": 0.2096005762129211,
"learning_rate": 3.92156862745098e-06,
"loss": 0.276,
"step": 2476
},
{
"epoch": 2.7867191896454697,
"grad_norm": 0.2177380985810915,
"learning_rate": 3.9007092198581565e-06,
"loss": 0.2893,
"step": 2477
},
{
"epoch": 2.787844682048396,
"grad_norm": 0.22482324020736472,
"learning_rate": 3.879849812265332e-06,
"loss": 0.2657,
"step": 2478
},
{
"epoch": 2.7889701744513227,
"grad_norm": 0.19885926541241514,
"learning_rate": 3.858990404672508e-06,
"loss": 0.2884,
"step": 2479
},
{
"epoch": 2.7900956668542487,
"grad_norm": 0.19908222649936524,
"learning_rate": 3.838130997079683e-06,
"loss": 0.2742,
"step": 2480
},
{
"epoch": 2.791221159257175,
"grad_norm": 0.20760067726563736,
"learning_rate": 3.8172715894868585e-06,
"loss": 0.296,
"step": 2481
},
{
"epoch": 2.792346651660101,
"grad_norm": 0.2533774016177821,
"learning_rate": 3.7964121818940346e-06,
"loss": 0.2861,
"step": 2482
},
{
"epoch": 2.7934721440630277,
"grad_norm": 0.21052580972405535,
"learning_rate": 3.7755527743012103e-06,
"loss": 0.288,
"step": 2483
},
{
"epoch": 2.7945976364659537,
"grad_norm": 0.206699855575665,
"learning_rate": 3.754693366708386e-06,
"loss": 0.288,
"step": 2484
},
{
"epoch": 2.79572312886888,
"grad_norm": 0.19585545352446415,
"learning_rate": 3.733833959115561e-06,
"loss": 0.2747,
"step": 2485
},
{
"epoch": 2.7968486212718062,
"grad_norm": 0.2062750655598761,
"learning_rate": 3.7129745515227366e-06,
"loss": 0.275,
"step": 2486
},
{
"epoch": 2.7979741136747327,
"grad_norm": 0.19740523384916275,
"learning_rate": 3.6921151439299128e-06,
"loss": 0.2762,
"step": 2487
},
{
"epoch": 2.799099606077659,
"grad_norm": 0.23460327350417823,
"learning_rate": 3.6712557363370885e-06,
"loss": 0.2663,
"step": 2488
},
{
"epoch": 2.800225098480585,
"grad_norm": 0.27745757559360934,
"learning_rate": 3.650396328744264e-06,
"loss": 0.318,
"step": 2489
},
{
"epoch": 2.8013505908835117,
"grad_norm": 0.20040376636692844,
"learning_rate": 3.629536921151439e-06,
"loss": 0.2947,
"step": 2490
},
{
"epoch": 2.8024760832864377,
"grad_norm": 0.2065083158200927,
"learning_rate": 3.608677513558615e-06,
"loss": 0.2856,
"step": 2491
},
{
"epoch": 2.803601575689364,
"grad_norm": 0.2609652931637163,
"learning_rate": 3.587818105965791e-06,
"loss": 0.2778,
"step": 2492
},
{
"epoch": 2.80472706809229,
"grad_norm": 0.19707420225094988,
"learning_rate": 3.5669586983729666e-06,
"loss": 0.2847,
"step": 2493
},
{
"epoch": 2.8058525604952167,
"grad_norm": 0.214409640748111,
"learning_rate": 3.5460992907801423e-06,
"loss": 0.2728,
"step": 2494
},
{
"epoch": 2.8069780528981427,
"grad_norm": 0.21397007514173588,
"learning_rate": 3.525239883187317e-06,
"loss": 0.2939,
"step": 2495
},
{
"epoch": 2.808103545301069,
"grad_norm": 0.2211780277225568,
"learning_rate": 3.5043804755944933e-06,
"loss": 0.2941,
"step": 2496
},
{
"epoch": 2.8092290377039957,
"grad_norm": 0.19746449192767923,
"learning_rate": 3.483521068001669e-06,
"loss": 0.2865,
"step": 2497
},
{
"epoch": 2.8103545301069217,
"grad_norm": 0.24933709829603812,
"learning_rate": 3.4626616604088447e-06,
"loss": 0.292,
"step": 2498
},
{
"epoch": 2.811480022509848,
"grad_norm": 0.20025420830978435,
"learning_rate": 3.4418022528160205e-06,
"loss": 0.2936,
"step": 2499
},
{
"epoch": 2.812605514912774,
"grad_norm": 0.21106860467217742,
"learning_rate": 3.4209428452231953e-06,
"loss": 0.2749,
"step": 2500
},
{
"epoch": 2.8137310073157007,
"grad_norm": 0.244173358210336,
"learning_rate": 3.4000834376303715e-06,
"loss": 0.2921,
"step": 2501
},
{
"epoch": 2.814856499718627,
"grad_norm": 0.23708488619881102,
"learning_rate": 3.379224030037547e-06,
"loss": 0.286,
"step": 2502
},
{
"epoch": 2.815981992121553,
"grad_norm": 0.20203389839535874,
"learning_rate": 3.358364622444723e-06,
"loss": 0.2823,
"step": 2503
},
{
"epoch": 2.8171074845244792,
"grad_norm": 0.20180941230046498,
"learning_rate": 3.3375052148518986e-06,
"loss": 0.2795,
"step": 2504
},
{
"epoch": 2.8182329769274057,
"grad_norm": 0.1923974204466329,
"learning_rate": 3.3166458072590743e-06,
"loss": 0.2719,
"step": 2505
},
{
"epoch": 2.819358469330332,
"grad_norm": 0.210276870318239,
"learning_rate": 3.2957863996662496e-06,
"loss": 0.2831,
"step": 2506
},
{
"epoch": 2.820483961733258,
"grad_norm": 0.21238988385557064,
"learning_rate": 3.2749269920734253e-06,
"loss": 0.278,
"step": 2507
},
{
"epoch": 2.8216094541361847,
"grad_norm": 0.19338349502478416,
"learning_rate": 3.254067584480601e-06,
"loss": 0.2717,
"step": 2508
},
{
"epoch": 2.8227349465391107,
"grad_norm": 0.19141784353147265,
"learning_rate": 3.2332081768877767e-06,
"loss": 0.2687,
"step": 2509
},
{
"epoch": 2.823860438942037,
"grad_norm": 0.21299210269078242,
"learning_rate": 3.2123487692949525e-06,
"loss": 0.273,
"step": 2510
},
{
"epoch": 2.8249859313449637,
"grad_norm": 0.19645234999837438,
"learning_rate": 3.1914893617021277e-06,
"loss": 0.274,
"step": 2511
},
{
"epoch": 2.8261114237478897,
"grad_norm": 0.2154736738045129,
"learning_rate": 3.1706299541093035e-06,
"loss": 0.2798,
"step": 2512
},
{
"epoch": 2.8272369161508157,
"grad_norm": 0.20747340932057978,
"learning_rate": 3.149770546516479e-06,
"loss": 0.2911,
"step": 2513
},
{
"epoch": 2.828362408553742,
"grad_norm": 0.21668977371189152,
"learning_rate": 3.128911138923655e-06,
"loss": 0.2809,
"step": 2514
},
{
"epoch": 2.8294879009566687,
"grad_norm": 0.19752342856018376,
"learning_rate": 3.10805173133083e-06,
"loss": 0.2913,
"step": 2515
},
{
"epoch": 2.8306133933595947,
"grad_norm": 0.2129254469727926,
"learning_rate": 3.087192323738006e-06,
"loss": 0.2991,
"step": 2516
},
{
"epoch": 2.831738885762521,
"grad_norm": 0.2135496695763609,
"learning_rate": 3.0663329161451816e-06,
"loss": 0.3121,
"step": 2517
},
{
"epoch": 2.8328643781654472,
"grad_norm": 0.20217705970631564,
"learning_rate": 3.0454735085523573e-06,
"loss": 0.2699,
"step": 2518
},
{
"epoch": 2.8339898705683737,
"grad_norm": 0.18655748638369685,
"learning_rate": 3.024614100959533e-06,
"loss": 0.281,
"step": 2519
},
{
"epoch": 2.8351153629713,
"grad_norm": 0.20710373842178917,
"learning_rate": 3.0037546933667083e-06,
"loss": 0.2818,
"step": 2520
},
{
"epoch": 2.836240855374226,
"grad_norm": 0.1889143314236119,
"learning_rate": 2.982895285773884e-06,
"loss": 0.2686,
"step": 2521
},
{
"epoch": 2.8373663477771522,
"grad_norm": 0.1916678750234142,
"learning_rate": 2.96203587818106e-06,
"loss": 0.28,
"step": 2522
},
{
"epoch": 2.8384918401800787,
"grad_norm": 0.19425390855255784,
"learning_rate": 2.9411764705882355e-06,
"loss": 0.2979,
"step": 2523
},
{
"epoch": 2.839617332583005,
"grad_norm": 0.20681634843217112,
"learning_rate": 2.920317062995411e-06,
"loss": 0.2871,
"step": 2524
},
{
"epoch": 2.8407428249859312,
"grad_norm": 0.20023152119316995,
"learning_rate": 2.8994576554025865e-06,
"loss": 0.2801,
"step": 2525
},
{
"epoch": 2.8418683173888577,
"grad_norm": 0.20177659743093546,
"learning_rate": 2.878598247809762e-06,
"loss": 0.2816,
"step": 2526
},
{
"epoch": 2.8429938097917837,
"grad_norm": 0.19585899941288504,
"learning_rate": 2.8577388402169383e-06,
"loss": 0.2679,
"step": 2527
},
{
"epoch": 2.84411930219471,
"grad_norm": 0.17859425580387345,
"learning_rate": 2.8368794326241136e-06,
"loss": 0.2707,
"step": 2528
},
{
"epoch": 2.8452447945976367,
"grad_norm": 0.2039828332143888,
"learning_rate": 2.8160200250312893e-06,
"loss": 0.2991,
"step": 2529
},
{
"epoch": 2.8463702870005627,
"grad_norm": 0.2030074794985201,
"learning_rate": 2.7951606174384646e-06,
"loss": 0.2793,
"step": 2530
},
{
"epoch": 2.847495779403489,
"grad_norm": 0.19766777443355418,
"learning_rate": 2.7743012098456403e-06,
"loss": 0.2823,
"step": 2531
},
{
"epoch": 2.8486212718064152,
"grad_norm": 0.19892663863890306,
"learning_rate": 2.7534418022528165e-06,
"loss": 0.2806,
"step": 2532
},
{
"epoch": 2.8497467642093417,
"grad_norm": 0.21275315081509835,
"learning_rate": 2.7325823946599917e-06,
"loss": 0.2701,
"step": 2533
},
{
"epoch": 2.8508722566122677,
"grad_norm": 0.21033439554694824,
"learning_rate": 2.7117229870671675e-06,
"loss": 0.2854,
"step": 2534
},
{
"epoch": 2.851997749015194,
"grad_norm": 0.19767645497600447,
"learning_rate": 2.690863579474343e-06,
"loss": 0.288,
"step": 2535
},
{
"epoch": 2.8531232414181202,
"grad_norm": 0.2081071860608737,
"learning_rate": 2.6700041718815185e-06,
"loss": 0.2853,
"step": 2536
},
{
"epoch": 2.8542487338210467,
"grad_norm": 0.19156924866913874,
"learning_rate": 2.6491447642886946e-06,
"loss": 0.2768,
"step": 2537
},
{
"epoch": 2.855374226223973,
"grad_norm": 0.21948381030849073,
"learning_rate": 2.62828535669587e-06,
"loss": 0.2862,
"step": 2538
},
{
"epoch": 2.856499718626899,
"grad_norm": 0.194801788917978,
"learning_rate": 2.6074259491030456e-06,
"loss": 0.2797,
"step": 2539
},
{
"epoch": 2.8576252110298257,
"grad_norm": 0.22618411821723028,
"learning_rate": 2.5865665415102213e-06,
"loss": 0.3074,
"step": 2540
},
{
"epoch": 2.8587507034327517,
"grad_norm": 0.20254836873582036,
"learning_rate": 2.5657071339173966e-06,
"loss": 0.2699,
"step": 2541
},
{
"epoch": 2.859876195835678,
"grad_norm": 0.19524970967882507,
"learning_rate": 2.5448477263245727e-06,
"loss": 0.2781,
"step": 2542
},
{
"epoch": 2.8610016882386042,
"grad_norm": 0.19523879122973248,
"learning_rate": 2.523988318731748e-06,
"loss": 0.2901,
"step": 2543
},
{
"epoch": 2.8621271806415307,
"grad_norm": 0.19385408771091103,
"learning_rate": 2.5031289111389237e-06,
"loss": 0.2861,
"step": 2544
},
{
"epoch": 2.8632526730444567,
"grad_norm": 0.19805715117689787,
"learning_rate": 2.4822695035460995e-06,
"loss": 0.2692,
"step": 2545
},
{
"epoch": 2.864378165447383,
"grad_norm": 0.19729671382210393,
"learning_rate": 2.4614100959532747e-06,
"loss": 0.293,
"step": 2546
},
{
"epoch": 2.8655036578503097,
"grad_norm": 0.19120385629590778,
"learning_rate": 2.440550688360451e-06,
"loss": 0.2809,
"step": 2547
},
{
"epoch": 2.8666291502532357,
"grad_norm": 0.20169331917856845,
"learning_rate": 2.4196912807676266e-06,
"loss": 0.2865,
"step": 2548
},
{
"epoch": 2.867754642656162,
"grad_norm": 0.20308102680675588,
"learning_rate": 2.398831873174802e-06,
"loss": 0.292,
"step": 2549
},
{
"epoch": 2.8688801350590882,
"grad_norm": 0.21920991074207272,
"learning_rate": 2.3779724655819776e-06,
"loss": 0.2769,
"step": 2550
},
{
"epoch": 2.8700056274620147,
"grad_norm": 0.1909570999276725,
"learning_rate": 2.357113057989153e-06,
"loss": 0.2826,
"step": 2551
},
{
"epoch": 2.871131119864941,
"grad_norm": 0.20820870859741275,
"learning_rate": 2.336253650396329e-06,
"loss": 0.2811,
"step": 2552
},
{
"epoch": 2.872256612267867,
"grad_norm": 0.19636795603937476,
"learning_rate": 2.3153942428035047e-06,
"loss": 0.2817,
"step": 2553
},
{
"epoch": 2.8733821046707932,
"grad_norm": 0.2072299777143624,
"learning_rate": 2.29453483521068e-06,
"loss": 0.2782,
"step": 2554
},
{
"epoch": 2.8745075970737197,
"grad_norm": 0.21169030898396585,
"learning_rate": 2.2736754276178557e-06,
"loss": 0.2674,
"step": 2555
},
{
"epoch": 2.875633089476646,
"grad_norm": 0.21706530639456728,
"learning_rate": 2.252816020025031e-06,
"loss": 0.2759,
"step": 2556
},
{
"epoch": 2.8767585818795722,
"grad_norm": 0.2028602319447674,
"learning_rate": 2.231956612432207e-06,
"loss": 0.2785,
"step": 2557
},
{
"epoch": 2.8778840742824987,
"grad_norm": 0.19144432960054086,
"learning_rate": 2.211097204839383e-06,
"loss": 0.2833,
"step": 2558
},
{
"epoch": 2.8790095666854247,
"grad_norm": 0.2093653426820825,
"learning_rate": 2.190237797246558e-06,
"loss": 0.2815,
"step": 2559
},
{
"epoch": 2.880135059088351,
"grad_norm": 0.18440025227997578,
"learning_rate": 2.169378389653734e-06,
"loss": 0.2786,
"step": 2560
},
{
"epoch": 2.8812605514912777,
"grad_norm": 0.19656732367474106,
"learning_rate": 2.1485189820609096e-06,
"loss": 0.2803,
"step": 2561
},
{
"epoch": 2.8823860438942037,
"grad_norm": 0.19783394536320123,
"learning_rate": 2.1276595744680853e-06,
"loss": 0.2812,
"step": 2562
},
{
"epoch": 2.8835115362971298,
"grad_norm": 0.18647997820338216,
"learning_rate": 2.106800166875261e-06,
"loss": 0.2781,
"step": 2563
},
{
"epoch": 2.8846370287000562,
"grad_norm": 0.19982575823634316,
"learning_rate": 2.0859407592824363e-06,
"loss": 0.2816,
"step": 2564
},
{
"epoch": 2.8857625211029827,
"grad_norm": 0.195813225255546,
"learning_rate": 2.065081351689612e-06,
"loss": 0.2788,
"step": 2565
},
{
"epoch": 2.8868880135059087,
"grad_norm": 0.2043720027738115,
"learning_rate": 2.0442219440967877e-06,
"loss": 0.286,
"step": 2566
},
{
"epoch": 2.888013505908835,
"grad_norm": 0.19093315445014475,
"learning_rate": 2.0233625365039634e-06,
"loss": 0.2707,
"step": 2567
},
{
"epoch": 2.8891389983117612,
"grad_norm": 0.21027820905380143,
"learning_rate": 2.002503128911139e-06,
"loss": 0.3025,
"step": 2568
},
{
"epoch": 2.8902644907146877,
"grad_norm": 0.2026225980219709,
"learning_rate": 1.9816437213183145e-06,
"loss": 0.2853,
"step": 2569
},
{
"epoch": 2.891389983117614,
"grad_norm": 0.1861779080477095,
"learning_rate": 1.96078431372549e-06,
"loss": 0.2783,
"step": 2570
},
{
"epoch": 2.8925154755205402,
"grad_norm": 0.20932901602980997,
"learning_rate": 1.939924906132666e-06,
"loss": 0.2885,
"step": 2571
},
{
"epoch": 2.8936409679234663,
"grad_norm": 0.2073789548556899,
"learning_rate": 1.9190654985398416e-06,
"loss": 0.2801,
"step": 2572
},
{
"epoch": 2.8947664603263927,
"grad_norm": 0.18609584484982244,
"learning_rate": 1.8982060909470173e-06,
"loss": 0.2744,
"step": 2573
},
{
"epoch": 2.895891952729319,
"grad_norm": 0.1812033482553102,
"learning_rate": 1.877346683354193e-06,
"loss": 0.2859,
"step": 2574
},
{
"epoch": 2.8970174451322452,
"grad_norm": 0.20994333414010466,
"learning_rate": 1.8564872757613683e-06,
"loss": 0.2904,
"step": 2575
},
{
"epoch": 2.8981429375351717,
"grad_norm": 0.19743578503548526,
"learning_rate": 1.8356278681685442e-06,
"loss": 0.2761,
"step": 2576
},
{
"epoch": 2.8992684299380977,
"grad_norm": 0.19166570858524684,
"learning_rate": 1.8147684605757195e-06,
"loss": 0.2918,
"step": 2577
},
{
"epoch": 2.9003939223410242,
"grad_norm": 0.20463881347239937,
"learning_rate": 1.7939090529828954e-06,
"loss": 0.2889,
"step": 2578
},
{
"epoch": 2.9015194147439507,
"grad_norm": 0.19047750537612482,
"learning_rate": 1.7730496453900712e-06,
"loss": 0.2864,
"step": 2579
},
{
"epoch": 2.9026449071468767,
"grad_norm": 0.19091573754700542,
"learning_rate": 1.7521902377972467e-06,
"loss": 0.2927,
"step": 2580
},
{
"epoch": 2.903770399549803,
"grad_norm": 0.1876198954341774,
"learning_rate": 1.7313308302044224e-06,
"loss": 0.268,
"step": 2581
},
{
"epoch": 2.9048958919527292,
"grad_norm": 0.20218741071231236,
"learning_rate": 1.7104714226115977e-06,
"loss": 0.2818,
"step": 2582
},
{
"epoch": 2.9060213843556557,
"grad_norm": 0.19751340574083814,
"learning_rate": 1.6896120150187736e-06,
"loss": 0.289,
"step": 2583
},
{
"epoch": 2.9071468767585817,
"grad_norm": 0.1929293500822755,
"learning_rate": 1.6687526074259493e-06,
"loss": 0.2701,
"step": 2584
},
{
"epoch": 2.908272369161508,
"grad_norm": 0.19409315370373764,
"learning_rate": 1.6478931998331248e-06,
"loss": 0.294,
"step": 2585
},
{
"epoch": 2.9093978615644343,
"grad_norm": 0.19513663853793442,
"learning_rate": 1.6270337922403005e-06,
"loss": 0.2909,
"step": 2586
},
{
"epoch": 2.9105233539673607,
"grad_norm": 0.18520192324031312,
"learning_rate": 1.6061743846474762e-06,
"loss": 0.2879,
"step": 2587
},
{
"epoch": 2.911648846370287,
"grad_norm": 0.19393576190790643,
"learning_rate": 1.5853149770546517e-06,
"loss": 0.2818,
"step": 2588
},
{
"epoch": 2.9127743387732132,
"grad_norm": 0.1913327489426411,
"learning_rate": 1.5644555694618274e-06,
"loss": 0.2845,
"step": 2589
},
{
"epoch": 2.9138998311761397,
"grad_norm": 0.19600196772862405,
"learning_rate": 1.543596161869003e-06,
"loss": 0.2877,
"step": 2590
},
{
"epoch": 2.9150253235790657,
"grad_norm": 0.2003116343512408,
"learning_rate": 1.5227367542761787e-06,
"loss": 0.2761,
"step": 2591
},
{
"epoch": 2.916150815981992,
"grad_norm": 0.18645182971152968,
"learning_rate": 1.5018773466833542e-06,
"loss": 0.2851,
"step": 2592
},
{
"epoch": 2.9172763083849182,
"grad_norm": 0.20894705309538278,
"learning_rate": 1.48101793909053e-06,
"loss": 0.2903,
"step": 2593
},
{
"epoch": 2.9184018007878447,
"grad_norm": 0.19120582673378814,
"learning_rate": 1.4601585314977056e-06,
"loss": 0.2776,
"step": 2594
},
{
"epoch": 2.9195272931907708,
"grad_norm": 0.20278259638182897,
"learning_rate": 1.439299123904881e-06,
"loss": 0.2787,
"step": 2595
},
{
"epoch": 2.9206527855936972,
"grad_norm": 0.19583937073430013,
"learning_rate": 1.4184397163120568e-06,
"loss": 0.2811,
"step": 2596
},
{
"epoch": 2.9217782779966237,
"grad_norm": 0.1941542530021111,
"learning_rate": 1.3975803087192323e-06,
"loss": 0.2658,
"step": 2597
},
{
"epoch": 2.9229037703995497,
"grad_norm": 0.20963262803457552,
"learning_rate": 1.3767209011264082e-06,
"loss": 0.278,
"step": 2598
},
{
"epoch": 2.924029262802476,
"grad_norm": 0.2018274661438912,
"learning_rate": 1.3558614935335837e-06,
"loss": 0.294,
"step": 2599
},
{
"epoch": 2.9251547552054022,
"grad_norm": 0.19436944269264386,
"learning_rate": 1.3350020859407592e-06,
"loss": 0.2834,
"step": 2600
},
{
"epoch": 2.9262802476083287,
"grad_norm": 0.1878287577440724,
"learning_rate": 1.314142678347935e-06,
"loss": 0.2765,
"step": 2601
},
{
"epoch": 2.927405740011255,
"grad_norm": 0.18341006129215123,
"learning_rate": 1.2932832707551107e-06,
"loss": 0.2746,
"step": 2602
},
{
"epoch": 2.9285312324141812,
"grad_norm": 0.19735771783507766,
"learning_rate": 1.2724238631622864e-06,
"loss": 0.2913,
"step": 2603
},
{
"epoch": 2.9296567248171073,
"grad_norm": 0.18771599689886934,
"learning_rate": 1.2515644555694619e-06,
"loss": 0.2753,
"step": 2604
},
{
"epoch": 2.9307822172200337,
"grad_norm": 0.19841768486183753,
"learning_rate": 1.2307050479766374e-06,
"loss": 0.2814,
"step": 2605
},
{
"epoch": 2.93190770962296,
"grad_norm": 0.1956614956245663,
"learning_rate": 1.2098456403838133e-06,
"loss": 0.2743,
"step": 2606
},
{
"epoch": 2.9330332020258862,
"grad_norm": 0.2002743148871214,
"learning_rate": 1.1889862327909888e-06,
"loss": 0.2888,
"step": 2607
},
{
"epoch": 2.9341586944288127,
"grad_norm": 0.21318426731074547,
"learning_rate": 1.1681268251981645e-06,
"loss": 0.2959,
"step": 2608
},
{
"epoch": 2.9352841868317388,
"grad_norm": 0.18809272462436055,
"learning_rate": 1.14726741760534e-06,
"loss": 0.277,
"step": 2609
},
{
"epoch": 2.9364096792346652,
"grad_norm": 0.19427439279930914,
"learning_rate": 1.1264080100125155e-06,
"loss": 0.2927,
"step": 2610
},
{
"epoch": 2.9375351716375917,
"grad_norm": 0.2079310357704345,
"learning_rate": 1.1055486024196914e-06,
"loss": 0.2828,
"step": 2611
},
{
"epoch": 2.9386606640405177,
"grad_norm": 0.19416657363268003,
"learning_rate": 1.084689194826867e-06,
"loss": 0.2911,
"step": 2612
},
{
"epoch": 2.9397861564434438,
"grad_norm": 0.19916119078493613,
"learning_rate": 1.0638297872340427e-06,
"loss": 0.2924,
"step": 2613
},
{
"epoch": 2.9409116488463702,
"grad_norm": 0.1983245462408925,
"learning_rate": 1.0429703796412182e-06,
"loss": 0.2708,
"step": 2614
},
{
"epoch": 2.9420371412492967,
"grad_norm": 0.18590780784131763,
"learning_rate": 1.0221109720483939e-06,
"loss": 0.2863,
"step": 2615
},
{
"epoch": 2.9431626336522227,
"grad_norm": 0.1872997802264514,
"learning_rate": 1.0012515644555696e-06,
"loss": 0.2811,
"step": 2616
},
{
"epoch": 2.9442881260551492,
"grad_norm": 0.178184423835207,
"learning_rate": 9.80392156862745e-07,
"loss": 0.2731,
"step": 2617
},
{
"epoch": 2.9454136184580753,
"grad_norm": 0.20370420485130178,
"learning_rate": 9.595327492699208e-07,
"loss": 0.298,
"step": 2618
},
{
"epoch": 2.9465391108610017,
"grad_norm": 0.19363752795605113,
"learning_rate": 9.386733416770965e-07,
"loss": 0.2862,
"step": 2619
},
{
"epoch": 2.947664603263928,
"grad_norm": 0.19337040777721937,
"learning_rate": 9.178139340842721e-07,
"loss": 0.2938,
"step": 2620
},
{
"epoch": 2.9487900956668542,
"grad_norm": 0.20062145944273124,
"learning_rate": 8.969545264914477e-07,
"loss": 0.2854,
"step": 2621
},
{
"epoch": 2.9499155880697803,
"grad_norm": 0.19780261743838537,
"learning_rate": 8.760951188986233e-07,
"loss": 0.2853,
"step": 2622
},
{
"epoch": 2.9510410804727067,
"grad_norm": 0.18300708396430374,
"learning_rate": 8.552357113057988e-07,
"loss": 0.2859,
"step": 2623
},
{
"epoch": 2.952166572875633,
"grad_norm": 0.19659081949531576,
"learning_rate": 8.343763037129747e-07,
"loss": 0.2873,
"step": 2624
},
{
"epoch": 2.9532920652785593,
"grad_norm": 0.18879456754476104,
"learning_rate": 8.135168961201503e-07,
"loss": 0.3059,
"step": 2625
},
{
"epoch": 2.9544175576814857,
"grad_norm": 0.1970187549688716,
"learning_rate": 7.926574885273259e-07,
"loss": 0.2751,
"step": 2626
},
{
"epoch": 2.9555430500844118,
"grad_norm": 0.18765481178184712,
"learning_rate": 7.717980809345015e-07,
"loss": 0.2828,
"step": 2627
},
{
"epoch": 2.9566685424873382,
"grad_norm": 0.18024019057588778,
"learning_rate": 7.509386733416771e-07,
"loss": 0.2796,
"step": 2628
},
{
"epoch": 2.9577940348902647,
"grad_norm": 0.1911404613455637,
"learning_rate": 7.300792657488528e-07,
"loss": 0.2857,
"step": 2629
},
{
"epoch": 2.9589195272931907,
"grad_norm": 0.19138951788504163,
"learning_rate": 7.092198581560284e-07,
"loss": 0.2792,
"step": 2630
},
{
"epoch": 2.9600450196961168,
"grad_norm": 0.1910935556506434,
"learning_rate": 6.883604505632041e-07,
"loss": 0.2845,
"step": 2631
},
{
"epoch": 2.9611705120990433,
"grad_norm": 0.19203721334554208,
"learning_rate": 6.675010429703796e-07,
"loss": 0.291,
"step": 2632
},
{
"epoch": 2.9622960045019697,
"grad_norm": 0.1928645139619539,
"learning_rate": 6.466416353775553e-07,
"loss": 0.2929,
"step": 2633
},
{
"epoch": 2.9634214969048958,
"grad_norm": 0.19918761206527566,
"learning_rate": 6.257822277847309e-07,
"loss": 0.3019,
"step": 2634
},
{
"epoch": 2.9645469893078222,
"grad_norm": 0.1854244511600504,
"learning_rate": 6.049228201919066e-07,
"loss": 0.2762,
"step": 2635
},
{
"epoch": 2.9656724817107483,
"grad_norm": 0.1790914058015419,
"learning_rate": 5.840634125990823e-07,
"loss": 0.2775,
"step": 2636
},
{
"epoch": 2.9667979741136747,
"grad_norm": 0.18939335360021642,
"learning_rate": 5.632040050062578e-07,
"loss": 0.2868,
"step": 2637
},
{
"epoch": 2.967923466516601,
"grad_norm": 0.18950602005484965,
"learning_rate": 5.423445974134335e-07,
"loss": 0.2806,
"step": 2638
},
{
"epoch": 2.9690489589195272,
"grad_norm": 0.2057097341756207,
"learning_rate": 5.214851898206091e-07,
"loss": 0.2959,
"step": 2639
},
{
"epoch": 2.9701744513224537,
"grad_norm": 0.18914510377229687,
"learning_rate": 5.006257822277848e-07,
"loss": 0.2813,
"step": 2640
},
{
"epoch": 2.9712999437253798,
"grad_norm": 0.21148991319548016,
"learning_rate": 4.797663746349604e-07,
"loss": 0.2921,
"step": 2641
},
{
"epoch": 2.9724254361283062,
"grad_norm": 0.19281924718786836,
"learning_rate": 4.5890696704213606e-07,
"loss": 0.2855,
"step": 2642
},
{
"epoch": 2.9735509285312323,
"grad_norm": 0.18955123443150448,
"learning_rate": 4.3804755944931167e-07,
"loss": 0.2959,
"step": 2643
},
{
"epoch": 2.9746764209341587,
"grad_norm": 0.18330368999509078,
"learning_rate": 4.171881518564873e-07,
"loss": 0.2867,
"step": 2644
},
{
"epoch": 2.9758019133370848,
"grad_norm": 0.19150353774187667,
"learning_rate": 3.9632874426366293e-07,
"loss": 0.2807,
"step": 2645
},
{
"epoch": 2.9769274057400112,
"grad_norm": 0.18078724381796288,
"learning_rate": 3.7546933667083854e-07,
"loss": 0.2659,
"step": 2646
},
{
"epoch": 2.9780528981429377,
"grad_norm": 0.1956344780829508,
"learning_rate": 3.546099290780142e-07,
"loss": 0.2836,
"step": 2647
},
{
"epoch": 2.9791783905458638,
"grad_norm": 0.18154947344603503,
"learning_rate": 3.337505214851898e-07,
"loss": 0.2691,
"step": 2648
},
{
"epoch": 2.9803038829487902,
"grad_norm": 0.19970667773834722,
"learning_rate": 3.1289111389236547e-07,
"loss": 0.2939,
"step": 2649
},
{
"epoch": 2.9814293753517163,
"grad_norm": 0.1970797047209464,
"learning_rate": 2.9203170629954113e-07,
"loss": 0.2973,
"step": 2650
},
{
"epoch": 2.9825548677546427,
"grad_norm": 0.17558302415161703,
"learning_rate": 2.7117229870671674e-07,
"loss": 0.2687,
"step": 2651
},
{
"epoch": 2.983680360157569,
"grad_norm": 0.19984560884749847,
"learning_rate": 2.503128911138924e-07,
"loss": 0.2917,
"step": 2652
},
{
"epoch": 2.9848058525604952,
"grad_norm": 0.18904253639700785,
"learning_rate": 2.2945348352106803e-07,
"loss": 0.2881,
"step": 2653
},
{
"epoch": 2.9859313449634213,
"grad_norm": 0.18998537420423053,
"learning_rate": 2.0859407592824366e-07,
"loss": 0.2899,
"step": 2654
},
{
"epoch": 2.9870568373663478,
"grad_norm": 0.18137614988061299,
"learning_rate": 1.8773466833541927e-07,
"loss": 0.2673,
"step": 2655
},
{
"epoch": 2.9881823297692742,
"grad_norm": 0.20380937660099302,
"learning_rate": 1.668752607425949e-07,
"loss": 0.2855,
"step": 2656
},
{
"epoch": 2.9893078221722003,
"grad_norm": 0.18697756553685133,
"learning_rate": 1.4601585314977056e-07,
"loss": 0.2855,
"step": 2657
},
{
"epoch": 2.9904333145751267,
"grad_norm": 0.18757566803524747,
"learning_rate": 1.251564455569462e-07,
"loss": 0.2881,
"step": 2658
},
{
"epoch": 2.9915588069780528,
"grad_norm": 0.18601807443604287,
"learning_rate": 1.0429703796412183e-07,
"loss": 0.2848,
"step": 2659
},
{
"epoch": 2.9926842993809792,
"grad_norm": 0.19183363220520985,
"learning_rate": 8.343763037129745e-08,
"loss": 0.2958,
"step": 2660
},
{
"epoch": 2.9938097917839057,
"grad_norm": 0.20102940785890344,
"learning_rate": 6.25782227784731e-08,
"loss": 0.281,
"step": 2661
},
{
"epoch": 2.9949352841868317,
"grad_norm": 0.189431904457012,
"learning_rate": 4.1718815185648726e-08,
"loss": 0.2813,
"step": 2662
},
{
"epoch": 2.996060776589758,
"grad_norm": 0.17662567105058288,
"learning_rate": 2.0859407592824363e-08,
"loss": 0.2782,
"step": 2663
},
{
"epoch": 2.9971862689926843,
"grad_norm": 0.18268732320356856,
"learning_rate": 0.0,
"loss": 0.2705,
"step": 2664
},
{
"epoch": 2.9971862689926843,
"step": 2664,
"total_flos": 2.27802848659977e+18,
"train_loss": 0.43048976833845404,
"train_runtime": 155129.8221,
"train_samples_per_second": 0.275,
"train_steps_per_second": 0.017
}
],
"logging_steps": 1,
"max_steps": 2664,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.27802848659977e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}