{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1172,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0008532423208191126,
"grad_norm": 476.3564758300781,
"learning_rate": 0.0,
"loss": 9.6172,
"step": 1
},
{
"epoch": 0.0017064846416382253,
"grad_norm": 473.1830139160156,
"learning_rate": 1.6949152542372883e-07,
"loss": 9.1602,
"step": 2
},
{
"epoch": 0.002559726962457338,
"grad_norm": 576.52734375,
"learning_rate": 3.3898305084745766e-07,
"loss": 10.8633,
"step": 3
},
{
"epoch": 0.0034129692832764505,
"grad_norm": 394.45654296875,
"learning_rate": 5.084745762711865e-07,
"loss": 7.9258,
"step": 4
},
{
"epoch": 0.004266211604095563,
"grad_norm": 435.7214660644531,
"learning_rate": 6.779661016949153e-07,
"loss": 10.0195,
"step": 5
},
{
"epoch": 0.005119453924914676,
"grad_norm": 417.97314453125,
"learning_rate": 8.474576271186441e-07,
"loss": 9.6953,
"step": 6
},
{
"epoch": 0.005972696245733789,
"grad_norm": 379.0962219238281,
"learning_rate": 1.016949152542373e-06,
"loss": 9.4453,
"step": 7
},
{
"epoch": 0.006825938566552901,
"grad_norm": 333.2943420410156,
"learning_rate": 1.186440677966102e-06,
"loss": 7.7344,
"step": 8
},
{
"epoch": 0.007679180887372013,
"grad_norm": 312.8900451660156,
"learning_rate": 1.3559322033898307e-06,
"loss": 7.8281,
"step": 9
},
{
"epoch": 0.008532423208191127,
"grad_norm": 365.3479919433594,
"learning_rate": 1.5254237288135596e-06,
"loss": 9.1172,
"step": 10
},
{
"epoch": 0.00938566552901024,
"grad_norm": 267.8406982421875,
"learning_rate": 1.6949152542372882e-06,
"loss": 8.2539,
"step": 11
},
{
"epoch": 0.010238907849829351,
"grad_norm": 393.07293701171875,
"learning_rate": 1.8644067796610171e-06,
"loss": 9.0859,
"step": 12
},
{
"epoch": 0.011092150170648464,
"grad_norm": 585.0484008789062,
"learning_rate": 2.033898305084746e-06,
"loss": 9.7812,
"step": 13
},
{
"epoch": 0.011945392491467578,
"grad_norm": 658.748779296875,
"learning_rate": 2.203389830508475e-06,
"loss": 9.1484,
"step": 14
},
{
"epoch": 0.012798634812286689,
"grad_norm": 607.95068359375,
"learning_rate": 2.372881355932204e-06,
"loss": 9.6484,
"step": 15
},
{
"epoch": 0.013651877133105802,
"grad_norm": 722.521728515625,
"learning_rate": 2.5423728813559323e-06,
"loss": 10.4219,
"step": 16
},
{
"epoch": 0.014505119453924915,
"grad_norm": 504.6495666503906,
"learning_rate": 2.7118644067796613e-06,
"loss": 7.9531,
"step": 17
},
{
"epoch": 0.015358361774744027,
"grad_norm": 480.6727294921875,
"learning_rate": 2.8813559322033903e-06,
"loss": 7.3555,
"step": 18
},
{
"epoch": 0.016211604095563138,
"grad_norm": 289.67529296875,
"learning_rate": 3.0508474576271192e-06,
"loss": 6.4062,
"step": 19
},
{
"epoch": 0.017064846416382253,
"grad_norm": 143.08592224121094,
"learning_rate": 3.2203389830508473e-06,
"loss": 5.4492,
"step": 20
},
{
"epoch": 0.017918088737201365,
"grad_norm": 145.436279296875,
"learning_rate": 3.3898305084745763e-06,
"loss": 5.2383,
"step": 21
},
{
"epoch": 0.01877133105802048,
"grad_norm": 363.3115539550781,
"learning_rate": 3.5593220338983053e-06,
"loss": 6.668,
"step": 22
},
{
"epoch": 0.01962457337883959,
"grad_norm": 311.4585266113281,
"learning_rate": 3.7288135593220342e-06,
"loss": 6.0078,
"step": 23
},
{
"epoch": 0.020477815699658702,
"grad_norm": 132.5367431640625,
"learning_rate": 3.898305084745763e-06,
"loss": 4.8789,
"step": 24
},
{
"epoch": 0.021331058020477817,
"grad_norm": 59.49905776977539,
"learning_rate": 4.067796610169492e-06,
"loss": 4.9141,
"step": 25
},
{
"epoch": 0.02218430034129693,
"grad_norm": 144.73834228515625,
"learning_rate": 4.23728813559322e-06,
"loss": 5.2695,
"step": 26
},
{
"epoch": 0.02303754266211604,
"grad_norm": 72.64948272705078,
"learning_rate": 4.40677966101695e-06,
"loss": 5.3594,
"step": 27
},
{
"epoch": 0.023890784982935155,
"grad_norm": 67.38402557373047,
"learning_rate": 4.576271186440678e-06,
"loss": 4.5137,
"step": 28
},
{
"epoch": 0.024744027303754267,
"grad_norm": 70.24736785888672,
"learning_rate": 4.745762711864408e-06,
"loss": 4.7305,
"step": 29
},
{
"epoch": 0.025597269624573378,
"grad_norm": 37.224700927734375,
"learning_rate": 4.915254237288136e-06,
"loss": 4.5859,
"step": 30
},
{
"epoch": 0.026450511945392493,
"grad_norm": 52.372982025146484,
"learning_rate": 5.084745762711865e-06,
"loss": 4.6504,
"step": 31
},
{
"epoch": 0.027303754266211604,
"grad_norm": 58.09052658081055,
"learning_rate": 5.254237288135594e-06,
"loss": 5.0195,
"step": 32
},
{
"epoch": 0.028156996587030716,
"grad_norm": 273.7731018066406,
"learning_rate": 5.423728813559323e-06,
"loss": 4.877,
"step": 33
},
{
"epoch": 0.02901023890784983,
"grad_norm": 257.3023986816406,
"learning_rate": 5.593220338983051e-06,
"loss": 5.1367,
"step": 34
},
{
"epoch": 0.029863481228668942,
"grad_norm": 37.15542984008789,
"learning_rate": 5.7627118644067805e-06,
"loss": 4.498,
"step": 35
},
{
"epoch": 0.030716723549488054,
"grad_norm": 57.637229919433594,
"learning_rate": 5.932203389830509e-06,
"loss": 4.498,
"step": 36
},
{
"epoch": 0.031569965870307165,
"grad_norm": 161.52142333984375,
"learning_rate": 6.1016949152542385e-06,
"loss": 4.8535,
"step": 37
},
{
"epoch": 0.032423208191126277,
"grad_norm": 144.53648376464844,
"learning_rate": 6.271186440677966e-06,
"loss": 5.0547,
"step": 38
},
{
"epoch": 0.033276450511945395,
"grad_norm": 97.21565246582031,
"learning_rate": 6.440677966101695e-06,
"loss": 4.4238,
"step": 39
},
{
"epoch": 0.034129692832764506,
"grad_norm": 189.5275115966797,
"learning_rate": 6.610169491525424e-06,
"loss": 4.6973,
"step": 40
},
{
"epoch": 0.03498293515358362,
"grad_norm": 52.70186233520508,
"learning_rate": 6.779661016949153e-06,
"loss": 4.2754,
"step": 41
},
{
"epoch": 0.03583617747440273,
"grad_norm": 204.4432830810547,
"learning_rate": 6.949152542372882e-06,
"loss": 5.0195,
"step": 42
},
{
"epoch": 0.03668941979522184,
"grad_norm": 179.79095458984375,
"learning_rate": 7.1186440677966106e-06,
"loss": 5.0684,
"step": 43
},
{
"epoch": 0.03754266211604096,
"grad_norm": 119.27928924560547,
"learning_rate": 7.288135593220339e-06,
"loss": 4.6699,
"step": 44
},
{
"epoch": 0.03839590443686007,
"grad_norm": 83.18463897705078,
"learning_rate": 7.4576271186440685e-06,
"loss": 4.0254,
"step": 45
},
{
"epoch": 0.03924914675767918,
"grad_norm": 121.39764404296875,
"learning_rate": 7.627118644067797e-06,
"loss": 4.6602,
"step": 46
},
{
"epoch": 0.04010238907849829,
"grad_norm": 32.138038635253906,
"learning_rate": 7.796610169491526e-06,
"loss": 4.5078,
"step": 47
},
{
"epoch": 0.040955631399317405,
"grad_norm": 105.33744812011719,
"learning_rate": 7.966101694915255e-06,
"loss": 4.1465,
"step": 48
},
{
"epoch": 0.041808873720136516,
"grad_norm": 163.0584259033203,
"learning_rate": 8.135593220338983e-06,
"loss": 4.7871,
"step": 49
},
{
"epoch": 0.042662116040955635,
"grad_norm": 90.8143081665039,
"learning_rate": 8.305084745762712e-06,
"loss": 4.3184,
"step": 50
},
{
"epoch": 0.043515358361774746,
"grad_norm": 130.4494171142578,
"learning_rate": 8.47457627118644e-06,
"loss": 4.6992,
"step": 51
},
{
"epoch": 0.04436860068259386,
"grad_norm": 120.24535369873047,
"learning_rate": 8.64406779661017e-06,
"loss": 4.6289,
"step": 52
},
{
"epoch": 0.04522184300341297,
"grad_norm": 129.9118194580078,
"learning_rate": 8.8135593220339e-06,
"loss": 4.2207,
"step": 53
},
{
"epoch": 0.04607508532423208,
"grad_norm": 96.74512481689453,
"learning_rate": 8.983050847457628e-06,
"loss": 4.1035,
"step": 54
},
{
"epoch": 0.04692832764505119,
"grad_norm": 127.90748596191406,
"learning_rate": 9.152542372881356e-06,
"loss": 4.7188,
"step": 55
},
{
"epoch": 0.04778156996587031,
"grad_norm": 79.55717468261719,
"learning_rate": 9.322033898305085e-06,
"loss": 3.7441,
"step": 56
},
{
"epoch": 0.04863481228668942,
"grad_norm": 107.03738403320312,
"learning_rate": 9.491525423728815e-06,
"loss": 4.3281,
"step": 57
},
{
"epoch": 0.04948805460750853,
"grad_norm": 137.00802612304688,
"learning_rate": 9.661016949152544e-06,
"loss": 4.5039,
"step": 58
},
{
"epoch": 0.050341296928327645,
"grad_norm": 56.521270751953125,
"learning_rate": 9.830508474576272e-06,
"loss": 4.1992,
"step": 59
},
{
"epoch": 0.051194539249146756,
"grad_norm": 91.46454620361328,
"learning_rate": 1e-05,
"loss": 4.2109,
"step": 60
},
{
"epoch": 0.05204778156996587,
"grad_norm": 77.05010986328125,
"learning_rate": 9.991015274034143e-06,
"loss": 3.8574,
"step": 61
},
{
"epoch": 0.052901023890784986,
"grad_norm": 23.841175079345703,
"learning_rate": 9.982030548068285e-06,
"loss": 3.9512,
"step": 62
},
{
"epoch": 0.0537542662116041,
"grad_norm": 84.35244750976562,
"learning_rate": 9.973045822102425e-06,
"loss": 3.9473,
"step": 63
},
{
"epoch": 0.05460750853242321,
"grad_norm": 127.69181823730469,
"learning_rate": 9.96406109613657e-06,
"loss": 3.8828,
"step": 64
},
{
"epoch": 0.05546075085324232,
"grad_norm": 27.332721710205078,
"learning_rate": 9.955076370170711e-06,
"loss": 3.8457,
"step": 65
},
{
"epoch": 0.05631399317406143,
"grad_norm": 127.41008758544922,
"learning_rate": 9.946091644204853e-06,
"loss": 4.3711,
"step": 66
},
{
"epoch": 0.05716723549488054,
"grad_norm": 50.57866287231445,
"learning_rate": 9.937106918238994e-06,
"loss": 3.4824,
"step": 67
},
{
"epoch": 0.05802047781569966,
"grad_norm": 47.33999252319336,
"learning_rate": 9.928122192273136e-06,
"loss": 3.8301,
"step": 68
},
{
"epoch": 0.05887372013651877,
"grad_norm": 28.65631675720215,
"learning_rate": 9.919137466307278e-06,
"loss": 3.5703,
"step": 69
},
{
"epoch": 0.059726962457337884,
"grad_norm": 100.2359619140625,
"learning_rate": 9.91015274034142e-06,
"loss": 4.5332,
"step": 70
},
{
"epoch": 0.060580204778156996,
"grad_norm": 27.0611572265625,
"learning_rate": 9.901168014375562e-06,
"loss": 3.5859,
"step": 71
},
{
"epoch": 0.06143344709897611,
"grad_norm": 119.60940551757812,
"learning_rate": 9.892183288409704e-06,
"loss": 4.0703,
"step": 72
},
{
"epoch": 0.06228668941979522,
"grad_norm": 113.57787322998047,
"learning_rate": 9.883198562443846e-06,
"loss": 4.0352,
"step": 73
},
{
"epoch": 0.06313993174061433,
"grad_norm": 22.393367767333984,
"learning_rate": 9.874213836477988e-06,
"loss": 4.0293,
"step": 74
},
{
"epoch": 0.06399317406143344,
"grad_norm": 77.3707504272461,
"learning_rate": 9.86522911051213e-06,
"loss": 3.6855,
"step": 75
},
{
"epoch": 0.06484641638225255,
"grad_norm": 23.650131225585938,
"learning_rate": 9.856244384546273e-06,
"loss": 3.8066,
"step": 76
},
{
"epoch": 0.06569965870307168,
"grad_norm": 78.92176055908203,
"learning_rate": 9.847259658580413e-06,
"loss": 4.0781,
"step": 77
},
{
"epoch": 0.06655290102389079,
"grad_norm": 57.146488189697266,
"learning_rate": 9.838274932614557e-06,
"loss": 3.8672,
"step": 78
},
{
"epoch": 0.0674061433447099,
"grad_norm": 65.50660705566406,
"learning_rate": 9.829290206648699e-06,
"loss": 3.2881,
"step": 79
},
{
"epoch": 0.06825938566552901,
"grad_norm": 71.74410247802734,
"learning_rate": 9.820305480682841e-06,
"loss": 3.8984,
"step": 80
},
{
"epoch": 0.06911262798634812,
"grad_norm": 34.78994369506836,
"learning_rate": 9.811320754716981e-06,
"loss": 3.8711,
"step": 81
},
{
"epoch": 0.06996587030716724,
"grad_norm": 115.17135620117188,
"learning_rate": 9.802336028751123e-06,
"loss": 4.6289,
"step": 82
},
{
"epoch": 0.07081911262798635,
"grad_norm": 74.1488037109375,
"learning_rate": 9.793351302785265e-06,
"loss": 4.1133,
"step": 83
},
{
"epoch": 0.07167235494880546,
"grad_norm": 60.60784149169922,
"learning_rate": 9.784366576819408e-06,
"loss": 4.1113,
"step": 84
},
{
"epoch": 0.07252559726962457,
"grad_norm": 138.30191040039062,
"learning_rate": 9.77538185085355e-06,
"loss": 3.7441,
"step": 85
},
{
"epoch": 0.07337883959044368,
"grad_norm": 29.931669235229492,
"learning_rate": 9.766397124887692e-06,
"loss": 3.375,
"step": 86
},
{
"epoch": 0.07423208191126279,
"grad_norm": 53.22774887084961,
"learning_rate": 9.757412398921834e-06,
"loss": 3.6758,
"step": 87
},
{
"epoch": 0.07508532423208192,
"grad_norm": 38.70452880859375,
"learning_rate": 9.748427672955976e-06,
"loss": 4.002,
"step": 88
},
{
"epoch": 0.07593856655290103,
"grad_norm": 30.55535125732422,
"learning_rate": 9.739442946990118e-06,
"loss": 4.207,
"step": 89
},
{
"epoch": 0.07679180887372014,
"grad_norm": 27.83077049255371,
"learning_rate": 9.73045822102426e-06,
"loss": 3.4453,
"step": 90
},
{
"epoch": 0.07764505119453925,
"grad_norm": 121.7099380493164,
"learning_rate": 9.7214734950584e-06,
"loss": 3.9688,
"step": 91
},
{
"epoch": 0.07849829351535836,
"grad_norm": 81.67149353027344,
"learning_rate": 9.712488769092544e-06,
"loss": 3.8965,
"step": 92
},
{
"epoch": 0.07935153583617748,
"grad_norm": 39.18846893310547,
"learning_rate": 9.703504043126686e-06,
"loss": 3.8633,
"step": 93
},
{
"epoch": 0.08020477815699659,
"grad_norm": 84.66485595703125,
"learning_rate": 9.694519317160828e-06,
"loss": 3.9961,
"step": 94
},
{
"epoch": 0.0810580204778157,
"grad_norm": 82.30975341796875,
"learning_rate": 9.685534591194969e-06,
"loss": 3.793,
"step": 95
},
{
"epoch": 0.08191126279863481,
"grad_norm": 88.6453628540039,
"learning_rate": 9.676549865229111e-06,
"loss": 3.8477,
"step": 96
},
{
"epoch": 0.08276450511945392,
"grad_norm": 105.96221160888672,
"learning_rate": 9.667565139263253e-06,
"loss": 3.9941,
"step": 97
},
{
"epoch": 0.08361774744027303,
"grad_norm": 23.890165328979492,
"learning_rate": 9.658580413297395e-06,
"loss": 3.3945,
"step": 98
},
{
"epoch": 0.08447098976109214,
"grad_norm": 77.99059295654297,
"learning_rate": 9.649595687331537e-06,
"loss": 3.6816,
"step": 99
},
{
"epoch": 0.08532423208191127,
"grad_norm": 68.72335052490234,
"learning_rate": 9.64061096136568e-06,
"loss": 3.8086,
"step": 100
},
{
"epoch": 0.08617747440273038,
"grad_norm": 48.387054443359375,
"learning_rate": 9.631626235399821e-06,
"loss": 3.7148,
"step": 101
},
{
"epoch": 0.08703071672354949,
"grad_norm": 30.06928825378418,
"learning_rate": 9.622641509433963e-06,
"loss": 3.1797,
"step": 102
},
{
"epoch": 0.0878839590443686,
"grad_norm": 103.71221923828125,
"learning_rate": 9.613656783468106e-06,
"loss": 3.4102,
"step": 103
},
{
"epoch": 0.08873720136518772,
"grad_norm": 22.561519622802734,
"learning_rate": 9.604672057502246e-06,
"loss": 3.4941,
"step": 104
},
{
"epoch": 0.08959044368600683,
"grad_norm": 36.27552032470703,
"learning_rate": 9.595687331536388e-06,
"loss": 4.0645,
"step": 105
},
{
"epoch": 0.09044368600682594,
"grad_norm": 60.4101448059082,
"learning_rate": 9.58670260557053e-06,
"loss": 3.1895,
"step": 106
},
{
"epoch": 0.09129692832764505,
"grad_norm": 31.981599807739258,
"learning_rate": 9.577717879604674e-06,
"loss": 3.123,
"step": 107
},
{
"epoch": 0.09215017064846416,
"grad_norm": 51.39161682128906,
"learning_rate": 9.568733153638814e-06,
"loss": 3.8027,
"step": 108
},
{
"epoch": 0.09300341296928327,
"grad_norm": 17.14482307434082,
"learning_rate": 9.559748427672956e-06,
"loss": 3.5996,
"step": 109
},
{
"epoch": 0.09385665529010238,
"grad_norm": 79.9706802368164,
"learning_rate": 9.550763701707098e-06,
"loss": 3.2578,
"step": 110
},
{
"epoch": 0.0947098976109215,
"grad_norm": 19.872787475585938,
"learning_rate": 9.54177897574124e-06,
"loss": 3.0957,
"step": 111
},
{
"epoch": 0.09556313993174062,
"grad_norm": 50.38517761230469,
"learning_rate": 9.532794249775383e-06,
"loss": 3.6895,
"step": 112
},
{
"epoch": 0.09641638225255973,
"grad_norm": 50.98223876953125,
"learning_rate": 9.523809523809525e-06,
"loss": 3.7188,
"step": 113
},
{
"epoch": 0.09726962457337884,
"grad_norm": 46.27577590942383,
"learning_rate": 9.514824797843667e-06,
"loss": 3.2383,
"step": 114
},
{
"epoch": 0.09812286689419795,
"grad_norm": 43.620479583740234,
"learning_rate": 9.505840071877809e-06,
"loss": 3.6348,
"step": 115
},
{
"epoch": 0.09897610921501707,
"grad_norm": 73.57115173339844,
"learning_rate": 9.496855345911951e-06,
"loss": 3.1875,
"step": 116
},
{
"epoch": 0.09982935153583618,
"grad_norm": 29.671640396118164,
"learning_rate": 9.487870619946093e-06,
"loss": 3.4141,
"step": 117
},
{
"epoch": 0.10068259385665529,
"grad_norm": 37.94879150390625,
"learning_rate": 9.478885893980234e-06,
"loss": 3.4414,
"step": 118
},
{
"epoch": 0.1015358361774744,
"grad_norm": 51.39364242553711,
"learning_rate": 9.469901168014376e-06,
"loss": 3.4434,
"step": 119
},
{
"epoch": 0.10238907849829351,
"grad_norm": 46.911163330078125,
"learning_rate": 9.460916442048518e-06,
"loss": 3.4355,
"step": 120
},
{
"epoch": 0.10324232081911262,
"grad_norm": 32.2253303527832,
"learning_rate": 9.451931716082661e-06,
"loss": 3.2402,
"step": 121
},
{
"epoch": 0.10409556313993173,
"grad_norm": 87.56474304199219,
"learning_rate": 9.442946990116802e-06,
"loss": 3.5059,
"step": 122
},
{
"epoch": 0.10494880546075085,
"grad_norm": 75.4452896118164,
"learning_rate": 9.433962264150944e-06,
"loss": 3.1016,
"step": 123
},
{
"epoch": 0.10580204778156997,
"grad_norm": 21.062419891357422,
"learning_rate": 9.424977538185086e-06,
"loss": 3.5176,
"step": 124
},
{
"epoch": 0.10665529010238908,
"grad_norm": 34.950862884521484,
"learning_rate": 9.415992812219228e-06,
"loss": 3.2168,
"step": 125
},
{
"epoch": 0.1075085324232082,
"grad_norm": 89.45964813232422,
"learning_rate": 9.40700808625337e-06,
"loss": 3.25,
"step": 126
},
{
"epoch": 0.1083617747440273,
"grad_norm": 58.562896728515625,
"learning_rate": 9.398023360287512e-06,
"loss": 3.7793,
"step": 127
},
{
"epoch": 0.10921501706484642,
"grad_norm": 54.15276336669922,
"learning_rate": 9.389038634321654e-06,
"loss": 3.5,
"step": 128
},
{
"epoch": 0.11006825938566553,
"grad_norm": 32.4635124206543,
"learning_rate": 9.380053908355796e-06,
"loss": 2.915,
"step": 129
},
{
"epoch": 0.11092150170648464,
"grad_norm": 22.57988739013672,
"learning_rate": 9.371069182389939e-06,
"loss": 2.9502,
"step": 130
},
{
"epoch": 0.11177474402730375,
"grad_norm": 38.44780731201172,
"learning_rate": 9.36208445642408e-06,
"loss": 2.9336,
"step": 131
},
{
"epoch": 0.11262798634812286,
"grad_norm": 21.83592414855957,
"learning_rate": 9.353099730458221e-06,
"loss": 2.9707,
"step": 132
},
{
"epoch": 0.11348122866894197,
"grad_norm": 24.39005470275879,
"learning_rate": 9.344115004492363e-06,
"loss": 3.2578,
"step": 133
},
{
"epoch": 0.11433447098976109,
"grad_norm": 59.925758361816406,
"learning_rate": 9.335130278526505e-06,
"loss": 3.2988,
"step": 134
},
{
"epoch": 0.11518771331058021,
"grad_norm": 74.08988189697266,
"learning_rate": 9.326145552560647e-06,
"loss": 2.7236,
"step": 135
},
{
"epoch": 0.11604095563139932,
"grad_norm": 63.953453063964844,
"learning_rate": 9.31716082659479e-06,
"loss": 3.1807,
"step": 136
},
{
"epoch": 0.11689419795221843,
"grad_norm": 36.688720703125,
"learning_rate": 9.308176100628931e-06,
"loss": 2.8018,
"step": 137
},
{
"epoch": 0.11774744027303755,
"grad_norm": 60.307430267333984,
"learning_rate": 9.299191374663074e-06,
"loss": 3.1572,
"step": 138
},
{
"epoch": 0.11860068259385666,
"grad_norm": 28.88834571838379,
"learning_rate": 9.290206648697216e-06,
"loss": 3.4766,
"step": 139
},
{
"epoch": 0.11945392491467577,
"grad_norm": 40.18682861328125,
"learning_rate": 9.281221922731358e-06,
"loss": 3.8594,
"step": 140
},
{
"epoch": 0.12030716723549488,
"grad_norm": 74.8680648803711,
"learning_rate": 9.272237196765498e-06,
"loss": 3.1719,
"step": 141
},
{
"epoch": 0.12116040955631399,
"grad_norm": 42.63037109375,
"learning_rate": 9.263252470799642e-06,
"loss": 2.8574,
"step": 142
},
{
"epoch": 0.1220136518771331,
"grad_norm": 32.380043029785156,
"learning_rate": 9.254267744833784e-06,
"loss": 3.2363,
"step": 143
},
{
"epoch": 0.12286689419795221,
"grad_norm": 71.21893310546875,
"learning_rate": 9.245283018867926e-06,
"loss": 3.1865,
"step": 144
},
{
"epoch": 0.12372013651877133,
"grad_norm": 27.608762741088867,
"learning_rate": 9.236298292902067e-06,
"loss": 2.9258,
"step": 145
},
{
"epoch": 0.12457337883959044,
"grad_norm": 26.77503776550293,
"learning_rate": 9.227313566936209e-06,
"loss": 3.0938,
"step": 146
},
{
"epoch": 0.12542662116040956,
"grad_norm": 103.02552795410156,
"learning_rate": 9.21832884097035e-06,
"loss": 3.1328,
"step": 147
},
{
"epoch": 0.12627986348122866,
"grad_norm": 83.52420806884766,
"learning_rate": 9.209344115004493e-06,
"loss": 2.9922,
"step": 148
},
{
"epoch": 0.12713310580204779,
"grad_norm": 73.67784118652344,
"learning_rate": 9.200359389038635e-06,
"loss": 2.8223,
"step": 149
},
{
"epoch": 0.12798634812286688,
"grad_norm": 23.8249454498291,
"learning_rate": 9.191374663072777e-06,
"loss": 3.3398,
"step": 150
},
{
"epoch": 0.128839590443686,
"grad_norm": 69.4936294555664,
"learning_rate": 9.182389937106919e-06,
"loss": 3.4326,
"step": 151
},
{
"epoch": 0.1296928327645051,
"grad_norm": 88.59197998046875,
"learning_rate": 9.173405211141061e-06,
"loss": 3.4414,
"step": 152
},
{
"epoch": 0.13054607508532423,
"grad_norm": 29.386064529418945,
"learning_rate": 9.164420485175203e-06,
"loss": 3.0303,
"step": 153
},
{
"epoch": 0.13139931740614336,
"grad_norm": 38.90749740600586,
"learning_rate": 9.155435759209345e-06,
"loss": 2.9619,
"step": 154
},
{
"epoch": 0.13225255972696245,
"grad_norm": 74.41140747070312,
"learning_rate": 9.146451033243486e-06,
"loss": 3.4102,
"step": 155
},
{
"epoch": 0.13310580204778158,
"grad_norm": 44.66842269897461,
"learning_rate": 9.13746630727763e-06,
"loss": 2.6963,
"step": 156
},
{
"epoch": 0.13395904436860068,
"grad_norm": 33.257205963134766,
"learning_rate": 9.128481581311772e-06,
"loss": 3.1924,
"step": 157
},
{
"epoch": 0.1348122866894198,
"grad_norm": 100.31049346923828,
"learning_rate": 9.119496855345914e-06,
"loss": 3.3809,
"step": 158
},
{
"epoch": 0.1356655290102389,
"grad_norm": 54.77112579345703,
"learning_rate": 9.110512129380054e-06,
"loss": 3.5938,
"step": 159
},
{
"epoch": 0.13651877133105803,
"grad_norm": 18.681119918823242,
"learning_rate": 9.101527403414196e-06,
"loss": 3.4785,
"step": 160
},
{
"epoch": 0.13737201365187712,
"grad_norm": 62.4477424621582,
"learning_rate": 9.092542677448338e-06,
"loss": 3.2266,
"step": 161
},
{
"epoch": 0.13822525597269625,
"grad_norm": 97.80989074707031,
"learning_rate": 9.08355795148248e-06,
"loss": 3.0898,
"step": 162
},
{
"epoch": 0.13907849829351535,
"grad_norm": 91.97032928466797,
"learning_rate": 9.074573225516622e-06,
"loss": 3.6445,
"step": 163
},
{
"epoch": 0.13993174061433447,
"grad_norm": 31.477741241455078,
"learning_rate": 9.065588499550765e-06,
"loss": 3.3359,
"step": 164
},
{
"epoch": 0.1407849829351536,
"grad_norm": 21.278085708618164,
"learning_rate": 9.056603773584907e-06,
"loss": 3.3047,
"step": 165
},
{
"epoch": 0.1416382252559727,
"grad_norm": 52.15373229980469,
"learning_rate": 9.047619047619049e-06,
"loss": 3.5176,
"step": 166
},
{
"epoch": 0.14249146757679182,
"grad_norm": 14.358907699584961,
"learning_rate": 9.03863432165319e-06,
"loss": 2.8945,
"step": 167
},
{
"epoch": 0.14334470989761092,
"grad_norm": 33.82578659057617,
"learning_rate": 9.029649595687333e-06,
"loss": 3.3242,
"step": 168
},
{
"epoch": 0.14419795221843004,
"grad_norm": 72.78608703613281,
"learning_rate": 9.020664869721473e-06,
"loss": 3.416,
"step": 169
},
{
"epoch": 0.14505119453924914,
"grad_norm": 97.01647186279297,
"learning_rate": 9.011680143755617e-06,
"loss": 3.25,
"step": 170
},
{
"epoch": 0.14590443686006827,
"grad_norm": 54.42570114135742,
"learning_rate": 9.002695417789759e-06,
"loss": 3.1309,
"step": 171
},
{
"epoch": 0.14675767918088736,
"grad_norm": 26.412174224853516,
"learning_rate": 8.9937106918239e-06,
"loss": 3.4609,
"step": 172
},
{
"epoch": 0.1476109215017065,
"grad_norm": 71.91547393798828,
"learning_rate": 8.984725965858042e-06,
"loss": 3.1289,
"step": 173
},
{
"epoch": 0.14846416382252559,
"grad_norm": 66.65043640136719,
"learning_rate": 8.975741239892184e-06,
"loss": 3.2383,
"step": 174
},
{
"epoch": 0.1493174061433447,
"grad_norm": 164.26414489746094,
"learning_rate": 8.966756513926326e-06,
"loss": 4.3633,
"step": 175
},
{
"epoch": 0.15017064846416384,
"grad_norm": 73.09919738769531,
"learning_rate": 8.957771787960468e-06,
"loss": 3.2148,
"step": 176
},
{
"epoch": 0.15102389078498293,
"grad_norm": 40.517093658447266,
"learning_rate": 8.94878706199461e-06,
"loss": 2.9707,
"step": 177
},
{
"epoch": 0.15187713310580206,
"grad_norm": 76.36444854736328,
"learning_rate": 8.939802336028752e-06,
"loss": 2.9424,
"step": 178
},
{
"epoch": 0.15273037542662116,
"grad_norm": 119.0013198852539,
"learning_rate": 8.930817610062894e-06,
"loss": 3.1953,
"step": 179
},
{
"epoch": 0.15358361774744028,
"grad_norm": 103.3395004272461,
"learning_rate": 8.921832884097036e-06,
"loss": 3.543,
"step": 180
},
{
"epoch": 0.15443686006825938,
"grad_norm": 106.20706176757812,
"learning_rate": 8.912848158131178e-06,
"loss": 3.7734,
"step": 181
},
{
"epoch": 0.1552901023890785,
"grad_norm": 53.621829986572266,
"learning_rate": 8.903863432165319e-06,
"loss": 3.1689,
"step": 182
},
{
"epoch": 0.1561433447098976,
"grad_norm": 47.70130920410156,
"learning_rate": 8.89487870619946e-06,
"loss": 3.1543,
"step": 183
},
{
"epoch": 0.15699658703071673,
"grad_norm": 38.16180419921875,
"learning_rate": 8.885893980233603e-06,
"loss": 2.6494,
"step": 184
},
{
"epoch": 0.15784982935153583,
"grad_norm": 89.42051696777344,
"learning_rate": 8.876909254267747e-06,
"loss": 3.5508,
"step": 185
},
{
"epoch": 0.15870307167235495,
"grad_norm": 60.47245407104492,
"learning_rate": 8.867924528301887e-06,
"loss": 3.6777,
"step": 186
},
{
"epoch": 0.15955631399317405,
"grad_norm": 24.777610778808594,
"learning_rate": 8.85893980233603e-06,
"loss": 2.3027,
"step": 187
},
{
"epoch": 0.16040955631399317,
"grad_norm": 28.14823341369629,
"learning_rate": 8.849955076370171e-06,
"loss": 3.0215,
"step": 188
},
{
"epoch": 0.1612627986348123,
"grad_norm": 33.189239501953125,
"learning_rate": 8.840970350404313e-06,
"loss": 3.2891,
"step": 189
},
{
"epoch": 0.1621160409556314,
"grad_norm": 25.520509719848633,
"learning_rate": 8.831985624438455e-06,
"loss": 2.8379,
"step": 190
},
{
"epoch": 0.16296928327645052,
"grad_norm": 55.70583724975586,
"learning_rate": 8.823000898472598e-06,
"loss": 3.25,
"step": 191
},
{
"epoch": 0.16382252559726962,
"grad_norm": 26.865032196044922,
"learning_rate": 8.81401617250674e-06,
"loss": 3.1055,
"step": 192
},
{
"epoch": 0.16467576791808874,
"grad_norm": 22.27757453918457,
"learning_rate": 8.805031446540882e-06,
"loss": 2.5625,
"step": 193
},
{
"epoch": 0.16552901023890784,
"grad_norm": 22.520416259765625,
"learning_rate": 8.796046720575024e-06,
"loss": 2.8887,
"step": 194
},
{
"epoch": 0.16638225255972697,
"grad_norm": 18.727357864379883,
"learning_rate": 8.787061994609166e-06,
"loss": 2.6807,
"step": 195
},
{
"epoch": 0.16723549488054607,
"grad_norm": 19.526918411254883,
"learning_rate": 8.778077268643306e-06,
"loss": 2.9512,
"step": 196
},
{
"epoch": 0.1680887372013652,
"grad_norm": 25.042152404785156,
"learning_rate": 8.769092542677448e-06,
"loss": 3.168,
"step": 197
},
{
"epoch": 0.1689419795221843,
"grad_norm": 21.94442367553711,
"learning_rate": 8.76010781671159e-06,
"loss": 3.3008,
"step": 198
},
{
"epoch": 0.1697952218430034,
"grad_norm": 18.068660736083984,
"learning_rate": 8.751123090745734e-06,
"loss": 2.999,
"step": 199
},
{
"epoch": 0.17064846416382254,
"grad_norm": 54.0893669128418,
"learning_rate": 8.742138364779875e-06,
"loss": 2.8115,
"step": 200
},
{
"epoch": 0.17150170648464164,
"grad_norm": 87.9207992553711,
"learning_rate": 8.733153638814017e-06,
"loss": 3.6152,
"step": 201
},
{
"epoch": 0.17235494880546076,
"grad_norm": 33.18696975708008,
"learning_rate": 8.724168912848159e-06,
"loss": 3.0527,
"step": 202
},
{
"epoch": 0.17320819112627986,
"grad_norm": 36.34266662597656,
"learning_rate": 8.715184186882301e-06,
"loss": 3.2754,
"step": 203
},
{
"epoch": 0.17406143344709898,
"grad_norm": 55.32210922241211,
"learning_rate": 8.706199460916443e-06,
"loss": 3.9746,
"step": 204
},
{
"epoch": 0.17491467576791808,
"grad_norm": 27.2241268157959,
"learning_rate": 8.697214734950583e-06,
"loss": 3.4297,
"step": 205
},
{
"epoch": 0.1757679180887372,
"grad_norm": 19.6944580078125,
"learning_rate": 8.688230008984727e-06,
"loss": 3.2139,
"step": 206
},
{
"epoch": 0.1766211604095563,
"grad_norm": 14.382315635681152,
"learning_rate": 8.67924528301887e-06,
"loss": 2.7715,
"step": 207
},
{
"epoch": 0.17747440273037543,
"grad_norm": 20.982158660888672,
"learning_rate": 8.670260557053011e-06,
"loss": 3.3418,
"step": 208
},
{
"epoch": 0.17832764505119453,
"grad_norm": 23.547433853149414,
"learning_rate": 8.661275831087152e-06,
"loss": 2.9199,
"step": 209
},
{
"epoch": 0.17918088737201365,
"grad_norm": 43.464237213134766,
"learning_rate": 8.652291105121294e-06,
"loss": 3.0234,
"step": 210
},
{
"epoch": 0.18003412969283278,
"grad_norm": 41.555118560791016,
"learning_rate": 8.643306379155436e-06,
"loss": 3.0107,
"step": 211
},
{
"epoch": 0.18088737201365188,
"grad_norm": 42.51097869873047,
"learning_rate": 8.634321653189578e-06,
"loss": 3.1074,
"step": 212
},
{
"epoch": 0.181740614334471,
"grad_norm": 35.8163948059082,
"learning_rate": 8.62533692722372e-06,
"loss": 3.0488,
"step": 213
},
{
"epoch": 0.1825938566552901,
"grad_norm": 48.06075668334961,
"learning_rate": 8.616352201257862e-06,
"loss": 2.8594,
"step": 214
},
{
"epoch": 0.18344709897610922,
"grad_norm": 31.809709548950195,
"learning_rate": 8.607367475292004e-06,
"loss": 3.0605,
"step": 215
},
{
"epoch": 0.18430034129692832,
"grad_norm": 33.539005279541016,
"learning_rate": 8.598382749326146e-06,
"loss": 2.8164,
"step": 216
},
{
"epoch": 0.18515358361774745,
"grad_norm": 30.759517669677734,
"learning_rate": 8.589398023360288e-06,
"loss": 3.4043,
"step": 217
},
{
"epoch": 0.18600682593856654,
"grad_norm": 46.54279327392578,
"learning_rate": 8.58041329739443e-06,
"loss": 3.3164,
"step": 218
},
{
"epoch": 0.18686006825938567,
"grad_norm": 17.588998794555664,
"learning_rate": 8.571428571428571e-06,
"loss": 2.8232,
"step": 219
},
{
"epoch": 0.18771331058020477,
"grad_norm": 40.2357063293457,
"learning_rate": 8.562443845462715e-06,
"loss": 3.6035,
"step": 220
},
{
"epoch": 0.1885665529010239,
"grad_norm": 30.986467361450195,
"learning_rate": 8.553459119496857e-06,
"loss": 3.2402,
"step": 221
},
{
"epoch": 0.189419795221843,
"grad_norm": 71.314453125,
"learning_rate": 8.544474393530999e-06,
"loss": 2.8965,
"step": 222
},
{
"epoch": 0.19027303754266212,
"grad_norm": 60.04804611206055,
"learning_rate": 8.53548966756514e-06,
"loss": 3.2324,
"step": 223
},
{
"epoch": 0.19112627986348124,
"grad_norm": 25.102706909179688,
"learning_rate": 8.526504941599281e-06,
"loss": 3.1973,
"step": 224
},
{
"epoch": 0.19197952218430034,
"grad_norm": 58.8226203918457,
"learning_rate": 8.517520215633423e-06,
"loss": 3.291,
"step": 225
},
{
"epoch": 0.19283276450511946,
"grad_norm": 80.93440246582031,
"learning_rate": 8.508535489667566e-06,
"loss": 4.0488,
"step": 226
},
{
"epoch": 0.19368600682593856,
"grad_norm": 15.695361137390137,
"learning_rate": 8.499550763701708e-06,
"loss": 2.875,
"step": 227
},
{
"epoch": 0.1945392491467577,
"grad_norm": 18.42605209350586,
"learning_rate": 8.49056603773585e-06,
"loss": 3.0059,
"step": 228
},
{
"epoch": 0.19539249146757678,
"grad_norm": 20.952381134033203,
"learning_rate": 8.481581311769992e-06,
"loss": 3.6758,
"step": 229
},
{
"epoch": 0.1962457337883959,
"grad_norm": 33.53485107421875,
"learning_rate": 8.472596585804134e-06,
"loss": 2.6758,
"step": 230
},
{
"epoch": 0.197098976109215,
"grad_norm": 18.76603889465332,
"learning_rate": 8.463611859838276e-06,
"loss": 3.0684,
"step": 231
},
{
"epoch": 0.19795221843003413,
"grad_norm": 61.98395538330078,
"learning_rate": 8.454627133872418e-06,
"loss": 3.3242,
"step": 232
},
{
"epoch": 0.19880546075085323,
"grad_norm": 24.302837371826172,
"learning_rate": 8.445642407906558e-06,
"loss": 3.5234,
"step": 233
},
{
"epoch": 0.19965870307167236,
"grad_norm": 58.31713104248047,
"learning_rate": 8.436657681940702e-06,
"loss": 3.7285,
"step": 234
},
{
"epoch": 0.20051194539249148,
"grad_norm": 36.07301712036133,
"learning_rate": 8.427672955974844e-06,
"loss": 3.2148,
"step": 235
},
{
"epoch": 0.20136518771331058,
"grad_norm": 20.333580017089844,
"learning_rate": 8.418688230008986e-06,
"loss": 3.2344,
"step": 236
},
{
"epoch": 0.2022184300341297,
"grad_norm": 19.519014358520508,
"learning_rate": 8.409703504043127e-06,
"loss": 2.7363,
"step": 237
},
{
"epoch": 0.2030716723549488,
"grad_norm": 17.365482330322266,
"learning_rate": 8.400718778077269e-06,
"loss": 3.0752,
"step": 238
},
{
"epoch": 0.20392491467576793,
"grad_norm": 14.840271949768066,
"learning_rate": 8.391734052111411e-06,
"loss": 2.9824,
"step": 239
},
{
"epoch": 0.20477815699658702,
"grad_norm": 19.23467254638672,
"learning_rate": 8.382749326145553e-06,
"loss": 2.9961,
"step": 240
},
{
"epoch": 0.20563139931740615,
"grad_norm": 68.42906188964844,
"learning_rate": 8.373764600179695e-06,
"loss": 3.4277,
"step": 241
},
{
"epoch": 0.20648464163822525,
"grad_norm": 26.17658042907715,
"learning_rate": 8.364779874213837e-06,
"loss": 2.6973,
"step": 242
},
{
"epoch": 0.20733788395904437,
"grad_norm": 18.755210876464844,
"learning_rate": 8.35579514824798e-06,
"loss": 3.1953,
"step": 243
},
{
"epoch": 0.20819112627986347,
"grad_norm": 17.498382568359375,
"learning_rate": 8.346810422282121e-06,
"loss": 3.0918,
"step": 244
},
{
"epoch": 0.2090443686006826,
"grad_norm": 44.80198669433594,
"learning_rate": 8.337825696316264e-06,
"loss": 2.4717,
"step": 245
},
{
"epoch": 0.2098976109215017,
"grad_norm": 45.40264892578125,
"learning_rate": 8.328840970350404e-06,
"loss": 2.9092,
"step": 246
},
{
"epoch": 0.21075085324232082,
"grad_norm": 28.791826248168945,
"learning_rate": 8.319856244384546e-06,
"loss": 2.9834,
"step": 247
},
{
"epoch": 0.21160409556313994,
"grad_norm": 26.131162643432617,
"learning_rate": 8.31087151841869e-06,
"loss": 2.9482,
"step": 248
},
{
"epoch": 0.21245733788395904,
"grad_norm": 45.71311950683594,
"learning_rate": 8.301886792452832e-06,
"loss": 3.1572,
"step": 249
},
{
"epoch": 0.21331058020477817,
"grad_norm": 27.159473419189453,
"learning_rate": 8.292902066486972e-06,
"loss": 2.8818,
"step": 250
},
{
"epoch": 0.21416382252559726,
"grad_norm": 24.58170509338379,
"learning_rate": 8.283917340521114e-06,
"loss": 3.2188,
"step": 251
},
{
"epoch": 0.2150170648464164,
"grad_norm": 25.872392654418945,
"learning_rate": 8.274932614555256e-06,
"loss": 3.2979,
"step": 252
},
{
"epoch": 0.2158703071672355,
"grad_norm": 23.962533950805664,
"learning_rate": 8.265947888589399e-06,
"loss": 3.4648,
"step": 253
},
{
"epoch": 0.2167235494880546,
"grad_norm": 17.57655143737793,
"learning_rate": 8.25696316262354e-06,
"loss": 2.7656,
"step": 254
},
{
"epoch": 0.2175767918088737,
"grad_norm": 57.268821716308594,
"learning_rate": 8.247978436657683e-06,
"loss": 3.459,
"step": 255
},
{
"epoch": 0.21843003412969283,
"grad_norm": 24.482690811157227,
"learning_rate": 8.238993710691825e-06,
"loss": 3.584,
"step": 256
},
{
"epoch": 0.21928327645051193,
"grad_norm": 25.513710021972656,
"learning_rate": 8.230008984725967e-06,
"loss": 3.4512,
"step": 257
},
{
"epoch": 0.22013651877133106,
"grad_norm": 55.66819381713867,
"learning_rate": 8.221024258760109e-06,
"loss": 3.0996,
"step": 258
},
{
"epoch": 0.22098976109215018,
"grad_norm": 50.33326721191406,
"learning_rate": 8.212039532794251e-06,
"loss": 3.498,
"step": 259
},
{
"epoch": 0.22184300341296928,
"grad_norm": 30.594228744506836,
"learning_rate": 8.203054806828391e-06,
"loss": 3.1953,
"step": 260
},
{
"epoch": 0.2226962457337884,
"grad_norm": 20.082685470581055,
"learning_rate": 8.194070080862534e-06,
"loss": 3.2734,
"step": 261
},
{
"epoch": 0.2235494880546075,
"grad_norm": 23.434057235717773,
"learning_rate": 8.185085354896676e-06,
"loss": 3.0488,
"step": 262
},
{
"epoch": 0.22440273037542663,
"grad_norm": 75.93733215332031,
"learning_rate": 8.17610062893082e-06,
"loss": 3.4014,
"step": 263
},
{
"epoch": 0.22525597269624573,
"grad_norm": 34.896339416503906,
"learning_rate": 8.16711590296496e-06,
"loss": 2.9199,
"step": 264
},
{
"epoch": 0.22610921501706485,
"grad_norm": 14.582112312316895,
"learning_rate": 8.158131176999102e-06,
"loss": 2.9688,
"step": 265
},
{
"epoch": 0.22696245733788395,
"grad_norm": 35.01908874511719,
"learning_rate": 8.149146451033244e-06,
"loss": 3.0977,
"step": 266
},
{
"epoch": 0.22781569965870307,
"grad_norm": 18.878812789916992,
"learning_rate": 8.140161725067386e-06,
"loss": 2.9307,
"step": 267
},
{
"epoch": 0.22866894197952217,
"grad_norm": 38.857398986816406,
"learning_rate": 8.131176999101528e-06,
"loss": 2.8672,
"step": 268
},
{
"epoch": 0.2295221843003413,
"grad_norm": 21.270587921142578,
"learning_rate": 8.12219227313567e-06,
"loss": 2.79,
"step": 269
},
{
"epoch": 0.23037542662116042,
"grad_norm": 55.603302001953125,
"learning_rate": 8.113207547169812e-06,
"loss": 3.1348,
"step": 270
},
{
"epoch": 0.23122866894197952,
"grad_norm": 60.84667205810547,
"learning_rate": 8.104222821203954e-06,
"loss": 3.1846,
"step": 271
},
{
"epoch": 0.23208191126279865,
"grad_norm": 21.8660888671875,
"learning_rate": 8.095238095238097e-06,
"loss": 2.4336,
"step": 272
},
{
"epoch": 0.23293515358361774,
"grad_norm": 47.333717346191406,
"learning_rate": 8.086253369272239e-06,
"loss": 3.2207,
"step": 273
},
{
"epoch": 0.23378839590443687,
"grad_norm": 41.70451354980469,
"learning_rate": 8.077268643306379e-06,
"loss": 2.8359,
"step": 274
},
{
"epoch": 0.23464163822525597,
"grad_norm": 35.146942138671875,
"learning_rate": 8.068283917340521e-06,
"loss": 2.6738,
"step": 275
},
{
"epoch": 0.2354948805460751,
"grad_norm": 16.213695526123047,
"learning_rate": 8.059299191374663e-06,
"loss": 2.8086,
"step": 276
},
{
"epoch": 0.2363481228668942,
"grad_norm": 47.92852783203125,
"learning_rate": 8.050314465408805e-06,
"loss": 2.8867,
"step": 277
},
{
"epoch": 0.23720136518771331,
"grad_norm": 42.166404724121094,
"learning_rate": 8.041329739442947e-06,
"loss": 3.125,
"step": 278
},
{
"epoch": 0.2380546075085324,
"grad_norm": 24.129230499267578,
"learning_rate": 8.03234501347709e-06,
"loss": 3.043,
"step": 279
},
{
"epoch": 0.23890784982935154,
"grad_norm": 16.145126342773438,
"learning_rate": 8.023360287511232e-06,
"loss": 3.3672,
"step": 280
},
{
"epoch": 0.23976109215017063,
"grad_norm": 68.74685668945312,
"learning_rate": 8.014375561545374e-06,
"loss": 3.1924,
"step": 281
},
{
"epoch": 0.24061433447098976,
"grad_norm": 25.997495651245117,
"learning_rate": 8.005390835579516e-06,
"loss": 3.0293,
"step": 282
},
{
"epoch": 0.24146757679180889,
"grad_norm": 53.29498291015625,
"learning_rate": 7.996406109613656e-06,
"loss": 2.9922,
"step": 283
},
{
"epoch": 0.24232081911262798,
"grad_norm": 15.400269508361816,
"learning_rate": 7.9874213836478e-06,
"loss": 3.1387,
"step": 284
},
{
"epoch": 0.2431740614334471,
"grad_norm": 20.252784729003906,
"learning_rate": 7.978436657681942e-06,
"loss": 2.9648,
"step": 285
},
{
"epoch": 0.2440273037542662,
"grad_norm": 57.464752197265625,
"learning_rate": 7.969451931716084e-06,
"loss": 3.6719,
"step": 286
},
{
"epoch": 0.24488054607508533,
"grad_norm": 43.41289520263672,
"learning_rate": 7.960467205750224e-06,
"loss": 2.8242,
"step": 287
},
{
"epoch": 0.24573378839590443,
"grad_norm": 17.788986206054688,
"learning_rate": 7.951482479784367e-06,
"loss": 3.1875,
"step": 288
},
{
"epoch": 0.24658703071672355,
"grad_norm": 15.844386100769043,
"learning_rate": 7.942497753818509e-06,
"loss": 2.9785,
"step": 289
},
{
"epoch": 0.24744027303754265,
"grad_norm": 58.50739669799805,
"learning_rate": 7.93351302785265e-06,
"loss": 2.9883,
"step": 290
},
{
"epoch": 0.24829351535836178,
"grad_norm": 60.459251403808594,
"learning_rate": 7.924528301886793e-06,
"loss": 2.8467,
"step": 291
},
{
"epoch": 0.24914675767918087,
"grad_norm": 48.048728942871094,
"learning_rate": 7.915543575920935e-06,
"loss": 3.1133,
"step": 292
},
{
"epoch": 0.25,
"grad_norm": 25.91424560546875,
"learning_rate": 7.906558849955077e-06,
"loss": 2.5762,
"step": 293
},
{
"epoch": 0.2508532423208191,
"grad_norm": 65.37548065185547,
"learning_rate": 7.897574123989219e-06,
"loss": 3.1191,
"step": 294
},
{
"epoch": 0.25170648464163825,
"grad_norm": 48.94771194458008,
"learning_rate": 7.888589398023361e-06,
"loss": 2.6484,
"step": 295
},
{
"epoch": 0.2525597269624573,
"grad_norm": 45.35071563720703,
"learning_rate": 7.879604672057503e-06,
"loss": 3.1436,
"step": 296
},
{
"epoch": 0.25341296928327645,
"grad_norm": 50.98272705078125,
"learning_rate": 7.870619946091644e-06,
"loss": 2.5635,
"step": 297
},
{
"epoch": 0.25426621160409557,
"grad_norm": 24.23581886291504,
"learning_rate": 7.861635220125787e-06,
"loss": 3.0156,
"step": 298
},
{
"epoch": 0.2551194539249147,
"grad_norm": 61.62641906738281,
"learning_rate": 7.85265049415993e-06,
"loss": 3.0684,
"step": 299
},
{
"epoch": 0.25597269624573377,
"grad_norm": 31.647212982177734,
"learning_rate": 7.843665768194072e-06,
"loss": 2.6011,
"step": 300
},
{
"epoch": 0.2568259385665529,
"grad_norm": 29.92403793334961,
"learning_rate": 7.834681042228212e-06,
"loss": 3.0156,
"step": 301
},
{
"epoch": 0.257679180887372,
"grad_norm": 40.79433059692383,
"learning_rate": 7.825696316262354e-06,
"loss": 3.2021,
"step": 302
},
{
"epoch": 0.25853242320819114,
"grad_norm": 21.312335968017578,
"learning_rate": 7.816711590296496e-06,
"loss": 2.7988,
"step": 303
},
{
"epoch": 0.2593856655290102,
"grad_norm": 31.760292053222656,
"learning_rate": 7.807726864330638e-06,
"loss": 3.2148,
"step": 304
},
{
"epoch": 0.26023890784982934,
"grad_norm": 20.41486167907715,
"learning_rate": 7.79874213836478e-06,
"loss": 2.9932,
"step": 305
},
{
"epoch": 0.26109215017064846,
"grad_norm": 25.601892471313477,
"learning_rate": 7.789757412398922e-06,
"loss": 2.751,
"step": 306
},
{
"epoch": 0.2619453924914676,
"grad_norm": 24.824922561645508,
"learning_rate": 7.780772686433065e-06,
"loss": 3.0527,
"step": 307
},
{
"epoch": 0.2627986348122867,
"grad_norm": 41.55419158935547,
"learning_rate": 7.771787960467207e-06,
"loss": 2.877,
"step": 308
},
{
"epoch": 0.2636518771331058,
"grad_norm": 31.784027099609375,
"learning_rate": 7.762803234501349e-06,
"loss": 3.3652,
"step": 309
},
{
"epoch": 0.2645051194539249,
"grad_norm": 36.64832305908203,
"learning_rate": 7.75381850853549e-06,
"loss": 3.2246,
"step": 310
},
{
"epoch": 0.26535836177474403,
"grad_norm": 59.69587707519531,
"learning_rate": 7.744833782569631e-06,
"loss": 2.8984,
"step": 311
},
{
"epoch": 0.26621160409556316,
"grad_norm": 29.08624267578125,
"learning_rate": 7.735849056603775e-06,
"loss": 2.8701,
"step": 312
},
{
"epoch": 0.26706484641638223,
"grad_norm": 34.791015625,
"learning_rate": 7.726864330637917e-06,
"loss": 2.5869,
"step": 313
},
{
"epoch": 0.26791808873720135,
"grad_norm": 55.99431610107422,
"learning_rate": 7.717879604672058e-06,
"loss": 2.9512,
"step": 314
},
{
"epoch": 0.2687713310580205,
"grad_norm": 30.719738006591797,
"learning_rate": 7.7088948787062e-06,
"loss": 3.1318,
"step": 315
},
{
"epoch": 0.2696245733788396,
"grad_norm": 24.247756958007812,
"learning_rate": 7.699910152740342e-06,
"loss": 3.3125,
"step": 316
},
{
"epoch": 0.27047781569965873,
"grad_norm": 19.7833309173584,
"learning_rate": 7.690925426774484e-06,
"loss": 2.8135,
"step": 317
},
{
"epoch": 0.2713310580204778,
"grad_norm": 50.110103607177734,
"learning_rate": 7.681940700808626e-06,
"loss": 2.707,
"step": 318
},
{
"epoch": 0.2721843003412969,
"grad_norm": 21.2917423248291,
"learning_rate": 7.672955974842768e-06,
"loss": 2.5205,
"step": 319
},
{
"epoch": 0.27303754266211605,
"grad_norm": 33.85706329345703,
"learning_rate": 7.66397124887691e-06,
"loss": 2.7705,
"step": 320
},
{
"epoch": 0.2738907849829352,
"grad_norm": 59.83601760864258,
"learning_rate": 7.654986522911052e-06,
"loss": 3.1875,
"step": 321
},
{
"epoch": 0.27474402730375425,
"grad_norm": 61.3809928894043,
"learning_rate": 7.646001796945194e-06,
"loss": 3.1445,
"step": 322
},
{
"epoch": 0.27559726962457337,
"grad_norm": 77.88739776611328,
"learning_rate": 7.637017070979336e-06,
"loss": 2.9951,
"step": 323
},
{
"epoch": 0.2764505119453925,
"grad_norm": 47.42338180541992,
"learning_rate": 7.6280323450134775e-06,
"loss": 3.1553,
"step": 324
},
{
"epoch": 0.2773037542662116,
"grad_norm": 31.061063766479492,
"learning_rate": 7.61904761904762e-06,
"loss": 3.2158,
"step": 325
},
{
"epoch": 0.2781569965870307,
"grad_norm": 19.786115646362305,
"learning_rate": 7.6100628930817626e-06,
"loss": 2.5645,
"step": 326
},
{
"epoch": 0.2790102389078498,
"grad_norm": 18.45869255065918,
"learning_rate": 7.601078167115904e-06,
"loss": 2.6465,
"step": 327
},
{
"epoch": 0.27986348122866894,
"grad_norm": 43.45344924926758,
"learning_rate": 7.592093441150046e-06,
"loss": 2.7793,
"step": 328
},
{
"epoch": 0.28071672354948807,
"grad_norm": 42.01021957397461,
"learning_rate": 7.583108715184188e-06,
"loss": 2.8398,
"step": 329
},
{
"epoch": 0.2815699658703072,
"grad_norm": 22.967981338500977,
"learning_rate": 7.574123989218329e-06,
"loss": 3.2949,
"step": 330
},
{
"epoch": 0.28242320819112626,
"grad_norm": 48.885189056396484,
"learning_rate": 7.565139263252471e-06,
"loss": 2.7598,
"step": 331
},
{
"epoch": 0.2832764505119454,
"grad_norm": 83.62909698486328,
"learning_rate": 7.5561545372866126e-06,
"loss": 3.6533,
"step": 332
},
{
"epoch": 0.2841296928327645,
"grad_norm": 102.9934310913086,
"learning_rate": 7.5471698113207555e-06,
"loss": 2.9414,
"step": 333
},
{
"epoch": 0.28498293515358364,
"grad_norm": 22.28573989868164,
"learning_rate": 7.538185085354898e-06,
"loss": 3.0723,
"step": 334
},
{
"epoch": 0.2858361774744027,
"grad_norm": 20.038768768310547,
"learning_rate": 7.52920035938904e-06,
"loss": 3.2168,
"step": 335
},
{
"epoch": 0.28668941979522183,
"grad_norm": 17.91118049621582,
"learning_rate": 7.520215633423181e-06,
"loss": 3.1191,
"step": 336
},
{
"epoch": 0.28754266211604096,
"grad_norm": 23.860708236694336,
"learning_rate": 7.511230907457323e-06,
"loss": 2.8877,
"step": 337
},
{
"epoch": 0.2883959044368601,
"grad_norm": 25.451940536499023,
"learning_rate": 7.502246181491465e-06,
"loss": 2.9814,
"step": 338
},
{
"epoch": 0.28924914675767915,
"grad_norm": 44.59007263183594,
"learning_rate": 7.493261455525606e-06,
"loss": 3.0049,
"step": 339
},
{
"epoch": 0.2901023890784983,
"grad_norm": 14.881490707397461,
"learning_rate": 7.484276729559748e-06,
"loss": 2.5664,
"step": 340
},
{
"epoch": 0.2909556313993174,
"grad_norm": 44.933406829833984,
"learning_rate": 7.475292003593891e-06,
"loss": 2.8262,
"step": 341
},
{
"epoch": 0.29180887372013653,
"grad_norm": 44.93138122558594,
"learning_rate": 7.4663072776280334e-06,
"loss": 2.9414,
"step": 342
},
{
"epoch": 0.29266211604095566,
"grad_norm": 17.614646911621094,
"learning_rate": 7.457322551662175e-06,
"loss": 2.627,
"step": 343
},
{
"epoch": 0.2935153583617747,
"grad_norm": 34.46635055541992,
"learning_rate": 7.448337825696317e-06,
"loss": 2.9785,
"step": 344
},
{
"epoch": 0.29436860068259385,
"grad_norm": 65.03298950195312,
"learning_rate": 7.439353099730459e-06,
"loss": 2.6504,
"step": 345
},
{
"epoch": 0.295221843003413,
"grad_norm": 21.21845245361328,
"learning_rate": 7.4303683737646e-06,
"loss": 2.8721,
"step": 346
},
{
"epoch": 0.2960750853242321,
"grad_norm": 33.291107177734375,
"learning_rate": 7.421383647798742e-06,
"loss": 2.6748,
"step": 347
},
{
"epoch": 0.29692832764505117,
"grad_norm": 18.167999267578125,
"learning_rate": 7.412398921832885e-06,
"loss": 2.6123,
"step": 348
},
{
"epoch": 0.2977815699658703,
"grad_norm": 41.19535446166992,
"learning_rate": 7.403414195867027e-06,
"loss": 3.4941,
"step": 349
},
{
"epoch": 0.2986348122866894,
"grad_norm": 28.60065269470215,
"learning_rate": 7.3944294699011685e-06,
"loss": 3.0127,
"step": 350
},
{
"epoch": 0.29948805460750855,
"grad_norm": 28.26238441467285,
"learning_rate": 7.3854447439353106e-06,
"loss": 2.791,
"step": 351
},
{
"epoch": 0.3003412969283277,
"grad_norm": 31.91538429260254,
"learning_rate": 7.376460017969453e-06,
"loss": 2.7793,
"step": 352
},
{
"epoch": 0.30119453924914674,
"grad_norm": 19.526973724365234,
"learning_rate": 7.367475292003594e-06,
"loss": 2.8916,
"step": 353
},
{
"epoch": 0.30204778156996587,
"grad_norm": 39.5968132019043,
"learning_rate": 7.358490566037736e-06,
"loss": 2.6309,
"step": 354
},
{
"epoch": 0.302901023890785,
"grad_norm": 53.58340072631836,
"learning_rate": 7.349505840071879e-06,
"loss": 2.9912,
"step": 355
},
{
"epoch": 0.3037542662116041,
"grad_norm": 38.25190353393555,
"learning_rate": 7.340521114106021e-06,
"loss": 2.7041,
"step": 356
},
{
"epoch": 0.3046075085324232,
"grad_norm": 15.885865211486816,
"learning_rate": 7.331536388140162e-06,
"loss": 3.0859,
"step": 357
},
{
"epoch": 0.3054607508532423,
"grad_norm": 19.57723617553711,
"learning_rate": 7.322551662174304e-06,
"loss": 3.0664,
"step": 358
},
{
"epoch": 0.30631399317406144,
"grad_norm": 63.955474853515625,
"learning_rate": 7.313566936208446e-06,
"loss": 2.7041,
"step": 359
},
{
"epoch": 0.30716723549488056,
"grad_norm": 32.2467155456543,
"learning_rate": 7.304582210242588e-06,
"loss": 3.2461,
"step": 360
},
{
"epoch": 0.30802047781569963,
"grad_norm": 30.897621154785156,
"learning_rate": 7.29559748427673e-06,
"loss": 3.4238,
"step": 361
},
{
"epoch": 0.30887372013651876,
"grad_norm": 18.033702850341797,
"learning_rate": 7.286612758310873e-06,
"loss": 3.0957,
"step": 362
},
{
"epoch": 0.3097269624573379,
"grad_norm": 41.53689956665039,
"learning_rate": 7.277628032345015e-06,
"loss": 2.918,
"step": 363
},
{
"epoch": 0.310580204778157,
"grad_norm": 43.60328674316406,
"learning_rate": 7.268643306379156e-06,
"loss": 3.0557,
"step": 364
},
{
"epoch": 0.31143344709897613,
"grad_norm": 55.88149642944336,
"learning_rate": 7.259658580413298e-06,
"loss": 2.7529,
"step": 365
},
{
"epoch": 0.3122866894197952,
"grad_norm": 46.07794952392578,
"learning_rate": 7.250673854447439e-06,
"loss": 2.835,
"step": 366
},
{
"epoch": 0.31313993174061433,
"grad_norm": 19.480363845825195,
"learning_rate": 7.2416891284815814e-06,
"loss": 2.4336,
"step": 367
},
{
"epoch": 0.31399317406143346,
"grad_norm": 78.5523681640625,
"learning_rate": 7.2327044025157235e-06,
"loss": 2.9111,
"step": 368
},
{
"epoch": 0.3148464163822526,
"grad_norm": 56.7245979309082,
"learning_rate": 7.2237196765498665e-06,
"loss": 2.7549,
"step": 369
},
{
"epoch": 0.31569965870307165,
"grad_norm": 94.528076171875,
"learning_rate": 7.214734950584008e-06,
"loss": 2.9033,
"step": 370
},
{
"epoch": 0.3165529010238908,
"grad_norm": 62.33586883544922,
"learning_rate": 7.20575022461815e-06,
"loss": 3.4121,
"step": 371
},
{
"epoch": 0.3174061433447099,
"grad_norm": 19.558395385742188,
"learning_rate": 7.196765498652292e-06,
"loss": 2.9961,
"step": 372
},
{
"epoch": 0.318259385665529,
"grad_norm": 41.45348358154297,
"learning_rate": 7.187780772686433e-06,
"loss": 3.1143,
"step": 373
},
{
"epoch": 0.3191126279863481,
"grad_norm": 42.826805114746094,
"learning_rate": 7.178796046720575e-06,
"loss": 3.0479,
"step": 374
},
{
"epoch": 0.3199658703071672,
"grad_norm": 60.3271598815918,
"learning_rate": 7.169811320754717e-06,
"loss": 3.6201,
"step": 375
},
{
"epoch": 0.32081911262798635,
"grad_norm": 15.749074935913086,
"learning_rate": 7.16082659478886e-06,
"loss": 3.1504,
"step": 376
},
{
"epoch": 0.3216723549488055,
"grad_norm": 28.352935791015625,
"learning_rate": 7.1518418688230015e-06,
"loss": 2.6055,
"step": 377
},
{
"epoch": 0.3225255972696246,
"grad_norm": 22.02720069885254,
"learning_rate": 7.1428571428571436e-06,
"loss": 2.6562,
"step": 378
},
{
"epoch": 0.32337883959044367,
"grad_norm": 32.356258392333984,
"learning_rate": 7.133872416891286e-06,
"loss": 2.502,
"step": 379
},
{
"epoch": 0.3242320819112628,
"grad_norm": 55.318992614746094,
"learning_rate": 7.124887690925427e-06,
"loss": 3.8184,
"step": 380
},
{
"epoch": 0.3250853242320819,
"grad_norm": 13.6019926071167,
"learning_rate": 7.115902964959569e-06,
"loss": 2.7568,
"step": 381
},
{
"epoch": 0.32593856655290104,
"grad_norm": 30.500629425048828,
"learning_rate": 7.106918238993711e-06,
"loss": 2.5703,
"step": 382
},
{
"epoch": 0.3267918088737201,
"grad_norm": 19.479543685913086,
"learning_rate": 7.097933513027854e-06,
"loss": 3.1357,
"step": 383
},
{
"epoch": 0.32764505119453924,
"grad_norm": 14.30429744720459,
"learning_rate": 7.088948787061995e-06,
"loss": 2.4258,
"step": 384
},
{
"epoch": 0.32849829351535836,
"grad_norm": 46.64712142944336,
"learning_rate": 7.079964061096137e-06,
"loss": 2.4863,
"step": 385
},
{
"epoch": 0.3293515358361775,
"grad_norm": 14.86281681060791,
"learning_rate": 7.0709793351302794e-06,
"loss": 2.5391,
"step": 386
},
{
"epoch": 0.3302047781569966,
"grad_norm": 24.936386108398438,
"learning_rate": 7.061994609164421e-06,
"loss": 3.0732,
"step": 387
},
{
"epoch": 0.3310580204778157,
"grad_norm": 23.111101150512695,
"learning_rate": 7.053009883198563e-06,
"loss": 2.9229,
"step": 388
},
{
"epoch": 0.3319112627986348,
"grad_norm": 28.36639976501465,
"learning_rate": 7.044025157232705e-06,
"loss": 3.0352,
"step": 389
},
{
"epoch": 0.33276450511945393,
"grad_norm": 22.827180862426758,
"learning_rate": 7.035040431266848e-06,
"loss": 3.0488,
"step": 390
},
{
"epoch": 0.33361774744027306,
"grad_norm": 24.18820571899414,
"learning_rate": 7.026055705300989e-06,
"loss": 2.1445,
"step": 391
},
{
"epoch": 0.33447098976109213,
"grad_norm": 38.80826950073242,
"learning_rate": 7.017070979335131e-06,
"loss": 3.0645,
"step": 392
},
{
"epoch": 0.33532423208191126,
"grad_norm": 39.28004455566406,
"learning_rate": 7.008086253369273e-06,
"loss": 2.6309,
"step": 393
},
{
"epoch": 0.3361774744027304,
"grad_norm": 16.985010147094727,
"learning_rate": 6.9991015274034144e-06,
"loss": 2.792,
"step": 394
},
{
"epoch": 0.3370307167235495,
"grad_norm": 18.20982551574707,
"learning_rate": 6.9901168014375565e-06,
"loss": 3.1348,
"step": 395
},
{
"epoch": 0.3378839590443686,
"grad_norm": 23.503843307495117,
"learning_rate": 6.981132075471699e-06,
"loss": 2.4434,
"step": 396
},
{
"epoch": 0.3387372013651877,
"grad_norm": 69.62710571289062,
"learning_rate": 6.9721473495058416e-06,
"loss": 2.8184,
"step": 397
},
{
"epoch": 0.3395904436860068,
"grad_norm": 47.18648910522461,
"learning_rate": 6.963162623539983e-06,
"loss": 2.9189,
"step": 398
},
{
"epoch": 0.34044368600682595,
"grad_norm": 40.54623794555664,
"learning_rate": 6.954177897574125e-06,
"loss": 2.7744,
"step": 399
},
{
"epoch": 0.3412969283276451,
"grad_norm": 26.348918914794922,
"learning_rate": 6.945193171608267e-06,
"loss": 2.7061,
"step": 400
},
{
"epoch": 0.34215017064846415,
"grad_norm": 26.754854202270508,
"learning_rate": 6.936208445642408e-06,
"loss": 2.8203,
"step": 401
},
{
"epoch": 0.3430034129692833,
"grad_norm": 23.61028289794922,
"learning_rate": 6.92722371967655e-06,
"loss": 2.502,
"step": 402
},
{
"epoch": 0.3438566552901024,
"grad_norm": 51.461273193359375,
"learning_rate": 6.9182389937106915e-06,
"loss": 3.4434,
"step": 403
},
{
"epoch": 0.3447098976109215,
"grad_norm": 61.8600959777832,
"learning_rate": 6.9092542677448345e-06,
"loss": 3.0664,
"step": 404
},
{
"epoch": 0.3455631399317406,
"grad_norm": 36.4835319519043,
"learning_rate": 6.9002695417789766e-06,
"loss": 2.5781,
"step": 405
},
{
"epoch": 0.3464163822525597,
"grad_norm": 20.035572052001953,
"learning_rate": 6.891284815813119e-06,
"loss": 2.7451,
"step": 406
},
{
"epoch": 0.34726962457337884,
"grad_norm": 23.01044273376465,
"learning_rate": 6.88230008984726e-06,
"loss": 2.8359,
"step": 407
},
{
"epoch": 0.34812286689419797,
"grad_norm": 34.898773193359375,
"learning_rate": 6.873315363881402e-06,
"loss": 2.8604,
"step": 408
},
{
"epoch": 0.34897610921501704,
"grad_norm": 17.49709701538086,
"learning_rate": 6.864330637915544e-06,
"loss": 2.75,
"step": 409
},
{
"epoch": 0.34982935153583616,
"grad_norm": 32.71485900878906,
"learning_rate": 6.855345911949685e-06,
"loss": 3.1094,
"step": 410
},
{
"epoch": 0.3506825938566553,
"grad_norm": 19.570741653442383,
"learning_rate": 6.846361185983828e-06,
"loss": 2.9473,
"step": 411
},
{
"epoch": 0.3515358361774744,
"grad_norm": 29.854347229003906,
"learning_rate": 6.83737646001797e-06,
"loss": 2.8809,
"step": 412
},
{
"epoch": 0.35238907849829354,
"grad_norm": 43.52353286743164,
"learning_rate": 6.8283917340521124e-06,
"loss": 2.8164,
"step": 413
},
{
"epoch": 0.3532423208191126,
"grad_norm": 26.43115997314453,
"learning_rate": 6.819407008086254e-06,
"loss": 2.4014,
"step": 414
},
{
"epoch": 0.35409556313993173,
"grad_norm": 25.89423370361328,
"learning_rate": 6.810422282120396e-06,
"loss": 2.6406,
"step": 415
},
{
"epoch": 0.35494880546075086,
"grad_norm": 35.70558166503906,
"learning_rate": 6.801437556154538e-06,
"loss": 2.4619,
"step": 416
},
{
"epoch": 0.35580204778157,
"grad_norm": 33.59202194213867,
"learning_rate": 6.792452830188679e-06,
"loss": 2.8213,
"step": 417
},
{
"epoch": 0.35665529010238906,
"grad_norm": 25.010950088500977,
"learning_rate": 6.783468104222821e-06,
"loss": 2.8096,
"step": 418
},
{
"epoch": 0.3575085324232082,
"grad_norm": 23.81590461730957,
"learning_rate": 6.774483378256964e-06,
"loss": 2.3193,
"step": 419
},
{
"epoch": 0.3583617747440273,
"grad_norm": 42.35072708129883,
"learning_rate": 6.765498652291106e-06,
"loss": 2.6592,
"step": 420
},
{
"epoch": 0.35921501706484643,
"grad_norm": 40.074851989746094,
"learning_rate": 6.7565139263252475e-06,
"loss": 2.6318,
"step": 421
},
{
"epoch": 0.36006825938566556,
"grad_norm": 32.20216751098633,
"learning_rate": 6.7475292003593895e-06,
"loss": 3.2891,
"step": 422
},
{
"epoch": 0.3609215017064846,
"grad_norm": 37.29304122924805,
"learning_rate": 6.738544474393532e-06,
"loss": 2.9023,
"step": 423
},
{
"epoch": 0.36177474402730375,
"grad_norm": 32.8192024230957,
"learning_rate": 6.729559748427673e-06,
"loss": 2.9736,
"step": 424
},
{
"epoch": 0.3626279863481229,
"grad_norm": 23.262601852416992,
"learning_rate": 6.720575022461815e-06,
"loss": 2.4893,
"step": 425
},
{
"epoch": 0.363481228668942,
"grad_norm": 40.62841796875,
"learning_rate": 6.711590296495958e-06,
"loss": 2.9531,
"step": 426
},
{
"epoch": 0.3643344709897611,
"grad_norm": 30.518091201782227,
"learning_rate": 6.7026055705301e-06,
"loss": 2.4004,
"step": 427
},
{
"epoch": 0.3651877133105802,
"grad_norm": 28.2186336517334,
"learning_rate": 6.693620844564241e-06,
"loss": 2.8008,
"step": 428
},
{
"epoch": 0.3660409556313993,
"grad_norm": 14.801209449768066,
"learning_rate": 6.684636118598383e-06,
"loss": 2.4902,
"step": 429
},
{
"epoch": 0.36689419795221845,
"grad_norm": 19.97643280029297,
"learning_rate": 6.675651392632525e-06,
"loss": 2.6104,
"step": 430
},
{
"epoch": 0.3677474402730375,
"grad_norm": 33.75346374511719,
"learning_rate": 6.666666666666667e-06,
"loss": 2.6221,
"step": 431
},
{
"epoch": 0.36860068259385664,
"grad_norm": 31.13344383239746,
"learning_rate": 6.657681940700809e-06,
"loss": 2.9883,
"step": 432
},
{
"epoch": 0.36945392491467577,
"grad_norm": 24.86776351928711,
"learning_rate": 6.648697214734952e-06,
"loss": 3.3652,
"step": 433
},
{
"epoch": 0.3703071672354949,
"grad_norm": 55.863922119140625,
"learning_rate": 6.639712488769094e-06,
"loss": 2.8779,
"step": 434
},
{
"epoch": 0.371160409556314,
"grad_norm": 14.440893173217773,
"learning_rate": 6.630727762803235e-06,
"loss": 2.8555,
"step": 435
},
{
"epoch": 0.3720136518771331,
"grad_norm": 16.983476638793945,
"learning_rate": 6.621743036837377e-06,
"loss": 3.1016,
"step": 436
},
{
"epoch": 0.3728668941979522,
"grad_norm": 17.240015029907227,
"learning_rate": 6.612758310871519e-06,
"loss": 2.8311,
"step": 437
},
{
"epoch": 0.37372013651877134,
"grad_norm": 18.871740341186523,
"learning_rate": 6.60377358490566e-06,
"loss": 2.8125,
"step": 438
},
{
"epoch": 0.37457337883959047,
"grad_norm": 36.629554748535156,
"learning_rate": 6.5947888589398025e-06,
"loss": 2.7119,
"step": 439
},
{
"epoch": 0.37542662116040953,
"grad_norm": 22.077198028564453,
"learning_rate": 6.5858041329739454e-06,
"loss": 2.5117,
"step": 440
},
{
"epoch": 0.37627986348122866,
"grad_norm": 36.363304138183594,
"learning_rate": 6.576819407008087e-06,
"loss": 3.335,
"step": 441
},
{
"epoch": 0.3771331058020478,
"grad_norm": 17.072507858276367,
"learning_rate": 6.567834681042229e-06,
"loss": 2.8789,
"step": 442
},
{
"epoch": 0.3779863481228669,
"grad_norm": 30.818571090698242,
"learning_rate": 6.558849955076371e-06,
"loss": 2.8848,
"step": 443
},
{
"epoch": 0.378839590443686,
"grad_norm": 35.21898651123047,
"learning_rate": 6.549865229110512e-06,
"loss": 2.6309,
"step": 444
},
{
"epoch": 0.3796928327645051,
"grad_norm": 24.29189109802246,
"learning_rate": 6.540880503144654e-06,
"loss": 2.8311,
"step": 445
},
{
"epoch": 0.38054607508532423,
"grad_norm": 27.211631774902344,
"learning_rate": 6.531895777178796e-06,
"loss": 2.7578,
"step": 446
},
{
"epoch": 0.38139931740614336,
"grad_norm": 20.377609252929688,
"learning_rate": 6.522911051212939e-06,
"loss": 2.5107,
"step": 447
},
{
"epoch": 0.3822525597269625,
"grad_norm": 31.11419677734375,
"learning_rate": 6.5139263252470805e-06,
"loss": 3.0205,
"step": 448
},
{
"epoch": 0.38310580204778155,
"grad_norm": 27.811227798461914,
"learning_rate": 6.5049415992812226e-06,
"loss": 2.5117,
"step": 449
},
{
"epoch": 0.3839590443686007,
"grad_norm": 27.51049041748047,
"learning_rate": 6.495956873315365e-06,
"loss": 2.748,
"step": 450
},
{
"epoch": 0.3848122866894198,
"grad_norm": 23.872310638427734,
"learning_rate": 6.486972147349506e-06,
"loss": 3.1309,
"step": 451
},
{
"epoch": 0.3856655290102389,
"grad_norm": 25.639631271362305,
"learning_rate": 6.477987421383648e-06,
"loss": 3.0293,
"step": 452
},
{
"epoch": 0.386518771331058,
"grad_norm": 78.10359954833984,
"learning_rate": 6.46900269541779e-06,
"loss": 2.7129,
"step": 453
},
{
"epoch": 0.3873720136518771,
"grad_norm": 33.32661056518555,
"learning_rate": 6.460017969451933e-06,
"loss": 3.0215,
"step": 454
},
{
"epoch": 0.38822525597269625,
"grad_norm": 18.216999053955078,
"learning_rate": 6.451033243486074e-06,
"loss": 2.6982,
"step": 455
},
{
"epoch": 0.3890784982935154,
"grad_norm": 15.191929817199707,
"learning_rate": 6.442048517520216e-06,
"loss": 2.6016,
"step": 456
},
{
"epoch": 0.38993174061433444,
"grad_norm": 20.197877883911133,
"learning_rate": 6.433063791554358e-06,
"loss": 2.791,
"step": 457
},
{
"epoch": 0.39078498293515357,
"grad_norm": 20.7491397857666,
"learning_rate": 6.4240790655885e-06,
"loss": 2.3955,
"step": 458
},
{
"epoch": 0.3916382252559727,
"grad_norm": 14.716793060302734,
"learning_rate": 6.415094339622642e-06,
"loss": 2.6826,
"step": 459
},
{
"epoch": 0.3924914675767918,
"grad_norm": 23.43107032775879,
"learning_rate": 6.406109613656784e-06,
"loss": 3.1709,
"step": 460
},
{
"epoch": 0.39334470989761094,
"grad_norm": 17.54826545715332,
"learning_rate": 6.397124887690927e-06,
"loss": 2.3867,
"step": 461
},
{
"epoch": 0.39419795221843,
"grad_norm": 83.90989685058594,
"learning_rate": 6.388140161725068e-06,
"loss": 2.9434,
"step": 462
},
{
"epoch": 0.39505119453924914,
"grad_norm": 69.95578002929688,
"learning_rate": 6.37915543575921e-06,
"loss": 2.5518,
"step": 463
},
{
"epoch": 0.39590443686006827,
"grad_norm": 53.826499938964844,
"learning_rate": 6.370170709793352e-06,
"loss": 2.4453,
"step": 464
},
{
"epoch": 0.3967576791808874,
"grad_norm": 18.601577758789062,
"learning_rate": 6.3611859838274934e-06,
"loss": 2.9531,
"step": 465
},
{
"epoch": 0.39761092150170646,
"grad_norm": 34.0197868347168,
"learning_rate": 6.3522012578616355e-06,
"loss": 2.4746,
"step": 466
},
{
"epoch": 0.3984641638225256,
"grad_norm": 44.096134185791016,
"learning_rate": 6.343216531895778e-06,
"loss": 2.8955,
"step": 467
},
{
"epoch": 0.3993174061433447,
"grad_norm": 24.102462768554688,
"learning_rate": 6.3342318059299205e-06,
"loss": 2.792,
"step": 468
},
{
"epoch": 0.40017064846416384,
"grad_norm": 17.850786209106445,
"learning_rate": 6.325247079964062e-06,
"loss": 2.3228,
"step": 469
},
{
"epoch": 0.40102389078498296,
"grad_norm": 21.928977966308594,
"learning_rate": 6.316262353998204e-06,
"loss": 2.6504,
"step": 470
},
{
"epoch": 0.40187713310580203,
"grad_norm": 25.40172004699707,
"learning_rate": 6.307277628032346e-06,
"loss": 2.6084,
"step": 471
},
{
"epoch": 0.40273037542662116,
"grad_norm": 16.1717586517334,
"learning_rate": 6.298292902066487e-06,
"loss": 2.0674,
"step": 472
},
{
"epoch": 0.4035836177474403,
"grad_norm": 29.402706146240234,
"learning_rate": 6.289308176100629e-06,
"loss": 2.916,
"step": 473
},
{
"epoch": 0.4044368600682594,
"grad_norm": 23.10331153869629,
"learning_rate": 6.2803234501347705e-06,
"loss": 2.6895,
"step": 474
},
{
"epoch": 0.4052901023890785,
"grad_norm": 23.997806549072266,
"learning_rate": 6.2713387241689135e-06,
"loss": 2.6953,
"step": 475
},
{
"epoch": 0.4061433447098976,
"grad_norm": 39.14799118041992,
"learning_rate": 6.2623539982030556e-06,
"loss": 2.625,
"step": 476
},
{
"epoch": 0.4069965870307167,
"grad_norm": 29.97382164001465,
"learning_rate": 6.253369272237198e-06,
"loss": 3.2129,
"step": 477
},
{
"epoch": 0.40784982935153585,
"grad_norm": 17.89264678955078,
"learning_rate": 6.244384546271339e-06,
"loss": 2.6367,
"step": 478
},
{
"epoch": 0.4087030716723549,
"grad_norm": 40.39899444580078,
"learning_rate": 6.235399820305481e-06,
"loss": 2.8965,
"step": 479
},
{
"epoch": 0.40955631399317405,
"grad_norm": 48.41572952270508,
"learning_rate": 6.226415094339623e-06,
"loss": 3.0703,
"step": 480
},
{
"epoch": 0.4104095563139932,
"grad_norm": 31.183040618896484,
"learning_rate": 6.217430368373764e-06,
"loss": 2.1592,
"step": 481
},
{
"epoch": 0.4112627986348123,
"grad_norm": 36.558773040771484,
"learning_rate": 6.208445642407907e-06,
"loss": 2.3945,
"step": 482
},
{
"epoch": 0.4121160409556314,
"grad_norm": 19.117509841918945,
"learning_rate": 6.199460916442049e-06,
"loss": 3.0605,
"step": 483
},
{
"epoch": 0.4129692832764505,
"grad_norm": 48.38492965698242,
"learning_rate": 6.1904761904761914e-06,
"loss": 2.291,
"step": 484
},
{
"epoch": 0.4138225255972696,
"grad_norm": 21.41737937927246,
"learning_rate": 6.181491464510333e-06,
"loss": 2.8252,
"step": 485
},
{
"epoch": 0.41467576791808874,
"grad_norm": 42.10402297973633,
"learning_rate": 6.172506738544475e-06,
"loss": 3.0918,
"step": 486
},
{
"epoch": 0.41552901023890787,
"grad_norm": 46.72148895263672,
"learning_rate": 6.163522012578617e-06,
"loss": 2.6162,
"step": 487
},
{
"epoch": 0.41638225255972694,
"grad_norm": 21.43707847595215,
"learning_rate": 6.154537286612758e-06,
"loss": 2.2754,
"step": 488
},
{
"epoch": 0.41723549488054607,
"grad_norm": 16.188798904418945,
"learning_rate": 6.145552560646901e-06,
"loss": 2.5078,
"step": 489
},
{
"epoch": 0.4180887372013652,
"grad_norm": 19.047313690185547,
"learning_rate": 6.136567834681043e-06,
"loss": 2.9209,
"step": 490
},
{
"epoch": 0.4189419795221843,
"grad_norm": 36.833744049072266,
"learning_rate": 6.127583108715185e-06,
"loss": 2.5791,
"step": 491
},
{
"epoch": 0.4197952218430034,
"grad_norm": 19.706417083740234,
"learning_rate": 6.1185983827493264e-06,
"loss": 2.9268,
"step": 492
},
{
"epoch": 0.4206484641638225,
"grad_norm": 33.76554870605469,
"learning_rate": 6.1096136567834685e-06,
"loss": 2.4229,
"step": 493
},
{
"epoch": 0.42150170648464164,
"grad_norm": 35.278297424316406,
"learning_rate": 6.100628930817611e-06,
"loss": 2.7832,
"step": 494
},
{
"epoch": 0.42235494880546076,
"grad_norm": 25.52465057373047,
"learning_rate": 6.091644204851752e-06,
"loss": 2.8623,
"step": 495
},
{
"epoch": 0.4232081911262799,
"grad_norm": 50.70538330078125,
"learning_rate": 6.082659478885895e-06,
"loss": 2.7324,
"step": 496
},
{
"epoch": 0.42406143344709896,
"grad_norm": 33.23221206665039,
"learning_rate": 6.073674752920037e-06,
"loss": 2.8828,
"step": 497
},
{
"epoch": 0.4249146757679181,
"grad_norm": 44.4984245300293,
"learning_rate": 6.064690026954179e-06,
"loss": 2.7217,
"step": 498
},
{
"epoch": 0.4257679180887372,
"grad_norm": 40.04144287109375,
"learning_rate": 6.05570530098832e-06,
"loss": 2.4414,
"step": 499
},
{
"epoch": 0.42662116040955633,
"grad_norm": 31.808074951171875,
"learning_rate": 6.046720575022462e-06,
"loss": 2.5635,
"step": 500
},
{
"epoch": 0.4274744027303754,
"grad_norm": 27.19110679626465,
"learning_rate": 6.037735849056604e-06,
"loss": 2.6348,
"step": 501
},
{
"epoch": 0.4283276450511945,
"grad_norm": 52.006351470947266,
"learning_rate": 6.028751123090746e-06,
"loss": 3.4004,
"step": 502
},
{
"epoch": 0.42918088737201365,
"grad_norm": 49.40862274169922,
"learning_rate": 6.019766397124888e-06,
"loss": 2.6865,
"step": 503
},
{
"epoch": 0.4300341296928328,
"grad_norm": 17.69283676147461,
"learning_rate": 6.010781671159031e-06,
"loss": 2.4824,
"step": 504
},
{
"epoch": 0.4308873720136519,
"grad_norm": 20.699617385864258,
"learning_rate": 6.001796945193173e-06,
"loss": 3.0488,
"step": 505
},
{
"epoch": 0.431740614334471,
"grad_norm": 56.287269592285156,
"learning_rate": 5.992812219227314e-06,
"loss": 3.0703,
"step": 506
},
{
"epoch": 0.4325938566552901,
"grad_norm": 19.926307678222656,
"learning_rate": 5.983827493261456e-06,
"loss": 2.7871,
"step": 507
},
{
"epoch": 0.4334470989761092,
"grad_norm": 33.74576187133789,
"learning_rate": 5.974842767295598e-06,
"loss": 2.9238,
"step": 508
},
{
"epoch": 0.43430034129692835,
"grad_norm": 27.027666091918945,
"learning_rate": 5.965858041329739e-06,
"loss": 2.2666,
"step": 509
},
{
"epoch": 0.4351535836177474,
"grad_norm": 49.87267303466797,
"learning_rate": 5.9568733153638815e-06,
"loss": 2.4355,
"step": 510
},
{
"epoch": 0.43600682593856654,
"grad_norm": 33.75191879272461,
"learning_rate": 5.9478885893980244e-06,
"loss": 2.1299,
"step": 511
},
{
"epoch": 0.43686006825938567,
"grad_norm": 23.760793685913086,
"learning_rate": 5.938903863432166e-06,
"loss": 2.9541,
"step": 512
},
{
"epoch": 0.4377133105802048,
"grad_norm": 19.85642433166504,
"learning_rate": 5.929919137466308e-06,
"loss": 2.9023,
"step": 513
},
{
"epoch": 0.43856655290102387,
"grad_norm": 42.32032775878906,
"learning_rate": 5.92093441150045e-06,
"loss": 2.8652,
"step": 514
},
{
"epoch": 0.439419795221843,
"grad_norm": 32.215065002441406,
"learning_rate": 5.911949685534591e-06,
"loss": 3.1729,
"step": 515
},
{
"epoch": 0.4402730375426621,
"grad_norm": 31.592498779296875,
"learning_rate": 5.902964959568733e-06,
"loss": 2.5439,
"step": 516
},
{
"epoch": 0.44112627986348124,
"grad_norm": 26.448612213134766,
"learning_rate": 5.893980233602875e-06,
"loss": 2.3545,
"step": 517
},
{
"epoch": 0.44197952218430037,
"grad_norm": 49.79834747314453,
"learning_rate": 5.884995507637018e-06,
"loss": 2.8105,
"step": 518
},
{
"epoch": 0.44283276450511944,
"grad_norm": 45.154701232910156,
"learning_rate": 5.8760107816711595e-06,
"loss": 2.9541,
"step": 519
},
{
"epoch": 0.44368600682593856,
"grad_norm": 41.03085708618164,
"learning_rate": 5.8670260557053015e-06,
"loss": 2.8398,
"step": 520
},
{
"epoch": 0.4445392491467577,
"grad_norm": 21.58003807067871,
"learning_rate": 5.858041329739444e-06,
"loss": 2.7559,
"step": 521
},
{
"epoch": 0.4453924914675768,
"grad_norm": 21.989830017089844,
"learning_rate": 5.849056603773585e-06,
"loss": 3.0488,
"step": 522
},
{
"epoch": 0.4462457337883959,
"grad_norm": 14.759679794311523,
"learning_rate": 5.840071877807727e-06,
"loss": 2.9902,
"step": 523
},
{
"epoch": 0.447098976109215,
"grad_norm": 14.680983543395996,
"learning_rate": 5.831087151841869e-06,
"loss": 2.4238,
"step": 524
},
{
"epoch": 0.44795221843003413,
"grad_norm": 20.452116012573242,
"learning_rate": 5.822102425876012e-06,
"loss": 2.7881,
"step": 525
},
{
"epoch": 0.44880546075085326,
"grad_norm": 15.639450073242188,
"learning_rate": 5.813117699910153e-06,
"loss": 2.6191,
"step": 526
},
{
"epoch": 0.4496587030716723,
"grad_norm": 38.403995513916016,
"learning_rate": 5.804132973944295e-06,
"loss": 2.6895,
"step": 527
},
{
"epoch": 0.45051194539249145,
"grad_norm": 14.627884864807129,
"learning_rate": 5.795148247978437e-06,
"loss": 2.7598,
"step": 528
},
{
"epoch": 0.4513651877133106,
"grad_norm": 17.250015258789062,
"learning_rate": 5.786163522012579e-06,
"loss": 2.5088,
"step": 529
},
{
"epoch": 0.4522184300341297,
"grad_norm": 36.893882751464844,
"learning_rate": 5.777178796046721e-06,
"loss": 2.6611,
"step": 530
},
{
"epoch": 0.45307167235494883,
"grad_norm": 29.592458724975586,
"learning_rate": 5.768194070080863e-06,
"loss": 2.3877,
"step": 531
},
{
"epoch": 0.4539249146757679,
"grad_norm": 29.255516052246094,
"learning_rate": 5.759209344115006e-06,
"loss": 3.1191,
"step": 532
},
{
"epoch": 0.454778156996587,
"grad_norm": 27.445293426513672,
"learning_rate": 5.750224618149147e-06,
"loss": 2.4629,
"step": 533
},
{
"epoch": 0.45563139931740615,
"grad_norm": 20.05036735534668,
"learning_rate": 5.741239892183289e-06,
"loss": 2.5107,
"step": 534
},
{
"epoch": 0.4564846416382253,
"grad_norm": 16.583898544311523,
"learning_rate": 5.732255166217431e-06,
"loss": 3.3027,
"step": 535
},
{
"epoch": 0.45733788395904434,
"grad_norm": 34.19240188598633,
"learning_rate": 5.723270440251572e-06,
"loss": 2.3574,
"step": 536
},
{
"epoch": 0.45819112627986347,
"grad_norm": 29.18450164794922,
"learning_rate": 5.7142857142857145e-06,
"loss": 2.3818,
"step": 537
},
{
"epoch": 0.4590443686006826,
"grad_norm": 26.387821197509766,
"learning_rate": 5.705300988319857e-06,
"loss": 2.709,
"step": 538
},
{
"epoch": 0.4598976109215017,
"grad_norm": 19.58378028869629,
"learning_rate": 5.6963162623539995e-06,
"loss": 2.6631,
"step": 539
},
{
"epoch": 0.46075085324232085,
"grad_norm": 25.05061912536621,
"learning_rate": 5.687331536388141e-06,
"loss": 2.6152,
"step": 540
},
{
"epoch": 0.4616040955631399,
"grad_norm": 33.281044006347656,
"learning_rate": 5.678346810422283e-06,
"loss": 2.6494,
"step": 541
},
{
"epoch": 0.46245733788395904,
"grad_norm": 15.196967124938965,
"learning_rate": 5.669362084456425e-06,
"loss": 2.9629,
"step": 542
},
{
"epoch": 0.46331058020477817,
"grad_norm": 48.015869140625,
"learning_rate": 5.660377358490566e-06,
"loss": 2.7188,
"step": 543
},
{
"epoch": 0.4641638225255973,
"grad_norm": 40.080692291259766,
"learning_rate": 5.651392632524708e-06,
"loss": 3.4395,
"step": 544
},
{
"epoch": 0.46501706484641636,
"grad_norm": 14.495575904846191,
"learning_rate": 5.64240790655885e-06,
"loss": 2.5498,
"step": 545
},
{
"epoch": 0.4658703071672355,
"grad_norm": 20.11421775817871,
"learning_rate": 5.6334231805929925e-06,
"loss": 2.4463,
"step": 546
},
{
"epoch": 0.4667235494880546,
"grad_norm": 22.752029418945312,
"learning_rate": 5.6244384546271346e-06,
"loss": 2.376,
"step": 547
},
{
"epoch": 0.46757679180887374,
"grad_norm": 31.627084732055664,
"learning_rate": 5.615453728661277e-06,
"loss": 2.3809,
"step": 548
},
{
"epoch": 0.4684300341296928,
"grad_norm": 45.1749382019043,
"learning_rate": 5.606469002695418e-06,
"loss": 2.6025,
"step": 549
},
{
"epoch": 0.46928327645051193,
"grad_norm": 14.6268892288208,
"learning_rate": 5.59748427672956e-06,
"loss": 2.3457,
"step": 550
},
{
"epoch": 0.47013651877133106,
"grad_norm": 22.34581756591797,
"learning_rate": 5.588499550763702e-06,
"loss": 2.9863,
"step": 551
},
{
"epoch": 0.4709897610921502,
"grad_norm": 30.1942195892334,
"learning_rate": 5.579514824797843e-06,
"loss": 3.0391,
"step": 552
},
{
"epoch": 0.4718430034129693,
"grad_norm": 44.00593566894531,
"learning_rate": 5.570530098831986e-06,
"loss": 3.0767,
"step": 553
},
{
"epoch": 0.4726962457337884,
"grad_norm": 20.9268798828125,
"learning_rate": 5.561545372866128e-06,
"loss": 2.3945,
"step": 554
},
{
"epoch": 0.4735494880546075,
"grad_norm": 17.279001235961914,
"learning_rate": 5.55256064690027e-06,
"loss": 2.2637,
"step": 555
},
{
"epoch": 0.47440273037542663,
"grad_norm": 35.23509979248047,
"learning_rate": 5.543575920934412e-06,
"loss": 2.5107,
"step": 556
},
{
"epoch": 0.47525597269624575,
"grad_norm": 26.05479621887207,
"learning_rate": 5.534591194968554e-06,
"loss": 2.457,
"step": 557
},
{
"epoch": 0.4761092150170648,
"grad_norm": 50.16437530517578,
"learning_rate": 5.525606469002696e-06,
"loss": 2.9473,
"step": 558
},
{
"epoch": 0.47696245733788395,
"grad_norm": 27.86246109008789,
"learning_rate": 5.516621743036837e-06,
"loss": 2.6533,
"step": 559
},
{
"epoch": 0.4778156996587031,
"grad_norm": 43.706398010253906,
"learning_rate": 5.50763701707098e-06,
"loss": 2.8643,
"step": 560
},
{
"epoch": 0.4786689419795222,
"grad_norm": 41.53769302368164,
"learning_rate": 5.498652291105122e-06,
"loss": 2.7725,
"step": 561
},
{
"epoch": 0.47952218430034127,
"grad_norm": 46.46355438232422,
"learning_rate": 5.489667565139264e-06,
"loss": 3.3379,
"step": 562
},
{
"epoch": 0.4803754266211604,
"grad_norm": 20.3262882232666,
"learning_rate": 5.4806828391734054e-06,
"loss": 2.7764,
"step": 563
},
{
"epoch": 0.4812286689419795,
"grad_norm": 25.64153480529785,
"learning_rate": 5.4716981132075475e-06,
"loss": 2.6289,
"step": 564
},
{
"epoch": 0.48208191126279865,
"grad_norm": 23.21479606628418,
"learning_rate": 5.46271338724169e-06,
"loss": 2.793,
"step": 565
},
{
"epoch": 0.48293515358361777,
"grad_norm": 25.748003005981445,
"learning_rate": 5.453728661275831e-06,
"loss": 2.5664,
"step": 566
},
{
"epoch": 0.48378839590443684,
"grad_norm": 15.132332801818848,
"learning_rate": 5.444743935309974e-06,
"loss": 2.5977,
"step": 567
},
{
"epoch": 0.48464163822525597,
"grad_norm": 22.88768768310547,
"learning_rate": 5.435759209344116e-06,
"loss": 2.4688,
"step": 568
},
{
"epoch": 0.4854948805460751,
"grad_norm": 40.691104888916016,
"learning_rate": 5.426774483378258e-06,
"loss": 2.627,
"step": 569
},
{
"epoch": 0.4863481228668942,
"grad_norm": 19.89053726196289,
"learning_rate": 5.417789757412399e-06,
"loss": 2.6426,
"step": 570
},
{
"epoch": 0.4872013651877133,
"grad_norm": 52.61355209350586,
"learning_rate": 5.408805031446541e-06,
"loss": 3.0352,
"step": 571
},
{
"epoch": 0.4880546075085324,
"grad_norm": 25.755590438842773,
"learning_rate": 5.399820305480683e-06,
"loss": 2.8721,
"step": 572
},
{
"epoch": 0.48890784982935154,
"grad_norm": 49.290321350097656,
"learning_rate": 5.390835579514825e-06,
"loss": 2.9473,
"step": 573
},
{
"epoch": 0.48976109215017066,
"grad_norm": 17.155630111694336,
"learning_rate": 5.3818508535489676e-06,
"loss": 2.6191,
"step": 574
},
{
"epoch": 0.4906143344709898,
"grad_norm": 20.74138641357422,
"learning_rate": 5.37286612758311e-06,
"loss": 2.5684,
"step": 575
},
{
"epoch": 0.49146757679180886,
"grad_norm": 38.56920623779297,
"learning_rate": 5.363881401617252e-06,
"loss": 2.6221,
"step": 576
},
{
"epoch": 0.492320819112628,
"grad_norm": 31.176231384277344,
"learning_rate": 5.354896675651393e-06,
"loss": 2.3271,
"step": 577
},
{
"epoch": 0.4931740614334471,
"grad_norm": 54.210899353027344,
"learning_rate": 5.345911949685535e-06,
"loss": 2.4258,
"step": 578
},
{
"epoch": 0.49402730375426623,
"grad_norm": 21.1136474609375,
"learning_rate": 5.336927223719677e-06,
"loss": 2.6562,
"step": 579
},
{
"epoch": 0.4948805460750853,
"grad_norm": 16.198816299438477,
"learning_rate": 5.327942497753818e-06,
"loss": 3.1621,
"step": 580
},
{
"epoch": 0.49573378839590443,
"grad_norm": 25.892831802368164,
"learning_rate": 5.3189577717879605e-06,
"loss": 2.793,
"step": 581
},
{
"epoch": 0.49658703071672355,
"grad_norm": 15.013483047485352,
"learning_rate": 5.3099730458221034e-06,
"loss": 2.4639,
"step": 582
},
{
"epoch": 0.4974402730375427,
"grad_norm": 35.05656051635742,
"learning_rate": 5.300988319856245e-06,
"loss": 2.8516,
"step": 583
},
{
"epoch": 0.49829351535836175,
"grad_norm": 27.9871768951416,
"learning_rate": 5.292003593890387e-06,
"loss": 2.4189,
"step": 584
},
{
"epoch": 0.4991467576791809,
"grad_norm": 15.608467102050781,
"learning_rate": 5.283018867924529e-06,
"loss": 2.5889,
"step": 585
},
{
"epoch": 0.5,
"grad_norm": 31.001338958740234,
"learning_rate": 5.27403414195867e-06,
"loss": 2.3174,
"step": 586
},
{
"epoch": 0.5008532423208191,
"grad_norm": 31.35817527770996,
"learning_rate": 5.265049415992812e-06,
"loss": 2.3955,
"step": 587
},
{
"epoch": 0.5017064846416383,
"grad_norm": 23.374814987182617,
"learning_rate": 5.256064690026954e-06,
"loss": 3.2852,
"step": 588
},
{
"epoch": 0.5025597269624573,
"grad_norm": 22.058351516723633,
"learning_rate": 5.247079964061097e-06,
"loss": 2.4004,
"step": 589
},
{
"epoch": 0.5034129692832765,
"grad_norm": 41.5433235168457,
"learning_rate": 5.2380952380952384e-06,
"loss": 2.8564,
"step": 590
},
{
"epoch": 0.5042662116040956,
"grad_norm": 20.69767189025879,
"learning_rate": 5.2291105121293805e-06,
"loss": 2.623,
"step": 591
},
{
"epoch": 0.5051194539249146,
"grad_norm": 28.56145477294922,
"learning_rate": 5.220125786163523e-06,
"loss": 2.6143,
"step": 592
},
{
"epoch": 0.5059726962457338,
"grad_norm": 64.19149780273438,
"learning_rate": 5.211141060197664e-06,
"loss": 3.0439,
"step": 593
},
{
"epoch": 0.5068259385665529,
"grad_norm": 42.88633346557617,
"learning_rate": 5.202156334231806e-06,
"loss": 2.3398,
"step": 594
},
{
"epoch": 0.507679180887372,
"grad_norm": 25.10820770263672,
"learning_rate": 5.193171608265948e-06,
"loss": 2.2949,
"step": 595
},
{
"epoch": 0.5085324232081911,
"grad_norm": 24.562023162841797,
"learning_rate": 5.184186882300091e-06,
"loss": 2.7129,
"step": 596
},
{
"epoch": 0.5093856655290102,
"grad_norm": 36.790157318115234,
"learning_rate": 5.175202156334232e-06,
"loss": 2.5508,
"step": 597
},
{
"epoch": 0.5102389078498294,
"grad_norm": 24.073081970214844,
"learning_rate": 5.166217430368374e-06,
"loss": 2.5186,
"step": 598
},
{
"epoch": 0.5110921501706485,
"grad_norm": 35.33838653564453,
"learning_rate": 5.157232704402516e-06,
"loss": 2.4971,
"step": 599
},
{
"epoch": 0.5119453924914675,
"grad_norm": 28.3082275390625,
"learning_rate": 5.148247978436658e-06,
"loss": 2.8848,
"step": 600
},
{
"epoch": 0.5127986348122867,
"grad_norm": 25.457237243652344,
"learning_rate": 5.1392632524708e-06,
"loss": 2.7832,
"step": 601
},
{
"epoch": 0.5136518771331058,
"grad_norm": 34.88469314575195,
"learning_rate": 5.130278526504942e-06,
"loss": 2.4941,
"step": 602
},
{
"epoch": 0.514505119453925,
"grad_norm": 41.899715423583984,
"learning_rate": 5.121293800539085e-06,
"loss": 3.002,
"step": 603
},
{
"epoch": 0.515358361774744,
"grad_norm": 27.774612426757812,
"learning_rate": 5.112309074573226e-06,
"loss": 2.8643,
"step": 604
},
{
"epoch": 0.5162116040955631,
"grad_norm": 67.86431121826172,
"learning_rate": 5.103324348607368e-06,
"loss": 2.8418,
"step": 605
},
{
"epoch": 0.5170648464163823,
"grad_norm": 27.578550338745117,
"learning_rate": 5.09433962264151e-06,
"loss": 2.2207,
"step": 606
},
{
"epoch": 0.5179180887372014,
"grad_norm": 31.7324275970459,
"learning_rate": 5.085354896675651e-06,
"loss": 3.3027,
"step": 607
},
{
"epoch": 0.5187713310580204,
"grad_norm": 20.27518081665039,
"learning_rate": 5.0763701707097935e-06,
"loss": 2.2842,
"step": 608
},
{
"epoch": 0.5196245733788396,
"grad_norm": 27.45115089416504,
"learning_rate": 5.067385444743936e-06,
"loss": 2.2988,
"step": 609
},
{
"epoch": 0.5204778156996587,
"grad_norm": 40.405704498291016,
"learning_rate": 5.0584007187780785e-06,
"loss": 3.1377,
"step": 610
},
{
"epoch": 0.5213310580204779,
"grad_norm": 15.520742416381836,
"learning_rate": 5.04941599281222e-06,
"loss": 2.3604,
"step": 611
},
{
"epoch": 0.5221843003412969,
"grad_norm": 28.408700942993164,
"learning_rate": 5.040431266846362e-06,
"loss": 2.5693,
"step": 612
},
{
"epoch": 0.523037542662116,
"grad_norm": 33.49451446533203,
"learning_rate": 5.031446540880504e-06,
"loss": 2.6279,
"step": 613
},
{
"epoch": 0.5238907849829352,
"grad_norm": 17.04746437072754,
"learning_rate": 5.022461814914645e-06,
"loss": 2.5713,
"step": 614
},
{
"epoch": 0.5247440273037542,
"grad_norm": 35.0278205871582,
"learning_rate": 5.013477088948787e-06,
"loss": 2.4004,
"step": 615
},
{
"epoch": 0.5255972696245734,
"grad_norm": 23.72642707824707,
"learning_rate": 5.004492362982929e-06,
"loss": 2.832,
"step": 616
},
{
"epoch": 0.5264505119453925,
"grad_norm": 17.700857162475586,
"learning_rate": 4.9955076370170715e-06,
"loss": 2.8467,
"step": 617
},
{
"epoch": 0.5273037542662116,
"grad_norm": 26.797855377197266,
"learning_rate": 4.986522911051213e-06,
"loss": 2.7002,
"step": 618
},
{
"epoch": 0.5281569965870307,
"grad_norm": 15.89353084564209,
"learning_rate": 4.977538185085356e-06,
"loss": 2.8301,
"step": 619
},
{
"epoch": 0.5290102389078498,
"grad_norm": 17.26994514465332,
"learning_rate": 4.968553459119497e-06,
"loss": 2.4023,
"step": 620
},
{
"epoch": 0.5298634812286689,
"grad_norm": 50.756412506103516,
"learning_rate": 4.959568733153639e-06,
"loss": 2.1123,
"step": 621
},
{
"epoch": 0.5307167235494881,
"grad_norm": 42.956947326660156,
"learning_rate": 4.950584007187781e-06,
"loss": 2.2637,
"step": 622
},
{
"epoch": 0.5315699658703071,
"grad_norm": 22.47896385192871,
"learning_rate": 4.941599281221923e-06,
"loss": 2.7529,
"step": 623
},
{
"epoch": 0.5324232081911263,
"grad_norm": 18.382062911987305,
"learning_rate": 4.932614555256065e-06,
"loss": 2.6665,
"step": 624
},
{
"epoch": 0.5332764505119454,
"grad_norm": 45.38220977783203,
"learning_rate": 4.9236298292902065e-06,
"loss": 2.4453,
"step": 625
},
{
"epoch": 0.5341296928327645,
"grad_norm": 51.33029556274414,
"learning_rate": 4.914645103324349e-06,
"loss": 3.1621,
"step": 626
},
{
"epoch": 0.5349829351535836,
"grad_norm": 39.17404556274414,
"learning_rate": 4.905660377358491e-06,
"loss": 2.9941,
"step": 627
},
{
"epoch": 0.5358361774744027,
"grad_norm": 56.7110595703125,
"learning_rate": 4.896675651392633e-06,
"loss": 2.9219,
"step": 628
},
{
"epoch": 0.5366894197952219,
"grad_norm": 15.27424144744873,
"learning_rate": 4.887690925426775e-06,
"loss": 2.5547,
"step": 629
},
{
"epoch": 0.537542662116041,
"grad_norm": 22.87930679321289,
"learning_rate": 4.878706199460917e-06,
"loss": 2.3506,
"step": 630
},
{
"epoch": 0.53839590443686,
"grad_norm": 53.75349807739258,
"learning_rate": 4.869721473495059e-06,
"loss": 2.3652,
"step": 631
},
{
"epoch": 0.5392491467576792,
"grad_norm": 30.84817123413086,
"learning_rate": 4.8607367475292e-06,
"loss": 2.9297,
"step": 632
},
{
"epoch": 0.5401023890784983,
"grad_norm": 20.29245948791504,
"learning_rate": 4.851752021563343e-06,
"loss": 2.9795,
"step": 633
},
{
"epoch": 0.5409556313993175,
"grad_norm": 17.75739097595215,
"learning_rate": 4.842767295597484e-06,
"loss": 2.4072,
"step": 634
},
{
"epoch": 0.5418088737201365,
"grad_norm": 21.76918601989746,
"learning_rate": 4.8337825696316265e-06,
"loss": 2.5088,
"step": 635
},
{
"epoch": 0.5426621160409556,
"grad_norm": 31.445209503173828,
"learning_rate": 4.824797843665769e-06,
"loss": 2.4473,
"step": 636
},
{
"epoch": 0.5435153583617748,
"grad_norm": 43.67631530761719,
"learning_rate": 4.815813117699911e-06,
"loss": 2.7129,
"step": 637
},
{
"epoch": 0.5443686006825939,
"grad_norm": 41.579044342041016,
"learning_rate": 4.806828391734053e-06,
"loss": 2.6562,
"step": 638
},
{
"epoch": 0.5452218430034129,
"grad_norm": 22.987728118896484,
"learning_rate": 4.797843665768194e-06,
"loss": 2.8525,
"step": 639
},
{
"epoch": 0.5460750853242321,
"grad_norm": 17.46269416809082,
"learning_rate": 4.788858939802337e-06,
"loss": 2.6797,
"step": 640
},
{
"epoch": 0.5469283276450512,
"grad_norm": 14.546910285949707,
"learning_rate": 4.779874213836478e-06,
"loss": 2.4775,
"step": 641
},
{
"epoch": 0.5477815699658704,
"grad_norm": 29.865407943725586,
"learning_rate": 4.77088948787062e-06,
"loss": 2.6279,
"step": 642
},
{
"epoch": 0.5486348122866894,
"grad_norm": 14.915757179260254,
"learning_rate": 4.761904761904762e-06,
"loss": 2.5225,
"step": 643
},
{
"epoch": 0.5494880546075085,
"grad_norm": 29.640846252441406,
"learning_rate": 4.7529200359389045e-06,
"loss": 2.4248,
"step": 644
},
{
"epoch": 0.5503412969283277,
"grad_norm": 44.55379104614258,
"learning_rate": 4.7439353099730466e-06,
"loss": 2.6348,
"step": 645
},
{
"epoch": 0.5511945392491467,
"grad_norm": 18.605289459228516,
"learning_rate": 4.734950584007188e-06,
"loss": 2.2568,
"step": 646
},
{
"epoch": 0.5520477815699659,
"grad_norm": 22.488618850708008,
"learning_rate": 4.725965858041331e-06,
"loss": 2.7637,
"step": 647
},
{
"epoch": 0.552901023890785,
"grad_norm": 18.215923309326172,
"learning_rate": 4.716981132075472e-06,
"loss": 2.4238,
"step": 648
},
{
"epoch": 0.5537542662116041,
"grad_norm": 48.733970642089844,
"learning_rate": 4.707996406109614e-06,
"loss": 2.998,
"step": 649
},
{
"epoch": 0.5546075085324232,
"grad_norm": 20.413524627685547,
"learning_rate": 4.699011680143756e-06,
"loss": 2.335,
"step": 650
},
{
"epoch": 0.5554607508532423,
"grad_norm": 46.819068908691406,
"learning_rate": 4.690026954177898e-06,
"loss": 3.0381,
"step": 651
},
{
"epoch": 0.5563139931740614,
"grad_norm": 26.987506866455078,
"learning_rate": 4.68104222821204e-06,
"loss": 2.7188,
"step": 652
},
{
"epoch": 0.5571672354948806,
"grad_norm": 37.12288284301758,
"learning_rate": 4.6720575022461816e-06,
"loss": 2.4189,
"step": 653
},
{
"epoch": 0.5580204778156996,
"grad_norm": 36.88205337524414,
"learning_rate": 4.663072776280324e-06,
"loss": 2.6191,
"step": 654
},
{
"epoch": 0.5588737201365188,
"grad_norm": 14.645258903503418,
"learning_rate": 4.654088050314466e-06,
"loss": 1.9053,
"step": 655
},
{
"epoch": 0.5597269624573379,
"grad_norm": 17.644990921020508,
"learning_rate": 4.645103324348608e-06,
"loss": 2.7158,
"step": 656
},
{
"epoch": 0.560580204778157,
"grad_norm": 70.53995513916016,
"learning_rate": 4.636118598382749e-06,
"loss": 3.0059,
"step": 657
},
{
"epoch": 0.5614334470989761,
"grad_norm": 50.95589065551758,
"learning_rate": 4.627133872416892e-06,
"loss": 3.1006,
"step": 658
},
{
"epoch": 0.5622866894197952,
"grad_norm": 41.75291061401367,
"learning_rate": 4.618149146451033e-06,
"loss": 2.29,
"step": 659
},
{
"epoch": 0.5631399317406144,
"grad_norm": 32.955204010009766,
"learning_rate": 4.609164420485175e-06,
"loss": 2.3164,
"step": 660
},
{
"epoch": 0.5639931740614335,
"grad_norm": 33.37961196899414,
"learning_rate": 4.6001796945193174e-06,
"loss": 2.6953,
"step": 661
},
{
"epoch": 0.5648464163822525,
"grad_norm": 41.79698181152344,
"learning_rate": 4.5911949685534595e-06,
"loss": 2.916,
"step": 662
},
{
"epoch": 0.5656996587030717,
"grad_norm": 18.266895294189453,
"learning_rate": 4.582210242587602e-06,
"loss": 2.1611,
"step": 663
},
{
"epoch": 0.5665529010238908,
"grad_norm": 29.101303100585938,
"learning_rate": 4.573225516621743e-06,
"loss": 2.749,
"step": 664
},
{
"epoch": 0.5674061433447098,
"grad_norm": 58.654640197753906,
"learning_rate": 4.564240790655886e-06,
"loss": 3.3467,
"step": 665
},
{
"epoch": 0.568259385665529,
"grad_norm": 52.830665588378906,
"learning_rate": 4.555256064690027e-06,
"loss": 3.0654,
"step": 666
},
{
"epoch": 0.5691126279863481,
"grad_norm": 39.432003021240234,
"learning_rate": 4.546271338724169e-06,
"loss": 2.9551,
"step": 667
},
{
"epoch": 0.5699658703071673,
"grad_norm": 19.742292404174805,
"learning_rate": 4.537286612758311e-06,
"loss": 2.8438,
"step": 668
},
{
"epoch": 0.5708191126279863,
"grad_norm": 31.633556365966797,
"learning_rate": 4.528301886792453e-06,
"loss": 2.6396,
"step": 669
},
{
"epoch": 0.5716723549488054,
"grad_norm": 58.045066833496094,
"learning_rate": 4.519317160826595e-06,
"loss": 2.7832,
"step": 670
},
{
"epoch": 0.5725255972696246,
"grad_norm": 55.10057067871094,
"learning_rate": 4.510332434860737e-06,
"loss": 3.0498,
"step": 671
},
{
"epoch": 0.5733788395904437,
"grad_norm": 41.77906799316406,
"learning_rate": 4.5013477088948796e-06,
"loss": 2.9658,
"step": 672
},
{
"epoch": 0.5742320819112628,
"grad_norm": 27.726163864135742,
"learning_rate": 4.492362982929021e-06,
"loss": 2.7373,
"step": 673
},
{
"epoch": 0.5750853242320819,
"grad_norm": 16.946115493774414,
"learning_rate": 4.483378256963163e-06,
"loss": 3.0039,
"step": 674
},
{
"epoch": 0.575938566552901,
"grad_norm": 21.34795379638672,
"learning_rate": 4.474393530997305e-06,
"loss": 3.1045,
"step": 675
},
{
"epoch": 0.5767918088737202,
"grad_norm": 45.67304229736328,
"learning_rate": 4.465408805031447e-06,
"loss": 2.7686,
"step": 676
},
{
"epoch": 0.5776450511945392,
"grad_norm": 34.049530029296875,
"learning_rate": 4.456424079065589e-06,
"loss": 2.4395,
"step": 677
},
{
"epoch": 0.5784982935153583,
"grad_norm": 36.776790618896484,
"learning_rate": 4.44743935309973e-06,
"loss": 2.8105,
"step": 678
},
{
"epoch": 0.5793515358361775,
"grad_norm": 17.602291107177734,
"learning_rate": 4.438454627133873e-06,
"loss": 2.9131,
"step": 679
},
{
"epoch": 0.5802047781569966,
"grad_norm": 27.473234176635742,
"learning_rate": 4.429469901168015e-06,
"loss": 2.8359,
"step": 680
},
{
"epoch": 0.5810580204778157,
"grad_norm": 22.257198333740234,
"learning_rate": 4.420485175202157e-06,
"loss": 2.4365,
"step": 681
},
{
"epoch": 0.5819112627986348,
"grad_norm": 53.0062141418457,
"learning_rate": 4.411500449236299e-06,
"loss": 2.2266,
"step": 682
},
{
"epoch": 0.5827645051194539,
"grad_norm": 35.180240631103516,
"learning_rate": 4.402515723270441e-06,
"loss": 2.6904,
"step": 683
},
{
"epoch": 0.5836177474402731,
"grad_norm": 28.43410873413086,
"learning_rate": 4.393530997304583e-06,
"loss": 2.4229,
"step": 684
},
{
"epoch": 0.5844709897610921,
"grad_norm": 18.973915100097656,
"learning_rate": 4.384546271338724e-06,
"loss": 2.5684,
"step": 685
},
{
"epoch": 0.5853242320819113,
"grad_norm": 66.25029754638672,
"learning_rate": 4.375561545372867e-06,
"loss": 2.3496,
"step": 686
},
{
"epoch": 0.5861774744027304,
"grad_norm": 32.91408157348633,
"learning_rate": 4.366576819407008e-06,
"loss": 2.3613,
"step": 687
},
{
"epoch": 0.5870307167235495,
"grad_norm": 27.794586181640625,
"learning_rate": 4.3575920934411504e-06,
"loss": 3.0039,
"step": 688
},
{
"epoch": 0.5878839590443686,
"grad_norm": 33.872276306152344,
"learning_rate": 4.348607367475292e-06,
"loss": 2.5781,
"step": 689
},
{
"epoch": 0.5887372013651877,
"grad_norm": 23.393707275390625,
"learning_rate": 4.339622641509435e-06,
"loss": 2.4434,
"step": 690
},
{
"epoch": 0.5895904436860068,
"grad_norm": 27.395784378051758,
"learning_rate": 4.330637915543576e-06,
"loss": 2.3867,
"step": 691
},
{
"epoch": 0.590443686006826,
"grad_norm": 27.155609130859375,
"learning_rate": 4.321653189577718e-06,
"loss": 2.7422,
"step": 692
},
{
"epoch": 0.591296928327645,
"grad_norm": 25.652048110961914,
"learning_rate": 4.31266846361186e-06,
"loss": 2.4365,
"step": 693
},
{
"epoch": 0.5921501706484642,
"grad_norm": 22.866825103759766,
"learning_rate": 4.303683737646002e-06,
"loss": 2.7734,
"step": 694
},
{
"epoch": 0.5930034129692833,
"grad_norm": 43.10763931274414,
"learning_rate": 4.294699011680144e-06,
"loss": 2.5938,
"step": 695
},
{
"epoch": 0.5938566552901023,
"grad_norm": 24.86405372619629,
"learning_rate": 4.2857142857142855e-06,
"loss": 2.2871,
"step": 696
},
{
"epoch": 0.5947098976109215,
"grad_norm": 25.735193252563477,
"learning_rate": 4.276729559748428e-06,
"loss": 2.4424,
"step": 697
},
{
"epoch": 0.5955631399317406,
"grad_norm": 17.094524383544922,
"learning_rate": 4.26774483378257e-06,
"loss": 3.0898,
"step": 698
},
{
"epoch": 0.5964163822525598,
"grad_norm": 32.05537796020508,
"learning_rate": 4.258760107816712e-06,
"loss": 3.0322,
"step": 699
},
{
"epoch": 0.5972696245733788,
"grad_norm": 19.903535842895508,
"learning_rate": 4.249775381850854e-06,
"loss": 2.2871,
"step": 700
},
{
"epoch": 0.5981228668941979,
"grad_norm": 28.79990005493164,
"learning_rate": 4.240790655884996e-06,
"loss": 2.6797,
"step": 701
},
{
"epoch": 0.5989761092150171,
"grad_norm": 13.19858169555664,
"learning_rate": 4.231805929919138e-06,
"loss": 2.417,
"step": 702
},
{
"epoch": 0.5998293515358362,
"grad_norm": 23.84611701965332,
"learning_rate": 4.222821203953279e-06,
"loss": 2.5889,
"step": 703
},
{
"epoch": 0.6006825938566553,
"grad_norm": 47.766387939453125,
"learning_rate": 4.213836477987422e-06,
"loss": 2.335,
"step": 704
},
{
"epoch": 0.6015358361774744,
"grad_norm": 41.32902526855469,
"learning_rate": 4.204851752021563e-06,
"loss": 2.9268,
"step": 705
},
{
"epoch": 0.6023890784982935,
"grad_norm": 27.452489852905273,
"learning_rate": 4.1958670260557055e-06,
"loss": 2.2168,
"step": 706
},
{
"epoch": 0.6032423208191127,
"grad_norm": 19.42645263671875,
"learning_rate": 4.186882300089848e-06,
"loss": 2.5527,
"step": 707
},
{
"epoch": 0.6040955631399317,
"grad_norm": 21.670026779174805,
"learning_rate": 4.17789757412399e-06,
"loss": 2.3047,
"step": 708
},
{
"epoch": 0.6049488054607508,
"grad_norm": 41.55127716064453,
"learning_rate": 4.168912848158132e-06,
"loss": 2.7568,
"step": 709
},
{
"epoch": 0.60580204778157,
"grad_norm": 18.455894470214844,
"learning_rate": 4.159928122192273e-06,
"loss": 2.1895,
"step": 710
},
{
"epoch": 0.606655290102389,
"grad_norm": 32.20492172241211,
"learning_rate": 4.150943396226416e-06,
"loss": 2.6816,
"step": 711
},
{
"epoch": 0.6075085324232082,
"grad_norm": 21.960777282714844,
"learning_rate": 4.141958670260557e-06,
"loss": 2.7148,
"step": 712
},
{
"epoch": 0.6083617747440273,
"grad_norm": 18.987390518188477,
"learning_rate": 4.132973944294699e-06,
"loss": 2.4961,
"step": 713
},
{
"epoch": 0.6092150170648464,
"grad_norm": 19.618938446044922,
"learning_rate": 4.123989218328841e-06,
"loss": 2.1768,
"step": 714
},
{
"epoch": 0.6100682593856656,
"grad_norm": 20.22203254699707,
"learning_rate": 4.1150044923629835e-06,
"loss": 2.5034,
"step": 715
},
{
"epoch": 0.6109215017064846,
"grad_norm": 25.868797302246094,
"learning_rate": 4.1060197663971255e-06,
"loss": 2.2061,
"step": 716
},
{
"epoch": 0.6117747440273038,
"grad_norm": 19.47433090209961,
"learning_rate": 4.097035040431267e-06,
"loss": 3.0068,
"step": 717
},
{
"epoch": 0.6126279863481229,
"grad_norm": 34.66838836669922,
"learning_rate": 4.08805031446541e-06,
"loss": 2.7236,
"step": 718
},
{
"epoch": 0.613481228668942,
"grad_norm": 59.0142822265625,
"learning_rate": 4.079065588499551e-06,
"loss": 3.1172,
"step": 719
},
{
"epoch": 0.6143344709897611,
"grad_norm": 64.44792938232422,
"learning_rate": 4.070080862533693e-06,
"loss": 2.5957,
"step": 720
},
{
"epoch": 0.6151877133105802,
"grad_norm": 16.721546173095703,
"learning_rate": 4.061096136567835e-06,
"loss": 2.8066,
"step": 721
},
{
"epoch": 0.6160409556313993,
"grad_norm": 24.300310134887695,
"learning_rate": 4.052111410601977e-06,
"loss": 2.7002,
"step": 722
},
{
"epoch": 0.6168941979522184,
"grad_norm": 34.62942123413086,
"learning_rate": 4.043126684636119e-06,
"loss": 2.9609,
"step": 723
},
{
"epoch": 0.6177474402730375,
"grad_norm": 26.634056091308594,
"learning_rate": 4.0341419586702606e-06,
"loss": 2.3535,
"step": 724
},
{
"epoch": 0.6186006825938567,
"grad_norm": 40.439910888671875,
"learning_rate": 4.025157232704403e-06,
"loss": 2.9062,
"step": 725
},
{
"epoch": 0.6194539249146758,
"grad_norm": 17.941150665283203,
"learning_rate": 4.016172506738545e-06,
"loss": 2.7451,
"step": 726
},
{
"epoch": 0.6203071672354948,
"grad_norm": 14.031157493591309,
"learning_rate": 4.007187780772687e-06,
"loss": 2.5586,
"step": 727
},
{
"epoch": 0.621160409556314,
"grad_norm": 12.27566146850586,
"learning_rate": 3.998203054806828e-06,
"loss": 2.3438,
"step": 728
},
{
"epoch": 0.6220136518771331,
"grad_norm": 13.139644622802734,
"learning_rate": 3.989218328840971e-06,
"loss": 2.4688,
"step": 729
},
{
"epoch": 0.6228668941979523,
"grad_norm": 16.500751495361328,
"learning_rate": 3.980233602875112e-06,
"loss": 2.7188,
"step": 730
},
{
"epoch": 0.6237201365187713,
"grad_norm": 28.81122398376465,
"learning_rate": 3.971248876909254e-06,
"loss": 2.3291,
"step": 731
},
{
"epoch": 0.6245733788395904,
"grad_norm": 25.21991539001465,
"learning_rate": 3.962264150943396e-06,
"loss": 2.5391,
"step": 732
},
{
"epoch": 0.6254266211604096,
"grad_norm": 29.478809356689453,
"learning_rate": 3.9532794249775385e-06,
"loss": 2.7256,
"step": 733
},
{
"epoch": 0.6262798634812287,
"grad_norm": 20.742538452148438,
"learning_rate": 3.944294699011681e-06,
"loss": 3.0352,
"step": 734
},
{
"epoch": 0.6271331058020477,
"grad_norm": 94.33541107177734,
"learning_rate": 3.935309973045822e-06,
"loss": 2.3477,
"step": 735
},
{
"epoch": 0.6279863481228669,
"grad_norm": 14.510876655578613,
"learning_rate": 3.926325247079965e-06,
"loss": 2.127,
"step": 736
},
{
"epoch": 0.628839590443686,
"grad_norm": 21.58650016784668,
"learning_rate": 3.917340521114106e-06,
"loss": 2.2949,
"step": 737
},
{
"epoch": 0.6296928327645052,
"grad_norm": 17.82122802734375,
"learning_rate": 3.908355795148248e-06,
"loss": 2.8574,
"step": 738
},
{
"epoch": 0.6305460750853242,
"grad_norm": 18.0192813873291,
"learning_rate": 3.89937106918239e-06,
"loss": 2.3711,
"step": 739
},
{
"epoch": 0.6313993174061433,
"grad_norm": 24.10041618347168,
"learning_rate": 3.890386343216532e-06,
"loss": 2.9199,
"step": 740
},
{
"epoch": 0.6322525597269625,
"grad_norm": 27.862274169921875,
"learning_rate": 3.881401617250674e-06,
"loss": 2.8037,
"step": 741
},
{
"epoch": 0.6331058020477816,
"grad_norm": 16.650089263916016,
"learning_rate": 3.872416891284816e-06,
"loss": 2.2358,
"step": 742
},
{
"epoch": 0.6339590443686007,
"grad_norm": 24.472097396850586,
"learning_rate": 3.8634321653189586e-06,
"loss": 2.3711,
"step": 743
},
{
"epoch": 0.6348122866894198,
"grad_norm": 14.919700622558594,
"learning_rate": 3.8544474393531e-06,
"loss": 2.7002,
"step": 744
},
{
"epoch": 0.6356655290102389,
"grad_norm": 13.569048881530762,
"learning_rate": 3.845462713387242e-06,
"loss": 2.3213,
"step": 745
},
{
"epoch": 0.636518771331058,
"grad_norm": 24.573429107666016,
"learning_rate": 3.836477987421384e-06,
"loss": 2.7637,
"step": 746
},
{
"epoch": 0.6373720136518771,
"grad_norm": 18.409570693969727,
"learning_rate": 3.827493261455526e-06,
"loss": 2.4854,
"step": 747
},
{
"epoch": 0.6382252559726962,
"grad_norm": 14.060251235961914,
"learning_rate": 3.818508535489668e-06,
"loss": 2.5479,
"step": 748
},
{
"epoch": 0.6390784982935154,
"grad_norm": 22.34339714050293,
"learning_rate": 3.80952380952381e-06,
"loss": 2.5029,
"step": 749
},
{
"epoch": 0.6399317406143344,
"grad_norm": 19.189834594726562,
"learning_rate": 3.800539083557952e-06,
"loss": 2.126,
"step": 750
},
{
"epoch": 0.6407849829351536,
"grad_norm": 51.201881408691406,
"learning_rate": 3.791554357592094e-06,
"loss": 2.3213,
"step": 751
},
{
"epoch": 0.6416382252559727,
"grad_norm": 35.899330139160156,
"learning_rate": 3.7825696316262357e-06,
"loss": 2.7754,
"step": 752
},
{
"epoch": 0.6424914675767918,
"grad_norm": 25.565492630004883,
"learning_rate": 3.7735849056603777e-06,
"loss": 2.5352,
"step": 753
},
{
"epoch": 0.643344709897611,
"grad_norm": 17.710491180419922,
"learning_rate": 3.76460017969452e-06,
"loss": 2.4473,
"step": 754
},
{
"epoch": 0.64419795221843,
"grad_norm": 30.0711612701416,
"learning_rate": 3.7556154537286615e-06,
"loss": 2.9932,
"step": 755
},
{
"epoch": 0.6450511945392492,
"grad_norm": 33.850616455078125,
"learning_rate": 3.746630727762803e-06,
"loss": 2.5303,
"step": 756
},
{
"epoch": 0.6459044368600683,
"grad_norm": 29.517227172851562,
"learning_rate": 3.7376460017969457e-06,
"loss": 2.1904,
"step": 757
},
{
"epoch": 0.6467576791808873,
"grad_norm": 35.89356994628906,
"learning_rate": 3.7286612758310873e-06,
"loss": 3.1494,
"step": 758
},
{
"epoch": 0.6476109215017065,
"grad_norm": 42.21514129638672,
"learning_rate": 3.7196765498652294e-06,
"loss": 2.5229,
"step": 759
},
{
"epoch": 0.6484641638225256,
"grad_norm": 17.243484497070312,
"learning_rate": 3.710691823899371e-06,
"loss": 2.9141,
"step": 760
},
{
"epoch": 0.6493174061433447,
"grad_norm": 24.45879364013672,
"learning_rate": 3.7017070979335136e-06,
"loss": 2.208,
"step": 761
},
{
"epoch": 0.6501706484641638,
"grad_norm": 21.869504928588867,
"learning_rate": 3.6927223719676553e-06,
"loss": 2.5908,
"step": 762
},
{
"epoch": 0.6510238907849829,
"grad_norm": 18.739221572875977,
"learning_rate": 3.683737646001797e-06,
"loss": 2.5801,
"step": 763
},
{
"epoch": 0.6518771331058021,
"grad_norm": 26.45047378540039,
"learning_rate": 3.6747529200359395e-06,
"loss": 2.8115,
"step": 764
},
{
"epoch": 0.6527303754266212,
"grad_norm": 19.305646896362305,
"learning_rate": 3.665768194070081e-06,
"loss": 2.1982,
"step": 765
},
{
"epoch": 0.6535836177474402,
"grad_norm": 46.113304138183594,
"learning_rate": 3.656783468104223e-06,
"loss": 3.0215,
"step": 766
},
{
"epoch": 0.6544368600682594,
"grad_norm": 46.685699462890625,
"learning_rate": 3.647798742138365e-06,
"loss": 2.7422,
"step": 767
},
{
"epoch": 0.6552901023890785,
"grad_norm": 20.04216957092285,
"learning_rate": 3.6388140161725074e-06,
"loss": 2.8691,
"step": 768
},
{
"epoch": 0.6561433447098977,
"grad_norm": 18.197967529296875,
"learning_rate": 3.629829290206649e-06,
"loss": 2.9648,
"step": 769
},
{
"epoch": 0.6569965870307167,
"grad_norm": 20.51030731201172,
"learning_rate": 3.6208445642407907e-06,
"loss": 2.209,
"step": 770
},
{
"epoch": 0.6578498293515358,
"grad_norm": 16.829713821411133,
"learning_rate": 3.6118598382749332e-06,
"loss": 2.5537,
"step": 771
},
{
"epoch": 0.658703071672355,
"grad_norm": 21.681400299072266,
"learning_rate": 3.602875112309075e-06,
"loss": 2.7969,
"step": 772
},
{
"epoch": 0.659556313993174,
"grad_norm": 25.944387435913086,
"learning_rate": 3.5938903863432166e-06,
"loss": 2.6719,
"step": 773
},
{
"epoch": 0.6604095563139932,
"grad_norm": 51.966121673583984,
"learning_rate": 3.5849056603773586e-06,
"loss": 2.8584,
"step": 774
},
{
"epoch": 0.6612627986348123,
"grad_norm": 18.35639190673828,
"learning_rate": 3.5759209344115007e-06,
"loss": 2.3818,
"step": 775
},
{
"epoch": 0.6621160409556314,
"grad_norm": 18.182228088378906,
"learning_rate": 3.566936208445643e-06,
"loss": 2.2686,
"step": 776
},
{
"epoch": 0.6629692832764505,
"grad_norm": 16.234655380249023,
"learning_rate": 3.5579514824797845e-06,
"loss": 2.2441,
"step": 777
},
{
"epoch": 0.6638225255972696,
"grad_norm": 34.618080139160156,
"learning_rate": 3.548966756513927e-06,
"loss": 3.0039,
"step": 778
},
{
"epoch": 0.6646757679180887,
"grad_norm": 20.892868041992188,
"learning_rate": 3.5399820305480687e-06,
"loss": 2.5215,
"step": 779
},
{
"epoch": 0.6655290102389079,
"grad_norm": 21.178865432739258,
"learning_rate": 3.5309973045822103e-06,
"loss": 2.126,
"step": 780
},
{
"epoch": 0.6663822525597269,
"grad_norm": 15.986398696899414,
"learning_rate": 3.5220125786163524e-06,
"loss": 2.4512,
"step": 781
},
{
"epoch": 0.6672354948805461,
"grad_norm": 18.40003204345703,
"learning_rate": 3.5130278526504945e-06,
"loss": 2.3096,
"step": 782
},
{
"epoch": 0.6680887372013652,
"grad_norm": 28.919540405273438,
"learning_rate": 3.5040431266846366e-06,
"loss": 2.292,
"step": 783
},
{
"epoch": 0.6689419795221843,
"grad_norm": 20.11212158203125,
"learning_rate": 3.4950584007187783e-06,
"loss": 2.3701,
"step": 784
},
{
"epoch": 0.6697952218430034,
"grad_norm": 22.81437110900879,
"learning_rate": 3.4860736747529208e-06,
"loss": 2.2822,
"step": 785
},
{
"epoch": 0.6706484641638225,
"grad_norm": 46.97941970825195,
"learning_rate": 3.4770889487870624e-06,
"loss": 2.252,
"step": 786
},
{
"epoch": 0.6715017064846417,
"grad_norm": 21.700454711914062,
"learning_rate": 3.468104222821204e-06,
"loss": 2.3457,
"step": 787
},
{
"epoch": 0.6723549488054608,
"grad_norm": 18.99515151977539,
"learning_rate": 3.4591194968553458e-06,
"loss": 2.2598,
"step": 788
},
{
"epoch": 0.6732081911262798,
"grad_norm": 17.171161651611328,
"learning_rate": 3.4501347708894883e-06,
"loss": 2.3037,
"step": 789
},
{
"epoch": 0.674061433447099,
"grad_norm": 19.60438346862793,
"learning_rate": 3.44115004492363e-06,
"loss": 2.5396,
"step": 790
},
{
"epoch": 0.6749146757679181,
"grad_norm": 21.116247177124023,
"learning_rate": 3.432165318957772e-06,
"loss": 2.4775,
"step": 791
},
{
"epoch": 0.6757679180887372,
"grad_norm": 17.95282745361328,
"learning_rate": 3.423180592991914e-06,
"loss": 2.2549,
"step": 792
},
{
"epoch": 0.6766211604095563,
"grad_norm": 37.449928283691406,
"learning_rate": 3.4141958670260562e-06,
"loss": 2.3965,
"step": 793
},
{
"epoch": 0.6774744027303754,
"grad_norm": 16.76734161376953,
"learning_rate": 3.405211141060198e-06,
"loss": 2.4512,
"step": 794
},
{
"epoch": 0.6783276450511946,
"grad_norm": 16.08464813232422,
"learning_rate": 3.3962264150943395e-06,
"loss": 2.2861,
"step": 795
},
{
"epoch": 0.6791808873720137,
"grad_norm": 22.598896026611328,
"learning_rate": 3.387241689128482e-06,
"loss": 2.0,
"step": 796
},
{
"epoch": 0.6800341296928327,
"grad_norm": 30.063629150390625,
"learning_rate": 3.3782569631626237e-06,
"loss": 2.3936,
"step": 797
},
{
"epoch": 0.6808873720136519,
"grad_norm": 15.586498260498047,
"learning_rate": 3.369272237196766e-06,
"loss": 2.3857,
"step": 798
},
{
"epoch": 0.681740614334471,
"grad_norm": 47.479331970214844,
"learning_rate": 3.3602875112309075e-06,
"loss": 2.5176,
"step": 799
},
{
"epoch": 0.6825938566552902,
"grad_norm": 29.33695411682129,
"learning_rate": 3.35130278526505e-06,
"loss": 2.6494,
"step": 800
},
{
"epoch": 0.6834470989761092,
"grad_norm": 49.52241516113281,
"learning_rate": 3.3423180592991917e-06,
"loss": 2.666,
"step": 801
},
{
"epoch": 0.6843003412969283,
"grad_norm": 30.229463577270508,
"learning_rate": 3.3333333333333333e-06,
"loss": 2.4409,
"step": 802
},
{
"epoch": 0.6851535836177475,
"grad_norm": 18.470956802368164,
"learning_rate": 3.324348607367476e-06,
"loss": 2.4453,
"step": 803
},
{
"epoch": 0.6860068259385665,
"grad_norm": 68.55836486816406,
"learning_rate": 3.3153638814016175e-06,
"loss": 2.7158,
"step": 804
},
{
"epoch": 0.6868600682593856,
"grad_norm": 23.803735733032227,
"learning_rate": 3.3063791554357596e-06,
"loss": 2.4375,
"step": 805
},
{
"epoch": 0.6877133105802048,
"grad_norm": 61.04603958129883,
"learning_rate": 3.2973944294699013e-06,
"loss": 2.7373,
"step": 806
},
{
"epoch": 0.6885665529010239,
"grad_norm": 40.623558044433594,
"learning_rate": 3.2884097035040433e-06,
"loss": 2.3535,
"step": 807
},
{
"epoch": 0.689419795221843,
"grad_norm": 44.683109283447266,
"learning_rate": 3.2794249775381854e-06,
"loss": 2.5059,
"step": 808
},
{
"epoch": 0.6902730375426621,
"grad_norm": 23.082717895507812,
"learning_rate": 3.270440251572327e-06,
"loss": 2.3584,
"step": 809
},
{
"epoch": 0.6911262798634812,
"grad_norm": 31.238815307617188,
"learning_rate": 3.2614555256064696e-06,
"loss": 2.2852,
"step": 810
},
{
"epoch": 0.6919795221843004,
"grad_norm": 16.465364456176758,
"learning_rate": 3.2524707996406113e-06,
"loss": 2.9834,
"step": 811
},
{
"epoch": 0.6928327645051194,
"grad_norm": 16.760278701782227,
"learning_rate": 3.243486073674753e-06,
"loss": 2.457,
"step": 812
},
{
"epoch": 0.6936860068259386,
"grad_norm": 24.745893478393555,
"learning_rate": 3.234501347708895e-06,
"loss": 2.2397,
"step": 813
},
{
"epoch": 0.6945392491467577,
"grad_norm": 22.20821762084961,
"learning_rate": 3.225516621743037e-06,
"loss": 2.3564,
"step": 814
},
{
"epoch": 0.6953924914675768,
"grad_norm": 34.78770065307617,
"learning_rate": 3.216531895777179e-06,
"loss": 2.7852,
"step": 815
},
{
"epoch": 0.6962457337883959,
"grad_norm": 17.53403091430664,
"learning_rate": 3.207547169811321e-06,
"loss": 2.1963,
"step": 816
},
{
"epoch": 0.697098976109215,
"grad_norm": 17.553361892700195,
"learning_rate": 3.1985624438454634e-06,
"loss": 2.3682,
"step": 817
},
{
"epoch": 0.6979522184300341,
"grad_norm": 16.97439956665039,
"learning_rate": 3.189577717879605e-06,
"loss": 2.4453,
"step": 818
},
{
"epoch": 0.6988054607508533,
"grad_norm": 24.054723739624023,
"learning_rate": 3.1805929919137467e-06,
"loss": 2.6201,
"step": 819
},
{
"epoch": 0.6996587030716723,
"grad_norm": 16.930429458618164,
"learning_rate": 3.171608265947889e-06,
"loss": 2.1953,
"step": 820
},
{
"epoch": 0.7005119453924915,
"grad_norm": 36.9339599609375,
"learning_rate": 3.162623539982031e-06,
"loss": 2.6875,
"step": 821
},
{
"epoch": 0.7013651877133106,
"grad_norm": 20.728759765625,
"learning_rate": 3.153638814016173e-06,
"loss": 2.7783,
"step": 822
},
{
"epoch": 0.7022184300341296,
"grad_norm": 32.32343673706055,
"learning_rate": 3.1446540880503146e-06,
"loss": 3.0117,
"step": 823
},
{
"epoch": 0.7030716723549488,
"grad_norm": 27.187162399291992,
"learning_rate": 3.1356693620844567e-06,
"loss": 2.3828,
"step": 824
},
{
"epoch": 0.7039249146757679,
"grad_norm": 19.576969146728516,
"learning_rate": 3.126684636118599e-06,
"loss": 2.1914,
"step": 825
},
{
"epoch": 0.7047781569965871,
"grad_norm": 19.67875099182129,
"learning_rate": 3.1176999101527405e-06,
"loss": 2.374,
"step": 826
},
{
"epoch": 0.7056313993174061,
"grad_norm": 51.096885681152344,
"learning_rate": 3.108715184186882e-06,
"loss": 3.0088,
"step": 827
},
{
"epoch": 0.7064846416382252,
"grad_norm": 16.370563507080078,
"learning_rate": 3.0997304582210247e-06,
"loss": 2.3779,
"step": 828
},
{
"epoch": 0.7073378839590444,
"grad_norm": 30.746566772460938,
"learning_rate": 3.0907457322551663e-06,
"loss": 2.6177,
"step": 829
},
{
"epoch": 0.7081911262798635,
"grad_norm": 30.207935333251953,
"learning_rate": 3.0817610062893084e-06,
"loss": 2.4072,
"step": 830
},
{
"epoch": 0.7090443686006825,
"grad_norm": 29.116840362548828,
"learning_rate": 3.0727762803234505e-06,
"loss": 2.0972,
"step": 831
},
{
"epoch": 0.7098976109215017,
"grad_norm": 26.794530868530273,
"learning_rate": 3.0637915543575926e-06,
"loss": 2.333,
"step": 832
},
{
"epoch": 0.7107508532423208,
"grad_norm": 18.36752700805664,
"learning_rate": 3.0548068283917343e-06,
"loss": 2.6602,
"step": 833
},
{
"epoch": 0.71160409556314,
"grad_norm": 22.439231872558594,
"learning_rate": 3.045822102425876e-06,
"loss": 2.7002,
"step": 834
},
{
"epoch": 0.712457337883959,
"grad_norm": 51.330665588378906,
"learning_rate": 3.0368373764600184e-06,
"loss": 2.8867,
"step": 835
},
{
"epoch": 0.7133105802047781,
"grad_norm": 24.982059478759766,
"learning_rate": 3.02785265049416e-06,
"loss": 2.415,
"step": 836
},
{
"epoch": 0.7141638225255973,
"grad_norm": 40.434627532958984,
"learning_rate": 3.018867924528302e-06,
"loss": 2.1611,
"step": 837
},
{
"epoch": 0.7150170648464164,
"grad_norm": 43.82883834838867,
"learning_rate": 3.009883198562444e-06,
"loss": 2.3281,
"step": 838
},
{
"epoch": 0.7158703071672355,
"grad_norm": 43.11958312988281,
"learning_rate": 3.0008984725965864e-06,
"loss": 2.1943,
"step": 839
},
{
"epoch": 0.7167235494880546,
"grad_norm": 17.046653747558594,
"learning_rate": 2.991913746630728e-06,
"loss": 2.3779,
"step": 840
},
{
"epoch": 0.7175767918088737,
"grad_norm": 20.74578857421875,
"learning_rate": 2.9829290206648697e-06,
"loss": 2.3047,
"step": 841
},
{
"epoch": 0.7184300341296929,
"grad_norm": 48.83142852783203,
"learning_rate": 2.9739442946990122e-06,
"loss": 2.5049,
"step": 842
},
{
"epoch": 0.7192832764505119,
"grad_norm": 42.406375885009766,
"learning_rate": 2.964959568733154e-06,
"loss": 2.5117,
"step": 843
},
{
"epoch": 0.7201365187713311,
"grad_norm": 55.99921798706055,
"learning_rate": 2.9559748427672955e-06,
"loss": 2.6826,
"step": 844
},
{
"epoch": 0.7209897610921502,
"grad_norm": 40.09762191772461,
"learning_rate": 2.9469901168014376e-06,
"loss": 2.6357,
"step": 845
},
{
"epoch": 0.7218430034129693,
"grad_norm": 22.537761688232422,
"learning_rate": 2.9380053908355797e-06,
"loss": 3.0352,
"step": 846
},
{
"epoch": 0.7226962457337884,
"grad_norm": 16.046295166015625,
"learning_rate": 2.929020664869722e-06,
"loss": 2.4648,
"step": 847
},
{
"epoch": 0.7235494880546075,
"grad_norm": 20.101272583007812,
"learning_rate": 2.9200359389038635e-06,
"loss": 2.335,
"step": 848
},
{
"epoch": 0.7244027303754266,
"grad_norm": 40.1048469543457,
"learning_rate": 2.911051212938006e-06,
"loss": 2.3594,
"step": 849
},
{
"epoch": 0.7252559726962458,
"grad_norm": 47.38935852050781,
"learning_rate": 2.9020664869721477e-06,
"loss": 2.0469,
"step": 850
},
{
"epoch": 0.7261092150170648,
"grad_norm": 24.103130340576172,
"learning_rate": 2.8930817610062893e-06,
"loss": 1.9531,
"step": 851
},
{
"epoch": 0.726962457337884,
"grad_norm": 22.3649845123291,
"learning_rate": 2.8840970350404314e-06,
"loss": 2.3027,
"step": 852
},
{
"epoch": 0.7278156996587031,
"grad_norm": 36.170406341552734,
"learning_rate": 2.8751123090745735e-06,
"loss": 2.2656,
"step": 853
},
{
"epoch": 0.7286689419795221,
"grad_norm": 30.652938842773438,
"learning_rate": 2.8661275831087156e-06,
"loss": 2.3604,
"step": 854
},
{
"epoch": 0.7295221843003413,
"grad_norm": 26.317873001098633,
"learning_rate": 2.8571428571428573e-06,
"loss": 2.1924,
"step": 855
},
{
"epoch": 0.7303754266211604,
"grad_norm": 52.10211944580078,
"learning_rate": 2.8481581311769998e-06,
"loss": 2.3828,
"step": 856
},
{
"epoch": 0.7312286689419796,
"grad_norm": 22.478017807006836,
"learning_rate": 2.8391734052111414e-06,
"loss": 2.5449,
"step": 857
},
{
"epoch": 0.7320819112627986,
"grad_norm": 21.12700080871582,
"learning_rate": 2.830188679245283e-06,
"loss": 1.9521,
"step": 858
},
{
"epoch": 0.7329351535836177,
"grad_norm": 20.594982147216797,
"learning_rate": 2.821203953279425e-06,
"loss": 2.1904,
"step": 859
},
{
"epoch": 0.7337883959044369,
"grad_norm": 21.516183853149414,
"learning_rate": 2.8122192273135673e-06,
"loss": 2.5332,
"step": 860
},
{
"epoch": 0.734641638225256,
"grad_norm": 25.919029235839844,
"learning_rate": 2.803234501347709e-06,
"loss": 2.3511,
"step": 861
},
{
"epoch": 0.735494880546075,
"grad_norm": 20.421133041381836,
"learning_rate": 2.794249775381851e-06,
"loss": 2.3271,
"step": 862
},
{
"epoch": 0.7363481228668942,
"grad_norm": 23.43620491027832,
"learning_rate": 2.785265049415993e-06,
"loss": 2.0537,
"step": 863
},
{
"epoch": 0.7372013651877133,
"grad_norm": 17.840322494506836,
"learning_rate": 2.776280323450135e-06,
"loss": 2.1328,
"step": 864
},
{
"epoch": 0.7380546075085325,
"grad_norm": 33.441341400146484,
"learning_rate": 2.767295597484277e-06,
"loss": 2.5332,
"step": 865
},
{
"epoch": 0.7389078498293515,
"grad_norm": 21.710899353027344,
"learning_rate": 2.7583108715184185e-06,
"loss": 2.665,
"step": 866
},
{
"epoch": 0.7397610921501706,
"grad_norm": 23.452516555786133,
"learning_rate": 2.749326145552561e-06,
"loss": 2.1514,
"step": 867
},
{
"epoch": 0.7406143344709898,
"grad_norm": 17.572235107421875,
"learning_rate": 2.7403414195867027e-06,
"loss": 1.7598,
"step": 868
},
{
"epoch": 0.7414675767918089,
"grad_norm": 32.19004440307617,
"learning_rate": 2.731356693620845e-06,
"loss": 2.1289,
"step": 869
},
{
"epoch": 0.742320819112628,
"grad_norm": 22.450124740600586,
"learning_rate": 2.722371967654987e-06,
"loss": 2.4658,
"step": 870
},
{
"epoch": 0.7431740614334471,
"grad_norm": 19.069766998291016,
"learning_rate": 2.713387241689129e-06,
"loss": 2.2441,
"step": 871
},
{
"epoch": 0.7440273037542662,
"grad_norm": 23.94462776184082,
"learning_rate": 2.7044025157232706e-06,
"loss": 2.1426,
"step": 872
},
{
"epoch": 0.7448805460750854,
"grad_norm": 36.22708511352539,
"learning_rate": 2.6954177897574123e-06,
"loss": 2.228,
"step": 873
},
{
"epoch": 0.7457337883959044,
"grad_norm": 21.32388687133789,
"learning_rate": 2.686433063791555e-06,
"loss": 2.8701,
"step": 874
},
{
"epoch": 0.7465870307167235,
"grad_norm": 23.661392211914062,
"learning_rate": 2.6774483378256965e-06,
"loss": 2.4297,
"step": 875
},
{
"epoch": 0.7474402730375427,
"grad_norm": 20.54587173461914,
"learning_rate": 2.6684636118598386e-06,
"loss": 2.9219,
"step": 876
},
{
"epoch": 0.7482935153583617,
"grad_norm": 24.399003982543945,
"learning_rate": 2.6594788858939802e-06,
"loss": 2.9365,
"step": 877
},
{
"epoch": 0.7491467576791809,
"grad_norm": 36.97280502319336,
"learning_rate": 2.6504941599281223e-06,
"loss": 2.376,
"step": 878
},
{
"epoch": 0.75,
"grad_norm": 22.310462951660156,
"learning_rate": 2.6415094339622644e-06,
"loss": 2.4043,
"step": 879
},
{
"epoch": 0.7508532423208191,
"grad_norm": 27.330747604370117,
"learning_rate": 2.632524707996406e-06,
"loss": 2.3359,
"step": 880
},
{
"epoch": 0.7517064846416383,
"grad_norm": 20.034399032592773,
"learning_rate": 2.6235399820305486e-06,
"loss": 2.8594,
"step": 881
},
{
"epoch": 0.7525597269624573,
"grad_norm": 26.571035385131836,
"learning_rate": 2.6145552560646903e-06,
"loss": 2.0596,
"step": 882
},
{
"epoch": 0.7534129692832765,
"grad_norm": 21.897262573242188,
"learning_rate": 2.605570530098832e-06,
"loss": 2.4683,
"step": 883
},
{
"epoch": 0.7542662116040956,
"grad_norm": 17.84102439880371,
"learning_rate": 2.596585804132974e-06,
"loss": 2.1904,
"step": 884
},
{
"epoch": 0.7551194539249146,
"grad_norm": 24.632801055908203,
"learning_rate": 2.587601078167116e-06,
"loss": 2.1152,
"step": 885
},
{
"epoch": 0.7559726962457338,
"grad_norm": 18.96522331237793,
"learning_rate": 2.578616352201258e-06,
"loss": 2.5303,
"step": 886
},
{
"epoch": 0.7568259385665529,
"grad_norm": 37.09746170043945,
"learning_rate": 2.5696316262354e-06,
"loss": 2.335,
"step": 887
},
{
"epoch": 0.757679180887372,
"grad_norm": 31.17850112915039,
"learning_rate": 2.5606469002695424e-06,
"loss": 2.5059,
"step": 888
},
{
"epoch": 0.7585324232081911,
"grad_norm": 36.89558029174805,
"learning_rate": 2.551662174303684e-06,
"loss": 2.7695,
"step": 889
},
{
"epoch": 0.7593856655290102,
"grad_norm": 20.49338150024414,
"learning_rate": 2.5426774483378257e-06,
"loss": 2.6846,
"step": 890
},
{
"epoch": 0.7602389078498294,
"grad_norm": 25.983245849609375,
"learning_rate": 2.533692722371968e-06,
"loss": 2.4092,
"step": 891
},
{
"epoch": 0.7610921501706485,
"grad_norm": 27.642595291137695,
"learning_rate": 2.52470799640611e-06,
"loss": 2.0869,
"step": 892
},
{
"epoch": 0.7619453924914675,
"grad_norm": 23.749832153320312,
"learning_rate": 2.515723270440252e-06,
"loss": 2.125,
"step": 893
},
{
"epoch": 0.7627986348122867,
"grad_norm": 28.204421997070312,
"learning_rate": 2.5067385444743936e-06,
"loss": 2.1562,
"step": 894
},
{
"epoch": 0.7636518771331058,
"grad_norm": 30.861194610595703,
"learning_rate": 2.4977538185085357e-06,
"loss": 3.0098,
"step": 895
},
{
"epoch": 0.764505119453925,
"grad_norm": 19.33704376220703,
"learning_rate": 2.488769092542678e-06,
"loss": 2.4072,
"step": 896
},
{
"epoch": 0.765358361774744,
"grad_norm": 27.73824119567871,
"learning_rate": 2.4797843665768195e-06,
"loss": 2.2256,
"step": 897
},
{
"epoch": 0.7662116040955631,
"grad_norm": 34.70376968383789,
"learning_rate": 2.4707996406109616e-06,
"loss": 2.4248,
"step": 898
},
{
"epoch": 0.7670648464163823,
"grad_norm": 29.114303588867188,
"learning_rate": 2.4618149146451032e-06,
"loss": 2.7227,
"step": 899
},
{
"epoch": 0.7679180887372014,
"grad_norm": 29.096269607543945,
"learning_rate": 2.4528301886792453e-06,
"loss": 2.4404,
"step": 900
},
{
"epoch": 0.7687713310580204,
"grad_norm": 58.764366149902344,
"learning_rate": 2.4438454627133874e-06,
"loss": 2.2744,
"step": 901
},
{
"epoch": 0.7696245733788396,
"grad_norm": 30.78232192993164,
"learning_rate": 2.4348607367475295e-06,
"loss": 2.5342,
"step": 902
},
{
"epoch": 0.7704778156996587,
"grad_norm": 19.90322494506836,
"learning_rate": 2.4258760107816716e-06,
"loss": 2.1504,
"step": 903
},
{
"epoch": 0.7713310580204779,
"grad_norm": 37.51405715942383,
"learning_rate": 2.4168912848158133e-06,
"loss": 2.1846,
"step": 904
},
{
"epoch": 0.7721843003412969,
"grad_norm": 16.94844627380371,
"learning_rate": 2.4079065588499553e-06,
"loss": 2.3501,
"step": 905
},
{
"epoch": 0.773037542662116,
"grad_norm": 28.434951782226562,
"learning_rate": 2.398921832884097e-06,
"loss": 2.0811,
"step": 906
},
{
"epoch": 0.7738907849829352,
"grad_norm": 16.979534149169922,
"learning_rate": 2.389937106918239e-06,
"loss": 2.3936,
"step": 907
},
{
"epoch": 0.7747440273037542,
"grad_norm": 22.91834831237793,
"learning_rate": 2.380952380952381e-06,
"loss": 2.9756,
"step": 908
},
{
"epoch": 0.7755972696245734,
"grad_norm": 22.019638061523438,
"learning_rate": 2.3719676549865233e-06,
"loss": 2.2075,
"step": 909
},
{
"epoch": 0.7764505119453925,
"grad_norm": 30.827585220336914,
"learning_rate": 2.3629829290206654e-06,
"loss": 2.4131,
"step": 910
},
{
"epoch": 0.7773037542662116,
"grad_norm": 30.66614532470703,
"learning_rate": 2.353998203054807e-06,
"loss": 2.2217,
"step": 911
},
{
"epoch": 0.7781569965870307,
"grad_norm": 25.246841430664062,
"learning_rate": 2.345013477088949e-06,
"loss": 2.3623,
"step": 912
},
{
"epoch": 0.7790102389078498,
"grad_norm": 24.9078311920166,
"learning_rate": 2.3360287511230908e-06,
"loss": 2.2676,
"step": 913
},
{
"epoch": 0.7798634812286689,
"grad_norm": 19.189767837524414,
"learning_rate": 2.327044025157233e-06,
"loss": 2.2119,
"step": 914
},
{
"epoch": 0.7807167235494881,
"grad_norm": 21.641551971435547,
"learning_rate": 2.3180592991913745e-06,
"loss": 2.0054,
"step": 915
},
{
"epoch": 0.7815699658703071,
"grad_norm": 30.206771850585938,
"learning_rate": 2.3090745732255166e-06,
"loss": 2.165,
"step": 916
},
{
"epoch": 0.7824232081911263,
"grad_norm": 26.020099639892578,
"learning_rate": 2.3000898472596587e-06,
"loss": 2.5234,
"step": 917
},
{
"epoch": 0.7832764505119454,
"grad_norm": 18.706939697265625,
"learning_rate": 2.291105121293801e-06,
"loss": 2.2012,
"step": 918
},
{
"epoch": 0.7841296928327645,
"grad_norm": 20.241901397705078,
"learning_rate": 2.282120395327943e-06,
"loss": 2.0015,
"step": 919
},
{
"epoch": 0.7849829351535836,
"grad_norm": 19.32655906677246,
"learning_rate": 2.2731356693620846e-06,
"loss": 1.5703,
"step": 920
},
{
"epoch": 0.7858361774744027,
"grad_norm": 22.523513793945312,
"learning_rate": 2.2641509433962266e-06,
"loss": 2.585,
"step": 921
},
{
"epoch": 0.7866894197952219,
"grad_norm": 36.01555252075195,
"learning_rate": 2.2551662174303683e-06,
"loss": 1.6499,
"step": 922
},
{
"epoch": 0.787542662116041,
"grad_norm": 27.907073974609375,
"learning_rate": 2.2461814914645104e-06,
"loss": 2.3066,
"step": 923
},
{
"epoch": 0.78839590443686,
"grad_norm": 19.171142578125,
"learning_rate": 2.2371967654986525e-06,
"loss": 2.0391,
"step": 924
},
{
"epoch": 0.7892491467576792,
"grad_norm": 35.75897216796875,
"learning_rate": 2.2282120395327946e-06,
"loss": 2.6104,
"step": 925
},
{
"epoch": 0.7901023890784983,
"grad_norm": 33.932472229003906,
"learning_rate": 2.2192273135669367e-06,
"loss": 2.6094,
"step": 926
},
{
"epoch": 0.7909556313993175,
"grad_norm": 25.582454681396484,
"learning_rate": 2.2102425876010783e-06,
"loss": 2.4424,
"step": 927
},
{
"epoch": 0.7918088737201365,
"grad_norm": 27.64750099182129,
"learning_rate": 2.2012578616352204e-06,
"loss": 2.418,
"step": 928
},
{
"epoch": 0.7926621160409556,
"grad_norm": 345.17022705078125,
"learning_rate": 2.192273135669362e-06,
"loss": 2.2305,
"step": 929
},
{
"epoch": 0.7935153583617748,
"grad_norm": 54.37593460083008,
"learning_rate": 2.183288409703504e-06,
"loss": 3.1572,
"step": 930
},
{
"epoch": 0.7943686006825939,
"grad_norm": 29.86464500427246,
"learning_rate": 2.174303683737646e-06,
"loss": 2.5684,
"step": 931
},
{
"epoch": 0.7952218430034129,
"grad_norm": 22.760496139526367,
"learning_rate": 2.165318957771788e-06,
"loss": 2.5928,
"step": 932
},
{
"epoch": 0.7960750853242321,
"grad_norm": 18.5999755859375,
"learning_rate": 2.15633423180593e-06,
"loss": 2.4062,
"step": 933
},
{
"epoch": 0.7969283276450512,
"grad_norm": 26.545793533325195,
"learning_rate": 2.147349505840072e-06,
"loss": 2.1387,
"step": 934
},
{
"epoch": 0.7977815699658704,
"grad_norm": 17.7139949798584,
"learning_rate": 2.138364779874214e-06,
"loss": 2.6348,
"step": 935
},
{
"epoch": 0.7986348122866894,
"grad_norm": 29.3115234375,
"learning_rate": 2.129380053908356e-06,
"loss": 2.624,
"step": 936
},
{
"epoch": 0.7994880546075085,
"grad_norm": 33.964839935302734,
"learning_rate": 2.120395327942498e-06,
"loss": 2.4248,
"step": 937
},
{
"epoch": 0.8003412969283277,
"grad_norm": 32.854530334472656,
"learning_rate": 2.1114106019766396e-06,
"loss": 2.75,
"step": 938
},
{
"epoch": 0.8011945392491467,
"grad_norm": 19.812353134155273,
"learning_rate": 2.1024258760107817e-06,
"loss": 2.209,
"step": 939
},
{
"epoch": 0.8020477815699659,
"grad_norm": 14.995081901550293,
"learning_rate": 2.093441150044924e-06,
"loss": 2.2539,
"step": 940
},
{
"epoch": 0.802901023890785,
"grad_norm": 40.92326736450195,
"learning_rate": 2.084456424079066e-06,
"loss": 2.3555,
"step": 941
},
{
"epoch": 0.8037542662116041,
"grad_norm": 40.27710723876953,
"learning_rate": 2.075471698113208e-06,
"loss": 2.4297,
"step": 942
},
{
"epoch": 0.8046075085324232,
"grad_norm": 25.924610137939453,
"learning_rate": 2.0664869721473496e-06,
"loss": 2.5576,
"step": 943
},
{
"epoch": 0.8054607508532423,
"grad_norm": 39.17571258544922,
"learning_rate": 2.0575022461814917e-06,
"loss": 2.5166,
"step": 944
},
{
"epoch": 0.8063139931740614,
"grad_norm": 24.386545181274414,
"learning_rate": 2.0485175202156334e-06,
"loss": 2.5127,
"step": 945
},
{
"epoch": 0.8071672354948806,
"grad_norm": 23.48984718322754,
"learning_rate": 2.0395327942497755e-06,
"loss": 2.5996,
"step": 946
},
{
"epoch": 0.8080204778156996,
"grad_norm": 14.407853126525879,
"learning_rate": 2.0305480682839176e-06,
"loss": 1.7422,
"step": 947
},
{
"epoch": 0.8088737201365188,
"grad_norm": 25.272546768188477,
"learning_rate": 2.0215633423180597e-06,
"loss": 1.9629,
"step": 948
},
{
"epoch": 0.8097269624573379,
"grad_norm": 41.10344696044922,
"learning_rate": 2.0125786163522013e-06,
"loss": 2.4893,
"step": 949
},
{
"epoch": 0.810580204778157,
"grad_norm": 34.03232955932617,
"learning_rate": 2.0035938903863434e-06,
"loss": 2.3135,
"step": 950
},
{
"epoch": 0.8114334470989761,
"grad_norm": 19.308574676513672,
"learning_rate": 1.9946091644204855e-06,
"loss": 2.3301,
"step": 951
},
{
"epoch": 0.8122866894197952,
"grad_norm": 19.482486724853516,
"learning_rate": 1.985624438454627e-06,
"loss": 2.3931,
"step": 952
},
{
"epoch": 0.8131399317406144,
"grad_norm": 19.74332046508789,
"learning_rate": 1.9766397124887693e-06,
"loss": 2.082,
"step": 953
},
{
"epoch": 0.8139931740614335,
"grad_norm": 24.09305763244629,
"learning_rate": 1.967654986522911e-06,
"loss": 2.5146,
"step": 954
},
{
"epoch": 0.8148464163822525,
"grad_norm": 23.39406394958496,
"learning_rate": 1.958670260557053e-06,
"loss": 2.9053,
"step": 955
},
{
"epoch": 0.8156996587030717,
"grad_norm": 18.08087921142578,
"learning_rate": 1.949685534591195e-06,
"loss": 1.9541,
"step": 956
},
{
"epoch": 0.8165529010238908,
"grad_norm": 22.059133529663086,
"learning_rate": 1.940700808625337e-06,
"loss": 3.1865,
"step": 957
},
{
"epoch": 0.8174061433447098,
"grad_norm": 16.29061508178711,
"learning_rate": 1.9317160826594793e-06,
"loss": 2.0771,
"step": 958
},
{
"epoch": 0.818259385665529,
"grad_norm": 23.810237884521484,
"learning_rate": 1.922731356693621e-06,
"loss": 2.8477,
"step": 959
},
{
"epoch": 0.8191126279863481,
"grad_norm": 30.86567497253418,
"learning_rate": 1.913746630727763e-06,
"loss": 2.082,
"step": 960
},
{
"epoch": 0.8199658703071673,
"grad_norm": 24.14322853088379,
"learning_rate": 1.904761904761905e-06,
"loss": 2.9727,
"step": 961
},
{
"epoch": 0.8208191126279863,
"grad_norm": 20.15610694885254,
"learning_rate": 1.895777178796047e-06,
"loss": 2.4414,
"step": 962
},
{
"epoch": 0.8216723549488054,
"grad_norm": 105.63774108886719,
"learning_rate": 1.8867924528301889e-06,
"loss": 2.3037,
"step": 963
},
{
"epoch": 0.8225255972696246,
"grad_norm": 17.520694732666016,
"learning_rate": 1.8778077268643308e-06,
"loss": 2.2676,
"step": 964
},
{
"epoch": 0.8233788395904437,
"grad_norm": 26.543331146240234,
"learning_rate": 1.8688230008984728e-06,
"loss": 2.2607,
"step": 965
},
{
"epoch": 0.8242320819112628,
"grad_norm": 22.674585342407227,
"learning_rate": 1.8598382749326147e-06,
"loss": 2.4355,
"step": 966
},
{
"epoch": 0.8250853242320819,
"grad_norm": 29.76778793334961,
"learning_rate": 1.8508535489667568e-06,
"loss": 2.7432,
"step": 967
},
{
"epoch": 0.825938566552901,
"grad_norm": 32.59353256225586,
"learning_rate": 1.8418688230008985e-06,
"loss": 2.6934,
"step": 968
},
{
"epoch": 0.8267918088737202,
"grad_norm": 24.909629821777344,
"learning_rate": 1.8328840970350406e-06,
"loss": 1.8232,
"step": 969
},
{
"epoch": 0.8276450511945392,
"grad_norm": 46.205047607421875,
"learning_rate": 1.8238993710691824e-06,
"loss": 2.8975,
"step": 970
},
{
"epoch": 0.8284982935153583,
"grad_norm": 17.16266441345215,
"learning_rate": 1.8149146451033245e-06,
"loss": 2.5156,
"step": 971
},
{
"epoch": 0.8293515358361775,
"grad_norm": 104.19178009033203,
"learning_rate": 1.8059299191374666e-06,
"loss": 2.5391,
"step": 972
},
{
"epoch": 0.8302047781569966,
"grad_norm": 13.162457466125488,
"learning_rate": 1.7969451931716083e-06,
"loss": 1.8076,
"step": 973
},
{
"epoch": 0.8310580204778157,
"grad_norm": 16.426860809326172,
"learning_rate": 1.7879604672057504e-06,
"loss": 2.6025,
"step": 974
},
{
"epoch": 0.8319112627986348,
"grad_norm": 18.29142189025879,
"learning_rate": 1.7789757412398922e-06,
"loss": 2.667,
"step": 975
},
{
"epoch": 0.8327645051194539,
"grad_norm": 17.840328216552734,
"learning_rate": 1.7699910152740343e-06,
"loss": 2.4551,
"step": 976
},
{
"epoch": 0.8336177474402731,
"grad_norm": 17.80030059814453,
"learning_rate": 1.7610062893081762e-06,
"loss": 2.5186,
"step": 977
},
{
"epoch": 0.8344709897610921,
"grad_norm": 34.99458694458008,
"learning_rate": 1.7520215633423183e-06,
"loss": 2.4072,
"step": 978
},
{
"epoch": 0.8353242320819113,
"grad_norm": 17.811429977416992,
"learning_rate": 1.7430368373764604e-06,
"loss": 2.1963,
"step": 979
},
{
"epoch": 0.8361774744027304,
"grad_norm": 19.649438858032227,
"learning_rate": 1.734052111410602e-06,
"loss": 2.3896,
"step": 980
},
{
"epoch": 0.8370307167235495,
"grad_norm": 27.297197341918945,
"learning_rate": 1.7250673854447441e-06,
"loss": 2.375,
"step": 981
},
{
"epoch": 0.8378839590443686,
"grad_norm": 17.75628662109375,
"learning_rate": 1.716082659478886e-06,
"loss": 1.9648,
"step": 982
},
{
"epoch": 0.8387372013651877,
"grad_norm": 19.516536712646484,
"learning_rate": 1.7070979335130281e-06,
"loss": 2.3828,
"step": 983
},
{
"epoch": 0.8395904436860068,
"grad_norm": 15.936653137207031,
"learning_rate": 1.6981132075471698e-06,
"loss": 2.2891,
"step": 984
},
{
"epoch": 0.840443686006826,
"grad_norm": 32.02035903930664,
"learning_rate": 1.6891284815813119e-06,
"loss": 2.0713,
"step": 985
},
{
"epoch": 0.841296928327645,
"grad_norm": 24.9638671875,
"learning_rate": 1.6801437556154537e-06,
"loss": 2.3364,
"step": 986
},
{
"epoch": 0.8421501706484642,
"grad_norm": 27.20418930053711,
"learning_rate": 1.6711590296495958e-06,
"loss": 2.2085,
"step": 987
},
{
"epoch": 0.8430034129692833,
"grad_norm": 18.280986785888672,
"learning_rate": 1.662174303683738e-06,
"loss": 1.8594,
"step": 988
},
{
"epoch": 0.8438566552901023,
"grad_norm": 52.79808807373047,
"learning_rate": 1.6531895777178798e-06,
"loss": 2.7715,
"step": 989
},
{
"epoch": 0.8447098976109215,
"grad_norm": 22.51861572265625,
"learning_rate": 1.6442048517520217e-06,
"loss": 2.7188,
"step": 990
},
{
"epoch": 0.8455631399317406,
"grad_norm": 47.933372497558594,
"learning_rate": 1.6352201257861635e-06,
"loss": 3.1797,
"step": 991
},
{
"epoch": 0.8464163822525598,
"grad_norm": 20.896522521972656,
"learning_rate": 1.6262353998203056e-06,
"loss": 2.2227,
"step": 992
},
{
"epoch": 0.8472696245733788,
"grad_norm": 24.34409523010254,
"learning_rate": 1.6172506738544475e-06,
"loss": 2.2549,
"step": 993
},
{
"epoch": 0.8481228668941979,
"grad_norm": 34.13801574707031,
"learning_rate": 1.6082659478885896e-06,
"loss": 2.7314,
"step": 994
},
{
"epoch": 0.8489761092150171,
"grad_norm": 20.989660263061523,
"learning_rate": 1.5992812219227317e-06,
"loss": 2.7432,
"step": 995
},
{
"epoch": 0.8498293515358362,
"grad_norm": 20.589107513427734,
"learning_rate": 1.5902964959568734e-06,
"loss": 2.4668,
"step": 996
},
{
"epoch": 0.8506825938566553,
"grad_norm": 28.15553092956543,
"learning_rate": 1.5813117699910154e-06,
"loss": 2.7314,
"step": 997
},
{
"epoch": 0.8515358361774744,
"grad_norm": 15.138993263244629,
"learning_rate": 1.5723270440251573e-06,
"loss": 2.3604,
"step": 998
},
{
"epoch": 0.8523890784982935,
"grad_norm": 41.964927673339844,
"learning_rate": 1.5633423180592994e-06,
"loss": 2.4551,
"step": 999
},
{
"epoch": 0.8532423208191127,
"grad_norm": 24.446157455444336,
"learning_rate": 1.554357592093441e-06,
"loss": 2.75,
"step": 1000
},
{
"epoch": 0.8540955631399317,
"grad_norm": 23.93151092529297,
"learning_rate": 1.5453728661275832e-06,
"loss": 2.4736,
"step": 1001
},
{
"epoch": 0.8549488054607508,
"grad_norm": 29.804882049560547,
"learning_rate": 1.5363881401617253e-06,
"loss": 2.0986,
"step": 1002
},
{
"epoch": 0.85580204778157,
"grad_norm": 29.341312408447266,
"learning_rate": 1.5274034141958671e-06,
"loss": 2.1719,
"step": 1003
},
{
"epoch": 0.856655290102389,
"grad_norm": 19.525724411010742,
"learning_rate": 1.5184186882300092e-06,
"loss": 2.2354,
"step": 1004
},
{
"epoch": 0.8575085324232082,
"grad_norm": 20.416091918945312,
"learning_rate": 1.509433962264151e-06,
"loss": 2.5049,
"step": 1005
},
{
"epoch": 0.8583617747440273,
"grad_norm": 12.879405975341797,
"learning_rate": 1.5004492362982932e-06,
"loss": 2.001,
"step": 1006
},
{
"epoch": 0.8592150170648464,
"grad_norm": 15.967265129089355,
"learning_rate": 1.4914645103324349e-06,
"loss": 2.4395,
"step": 1007
},
{
"epoch": 0.8600682593856656,
"grad_norm": 27.89084243774414,
"learning_rate": 1.482479784366577e-06,
"loss": 2.1934,
"step": 1008
},
{
"epoch": 0.8609215017064846,
"grad_norm": 26.392724990844727,
"learning_rate": 1.4734950584007188e-06,
"loss": 2.5176,
"step": 1009
},
{
"epoch": 0.8617747440273038,
"grad_norm": 40.049503326416016,
"learning_rate": 1.464510332434861e-06,
"loss": 2.1807,
"step": 1010
},
{
"epoch": 0.8626279863481229,
"grad_norm": 16.034893035888672,
"learning_rate": 1.455525606469003e-06,
"loss": 2.3408,
"step": 1011
},
{
"epoch": 0.863481228668942,
"grad_norm": 16.689733505249023,
"learning_rate": 1.4465408805031447e-06,
"loss": 2.1162,
"step": 1012
},
{
"epoch": 0.8643344709897611,
"grad_norm": 19.43463134765625,
"learning_rate": 1.4375561545372868e-06,
"loss": 2.292,
"step": 1013
},
{
"epoch": 0.8651877133105802,
"grad_norm": 38.333335876464844,
"learning_rate": 1.4285714285714286e-06,
"loss": 2.9941,
"step": 1014
},
{
"epoch": 0.8660409556313993,
"grad_norm": 20.772680282592773,
"learning_rate": 1.4195867026055707e-06,
"loss": 2.0811,
"step": 1015
},
{
"epoch": 0.8668941979522184,
"grad_norm": 15.94913387298584,
"learning_rate": 1.4106019766397126e-06,
"loss": 2.7207,
"step": 1016
},
{
"epoch": 0.8677474402730375,
"grad_norm": 22.80181884765625,
"learning_rate": 1.4016172506738545e-06,
"loss": 2.5518,
"step": 1017
},
{
"epoch": 0.8686006825938567,
"grad_norm": 24.729825973510742,
"learning_rate": 1.3926325247079966e-06,
"loss": 2.1699,
"step": 1018
},
{
"epoch": 0.8694539249146758,
"grad_norm": 34.91142654418945,
"learning_rate": 1.3836477987421384e-06,
"loss": 2.6553,
"step": 1019
},
{
"epoch": 0.8703071672354948,
"grad_norm": 25.28569793701172,
"learning_rate": 1.3746630727762805e-06,
"loss": 2.25,
"step": 1020
},
{
"epoch": 0.871160409556314,
"grad_norm": 22.64327621459961,
"learning_rate": 1.3656783468104224e-06,
"loss": 1.7666,
"step": 1021
},
{
"epoch": 0.8720136518771331,
"grad_norm": 20.240745544433594,
"learning_rate": 1.3566936208445645e-06,
"loss": 2.9014,
"step": 1022
},
{
"epoch": 0.8728668941979523,
"grad_norm": 20.950401306152344,
"learning_rate": 1.3477088948787062e-06,
"loss": 2.4102,
"step": 1023
},
{
"epoch": 0.8737201365187713,
"grad_norm": 36.6849479675293,
"learning_rate": 1.3387241689128482e-06,
"loss": 2.8906,
"step": 1024
},
{
"epoch": 0.8745733788395904,
"grad_norm": 31.71055030822754,
"learning_rate": 1.3297394429469901e-06,
"loss": 2.417,
"step": 1025
},
{
"epoch": 0.8754266211604096,
"grad_norm": 26.664091110229492,
"learning_rate": 1.3207547169811322e-06,
"loss": 2.6895,
"step": 1026
},
{
"epoch": 0.8762798634812287,
"grad_norm": 21.710546493530273,
"learning_rate": 1.3117699910152743e-06,
"loss": 2.2451,
"step": 1027
},
{
"epoch": 0.8771331058020477,
"grad_norm": 23.674776077270508,
"learning_rate": 1.302785265049416e-06,
"loss": 2.6943,
"step": 1028
},
{
"epoch": 0.8779863481228669,
"grad_norm": 17.062026977539062,
"learning_rate": 1.293800539083558e-06,
"loss": 2.5938,
"step": 1029
},
{
"epoch": 0.878839590443686,
"grad_norm": 19.835830688476562,
"learning_rate": 1.2848158131177e-06,
"loss": 1.8984,
"step": 1030
},
{
"epoch": 0.8796928327645052,
"grad_norm": 22.001405715942383,
"learning_rate": 1.275831087151842e-06,
"loss": 2.3711,
"step": 1031
},
{
"epoch": 0.8805460750853242,
"grad_norm": 19.773300170898438,
"learning_rate": 1.266846361185984e-06,
"loss": 2.2451,
"step": 1032
},
{
"epoch": 0.8813993174061433,
"grad_norm": 20.110618591308594,
"learning_rate": 1.257861635220126e-06,
"loss": 2.0088,
"step": 1033
},
{
"epoch": 0.8822525597269625,
"grad_norm": 18.571706771850586,
"learning_rate": 1.2488769092542679e-06,
"loss": 2.6729,
"step": 1034
},
{
"epoch": 0.8831058020477816,
"grad_norm": 22.619691848754883,
"learning_rate": 1.2398921832884097e-06,
"loss": 2.3291,
"step": 1035
},
{
"epoch": 0.8839590443686007,
"grad_norm": 15.416295051574707,
"learning_rate": 1.2309074573225516e-06,
"loss": 2.1562,
"step": 1036
},
{
"epoch": 0.8848122866894198,
"grad_norm": 18.906389236450195,
"learning_rate": 1.2219227313566937e-06,
"loss": 2.2881,
"step": 1037
},
{
"epoch": 0.8856655290102389,
"grad_norm": 26.780014038085938,
"learning_rate": 1.2129380053908358e-06,
"loss": 2.1504,
"step": 1038
},
{
"epoch": 0.886518771331058,
"grad_norm": 23.079744338989258,
"learning_rate": 1.2039532794249777e-06,
"loss": 2.5264,
"step": 1039
},
{
"epoch": 0.8873720136518771,
"grad_norm": 18.029769897460938,
"learning_rate": 1.1949685534591195e-06,
"loss": 2.4541,
"step": 1040
},
{
"epoch": 0.8882252559726962,
"grad_norm": 30.483293533325195,
"learning_rate": 1.1859838274932616e-06,
"loss": 2.7627,
"step": 1041
},
{
"epoch": 0.8890784982935154,
"grad_norm": 18.396657943725586,
"learning_rate": 1.1769991015274035e-06,
"loss": 2.1699,
"step": 1042
},
{
"epoch": 0.8899317406143344,
"grad_norm": 42.24234390258789,
"learning_rate": 1.1680143755615454e-06,
"loss": 2.4609,
"step": 1043
},
{
"epoch": 0.8907849829351536,
"grad_norm": 37.315792083740234,
"learning_rate": 1.1590296495956873e-06,
"loss": 1.9644,
"step": 1044
},
{
"epoch": 0.8916382252559727,
"grad_norm": 18.035730361938477,
"learning_rate": 1.1500449236298294e-06,
"loss": 2.2441,
"step": 1045
},
{
"epoch": 0.8924914675767918,
"grad_norm": 33.729732513427734,
"learning_rate": 1.1410601976639714e-06,
"loss": 2.7402,
"step": 1046
},
{
"epoch": 0.893344709897611,
"grad_norm": 17.152904510498047,
"learning_rate": 1.1320754716981133e-06,
"loss": 2.2881,
"step": 1047
},
{
"epoch": 0.89419795221843,
"grad_norm": 30.483760833740234,
"learning_rate": 1.1230907457322552e-06,
"loss": 2.1406,
"step": 1048
},
{
"epoch": 0.8950511945392492,
"grad_norm": 22.160011291503906,
"learning_rate": 1.1141060197663973e-06,
"loss": 2.4102,
"step": 1049
},
{
"epoch": 0.8959044368600683,
"grad_norm": 21.861427307128906,
"learning_rate": 1.1051212938005392e-06,
"loss": 2.377,
"step": 1050
},
{
"epoch": 0.8967576791808873,
"grad_norm": 16.11711883544922,
"learning_rate": 1.096136567834681e-06,
"loss": 2.2207,
"step": 1051
},
{
"epoch": 0.8976109215017065,
"grad_norm": 16.705957412719727,
"learning_rate": 1.087151841868823e-06,
"loss": 2.4766,
"step": 1052
},
{
"epoch": 0.8984641638225256,
"grad_norm": 21.10558319091797,
"learning_rate": 1.078167115902965e-06,
"loss": 2.8291,
"step": 1053
},
{
"epoch": 0.8993174061433447,
"grad_norm": 16.919170379638672,
"learning_rate": 1.069182389937107e-06,
"loss": 2.3379,
"step": 1054
},
{
"epoch": 0.9001706484641638,
"grad_norm": 16.1108455657959,
"learning_rate": 1.060197663971249e-06,
"loss": 2.0049,
"step": 1055
},
{
"epoch": 0.9010238907849829,
"grad_norm": 14.90361213684082,
"learning_rate": 1.0512129380053909e-06,
"loss": 2.0732,
"step": 1056
},
{
"epoch": 0.9018771331058021,
"grad_norm": 22.117961883544922,
"learning_rate": 1.042228212039533e-06,
"loss": 2.4478,
"step": 1057
},
{
"epoch": 0.9027303754266212,
"grad_norm": 17.26648712158203,
"learning_rate": 1.0332434860736748e-06,
"loss": 2.4834,
"step": 1058
},
{
"epoch": 0.9035836177474402,
"grad_norm": 29.885637283325195,
"learning_rate": 1.0242587601078167e-06,
"loss": 2.5439,
"step": 1059
},
{
"epoch": 0.9044368600682594,
"grad_norm": 36.06587600708008,
"learning_rate": 1.0152740341419588e-06,
"loss": 2.3203,
"step": 1060
},
{
"epoch": 0.9052901023890785,
"grad_norm": 23.25389862060547,
"learning_rate": 1.0062893081761007e-06,
"loss": 1.8232,
"step": 1061
},
{
"epoch": 0.9061433447098977,
"grad_norm": 19.698678970336914,
"learning_rate": 9.973045822102428e-07,
"loss": 2.1807,
"step": 1062
},
{
"epoch": 0.9069965870307167,
"grad_norm": 20.899768829345703,
"learning_rate": 9.883198562443846e-07,
"loss": 2.0088,
"step": 1063
},
{
"epoch": 0.9078498293515358,
"grad_norm": 17.271106719970703,
"learning_rate": 9.793351302785265e-07,
"loss": 2.4834,
"step": 1064
},
{
"epoch": 0.908703071672355,
"grad_norm": 18.44869041442871,
"learning_rate": 9.703504043126686e-07,
"loss": 2.2676,
"step": 1065
},
{
"epoch": 0.909556313993174,
"grad_norm": 22.660959243774414,
"learning_rate": 9.613656783468105e-07,
"loss": 2.5537,
"step": 1066
},
{
"epoch": 0.9104095563139932,
"grad_norm": 27.0996150970459,
"learning_rate": 9.523809523809525e-07,
"loss": 2.2266,
"step": 1067
},
{
"epoch": 0.9112627986348123,
"grad_norm": 34.511531829833984,
"learning_rate": 9.433962264150944e-07,
"loss": 2.3223,
"step": 1068
},
{
"epoch": 0.9121160409556314,
"grad_norm": 27.928035736083984,
"learning_rate": 9.344115004492364e-07,
"loss": 2.6211,
"step": 1069
},
{
"epoch": 0.9129692832764505,
"grad_norm": 27.532997131347656,
"learning_rate": 9.254267744833784e-07,
"loss": 2.2324,
"step": 1070
},
{
"epoch": 0.9138225255972696,
"grad_norm": 14.844276428222656,
"learning_rate": 9.164420485175203e-07,
"loss": 2.2422,
"step": 1071
},
{
"epoch": 0.9146757679180887,
"grad_norm": 21.835037231445312,
"learning_rate": 9.074573225516623e-07,
"loss": 2.2881,
"step": 1072
},
{
"epoch": 0.9155290102389079,
"grad_norm": 19.69972801208496,
"learning_rate": 8.984725965858041e-07,
"loss": 2.1846,
"step": 1073
},
{
"epoch": 0.9163822525597269,
"grad_norm": 15.349184036254883,
"learning_rate": 8.894878706199461e-07,
"loss": 1.7319,
"step": 1074
},
{
"epoch": 0.9172354948805461,
"grad_norm": 19.928043365478516,
"learning_rate": 8.805031446540881e-07,
"loss": 2.4658,
"step": 1075
},
{
"epoch": 0.9180887372013652,
"grad_norm": 17.366472244262695,
"learning_rate": 8.715184186882302e-07,
"loss": 2.2046,
"step": 1076
},
{
"epoch": 0.9189419795221843,
"grad_norm": 29.670156478881836,
"learning_rate": 8.625336927223721e-07,
"loss": 2.3613,
"step": 1077
},
{
"epoch": 0.9197952218430034,
"grad_norm": 30.919553756713867,
"learning_rate": 8.535489667565141e-07,
"loss": 1.9961,
"step": 1078
},
{
"epoch": 0.9206484641638225,
"grad_norm": 14.773041725158691,
"learning_rate": 8.445642407906559e-07,
"loss": 1.9248,
"step": 1079
},
{
"epoch": 0.9215017064846417,
"grad_norm": 24.538284301757812,
"learning_rate": 8.355795148247979e-07,
"loss": 2.3975,
"step": 1080
},
{
"epoch": 0.9223549488054608,
"grad_norm": 19.382570266723633,
"learning_rate": 8.265947888589399e-07,
"loss": 2.1125,
"step": 1081
},
{
"epoch": 0.9232081911262798,
"grad_norm": 19.567092895507812,
"learning_rate": 8.176100628930818e-07,
"loss": 2.7686,
"step": 1082
},
{
"epoch": 0.924061433447099,
"grad_norm": 29.195287704467773,
"learning_rate": 8.086253369272238e-07,
"loss": 2.8613,
"step": 1083
},
{
"epoch": 0.9249146757679181,
"grad_norm": 20.474294662475586,
"learning_rate": 7.996406109613658e-07,
"loss": 2.2227,
"step": 1084
},
{
"epoch": 0.9257679180887372,
"grad_norm": 18.846967697143555,
"learning_rate": 7.906558849955077e-07,
"loss": 2.5205,
"step": 1085
},
{
"epoch": 0.9266211604095563,
"grad_norm": 23.362014770507812,
"learning_rate": 7.816711590296497e-07,
"loss": 2.3252,
"step": 1086
},
{
"epoch": 0.9274744027303754,
"grad_norm": 23.076448440551758,
"learning_rate": 7.726864330637916e-07,
"loss": 1.7588,
"step": 1087
},
{
"epoch": 0.9283276450511946,
"grad_norm": 24.485366821289062,
"learning_rate": 7.637017070979336e-07,
"loss": 2.397,
"step": 1088
},
{
"epoch": 0.9291808873720137,
"grad_norm": 21.237762451171875,
"learning_rate": 7.547169811320755e-07,
"loss": 2.7598,
"step": 1089
},
{
"epoch": 0.9300341296928327,
"grad_norm": 26.516183853149414,
"learning_rate": 7.457322551662174e-07,
"loss": 2.7891,
"step": 1090
},
{
"epoch": 0.9308873720136519,
"grad_norm": 21.238964080810547,
"learning_rate": 7.367475292003594e-07,
"loss": 2.5303,
"step": 1091
},
{
"epoch": 0.931740614334471,
"grad_norm": 30.594709396362305,
"learning_rate": 7.277628032345015e-07,
"loss": 1.8975,
"step": 1092
},
{
"epoch": 0.9325938566552902,
"grad_norm": 35.4166145324707,
"learning_rate": 7.187780772686434e-07,
"loss": 1.9258,
"step": 1093
},
{
"epoch": 0.9334470989761092,
"grad_norm": 22.17268943786621,
"learning_rate": 7.097933513027854e-07,
"loss": 2.4492,
"step": 1094
},
{
"epoch": 0.9343003412969283,
"grad_norm": 17.64724349975586,
"learning_rate": 7.008086253369272e-07,
"loss": 2.5547,
"step": 1095
},
{
"epoch": 0.9351535836177475,
"grad_norm": 24.73906135559082,
"learning_rate": 6.918238993710692e-07,
"loss": 2.5254,
"step": 1096
},
{
"epoch": 0.9360068259385665,
"grad_norm": 26.848501205444336,
"learning_rate": 6.828391734052112e-07,
"loss": 2.6846,
"step": 1097
},
{
"epoch": 0.9368600682593856,
"grad_norm": 20.17809295654297,
"learning_rate": 6.738544474393531e-07,
"loss": 2.2471,
"step": 1098
},
{
"epoch": 0.9377133105802048,
"grad_norm": 22.41636085510254,
"learning_rate": 6.648697214734951e-07,
"loss": 1.8472,
"step": 1099
},
{
"epoch": 0.9385665529010239,
"grad_norm": 39.55388641357422,
"learning_rate": 6.558849955076372e-07,
"loss": 2.3638,
"step": 1100
},
{
"epoch": 0.939419795221843,
"grad_norm": 33.42490005493164,
"learning_rate": 6.46900269541779e-07,
"loss": 2.4688,
"step": 1101
},
{
"epoch": 0.9402730375426621,
"grad_norm": 21.77603530883789,
"learning_rate": 6.37915543575921e-07,
"loss": 2.2363,
"step": 1102
},
{
"epoch": 0.9411262798634812,
"grad_norm": 24.46465301513672,
"learning_rate": 6.28930817610063e-07,
"loss": 2.2061,
"step": 1103
},
{
"epoch": 0.9419795221843004,
"grad_norm": 19.78148651123047,
"learning_rate": 6.199460916442049e-07,
"loss": 2.5908,
"step": 1104
},
{
"epoch": 0.9428327645051194,
"grad_norm": 31.342111587524414,
"learning_rate": 6.109613656783469e-07,
"loss": 2.5615,
"step": 1105
},
{
"epoch": 0.9436860068259386,
"grad_norm": 13.717397689819336,
"learning_rate": 6.019766397124888e-07,
"loss": 2.0635,
"step": 1106
},
{
"epoch": 0.9445392491467577,
"grad_norm": 21.84761619567871,
"learning_rate": 5.929919137466308e-07,
"loss": 2.293,
"step": 1107
},
{
"epoch": 0.9453924914675768,
"grad_norm": 37.024166107177734,
"learning_rate": 5.840071877807727e-07,
"loss": 2.249,
"step": 1108
},
{
"epoch": 0.9462457337883959,
"grad_norm": 17.425418853759766,
"learning_rate": 5.750224618149147e-07,
"loss": 2.3672,
"step": 1109
},
{
"epoch": 0.947098976109215,
"grad_norm": 26.030006408691406,
"learning_rate": 5.660377358490567e-07,
"loss": 2.124,
"step": 1110
},
{
"epoch": 0.9479522184300341,
"grad_norm": 22.431434631347656,
"learning_rate": 5.570530098831986e-07,
"loss": 2.3848,
"step": 1111
},
{
"epoch": 0.9488054607508533,
"grad_norm": 19.168577194213867,
"learning_rate": 5.480682839173405e-07,
"loss": 2.1758,
"step": 1112
},
{
"epoch": 0.9496587030716723,
"grad_norm": 17.07505226135254,
"learning_rate": 5.390835579514825e-07,
"loss": 2.374,
"step": 1113
},
{
"epoch": 0.9505119453924915,
"grad_norm": 26.080429077148438,
"learning_rate": 5.300988319856245e-07,
"loss": 2.3418,
"step": 1114
},
{
"epoch": 0.9513651877133106,
"grad_norm": 21.243762969970703,
"learning_rate": 5.211141060197665e-07,
"loss": 2.1953,
"step": 1115
},
{
"epoch": 0.9522184300341296,
"grad_norm": 15.045170783996582,
"learning_rate": 5.121293800539083e-07,
"loss": 2.1016,
"step": 1116
},
{
"epoch": 0.9530716723549488,
"grad_norm": 23.7006778717041,
"learning_rate": 5.031446540880503e-07,
"loss": 2.8184,
"step": 1117
},
{
"epoch": 0.9539249146757679,
"grad_norm": 18.883411407470703,
"learning_rate": 4.941599281221923e-07,
"loss": 2.6475,
"step": 1118
},
{
"epoch": 0.9547781569965871,
"grad_norm": 25.889921188354492,
"learning_rate": 4.851752021563343e-07,
"loss": 2.0693,
"step": 1119
},
{
"epoch": 0.9556313993174061,
"grad_norm": 22.17185401916504,
"learning_rate": 4.7619047619047623e-07,
"loss": 2.4834,
"step": 1120
},
{
"epoch": 0.9564846416382252,
"grad_norm": 22.72771644592285,
"learning_rate": 4.672057502246182e-07,
"loss": 2.5225,
"step": 1121
},
{
"epoch": 0.9573378839590444,
"grad_norm": 14.462482452392578,
"learning_rate": 4.5822102425876014e-07,
"loss": 2.0977,
"step": 1122
},
{
"epoch": 0.9581911262798635,
"grad_norm": 16.800121307373047,
"learning_rate": 4.4923629829290207e-07,
"loss": 2.0635,
"step": 1123
},
{
"epoch": 0.9590443686006825,
"grad_norm": 31.013629913330078,
"learning_rate": 4.4025157232704405e-07,
"loss": 2.3926,
"step": 1124
},
{
"epoch": 0.9598976109215017,
"grad_norm": 16.524974822998047,
"learning_rate": 4.3126684636118604e-07,
"loss": 2.1387,
"step": 1125
},
{
"epoch": 0.9607508532423208,
"grad_norm": 25.04954719543457,
"learning_rate": 4.2228212039532797e-07,
"loss": 2.6875,
"step": 1126
},
{
"epoch": 0.96160409556314,
"grad_norm": 22.931171417236328,
"learning_rate": 4.1329739442946995e-07,
"loss": 2.2598,
"step": 1127
},
{
"epoch": 0.962457337883959,
"grad_norm": 18.490894317626953,
"learning_rate": 4.043126684636119e-07,
"loss": 2.0537,
"step": 1128
},
{
"epoch": 0.9633105802047781,
"grad_norm": 18.204050064086914,
"learning_rate": 3.9532794249775386e-07,
"loss": 2.5342,
"step": 1129
},
{
"epoch": 0.9641638225255973,
"grad_norm": 40.86081314086914,
"learning_rate": 3.863432165318958e-07,
"loss": 2.4844,
"step": 1130
},
{
"epoch": 0.9650170648464164,
"grad_norm": 14.383349418640137,
"learning_rate": 3.773584905660378e-07,
"loss": 1.7993,
"step": 1131
},
{
"epoch": 0.9658703071672355,
"grad_norm": 23.310863494873047,
"learning_rate": 3.683737646001797e-07,
"loss": 2.2402,
"step": 1132
},
{
"epoch": 0.9667235494880546,
"grad_norm": 18.63228416442871,
"learning_rate": 3.593890386343217e-07,
"loss": 1.998,
"step": 1133
},
{
"epoch": 0.9675767918088737,
"grad_norm": 82.3555908203125,
"learning_rate": 3.504043126684636e-07,
"loss": 2.5557,
"step": 1134
},
{
"epoch": 0.9684300341296929,
"grad_norm": 16.93051528930664,
"learning_rate": 3.414195867026056e-07,
"loss": 2.2803,
"step": 1135
},
{
"epoch": 0.9692832764505119,
"grad_norm": 21.192161560058594,
"learning_rate": 3.3243486073674753e-07,
"loss": 2.7246,
"step": 1136
},
{
"epoch": 0.9701365187713311,
"grad_norm": 18.740169525146484,
"learning_rate": 3.234501347708895e-07,
"loss": 2.7227,
"step": 1137
},
{
"epoch": 0.9709897610921502,
"grad_norm": 19.469411849975586,
"learning_rate": 3.144654088050315e-07,
"loss": 2.2979,
"step": 1138
},
{
"epoch": 0.9718430034129693,
"grad_norm": 18.27403450012207,
"learning_rate": 3.0548068283917343e-07,
"loss": 2.251,
"step": 1139
},
{
"epoch": 0.9726962457337884,
"grad_norm": 24.299030303955078,
"learning_rate": 2.964959568733154e-07,
"loss": 2.4541,
"step": 1140
},
{
"epoch": 0.9735494880546075,
"grad_norm": 24.530475616455078,
"learning_rate": 2.8751123090745734e-07,
"loss": 2.1475,
"step": 1141
},
{
"epoch": 0.9744027303754266,
"grad_norm": 25.455007553100586,
"learning_rate": 2.785265049415993e-07,
"loss": 2.3701,
"step": 1142
},
{
"epoch": 0.9752559726962458,
"grad_norm": 16.469362258911133,
"learning_rate": 2.6954177897574125e-07,
"loss": 1.9355,
"step": 1143
},
{
"epoch": 0.9761092150170648,
"grad_norm": 19.01254653930664,
"learning_rate": 2.6055705300988324e-07,
"loss": 2.3057,
"step": 1144
},
{
"epoch": 0.976962457337884,
"grad_norm": 19.243854522705078,
"learning_rate": 2.5157232704402517e-07,
"loss": 2.4736,
"step": 1145
},
{
"epoch": 0.9778156996587031,
"grad_norm": 22.926416397094727,
"learning_rate": 2.4258760107816715e-07,
"loss": 1.9111,
"step": 1146
},
{
"epoch": 0.9786689419795221,
"grad_norm": 21.436504364013672,
"learning_rate": 2.336028751123091e-07,
"loss": 2.5264,
"step": 1147
},
{
"epoch": 0.9795221843003413,
"grad_norm": 18.81846809387207,
"learning_rate": 2.2461814914645103e-07,
"loss": 2.2744,
"step": 1148
},
{
"epoch": 0.9803754266211604,
"grad_norm": 17.731666564941406,
"learning_rate": 2.1563342318059302e-07,
"loss": 2.3506,
"step": 1149
},
{
"epoch": 0.9812286689419796,
"grad_norm": 18.162921905517578,
"learning_rate": 2.0664869721473497e-07,
"loss": 2.3584,
"step": 1150
},
{
"epoch": 0.9820819112627986,
"grad_norm": 17.115859985351562,
"learning_rate": 1.9766397124887693e-07,
"loss": 2.3174,
"step": 1151
},
{
"epoch": 0.9829351535836177,
"grad_norm": 19.12236785888672,
"learning_rate": 1.886792452830189e-07,
"loss": 2.8047,
"step": 1152
},
{
"epoch": 0.9837883959044369,
"grad_norm": 46.08415603637695,
"learning_rate": 1.7969451931716084e-07,
"loss": 2.2979,
"step": 1153
},
{
"epoch": 0.984641638225256,
"grad_norm": 19.95178985595703,
"learning_rate": 1.707097933513028e-07,
"loss": 2.5273,
"step": 1154
},
{
"epoch": 0.985494880546075,
"grad_norm": 19.663936614990234,
"learning_rate": 1.6172506738544476e-07,
"loss": 2.1338,
"step": 1155
},
{
"epoch": 0.9863481228668942,
"grad_norm": 18.48255729675293,
"learning_rate": 1.5274034141958671e-07,
"loss": 2.2705,
"step": 1156
},
{
"epoch": 0.9872013651877133,
"grad_norm": 14.638534545898438,
"learning_rate": 1.4375561545372867e-07,
"loss": 1.9199,
"step": 1157
},
{
"epoch": 0.9880546075085325,
"grad_norm": 17.064104080200195,
"learning_rate": 1.3477088948787063e-07,
"loss": 2.3135,
"step": 1158
},
{
"epoch": 0.9889078498293515,
"grad_norm": 24.58094024658203,
"learning_rate": 1.2578616352201258e-07,
"loss": 2.5918,
"step": 1159
},
{
"epoch": 0.9897610921501706,
"grad_norm": 15.54403305053711,
"learning_rate": 1.1680143755615455e-07,
"loss": 2.0586,
"step": 1160
},
{
"epoch": 0.9906143344709898,
"grad_norm": 19.432096481323242,
"learning_rate": 1.0781671159029651e-07,
"loss": 2.0234,
"step": 1161
},
{
"epoch": 0.9914675767918089,
"grad_norm": 23.13845443725586,
"learning_rate": 9.883198562443847e-08,
"loss": 2.1523,
"step": 1162
},
{
"epoch": 0.992320819112628,
"grad_norm": 19.302988052368164,
"learning_rate": 8.984725965858042e-08,
"loss": 2.5889,
"step": 1163
},
{
"epoch": 0.9931740614334471,
"grad_norm": 22.22420883178711,
"learning_rate": 8.086253369272238e-08,
"loss": 2.4434,
"step": 1164
},
{
"epoch": 0.9940273037542662,
"grad_norm": 16.57465362548828,
"learning_rate": 7.187780772686433e-08,
"loss": 2.2539,
"step": 1165
},
{
"epoch": 0.9948805460750854,
"grad_norm": 29.630794525146484,
"learning_rate": 6.289308176100629e-08,
"loss": 2.3193,
"step": 1166
},
{
"epoch": 0.9957337883959044,
"grad_norm": 16.782909393310547,
"learning_rate": 5.3908355795148254e-08,
"loss": 1.9961,
"step": 1167
},
{
"epoch": 0.9965870307167235,
"grad_norm": 25.104806900024414,
"learning_rate": 4.492362982929021e-08,
"loss": 2.375,
"step": 1168
},
{
"epoch": 0.9974402730375427,
"grad_norm": 15.419062614440918,
"learning_rate": 3.593890386343217e-08,
"loss": 1.9033,
"step": 1169
},
{
"epoch": 0.9982935153583617,
"grad_norm": 18.09457015991211,
"learning_rate": 2.6954177897574127e-08,
"loss": 2.3037,
"step": 1170
},
{
"epoch": 0.9991467576791809,
"grad_norm": 20.33756446838379,
"learning_rate": 1.7969451931716084e-08,
"loss": 2.2764,
"step": 1171
},
{
"epoch": 1.0,
"grad_norm": 22.51215362548828,
"learning_rate": 8.984725965858042e-09,
"loss": 2.6963,
"step": 1172
},
{
"epoch": 1.0,
"step": 1172,
"total_flos": 1.0293976231418266e+18,
"train_loss": 2.8942469053300983,
"train_runtime": 765.0994,
"train_samples_per_second": 392.106,
"train_steps_per_second": 1.532
}
],
"logging_steps": 1,
"max_steps": 1172,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.0293976231418266e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}