{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 2344,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0004266211604095563,
"grad_norm": 182.41513061523438,
"learning_rate": 0.0,
"loss": 7.0137,
"step": 1
},
{
"epoch": 0.0008532423208191126,
"grad_norm": 168.21009826660156,
"learning_rate": 8.474576271186442e-08,
"loss": 6.4453,
"step": 2
},
{
"epoch": 0.001279863481228669,
"grad_norm": 191.7275848388672,
"learning_rate": 1.6949152542372883e-07,
"loss": 6.5684,
"step": 3
},
{
"epoch": 0.0017064846416382253,
"grad_norm": 188.32925415039062,
"learning_rate": 2.5423728813559323e-07,
"loss": 6.9531,
"step": 4
},
{
"epoch": 0.0021331058020477816,
"grad_norm": 165.11737060546875,
"learning_rate": 3.3898305084745766e-07,
"loss": 5.9609,
"step": 5
},
{
"epoch": 0.002559726962457338,
"grad_norm": 103.73497009277344,
"learning_rate": 4.2372881355932204e-07,
"loss": 5.7617,
"step": 6
},
{
"epoch": 0.0029863481228668944,
"grad_norm": 139.20375061035156,
"learning_rate": 5.084745762711865e-07,
"loss": 5.5273,
"step": 7
},
{
"epoch": 0.0034129692832764505,
"grad_norm": 96.38396453857422,
"learning_rate": 5.93220338983051e-07,
"loss": 5.5898,
"step": 8
},
{
"epoch": 0.0038395904436860067,
"grad_norm": 100.02017211914062,
"learning_rate": 6.779661016949153e-07,
"loss": 5.0391,
"step": 9
},
{
"epoch": 0.004266211604095563,
"grad_norm": 131.98098754882812,
"learning_rate": 7.627118644067798e-07,
"loss": 5.375,
"step": 10
},
{
"epoch": 0.00469283276450512,
"grad_norm": 155.806396484375,
"learning_rate": 8.474576271186441e-07,
"loss": 5.7734,
"step": 11
},
{
"epoch": 0.005119453924914676,
"grad_norm": 151.8001251220703,
"learning_rate": 9.322033898305086e-07,
"loss": 6.3633,
"step": 12
},
{
"epoch": 0.005546075085324232,
"grad_norm": 148.3273468017578,
"learning_rate": 1.016949152542373e-06,
"loss": 6.2266,
"step": 13
},
{
"epoch": 0.005972696245733789,
"grad_norm": 178.5255126953125,
"learning_rate": 1.1016949152542374e-06,
"loss": 6.4297,
"step": 14
},
{
"epoch": 0.0063993174061433445,
"grad_norm": 167.9723663330078,
"learning_rate": 1.186440677966102e-06,
"loss": 6.6602,
"step": 15
},
{
"epoch": 0.006825938566552901,
"grad_norm": 132.07110595703125,
"learning_rate": 1.2711864406779662e-06,
"loss": 6.0898,
"step": 16
},
{
"epoch": 0.007252559726962458,
"grad_norm": 85.35553741455078,
"learning_rate": 1.3559322033898307e-06,
"loss": 5.9453,
"step": 17
},
{
"epoch": 0.007679180887372013,
"grad_norm": 59.41227340698242,
"learning_rate": 1.4406779661016951e-06,
"loss": 5.4102,
"step": 18
},
{
"epoch": 0.008105802047781569,
"grad_norm": 126.45568084716797,
"learning_rate": 1.5254237288135596e-06,
"loss": 5.4043,
"step": 19
},
{
"epoch": 0.008532423208191127,
"grad_norm": 48.57197570800781,
"learning_rate": 1.6101694915254237e-06,
"loss": 5.7871,
"step": 20
},
{
"epoch": 0.008959044368600682,
"grad_norm": 102.81019592285156,
"learning_rate": 1.6949152542372882e-06,
"loss": 5.459,
"step": 21
},
{
"epoch": 0.00938566552901024,
"grad_norm": 130.22621154785156,
"learning_rate": 1.7796610169491526e-06,
"loss": 5.207,
"step": 22
},
{
"epoch": 0.009812286689419795,
"grad_norm": 159.87655639648438,
"learning_rate": 1.8644067796610171e-06,
"loss": 5.0664,
"step": 23
},
{
"epoch": 0.010238907849829351,
"grad_norm": 136.3029327392578,
"learning_rate": 1.9491525423728816e-06,
"loss": 5.4824,
"step": 24
},
{
"epoch": 0.010665529010238909,
"grad_norm": 92.69465637207031,
"learning_rate": 2.033898305084746e-06,
"loss": 5.918,
"step": 25
},
{
"epoch": 0.011092150170648464,
"grad_norm": 154.13182067871094,
"learning_rate": 2.11864406779661e-06,
"loss": 5.3008,
"step": 26
},
{
"epoch": 0.01151877133105802,
"grad_norm": 107.52462005615234,
"learning_rate": 2.203389830508475e-06,
"loss": 5.2422,
"step": 27
},
{
"epoch": 0.011945392491467578,
"grad_norm": 103.60791778564453,
"learning_rate": 2.288135593220339e-06,
"loss": 4.7988,
"step": 28
},
{
"epoch": 0.012372013651877133,
"grad_norm": 128.28968811035156,
"learning_rate": 2.372881355932204e-06,
"loss": 5.123,
"step": 29
},
{
"epoch": 0.012798634812286689,
"grad_norm": 28.77732276916504,
"learning_rate": 2.457627118644068e-06,
"loss": 5.0469,
"step": 30
},
{
"epoch": 0.013225255972696246,
"grad_norm": 12.902440071105957,
"learning_rate": 2.5423728813559323e-06,
"loss": 4.8359,
"step": 31
},
{
"epoch": 0.013651877133105802,
"grad_norm": 76.21299743652344,
"learning_rate": 2.627118644067797e-06,
"loss": 4.418,
"step": 32
},
{
"epoch": 0.014078498293515358,
"grad_norm": 25.660552978515625,
"learning_rate": 2.7118644067796613e-06,
"loss": 4.6562,
"step": 33
},
{
"epoch": 0.014505119453924915,
"grad_norm": 12.741079330444336,
"learning_rate": 2.7966101694915256e-06,
"loss": 4.6113,
"step": 34
},
{
"epoch": 0.014931740614334471,
"grad_norm": 62.22819137573242,
"learning_rate": 2.8813559322033903e-06,
"loss": 4.5723,
"step": 35
},
{
"epoch": 0.015358361774744027,
"grad_norm": 18.65302276611328,
"learning_rate": 2.9661016949152545e-06,
"loss": 4.4434,
"step": 36
},
{
"epoch": 0.015784982935153583,
"grad_norm": 33.47079849243164,
"learning_rate": 3.0508474576271192e-06,
"loss": 4.6621,
"step": 37
},
{
"epoch": 0.016211604095563138,
"grad_norm": 65.28570556640625,
"learning_rate": 3.135593220338983e-06,
"loss": 4.4492,
"step": 38
},
{
"epoch": 0.016638225255972697,
"grad_norm": 18.592966079711914,
"learning_rate": 3.2203389830508473e-06,
"loss": 4.2324,
"step": 39
},
{
"epoch": 0.017064846416382253,
"grad_norm": 44.71397018432617,
"learning_rate": 3.305084745762712e-06,
"loss": 4.2344,
"step": 40
},
{
"epoch": 0.01749146757679181,
"grad_norm": 28.17648696899414,
"learning_rate": 3.3898305084745763e-06,
"loss": 4.127,
"step": 41
},
{
"epoch": 0.017918088737201365,
"grad_norm": 19.224647521972656,
"learning_rate": 3.474576271186441e-06,
"loss": 4.2148,
"step": 42
},
{
"epoch": 0.01834470989761092,
"grad_norm": 22.937255859375,
"learning_rate": 3.5593220338983053e-06,
"loss": 4.3301,
"step": 43
},
{
"epoch": 0.01877133105802048,
"grad_norm": 29.138164520263672,
"learning_rate": 3.6440677966101695e-06,
"loss": 4.3203,
"step": 44
},
{
"epoch": 0.019197952218430035,
"grad_norm": 51.77314758300781,
"learning_rate": 3.7288135593220342e-06,
"loss": 3.8086,
"step": 45
},
{
"epoch": 0.01962457337883959,
"grad_norm": 11.840182304382324,
"learning_rate": 3.8135593220338985e-06,
"loss": 3.9746,
"step": 46
},
{
"epoch": 0.020051194539249147,
"grad_norm": 47.615848541259766,
"learning_rate": 3.898305084745763e-06,
"loss": 4.2988,
"step": 47
},
{
"epoch": 0.020477815699658702,
"grad_norm": 42.8083381652832,
"learning_rate": 3.9830508474576275e-06,
"loss": 3.7168,
"step": 48
},
{
"epoch": 0.020904436860068258,
"grad_norm": 40.28496551513672,
"learning_rate": 4.067796610169492e-06,
"loss": 4.0176,
"step": 49
},
{
"epoch": 0.021331058020477817,
"grad_norm": 13.85673713684082,
"learning_rate": 4.152542372881356e-06,
"loss": 3.8164,
"step": 50
},
{
"epoch": 0.021757679180887373,
"grad_norm": 14.762373924255371,
"learning_rate": 4.23728813559322e-06,
"loss": 4.1719,
"step": 51
},
{
"epoch": 0.02218430034129693,
"grad_norm": 27.187488555908203,
"learning_rate": 4.322033898305085e-06,
"loss": 4.416,
"step": 52
},
{
"epoch": 0.022610921501706484,
"grad_norm": 38.772220611572266,
"learning_rate": 4.40677966101695e-06,
"loss": 3.9219,
"step": 53
},
{
"epoch": 0.02303754266211604,
"grad_norm": 59.488548278808594,
"learning_rate": 4.491525423728814e-06,
"loss": 3.4355,
"step": 54
},
{
"epoch": 0.023464163822525596,
"grad_norm": 59.54182434082031,
"learning_rate": 4.576271186440678e-06,
"loss": 3.7969,
"step": 55
},
{
"epoch": 0.023890784982935155,
"grad_norm": 28.035480499267578,
"learning_rate": 4.6610169491525425e-06,
"loss": 3.75,
"step": 56
},
{
"epoch": 0.02431740614334471,
"grad_norm": 61.01053237915039,
"learning_rate": 4.745762711864408e-06,
"loss": 4.3125,
"step": 57
},
{
"epoch": 0.024744027303754267,
"grad_norm": 48.68558883666992,
"learning_rate": 4.830508474576272e-06,
"loss": 4.2266,
"step": 58
},
{
"epoch": 0.025170648464163822,
"grad_norm": 26.09473419189453,
"learning_rate": 4.915254237288136e-06,
"loss": 3.9473,
"step": 59
},
{
"epoch": 0.025597269624573378,
"grad_norm": 31.998085021972656,
"learning_rate": 5e-06,
"loss": 3.5244,
"step": 60
},
{
"epoch": 0.026023890784982934,
"grad_norm": 64.63945007324219,
"learning_rate": 5.084745762711865e-06,
"loss": 3.9707,
"step": 61
},
{
"epoch": 0.026450511945392493,
"grad_norm": 61.44773864746094,
"learning_rate": 5.169491525423729e-06,
"loss": 3.7266,
"step": 62
},
{
"epoch": 0.02687713310580205,
"grad_norm": 96.3592529296875,
"learning_rate": 5.254237288135594e-06,
"loss": 4.2656,
"step": 63
},
{
"epoch": 0.027303754266211604,
"grad_norm": 88.27396392822266,
"learning_rate": 5.338983050847458e-06,
"loss": 3.9473,
"step": 64
},
{
"epoch": 0.02773037542662116,
"grad_norm": 49.265892028808594,
"learning_rate": 5.423728813559323e-06,
"loss": 3.6973,
"step": 65
},
{
"epoch": 0.028156996587030716,
"grad_norm": 20.13355255126953,
"learning_rate": 5.508474576271187e-06,
"loss": 3.9805,
"step": 66
},
{
"epoch": 0.02858361774744027,
"grad_norm": 21.27371597290039,
"learning_rate": 5.593220338983051e-06,
"loss": 3.6777,
"step": 67
},
{
"epoch": 0.02901023890784983,
"grad_norm": 18.588361740112305,
"learning_rate": 5.677966101694916e-06,
"loss": 3.6465,
"step": 68
},
{
"epoch": 0.029436860068259386,
"grad_norm": 46.04341506958008,
"learning_rate": 5.7627118644067805e-06,
"loss": 3.627,
"step": 69
},
{
"epoch": 0.029863481228668942,
"grad_norm": 89.78927612304688,
"learning_rate": 5.847457627118645e-06,
"loss": 3.7227,
"step": 70
},
{
"epoch": 0.030290102389078498,
"grad_norm": 85.39374542236328,
"learning_rate": 5.932203389830509e-06,
"loss": 3.4609,
"step": 71
},
{
"epoch": 0.030716723549488054,
"grad_norm": 100.72692108154297,
"learning_rate": 6.0169491525423725e-06,
"loss": 3.6016,
"step": 72
},
{
"epoch": 0.03114334470989761,
"grad_norm": 23.946613311767578,
"learning_rate": 6.1016949152542385e-06,
"loss": 4.0117,
"step": 73
},
{
"epoch": 0.031569965870307165,
"grad_norm": 63.32298278808594,
"learning_rate": 6.186440677966103e-06,
"loss": 3.9414,
"step": 74
},
{
"epoch": 0.03199658703071672,
"grad_norm": 63.94538116455078,
"learning_rate": 6.271186440677966e-06,
"loss": 4.1016,
"step": 75
},
{
"epoch": 0.032423208191126277,
"grad_norm": 85.71823120117188,
"learning_rate": 6.3559322033898304e-06,
"loss": 4.1504,
"step": 76
},
{
"epoch": 0.03284982935153584,
"grad_norm": 43.66586685180664,
"learning_rate": 6.440677966101695e-06,
"loss": 3.5293,
"step": 77
},
{
"epoch": 0.033276450511945395,
"grad_norm": 31.57004165649414,
"learning_rate": 6.52542372881356e-06,
"loss": 3.9141,
"step": 78
},
{
"epoch": 0.03370307167235495,
"grad_norm": 17.844924926757812,
"learning_rate": 6.610169491525424e-06,
"loss": 3.5469,
"step": 79
},
{
"epoch": 0.034129692832764506,
"grad_norm": 60.84952163696289,
"learning_rate": 6.694915254237288e-06,
"loss": 3.6504,
"step": 80
},
{
"epoch": 0.03455631399317406,
"grad_norm": 54.73509979248047,
"learning_rate": 6.779661016949153e-06,
"loss": 3.5234,
"step": 81
},
{
"epoch": 0.03498293515358362,
"grad_norm": 15.640512466430664,
"learning_rate": 6.864406779661017e-06,
"loss": 3.3867,
"step": 82
},
{
"epoch": 0.035409556313993173,
"grad_norm": 25.58538246154785,
"learning_rate": 6.949152542372882e-06,
"loss": 3.5215,
"step": 83
},
{
"epoch": 0.03583617747440273,
"grad_norm": 47.705474853515625,
"learning_rate": 7.033898305084746e-06,
"loss": 3.3828,
"step": 84
},
{
"epoch": 0.036262798634812285,
"grad_norm": 19.92372703552246,
"learning_rate": 7.1186440677966106e-06,
"loss": 3.959,
"step": 85
},
{
"epoch": 0.03668941979522184,
"grad_norm": 58.42499542236328,
"learning_rate": 7.203389830508475e-06,
"loss": 3.6934,
"step": 86
},
{
"epoch": 0.037116040955631396,
"grad_norm": 16.288785934448242,
"learning_rate": 7.288135593220339e-06,
"loss": 3.0273,
"step": 87
},
{
"epoch": 0.03754266211604096,
"grad_norm": 38.42738342285156,
"learning_rate": 7.372881355932204e-06,
"loss": 3.0938,
"step": 88
},
{
"epoch": 0.037969283276450515,
"grad_norm": 60.21091842651367,
"learning_rate": 7.4576271186440685e-06,
"loss": 3.3691,
"step": 89
},
{
"epoch": 0.03839590443686007,
"grad_norm": 30.50881004333496,
"learning_rate": 7.542372881355933e-06,
"loss": 3.4746,
"step": 90
},
{
"epoch": 0.038822525597269626,
"grad_norm": 21.170913696289062,
"learning_rate": 7.627118644067797e-06,
"loss": 3.25,
"step": 91
},
{
"epoch": 0.03924914675767918,
"grad_norm": 22.86617088317871,
"learning_rate": 7.711864406779663e-06,
"loss": 3.1152,
"step": 92
},
{
"epoch": 0.03967576791808874,
"grad_norm": 73.49543762207031,
"learning_rate": 7.796610169491526e-06,
"loss": 3.1328,
"step": 93
},
{
"epoch": 0.04010238907849829,
"grad_norm": 40.927406311035156,
"learning_rate": 7.88135593220339e-06,
"loss": 3.0215,
"step": 94
},
{
"epoch": 0.04052901023890785,
"grad_norm": 51.40994644165039,
"learning_rate": 7.966101694915255e-06,
"loss": 3.8906,
"step": 95
},
{
"epoch": 0.040955631399317405,
"grad_norm": 59.001495361328125,
"learning_rate": 8.050847457627118e-06,
"loss": 3.9209,
"step": 96
},
{
"epoch": 0.04138225255972696,
"grad_norm": 26.802316665649414,
"learning_rate": 8.135593220338983e-06,
"loss": 3.5762,
"step": 97
},
{
"epoch": 0.041808873720136516,
"grad_norm": 32.98997116088867,
"learning_rate": 8.220338983050849e-06,
"loss": 3.7031,
"step": 98
},
{
"epoch": 0.04223549488054607,
"grad_norm": 23.042098999023438,
"learning_rate": 8.305084745762712e-06,
"loss": 4.0488,
"step": 99
},
{
"epoch": 0.042662116040955635,
"grad_norm": 66.93525695800781,
"learning_rate": 8.389830508474577e-06,
"loss": 3.8965,
"step": 100
},
{
"epoch": 0.04308873720136519,
"grad_norm": 39.358070373535156,
"learning_rate": 8.47457627118644e-06,
"loss": 3.2852,
"step": 101
},
{
"epoch": 0.043515358361774746,
"grad_norm": 14.342375755310059,
"learning_rate": 8.559322033898306e-06,
"loss": 3.3594,
"step": 102
},
{
"epoch": 0.0439419795221843,
"grad_norm": 36.743690490722656,
"learning_rate": 8.64406779661017e-06,
"loss": 3.3359,
"step": 103
},
{
"epoch": 0.04436860068259386,
"grad_norm": 32.253173828125,
"learning_rate": 8.728813559322034e-06,
"loss": 3.3418,
"step": 104
},
{
"epoch": 0.04479522184300341,
"grad_norm": 34.88021469116211,
"learning_rate": 8.8135593220339e-06,
"loss": 3.623,
"step": 105
},
{
"epoch": 0.04522184300341297,
"grad_norm": 62.94032287597656,
"learning_rate": 8.898305084745763e-06,
"loss": 3.6289,
"step": 106
},
{
"epoch": 0.045648464163822525,
"grad_norm": 37.47450637817383,
"learning_rate": 8.983050847457628e-06,
"loss": 2.9238,
"step": 107
},
{
"epoch": 0.04607508532423208,
"grad_norm": 21.631309509277344,
"learning_rate": 9.067796610169493e-06,
"loss": 3.4141,
"step": 108
},
{
"epoch": 0.046501706484641636,
"grad_norm": 42.39487075805664,
"learning_rate": 9.152542372881356e-06,
"loss": 4.3613,
"step": 109
},
{
"epoch": 0.04692832764505119,
"grad_norm": 29.449064254760742,
"learning_rate": 9.237288135593222e-06,
"loss": 3.7773,
"step": 110
},
{
"epoch": 0.04735494880546075,
"grad_norm": 35.71257019042969,
"learning_rate": 9.322033898305085e-06,
"loss": 3.0,
"step": 111
},
{
"epoch": 0.04778156996587031,
"grad_norm": 45.36302947998047,
"learning_rate": 9.40677966101695e-06,
"loss": 3.7363,
"step": 112
},
{
"epoch": 0.048208191126279866,
"grad_norm": 21.860015869140625,
"learning_rate": 9.491525423728815e-06,
"loss": 3.6289,
"step": 113
},
{
"epoch": 0.04863481228668942,
"grad_norm": 46.48929977416992,
"learning_rate": 9.576271186440679e-06,
"loss": 3.3066,
"step": 114
},
{
"epoch": 0.04906143344709898,
"grad_norm": 16.482284545898438,
"learning_rate": 9.661016949152544e-06,
"loss": 3.1934,
"step": 115
},
{
"epoch": 0.04948805460750853,
"grad_norm": 29.34090232849121,
"learning_rate": 9.745762711864407e-06,
"loss": 3.7871,
"step": 116
},
{
"epoch": 0.04991467576791809,
"grad_norm": 26.02210235595703,
"learning_rate": 9.830508474576272e-06,
"loss": 3.2969,
"step": 117
},
{
"epoch": 0.050341296928327645,
"grad_norm": 33.3517951965332,
"learning_rate": 9.915254237288137e-06,
"loss": 3.1006,
"step": 118
},
{
"epoch": 0.0507679180887372,
"grad_norm": 40.15692138671875,
"learning_rate": 1e-05,
"loss": 3.6816,
"step": 119
},
{
"epoch": 0.051194539249146756,
"grad_norm": 31.148941040039062,
"learning_rate": 9.995507637017073e-06,
"loss": 3.6113,
"step": 120
},
{
"epoch": 0.05162116040955631,
"grad_norm": 44.07515335083008,
"learning_rate": 9.991015274034143e-06,
"loss": 3.5859,
"step": 121
},
{
"epoch": 0.05204778156996587,
"grad_norm": 73.39000701904297,
"learning_rate": 9.986522911051215e-06,
"loss": 3.2852,
"step": 122
},
{
"epoch": 0.05247440273037542,
"grad_norm": 47.39154815673828,
"learning_rate": 9.982030548068285e-06,
"loss": 2.7686,
"step": 123
},
{
"epoch": 0.052901023890784986,
"grad_norm": 18.518726348876953,
"learning_rate": 9.977538185085355e-06,
"loss": 3.5391,
"step": 124
},
{
"epoch": 0.05332764505119454,
"grad_norm": 19.222064971923828,
"learning_rate": 9.973045822102425e-06,
"loss": 3.8418,
"step": 125
},
{
"epoch": 0.0537542662116041,
"grad_norm": 93.03326416015625,
"learning_rate": 9.968553459119497e-06,
"loss": 3.4277,
"step": 126
},
{
"epoch": 0.05418088737201365,
"grad_norm": 25.00471305847168,
"learning_rate": 9.96406109613657e-06,
"loss": 3.4277,
"step": 127
},
{
"epoch": 0.05460750853242321,
"grad_norm": 20.304758071899414,
"learning_rate": 9.95956873315364e-06,
"loss": 3.5527,
"step": 128
},
{
"epoch": 0.055034129692832764,
"grad_norm": 49.463077545166016,
"learning_rate": 9.955076370170711e-06,
"loss": 3.3809,
"step": 129
},
{
"epoch": 0.05546075085324232,
"grad_norm": 19.390296936035156,
"learning_rate": 9.950584007187781e-06,
"loss": 3.7402,
"step": 130
},
{
"epoch": 0.055887372013651876,
"grad_norm": 70.4116439819336,
"learning_rate": 9.946091644204853e-06,
"loss": 3.6895,
"step": 131
},
{
"epoch": 0.05631399317406143,
"grad_norm": 32.00130844116211,
"learning_rate": 9.941599281221924e-06,
"loss": 2.8701,
"step": 132
},
{
"epoch": 0.05674061433447099,
"grad_norm": 15.688907623291016,
"learning_rate": 9.937106918238994e-06,
"loss": 3.5215,
"step": 133
},
{
"epoch": 0.05716723549488054,
"grad_norm": 28.683942794799805,
"learning_rate": 9.932614555256066e-06,
"loss": 3.791,
"step": 134
},
{
"epoch": 0.057593856655290106,
"grad_norm": 20.80558204650879,
"learning_rate": 9.928122192273136e-06,
"loss": 3.0703,
"step": 135
},
{
"epoch": 0.05802047781569966,
"grad_norm": 59.96001434326172,
"learning_rate": 9.923629829290208e-06,
"loss": 3.6309,
"step": 136
},
{
"epoch": 0.05844709897610922,
"grad_norm": 62.419368743896484,
"learning_rate": 9.919137466307278e-06,
"loss": 3.5293,
"step": 137
},
{
"epoch": 0.05887372013651877,
"grad_norm": 65.35993957519531,
"learning_rate": 9.91464510332435e-06,
"loss": 3.4355,
"step": 138
},
{
"epoch": 0.05930034129692833,
"grad_norm": 47.3123779296875,
"learning_rate": 9.91015274034142e-06,
"loss": 3.3809,
"step": 139
},
{
"epoch": 0.059726962457337884,
"grad_norm": 47.363914489746094,
"learning_rate": 9.905660377358492e-06,
"loss": 3.998,
"step": 140
},
{
"epoch": 0.06015358361774744,
"grad_norm": 37.10547637939453,
"learning_rate": 9.901168014375562e-06,
"loss": 3.0352,
"step": 141
},
{
"epoch": 0.060580204778156996,
"grad_norm": 46.55497360229492,
"learning_rate": 9.896675651392634e-06,
"loss": 3.0293,
"step": 142
},
{
"epoch": 0.06100682593856655,
"grad_norm": 25.803016662597656,
"learning_rate": 9.892183288409704e-06,
"loss": 3.3047,
"step": 143
},
{
"epoch": 0.06143344709897611,
"grad_norm": 33.59292984008789,
"learning_rate": 9.887690925426774e-06,
"loss": 3.6113,
"step": 144
},
{
"epoch": 0.06186006825938566,
"grad_norm": 36.967140197753906,
"learning_rate": 9.883198562443846e-06,
"loss": 2.8086,
"step": 145
},
{
"epoch": 0.06228668941979522,
"grad_norm": 80.75099182128906,
"learning_rate": 9.878706199460916e-06,
"loss": 3.4902,
"step": 146
},
{
"epoch": 0.06271331058020478,
"grad_norm": 60.304359436035156,
"learning_rate": 9.874213836477988e-06,
"loss": 3.2695,
"step": 147
},
{
"epoch": 0.06313993174061433,
"grad_norm": 73.07403564453125,
"learning_rate": 9.86972147349506e-06,
"loss": 3.5273,
"step": 148
},
{
"epoch": 0.06356655290102389,
"grad_norm": 24.91359519958496,
"learning_rate": 9.86522911051213e-06,
"loss": 3.4062,
"step": 149
},
{
"epoch": 0.06399317406143344,
"grad_norm": 41.76774978637695,
"learning_rate": 9.860736747529202e-06,
"loss": 3.4316,
"step": 150
},
{
"epoch": 0.064419795221843,
"grad_norm": 39.29990005493164,
"learning_rate": 9.856244384546273e-06,
"loss": 3.3379,
"step": 151
},
{
"epoch": 0.06484641638225255,
"grad_norm": 66.60665130615234,
"learning_rate": 9.851752021563343e-06,
"loss": 3.5,
"step": 152
},
{
"epoch": 0.06527303754266212,
"grad_norm": 32.428009033203125,
"learning_rate": 9.847259658580413e-06,
"loss": 3.1816,
"step": 153
},
{
"epoch": 0.06569965870307168,
"grad_norm": 20.9417781829834,
"learning_rate": 9.842767295597485e-06,
"loss": 3.3105,
"step": 154
},
{
"epoch": 0.06612627986348123,
"grad_norm": 59.17805862426758,
"learning_rate": 9.838274932614557e-06,
"loss": 3.1074,
"step": 155
},
{
"epoch": 0.06655290102389079,
"grad_norm": 58.085693359375,
"learning_rate": 9.833782569631627e-06,
"loss": 3.1348,
"step": 156
},
{
"epoch": 0.06697952218430034,
"grad_norm": 26.633859634399414,
"learning_rate": 9.829290206648699e-06,
"loss": 3.3379,
"step": 157
},
{
"epoch": 0.0674061433447099,
"grad_norm": 27.633708953857422,
"learning_rate": 9.824797843665769e-06,
"loss": 3.7852,
"step": 158
},
{
"epoch": 0.06783276450511945,
"grad_norm": 103.55244445800781,
"learning_rate": 9.820305480682841e-06,
"loss": 3.1504,
"step": 159
},
{
"epoch": 0.06825938566552901,
"grad_norm": 118.57666778564453,
"learning_rate": 9.815813117699911e-06,
"loss": 3.5752,
"step": 160
},
{
"epoch": 0.06868600682593856,
"grad_norm": 46.34486389160156,
"learning_rate": 9.811320754716981e-06,
"loss": 3.4902,
"step": 161
},
{
"epoch": 0.06911262798634812,
"grad_norm": 74.79405975341797,
"learning_rate": 9.806828391734053e-06,
"loss": 2.9424,
"step": 162
},
{
"epoch": 0.06953924914675767,
"grad_norm": 57.469566345214844,
"learning_rate": 9.802336028751123e-06,
"loss": 3.4219,
"step": 163
},
{
"epoch": 0.06996587030716724,
"grad_norm": 20.774744033813477,
"learning_rate": 9.797843665768195e-06,
"loss": 3.7871,
"step": 164
},
{
"epoch": 0.0703924914675768,
"grad_norm": 58.800113677978516,
"learning_rate": 9.793351302785265e-06,
"loss": 3.4141,
"step": 165
},
{
"epoch": 0.07081911262798635,
"grad_norm": 65.20160675048828,
"learning_rate": 9.788858939802337e-06,
"loss": 3.3359,
"step": 166
},
{
"epoch": 0.07124573378839591,
"grad_norm": 71.90331268310547,
"learning_rate": 9.784366576819408e-06,
"loss": 3.2422,
"step": 167
},
{
"epoch": 0.07167235494880546,
"grad_norm": 69.28284454345703,
"learning_rate": 9.77987421383648e-06,
"loss": 3.6152,
"step": 168
},
{
"epoch": 0.07209897610921502,
"grad_norm": 34.195621490478516,
"learning_rate": 9.77538185085355e-06,
"loss": 3.7012,
"step": 169
},
{
"epoch": 0.07252559726962457,
"grad_norm": 15.721536636352539,
"learning_rate": 9.77088948787062e-06,
"loss": 3.5898,
"step": 170
},
{
"epoch": 0.07295221843003413,
"grad_norm": 18.50776481628418,
"learning_rate": 9.766397124887692e-06,
"loss": 3.3477,
"step": 171
},
{
"epoch": 0.07337883959044368,
"grad_norm": 149.7288055419922,
"learning_rate": 9.761904761904762e-06,
"loss": 3.7578,
"step": 172
},
{
"epoch": 0.07380546075085324,
"grad_norm": 84.52557373046875,
"learning_rate": 9.757412398921834e-06,
"loss": 3.4316,
"step": 173
},
{
"epoch": 0.07423208191126279,
"grad_norm": 83.47309112548828,
"learning_rate": 9.752920035938904e-06,
"loss": 3.3984,
"step": 174
},
{
"epoch": 0.07465870307167236,
"grad_norm": 27.937088012695312,
"learning_rate": 9.748427672955976e-06,
"loss": 3.7148,
"step": 175
},
{
"epoch": 0.07508532423208192,
"grad_norm": 23.611757278442383,
"learning_rate": 9.743935309973048e-06,
"loss": 3.208,
"step": 176
},
{
"epoch": 0.07551194539249147,
"grad_norm": 30.196529388427734,
"learning_rate": 9.739442946990118e-06,
"loss": 3.1592,
"step": 177
},
{
"epoch": 0.07593856655290103,
"grad_norm": 42.56078338623047,
"learning_rate": 9.734950584007188e-06,
"loss": 3.6641,
"step": 178
},
{
"epoch": 0.07636518771331058,
"grad_norm": 51.98075866699219,
"learning_rate": 9.73045822102426e-06,
"loss": 3.834,
"step": 179
},
{
"epoch": 0.07679180887372014,
"grad_norm": 54.121307373046875,
"learning_rate": 9.72596585804133e-06,
"loss": 3.457,
"step": 180
},
{
"epoch": 0.07721843003412969,
"grad_norm": 39.24628829956055,
"learning_rate": 9.7214734950584e-06,
"loss": 3.2236,
"step": 181
},
{
"epoch": 0.07764505119453925,
"grad_norm": 20.651260375976562,
"learning_rate": 9.716981132075472e-06,
"loss": 3.8125,
"step": 182
},
{
"epoch": 0.0780716723549488,
"grad_norm": 68.77881622314453,
"learning_rate": 9.712488769092544e-06,
"loss": 3.3652,
"step": 183
},
{
"epoch": 0.07849829351535836,
"grad_norm": 21.397153854370117,
"learning_rate": 9.707996406109614e-06,
"loss": 3.3125,
"step": 184
},
{
"epoch": 0.07892491467576791,
"grad_norm": 73.60026550292969,
"learning_rate": 9.703504043126686e-06,
"loss": 3.3945,
"step": 185
},
{
"epoch": 0.07935153583617748,
"grad_norm": 129.18272399902344,
"learning_rate": 9.699011680143757e-06,
"loss": 3.4141,
"step": 186
},
{
"epoch": 0.07977815699658702,
"grad_norm": 27.627702713012695,
"learning_rate": 9.694519317160828e-06,
"loss": 3.3594,
"step": 187
},
{
"epoch": 0.08020477815699659,
"grad_norm": 19.12004852294922,
"learning_rate": 9.690026954177899e-06,
"loss": 3.1738,
"step": 188
},
{
"epoch": 0.08063139931740615,
"grad_norm": 54.20294189453125,
"learning_rate": 9.685534591194969e-06,
"loss": 3.3184,
"step": 189
},
{
"epoch": 0.0810580204778157,
"grad_norm": 79.17913818359375,
"learning_rate": 9.68104222821204e-06,
"loss": 3.8184,
"step": 190
},
{
"epoch": 0.08148464163822526,
"grad_norm": 93.08058166503906,
"learning_rate": 9.676549865229111e-06,
"loss": 3.7852,
"step": 191
},
{
"epoch": 0.08191126279863481,
"grad_norm": 94.66240692138672,
"learning_rate": 9.672057502246183e-06,
"loss": 4.0957,
"step": 192
},
{
"epoch": 0.08233788395904437,
"grad_norm": 74.10448455810547,
"learning_rate": 9.667565139263253e-06,
"loss": 3.3672,
"step": 193
},
{
"epoch": 0.08276450511945392,
"grad_norm": 15.366697311401367,
"learning_rate": 9.663072776280325e-06,
"loss": 2.8145,
"step": 194
},
{
"epoch": 0.08319112627986348,
"grad_norm": 28.673463821411133,
"learning_rate": 9.658580413297395e-06,
"loss": 3.334,
"step": 195
},
{
"epoch": 0.08361774744027303,
"grad_norm": 67.39178466796875,
"learning_rate": 9.654088050314467e-06,
"loss": 3.5195,
"step": 196
},
{
"epoch": 0.0840443686006826,
"grad_norm": 50.03989791870117,
"learning_rate": 9.649595687331537e-06,
"loss": 3.6289,
"step": 197
},
{
"epoch": 0.08447098976109214,
"grad_norm": 60.73853302001953,
"learning_rate": 9.645103324348607e-06,
"loss": 3.5918,
"step": 198
},
{
"epoch": 0.0848976109215017,
"grad_norm": 62.10542297363281,
"learning_rate": 9.64061096136568e-06,
"loss": 3.0469,
"step": 199
},
{
"epoch": 0.08532423208191127,
"grad_norm": 46.920196533203125,
"learning_rate": 9.63611859838275e-06,
"loss": 3.1494,
"step": 200
},
{
"epoch": 0.08575085324232082,
"grad_norm": 33.74738693237305,
"learning_rate": 9.631626235399821e-06,
"loss": 3.6973,
"step": 201
},
{
"epoch": 0.08617747440273038,
"grad_norm": 41.40378189086914,
"learning_rate": 9.627133872416892e-06,
"loss": 3.373,
"step": 202
},
{
"epoch": 0.08660409556313993,
"grad_norm": 83.42320251464844,
"learning_rate": 9.622641509433963e-06,
"loss": 3.4805,
"step": 203
},
{
"epoch": 0.08703071672354949,
"grad_norm": 60.74094772338867,
"learning_rate": 9.618149146451034e-06,
"loss": 3.2285,
"step": 204
},
{
"epoch": 0.08745733788395904,
"grad_norm": 80.0930404663086,
"learning_rate": 9.613656783468106e-06,
"loss": 3.5938,
"step": 205
},
{
"epoch": 0.0878839590443686,
"grad_norm": 45.48680877685547,
"learning_rate": 9.609164420485176e-06,
"loss": 3.3945,
"step": 206
},
{
"epoch": 0.08831058020477815,
"grad_norm": 37.88972473144531,
"learning_rate": 9.604672057502246e-06,
"loss": 2.7588,
"step": 207
},
{
"epoch": 0.08873720136518772,
"grad_norm": 25.568714141845703,
"learning_rate": 9.600179694519318e-06,
"loss": 3.2305,
"step": 208
},
{
"epoch": 0.08916382252559726,
"grad_norm": 56.041358947753906,
"learning_rate": 9.595687331536388e-06,
"loss": 3.123,
"step": 209
},
{
"epoch": 0.08959044368600683,
"grad_norm": 34.62221908569336,
"learning_rate": 9.59119496855346e-06,
"loss": 3.166,
"step": 210
},
{
"epoch": 0.09001706484641639,
"grad_norm": 79.33340454101562,
"learning_rate": 9.58670260557053e-06,
"loss": 3.1953,
"step": 211
},
{
"epoch": 0.09044368600682594,
"grad_norm": 45.325496673583984,
"learning_rate": 9.582210242587602e-06,
"loss": 3.4336,
"step": 212
},
{
"epoch": 0.0908703071672355,
"grad_norm": 20.415624618530273,
"learning_rate": 9.577717879604674e-06,
"loss": 3.1016,
"step": 213
},
{
"epoch": 0.09129692832764505,
"grad_norm": 80.71441650390625,
"learning_rate": 9.573225516621744e-06,
"loss": 3.8184,
"step": 214
},
{
"epoch": 0.09172354948805461,
"grad_norm": 44.34187698364258,
"learning_rate": 9.568733153638814e-06,
"loss": 3.4629,
"step": 215
},
{
"epoch": 0.09215017064846416,
"grad_norm": 23.23796272277832,
"learning_rate": 9.564240790655886e-06,
"loss": 3.5723,
"step": 216
},
{
"epoch": 0.09257679180887372,
"grad_norm": 24.016454696655273,
"learning_rate": 9.559748427672956e-06,
"loss": 3.0312,
"step": 217
},
{
"epoch": 0.09300341296928327,
"grad_norm": 22.58914566040039,
"learning_rate": 9.555256064690027e-06,
"loss": 3.3926,
"step": 218
},
{
"epoch": 0.09343003412969283,
"grad_norm": 43.52719497680664,
"learning_rate": 9.550763701707098e-06,
"loss": 3.0215,
"step": 219
},
{
"epoch": 0.09385665529010238,
"grad_norm": 19.073322296142578,
"learning_rate": 9.54627133872417e-06,
"loss": 3.0996,
"step": 220
},
{
"epoch": 0.09428327645051195,
"grad_norm": 35.90131759643555,
"learning_rate": 9.54177897574124e-06,
"loss": 3.0117,
"step": 221
},
{
"epoch": 0.0947098976109215,
"grad_norm": 41.415557861328125,
"learning_rate": 9.537286612758312e-06,
"loss": 2.7695,
"step": 222
},
{
"epoch": 0.09513651877133106,
"grad_norm": 37.814273834228516,
"learning_rate": 9.532794249775383e-06,
"loss": 3.3242,
"step": 223
},
{
"epoch": 0.09556313993174062,
"grad_norm": 85.59257507324219,
"learning_rate": 9.528301886792455e-06,
"loss": 3.6572,
"step": 224
},
{
"epoch": 0.09598976109215017,
"grad_norm": 60.9831428527832,
"learning_rate": 9.523809523809525e-06,
"loss": 3.0859,
"step": 225
},
{
"epoch": 0.09641638225255973,
"grad_norm": 66.6321029663086,
"learning_rate": 9.519317160826595e-06,
"loss": 3.2715,
"step": 226
},
{
"epoch": 0.09684300341296928,
"grad_norm": 30.54461097717285,
"learning_rate": 9.514824797843667e-06,
"loss": 2.5059,
"step": 227
},
{
"epoch": 0.09726962457337884,
"grad_norm": 31.020240783691406,
"learning_rate": 9.510332434860737e-06,
"loss": 3.0801,
"step": 228
},
{
"epoch": 0.09769624573378839,
"grad_norm": 38.5662841796875,
"learning_rate": 9.505840071877809e-06,
"loss": 3.4961,
"step": 229
},
{
"epoch": 0.09812286689419795,
"grad_norm": 32.846153259277344,
"learning_rate": 9.501347708894879e-06,
"loss": 3.2148,
"step": 230
},
{
"epoch": 0.0985494880546075,
"grad_norm": 24.943174362182617,
"learning_rate": 9.496855345911951e-06,
"loss": 3.1523,
"step": 231
},
{
"epoch": 0.09897610921501707,
"grad_norm": 36.72368240356445,
"learning_rate": 9.492362982929021e-06,
"loss": 3.0732,
"step": 232
},
{
"epoch": 0.09940273037542662,
"grad_norm": 18.37833023071289,
"learning_rate": 9.487870619946093e-06,
"loss": 2.8652,
"step": 233
},
{
"epoch": 0.09982935153583618,
"grad_norm": 23.98996353149414,
"learning_rate": 9.483378256963163e-06,
"loss": 3.1426,
"step": 234
},
{
"epoch": 0.10025597269624574,
"grad_norm": 43.43721008300781,
"learning_rate": 9.478885893980234e-06,
"loss": 3.6602,
"step": 235
},
{
"epoch": 0.10068259385665529,
"grad_norm": 59.83910369873047,
"learning_rate": 9.474393530997305e-06,
"loss": 3.2461,
"step": 236
},
{
"epoch": 0.10110921501706485,
"grad_norm": 26.23409652709961,
"learning_rate": 9.469901168014376e-06,
"loss": 2.832,
"step": 237
},
{
"epoch": 0.1015358361774744,
"grad_norm": 20.064634323120117,
"learning_rate": 9.465408805031447e-06,
"loss": 2.9238,
"step": 238
},
{
"epoch": 0.10196245733788396,
"grad_norm": 17.101430892944336,
"learning_rate": 9.460916442048518e-06,
"loss": 2.8154,
"step": 239
},
{
"epoch": 0.10238907849829351,
"grad_norm": 23.318517684936523,
"learning_rate": 9.45642407906559e-06,
"loss": 3.3594,
"step": 240
},
{
"epoch": 0.10281569965870307,
"grad_norm": 35.31196212768555,
"learning_rate": 9.451931716082661e-06,
"loss": 2.875,
"step": 241
},
{
"epoch": 0.10324232081911262,
"grad_norm": 25.542692184448242,
"learning_rate": 9.447439353099732e-06,
"loss": 2.8672,
"step": 242
},
{
"epoch": 0.10366894197952219,
"grad_norm": 28.5914249420166,
"learning_rate": 9.442946990116802e-06,
"loss": 3.252,
"step": 243
},
{
"epoch": 0.10409556313993173,
"grad_norm": 35.261146545410156,
"learning_rate": 9.438454627133872e-06,
"loss": 2.9238,
"step": 244
},
{
"epoch": 0.1045221843003413,
"grad_norm": 31.006851196289062,
"learning_rate": 9.433962264150944e-06,
"loss": 3.543,
"step": 245
},
{
"epoch": 0.10494880546075085,
"grad_norm": 28.32399559020996,
"learning_rate": 9.429469901168014e-06,
"loss": 3.1533,
"step": 246
},
{
"epoch": 0.10537542662116041,
"grad_norm": 22.59617805480957,
"learning_rate": 9.424977538185086e-06,
"loss": 2.9961,
"step": 247
},
{
"epoch": 0.10580204778156997,
"grad_norm": 31.118349075317383,
"learning_rate": 9.420485175202158e-06,
"loss": 3.8945,
"step": 248
},
{
"epoch": 0.10622866894197952,
"grad_norm": 42.014286041259766,
"learning_rate": 9.415992812219228e-06,
"loss": 2.7529,
"step": 249
},
{
"epoch": 0.10665529010238908,
"grad_norm": 30.373682022094727,
"learning_rate": 9.4115004492363e-06,
"loss": 3.2578,
"step": 250
},
{
"epoch": 0.10708191126279863,
"grad_norm": 18.01287078857422,
"learning_rate": 9.40700808625337e-06,
"loss": 3.1133,
"step": 251
},
{
"epoch": 0.1075085324232082,
"grad_norm": 34.93136978149414,
"learning_rate": 9.40251572327044e-06,
"loss": 2.9883,
"step": 252
},
{
"epoch": 0.10793515358361774,
"grad_norm": 19.136699676513672,
"learning_rate": 9.398023360287512e-06,
"loss": 3.2773,
"step": 253
},
{
"epoch": 0.1083617747440273,
"grad_norm": 67.24413299560547,
"learning_rate": 9.393530997304582e-06,
"loss": 3.0068,
"step": 254
},
{
"epoch": 0.10878839590443685,
"grad_norm": 65.19268035888672,
"learning_rate": 9.389038634321654e-06,
"loss": 3.1406,
"step": 255
},
{
"epoch": 0.10921501706484642,
"grad_norm": 78.92341613769531,
"learning_rate": 9.384546271338725e-06,
"loss": 3.709,
"step": 256
},
{
"epoch": 0.10964163822525597,
"grad_norm": 58.49309539794922,
"learning_rate": 9.380053908355796e-06,
"loss": 3.1504,
"step": 257
},
{
"epoch": 0.11006825938566553,
"grad_norm": 29.386493682861328,
"learning_rate": 9.375561545372867e-06,
"loss": 3.25,
"step": 258
},
{
"epoch": 0.11049488054607509,
"grad_norm": 36.24472427368164,
"learning_rate": 9.371069182389939e-06,
"loss": 3.084,
"step": 259
},
{
"epoch": 0.11092150170648464,
"grad_norm": 91.86468505859375,
"learning_rate": 9.366576819407009e-06,
"loss": 3.084,
"step": 260
},
{
"epoch": 0.1113481228668942,
"grad_norm": 105.02268981933594,
"learning_rate": 9.36208445642408e-06,
"loss": 4.002,
"step": 261
},
{
"epoch": 0.11177474402730375,
"grad_norm": 66.99861907958984,
"learning_rate": 9.357592093441151e-06,
"loss": 3.1582,
"step": 262
},
{
"epoch": 0.11220136518771331,
"grad_norm": 19.5322208404541,
"learning_rate": 9.353099730458221e-06,
"loss": 3.25,
"step": 263
},
{
"epoch": 0.11262798634812286,
"grad_norm": 21.23870849609375,
"learning_rate": 9.348607367475293e-06,
"loss": 3.4746,
"step": 264
},
{
"epoch": 0.11305460750853243,
"grad_norm": 21.280193328857422,
"learning_rate": 9.344115004492363e-06,
"loss": 3.0547,
"step": 265
},
{
"epoch": 0.11348122866894197,
"grad_norm": 23.46653938293457,
"learning_rate": 9.339622641509435e-06,
"loss": 2.9883,
"step": 266
},
{
"epoch": 0.11390784982935154,
"grad_norm": 24.50129508972168,
"learning_rate": 9.335130278526505e-06,
"loss": 3.2188,
"step": 267
},
{
"epoch": 0.11433447098976109,
"grad_norm": 20.978782653808594,
"learning_rate": 9.330637915543577e-06,
"loss": 3.1113,
"step": 268
},
{
"epoch": 0.11476109215017065,
"grad_norm": 23.665151596069336,
"learning_rate": 9.326145552560647e-06,
"loss": 2.9453,
"step": 269
},
{
"epoch": 0.11518771331058021,
"grad_norm": 19.293272018432617,
"learning_rate": 9.32165318957772e-06,
"loss": 2.8809,
"step": 270
},
{
"epoch": 0.11561433447098976,
"grad_norm": 17.699464797973633,
"learning_rate": 9.31716082659479e-06,
"loss": 2.9258,
"step": 271
},
{
"epoch": 0.11604095563139932,
"grad_norm": 44.36705780029297,
"learning_rate": 9.31266846361186e-06,
"loss": 2.7197,
"step": 272
},
{
"epoch": 0.11646757679180887,
"grad_norm": 23.318252563476562,
"learning_rate": 9.308176100628931e-06,
"loss": 3.8906,
"step": 273
},
{
"epoch": 0.11689419795221843,
"grad_norm": 49.59714889526367,
"learning_rate": 9.303683737646002e-06,
"loss": 3.3008,
"step": 274
},
{
"epoch": 0.11732081911262798,
"grad_norm": 33.267974853515625,
"learning_rate": 9.299191374663074e-06,
"loss": 3.498,
"step": 275
},
{
"epoch": 0.11774744027303755,
"grad_norm": 27.456832885742188,
"learning_rate": 9.294699011680145e-06,
"loss": 3.5273,
"step": 276
},
{
"epoch": 0.1181740614334471,
"grad_norm": 38.26292037963867,
"learning_rate": 9.290206648697216e-06,
"loss": 3.0762,
"step": 277
},
{
"epoch": 0.11860068259385666,
"grad_norm": 41.656185150146484,
"learning_rate": 9.285714285714288e-06,
"loss": 3.4746,
"step": 278
},
{
"epoch": 0.1190273037542662,
"grad_norm": 32.80821228027344,
"learning_rate": 9.281221922731358e-06,
"loss": 3.7451,
"step": 279
},
{
"epoch": 0.11945392491467577,
"grad_norm": 20.763341903686523,
"learning_rate": 9.276729559748428e-06,
"loss": 3.3711,
"step": 280
},
{
"epoch": 0.11988054607508532,
"grad_norm": 62.02823257446289,
"learning_rate": 9.272237196765498e-06,
"loss": 3.2432,
"step": 281
},
{
"epoch": 0.12030716723549488,
"grad_norm": 17.502212524414062,
"learning_rate": 9.26774483378257e-06,
"loss": 3.1055,
"step": 282
},
{
"epoch": 0.12073378839590444,
"grad_norm": 19.061084747314453,
"learning_rate": 9.263252470799642e-06,
"loss": 3.1777,
"step": 283
},
{
"epoch": 0.12116040955631399,
"grad_norm": 18.355392456054688,
"learning_rate": 9.258760107816712e-06,
"loss": 2.4844,
"step": 284
},
{
"epoch": 0.12158703071672355,
"grad_norm": 53.59149932861328,
"learning_rate": 9.254267744833784e-06,
"loss": 3.3242,
"step": 285
},
{
"epoch": 0.1220136518771331,
"grad_norm": 64.19649505615234,
"learning_rate": 9.249775381850854e-06,
"loss": 3.4668,
"step": 286
},
{
"epoch": 0.12244027303754267,
"grad_norm": 24.470962524414062,
"learning_rate": 9.245283018867926e-06,
"loss": 3.1016,
"step": 287
},
{
"epoch": 0.12286689419795221,
"grad_norm": 35.040287017822266,
"learning_rate": 9.240790655884996e-06,
"loss": 3.0752,
"step": 288
},
{
"epoch": 0.12329351535836178,
"grad_norm": 49.83961486816406,
"learning_rate": 9.236298292902067e-06,
"loss": 3.7383,
"step": 289
},
{
"epoch": 0.12372013651877133,
"grad_norm": 71.65055084228516,
"learning_rate": 9.231805929919138e-06,
"loss": 3.7207,
"step": 290
},
{
"epoch": 0.12414675767918089,
"grad_norm": 35.319000244140625,
"learning_rate": 9.227313566936209e-06,
"loss": 3.6309,
"step": 291
},
{
"epoch": 0.12457337883959044,
"grad_norm": 29.98273468017578,
"learning_rate": 9.22282120395328e-06,
"loss": 2.9551,
"step": 292
},
{
"epoch": 0.125,
"grad_norm": 26.370370864868164,
"learning_rate": 9.21832884097035e-06,
"loss": 3.2305,
"step": 293
},
{
"epoch": 0.12542662116040956,
"grad_norm": 61.236785888671875,
"learning_rate": 9.213836477987423e-06,
"loss": 3.7461,
"step": 294
},
{
"epoch": 0.12585324232081913,
"grad_norm": 23.307022094726562,
"learning_rate": 9.209344115004493e-06,
"loss": 2.8672,
"step": 295
},
{
"epoch": 0.12627986348122866,
"grad_norm": 42.118377685546875,
"learning_rate": 9.204851752021565e-06,
"loss": 2.8496,
"step": 296
},
{
"epoch": 0.12670648464163822,
"grad_norm": 62.872283935546875,
"learning_rate": 9.200359389038635e-06,
"loss": 3.3105,
"step": 297
},
{
"epoch": 0.12713310580204779,
"grad_norm": 27.943336486816406,
"learning_rate": 9.195867026055707e-06,
"loss": 3.3574,
"step": 298
},
{
"epoch": 0.12755972696245735,
"grad_norm": 32.46928787231445,
"learning_rate": 9.191374663072777e-06,
"loss": 3.5156,
"step": 299
},
{
"epoch": 0.12798634812286688,
"grad_norm": 61.87351989746094,
"learning_rate": 9.186882300089847e-06,
"loss": 3.2422,
"step": 300
},
{
"epoch": 0.12841296928327645,
"grad_norm": 108.4822998046875,
"learning_rate": 9.182389937106919e-06,
"loss": 3.3574,
"step": 301
},
{
"epoch": 0.128839590443686,
"grad_norm": 95.23423767089844,
"learning_rate": 9.17789757412399e-06,
"loss": 3.5137,
"step": 302
},
{
"epoch": 0.12926621160409557,
"grad_norm": 75.93583679199219,
"learning_rate": 9.173405211141061e-06,
"loss": 3.2715,
"step": 303
},
{
"epoch": 0.1296928327645051,
"grad_norm": 48.323036193847656,
"learning_rate": 9.168912848158133e-06,
"loss": 3.0869,
"step": 304
},
{
"epoch": 0.13011945392491467,
"grad_norm": 33.861515045166016,
"learning_rate": 9.164420485175203e-06,
"loss": 3.332,
"step": 305
},
{
"epoch": 0.13054607508532423,
"grad_norm": 48.168663024902344,
"learning_rate": 9.159928122192273e-06,
"loss": 3.4336,
"step": 306
},
{
"epoch": 0.1309726962457338,
"grad_norm": 43.305660247802734,
"learning_rate": 9.155435759209345e-06,
"loss": 2.998,
"step": 307
},
{
"epoch": 0.13139931740614336,
"grad_norm": 32.60908889770508,
"learning_rate": 9.150943396226416e-06,
"loss": 2.9258,
"step": 308
},
{
"epoch": 0.1318259385665529,
"grad_norm": 68.1447525024414,
"learning_rate": 9.146451033243486e-06,
"loss": 3.3203,
"step": 309
},
{
"epoch": 0.13225255972696245,
"grad_norm": 66.20540618896484,
"learning_rate": 9.141958670260558e-06,
"loss": 3.2773,
"step": 310
},
{
"epoch": 0.13267918088737202,
"grad_norm": 39.17079544067383,
"learning_rate": 9.13746630727763e-06,
"loss": 3.168,
"step": 311
},
{
"epoch": 0.13310580204778158,
"grad_norm": 48.80988311767578,
"learning_rate": 9.1329739442947e-06,
"loss": 3.7939,
"step": 312
},
{
"epoch": 0.13353242320819111,
"grad_norm": 49.57157897949219,
"learning_rate": 9.128481581311772e-06,
"loss": 3.5527,
"step": 313
},
{
"epoch": 0.13395904436860068,
"grad_norm": 29.27692222595215,
"learning_rate": 9.123989218328842e-06,
"loss": 3.3145,
"step": 314
},
{
"epoch": 0.13438566552901024,
"grad_norm": 79.36219024658203,
"learning_rate": 9.119496855345914e-06,
"loss": 3.9492,
"step": 315
},
{
"epoch": 0.1348122866894198,
"grad_norm": 67.88323974609375,
"learning_rate": 9.115004492362984e-06,
"loss": 3.292,
"step": 316
},
{
"epoch": 0.13523890784982937,
"grad_norm": 30.693199157714844,
"learning_rate": 9.110512129380054e-06,
"loss": 3.4082,
"step": 317
},
{
"epoch": 0.1356655290102389,
"grad_norm": 49.18144989013672,
"learning_rate": 9.106019766397126e-06,
"loss": 4.2949,
"step": 318
},
{
"epoch": 0.13609215017064846,
"grad_norm": 22.70636749267578,
"learning_rate": 9.101527403414196e-06,
"loss": 3.124,
"step": 319
},
{
"epoch": 0.13651877133105803,
"grad_norm": 17.64731788635254,
"learning_rate": 9.097035040431268e-06,
"loss": 3.2451,
"step": 320
},
{
"epoch": 0.1369453924914676,
"grad_norm": 35.68535232543945,
"learning_rate": 9.092542677448338e-06,
"loss": 3.2607,
"step": 321
},
{
"epoch": 0.13737201365187712,
"grad_norm": 25.95724868774414,
"learning_rate": 9.08805031446541e-06,
"loss": 3.1426,
"step": 322
},
{
"epoch": 0.13779863481228669,
"grad_norm": 20.903661727905273,
"learning_rate": 9.08355795148248e-06,
"loss": 2.7314,
"step": 323
},
{
"epoch": 0.13822525597269625,
"grad_norm": 35.06086730957031,
"learning_rate": 9.079065588499552e-06,
"loss": 3.2363,
"step": 324
},
{
"epoch": 0.1386518771331058,
"grad_norm": 14.262763023376465,
"learning_rate": 9.074573225516622e-06,
"loss": 3.0547,
"step": 325
},
{
"epoch": 0.13907849829351535,
"grad_norm": 14.322052001953125,
"learning_rate": 9.070080862533693e-06,
"loss": 2.9902,
"step": 326
},
{
"epoch": 0.1395051194539249,
"grad_norm": 27.241592407226562,
"learning_rate": 9.065588499550765e-06,
"loss": 2.542,
"step": 327
},
{
"epoch": 0.13993174061433447,
"grad_norm": 23.563817977905273,
"learning_rate": 9.061096136567835e-06,
"loss": 3.4922,
"step": 328
},
{
"epoch": 0.14035836177474403,
"grad_norm": 26.41652488708496,
"learning_rate": 9.056603773584907e-06,
"loss": 3.1562,
"step": 329
},
{
"epoch": 0.1407849829351536,
"grad_norm": 41.39048385620117,
"learning_rate": 9.052111410601977e-06,
"loss": 3.0781,
"step": 330
},
{
"epoch": 0.14121160409556313,
"grad_norm": 48.801815032958984,
"learning_rate": 9.047619047619049e-06,
"loss": 2.877,
"step": 331
},
{
"epoch": 0.1416382252559727,
"grad_norm": 27.74787139892578,
"learning_rate": 9.04312668463612e-06,
"loss": 3.5898,
"step": 332
},
{
"epoch": 0.14206484641638226,
"grad_norm": 27.950883865356445,
"learning_rate": 9.03863432165319e-06,
"loss": 3.0215,
"step": 333
},
{
"epoch": 0.14249146757679182,
"grad_norm": 27.625429153442383,
"learning_rate": 9.034141958670261e-06,
"loss": 3.377,
"step": 334
},
{
"epoch": 0.14291808873720135,
"grad_norm": 56.00491714477539,
"learning_rate": 9.029649595687333e-06,
"loss": 3.6309,
"step": 335
},
{
"epoch": 0.14334470989761092,
"grad_norm": 21.263246536254883,
"learning_rate": 9.025157232704403e-06,
"loss": 3.0566,
"step": 336
},
{
"epoch": 0.14377133105802048,
"grad_norm": 19.661069869995117,
"learning_rate": 9.020664869721473e-06,
"loss": 3.0547,
"step": 337
},
{
"epoch": 0.14419795221843004,
"grad_norm": 22.829408645629883,
"learning_rate": 9.016172506738545e-06,
"loss": 2.9902,
"step": 338
},
{
"epoch": 0.14462457337883958,
"grad_norm": 26.44325065612793,
"learning_rate": 9.011680143755617e-06,
"loss": 3.0586,
"step": 339
},
{
"epoch": 0.14505119453924914,
"grad_norm": 23.391923904418945,
"learning_rate": 9.007187780772687e-06,
"loss": 3.3105,
"step": 340
},
{
"epoch": 0.1454778156996587,
"grad_norm": 18.95509910583496,
"learning_rate": 9.002695417789759e-06,
"loss": 2.7871,
"step": 341
},
{
"epoch": 0.14590443686006827,
"grad_norm": 18.797534942626953,
"learning_rate": 8.99820305480683e-06,
"loss": 3.4238,
"step": 342
},
{
"epoch": 0.14633105802047783,
"grad_norm": 12.933985710144043,
"learning_rate": 8.9937106918239e-06,
"loss": 2.957,
"step": 343
},
{
"epoch": 0.14675767918088736,
"grad_norm": 74.13087463378906,
"learning_rate": 8.989218328840971e-06,
"loss": 3.709,
"step": 344
},
{
"epoch": 0.14718430034129693,
"grad_norm": 27.274499893188477,
"learning_rate": 8.984725965858042e-06,
"loss": 3.1973,
"step": 345
},
{
"epoch": 0.1476109215017065,
"grad_norm": 21.707658767700195,
"learning_rate": 8.980233602875114e-06,
"loss": 3.1875,
"step": 346
},
{
"epoch": 0.14803754266211605,
"grad_norm": 22.60236930847168,
"learning_rate": 8.975741239892184e-06,
"loss": 2.7441,
"step": 347
},
{
"epoch": 0.14846416382252559,
"grad_norm": 22.136030197143555,
"learning_rate": 8.971248876909256e-06,
"loss": 3.1777,
"step": 348
},
{
"epoch": 0.14889078498293515,
"grad_norm": 41.246482849121094,
"learning_rate": 8.966756513926326e-06,
"loss": 3.207,
"step": 349
},
{
"epoch": 0.1493174061433447,
"grad_norm": 20.428062438964844,
"learning_rate": 8.962264150943398e-06,
"loss": 3.2832,
"step": 350
},
{
"epoch": 0.14974402730375427,
"grad_norm": 18.96843147277832,
"learning_rate": 8.957771787960468e-06,
"loss": 2.7578,
"step": 351
},
{
"epoch": 0.15017064846416384,
"grad_norm": 51.70307540893555,
"learning_rate": 8.95327942497754e-06,
"loss": 3.1045,
"step": 352
},
{
"epoch": 0.15059726962457337,
"grad_norm": 24.539751052856445,
"learning_rate": 8.94878706199461e-06,
"loss": 3.3164,
"step": 353
},
{
"epoch": 0.15102389078498293,
"grad_norm": 28.489370346069336,
"learning_rate": 8.94429469901168e-06,
"loss": 2.8105,
"step": 354
},
{
"epoch": 0.1514505119453925,
"grad_norm": 67.5721664428711,
"learning_rate": 8.939802336028752e-06,
"loss": 3.4707,
"step": 355
},
{
"epoch": 0.15187713310580206,
"grad_norm": 19.821903228759766,
"learning_rate": 8.935309973045822e-06,
"loss": 3.2227,
"step": 356
},
{
"epoch": 0.1523037542662116,
"grad_norm": 27.4295711517334,
"learning_rate": 8.930817610062894e-06,
"loss": 2.8145,
"step": 357
},
{
"epoch": 0.15273037542662116,
"grad_norm": 42.77101516723633,
"learning_rate": 8.926325247079964e-06,
"loss": 2.791,
"step": 358
},
{
"epoch": 0.15315699658703072,
"grad_norm": 15.855666160583496,
"learning_rate": 8.921832884097036e-06,
"loss": 3.0332,
"step": 359
},
{
"epoch": 0.15358361774744028,
"grad_norm": 21.724477767944336,
"learning_rate": 8.917340521114106e-06,
"loss": 3.0322,
"step": 360
},
{
"epoch": 0.15401023890784982,
"grad_norm": 44.551021575927734,
"learning_rate": 8.912848158131178e-06,
"loss": 3.3057,
"step": 361
},
{
"epoch": 0.15443686006825938,
"grad_norm": 16.5596981048584,
"learning_rate": 8.908355795148249e-06,
"loss": 3.0547,
"step": 362
},
{
"epoch": 0.15486348122866894,
"grad_norm": 17.56165313720703,
"learning_rate": 8.903863432165319e-06,
"loss": 3.0039,
"step": 363
},
{
"epoch": 0.1552901023890785,
"grad_norm": 21.125158309936523,
"learning_rate": 8.89937106918239e-06,
"loss": 3.0469,
"step": 364
},
{
"epoch": 0.15571672354948807,
"grad_norm": 39.21709442138672,
"learning_rate": 8.89487870619946e-06,
"loss": 3.1875,
"step": 365
},
{
"epoch": 0.1561433447098976,
"grad_norm": 52.8671875,
"learning_rate": 8.890386343216533e-06,
"loss": 3.5781,
"step": 366
},
{
"epoch": 0.15656996587030717,
"grad_norm": 33.692012786865234,
"learning_rate": 8.885893980233603e-06,
"loss": 3.3535,
"step": 367
},
{
"epoch": 0.15699658703071673,
"grad_norm": 75.68614196777344,
"learning_rate": 8.881401617250675e-06,
"loss": 2.6094,
"step": 368
},
{
"epoch": 0.1574232081911263,
"grad_norm": 65.00382232666016,
"learning_rate": 8.876909254267747e-06,
"loss": 2.8457,
"step": 369
},
{
"epoch": 0.15784982935153583,
"grad_norm": 78.48975372314453,
"learning_rate": 8.872416891284817e-06,
"loss": 2.9648,
"step": 370
},
{
"epoch": 0.1582764505119454,
"grad_norm": 43.618648529052734,
"learning_rate": 8.867924528301887e-06,
"loss": 3.1523,
"step": 371
},
{
"epoch": 0.15870307167235495,
"grad_norm": 20.660226821899414,
"learning_rate": 8.863432165318957e-06,
"loss": 2.9531,
"step": 372
},
{
"epoch": 0.1591296928327645,
"grad_norm": 17.3546199798584,
"learning_rate": 8.85893980233603e-06,
"loss": 2.6953,
"step": 373
},
{
"epoch": 0.15955631399317405,
"grad_norm": 82.11502838134766,
"learning_rate": 8.8544474393531e-06,
"loss": 3.3164,
"step": 374
},
{
"epoch": 0.1599829351535836,
"grad_norm": 56.06876754760742,
"learning_rate": 8.849955076370171e-06,
"loss": 2.9805,
"step": 375
},
{
"epoch": 0.16040955631399317,
"grad_norm": 63.79014587402344,
"learning_rate": 8.845462713387243e-06,
"loss": 2.9902,
"step": 376
},
{
"epoch": 0.16083617747440274,
"grad_norm": 62.75901412963867,
"learning_rate": 8.840970350404313e-06,
"loss": 2.8008,
"step": 377
},
{
"epoch": 0.1612627986348123,
"grad_norm": 30.096982955932617,
"learning_rate": 8.836477987421385e-06,
"loss": 3.5918,
"step": 378
},
{
"epoch": 0.16168941979522183,
"grad_norm": 38.117801666259766,
"learning_rate": 8.831985624438455e-06,
"loss": 3.0928,
"step": 379
},
{
"epoch": 0.1621160409556314,
"grad_norm": 23.72600746154785,
"learning_rate": 8.827493261455526e-06,
"loss": 3.0742,
"step": 380
},
{
"epoch": 0.16254266211604096,
"grad_norm": 34.349876403808594,
"learning_rate": 8.823000898472598e-06,
"loss": 3.6162,
"step": 381
},
{
"epoch": 0.16296928327645052,
"grad_norm": 41.78531265258789,
"learning_rate": 8.818508535489668e-06,
"loss": 2.623,
"step": 382
},
{
"epoch": 0.16339590443686006,
"grad_norm": 16.38304328918457,
"learning_rate": 8.81401617250674e-06,
"loss": 2.7578,
"step": 383
},
{
"epoch": 0.16382252559726962,
"grad_norm": 26.40755844116211,
"learning_rate": 8.80952380952381e-06,
"loss": 2.9482,
"step": 384
},
{
"epoch": 0.16424914675767918,
"grad_norm": 64.92070007324219,
"learning_rate": 8.805031446540882e-06,
"loss": 3.1387,
"step": 385
},
{
"epoch": 0.16467576791808874,
"grad_norm": 50.635353088378906,
"learning_rate": 8.800539083557952e-06,
"loss": 3.2109,
"step": 386
},
{
"epoch": 0.1651023890784983,
"grad_norm": 20.982133865356445,
"learning_rate": 8.796046720575024e-06,
"loss": 2.9834,
"step": 387
},
{
"epoch": 0.16552901023890784,
"grad_norm": 70.14364624023438,
"learning_rate": 8.791554357592094e-06,
"loss": 3.8008,
"step": 388
},
{
"epoch": 0.1659556313993174,
"grad_norm": 19.195613861083984,
"learning_rate": 8.787061994609166e-06,
"loss": 3.3164,
"step": 389
},
{
"epoch": 0.16638225255972697,
"grad_norm": 22.438356399536133,
"learning_rate": 8.782569631626236e-06,
"loss": 3.0254,
"step": 390
},
{
"epoch": 0.16680887372013653,
"grad_norm": 18.336254119873047,
"learning_rate": 8.778077268643306e-06,
"loss": 2.877,
"step": 391
},
{
"epoch": 0.16723549488054607,
"grad_norm": 19.402578353881836,
"learning_rate": 8.773584905660378e-06,
"loss": 3.1582,
"step": 392
},
{
"epoch": 0.16766211604095563,
"grad_norm": 18.75274085998535,
"learning_rate": 8.769092542677448e-06,
"loss": 3.0508,
"step": 393
},
{
"epoch": 0.1680887372013652,
"grad_norm": 29.596078872680664,
"learning_rate": 8.76460017969452e-06,
"loss": 3.3203,
"step": 394
},
{
"epoch": 0.16851535836177475,
"grad_norm": 54.33580780029297,
"learning_rate": 8.76010781671159e-06,
"loss": 2.8652,
"step": 395
},
{
"epoch": 0.1689419795221843,
"grad_norm": 19.791545867919922,
"learning_rate": 8.755615453728662e-06,
"loss": 2.7363,
"step": 396
},
{
"epoch": 0.16936860068259385,
"grad_norm": 13.774062156677246,
"learning_rate": 8.751123090745734e-06,
"loss": 2.623,
"step": 397
},
{
"epoch": 0.1697952218430034,
"grad_norm": 16.633455276489258,
"learning_rate": 8.746630727762804e-06,
"loss": 3.0918,
"step": 398
},
{
"epoch": 0.17022184300341298,
"grad_norm": 19.359846115112305,
"learning_rate": 8.742138364779875e-06,
"loss": 2.9824,
"step": 399
},
{
"epoch": 0.17064846416382254,
"grad_norm": 20.94213104248047,
"learning_rate": 8.737646001796945e-06,
"loss": 2.7168,
"step": 400
},
{
"epoch": 0.17107508532423207,
"grad_norm": 36.21412658691406,
"learning_rate": 8.733153638814017e-06,
"loss": 3.0322,
"step": 401
},
{
"epoch": 0.17150170648464164,
"grad_norm": 20.138742446899414,
"learning_rate": 8.728661275831087e-06,
"loss": 2.9668,
"step": 402
},
{
"epoch": 0.1719283276450512,
"grad_norm": 23.036239624023438,
"learning_rate": 8.724168912848159e-06,
"loss": 2.6787,
"step": 403
},
{
"epoch": 0.17235494880546076,
"grad_norm": 23.585355758666992,
"learning_rate": 8.71967654986523e-06,
"loss": 2.5127,
"step": 404
},
{
"epoch": 0.1727815699658703,
"grad_norm": 39.485984802246094,
"learning_rate": 8.715184186882301e-06,
"loss": 3.0957,
"step": 405
},
{
"epoch": 0.17320819112627986,
"grad_norm": 24.583091735839844,
"learning_rate": 8.710691823899373e-06,
"loss": 2.998,
"step": 406
},
{
"epoch": 0.17363481228668942,
"grad_norm": 25.577939987182617,
"learning_rate": 8.706199460916443e-06,
"loss": 3.1807,
"step": 407
},
{
"epoch": 0.17406143344709898,
"grad_norm": 20.640886306762695,
"learning_rate": 8.701707097933513e-06,
"loss": 2.8564,
"step": 408
},
{
"epoch": 0.17448805460750852,
"grad_norm": 27.32965087890625,
"learning_rate": 8.697214734950583e-06,
"loss": 3.043,
"step": 409
},
{
"epoch": 0.17491467576791808,
"grad_norm": 23.118106842041016,
"learning_rate": 8.692722371967655e-06,
"loss": 3.002,
"step": 410
},
{
"epoch": 0.17534129692832764,
"grad_norm": 36.54021453857422,
"learning_rate": 8.688230008984727e-06,
"loss": 2.8516,
"step": 411
},
{
"epoch": 0.1757679180887372,
"grad_norm": 33.472747802734375,
"learning_rate": 8.683737646001797e-06,
"loss": 2.8984,
"step": 412
},
{
"epoch": 0.17619453924914677,
"grad_norm": 16.954639434814453,
"learning_rate": 8.67924528301887e-06,
"loss": 2.8779,
"step": 413
},
{
"epoch": 0.1766211604095563,
"grad_norm": 15.781172752380371,
"learning_rate": 8.67475292003594e-06,
"loss": 2.8623,
"step": 414
},
{
"epoch": 0.17704778156996587,
"grad_norm": 30.408344268798828,
"learning_rate": 8.670260557053011e-06,
"loss": 2.918,
"step": 415
},
{
"epoch": 0.17747440273037543,
"grad_norm": 20.8376522064209,
"learning_rate": 8.665768194070082e-06,
"loss": 2.8662,
"step": 416
},
{
"epoch": 0.177901023890785,
"grad_norm": 19.008398056030273,
"learning_rate": 8.661275831087152e-06,
"loss": 3.1709,
"step": 417
},
{
"epoch": 0.17832764505119453,
"grad_norm": 21.907400131225586,
"learning_rate": 8.656783468104224e-06,
"loss": 2.9385,
"step": 418
},
{
"epoch": 0.1787542662116041,
"grad_norm": 18.403654098510742,
"learning_rate": 8.652291105121294e-06,
"loss": 2.8965,
"step": 419
},
{
"epoch": 0.17918088737201365,
"grad_norm": 45.25904083251953,
"learning_rate": 8.647798742138366e-06,
"loss": 2.4941,
"step": 420
},
{
"epoch": 0.17960750853242322,
"grad_norm": 21.15122413635254,
"learning_rate": 8.643306379155436e-06,
"loss": 2.9277,
"step": 421
},
{
"epoch": 0.18003412969283278,
"grad_norm": 20.08800506591797,
"learning_rate": 8.638814016172508e-06,
"loss": 2.8164,
"step": 422
},
{
"epoch": 0.1804607508532423,
"grad_norm": 41.06690979003906,
"learning_rate": 8.634321653189578e-06,
"loss": 3.2324,
"step": 423
},
{
"epoch": 0.18088737201365188,
"grad_norm": 37.022769927978516,
"learning_rate": 8.62982929020665e-06,
"loss": 3.0547,
"step": 424
},
{
"epoch": 0.18131399317406144,
"grad_norm": 30.134794235229492,
"learning_rate": 8.62533692722372e-06,
"loss": 3.1455,
"step": 425
},
{
"epoch": 0.181740614334471,
"grad_norm": 29.732011795043945,
"learning_rate": 8.620844564240792e-06,
"loss": 3.1504,
"step": 426
},
{
"epoch": 0.18216723549488054,
"grad_norm": 27.267990112304688,
"learning_rate": 8.616352201257862e-06,
"loss": 3.0957,
"step": 427
},
{
"epoch": 0.1825938566552901,
"grad_norm": 20.294174194335938,
"learning_rate": 8.611859838274932e-06,
"loss": 3.2441,
"step": 428
},
{
"epoch": 0.18302047781569966,
"grad_norm": 59.82269287109375,
"learning_rate": 8.607367475292004e-06,
"loss": 2.7686,
"step": 429
},
{
"epoch": 0.18344709897610922,
"grad_norm": 13.050138473510742,
"learning_rate": 8.602875112309074e-06,
"loss": 3.1133,
"step": 430
},
{
"epoch": 0.18387372013651876,
"grad_norm": 23.325040817260742,
"learning_rate": 8.598382749326146e-06,
"loss": 3.1797,
"step": 431
},
{
"epoch": 0.18430034129692832,
"grad_norm": 42.300777435302734,
"learning_rate": 8.593890386343218e-06,
"loss": 3.1504,
"step": 432
},
{
"epoch": 0.18472696245733788,
"grad_norm": 17.584157943725586,
"learning_rate": 8.589398023360288e-06,
"loss": 2.8652,
"step": 433
},
{
"epoch": 0.18515358361774745,
"grad_norm": 25.537080764770508,
"learning_rate": 8.58490566037736e-06,
"loss": 3.1504,
"step": 434
},
{
"epoch": 0.185580204778157,
"grad_norm": 33.94193649291992,
"learning_rate": 8.58041329739443e-06,
"loss": 3.4141,
"step": 435
},
{
"epoch": 0.18600682593856654,
"grad_norm": 36.545509338378906,
"learning_rate": 8.5759209344115e-06,
"loss": 3.4863,
"step": 436
},
{
"epoch": 0.1864334470989761,
"grad_norm": 22.413768768310547,
"learning_rate": 8.571428571428571e-06,
"loss": 3.123,
"step": 437
},
{
"epoch": 0.18686006825938567,
"grad_norm": 20.436925888061523,
"learning_rate": 8.566936208445643e-06,
"loss": 3.0488,
"step": 438
},
{
"epoch": 0.18728668941979523,
"grad_norm": 82.86627960205078,
"learning_rate": 8.562443845462715e-06,
"loss": 3.2324,
"step": 439
},
{
"epoch": 0.18771331058020477,
"grad_norm": 85.6285629272461,
"learning_rate": 8.557951482479785e-06,
"loss": 3.5078,
"step": 440
},
{
"epoch": 0.18813993174061433,
"grad_norm": 90.78428649902344,
"learning_rate": 8.553459119496857e-06,
"loss": 2.9971,
"step": 441
},
{
"epoch": 0.1885665529010239,
"grad_norm": 64.85440063476562,
"learning_rate": 8.548966756513927e-06,
"loss": 3.4453,
"step": 442
},
{
"epoch": 0.18899317406143346,
"grad_norm": 21.0238094329834,
"learning_rate": 8.544474393530999e-06,
"loss": 2.6582,
"step": 443
},
{
"epoch": 0.189419795221843,
"grad_norm": 28.667882919311523,
"learning_rate": 8.539982030548069e-06,
"loss": 3.623,
"step": 444
},
{
"epoch": 0.18984641638225255,
"grad_norm": 55.42424392700195,
"learning_rate": 8.53548966756514e-06,
"loss": 3.0801,
"step": 445
},
{
"epoch": 0.19027303754266212,
"grad_norm": 20.951005935668945,
"learning_rate": 8.530997304582211e-06,
"loss": 2.6289,
"step": 446
},
{
"epoch": 0.19069965870307168,
"grad_norm": 92.44082641601562,
"learning_rate": 8.526504941599281e-06,
"loss": 3.8604,
"step": 447
},
{
"epoch": 0.19112627986348124,
"grad_norm": 49.035438537597656,
"learning_rate": 8.522012578616353e-06,
"loss": 3.3428,
"step": 448
},
{
"epoch": 0.19155290102389078,
"grad_norm": 34.45978546142578,
"learning_rate": 8.517520215633423e-06,
"loss": 3.3242,
"step": 449
},
{
"epoch": 0.19197952218430034,
"grad_norm": 20.125938415527344,
"learning_rate": 8.513027852650495e-06,
"loss": 2.8877,
"step": 450
},
{
"epoch": 0.1924061433447099,
"grad_norm": 27.46782684326172,
"learning_rate": 8.508535489667566e-06,
"loss": 3.1895,
"step": 451
},
{
"epoch": 0.19283276450511946,
"grad_norm": 59.36712646484375,
"learning_rate": 8.504043126684637e-06,
"loss": 3.3203,
"step": 452
},
{
"epoch": 0.193259385665529,
"grad_norm": 33.552188873291016,
"learning_rate": 8.499550763701708e-06,
"loss": 3.2256,
"step": 453
},
{
"epoch": 0.19368600682593856,
"grad_norm": 17.913942337036133,
"learning_rate": 8.495058400718778e-06,
"loss": 2.7002,
"step": 454
},
{
"epoch": 0.19411262798634812,
"grad_norm": 28.015735626220703,
"learning_rate": 8.49056603773585e-06,
"loss": 2.7158,
"step": 455
},
{
"epoch": 0.1945392491467577,
"grad_norm": 14.068643569946289,
"learning_rate": 8.48607367475292e-06,
"loss": 2.916,
"step": 456
},
{
"epoch": 0.19496587030716722,
"grad_norm": 19.000486373901367,
"learning_rate": 8.481581311769992e-06,
"loss": 2.8457,
"step": 457
},
{
"epoch": 0.19539249146757678,
"grad_norm": 19.684755325317383,
"learning_rate": 8.477088948787062e-06,
"loss": 3.1113,
"step": 458
},
{
"epoch": 0.19581911262798635,
"grad_norm": 25.133827209472656,
"learning_rate": 8.472596585804134e-06,
"loss": 2.2959,
"step": 459
},
{
"epoch": 0.1962457337883959,
"grad_norm": 35.57670593261719,
"learning_rate": 8.468104222821206e-06,
"loss": 3.082,
"step": 460
},
{
"epoch": 0.19667235494880547,
"grad_norm": 58.25027847290039,
"learning_rate": 8.463611859838276e-06,
"loss": 2.9502,
"step": 461
},
{
"epoch": 0.197098976109215,
"grad_norm": 50.10265350341797,
"learning_rate": 8.459119496855346e-06,
"loss": 3.1689,
"step": 462
},
{
"epoch": 0.19752559726962457,
"grad_norm": 22.792301177978516,
"learning_rate": 8.454627133872418e-06,
"loss": 2.7754,
"step": 463
},
{
"epoch": 0.19795221843003413,
"grad_norm": 24.666540145874023,
"learning_rate": 8.450134770889488e-06,
"loss": 2.7695,
"step": 464
},
{
"epoch": 0.1983788395904437,
"grad_norm": 22.577451705932617,
"learning_rate": 8.445642407906558e-06,
"loss": 2.8379,
"step": 465
},
{
"epoch": 0.19880546075085323,
"grad_norm": 71.5382308959961,
"learning_rate": 8.44115004492363e-06,
"loss": 3.3887,
"step": 466
},
{
"epoch": 0.1992320819112628,
"grad_norm": 55.62384033203125,
"learning_rate": 8.436657681940702e-06,
"loss": 3.293,
"step": 467
},
{
"epoch": 0.19965870307167236,
"grad_norm": 33.4422492980957,
"learning_rate": 8.432165318957772e-06,
"loss": 3.4922,
"step": 468
},
{
"epoch": 0.20008532423208192,
"grad_norm": 65.28173828125,
"learning_rate": 8.427672955974844e-06,
"loss": 3.0293,
"step": 469
},
{
"epoch": 0.20051194539249148,
"grad_norm": 22.559324264526367,
"learning_rate": 8.423180592991915e-06,
"loss": 2.7207,
"step": 470
},
{
"epoch": 0.20093856655290102,
"grad_norm": 25.741518020629883,
"learning_rate": 8.418688230008986e-06,
"loss": 2.6143,
"step": 471
},
{
"epoch": 0.20136518771331058,
"grad_norm": 77.9397201538086,
"learning_rate": 8.414195867026057e-06,
"loss": 3.1191,
"step": 472
},
{
"epoch": 0.20179180887372014,
"grad_norm": 96.0541000366211,
"learning_rate": 8.409703504043127e-06,
"loss": 3.2148,
"step": 473
},
{
"epoch": 0.2022184300341297,
"grad_norm": 84.57242584228516,
"learning_rate": 8.405211141060199e-06,
"loss": 3.1709,
"step": 474
},
{
"epoch": 0.20264505119453924,
"grad_norm": 60.31303024291992,
"learning_rate": 8.400718778077269e-06,
"loss": 3.1729,
"step": 475
},
{
"epoch": 0.2030716723549488,
"grad_norm": 44.18376159667969,
"learning_rate": 8.39622641509434e-06,
"loss": 3.0234,
"step": 476
},
{
"epoch": 0.20349829351535836,
"grad_norm": 35.493770599365234,
"learning_rate": 8.391734052111411e-06,
"loss": 2.8408,
"step": 477
},
{
"epoch": 0.20392491467576793,
"grad_norm": 22.417808532714844,
"learning_rate": 8.387241689128483e-06,
"loss": 2.8154,
"step": 478
},
{
"epoch": 0.20435153583617746,
"grad_norm": 16.359333038330078,
"learning_rate": 8.382749326145553e-06,
"loss": 2.6514,
"step": 479
},
{
"epoch": 0.20477815699658702,
"grad_norm": 80.31161499023438,
"learning_rate": 8.378256963162625e-06,
"loss": 3.1973,
"step": 480
},
{
"epoch": 0.2052047781569966,
"grad_norm": 56.79182815551758,
"learning_rate": 8.373764600179695e-06,
"loss": 3.2148,
"step": 481
},
{
"epoch": 0.20563139931740615,
"grad_norm": 79.41356658935547,
"learning_rate": 8.369272237196765e-06,
"loss": 3.1064,
"step": 482
},
{
"epoch": 0.2060580204778157,
"grad_norm": 92.82357025146484,
"learning_rate": 8.364779874213837e-06,
"loss": 2.8896,
"step": 483
},
{
"epoch": 0.20648464163822525,
"grad_norm": 63.247501373291016,
"learning_rate": 8.360287511230907e-06,
"loss": 3.0967,
"step": 484
},
{
"epoch": 0.2069112627986348,
"grad_norm": 21.605344772338867,
"learning_rate": 8.35579514824798e-06,
"loss": 2.9824,
"step": 485
},
{
"epoch": 0.20733788395904437,
"grad_norm": 20.009994506835938,
"learning_rate": 8.35130278526505e-06,
"loss": 2.9199,
"step": 486
},
{
"epoch": 0.20776450511945393,
"grad_norm": 23.621641159057617,
"learning_rate": 8.346810422282121e-06,
"loss": 3.0059,
"step": 487
},
{
"epoch": 0.20819112627986347,
"grad_norm": 16.526639938354492,
"learning_rate": 8.342318059299193e-06,
"loss": 2.3086,
"step": 488
},
{
"epoch": 0.20861774744027303,
"grad_norm": 50.59242630004883,
"learning_rate": 8.337825696316264e-06,
"loss": 3.3828,
"step": 489
},
{
"epoch": 0.2090443686006826,
"grad_norm": 21.509166717529297,
"learning_rate": 8.333333333333334e-06,
"loss": 2.293,
"step": 490
},
{
"epoch": 0.20947098976109216,
"grad_norm": 60.66617965698242,
"learning_rate": 8.328840970350404e-06,
"loss": 2.9844,
"step": 491
},
{
"epoch": 0.2098976109215017,
"grad_norm": 48.574520111083984,
"learning_rate": 8.324348607367476e-06,
"loss": 3.2734,
"step": 492
},
{
"epoch": 0.21032423208191126,
"grad_norm": 33.63777160644531,
"learning_rate": 8.319856244384546e-06,
"loss": 2.8203,
"step": 493
},
{
"epoch": 0.21075085324232082,
"grad_norm": 28.87843132019043,
"learning_rate": 8.315363881401618e-06,
"loss": 2.6367,
"step": 494
},
{
"epoch": 0.21117747440273038,
"grad_norm": 27.04725456237793,
"learning_rate": 8.31087151841869e-06,
"loss": 2.9414,
"step": 495
},
{
"epoch": 0.21160409556313994,
"grad_norm": 25.990129470825195,
"learning_rate": 8.30637915543576e-06,
"loss": 2.583,
"step": 496
},
{
"epoch": 0.21203071672354948,
"grad_norm": 19.33521270751953,
"learning_rate": 8.301886792452832e-06,
"loss": 2.9385,
"step": 497
},
{
"epoch": 0.21245733788395904,
"grad_norm": 55.301395416259766,
"learning_rate": 8.297394429469902e-06,
"loss": 2.6992,
"step": 498
},
{
"epoch": 0.2128839590443686,
"grad_norm": 36.61276626586914,
"learning_rate": 8.292902066486972e-06,
"loss": 3.0576,
"step": 499
},
{
"epoch": 0.21331058020477817,
"grad_norm": 68.78702545166016,
"learning_rate": 8.288409703504044e-06,
"loss": 3.4453,
"step": 500
},
{
"epoch": 0.2137372013651877,
"grad_norm": 42.436580657958984,
"learning_rate": 8.283917340521114e-06,
"loss": 2.8301,
"step": 501
},
{
"epoch": 0.21416382252559726,
"grad_norm": 26.847007751464844,
"learning_rate": 8.279424977538186e-06,
"loss": 3.5859,
"step": 502
},
{
"epoch": 0.21459044368600683,
"grad_norm": 20.354631423950195,
"learning_rate": 8.274932614555256e-06,
"loss": 3.1436,
"step": 503
},
{
"epoch": 0.2150170648464164,
"grad_norm": 21.232385635375977,
"learning_rate": 8.270440251572328e-06,
"loss": 3.1562,
"step": 504
},
{
"epoch": 0.21544368600682595,
"grad_norm": 14.351090431213379,
"learning_rate": 8.265947888589399e-06,
"loss": 2.6172,
"step": 505
},
{
"epoch": 0.2158703071672355,
"grad_norm": 39.42547607421875,
"learning_rate": 8.26145552560647e-06,
"loss": 2.7881,
"step": 506
},
{
"epoch": 0.21629692832764505,
"grad_norm": 49.7579460144043,
"learning_rate": 8.25696316262354e-06,
"loss": 3.2529,
"step": 507
},
{
"epoch": 0.2167235494880546,
"grad_norm": 45.9566535949707,
"learning_rate": 8.252470799640613e-06,
"loss": 2.3877,
"step": 508
},
{
"epoch": 0.21715017064846417,
"grad_norm": 16.314781188964844,
"learning_rate": 8.247978436657683e-06,
"loss": 3.1973,
"step": 509
},
{
"epoch": 0.2175767918088737,
"grad_norm": 17.71656608581543,
"learning_rate": 8.243486073674753e-06,
"loss": 2.8887,
"step": 510
},
{
"epoch": 0.21800341296928327,
"grad_norm": 17.876148223876953,
"learning_rate": 8.238993710691825e-06,
"loss": 3.207,
"step": 511
},
{
"epoch": 0.21843003412969283,
"grad_norm": 29.692302703857422,
"learning_rate": 8.234501347708895e-06,
"loss": 2.8604,
"step": 512
},
{
"epoch": 0.2188566552901024,
"grad_norm": 28.06431770324707,
"learning_rate": 8.230008984725967e-06,
"loss": 2.8867,
"step": 513
},
{
"epoch": 0.21928327645051193,
"grad_norm": 46.33169937133789,
"learning_rate": 8.225516621743037e-06,
"loss": 3.5039,
"step": 514
},
{
"epoch": 0.2197098976109215,
"grad_norm": 25.718185424804688,
"learning_rate": 8.221024258760109e-06,
"loss": 3.1055,
"step": 515
},
{
"epoch": 0.22013651877133106,
"grad_norm": 17.153024673461914,
"learning_rate": 8.21653189577718e-06,
"loss": 2.9961,
"step": 516
},
{
"epoch": 0.22056313993174062,
"grad_norm": 18.76894760131836,
"learning_rate": 8.212039532794251e-06,
"loss": 2.6982,
"step": 517
},
{
"epoch": 0.22098976109215018,
"grad_norm": 34.840484619140625,
"learning_rate": 8.207547169811321e-06,
"loss": 2.9414,
"step": 518
},
{
"epoch": 0.22141638225255972,
"grad_norm": 44.31029510498047,
"learning_rate": 8.203054806828391e-06,
"loss": 3.1445,
"step": 519
},
{
"epoch": 0.22184300341296928,
"grad_norm": 14.775177955627441,
"learning_rate": 8.198562443845463e-06,
"loss": 2.6436,
"step": 520
},
{
"epoch": 0.22226962457337884,
"grad_norm": 45.092811584472656,
"learning_rate": 8.194070080862534e-06,
"loss": 3.0488,
"step": 521
},
{
"epoch": 0.2226962457337884,
"grad_norm": 18.923681259155273,
"learning_rate": 8.189577717879605e-06,
"loss": 3.1758,
"step": 522
},
{
"epoch": 0.22312286689419794,
"grad_norm": 26.43110466003418,
"learning_rate": 8.185085354896676e-06,
"loss": 2.4033,
"step": 523
},
{
"epoch": 0.2235494880546075,
"grad_norm": 20.219383239746094,
"learning_rate": 8.180592991913748e-06,
"loss": 2.7607,
"step": 524
},
{
"epoch": 0.22397610921501707,
"grad_norm": 17.233644485473633,
"learning_rate": 8.17610062893082e-06,
"loss": 3.0059,
"step": 525
},
{
"epoch": 0.22440273037542663,
"grad_norm": 30.3442325592041,
"learning_rate": 8.17160826594789e-06,
"loss": 2.4814,
"step": 526
},
{
"epoch": 0.22482935153583616,
"grad_norm": 26.53208351135254,
"learning_rate": 8.16711590296496e-06,
"loss": 2.7441,
"step": 527
},
{
"epoch": 0.22525597269624573,
"grad_norm": 34.5225944519043,
"learning_rate": 8.16262353998203e-06,
"loss": 3.0029,
"step": 528
},
{
"epoch": 0.2256825938566553,
"grad_norm": 24.5207576751709,
"learning_rate": 8.158131176999102e-06,
"loss": 2.7129,
"step": 529
},
{
"epoch": 0.22610921501706485,
"grad_norm": 26.960975646972656,
"learning_rate": 8.153638814016172e-06,
"loss": 2.8838,
"step": 530
},
{
"epoch": 0.22653583617747441,
"grad_norm": 27.879377365112305,
"learning_rate": 8.149146451033244e-06,
"loss": 3.125,
"step": 531
},
{
"epoch": 0.22696245733788395,
"grad_norm": 33.46174621582031,
"learning_rate": 8.144654088050316e-06,
"loss": 2.877,
"step": 532
},
{
"epoch": 0.2273890784982935,
"grad_norm": 17.529651641845703,
"learning_rate": 8.140161725067386e-06,
"loss": 2.6582,
"step": 533
},
{
"epoch": 0.22781569965870307,
"grad_norm": 28.782405853271484,
"learning_rate": 8.135669362084458e-06,
"loss": 2.9971,
"step": 534
},
{
"epoch": 0.22824232081911264,
"grad_norm": 19.45545768737793,
"learning_rate": 8.131176999101528e-06,
"loss": 2.6934,
"step": 535
},
{
"epoch": 0.22866894197952217,
"grad_norm": 31.947803497314453,
"learning_rate": 8.126684636118598e-06,
"loss": 3.084,
"step": 536
},
{
"epoch": 0.22909556313993173,
"grad_norm": 46.25978469848633,
"learning_rate": 8.12219227313567e-06,
"loss": 2.7988,
"step": 537
},
{
"epoch": 0.2295221843003413,
"grad_norm": 37.96335220336914,
"learning_rate": 8.11769991015274e-06,
"loss": 3.2832,
"step": 538
},
{
"epoch": 0.22994880546075086,
"grad_norm": 51.27415466308594,
"learning_rate": 8.113207547169812e-06,
"loss": 3.0996,
"step": 539
},
{
"epoch": 0.23037542662116042,
"grad_norm": 27.133466720581055,
"learning_rate": 8.108715184186883e-06,
"loss": 2.8252,
"step": 540
},
{
"epoch": 0.23080204778156996,
"grad_norm": 15.50619888305664,
"learning_rate": 8.104222821203954e-06,
"loss": 2.8223,
"step": 541
},
{
"epoch": 0.23122866894197952,
"grad_norm": 27.35281753540039,
"learning_rate": 8.099730458221025e-06,
"loss": 2.7988,
"step": 542
},
{
"epoch": 0.23165529010238908,
"grad_norm": 27.582168579101562,
"learning_rate": 8.095238095238097e-06,
"loss": 2.8779,
"step": 543
},
{
"epoch": 0.23208191126279865,
"grad_norm": 46.383941650390625,
"learning_rate": 8.090745732255167e-06,
"loss": 2.6719,
"step": 544
},
{
"epoch": 0.23250853242320818,
"grad_norm": 24.81082534790039,
"learning_rate": 8.086253369272239e-06,
"loss": 3.6602,
"step": 545
},
{
"epoch": 0.23293515358361774,
"grad_norm": 19.902307510375977,
"learning_rate": 8.081761006289309e-06,
"loss": 2.6816,
"step": 546
},
{
"epoch": 0.2333617747440273,
"grad_norm": 30.1766357421875,
"learning_rate": 8.077268643306379e-06,
"loss": 3.2646,
"step": 547
},
{
"epoch": 0.23378839590443687,
"grad_norm": 18.741374969482422,
"learning_rate": 8.072776280323451e-06,
"loss": 3.4355,
"step": 548
},
{
"epoch": 0.2342150170648464,
"grad_norm": 16.75374984741211,
"learning_rate": 8.068283917340521e-06,
"loss": 2.5967,
"step": 549
},
{
"epoch": 0.23464163822525597,
"grad_norm": 17.990034103393555,
"learning_rate": 8.063791554357593e-06,
"loss": 2.6309,
"step": 550
},
{
"epoch": 0.23506825938566553,
"grad_norm": 17.261295318603516,
"learning_rate": 8.059299191374663e-06,
"loss": 2.9277,
"step": 551
},
{
"epoch": 0.2354948805460751,
"grad_norm": 29.741744995117188,
"learning_rate": 8.054806828391735e-06,
"loss": 3.4238,
"step": 552
},
{
"epoch": 0.23592150170648465,
"grad_norm": 42.073055267333984,
"learning_rate": 8.050314465408805e-06,
"loss": 3.8125,
"step": 553
},
{
"epoch": 0.2363481228668942,
"grad_norm": 15.395310401916504,
"learning_rate": 8.045822102425877e-06,
"loss": 3.0898,
"step": 554
},
{
"epoch": 0.23677474402730375,
"grad_norm": 13.185260772705078,
"learning_rate": 8.041329739442947e-06,
"loss": 3.1797,
"step": 555
},
{
"epoch": 0.23720136518771331,
"grad_norm": 57.49574661254883,
"learning_rate": 8.036837376460018e-06,
"loss": 3.0664,
"step": 556
},
{
"epoch": 0.23762798634812288,
"grad_norm": 19.3037109375,
"learning_rate": 8.03234501347709e-06,
"loss": 2.8164,
"step": 557
},
{
"epoch": 0.2380546075085324,
"grad_norm": 13.614612579345703,
"learning_rate": 8.02785265049416e-06,
"loss": 2.8848,
"step": 558
},
{
"epoch": 0.23848122866894197,
"grad_norm": 36.09251403808594,
"learning_rate": 8.023360287511232e-06,
"loss": 2.8193,
"step": 559
},
{
"epoch": 0.23890784982935154,
"grad_norm": 38.101280212402344,
"learning_rate": 8.018867924528303e-06,
"loss": 2.9141,
"step": 560
},
{
"epoch": 0.2393344709897611,
"grad_norm": 30.57044792175293,
"learning_rate": 8.014375561545374e-06,
"loss": 2.9688,
"step": 561
},
{
"epoch": 0.23976109215017063,
"grad_norm": 55.0594367980957,
"learning_rate": 8.009883198562446e-06,
"loss": 3.2461,
"step": 562
},
{
"epoch": 0.2401877133105802,
"grad_norm": 20.185823440551758,
"learning_rate": 8.005390835579516e-06,
"loss": 2.8135,
"step": 563
},
{
"epoch": 0.24061433447098976,
"grad_norm": 23.831144332885742,
"learning_rate": 8.000898472596586e-06,
"loss": 2.6895,
"step": 564
},
{
"epoch": 0.24104095563139932,
"grad_norm": 65.2356948852539,
"learning_rate": 7.996406109613656e-06,
"loss": 2.8037,
"step": 565
},
{
"epoch": 0.24146757679180889,
"grad_norm": 47.4893684387207,
"learning_rate": 7.991913746630728e-06,
"loss": 3.1465,
"step": 566
},
{
"epoch": 0.24189419795221842,
"grad_norm": 37.859413146972656,
"learning_rate": 7.9874213836478e-06,
"loss": 3.0283,
"step": 567
},
{
"epoch": 0.24232081911262798,
"grad_norm": 26.822053909301758,
"learning_rate": 7.98292902066487e-06,
"loss": 3.1855,
"step": 568
},
{
"epoch": 0.24274744027303755,
"grad_norm": 17.800918579101562,
"learning_rate": 7.978436657681942e-06,
"loss": 2.7031,
"step": 569
},
{
"epoch": 0.2431740614334471,
"grad_norm": 55.43043518066406,
"learning_rate": 7.973944294699012e-06,
"loss": 3.0273,
"step": 570
},
{
"epoch": 0.24360068259385664,
"grad_norm": 24.088314056396484,
"learning_rate": 7.969451931716084e-06,
"loss": 3.166,
"step": 571
},
{
"epoch": 0.2440273037542662,
"grad_norm": 57.95652389526367,
"learning_rate": 7.964959568733154e-06,
"loss": 2.7227,
"step": 572
},
{
"epoch": 0.24445392491467577,
"grad_norm": 43.33866500854492,
"learning_rate": 7.960467205750224e-06,
"loss": 2.7988,
"step": 573
},
{
"epoch": 0.24488054607508533,
"grad_norm": 46.34353256225586,
"learning_rate": 7.955974842767296e-06,
"loss": 3.0703,
"step": 574
},
{
"epoch": 0.2453071672354949,
"grad_norm": 35.27037048339844,
"learning_rate": 7.951482479784367e-06,
"loss": 3.1719,
"step": 575
},
{
"epoch": 0.24573378839590443,
"grad_norm": 21.704389572143555,
"learning_rate": 7.946990116801438e-06,
"loss": 2.4922,
"step": 576
},
{
"epoch": 0.246160409556314,
"grad_norm": 23.15915298461914,
"learning_rate": 7.942497753818509e-06,
"loss": 2.7568,
"step": 577
},
{
"epoch": 0.24658703071672355,
"grad_norm": 51.94176483154297,
"learning_rate": 7.93800539083558e-06,
"loss": 2.3887,
"step": 578
},
{
"epoch": 0.24701365187713312,
"grad_norm": 24.061573028564453,
"learning_rate": 7.93351302785265e-06,
"loss": 2.8096,
"step": 579
},
{
"epoch": 0.24744027303754265,
"grad_norm": 15.878485679626465,
"learning_rate": 7.929020664869723e-06,
"loss": 2.4375,
"step": 580
},
{
"epoch": 0.24786689419795221,
"grad_norm": 19.481571197509766,
"learning_rate": 7.924528301886793e-06,
"loss": 2.9883,
"step": 581
},
{
"epoch": 0.24829351535836178,
"grad_norm": 31.10467529296875,
"learning_rate": 7.920035938903865e-06,
"loss": 2.2686,
"step": 582
},
{
"epoch": 0.24872013651877134,
"grad_norm": 19.816104888916016,
"learning_rate": 7.915543575920935e-06,
"loss": 2.6562,
"step": 583
},
{
"epoch": 0.24914675767918087,
"grad_norm": 18.393310546875,
"learning_rate": 7.911051212938005e-06,
"loss": 2.8135,
"step": 584
},
{
"epoch": 0.24957337883959044,
"grad_norm": 38.82542419433594,
"learning_rate": 7.906558849955077e-06,
"loss": 2.5684,
"step": 585
},
{
"epoch": 0.25,
"grad_norm": 89.28064727783203,
"learning_rate": 7.902066486972147e-06,
"loss": 3.9434,
"step": 586
},
{
"epoch": 0.25042662116040953,
"grad_norm": 66.42613983154297,
"learning_rate": 7.897574123989219e-06,
"loss": 3.1797,
"step": 587
},
{
"epoch": 0.2508532423208191,
"grad_norm": 85.55559539794922,
"learning_rate": 7.893081761006291e-06,
"loss": 3.2402,
"step": 588
},
{
"epoch": 0.25127986348122866,
"grad_norm": 64.09162902832031,
"learning_rate": 7.888589398023361e-06,
"loss": 3.3799,
"step": 589
},
{
"epoch": 0.25170648464163825,
"grad_norm": 18.269323348999023,
"learning_rate": 7.884097035040431e-06,
"loss": 2.7402,
"step": 590
},
{
"epoch": 0.2521331058020478,
"grad_norm": 27.09288787841797,
"learning_rate": 7.879604672057503e-06,
"loss": 2.7715,
"step": 591
},
{
"epoch": 0.2525597269624573,
"grad_norm": 23.382566452026367,
"learning_rate": 7.875112309074573e-06,
"loss": 2.8662,
"step": 592
},
{
"epoch": 0.2529863481228669,
"grad_norm": 34.532344818115234,
"learning_rate": 7.870619946091644e-06,
"loss": 2.6865,
"step": 593
},
{
"epoch": 0.25341296928327645,
"grad_norm": 64.53623962402344,
"learning_rate": 7.866127583108716e-06,
"loss": 2.9072,
"step": 594
},
{
"epoch": 0.253839590443686,
"grad_norm": 36.02356719970703,
"learning_rate": 7.861635220125787e-06,
"loss": 2.6055,
"step": 595
},
{
"epoch": 0.25426621160409557,
"grad_norm": 77.93729400634766,
"learning_rate": 7.857142857142858e-06,
"loss": 3.1201,
"step": 596
},
{
"epoch": 0.2546928327645051,
"grad_norm": 33.50895309448242,
"learning_rate": 7.85265049415993e-06,
"loss": 3.1641,
"step": 597
},
{
"epoch": 0.2551194539249147,
"grad_norm": 21.22097396850586,
"learning_rate": 7.848158131177e-06,
"loss": 3.252,
"step": 598
},
{
"epoch": 0.25554607508532423,
"grad_norm": 28.865007400512695,
"learning_rate": 7.843665768194072e-06,
"loss": 3.2734,
"step": 599
},
{
"epoch": 0.25597269624573377,
"grad_norm": 24.108217239379883,
"learning_rate": 7.839173405211142e-06,
"loss": 2.8125,
"step": 600
},
{
"epoch": 0.25639931740614336,
"grad_norm": 38.636558532714844,
"learning_rate": 7.834681042228212e-06,
"loss": 3.1211,
"step": 601
},
{
"epoch": 0.2568259385665529,
"grad_norm": 20.850168228149414,
"learning_rate": 7.830188679245284e-06,
"loss": 3.4316,
"step": 602
},
{
"epoch": 0.2572525597269625,
"grad_norm": 40.4669075012207,
"learning_rate": 7.825696316262354e-06,
"loss": 2.4248,
"step": 603
},
{
"epoch": 0.257679180887372,
"grad_norm": 21.405136108398438,
"learning_rate": 7.821203953279426e-06,
"loss": 2.7734,
"step": 604
},
{
"epoch": 0.25810580204778155,
"grad_norm": 16.794015884399414,
"learning_rate": 7.816711590296496e-06,
"loss": 2.4277,
"step": 605
},
{
"epoch": 0.25853242320819114,
"grad_norm": 16.479524612426758,
"learning_rate": 7.812219227313568e-06,
"loss": 2.8281,
"step": 606
},
{
"epoch": 0.2589590443686007,
"grad_norm": 38.69625473022461,
"learning_rate": 7.807726864330638e-06,
"loss": 3.0762,
"step": 607
},
{
"epoch": 0.2593856655290102,
"grad_norm": 64.19967651367188,
"learning_rate": 7.80323450134771e-06,
"loss": 3.1436,
"step": 608
},
{
"epoch": 0.2598122866894198,
"grad_norm": 31.575109481811523,
"learning_rate": 7.79874213836478e-06,
"loss": 2.5459,
"step": 609
},
{
"epoch": 0.26023890784982934,
"grad_norm": 46.65521240234375,
"learning_rate": 7.79424977538185e-06,
"loss": 3.0957,
"step": 610
},
{
"epoch": 0.2606655290102389,
"grad_norm": 15.696451187133789,
"learning_rate": 7.789757412398922e-06,
"loss": 3.0127,
"step": 611
},
{
"epoch": 0.26109215017064846,
"grad_norm": 30.369564056396484,
"learning_rate": 7.785265049415993e-06,
"loss": 2.3926,
"step": 612
},
{
"epoch": 0.261518771331058,
"grad_norm": 36.5367546081543,
"learning_rate": 7.780772686433065e-06,
"loss": 3.1211,
"step": 613
},
{
"epoch": 0.2619453924914676,
"grad_norm": 52.71562957763672,
"learning_rate": 7.776280323450135e-06,
"loss": 2.4541,
"step": 614
},
{
"epoch": 0.2623720136518771,
"grad_norm": 17.766803741455078,
"learning_rate": 7.771787960467207e-06,
"loss": 2.8086,
"step": 615
},
{
"epoch": 0.2627986348122867,
"grad_norm": 22.81015968322754,
"learning_rate": 7.767295597484279e-06,
"loss": 2.5469,
"step": 616
},
{
"epoch": 0.26322525597269625,
"grad_norm": 26.944904327392578,
"learning_rate": 7.762803234501349e-06,
"loss": 3.3516,
"step": 617
},
{
"epoch": 0.2636518771331058,
"grad_norm": 30.520727157592773,
"learning_rate": 7.758310871518419e-06,
"loss": 2.5283,
"step": 618
},
{
"epoch": 0.2640784982935154,
"grad_norm": 45.16160583496094,
"learning_rate": 7.75381850853549e-06,
"loss": 2.9727,
"step": 619
},
{
"epoch": 0.2645051194539249,
"grad_norm": 48.80204391479492,
"learning_rate": 7.749326145552561e-06,
"loss": 3.4336,
"step": 620
},
{
"epoch": 0.26493174061433444,
"grad_norm": 20.030519485473633,
"learning_rate": 7.744833782569631e-06,
"loss": 3.083,
"step": 621
},
{
"epoch": 0.26535836177474403,
"grad_norm": 37.67460250854492,
"learning_rate": 7.740341419586703e-06,
"loss": 2.7139,
"step": 622
},
{
"epoch": 0.26578498293515357,
"grad_norm": 27.934261322021484,
"learning_rate": 7.735849056603775e-06,
"loss": 2.9209,
"step": 623
},
{
"epoch": 0.26621160409556316,
"grad_norm": 32.7962646484375,
"learning_rate": 7.731356693620845e-06,
"loss": 2.2637,
"step": 624
},
{
"epoch": 0.2666382252559727,
"grad_norm": 21.026763916015625,
"learning_rate": 7.726864330637917e-06,
"loss": 3.0254,
"step": 625
},
{
"epoch": 0.26706484641638223,
"grad_norm": 20.621740341186523,
"learning_rate": 7.722371967654987e-06,
"loss": 2.8926,
"step": 626
},
{
"epoch": 0.2674914675767918,
"grad_norm": 20.117944717407227,
"learning_rate": 7.717879604672058e-06,
"loss": 3.3389,
"step": 627
},
{
"epoch": 0.26791808873720135,
"grad_norm": 16.37653160095215,
"learning_rate": 7.71338724168913e-06,
"loss": 2.5674,
"step": 628
},
{
"epoch": 0.26834470989761094,
"grad_norm": 17.15595817565918,
"learning_rate": 7.7088948787062e-06,
"loss": 2.8652,
"step": 629
},
{
"epoch": 0.2687713310580205,
"grad_norm": 29.244970321655273,
"learning_rate": 7.704402515723271e-06,
"loss": 3.0859,
"step": 630
},
{
"epoch": 0.26919795221843,
"grad_norm": 17.230205535888672,
"learning_rate": 7.699910152740342e-06,
"loss": 2.2197,
"step": 631
},
{
"epoch": 0.2696245733788396,
"grad_norm": 18.00596046447754,
"learning_rate": 7.695417789757414e-06,
"loss": 2.498,
"step": 632
},
{
"epoch": 0.27005119453924914,
"grad_norm": 27.735151290893555,
"learning_rate": 7.690925426774484e-06,
"loss": 2.8369,
"step": 633
},
{
"epoch": 0.27047781569965873,
"grad_norm": 18.78544044494629,
"learning_rate": 7.686433063791556e-06,
"loss": 2.9736,
"step": 634
},
{
"epoch": 0.27090443686006827,
"grad_norm": 44.827117919921875,
"learning_rate": 7.681940700808626e-06,
"loss": 2.9199,
"step": 635
},
{
"epoch": 0.2713310580204778,
"grad_norm": 36.48609924316406,
"learning_rate": 7.677448337825698e-06,
"loss": 2.7637,
"step": 636
},
{
"epoch": 0.2717576791808874,
"grad_norm": 18.71845245361328,
"learning_rate": 7.672955974842768e-06,
"loss": 2.8975,
"step": 637
},
{
"epoch": 0.2721843003412969,
"grad_norm": 21.344680786132812,
"learning_rate": 7.668463611859838e-06,
"loss": 2.9189,
"step": 638
},
{
"epoch": 0.27261092150170646,
"grad_norm": 21.78455924987793,
"learning_rate": 7.66397124887691e-06,
"loss": 2.7725,
"step": 639
},
{
"epoch": 0.27303754266211605,
"grad_norm": 29.85883140563965,
"learning_rate": 7.65947888589398e-06,
"loss": 2.5283,
"step": 640
},
{
"epoch": 0.2734641638225256,
"grad_norm": 20.216970443725586,
"learning_rate": 7.654986522911052e-06,
"loss": 3.0732,
"step": 641
},
{
"epoch": 0.2738907849829352,
"grad_norm": 24.468124389648438,
"learning_rate": 7.650494159928122e-06,
"loss": 3.3447,
"step": 642
},
{
"epoch": 0.2743174061433447,
"grad_norm": 23.012601852416992,
"learning_rate": 7.646001796945194e-06,
"loss": 2.9512,
"step": 643
},
{
"epoch": 0.27474402730375425,
"grad_norm": 20.536497116088867,
"learning_rate": 7.641509433962266e-06,
"loss": 2.6768,
"step": 644
},
{
"epoch": 0.27517064846416384,
"grad_norm": 29.968305587768555,
"learning_rate": 7.637017070979336e-06,
"loss": 2.3643,
"step": 645
},
{
"epoch": 0.27559726962457337,
"grad_norm": 34.39928436279297,
"learning_rate": 7.632524707996406e-06,
"loss": 3.3252,
"step": 646
},
{
"epoch": 0.27602389078498296,
"grad_norm": 23.07245635986328,
"learning_rate": 7.6280323450134775e-06,
"loss": 3.1465,
"step": 647
},
{
"epoch": 0.2764505119453925,
"grad_norm": 52.81306457519531,
"learning_rate": 7.623539982030549e-06,
"loss": 2.874,
"step": 648
},
{
"epoch": 0.27687713310580203,
"grad_norm": 13.939685821533203,
"learning_rate": 7.61904761904762e-06,
"loss": 2.7227,
"step": 649
},
{
"epoch": 0.2773037542662116,
"grad_norm": 16.605741500854492,
"learning_rate": 7.614555256064691e-06,
"loss": 2.6914,
"step": 650
},
{
"epoch": 0.27773037542662116,
"grad_norm": 18.800424575805664,
"learning_rate": 7.6100628930817626e-06,
"loss": 2.6963,
"step": 651
},
{
"epoch": 0.2781569965870307,
"grad_norm": 51.422508239746094,
"learning_rate": 7.605570530098833e-06,
"loss": 2.7793,
"step": 652
},
{
"epoch": 0.2785836177474403,
"grad_norm": 31.76155662536621,
"learning_rate": 7.601078167115904e-06,
"loss": 2.8135,
"step": 653
},
{
"epoch": 0.2790102389078498,
"grad_norm": 27.845605850219727,
"learning_rate": 7.596585804132974e-06,
"loss": 2.3193,
"step": 654
},
{
"epoch": 0.2794368600682594,
"grad_norm": 32.7022705078125,
"learning_rate": 7.592093441150046e-06,
"loss": 2.8477,
"step": 655
},
{
"epoch": 0.27986348122866894,
"grad_norm": 32.736080169677734,
"learning_rate": 7.587601078167116e-06,
"loss": 2.9072,
"step": 656
},
{
"epoch": 0.2802901023890785,
"grad_norm": 22.69315528869629,
"learning_rate": 7.583108715184188e-06,
"loss": 3.0986,
"step": 657
},
{
"epoch": 0.28071672354948807,
"grad_norm": 41.13274383544922,
"learning_rate": 7.578616352201259e-06,
"loss": 2.5479,
"step": 658
},
{
"epoch": 0.2811433447098976,
"grad_norm": 34.375823974609375,
"learning_rate": 7.574123989218329e-06,
"loss": 2.54,
"step": 659
},
{
"epoch": 0.2815699658703072,
"grad_norm": 50.617103576660156,
"learning_rate": 7.569631626235401e-06,
"loss": 3.3945,
"step": 660
},
{
"epoch": 0.2819965870307167,
"grad_norm": 19.791898727416992,
"learning_rate": 7.565139263252471e-06,
"loss": 2.6709,
"step": 661
},
{
"epoch": 0.28242320819112626,
"grad_norm": 38.855125427246094,
"learning_rate": 7.560646900269542e-06,
"loss": 3.3887,
"step": 662
},
{
"epoch": 0.28284982935153585,
"grad_norm": 19.478548049926758,
"learning_rate": 7.5561545372866126e-06,
"loss": 2.6631,
"step": 663
},
{
"epoch": 0.2832764505119454,
"grad_norm": 33.800086975097656,
"learning_rate": 7.5516621743036844e-06,
"loss": 2.7578,
"step": 664
},
{
"epoch": 0.2837030716723549,
"grad_norm": 23.76025390625,
"learning_rate": 7.5471698113207555e-06,
"loss": 3.123,
"step": 665
},
{
"epoch": 0.2841296928327645,
"grad_norm": 15.858804702758789,
"learning_rate": 7.5426774483378265e-06,
"loss": 2.7354,
"step": 666
},
{
"epoch": 0.28455631399317405,
"grad_norm": 23.25118064880371,
"learning_rate": 7.538185085354898e-06,
"loss": 2.332,
"step": 667
},
{
"epoch": 0.28498293515358364,
"grad_norm": 13.508567810058594,
"learning_rate": 7.533692722371968e-06,
"loss": 2.6807,
"step": 668
},
{
"epoch": 0.2854095563139932,
"grad_norm": 62.4262580871582,
"learning_rate": 7.52920035938904e-06,
"loss": 3.2617,
"step": 669
},
{
"epoch": 0.2858361774744027,
"grad_norm": 21.98171615600586,
"learning_rate": 7.52470799640611e-06,
"loss": 2.9414,
"step": 670
},
{
"epoch": 0.2862627986348123,
"grad_norm": 25.632265090942383,
"learning_rate": 7.520215633423181e-06,
"loss": 2.5762,
"step": 671
},
{
"epoch": 0.28668941979522183,
"grad_norm": 14.146087646484375,
"learning_rate": 7.515723270440253e-06,
"loss": 2.7949,
"step": 672
},
{
"epoch": 0.2871160409556314,
"grad_norm": 37.98383331298828,
"learning_rate": 7.511230907457323e-06,
"loss": 3.1572,
"step": 673
},
{
"epoch": 0.28754266211604096,
"grad_norm": 44.23236083984375,
"learning_rate": 7.506738544474395e-06,
"loss": 2.5234,
"step": 674
},
{
"epoch": 0.2879692832764505,
"grad_norm": 23.958847045898438,
"learning_rate": 7.502246181491465e-06,
"loss": 3.0811,
"step": 675
},
{
"epoch": 0.2883959044368601,
"grad_norm": 19.74029541015625,
"learning_rate": 7.497753818508536e-06,
"loss": 2.8955,
"step": 676
},
{
"epoch": 0.2888225255972696,
"grad_norm": 16.605003356933594,
"learning_rate": 7.493261455525606e-06,
"loss": 2.9785,
"step": 677
},
{
"epoch": 0.28924914675767915,
"grad_norm": 17.542959213256836,
"learning_rate": 7.488769092542678e-06,
"loss": 2.9639,
"step": 678
},
{
"epoch": 0.28967576791808874,
"grad_norm": 32.7658805847168,
"learning_rate": 7.484276729559748e-06,
"loss": 3.3887,
"step": 679
},
{
"epoch": 0.2901023890784983,
"grad_norm": 23.959367752075195,
"learning_rate": 7.47978436657682e-06,
"loss": 2.6729,
"step": 680
},
{
"epoch": 0.29052901023890787,
"grad_norm": 30.203109741210938,
"learning_rate": 7.475292003593891e-06,
"loss": 2.9775,
"step": 681
},
{
"epoch": 0.2909556313993174,
"grad_norm": 48.85400390625,
"learning_rate": 7.4707996406109616e-06,
"loss": 2.6562,
"step": 682
},
{
"epoch": 0.29138225255972694,
"grad_norm": 30.754945755004883,
"learning_rate": 7.4663072776280334e-06,
"loss": 3.0322,
"step": 683
},
{
"epoch": 0.29180887372013653,
"grad_norm": 20.216299057006836,
"learning_rate": 7.461814914645104e-06,
"loss": 2.6914,
"step": 684
},
{
"epoch": 0.29223549488054607,
"grad_norm": 25.015613555908203,
"learning_rate": 7.457322551662175e-06,
"loss": 2.8984,
"step": 685
},
{
"epoch": 0.29266211604095566,
"grad_norm": 33.04557418823242,
"learning_rate": 7.452830188679246e-06,
"loss": 2.4697,
"step": 686
},
{
"epoch": 0.2930887372013652,
"grad_norm": 45.48899459838867,
"learning_rate": 7.448337825696317e-06,
"loss": 2.751,
"step": 687
},
{
"epoch": 0.2935153583617747,
"grad_norm": 14.871517181396484,
"learning_rate": 7.443845462713389e-06,
"loss": 2.75,
"step": 688
},
{
"epoch": 0.2939419795221843,
"grad_norm": 42.968326568603516,
"learning_rate": 7.439353099730459e-06,
"loss": 2.9346,
"step": 689
},
{
"epoch": 0.29436860068259385,
"grad_norm": 19.281057357788086,
"learning_rate": 7.43486073674753e-06,
"loss": 2.3662,
"step": 690
},
{
"epoch": 0.2947952218430034,
"grad_norm": 33.68014907836914,
"learning_rate": 7.4303683737646e-06,
"loss": 2.8867,
"step": 691
},
{
"epoch": 0.295221843003413,
"grad_norm": 22.495100021362305,
"learning_rate": 7.425876010781672e-06,
"loss": 2.6484,
"step": 692
},
{
"epoch": 0.2956484641638225,
"grad_norm": 23.902799606323242,
"learning_rate": 7.421383647798742e-06,
"loss": 3.0869,
"step": 693
},
{
"epoch": 0.2960750853242321,
"grad_norm": 29.26067352294922,
"learning_rate": 7.416891284815813e-06,
"loss": 2.6133,
"step": 694
},
{
"epoch": 0.29650170648464164,
"grad_norm": 65.28421020507812,
"learning_rate": 7.412398921832885e-06,
"loss": 3.1631,
"step": 695
},
{
"epoch": 0.29692832764505117,
"grad_norm": 19.710037231445312,
"learning_rate": 7.407906558849955e-06,
"loss": 2.4121,
"step": 696
},
{
"epoch": 0.29735494880546076,
"grad_norm": 20.86551284790039,
"learning_rate": 7.403414195867027e-06,
"loss": 2.8691,
"step": 697
},
{
"epoch": 0.2977815699658703,
"grad_norm": 26.411109924316406,
"learning_rate": 7.398921832884097e-06,
"loss": 3.0977,
"step": 698
},
{
"epoch": 0.2982081911262799,
"grad_norm": 16.621917724609375,
"learning_rate": 7.3944294699011685e-06,
"loss": 2.4307,
"step": 699
},
{
"epoch": 0.2986348122866894,
"grad_norm": 54.798316955566406,
"learning_rate": 7.389937106918239e-06,
"loss": 3.0176,
"step": 700
},
{
"epoch": 0.29906143344709896,
"grad_norm": 16.24550437927246,
"learning_rate": 7.3854447439353106e-06,
"loss": 2.751,
"step": 701
},
{
"epoch": 0.29948805460750855,
"grad_norm": 46.361629486083984,
"learning_rate": 7.380952380952382e-06,
"loss": 3.0439,
"step": 702
},
{
"epoch": 0.2999146757679181,
"grad_norm": 24.557788848876953,
"learning_rate": 7.376460017969453e-06,
"loss": 3.0371,
"step": 703
},
{
"epoch": 0.3003412969283277,
"grad_norm": 20.77698516845703,
"learning_rate": 7.371967654986524e-06,
"loss": 3.3896,
"step": 704
},
{
"epoch": 0.3007679180887372,
"grad_norm": 30.620227813720703,
"learning_rate": 7.367475292003594e-06,
"loss": 2.8926,
"step": 705
},
{
"epoch": 0.30119453924914674,
"grad_norm": 18.66374969482422,
"learning_rate": 7.362982929020666e-06,
"loss": 2.8789,
"step": 706
},
{
"epoch": 0.30162116040955633,
"grad_norm": 46.04564666748047,
"learning_rate": 7.358490566037736e-06,
"loss": 2.4189,
"step": 707
},
{
"epoch": 0.30204778156996587,
"grad_norm": 52.486331939697266,
"learning_rate": 7.353998203054807e-06,
"loss": 2.8389,
"step": 708
},
{
"epoch": 0.3024744027303754,
"grad_norm": 14.944149017333984,
"learning_rate": 7.349505840071879e-06,
"loss": 2.7168,
"step": 709
},
{
"epoch": 0.302901023890785,
"grad_norm": 14.924211502075195,
"learning_rate": 7.345013477088949e-06,
"loss": 2.877,
"step": 710
},
{
"epoch": 0.3033276450511945,
"grad_norm": 21.726722717285156,
"learning_rate": 7.340521114106021e-06,
"loss": 2.7949,
"step": 711
},
{
"epoch": 0.3037542662116041,
"grad_norm": 28.78704833984375,
"learning_rate": 7.336028751123091e-06,
"loss": 2.6279,
"step": 712
},
{
"epoch": 0.30418088737201365,
"grad_norm": 41.87957763671875,
"learning_rate": 7.331536388140162e-06,
"loss": 3.2207,
"step": 713
},
{
"epoch": 0.3046075085324232,
"grad_norm": 18.67123031616211,
"learning_rate": 7.3270440251572324e-06,
"loss": 2.8047,
"step": 714
},
{
"epoch": 0.3050341296928328,
"grad_norm": 24.137971878051758,
"learning_rate": 7.322551662174304e-06,
"loss": 3.0098,
"step": 715
},
{
"epoch": 0.3054607508532423,
"grad_norm": 31.285362243652344,
"learning_rate": 7.318059299191375e-06,
"loss": 2.501,
"step": 716
},
{
"epoch": 0.3058873720136519,
"grad_norm": 15.62086296081543,
"learning_rate": 7.313566936208446e-06,
"loss": 2.5713,
"step": 717
},
{
"epoch": 0.30631399317406144,
"grad_norm": 40.82994842529297,
"learning_rate": 7.3090745732255175e-06,
"loss": 2.5781,
"step": 718
},
{
"epoch": 0.306740614334471,
"grad_norm": 48.110591888427734,
"learning_rate": 7.304582210242588e-06,
"loss": 2.5967,
"step": 719
},
{
"epoch": 0.30716723549488056,
"grad_norm": 80.84425354003906,
"learning_rate": 7.3000898472596595e-06,
"loss": 3.1064,
"step": 720
},
{
"epoch": 0.3075938566552901,
"grad_norm": 39.28225326538086,
"learning_rate": 7.29559748427673e-06,
"loss": 3.1758,
"step": 721
},
{
"epoch": 0.30802047781569963,
"grad_norm": 28.14806365966797,
"learning_rate": 7.291105121293801e-06,
"loss": 2.9873,
"step": 722
},
{
"epoch": 0.3084470989761092,
"grad_norm": 57.97172927856445,
"learning_rate": 7.286612758310873e-06,
"loss": 2.2529,
"step": 723
},
{
"epoch": 0.30887372013651876,
"grad_norm": 19.667949676513672,
"learning_rate": 7.282120395327943e-06,
"loss": 2.749,
"step": 724
},
{
"epoch": 0.30930034129692835,
"grad_norm": 41.68422317504883,
"learning_rate": 7.277628032345015e-06,
"loss": 2.6387,
"step": 725
},
{
"epoch": 0.3097269624573379,
"grad_norm": 54.32461929321289,
"learning_rate": 7.273135669362085e-06,
"loss": 3.0742,
"step": 726
},
{
"epoch": 0.3101535836177474,
"grad_norm": 39.13712692260742,
"learning_rate": 7.268643306379156e-06,
"loss": 2.5449,
"step": 727
},
{
"epoch": 0.310580204778157,
"grad_norm": 85.2264404296875,
"learning_rate": 7.264150943396226e-06,
"loss": 3.1641,
"step": 728
},
{
"epoch": 0.31100682593856654,
"grad_norm": 67.59739685058594,
"learning_rate": 7.259658580413298e-06,
"loss": 2.7236,
"step": 729
},
{
"epoch": 0.31143344709897613,
"grad_norm": 51.05998229980469,
"learning_rate": 7.255166217430369e-06,
"loss": 2.7227,
"step": 730
},
{
"epoch": 0.31186006825938567,
"grad_norm": 31.026859283447266,
"learning_rate": 7.250673854447439e-06,
"loss": 2.8906,
"step": 731
},
{
"epoch": 0.3122866894197952,
"grad_norm": 15.103306770324707,
"learning_rate": 7.246181491464511e-06,
"loss": 2.4883,
"step": 732
},
{
"epoch": 0.3127133105802048,
"grad_norm": 33.09571075439453,
"learning_rate": 7.2416891284815814e-06,
"loss": 2.6738,
"step": 733
},
{
"epoch": 0.31313993174061433,
"grad_norm": 14.153240203857422,
"learning_rate": 7.237196765498653e-06,
"loss": 2.6084,
"step": 734
},
{
"epoch": 0.31356655290102387,
"grad_norm": 27.6345157623291,
"learning_rate": 7.2327044025157235e-06,
"loss": 2.4736,
"step": 735
},
{
"epoch": 0.31399317406143346,
"grad_norm": 38.9594612121582,
"learning_rate": 7.2282120395327946e-06,
"loss": 2.9785,
"step": 736
},
{
"epoch": 0.314419795221843,
"grad_norm": 18.558412551879883,
"learning_rate": 7.2237196765498665e-06,
"loss": 2.6201,
"step": 737
},
{
"epoch": 0.3148464163822526,
"grad_norm": 28.34615135192871,
"learning_rate": 7.219227313566937e-06,
"loss": 2.5713,
"step": 738
},
{
"epoch": 0.3152730375426621,
"grad_norm": 20.91386604309082,
"learning_rate": 7.214734950584008e-06,
"loss": 2.7188,
"step": 739
},
{
"epoch": 0.31569965870307165,
"grad_norm": 49.61574935913086,
"learning_rate": 7.210242587601079e-06,
"loss": 2.6348,
"step": 740
},
{
"epoch": 0.31612627986348124,
"grad_norm": 19.535137176513672,
"learning_rate": 7.20575022461815e-06,
"loss": 3.002,
"step": 741
},
{
"epoch": 0.3165529010238908,
"grad_norm": 16.409156799316406,
"learning_rate": 7.20125786163522e-06,
"loss": 2.54,
"step": 742
},
{
"epoch": 0.31697952218430037,
"grad_norm": 25.113422393798828,
"learning_rate": 7.196765498652292e-06,
"loss": 2.7148,
"step": 743
},
{
"epoch": 0.3174061433447099,
"grad_norm": 21.636598587036133,
"learning_rate": 7.192273135669363e-06,
"loss": 2.7275,
"step": 744
},
{
"epoch": 0.31783276450511944,
"grad_norm": 17.268226623535156,
"learning_rate": 7.187780772686433e-06,
"loss": 2.9434,
"step": 745
},
{
"epoch": 0.318259385665529,
"grad_norm": 25.657812118530273,
"learning_rate": 7.183288409703505e-06,
"loss": 2.3467,
"step": 746
},
{
"epoch": 0.31868600682593856,
"grad_norm": 16.959508895874023,
"learning_rate": 7.178796046720575e-06,
"loss": 2.998,
"step": 747
},
{
"epoch": 0.3191126279863481,
"grad_norm": 15.126802444458008,
"learning_rate": 7.174303683737647e-06,
"loss": 2.7227,
"step": 748
},
{
"epoch": 0.3195392491467577,
"grad_norm": 35.737876892089844,
"learning_rate": 7.169811320754717e-06,
"loss": 2.8984,
"step": 749
},
{
"epoch": 0.3199658703071672,
"grad_norm": 23.50027084350586,
"learning_rate": 7.165318957771788e-06,
"loss": 2.7188,
"step": 750
},
{
"epoch": 0.3203924914675768,
"grad_norm": 46.277732849121094,
"learning_rate": 7.16082659478886e-06,
"loss": 2.8535,
"step": 751
},
{
"epoch": 0.32081911262798635,
"grad_norm": 15.298663139343262,
"learning_rate": 7.1563342318059304e-06,
"loss": 2.5234,
"step": 752
},
{
"epoch": 0.3212457337883959,
"grad_norm": 18.49799156188965,
"learning_rate": 7.1518418688230015e-06,
"loss": 2.627,
"step": 753
},
{
"epoch": 0.3216723549488055,
"grad_norm": 24.766132354736328,
"learning_rate": 7.1473495058400725e-06,
"loss": 2.9336,
"step": 754
},
{
"epoch": 0.322098976109215,
"grad_norm": 17.22234535217285,
"learning_rate": 7.1428571428571436e-06,
"loss": 3.1074,
"step": 755
},
{
"epoch": 0.3225255972696246,
"grad_norm": 33.30344009399414,
"learning_rate": 7.138364779874214e-06,
"loss": 2.3975,
"step": 756
},
{
"epoch": 0.32295221843003413,
"grad_norm": 21.910131454467773,
"learning_rate": 7.133872416891286e-06,
"loss": 2.3291,
"step": 757
},
{
"epoch": 0.32337883959044367,
"grad_norm": 19.836538314819336,
"learning_rate": 7.129380053908357e-06,
"loss": 2.5684,
"step": 758
},
{
"epoch": 0.32380546075085326,
"grad_norm": 25.19745445251465,
"learning_rate": 7.124887690925427e-06,
"loss": 2.8418,
"step": 759
},
{
"epoch": 0.3242320819112628,
"grad_norm": 16.91659927368164,
"learning_rate": 7.120395327942499e-06,
"loss": 2.2397,
"step": 760
},
{
"epoch": 0.3246587030716723,
"grad_norm": 50.67641067504883,
"learning_rate": 7.115902964959569e-06,
"loss": 3.2031,
"step": 761
},
{
"epoch": 0.3250853242320819,
"grad_norm": 40.11088562011719,
"learning_rate": 7.111410601976641e-06,
"loss": 2.7275,
"step": 762
},
{
"epoch": 0.32551194539249145,
"grad_norm": 43.53041458129883,
"learning_rate": 7.106918238993711e-06,
"loss": 3.5127,
"step": 763
},
{
"epoch": 0.32593856655290104,
"grad_norm": 24.22418975830078,
"learning_rate": 7.102425876010782e-06,
"loss": 2.8223,
"step": 764
},
{
"epoch": 0.3263651877133106,
"grad_norm": 20.259675979614258,
"learning_rate": 7.097933513027854e-06,
"loss": 2.7197,
"step": 765
},
{
"epoch": 0.3267918088737201,
"grad_norm": 34.51850891113281,
"learning_rate": 7.093441150044924e-06,
"loss": 2.4951,
"step": 766
},
{
"epoch": 0.3272184300341297,
"grad_norm": 41.48360824584961,
"learning_rate": 7.088948787061995e-06,
"loss": 2.4844,
"step": 767
},
{
"epoch": 0.32764505119453924,
"grad_norm": 40.56487274169922,
"learning_rate": 7.0844564240790654e-06,
"loss": 3.0293,
"step": 768
},
{
"epoch": 0.32807167235494883,
"grad_norm": 21.44087791442871,
"learning_rate": 7.079964061096137e-06,
"loss": 2.5938,
"step": 769
},
{
"epoch": 0.32849829351535836,
"grad_norm": 17.163280487060547,
"learning_rate": 7.0754716981132075e-06,
"loss": 2.8359,
"step": 770
},
{
"epoch": 0.3289249146757679,
"grad_norm": 37.45278549194336,
"learning_rate": 7.0709793351302794e-06,
"loss": 3.0166,
"step": 771
},
{
"epoch": 0.3293515358361775,
"grad_norm": 24.047813415527344,
"learning_rate": 7.0664869721473505e-06,
"loss": 2.9121,
"step": 772
},
{
"epoch": 0.329778156996587,
"grad_norm": 28.624671936035156,
"learning_rate": 7.061994609164421e-06,
"loss": 2.5586,
"step": 773
},
{
"epoch": 0.3302047781569966,
"grad_norm": 16.891504287719727,
"learning_rate": 7.0575022461814926e-06,
"loss": 2.3682,
"step": 774
},
{
"epoch": 0.33063139931740615,
"grad_norm": 18.889495849609375,
"learning_rate": 7.053009883198563e-06,
"loss": 2.9854,
"step": 775
},
{
"epoch": 0.3310580204778157,
"grad_norm": 15.73511028289795,
"learning_rate": 7.048517520215634e-06,
"loss": 2.6162,
"step": 776
},
{
"epoch": 0.3314846416382253,
"grad_norm": 28.1113338470459,
"learning_rate": 7.044025157232705e-06,
"loss": 3.1523,
"step": 777
},
{
"epoch": 0.3319112627986348,
"grad_norm": 39.47050094604492,
"learning_rate": 7.039532794249776e-06,
"loss": 3.6523,
"step": 778
},
{
"epoch": 0.33233788395904434,
"grad_norm": 28.715503692626953,
"learning_rate": 7.035040431266848e-06,
"loss": 2.7803,
"step": 779
},
{
"epoch": 0.33276450511945393,
"grad_norm": 16.625925064086914,
"learning_rate": 7.030548068283918e-06,
"loss": 2.1416,
"step": 780
},
{
"epoch": 0.33319112627986347,
"grad_norm": 22.609573364257812,
"learning_rate": 7.026055705300989e-06,
"loss": 3.0781,
"step": 781
},
{
"epoch": 0.33361774744027306,
"grad_norm": 23.720556259155273,
"learning_rate": 7.021563342318059e-06,
"loss": 2.5449,
"step": 782
},
{
"epoch": 0.3340443686006826,
"grad_norm": 21.275888442993164,
"learning_rate": 7.017070979335131e-06,
"loss": 2.9033,
"step": 783
},
{
"epoch": 0.33447098976109213,
"grad_norm": 43.3743896484375,
"learning_rate": 7.012578616352201e-06,
"loss": 2.7246,
"step": 784
},
{
"epoch": 0.3348976109215017,
"grad_norm": 44.226131439208984,
"learning_rate": 7.008086253369273e-06,
"loss": 2.7852,
"step": 785
},
{
"epoch": 0.33532423208191126,
"grad_norm": 29.745895385742188,
"learning_rate": 7.003593890386344e-06,
"loss": 3.3535,
"step": 786
},
{
"epoch": 0.33575085324232085,
"grad_norm": 17.165884017944336,
"learning_rate": 6.9991015274034144e-06,
"loss": 2.3242,
"step": 787
},
{
"epoch": 0.3361774744027304,
"grad_norm": 17.3034725189209,
"learning_rate": 6.994609164420486e-06,
"loss": 2.4121,
"step": 788
},
{
"epoch": 0.3366040955631399,
"grad_norm": 35.9029655456543,
"learning_rate": 6.9901168014375565e-06,
"loss": 3.1836,
"step": 789
},
{
"epoch": 0.3370307167235495,
"grad_norm": 32.665138244628906,
"learning_rate": 6.9856244384546276e-06,
"loss": 2.7295,
"step": 790
},
{
"epoch": 0.33745733788395904,
"grad_norm": 24.935747146606445,
"learning_rate": 6.981132075471699e-06,
"loss": 2.9609,
"step": 791
},
{
"epoch": 0.3378839590443686,
"grad_norm": 17.43288230895996,
"learning_rate": 6.97663971248877e-06,
"loss": 2.5576,
"step": 792
},
{
"epoch": 0.33831058020477817,
"grad_norm": 17.04522705078125,
"learning_rate": 6.9721473495058416e-06,
"loss": 2.4004,
"step": 793
},
{
"epoch": 0.3387372013651877,
"grad_norm": 36.757266998291016,
"learning_rate": 6.967654986522912e-06,
"loss": 2.8086,
"step": 794
},
{
"epoch": 0.3391638225255973,
"grad_norm": 27.1303768157959,
"learning_rate": 6.963162623539983e-06,
"loss": 3.0547,
"step": 795
},
{
"epoch": 0.3395904436860068,
"grad_norm": 19.866275787353516,
"learning_rate": 6.958670260557053e-06,
"loss": 3.1318,
"step": 796
},
{
"epoch": 0.34001706484641636,
"grad_norm": 33.106712341308594,
"learning_rate": 6.954177897574125e-06,
"loss": 3.5977,
"step": 797
},
{
"epoch": 0.34044368600682595,
"grad_norm": 22.2625732421875,
"learning_rate": 6.949685534591195e-06,
"loss": 2.9775,
"step": 798
},
{
"epoch": 0.3408703071672355,
"grad_norm": 39.98801040649414,
"learning_rate": 6.945193171608267e-06,
"loss": 2.5645,
"step": 799
},
{
"epoch": 0.3412969283276451,
"grad_norm": 28.945308685302734,
"learning_rate": 6.940700808625338e-06,
"loss": 2.7559,
"step": 800
},
{
"epoch": 0.3417235494880546,
"grad_norm": 32.561458587646484,
"learning_rate": 6.936208445642408e-06,
"loss": 3.3193,
"step": 801
},
{
"epoch": 0.34215017064846415,
"grad_norm": 19.446264266967773,
"learning_rate": 6.93171608265948e-06,
"loss": 2.6797,
"step": 802
},
{
"epoch": 0.34257679180887374,
"grad_norm": 14.95886516571045,
"learning_rate": 6.92722371967655e-06,
"loss": 2.8809,
"step": 803
},
{
"epoch": 0.3430034129692833,
"grad_norm": 27.389297485351562,
"learning_rate": 6.922731356693621e-06,
"loss": 3.2207,
"step": 804
},
{
"epoch": 0.3434300341296928,
"grad_norm": 25.899202346801758,
"learning_rate": 6.9182389937106915e-06,
"loss": 2.6055,
"step": 805
},
{
"epoch": 0.3438566552901024,
"grad_norm": 29.216964721679688,
"learning_rate": 6.9137466307277634e-06,
"loss": 2.9365,
"step": 806
},
{
"epoch": 0.34428327645051193,
"grad_norm": 14.480628967285156,
"learning_rate": 6.9092542677448345e-06,
"loss": 2.1611,
"step": 807
},
{
"epoch": 0.3447098976109215,
"grad_norm": 21.613162994384766,
"learning_rate": 6.9047619047619055e-06,
"loss": 2.4648,
"step": 808
},
{
"epoch": 0.34513651877133106,
"grad_norm": 19.232038497924805,
"learning_rate": 6.9002695417789766e-06,
"loss": 2.6406,
"step": 809
},
{
"epoch": 0.3455631399317406,
"grad_norm": 26.753767013549805,
"learning_rate": 6.895777178796047e-06,
"loss": 2.8281,
"step": 810
},
{
"epoch": 0.3459897610921502,
"grad_norm": 17.898767471313477,
"learning_rate": 6.891284815813119e-06,
"loss": 2.5,
"step": 811
},
{
"epoch": 0.3464163822525597,
"grad_norm": 29.41793441772461,
"learning_rate": 6.886792452830189e-06,
"loss": 2.7646,
"step": 812
},
{
"epoch": 0.3468430034129693,
"grad_norm": 25.555450439453125,
"learning_rate": 6.88230008984726e-06,
"loss": 3.3203,
"step": 813
},
{
"epoch": 0.34726962457337884,
"grad_norm": 25.43630599975586,
"learning_rate": 6.877807726864332e-06,
"loss": 3.1045,
"step": 814
},
{
"epoch": 0.3476962457337884,
"grad_norm": 24.84125518798828,
"learning_rate": 6.873315363881402e-06,
"loss": 2.4229,
"step": 815
},
{
"epoch": 0.34812286689419797,
"grad_norm": 31.87086296081543,
"learning_rate": 6.868823000898474e-06,
"loss": 2.6289,
"step": 816
},
{
"epoch": 0.3485494880546075,
"grad_norm": 20.92204475402832,
"learning_rate": 6.864330637915544e-06,
"loss": 2.4424,
"step": 817
},
{
"epoch": 0.34897610921501704,
"grad_norm": 34.5516471862793,
"learning_rate": 6.859838274932615e-06,
"loss": 2.7666,
"step": 818
},
{
"epoch": 0.34940273037542663,
"grad_norm": 24.447370529174805,
"learning_rate": 6.855345911949685e-06,
"loss": 2.3027,
"step": 819
},
{
"epoch": 0.34982935153583616,
"grad_norm": 22.048521041870117,
"learning_rate": 6.850853548966757e-06,
"loss": 2.6621,
"step": 820
},
{
"epoch": 0.35025597269624575,
"grad_norm": 31.585052490234375,
"learning_rate": 6.846361185983828e-06,
"loss": 2.5371,
"step": 821
},
{
"epoch": 0.3506825938566553,
"grad_norm": 19.84841537475586,
"learning_rate": 6.841868823000899e-06,
"loss": 2.3789,
"step": 822
},
{
"epoch": 0.3511092150170648,
"grad_norm": 14.99898624420166,
"learning_rate": 6.83737646001797e-06,
"loss": 2.373,
"step": 823
},
{
"epoch": 0.3515358361774744,
"grad_norm": 18.301692962646484,
"learning_rate": 6.8328840970350405e-06,
"loss": 2.4795,
"step": 824
},
{
"epoch": 0.35196245733788395,
"grad_norm": 38.46403884887695,
"learning_rate": 6.8283917340521124e-06,
"loss": 3.001,
"step": 825
},
{
"epoch": 0.35238907849829354,
"grad_norm": 42.38199996948242,
"learning_rate": 6.823899371069183e-06,
"loss": 2.2314,
"step": 826
},
{
"epoch": 0.3528156996587031,
"grad_norm": 17.83135223388672,
"learning_rate": 6.819407008086254e-06,
"loss": 2.6631,
"step": 827
},
{
"epoch": 0.3532423208191126,
"grad_norm": 24.800996780395508,
"learning_rate": 6.8149146451033256e-06,
"loss": 3.1953,
"step": 828
},
{
"epoch": 0.3536689419795222,
"grad_norm": 18.44524383544922,
"learning_rate": 6.810422282120396e-06,
"loss": 2.6289,
"step": 829
},
{
"epoch": 0.35409556313993173,
"grad_norm": 24.03241539001465,
"learning_rate": 6.805929919137468e-06,
"loss": 3.0879,
"step": 830
},
{
"epoch": 0.35452218430034127,
"grad_norm": 56.00475311279297,
"learning_rate": 6.801437556154538e-06,
"loss": 2.2793,
"step": 831
},
{
"epoch": 0.35494880546075086,
"grad_norm": 56.98671340942383,
"learning_rate": 6.796945193171609e-06,
"loss": 3.3281,
"step": 832
},
{
"epoch": 0.3553754266211604,
"grad_norm": 66.61808013916016,
"learning_rate": 6.792452830188679e-06,
"loss": 3.208,
"step": 833
},
{
"epoch": 0.35580204778157,
"grad_norm": 47.69834518432617,
"learning_rate": 6.787960467205751e-06,
"loss": 2.4082,
"step": 834
},
{
"epoch": 0.3562286689419795,
"grad_norm": 20.011539459228516,
"learning_rate": 6.783468104222821e-06,
"loss": 3.0039,
"step": 835
},
{
"epoch": 0.35665529010238906,
"grad_norm": 22.645479202270508,
"learning_rate": 6.778975741239893e-06,
"loss": 2.6797,
"step": 836
},
{
"epoch": 0.35708191126279865,
"grad_norm": 25.83390998840332,
"learning_rate": 6.774483378256964e-06,
"loss": 2.7969,
"step": 837
},
{
"epoch": 0.3575085324232082,
"grad_norm": 25.184431076049805,
"learning_rate": 6.769991015274034e-06,
"loss": 2.7764,
"step": 838
},
{
"epoch": 0.35793515358361777,
"grad_norm": 24.293928146362305,
"learning_rate": 6.765498652291106e-06,
"loss": 2.6016,
"step": 839
},
{
"epoch": 0.3583617747440273,
"grad_norm": 17.71709442138672,
"learning_rate": 6.761006289308176e-06,
"loss": 2.3506,
"step": 840
},
{
"epoch": 0.35878839590443684,
"grad_norm": 32.32875061035156,
"learning_rate": 6.7565139263252475e-06,
"loss": 2.8857,
"step": 841
},
{
"epoch": 0.35921501706484643,
"grad_norm": 24.71383285522461,
"learning_rate": 6.752021563342318e-06,
"loss": 3.1406,
"step": 842
},
{
"epoch": 0.35964163822525597,
"grad_norm": 47.8384895324707,
"learning_rate": 6.7475292003593895e-06,
"loss": 2.833,
"step": 843
},
{
"epoch": 0.36006825938566556,
"grad_norm": 20.34820556640625,
"learning_rate": 6.743036837376461e-06,
"loss": 3.1543,
"step": 844
},
{
"epoch": 0.3604948805460751,
"grad_norm": 36.99142837524414,
"learning_rate": 6.738544474393532e-06,
"loss": 2.4814,
"step": 845
},
{
"epoch": 0.3609215017064846,
"grad_norm": 17.918649673461914,
"learning_rate": 6.734052111410603e-06,
"loss": 3.002,
"step": 846
},
{
"epoch": 0.3613481228668942,
"grad_norm": 43.56641387939453,
"learning_rate": 6.729559748427673e-06,
"loss": 2.8711,
"step": 847
},
{
"epoch": 0.36177474402730375,
"grad_norm": 19.88298988342285,
"learning_rate": 6.725067385444745e-06,
"loss": 3.1875,
"step": 848
},
{
"epoch": 0.3622013651877133,
"grad_norm": 16.08139991760254,
"learning_rate": 6.720575022461815e-06,
"loss": 2.3691,
"step": 849
},
{
"epoch": 0.3626279863481229,
"grad_norm": 27.99295997619629,
"learning_rate": 6.716082659478886e-06,
"loss": 2.5859,
"step": 850
},
{
"epoch": 0.3630546075085324,
"grad_norm": 18.48909568786621,
"learning_rate": 6.711590296495958e-06,
"loss": 2.9287,
"step": 851
},
{
"epoch": 0.363481228668942,
"grad_norm": 22.837125778198242,
"learning_rate": 6.707097933513028e-06,
"loss": 2.5029,
"step": 852
},
{
"epoch": 0.36390784982935154,
"grad_norm": 16.875024795532227,
"learning_rate": 6.7026055705301e-06,
"loss": 2.5801,
"step": 853
},
{
"epoch": 0.3643344709897611,
"grad_norm": 20.041526794433594,
"learning_rate": 6.69811320754717e-06,
"loss": 2.9414,
"step": 854
},
{
"epoch": 0.36476109215017066,
"grad_norm": 33.22433853149414,
"learning_rate": 6.693620844564241e-06,
"loss": 2.9219,
"step": 855
},
{
"epoch": 0.3651877133105802,
"grad_norm": 40.56968688964844,
"learning_rate": 6.6891284815813114e-06,
"loss": 2.7197,
"step": 856
},
{
"epoch": 0.3656143344709898,
"grad_norm": 21.923696517944336,
"learning_rate": 6.684636118598383e-06,
"loss": 2.5166,
"step": 857
},
{
"epoch": 0.3660409556313993,
"grad_norm": 34.44352340698242,
"learning_rate": 6.680143755615454e-06,
"loss": 2.5635,
"step": 858
},
{
"epoch": 0.36646757679180886,
"grad_norm": 18.53885269165039,
"learning_rate": 6.675651392632525e-06,
"loss": 2.6416,
"step": 859
},
{
"epoch": 0.36689419795221845,
"grad_norm": 18.8934268951416,
"learning_rate": 6.6711590296495964e-06,
"loss": 2.6191,
"step": 860
},
{
"epoch": 0.367320819112628,
"grad_norm": 38.83974075317383,
"learning_rate": 6.666666666666667e-06,
"loss": 2.4268,
"step": 861
},
{
"epoch": 0.3677474402730375,
"grad_norm": 20.042360305786133,
"learning_rate": 6.6621743036837385e-06,
"loss": 2.458,
"step": 862
},
{
"epoch": 0.3681740614334471,
"grad_norm": 36.47145462036133,
"learning_rate": 6.657681940700809e-06,
"loss": 1.9346,
"step": 863
},
{
"epoch": 0.36860068259385664,
"grad_norm": 28.04621696472168,
"learning_rate": 6.65318957771788e-06,
"loss": 3.3066,
"step": 864
},
{
"epoch": 0.36902730375426623,
"grad_norm": 21.341005325317383,
"learning_rate": 6.648697214734952e-06,
"loss": 2.6602,
"step": 865
},
{
"epoch": 0.36945392491467577,
"grad_norm": 31.291488647460938,
"learning_rate": 6.644204851752022e-06,
"loss": 2.8086,
"step": 866
},
{
"epoch": 0.3698805460750853,
"grad_norm": 32.221710205078125,
"learning_rate": 6.639712488769094e-06,
"loss": 2.5039,
"step": 867
},
{
"epoch": 0.3703071672354949,
"grad_norm": 29.85544776916504,
"learning_rate": 6.635220125786164e-06,
"loss": 2.6084,
"step": 868
},
{
"epoch": 0.37073378839590443,
"grad_norm": 29.484418869018555,
"learning_rate": 6.630727762803235e-06,
"loss": 2.4824,
"step": 869
},
{
"epoch": 0.371160409556314,
"grad_norm": 25.850082397460938,
"learning_rate": 6.626235399820305e-06,
"loss": 2.4316,
"step": 870
},
{
"epoch": 0.37158703071672355,
"grad_norm": 21.31001091003418,
"learning_rate": 6.621743036837377e-06,
"loss": 2.5518,
"step": 871
},
{
"epoch": 0.3720136518771331,
"grad_norm": 21.5012149810791,
"learning_rate": 6.617250673854448e-06,
"loss": 2.6621,
"step": 872
},
{
"epoch": 0.3724402730375427,
"grad_norm": 24.898344039916992,
"learning_rate": 6.612758310871519e-06,
"loss": 2.2402,
"step": 873
},
{
"epoch": 0.3728668941979522,
"grad_norm": 21.651897430419922,
"learning_rate": 6.60826594788859e-06,
"loss": 2.7373,
"step": 874
},
{
"epoch": 0.37329351535836175,
"grad_norm": 18.67881202697754,
"learning_rate": 6.60377358490566e-06,
"loss": 2.4678,
"step": 875
},
{
"epoch": 0.37372013651877134,
"grad_norm": 28.32158088684082,
"learning_rate": 6.599281221922732e-06,
"loss": 2.6621,
"step": 876
},
{
"epoch": 0.3741467576791809,
"grad_norm": 27.963510513305664,
"learning_rate": 6.5947888589398025e-06,
"loss": 2.3184,
"step": 877
},
{
"epoch": 0.37457337883959047,
"grad_norm": 18.716800689697266,
"learning_rate": 6.5902964959568736e-06,
"loss": 2.707,
"step": 878
},
{
"epoch": 0.375,
"grad_norm": 26.4693660736084,
"learning_rate": 6.5858041329739454e-06,
"loss": 2.8682,
"step": 879
},
{
"epoch": 0.37542662116040953,
"grad_norm": 17.989103317260742,
"learning_rate": 6.581311769991016e-06,
"loss": 2.5781,
"step": 880
},
{
"epoch": 0.3758532423208191,
"grad_norm": 17.228116989135742,
"learning_rate": 6.576819407008087e-06,
"loss": 2.1328,
"step": 881
},
{
"epoch": 0.37627986348122866,
"grad_norm": 42.95380401611328,
"learning_rate": 6.572327044025158e-06,
"loss": 2.8135,
"step": 882
},
{
"epoch": 0.37670648464163825,
"grad_norm": 33.39653778076172,
"learning_rate": 6.567834681042229e-06,
"loss": 2.9727,
"step": 883
},
{
"epoch": 0.3771331058020478,
"grad_norm": 39.138118743896484,
"learning_rate": 6.563342318059299e-06,
"loss": 2.7061,
"step": 884
},
{
"epoch": 0.3775597269624573,
"grad_norm": 18.219497680664062,
"learning_rate": 6.558849955076371e-06,
"loss": 2.7354,
"step": 885
},
{
"epoch": 0.3779863481228669,
"grad_norm": 37.8060302734375,
"learning_rate": 6.554357592093442e-06,
"loss": 2.9434,
"step": 886
},
{
"epoch": 0.37841296928327645,
"grad_norm": 23.32965660095215,
"learning_rate": 6.549865229110512e-06,
"loss": 3.0215,
"step": 887
},
{
"epoch": 0.378839590443686,
"grad_norm": 39.0524787902832,
"learning_rate": 6.545372866127584e-06,
"loss": 2.4316,
"step": 888
},
{
"epoch": 0.37926621160409557,
"grad_norm": 22.534435272216797,
"learning_rate": 6.540880503144654e-06,
"loss": 2.5332,
"step": 889
},
{
"epoch": 0.3796928327645051,
"grad_norm": 41.38739013671875,
"learning_rate": 6.536388140161726e-06,
"loss": 2.8721,
"step": 890
},
{
"epoch": 0.3801194539249147,
"grad_norm": 21.326303482055664,
"learning_rate": 6.531895777178796e-06,
"loss": 2.4785,
"step": 891
},
{
"epoch": 0.38054607508532423,
"grad_norm": 18.502744674682617,
"learning_rate": 6.527403414195867e-06,
"loss": 2.7476,
"step": 892
},
{
"epoch": 0.38097269624573377,
"grad_norm": 20.264904022216797,
"learning_rate": 6.522911051212939e-06,
"loss": 2.4033,
"step": 893
},
{
"epoch": 0.38139931740614336,
"grad_norm": 25.84261131286621,
"learning_rate": 6.518418688230009e-06,
"loss": 2.668,
"step": 894
},
{
"epoch": 0.3818259385665529,
"grad_norm": 39.63558578491211,
"learning_rate": 6.5139263252470805e-06,
"loss": 2.2871,
"step": 895
},
{
"epoch": 0.3822525597269625,
"grad_norm": 42.330448150634766,
"learning_rate": 6.5094339622641515e-06,
"loss": 2.7676,
"step": 896
},
{
"epoch": 0.382679180887372,
"grad_norm": 19.066465377807617,
"learning_rate": 6.5049415992812226e-06,
"loss": 2.4248,
"step": 897
},
{
"epoch": 0.38310580204778155,
"grad_norm": 20.37721061706543,
"learning_rate": 6.500449236298293e-06,
"loss": 2.2129,
"step": 898
},
{
"epoch": 0.38353242320819114,
"grad_norm": 40.267765045166016,
"learning_rate": 6.495956873315365e-06,
"loss": 2.5332,
"step": 899
},
{
"epoch": 0.3839590443686007,
"grad_norm": 18.17340850830078,
"learning_rate": 6.491464510332436e-06,
"loss": 2.3945,
"step": 900
},
{
"epoch": 0.3843856655290102,
"grad_norm": 15.765764236450195,
"learning_rate": 6.486972147349506e-06,
"loss": 1.9697,
"step": 901
},
{
"epoch": 0.3848122866894198,
"grad_norm": 47.31410598754883,
"learning_rate": 6.482479784366578e-06,
"loss": 3.4316,
"step": 902
},
{
"epoch": 0.38523890784982934,
"grad_norm": 19.825407028198242,
"learning_rate": 6.477987421383648e-06,
"loss": 2.2998,
"step": 903
},
{
"epoch": 0.3856655290102389,
"grad_norm": 23.80759048461914,
"learning_rate": 6.47349505840072e-06,
"loss": 2.8438,
"step": 904
},
{
"epoch": 0.38609215017064846,
"grad_norm": 33.58242416381836,
"learning_rate": 6.46900269541779e-06,
"loss": 2.7188,
"step": 905
},
{
"epoch": 0.386518771331058,
"grad_norm": 20.253826141357422,
"learning_rate": 6.464510332434861e-06,
"loss": 2.5635,
"step": 906
},
{
"epoch": 0.3869453924914676,
"grad_norm": 31.54781723022461,
"learning_rate": 6.460017969451933e-06,
"loss": 2.9951,
"step": 907
},
{
"epoch": 0.3873720136518771,
"grad_norm": 36.786502838134766,
"learning_rate": 6.455525606469003e-06,
"loss": 3.0371,
"step": 908
},
{
"epoch": 0.3877986348122867,
"grad_norm": 17.36682891845703,
"learning_rate": 6.451033243486074e-06,
"loss": 2.167,
"step": 909
},
{
"epoch": 0.38822525597269625,
"grad_norm": 21.992660522460938,
"learning_rate": 6.446540880503145e-06,
"loss": 2.0117,
"step": 910
},
{
"epoch": 0.3886518771331058,
"grad_norm": 17.453601837158203,
"learning_rate": 6.442048517520216e-06,
"loss": 2.5195,
"step": 911
},
{
"epoch": 0.3890784982935154,
"grad_norm": 17.015531539916992,
"learning_rate": 6.4375561545372865e-06,
"loss": 2.2314,
"step": 912
},
{
"epoch": 0.3895051194539249,
"grad_norm": 22.681379318237305,
"learning_rate": 6.433063791554358e-06,
"loss": 3.1436,
"step": 913
},
{
"epoch": 0.38993174061433444,
"grad_norm": 15.419713020324707,
"learning_rate": 6.4285714285714295e-06,
"loss": 2.5117,
"step": 914
},
{
"epoch": 0.39035836177474403,
"grad_norm": 29.035032272338867,
"learning_rate": 6.4240790655885e-06,
"loss": 2.6631,
"step": 915
},
{
"epoch": 0.39078498293515357,
"grad_norm": 25.45609474182129,
"learning_rate": 6.4195867026055715e-06,
"loss": 2.8594,
"step": 916
},
{
"epoch": 0.39121160409556316,
"grad_norm": 26.522363662719727,
"learning_rate": 6.415094339622642e-06,
"loss": 2.2881,
"step": 917
},
{
"epoch": 0.3916382252559727,
"grad_norm": 31.536636352539062,
"learning_rate": 6.410601976639713e-06,
"loss": 2.3486,
"step": 918
},
{
"epoch": 0.39206484641638223,
"grad_norm": 25.44186019897461,
"learning_rate": 6.406109613656784e-06,
"loss": 2.6182,
"step": 919
},
{
"epoch": 0.3924914675767918,
"grad_norm": 37.88819885253906,
"learning_rate": 6.401617250673855e-06,
"loss": 2.5488,
"step": 920
},
{
"epoch": 0.39291808873720135,
"grad_norm": 17.342376708984375,
"learning_rate": 6.397124887690927e-06,
"loss": 2.5137,
"step": 921
},
{
"epoch": 0.39334470989761094,
"grad_norm": 25.706722259521484,
"learning_rate": 6.392632524707997e-06,
"loss": 2.9775,
"step": 922
},
{
"epoch": 0.3937713310580205,
"grad_norm": 18.36219596862793,
"learning_rate": 6.388140161725068e-06,
"loss": 2.5596,
"step": 923
},
{
"epoch": 0.39419795221843,
"grad_norm": 16.040849685668945,
"learning_rate": 6.383647798742138e-06,
"loss": 1.9355,
"step": 924
},
{
"epoch": 0.3946245733788396,
"grad_norm": 19.206146240234375,
"learning_rate": 6.37915543575921e-06,
"loss": 2.5811,
"step": 925
},
{
"epoch": 0.39505119453924914,
"grad_norm": 20.79313087463379,
"learning_rate": 6.37466307277628e-06,
"loss": 2.4482,
"step": 926
},
{
"epoch": 0.39547781569965873,
"grad_norm": 27.068574905395508,
"learning_rate": 6.370170709793352e-06,
"loss": 2.7402,
"step": 927
},
{
"epoch": 0.39590443686006827,
"grad_norm": 23.966190338134766,
"learning_rate": 6.365678346810423e-06,
"loss": 2.9131,
"step": 928
},
{
"epoch": 0.3963310580204778,
"grad_norm": 37.11189651489258,
"learning_rate": 6.3611859838274934e-06,
"loss": 2.7607,
"step": 929
},
{
"epoch": 0.3967576791808874,
"grad_norm": 28.57893943786621,
"learning_rate": 6.356693620844565e-06,
"loss": 2.7305,
"step": 930
},
{
"epoch": 0.3971843003412969,
"grad_norm": 21.24115753173828,
"learning_rate": 6.3522012578616355e-06,
"loss": 2.7012,
"step": 931
},
{
"epoch": 0.39761092150170646,
"grad_norm": 17.273521423339844,
"learning_rate": 6.3477088948787066e-06,
"loss": 2.6914,
"step": 932
},
{
"epoch": 0.39803754266211605,
"grad_norm": 42.97947692871094,
"learning_rate": 6.343216531895778e-06,
"loss": 2.5791,
"step": 933
},
{
"epoch": 0.3984641638225256,
"grad_norm": 16.661331176757812,
"learning_rate": 6.338724168912849e-06,
"loss": 2.7388,
"step": 934
},
{
"epoch": 0.3988907849829352,
"grad_norm": 51.49508285522461,
"learning_rate": 6.3342318059299205e-06,
"loss": 2.5723,
"step": 935
},
{
"epoch": 0.3993174061433447,
"grad_norm": 50.0458869934082,
"learning_rate": 6.329739442946991e-06,
"loss": 2.751,
"step": 936
},
{
"epoch": 0.39974402730375425,
"grad_norm": 39.40812683105469,
"learning_rate": 6.325247079964062e-06,
"loss": 2.7227,
"step": 937
},
{
"epoch": 0.40017064846416384,
"grad_norm": 20.065385818481445,
"learning_rate": 6.320754716981132e-06,
"loss": 2.6074,
"step": 938
},
{
"epoch": 0.40059726962457337,
"grad_norm": 29.637264251708984,
"learning_rate": 6.316262353998204e-06,
"loss": 2.4805,
"step": 939
},
{
"epoch": 0.40102389078498296,
"grad_norm": 31.02073860168457,
"learning_rate": 6.311769991015274e-06,
"loss": 2.5801,
"step": 940
},
{
"epoch": 0.4014505119453925,
"grad_norm": 23.977516174316406,
"learning_rate": 6.307277628032346e-06,
"loss": 2.4229,
"step": 941
},
{
"epoch": 0.40187713310580203,
"grad_norm": 16.94057273864746,
"learning_rate": 6.302785265049417e-06,
"loss": 2.3359,
"step": 942
},
{
"epoch": 0.4023037542662116,
"grad_norm": 47.786930084228516,
"learning_rate": 6.298292902066487e-06,
"loss": 2.2529,
"step": 943
},
{
"epoch": 0.40273037542662116,
"grad_norm": 30.42767333984375,
"learning_rate": 6.293800539083559e-06,
"loss": 2.6318,
"step": 944
},
{
"epoch": 0.4031569965870307,
"grad_norm": 68.6019515991211,
"learning_rate": 6.289308176100629e-06,
"loss": 3.3115,
"step": 945
},
{
"epoch": 0.4035836177474403,
"grad_norm": 46.8463020324707,
"learning_rate": 6.2848158131177e-06,
"loss": 2.9385,
"step": 946
},
{
"epoch": 0.4040102389078498,
"grad_norm": 16.7492618560791,
"learning_rate": 6.2803234501347705e-06,
"loss": 2.0059,
"step": 947
},
{
"epoch": 0.4044368600682594,
"grad_norm": 19.94270896911621,
"learning_rate": 6.2758310871518424e-06,
"loss": 2.2812,
"step": 948
},
{
"epoch": 0.40486348122866894,
"grad_norm": 24.3834285736084,
"learning_rate": 6.2713387241689135e-06,
"loss": 3.5176,
"step": 949
},
{
"epoch": 0.4052901023890785,
"grad_norm": 25.361343383789062,
"learning_rate": 6.2668463611859845e-06,
"loss": 3.1465,
"step": 950
},
{
"epoch": 0.40571672354948807,
"grad_norm": 33.129478454589844,
"learning_rate": 6.2623539982030556e-06,
"loss": 2.5137,
"step": 951
},
{
"epoch": 0.4061433447098976,
"grad_norm": 36.047359466552734,
"learning_rate": 6.257861635220126e-06,
"loss": 2.5098,
"step": 952
},
{
"epoch": 0.4065699658703072,
"grad_norm": 29.85443878173828,
"learning_rate": 6.253369272237198e-06,
"loss": 2.7441,
"step": 953
},
{
"epoch": 0.4069965870307167,
"grad_norm": 20.3319149017334,
"learning_rate": 6.248876909254268e-06,
"loss": 2.7578,
"step": 954
},
{
"epoch": 0.40742320819112626,
"grad_norm": 45.543827056884766,
"learning_rate": 6.244384546271339e-06,
"loss": 2.4893,
"step": 955
},
{
"epoch": 0.40784982935153585,
"grad_norm": 17.98326301574707,
"learning_rate": 6.239892183288411e-06,
"loss": 2.7275,
"step": 956
},
{
"epoch": 0.4082764505119454,
"grad_norm": 16.175251007080078,
"learning_rate": 6.235399820305481e-06,
"loss": 2.5791,
"step": 957
},
{
"epoch": 0.4087030716723549,
"grad_norm": 23.90378189086914,
"learning_rate": 6.230907457322553e-06,
"loss": 2.7363,
"step": 958
},
{
"epoch": 0.4091296928327645,
"grad_norm": 18.764850616455078,
"learning_rate": 6.226415094339623e-06,
"loss": 2.7949,
"step": 959
},
{
"epoch": 0.40955631399317405,
"grad_norm": 24.244136810302734,
"learning_rate": 6.221922731356694e-06,
"loss": 2.8447,
"step": 960
},
{
"epoch": 0.40998293515358364,
"grad_norm": 18.434280395507812,
"learning_rate": 6.217430368373764e-06,
"loss": 2.9453,
"step": 961
},
{
"epoch": 0.4104095563139932,
"grad_norm": 29.534372329711914,
"learning_rate": 6.212938005390836e-06,
"loss": 2.998,
"step": 962
},
{
"epoch": 0.4108361774744027,
"grad_norm": 16.15813446044922,
"learning_rate": 6.208445642407907e-06,
"loss": 2.6787,
"step": 963
},
{
"epoch": 0.4112627986348123,
"grad_norm": 20.7001953125,
"learning_rate": 6.203953279424978e-06,
"loss": 2.666,
"step": 964
},
{
"epoch": 0.41168941979522183,
"grad_norm": 25.749404907226562,
"learning_rate": 6.199460916442049e-06,
"loss": 3.0195,
"step": 965
},
{
"epoch": 0.4121160409556314,
"grad_norm": 23.610687255859375,
"learning_rate": 6.1949685534591195e-06,
"loss": 2.418,
"step": 966
},
{
"epoch": 0.41254266211604096,
"grad_norm": 20.643394470214844,
"learning_rate": 6.1904761904761914e-06,
"loss": 2.5879,
"step": 967
},
{
"epoch": 0.4129692832764505,
"grad_norm": 39.67180252075195,
"learning_rate": 6.185983827493262e-06,
"loss": 3.2188,
"step": 968
},
{
"epoch": 0.4133959044368601,
"grad_norm": 37.136566162109375,
"learning_rate": 6.181491464510333e-06,
"loss": 2.4941,
"step": 969
},
{
"epoch": 0.4138225255972696,
"grad_norm": 23.125782012939453,
"learning_rate": 6.1769991015274046e-06,
"loss": 2.5205,
"step": 970
},
{
"epoch": 0.41424914675767915,
"grad_norm": 17.800636291503906,
"learning_rate": 6.172506738544475e-06,
"loss": 2.4541,
"step": 971
},
{
"epoch": 0.41467576791808874,
"grad_norm": 21.861169815063477,
"learning_rate": 6.168014375561547e-06,
"loss": 2.6475,
"step": 972
},
{
"epoch": 0.4151023890784983,
"grad_norm": 14.438638687133789,
"learning_rate": 6.163522012578617e-06,
"loss": 2.4336,
"step": 973
},
{
"epoch": 0.41552901023890787,
"grad_norm": 29.65276527404785,
"learning_rate": 6.159029649595688e-06,
"loss": 2.7949,
"step": 974
},
{
"epoch": 0.4159556313993174,
"grad_norm": 24.857969284057617,
"learning_rate": 6.154537286612758e-06,
"loss": 2.6533,
"step": 975
},
{
"epoch": 0.41638225255972694,
"grad_norm": 20.298656463623047,
"learning_rate": 6.15004492362983e-06,
"loss": 2.8877,
"step": 976
},
{
"epoch": 0.41680887372013653,
"grad_norm": 20.43273162841797,
"learning_rate": 6.145552560646901e-06,
"loss": 3.293,
"step": 977
},
{
"epoch": 0.41723549488054607,
"grad_norm": 15.368301391601562,
"learning_rate": 6.141060197663972e-06,
"loss": 2.7148,
"step": 978
},
{
"epoch": 0.41766211604095566,
"grad_norm": 18.29572296142578,
"learning_rate": 6.136567834681043e-06,
"loss": 2.4434,
"step": 979
},
{
"epoch": 0.4180887372013652,
"grad_norm": 16.30262565612793,
"learning_rate": 6.132075471698113e-06,
"loss": 2.5898,
"step": 980
},
{
"epoch": 0.4185153583617747,
"grad_norm": 21.400785446166992,
"learning_rate": 6.127583108715185e-06,
"loss": 2.7266,
"step": 981
},
{
"epoch": 0.4189419795221843,
"grad_norm": 29.911426544189453,
"learning_rate": 6.123090745732255e-06,
"loss": 2.6367,
"step": 982
},
{
"epoch": 0.41936860068259385,
"grad_norm": 22.61137580871582,
"learning_rate": 6.1185983827493264e-06,
"loss": 3.0137,
"step": 983
},
{
"epoch": 0.4197952218430034,
"grad_norm": 12.429327011108398,
"learning_rate": 6.114106019766398e-06,
"loss": 2.0137,
"step": 984
},
{
"epoch": 0.420221843003413,
"grad_norm": 63.111473083496094,
"learning_rate": 6.1096136567834685e-06,
"loss": 2.8916,
"step": 985
},
{
"epoch": 0.4206484641638225,
"grad_norm": 24.865880966186523,
"learning_rate": 6.1051212938005396e-06,
"loss": 2.5361,
"step": 986
},
{
"epoch": 0.4210750853242321,
"grad_norm": 24.293546676635742,
"learning_rate": 6.100628930817611e-06,
"loss": 2.6357,
"step": 987
},
{
"epoch": 0.42150170648464164,
"grad_norm": 16.23811149597168,
"learning_rate": 6.096136567834682e-06,
"loss": 2.1338,
"step": 988
},
{
"epoch": 0.42192832764505117,
"grad_norm": 21.733821868896484,
"learning_rate": 6.091644204851752e-06,
"loss": 2.373,
"step": 989
},
{
"epoch": 0.42235494880546076,
"grad_norm": 18.73717498779297,
"learning_rate": 6.087151841868824e-06,
"loss": 2.2764,
"step": 990
},
{
"epoch": 0.4227815699658703,
"grad_norm": 26.71270751953125,
"learning_rate": 6.082659478885895e-06,
"loss": 2.4351,
"step": 991
},
{
"epoch": 0.4232081911262799,
"grad_norm": 22.92618751525879,
"learning_rate": 6.078167115902965e-06,
"loss": 3.0625,
"step": 992
},
{
"epoch": 0.4236348122866894,
"grad_norm": 38.981849670410156,
"learning_rate": 6.073674752920037e-06,
"loss": 2.9932,
"step": 993
},
{
"epoch": 0.42406143344709896,
"grad_norm": 45.898590087890625,
"learning_rate": 6.069182389937107e-06,
"loss": 3.1201,
"step": 994
},
{
"epoch": 0.42448805460750855,
"grad_norm": 47.92092514038086,
"learning_rate": 6.064690026954179e-06,
"loss": 2.6689,
"step": 995
},
{
"epoch": 0.4249146757679181,
"grad_norm": 33.29727554321289,
"learning_rate": 6.060197663971249e-06,
"loss": 1.7715,
"step": 996
},
{
"epoch": 0.4253412969283277,
"grad_norm": 19.22433853149414,
"learning_rate": 6.05570530098832e-06,
"loss": 2.8066,
"step": 997
},
{
"epoch": 0.4257679180887372,
"grad_norm": 27.66653060913086,
"learning_rate": 6.05121293800539e-06,
"loss": 2.918,
"step": 998
},
{
"epoch": 0.42619453924914674,
"grad_norm": 49.86943817138672,
"learning_rate": 6.046720575022462e-06,
"loss": 2.5547,
"step": 999
},
{
"epoch": 0.42662116040955633,
"grad_norm": 75.04938507080078,
"learning_rate": 6.042228212039533e-06,
"loss": 2.75,
"step": 1000
},
{
"epoch": 0.42704778156996587,
"grad_norm": 22.91497802734375,
"learning_rate": 6.037735849056604e-06,
"loss": 3.0762,
"step": 1001
},
{
"epoch": 0.4274744027303754,
"grad_norm": 53.352027893066406,
"learning_rate": 6.0332434860736754e-06,
"loss": 2.4795,
"step": 1002
},
{
"epoch": 0.427901023890785,
"grad_norm": 45.149879455566406,
"learning_rate": 6.028751123090746e-06,
"loss": 2.5918,
"step": 1003
},
{
"epoch": 0.4283276450511945,
"grad_norm": 51.942405700683594,
"learning_rate": 6.0242587601078175e-06,
"loss": 2.8516,
"step": 1004
},
{
"epoch": 0.4287542662116041,
"grad_norm": 31.42732048034668,
"learning_rate": 6.019766397124888e-06,
"loss": 2.7529,
"step": 1005
},
{
"epoch": 0.42918088737201365,
"grad_norm": 22.029170989990234,
"learning_rate": 6.015274034141959e-06,
"loss": 2.2666,
"step": 1006
},
{
"epoch": 0.4296075085324232,
"grad_norm": 32.57006072998047,
"learning_rate": 6.010781671159031e-06,
"loss": 2.4951,
"step": 1007
},
{
"epoch": 0.4300341296928328,
"grad_norm": 21.854209899902344,
"learning_rate": 6.006289308176101e-06,
"loss": 2.7588,
"step": 1008
},
{
"epoch": 0.4304607508532423,
"grad_norm": 24.148046493530273,
"learning_rate": 6.001796945193173e-06,
"loss": 2.6313,
"step": 1009
},
{
"epoch": 0.4308873720136519,
"grad_norm": 25.809846878051758,
"learning_rate": 5.997304582210243e-06,
"loss": 2.6152,
"step": 1010
},
{
"epoch": 0.43131399317406144,
"grad_norm": 19.045398712158203,
"learning_rate": 5.992812219227314e-06,
"loss": 2.416,
"step": 1011
},
{
"epoch": 0.431740614334471,
"grad_norm": 21.391023635864258,
"learning_rate": 5.988319856244384e-06,
"loss": 2.3564,
"step": 1012
},
{
"epoch": 0.43216723549488056,
"grad_norm": 44.32109451293945,
"learning_rate": 5.983827493261456e-06,
"loss": 2.8799,
"step": 1013
},
{
"epoch": 0.4325938566552901,
"grad_norm": 16.042848587036133,
"learning_rate": 5.979335130278527e-06,
"loss": 2.3809,
"step": 1014
},
{
"epoch": 0.43302047781569963,
"grad_norm": 40.12324523925781,
"learning_rate": 5.974842767295598e-06,
"loss": 2.1445,
"step": 1015
},
{
"epoch": 0.4334470989761092,
"grad_norm": 16.52766227722168,
"learning_rate": 5.970350404312669e-06,
"loss": 2.4209,
"step": 1016
},
{
"epoch": 0.43387372013651876,
"grad_norm": 30.28310775756836,
"learning_rate": 5.965858041329739e-06,
"loss": 2.2705,
"step": 1017
},
{
"epoch": 0.43430034129692835,
"grad_norm": 63.12556457519531,
"learning_rate": 5.961365678346811e-06,
"loss": 2.4072,
"step": 1018
},
{
"epoch": 0.4347269624573379,
"grad_norm": 30.52492904663086,
"learning_rate": 5.9568733153638815e-06,
"loss": 3.2734,
"step": 1019
},
{
"epoch": 0.4351535836177474,
"grad_norm": 16.001874923706055,
"learning_rate": 5.9523809523809525e-06,
"loss": 2.207,
"step": 1020
},
{
"epoch": 0.435580204778157,
"grad_norm": 27.28630828857422,
"learning_rate": 5.9478885893980244e-06,
"loss": 2.6582,
"step": 1021
},
{
"epoch": 0.43600682593856654,
"grad_norm": 20.705059051513672,
"learning_rate": 5.943396226415095e-06,
"loss": 2.3154,
"step": 1022
},
{
"epoch": 0.43643344709897613,
"grad_norm": 25.739479064941406,
"learning_rate": 5.938903863432166e-06,
"loss": 2.8486,
"step": 1023
},
{
"epoch": 0.43686006825938567,
"grad_norm": 24.382295608520508,
"learning_rate": 5.934411500449237e-06,
"loss": 2.4648,
"step": 1024
},
{
"epoch": 0.4372866894197952,
"grad_norm": 39.85039138793945,
"learning_rate": 5.929919137466308e-06,
"loss": 2.2539,
"step": 1025
},
{
"epoch": 0.4377133105802048,
"grad_norm": 24.816200256347656,
"learning_rate": 5.925426774483378e-06,
"loss": 2.377,
"step": 1026
},
{
"epoch": 0.43813993174061433,
"grad_norm": 38.931419372558594,
"learning_rate": 5.92093441150045e-06,
"loss": 2.5742,
"step": 1027
},
{
"epoch": 0.43856655290102387,
"grad_norm": 19.670873641967773,
"learning_rate": 5.916442048517521e-06,
"loss": 2.4814,
"step": 1028
},
{
"epoch": 0.43899317406143346,
"grad_norm": 18.52810287475586,
"learning_rate": 5.911949685534591e-06,
"loss": 2.8203,
"step": 1029
},
{
"epoch": 0.439419795221843,
"grad_norm": 26.418214797973633,
"learning_rate": 5.907457322551663e-06,
"loss": 2.2998,
"step": 1030
},
{
"epoch": 0.4398464163822526,
"grad_norm": 19.35175895690918,
"learning_rate": 5.902964959568733e-06,
"loss": 2.8086,
"step": 1031
},
{
"epoch": 0.4402730375426621,
"grad_norm": 46.12089157104492,
"learning_rate": 5.898472596585805e-06,
"loss": 2.4053,
"step": 1032
},
{
"epoch": 0.44069965870307165,
"grad_norm": 19.147022247314453,
"learning_rate": 5.893980233602875e-06,
"loss": 2.4072,
"step": 1033
},
{
"epoch": 0.44112627986348124,
"grad_norm": 17.690813064575195,
"learning_rate": 5.889487870619946e-06,
"loss": 2.7412,
"step": 1034
},
{
"epoch": 0.4415529010238908,
"grad_norm": 55.85981750488281,
"learning_rate": 5.884995507637018e-06,
"loss": 2.5459,
"step": 1035
},
{
"epoch": 0.44197952218430037,
"grad_norm": 19.12326431274414,
"learning_rate": 5.880503144654088e-06,
"loss": 2.3145,
"step": 1036
},
{
"epoch": 0.4424061433447099,
"grad_norm": 22.143829345703125,
"learning_rate": 5.8760107816711595e-06,
"loss": 2.1816,
"step": 1037
},
{
"epoch": 0.44283276450511944,
"grad_norm": 16.12741470336914,
"learning_rate": 5.8715184186882305e-06,
"loss": 2.2637,
"step": 1038
},
{
"epoch": 0.443259385665529,
"grad_norm": 24.46463966369629,
"learning_rate": 5.8670260557053015e-06,
"loss": 2.6064,
"step": 1039
},
{
"epoch": 0.44368600682593856,
"grad_norm": 17.870634078979492,
"learning_rate": 5.862533692722372e-06,
"loss": 2.2676,
"step": 1040
},
{
"epoch": 0.4441126279863481,
"grad_norm": 55.32482147216797,
"learning_rate": 5.858041329739444e-06,
"loss": 3.1143,
"step": 1041
},
{
"epoch": 0.4445392491467577,
"grad_norm": 18.651668548583984,
"learning_rate": 5.853548966756515e-06,
"loss": 2.3745,
"step": 1042
},
{
"epoch": 0.4449658703071672,
"grad_norm": 38.92100524902344,
"learning_rate": 5.849056603773585e-06,
"loss": 3.0059,
"step": 1043
},
{
"epoch": 0.4453924914675768,
"grad_norm": 24.060253143310547,
"learning_rate": 5.844564240790657e-06,
"loss": 2.9922,
"step": 1044
},
{
"epoch": 0.44581911262798635,
"grad_norm": 19.545761108398438,
"learning_rate": 5.840071877807727e-06,
"loss": 2.333,
"step": 1045
},
{
"epoch": 0.4462457337883959,
"grad_norm": 31.55517578125,
"learning_rate": 5.835579514824799e-06,
"loss": 3.0957,
"step": 1046
},
{
"epoch": 0.4466723549488055,
"grad_norm": 18.396970748901367,
"learning_rate": 5.831087151841869e-06,
"loss": 1.8457,
"step": 1047
},
{
"epoch": 0.447098976109215,
"grad_norm": 27.257366180419922,
"learning_rate": 5.82659478885894e-06,
"loss": 2.8574,
"step": 1048
},
{
"epoch": 0.4475255972696246,
"grad_norm": 21.9024658203125,
"learning_rate": 5.822102425876012e-06,
"loss": 2.2686,
"step": 1049
},
{
"epoch": 0.44795221843003413,
"grad_norm": 22.36772918701172,
"learning_rate": 5.817610062893082e-06,
"loss": 2.4717,
"step": 1050
},
{
"epoch": 0.44837883959044367,
"grad_norm": 24.611875534057617,
"learning_rate": 5.813117699910153e-06,
"loss": 2.5195,
"step": 1051
},
{
"epoch": 0.44880546075085326,
"grad_norm": 18.70650863647461,
"learning_rate": 5.808625336927224e-06,
"loss": 2.3916,
"step": 1052
},
{
"epoch": 0.4492320819112628,
"grad_norm": 15.388991355895996,
"learning_rate": 5.804132973944295e-06,
"loss": 2.4912,
"step": 1053
},
{
"epoch": 0.4496587030716723,
"grad_norm": 21.341957092285156,
"learning_rate": 5.7996406109613655e-06,
"loss": 2.6201,
"step": 1054
},
{
"epoch": 0.4500853242320819,
"grad_norm": 22.18140411376953,
"learning_rate": 5.795148247978437e-06,
"loss": 2.8838,
"step": 1055
},
{
"epoch": 0.45051194539249145,
"grad_norm": 17.50536346435547,
"learning_rate": 5.7906558849955085e-06,
"loss": 2.6729,
"step": 1056
},
{
"epoch": 0.45093856655290104,
"grad_norm": 21.7335205078125,
"learning_rate": 5.786163522012579e-06,
"loss": 2.666,
"step": 1057
},
{
"epoch": 0.4513651877133106,
"grad_norm": 24.962146759033203,
"learning_rate": 5.7816711590296505e-06,
"loss": 2.1211,
"step": 1058
},
{
"epoch": 0.4517918088737201,
"grad_norm": 20.209169387817383,
"learning_rate": 5.777178796046721e-06,
"loss": 2.5088,
"step": 1059
},
{
"epoch": 0.4522184300341297,
"grad_norm": 18.95348358154297,
"learning_rate": 5.772686433063792e-06,
"loss": 2.6123,
"step": 1060
},
{
"epoch": 0.45264505119453924,
"grad_norm": 16.449033737182617,
"learning_rate": 5.768194070080863e-06,
"loss": 2.29,
"step": 1061
},
{
"epoch": 0.45307167235494883,
"grad_norm": 19.58086585998535,
"learning_rate": 5.763701707097934e-06,
"loss": 2.4629,
"step": 1062
},
{
"epoch": 0.45349829351535836,
"grad_norm": 28.654550552368164,
"learning_rate": 5.759209344115006e-06,
"loss": 2.5645,
"step": 1063
},
{
"epoch": 0.4539249146757679,
"grad_norm": 38.15443801879883,
"learning_rate": 5.754716981132076e-06,
"loss": 2.3252,
"step": 1064
},
{
"epoch": 0.4543515358361775,
"grad_norm": 18.46957778930664,
"learning_rate": 5.750224618149147e-06,
"loss": 2.9434,
"step": 1065
},
{
"epoch": 0.454778156996587,
"grad_norm": 30.695247650146484,
"learning_rate": 5.745732255166217e-06,
"loss": 3.1504,
"step": 1066
},
{
"epoch": 0.4552047781569966,
"grad_norm": 20.049087524414062,
"learning_rate": 5.741239892183289e-06,
"loss": 2.0674,
"step": 1067
},
{
"epoch": 0.45563139931740615,
"grad_norm": 17.473054885864258,
"learning_rate": 5.736747529200359e-06,
"loss": 2.3477,
"step": 1068
},
{
"epoch": 0.4560580204778157,
"grad_norm": 20.370532989501953,
"learning_rate": 5.732255166217431e-06,
"loss": 2.1992,
"step": 1069
},
{
"epoch": 0.4564846416382253,
"grad_norm": 24.673768997192383,
"learning_rate": 5.727762803234502e-06,
"loss": 2.333,
"step": 1070
},
{
"epoch": 0.4569112627986348,
"grad_norm": 26.14463233947754,
"learning_rate": 5.723270440251572e-06,
"loss": 2.3564,
"step": 1071
},
{
"epoch": 0.45733788395904434,
"grad_norm": 20.086162567138672,
"learning_rate": 5.718778077268644e-06,
"loss": 2.4512,
"step": 1072
},
{
"epoch": 0.45776450511945393,
"grad_norm": 20.043794631958008,
"learning_rate": 5.7142857142857145e-06,
"loss": 2.8096,
"step": 1073
},
{
"epoch": 0.45819112627986347,
"grad_norm": 40.026126861572266,
"learning_rate": 5.7097933513027856e-06,
"loss": 2.2119,
"step": 1074
},
{
"epoch": 0.45861774744027306,
"grad_norm": 25.369979858398438,
"learning_rate": 5.705300988319857e-06,
"loss": 2.5371,
"step": 1075
},
{
"epoch": 0.4590443686006826,
"grad_norm": 22.0860538482666,
"learning_rate": 5.700808625336928e-06,
"loss": 2.9932,
"step": 1076
},
{
"epoch": 0.45947098976109213,
"grad_norm": 31.264373779296875,
"learning_rate": 5.6963162623539995e-06,
"loss": 2.9736,
"step": 1077
},
{
"epoch": 0.4598976109215017,
"grad_norm": 59.6558723449707,
"learning_rate": 5.69182389937107e-06,
"loss": 2.7861,
"step": 1078
},
{
"epoch": 0.46032423208191126,
"grad_norm": 20.534461975097656,
"learning_rate": 5.687331536388141e-06,
"loss": 3.2305,
"step": 1079
},
{
"epoch": 0.46075085324232085,
"grad_norm": 20.675729751586914,
"learning_rate": 5.682839173405211e-06,
"loss": 2.6768,
"step": 1080
},
{
"epoch": 0.4611774744027304,
"grad_norm": 36.32667922973633,
"learning_rate": 5.678346810422283e-06,
"loss": 2.7354,
"step": 1081
},
{
"epoch": 0.4616040955631399,
"grad_norm": 18.81818199157715,
"learning_rate": 5.673854447439353e-06,
"loss": 2.4863,
"step": 1082
},
{
"epoch": 0.4620307167235495,
"grad_norm": 22.678890228271484,
"learning_rate": 5.669362084456425e-06,
"loss": 2.2812,
"step": 1083
},
{
"epoch": 0.46245733788395904,
"grad_norm": 52.0784912109375,
"learning_rate": 5.664869721473496e-06,
"loss": 2.6211,
"step": 1084
},
{
"epoch": 0.4628839590443686,
"grad_norm": 16.65648078918457,
"learning_rate": 5.660377358490566e-06,
"loss": 2.6953,
"step": 1085
},
{
"epoch": 0.46331058020477817,
"grad_norm": 15.564534187316895,
"learning_rate": 5.655884995507638e-06,
"loss": 2.5234,
"step": 1086
},
{
"epoch": 0.4637372013651877,
"grad_norm": 14.748406410217285,
"learning_rate": 5.651392632524708e-06,
"loss": 2.1904,
"step": 1087
},
{
"epoch": 0.4641638225255973,
"grad_norm": 36.08272933959961,
"learning_rate": 5.646900269541779e-06,
"loss": 2.499,
"step": 1088
},
{
"epoch": 0.4645904436860068,
"grad_norm": 31.064861297607422,
"learning_rate": 5.64240790655885e-06,
"loss": 2.5146,
"step": 1089
},
{
"epoch": 0.46501706484641636,
"grad_norm": 20.036354064941406,
"learning_rate": 5.637915543575921e-06,
"loss": 2.8281,
"step": 1090
},
{
"epoch": 0.46544368600682595,
"grad_norm": 16.89986801147461,
"learning_rate": 5.6334231805929925e-06,
"loss": 2.3643,
"step": 1091
},
{
"epoch": 0.4658703071672355,
"grad_norm": 21.826248168945312,
"learning_rate": 5.6289308176100635e-06,
"loss": 2.0293,
"step": 1092
},
{
"epoch": 0.4662969283276451,
"grad_norm": 23.403091430664062,
"learning_rate": 5.6244384546271346e-06,
"loss": 2.0586,
"step": 1093
},
{
"epoch": 0.4667235494880546,
"grad_norm": 19.01036834716797,
"learning_rate": 5.619946091644205e-06,
"loss": 2.2705,
"step": 1094
},
{
"epoch": 0.46715017064846415,
"grad_norm": 40.55333709716797,
"learning_rate": 5.615453728661277e-06,
"loss": 2.6978,
"step": 1095
},
{
"epoch": 0.46757679180887374,
"grad_norm": 29.737985610961914,
"learning_rate": 5.610961365678347e-06,
"loss": 2.5518,
"step": 1096
},
{
"epoch": 0.4680034129692833,
"grad_norm": 23.35071563720703,
"learning_rate": 5.606469002695418e-06,
"loss": 2.7148,
"step": 1097
},
{
"epoch": 0.4684300341296928,
"grad_norm": 23.68649673461914,
"learning_rate": 5.60197663971249e-06,
"loss": 2.626,
"step": 1098
},
{
"epoch": 0.4688566552901024,
"grad_norm": 31.265790939331055,
"learning_rate": 5.59748427672956e-06,
"loss": 3.2021,
"step": 1099
},
{
"epoch": 0.46928327645051193,
"grad_norm": 28.331497192382812,
"learning_rate": 5.592991913746632e-06,
"loss": 2.165,
"step": 1100
},
{
"epoch": 0.4697098976109215,
"grad_norm": 21.12152862548828,
"learning_rate": 5.588499550763702e-06,
"loss": 2.186,
"step": 1101
},
{
"epoch": 0.47013651877133106,
"grad_norm": 22.582916259765625,
"learning_rate": 5.584007187780773e-06,
"loss": 2.5117,
"step": 1102
},
{
"epoch": 0.4705631399317406,
"grad_norm": 19.8083438873291,
"learning_rate": 5.579514824797843e-06,
"loss": 2.0576,
"step": 1103
},
{
"epoch": 0.4709897610921502,
"grad_norm": 25.670671463012695,
"learning_rate": 5.575022461814915e-06,
"loss": 2.7871,
"step": 1104
},
{
"epoch": 0.4714163822525597,
"grad_norm": 26.358726501464844,
"learning_rate": 5.570530098831986e-06,
"loss": 2.3057,
"step": 1105
},
{
"epoch": 0.4718430034129693,
"grad_norm": 15.725049018859863,
"learning_rate": 5.566037735849057e-06,
"loss": 2.6221,
"step": 1106
},
{
"epoch": 0.47226962457337884,
"grad_norm": 32.615047454833984,
"learning_rate": 5.561545372866128e-06,
"loss": 2.1543,
"step": 1107
},
{
"epoch": 0.4726962457337884,
"grad_norm": 19.219602584838867,
"learning_rate": 5.5570530098831985e-06,
"loss": 2.4561,
"step": 1108
},
{
"epoch": 0.47312286689419797,
"grad_norm": 18.56755828857422,
"learning_rate": 5.55256064690027e-06,
"loss": 2.2778,
"step": 1109
},
{
"epoch": 0.4735494880546075,
"grad_norm": 15.861446380615234,
"learning_rate": 5.548068283917341e-06,
"loss": 2.4023,
"step": 1110
},
{
"epoch": 0.47397610921501704,
"grad_norm": 34.29450225830078,
"learning_rate": 5.543575920934412e-06,
"loss": 2.4219,
"step": 1111
},
{
"epoch": 0.47440273037542663,
"grad_norm": 28.189361572265625,
"learning_rate": 5.5390835579514835e-06,
"loss": 2.7207,
"step": 1112
},
{
"epoch": 0.47482935153583616,
"grad_norm": 34.98194885253906,
"learning_rate": 5.534591194968554e-06,
"loss": 2.5635,
"step": 1113
},
{
"epoch": 0.47525597269624575,
"grad_norm": 16.712553024291992,
"learning_rate": 5.530098831985626e-06,
"loss": 2.1338,
"step": 1114
},
{
"epoch": 0.4756825938566553,
"grad_norm": 24.47882843017578,
"learning_rate": 5.525606469002696e-06,
"loss": 2.1953,
"step": 1115
},
{
"epoch": 0.4761092150170648,
"grad_norm": 42.815486907958984,
"learning_rate": 5.521114106019767e-06,
"loss": 2.4131,
"step": 1116
},
{
"epoch": 0.4765358361774744,
"grad_norm": 27.543291091918945,
"learning_rate": 5.516621743036837e-06,
"loss": 2.3418,
"step": 1117
},
{
"epoch": 0.47696245733788395,
"grad_norm": 26.759994506835938,
"learning_rate": 5.512129380053909e-06,
"loss": 3.208,
"step": 1118
},
{
"epoch": 0.47738907849829354,
"grad_norm": 27.660327911376953,
"learning_rate": 5.50763701707098e-06,
"loss": 2.2578,
"step": 1119
},
{
"epoch": 0.4778156996587031,
"grad_norm": 20.498302459716797,
"learning_rate": 5.503144654088051e-06,
"loss": 2.8809,
"step": 1120
},
{
"epoch": 0.4782423208191126,
"grad_norm": 33.10205841064453,
"learning_rate": 5.498652291105122e-06,
"loss": 2.8047,
"step": 1121
},
{
"epoch": 0.4786689419795222,
"grad_norm": 18.99040985107422,
"learning_rate": 5.494159928122192e-06,
"loss": 2.4971,
"step": 1122
},
{
"epoch": 0.47909556313993173,
"grad_norm": 20.12007713317871,
"learning_rate": 5.489667565139264e-06,
"loss": 2.4883,
"step": 1123
},
{
"epoch": 0.47952218430034127,
"grad_norm": 34.227657318115234,
"learning_rate": 5.485175202156334e-06,
"loss": 2.0781,
"step": 1124
},
{
"epoch": 0.47994880546075086,
"grad_norm": 18.170772552490234,
"learning_rate": 5.4806828391734054e-06,
"loss": 2.3389,
"step": 1125
},
{
"epoch": 0.4803754266211604,
"grad_norm": 22.892549514770508,
"learning_rate": 5.476190476190477e-06,
"loss": 2.0605,
"step": 1126
},
{
"epoch": 0.48080204778157,
"grad_norm": 14.68527889251709,
"learning_rate": 5.4716981132075475e-06,
"loss": 2.248,
"step": 1127
},
{
"epoch": 0.4812286689419795,
"grad_norm": 23.0800838470459,
"learning_rate": 5.4672057502246186e-06,
"loss": 2.1816,
"step": 1128
},
{
"epoch": 0.48165529010238906,
"grad_norm": 28.375272750854492,
"learning_rate": 5.46271338724169e-06,
"loss": 2.5811,
"step": 1129
},
{
"epoch": 0.48208191126279865,
"grad_norm": 64.8277587890625,
"learning_rate": 5.458221024258761e-06,
"loss": 3.0557,
"step": 1130
},
{
"epoch": 0.4825085324232082,
"grad_norm": 52.17132568359375,
"learning_rate": 5.453728661275831e-06,
"loss": 2.6113,
"step": 1131
},
{
"epoch": 0.48293515358361777,
"grad_norm": 19.48263168334961,
"learning_rate": 5.449236298292903e-06,
"loss": 2.0498,
"step": 1132
},
{
"epoch": 0.4833617747440273,
"grad_norm": 18.925079345703125,
"learning_rate": 5.444743935309974e-06,
"loss": 2.9893,
"step": 1133
},
{
"epoch": 0.48378839590443684,
"grad_norm": 19.89716148376465,
"learning_rate": 5.440251572327044e-06,
"loss": 2.7705,
"step": 1134
},
{
"epoch": 0.48421501706484643,
"grad_norm": 28.375167846679688,
"learning_rate": 5.435759209344116e-06,
"loss": 2.8643,
"step": 1135
},
{
"epoch": 0.48464163822525597,
"grad_norm": 22.519615173339844,
"learning_rate": 5.431266846361186e-06,
"loss": 2.5166,
"step": 1136
},
{
"epoch": 0.48506825938566556,
"grad_norm": 16.866195678710938,
"learning_rate": 5.426774483378258e-06,
"loss": 2.5,
"step": 1137
},
{
"epoch": 0.4854948805460751,
"grad_norm": 36.9367790222168,
"learning_rate": 5.422282120395328e-06,
"loss": 2.8838,
"step": 1138
},
{
"epoch": 0.4859215017064846,
"grad_norm": 51.1767463684082,
"learning_rate": 5.417789757412399e-06,
"loss": 2.6123,
"step": 1139
},
{
"epoch": 0.4863481228668942,
"grad_norm": 18.42434310913086,
"learning_rate": 5.413297394429471e-06,
"loss": 2.2568,
"step": 1140
},
{
"epoch": 0.48677474402730375,
"grad_norm": 20.477943420410156,
"learning_rate": 5.408805031446541e-06,
"loss": 2.4121,
"step": 1141
},
{
"epoch": 0.4872013651877133,
"grad_norm": 23.394515991210938,
"learning_rate": 5.404312668463612e-06,
"loss": 2.3174,
"step": 1142
},
{
"epoch": 0.4876279863481229,
"grad_norm": 15.655838966369629,
"learning_rate": 5.399820305480683e-06,
"loss": 2.4307,
"step": 1143
},
{
"epoch": 0.4880546075085324,
"grad_norm": 17.30601692199707,
"learning_rate": 5.3953279424977544e-06,
"loss": 2.1406,
"step": 1144
},
{
"epoch": 0.488481228668942,
"grad_norm": 17.408594131469727,
"learning_rate": 5.390835579514825e-06,
"loss": 2.4082,
"step": 1145
},
{
"epoch": 0.48890784982935154,
"grad_norm": 20.133886337280273,
"learning_rate": 5.3863432165318965e-06,
"loss": 2.4775,
"step": 1146
},
{
"epoch": 0.4893344709897611,
"grad_norm": 28.761140823364258,
"learning_rate": 5.3818508535489676e-06,
"loss": 2.376,
"step": 1147
},
{
"epoch": 0.48976109215017066,
"grad_norm": 20.112117767333984,
"learning_rate": 5.377358490566038e-06,
"loss": 2.5723,
"step": 1148
},
{
"epoch": 0.4901877133105802,
"grad_norm": 33.22997283935547,
"learning_rate": 5.37286612758311e-06,
"loss": 2.4688,
"step": 1149
},
{
"epoch": 0.4906143344709898,
"grad_norm": 31.31877899169922,
"learning_rate": 5.36837376460018e-06,
"loss": 2.4941,
"step": 1150
},
{
"epoch": 0.4910409556313993,
"grad_norm": 27.41322135925293,
"learning_rate": 5.363881401617252e-06,
"loss": 3.0059,
"step": 1151
},
{
"epoch": 0.49146757679180886,
"grad_norm": 23.11745262145996,
"learning_rate": 5.359389038634322e-06,
"loss": 2.6611,
"step": 1152
},
{
"epoch": 0.49189419795221845,
"grad_norm": 22.89065933227539,
"learning_rate": 5.354896675651393e-06,
"loss": 2.7139,
"step": 1153
},
{
"epoch": 0.492320819112628,
"grad_norm": 17.799470901489258,
"learning_rate": 5.350404312668463e-06,
"loss": 2.0234,
"step": 1154
},
{
"epoch": 0.4927474402730375,
"grad_norm": 24.692134857177734,
"learning_rate": 5.345911949685535e-06,
"loss": 2.5801,
"step": 1155
},
{
"epoch": 0.4931740614334471,
"grad_norm": 23.89969825744629,
"learning_rate": 5.341419586702606e-06,
"loss": 2.4844,
"step": 1156
},
{
"epoch": 0.49360068259385664,
"grad_norm": 18.586627960205078,
"learning_rate": 5.336927223719677e-06,
"loss": 2.125,
"step": 1157
},
{
"epoch": 0.49402730375426623,
"grad_norm": 25.83800506591797,
"learning_rate": 5.332434860736748e-06,
"loss": 2.7432,
"step": 1158
},
{
"epoch": 0.49445392491467577,
"grad_norm": 18.455707550048828,
"learning_rate": 5.327942497753818e-06,
"loss": 2.458,
"step": 1159
},
{
"epoch": 0.4948805460750853,
"grad_norm": 29.522663116455078,
"learning_rate": 5.32345013477089e-06,
"loss": 2.7256,
"step": 1160
},
{
"epoch": 0.4953071672354949,
"grad_norm": 33.990966796875,
"learning_rate": 5.3189577717879605e-06,
"loss": 2.5508,
"step": 1161
},
{
"epoch": 0.49573378839590443,
"grad_norm": 37.19002914428711,
"learning_rate": 5.3144654088050315e-06,
"loss": 2.1768,
"step": 1162
},
{
"epoch": 0.496160409556314,
"grad_norm": 17.45855712890625,
"learning_rate": 5.3099730458221034e-06,
"loss": 2.3623,
"step": 1163
},
{
"epoch": 0.49658703071672355,
"grad_norm": 21.004287719726562,
"learning_rate": 5.305480682839174e-06,
"loss": 2.5498,
"step": 1164
},
{
"epoch": 0.4970136518771331,
"grad_norm": 18.135629653930664,
"learning_rate": 5.300988319856245e-06,
"loss": 2.5381,
"step": 1165
},
{
"epoch": 0.4974402730375427,
"grad_norm": 54.42517852783203,
"learning_rate": 5.296495956873316e-06,
"loss": 3.4648,
"step": 1166
},
{
"epoch": 0.4978668941979522,
"grad_norm": 23.84809684753418,
"learning_rate": 5.292003593890387e-06,
"loss": 2.7002,
"step": 1167
},
{
"epoch": 0.49829351535836175,
"grad_norm": 37.697021484375,
"learning_rate": 5.287511230907457e-06,
"loss": 2.8711,
"step": 1168
},
{
"epoch": 0.49872013651877134,
"grad_norm": 50.69107437133789,
"learning_rate": 5.283018867924529e-06,
"loss": 2.2715,
"step": 1169
},
{
"epoch": 0.4991467576791809,
"grad_norm": 32.984683990478516,
"learning_rate": 5.2785265049416e-06,
"loss": 2.6553,
"step": 1170
},
{
"epoch": 0.49957337883959047,
"grad_norm": 19.88435173034668,
"learning_rate": 5.27403414195867e-06,
"loss": 2.3164,
"step": 1171
},
{
"epoch": 0.5,
"grad_norm": 27.762235641479492,
"learning_rate": 5.269541778975742e-06,
"loss": 2.9717,
"step": 1172
},
{
"epoch": 0.5004266211604096,
"grad_norm": 17.994325637817383,
"learning_rate": 5.265049415992812e-06,
"loss": 2.2549,
"step": 1173
},
{
"epoch": 0.5008532423208191,
"grad_norm": 16.808128356933594,
"learning_rate": 5.260557053009884e-06,
"loss": 2.4434,
"step": 1174
},
{
"epoch": 0.5012798634812287,
"grad_norm": 19.83344268798828,
"learning_rate": 5.256064690026954e-06,
"loss": 2.291,
"step": 1175
},
{
"epoch": 0.5017064846416383,
"grad_norm": 18.270017623901367,
"learning_rate": 5.251572327044025e-06,
"loss": 2.6992,
"step": 1176
},
{
"epoch": 0.5021331058020477,
"grad_norm": 23.939945220947266,
"learning_rate": 5.247079964061097e-06,
"loss": 2.1406,
"step": 1177
},
{
"epoch": 0.5025597269624573,
"grad_norm": 15.736184120178223,
"learning_rate": 5.242587601078167e-06,
"loss": 2.1777,
"step": 1178
},
{
"epoch": 0.5029863481228669,
"grad_norm": 26.24043083190918,
"learning_rate": 5.2380952380952384e-06,
"loss": 2.6523,
"step": 1179
},
{
"epoch": 0.5034129692832765,
"grad_norm": 20.484760284423828,
"learning_rate": 5.2336028751123095e-06,
"loss": 2.6504,
"step": 1180
},
{
"epoch": 0.503839590443686,
"grad_norm": 20.004854202270508,
"learning_rate": 5.2291105121293805e-06,
"loss": 2.0503,
"step": 1181
},
{
"epoch": 0.5042662116040956,
"grad_norm": 21.367979049682617,
"learning_rate": 5.224618149146451e-06,
"loss": 2.0713,
"step": 1182
},
{
"epoch": 0.5046928327645052,
"grad_norm": 22.656190872192383,
"learning_rate": 5.220125786163523e-06,
"loss": 2.3164,
"step": 1183
},
{
"epoch": 0.5051194539249146,
"grad_norm": 19.639827728271484,
"learning_rate": 5.215633423180594e-06,
"loss": 2.1338,
"step": 1184
},
{
"epoch": 0.5055460750853242,
"grad_norm": 36.95730209350586,
"learning_rate": 5.211141060197664e-06,
"loss": 2.1045,
"step": 1185
},
{
"epoch": 0.5059726962457338,
"grad_norm": 25.0295467376709,
"learning_rate": 5.206648697214736e-06,
"loss": 2.5039,
"step": 1186
},
{
"epoch": 0.5063993174061433,
"grad_norm": 31.372411727905273,
"learning_rate": 5.202156334231806e-06,
"loss": 2.5654,
"step": 1187
},
{
"epoch": 0.5068259385665529,
"grad_norm": 35.015716552734375,
"learning_rate": 5.197663971248878e-06,
"loss": 2.2344,
"step": 1188
},
{
"epoch": 0.5072525597269625,
"grad_norm": 34.40018844604492,
"learning_rate": 5.193171608265948e-06,
"loss": 2.6162,
"step": 1189
},
{
"epoch": 0.507679180887372,
"grad_norm": 27.553220748901367,
"learning_rate": 5.188679245283019e-06,
"loss": 2.5781,
"step": 1190
},
{
"epoch": 0.5081058020477816,
"grad_norm": 29.57081413269043,
"learning_rate": 5.184186882300091e-06,
"loss": 2.8047,
"step": 1191
},
{
"epoch": 0.5085324232081911,
"grad_norm": 29.047510147094727,
"learning_rate": 5.179694519317161e-06,
"loss": 3.1738,
"step": 1192
},
{
"epoch": 0.5089590443686007,
"grad_norm": 22.855804443359375,
"learning_rate": 5.175202156334232e-06,
"loss": 2.5459,
"step": 1193
},
{
"epoch": 0.5093856655290102,
"grad_norm": 16.29975700378418,
"learning_rate": 5.170709793351303e-06,
"loss": 2.3008,
"step": 1194
},
{
"epoch": 0.5098122866894198,
"grad_norm": 18.54262351989746,
"learning_rate": 5.166217430368374e-06,
"loss": 2.2949,
"step": 1195
},
{
"epoch": 0.5102389078498294,
"grad_norm": 21.953824996948242,
"learning_rate": 5.1617250673854445e-06,
"loss": 2.6367,
"step": 1196
},
{
"epoch": 0.5106655290102389,
"grad_norm": 16.686342239379883,
"learning_rate": 5.157232704402516e-06,
"loss": 2.2246,
"step": 1197
},
{
"epoch": 0.5110921501706485,
"grad_norm": 20.373126983642578,
"learning_rate": 5.1527403414195874e-06,
"loss": 2.7695,
"step": 1198
},
{
"epoch": 0.511518771331058,
"grad_norm": 19.9268798828125,
"learning_rate": 5.148247978436658e-06,
"loss": 2.6016,
"step": 1199
},
{
"epoch": 0.5119453924914675,
"grad_norm": 16.246139526367188,
"learning_rate": 5.1437556154537295e-06,
"loss": 2.4639,
"step": 1200
},
{
"epoch": 0.5123720136518771,
"grad_norm": 25.90472984313965,
"learning_rate": 5.1392632524708e-06,
"loss": 2.5928,
"step": 1201
},
{
"epoch": 0.5127986348122867,
"grad_norm": 20.83331871032715,
"learning_rate": 5.134770889487871e-06,
"loss": 2.583,
"step": 1202
},
{
"epoch": 0.5132252559726962,
"grad_norm": 28.974882125854492,
"learning_rate": 5.130278526504942e-06,
"loss": 2.6738,
"step": 1203
},
{
"epoch": 0.5136518771331058,
"grad_norm": 16.286806106567383,
"learning_rate": 5.125786163522013e-06,
"loss": 2.3604,
"step": 1204
},
{
"epoch": 0.5140784982935154,
"grad_norm": 27.790830612182617,
"learning_rate": 5.121293800539085e-06,
"loss": 2.1895,
"step": 1205
},
{
"epoch": 0.514505119453925,
"grad_norm": 18.570655822753906,
"learning_rate": 5.116801437556155e-06,
"loss": 2.4883,
"step": 1206
},
{
"epoch": 0.5149317406143344,
"grad_norm": 39.29001235961914,
"learning_rate": 5.112309074573226e-06,
"loss": 2.6777,
"step": 1207
},
{
"epoch": 0.515358361774744,
"grad_norm": 18.890174865722656,
"learning_rate": 5.107816711590296e-06,
"loss": 2.5215,
"step": 1208
},
{
"epoch": 0.5157849829351536,
"grad_norm": 24.600017547607422,
"learning_rate": 5.103324348607368e-06,
"loss": 2.3481,
"step": 1209
},
{
"epoch": 0.5162116040955631,
"grad_norm": 22.012611389160156,
"learning_rate": 5.098831985624438e-06,
"loss": 2.374,
"step": 1210
},
{
"epoch": 0.5166382252559727,
"grad_norm": 17.54711151123047,
"learning_rate": 5.09433962264151e-06,
"loss": 2.2744,
"step": 1211
},
{
"epoch": 0.5170648464163823,
"grad_norm": 25.47920799255371,
"learning_rate": 5.089847259658581e-06,
"loss": 2.9141,
"step": 1212
},
{
"epoch": 0.5174914675767918,
"grad_norm": 18.82390022277832,
"learning_rate": 5.085354896675651e-06,
"loss": 2.1562,
"step": 1213
},
{
"epoch": 0.5179180887372014,
"grad_norm": 52.312286376953125,
"learning_rate": 5.080862533692723e-06,
"loss": 3.0713,
"step": 1214
},
{
"epoch": 0.518344709897611,
"grad_norm": 27.089868545532227,
"learning_rate": 5.0763701707097935e-06,
"loss": 2.6895,
"step": 1215
},
{
"epoch": 0.5187713310580204,
"grad_norm": 20.7680606842041,
"learning_rate": 5.0718778077268645e-06,
"loss": 2.6787,
"step": 1216
},
{
"epoch": 0.51919795221843,
"grad_norm": 31.512439727783203,
"learning_rate": 5.067385444743936e-06,
"loss": 2.334,
"step": 1217
},
{
"epoch": 0.5196245733788396,
"grad_norm": 29.377948760986328,
"learning_rate": 5.062893081761007e-06,
"loss": 2.4209,
"step": 1218
},
{
"epoch": 0.5200511945392492,
"grad_norm": 53.89030456542969,
"learning_rate": 5.0584007187780785e-06,
"loss": 2.2803,
"step": 1219
},
{
"epoch": 0.5204778156996587,
"grad_norm": 35.73152542114258,
"learning_rate": 5.053908355795149e-06,
"loss": 1.998,
"step": 1220
},
{
"epoch": 0.5209044368600683,
"grad_norm": 20.638343811035156,
"learning_rate": 5.04941599281222e-06,
"loss": 2.7383,
"step": 1221
},
{
"epoch": 0.5213310580204779,
"grad_norm": 29.040016174316406,
"learning_rate": 5.04492362982929e-06,
"loss": 2.7412,
"step": 1222
},
{
"epoch": 0.5217576791808873,
"grad_norm": 17.859210968017578,
"learning_rate": 5.040431266846362e-06,
"loss": 2.0283,
"step": 1223
},
{
"epoch": 0.5221843003412969,
"grad_norm": 24.831958770751953,
"learning_rate": 5.035938903863432e-06,
"loss": 2.3135,
"step": 1224
},
{
"epoch": 0.5226109215017065,
"grad_norm": 16.252965927124023,
"learning_rate": 5.031446540880504e-06,
"loss": 2.1006,
"step": 1225
},
{
"epoch": 0.523037542662116,
"grad_norm": 24.575307846069336,
"learning_rate": 5.026954177897575e-06,
"loss": 2.6299,
"step": 1226
},
{
"epoch": 0.5234641638225256,
"grad_norm": 17.893375396728516,
"learning_rate": 5.022461814914645e-06,
"loss": 2.5127,
"step": 1227
},
{
"epoch": 0.5238907849829352,
"grad_norm": 17.92720603942871,
"learning_rate": 5.017969451931717e-06,
"loss": 2.626,
"step": 1228
},
{
"epoch": 0.5243174061433447,
"grad_norm": 39.802059173583984,
"learning_rate": 5.013477088948787e-06,
"loss": 2.6738,
"step": 1229
},
{
"epoch": 0.5247440273037542,
"grad_norm": 30.07309341430664,
"learning_rate": 5.008984725965858e-06,
"loss": 2.6621,
"step": 1230
},
{
"epoch": 0.5251706484641638,
"grad_norm": 25.614131927490234,
"learning_rate": 5.004492362982929e-06,
"loss": 2.4473,
"step": 1231
},
{
"epoch": 0.5255972696245734,
"grad_norm": 16.77971839904785,
"learning_rate": 5e-06,
"loss": 2.0137,
"step": 1232
},
{
"epoch": 0.5260238907849829,
"grad_norm": 23.12045669555664,
"learning_rate": 4.9955076370170715e-06,
"loss": 2.2012,
"step": 1233
},
{
"epoch": 0.5264505119453925,
"grad_norm": 25.62958526611328,
"learning_rate": 4.9910152740341425e-06,
"loss": 2.2744,
"step": 1234
},
{
"epoch": 0.5268771331058021,
"grad_norm": 22.395214080810547,
"learning_rate": 4.986522911051213e-06,
"loss": 2.4043,
"step": 1235
},
{
"epoch": 0.5273037542662116,
"grad_norm": 19.13766098022461,
"learning_rate": 4.982030548068285e-06,
"loss": 2.334,
"step": 1236
},
{
"epoch": 0.5277303754266212,
"grad_norm": 21.669918060302734,
"learning_rate": 4.977538185085356e-06,
"loss": 2.7744,
"step": 1237
},
{
"epoch": 0.5281569965870307,
"grad_norm": 21.873449325561523,
"learning_rate": 4.973045822102427e-06,
"loss": 2.0127,
"step": 1238
},
{
"epoch": 0.5285836177474402,
"grad_norm": 23.759654998779297,
"learning_rate": 4.968553459119497e-06,
"loss": 2.1641,
"step": 1239
},
{
"epoch": 0.5290102389078498,
"grad_norm": 19.724685668945312,
"learning_rate": 4.964061096136568e-06,
"loss": 2.8232,
"step": 1240
},
{
"epoch": 0.5294368600682594,
"grad_norm": 24.578834533691406,
"learning_rate": 4.959568733153639e-06,
"loss": 2.9199,
"step": 1241
},
{
"epoch": 0.5298634812286689,
"grad_norm": 33.008914947509766,
"learning_rate": 4.95507637017071e-06,
"loss": 2.3926,
"step": 1242
},
{
"epoch": 0.5302901023890785,
"grad_norm": 26.207422256469727,
"learning_rate": 4.950584007187781e-06,
"loss": 2.1758,
"step": 1243
},
{
"epoch": 0.5307167235494881,
"grad_norm": 21.034513473510742,
"learning_rate": 4.946091644204852e-06,
"loss": 2.9932,
"step": 1244
},
{
"epoch": 0.5311433447098977,
"grad_norm": 16.45236587524414,
"learning_rate": 4.941599281221923e-06,
"loss": 2.5645,
"step": 1245
},
{
"epoch": 0.5315699658703071,
"grad_norm": 36.50604248046875,
"learning_rate": 4.937106918238994e-06,
"loss": 2.3691,
"step": 1246
},
{
"epoch": 0.5319965870307167,
"grad_norm": 19.009384155273438,
"learning_rate": 4.932614555256065e-06,
"loss": 1.9756,
"step": 1247
},
{
"epoch": 0.5324232081911263,
"grad_norm": 19.400297164916992,
"learning_rate": 4.928122192273136e-06,
"loss": 2.417,
"step": 1248
},
{
"epoch": 0.5328498293515358,
"grad_norm": 33.59346389770508,
"learning_rate": 4.9236298292902065e-06,
"loss": 2.4419,
"step": 1249
},
{
"epoch": 0.5332764505119454,
"grad_norm": 18.19139862060547,
"learning_rate": 4.919137466307278e-06,
"loss": 1.9072,
"step": 1250
},
{
"epoch": 0.533703071672355,
"grad_norm": 23.012216567993164,
"learning_rate": 4.914645103324349e-06,
"loss": 2.5869,
"step": 1251
},
{
"epoch": 0.5341296928327645,
"grad_norm": 38.36344528198242,
"learning_rate": 4.9101527403414205e-06,
"loss": 2.6416,
"step": 1252
},
{
"epoch": 0.534556313993174,
"grad_norm": 21.004196166992188,
"learning_rate": 4.905660377358491e-06,
"loss": 2.5957,
"step": 1253
},
{
"epoch": 0.5349829351535836,
"grad_norm": 19.322311401367188,
"learning_rate": 4.901168014375562e-06,
"loss": 1.7642,
"step": 1254
},
{
"epoch": 0.5354095563139932,
"grad_norm": 32.670780181884766,
"learning_rate": 4.896675651392633e-06,
"loss": 2.7812,
"step": 1255
},
{
"epoch": 0.5358361774744027,
"grad_norm": 27.457656860351562,
"learning_rate": 4.892183288409704e-06,
"loss": 2.1992,
"step": 1256
},
{
"epoch": 0.5362627986348123,
"grad_norm": 30.712905883789062,
"learning_rate": 4.887690925426775e-06,
"loss": 2.4502,
"step": 1257
},
{
"epoch": 0.5366894197952219,
"grad_norm": 16.117074966430664,
"learning_rate": 4.883198562443846e-06,
"loss": 2.1328,
"step": 1258
},
{
"epoch": 0.5371160409556314,
"grad_norm": 25.477861404418945,
"learning_rate": 4.878706199460917e-06,
"loss": 2.5,
"step": 1259
},
{
"epoch": 0.537542662116041,
"grad_norm": 28.105344772338867,
"learning_rate": 4.874213836477988e-06,
"loss": 2.5112,
"step": 1260
},
{
"epoch": 0.5379692832764505,
"grad_norm": 24.64742660522461,
"learning_rate": 4.869721473495059e-06,
"loss": 2.0586,
"step": 1261
},
{
"epoch": 0.53839590443686,
"grad_norm": 16.60985565185547,
"learning_rate": 4.86522911051213e-06,
"loss": 2.1172,
"step": 1262
},
{
"epoch": 0.5388225255972696,
"grad_norm": 28.405723571777344,
"learning_rate": 4.8607367475292e-06,
"loss": 2.8301,
"step": 1263
},
{
"epoch": 0.5392491467576792,
"grad_norm": 20.750675201416016,
"learning_rate": 4.856244384546272e-06,
"loss": 2.3027,
"step": 1264
},
{
"epoch": 0.5396757679180887,
"grad_norm": 45.88642120361328,
"learning_rate": 4.851752021563343e-06,
"loss": 3.0342,
"step": 1265
},
{
"epoch": 0.5401023890784983,
"grad_norm": 23.829801559448242,
"learning_rate": 4.847259658580414e-06,
"loss": 2.4111,
"step": 1266
},
{
"epoch": 0.5405290102389079,
"grad_norm": 43.45284652709961,
"learning_rate": 4.842767295597484e-06,
"loss": 2.6387,
"step": 1267
},
{
"epoch": 0.5409556313993175,
"grad_norm": 44.6607666015625,
"learning_rate": 4.8382749326145555e-06,
"loss": 2.3877,
"step": 1268
},
{
"epoch": 0.5413822525597269,
"grad_norm": 23.956260681152344,
"learning_rate": 4.8337825696316265e-06,
"loss": 2.5483,
"step": 1269
},
{
"epoch": 0.5418088737201365,
"grad_norm": 25.851301193237305,
"learning_rate": 4.8292902066486976e-06,
"loss": 2.3066,
"step": 1270
},
{
"epoch": 0.5422354948805461,
"grad_norm": 17.28289794921875,
"learning_rate": 4.824797843665769e-06,
"loss": 2.1562,
"step": 1271
},
{
"epoch": 0.5426621160409556,
"grad_norm": 22.85431671142578,
"learning_rate": 4.82030548068284e-06,
"loss": 1.9053,
"step": 1272
},
{
"epoch": 0.5430887372013652,
"grad_norm": 24.000097274780273,
"learning_rate": 4.815813117699911e-06,
"loss": 2.6982,
"step": 1273
},
{
"epoch": 0.5435153583617748,
"grad_norm": 21.238855361938477,
"learning_rate": 4.811320754716982e-06,
"loss": 2.3408,
"step": 1274
},
{
"epoch": 0.5439419795221843,
"grad_norm": 27.686309814453125,
"learning_rate": 4.806828391734053e-06,
"loss": 2.4111,
"step": 1275
},
{
"epoch": 0.5443686006825939,
"grad_norm": 27.691089630126953,
"learning_rate": 4.802336028751123e-06,
"loss": 2.4756,
"step": 1276
},
{
"epoch": 0.5447952218430034,
"grad_norm": 46.49020767211914,
"learning_rate": 4.797843665768194e-06,
"loss": 2.4033,
"step": 1277
},
{
"epoch": 0.5452218430034129,
"grad_norm": 42.342952728271484,
"learning_rate": 4.793351302785265e-06,
"loss": 2.8711,
"step": 1278
},
{
"epoch": 0.5456484641638225,
"grad_norm": 17.928974151611328,
"learning_rate": 4.788858939802337e-06,
"loss": 2.3525,
"step": 1279
},
{
"epoch": 0.5460750853242321,
"grad_norm": 17.892406463623047,
"learning_rate": 4.784366576819407e-06,
"loss": 2.2793,
"step": 1280
},
{
"epoch": 0.5465017064846417,
"grad_norm": 23.351177215576172,
"learning_rate": 4.779874213836478e-06,
"loss": 2.4336,
"step": 1281
},
{
"epoch": 0.5469283276450512,
"grad_norm": 20.19877052307129,
"learning_rate": 4.775381850853549e-06,
"loss": 2.4072,
"step": 1282
},
{
"epoch": 0.5473549488054608,
"grad_norm": 24.09697723388672,
"learning_rate": 4.77088948787062e-06,
"loss": 2.3301,
"step": 1283
},
{
"epoch": 0.5477815699658704,
"grad_norm": 22.438928604125977,
"learning_rate": 4.766397124887691e-06,
"loss": 2.2861,
"step": 1284
},
{
"epoch": 0.5482081911262798,
"grad_norm": 25.924821853637695,
"learning_rate": 4.761904761904762e-06,
"loss": 2.168,
"step": 1285
},
{
"epoch": 0.5486348122866894,
"grad_norm": 29.486230850219727,
"learning_rate": 4.757412398921833e-06,
"loss": 2.6016,
"step": 1286
},
{
"epoch": 0.549061433447099,
"grad_norm": 27.782878875732422,
"learning_rate": 4.7529200359389045e-06,
"loss": 2.3057,
"step": 1287
},
{
"epoch": 0.5494880546075085,
"grad_norm": 17.11376953125,
"learning_rate": 4.7484276729559755e-06,
"loss": 2.3311,
"step": 1288
},
{
"epoch": 0.5499146757679181,
"grad_norm": 26.16791534423828,
"learning_rate": 4.7439353099730466e-06,
"loss": 2.4697,
"step": 1289
},
{
"epoch": 0.5503412969283277,
"grad_norm": 17.218490600585938,
"learning_rate": 4.739442946990117e-06,
"loss": 2.4209,
"step": 1290
},
{
"epoch": 0.5507679180887372,
"grad_norm": 25.795942306518555,
"learning_rate": 4.734950584007188e-06,
"loss": 2.541,
"step": 1291
},
{
"epoch": 0.5511945392491467,
"grad_norm": 33.7730598449707,
"learning_rate": 4.730458221024259e-06,
"loss": 2.6445,
"step": 1292
},
{
"epoch": 0.5516211604095563,
"grad_norm": 35.27183151245117,
"learning_rate": 4.725965858041331e-06,
"loss": 2.8086,
"step": 1293
},
{
"epoch": 0.5520477815699659,
"grad_norm": 22.450122833251953,
"learning_rate": 4.721473495058401e-06,
"loss": 2.499,
"step": 1294
},
{
"epoch": 0.5524744027303754,
"grad_norm": 20.96445655822754,
"learning_rate": 4.716981132075472e-06,
"loss": 2.1758,
"step": 1295
},
{
"epoch": 0.552901023890785,
"grad_norm": 20.37903594970703,
"learning_rate": 4.712488769092543e-06,
"loss": 2.542,
"step": 1296
},
{
"epoch": 0.5533276450511946,
"grad_norm": 21.708114624023438,
"learning_rate": 4.707996406109614e-06,
"loss": 2.4727,
"step": 1297
},
{
"epoch": 0.5537542662116041,
"grad_norm": 35.709999084472656,
"learning_rate": 4.703504043126685e-06,
"loss": 2.3232,
"step": 1298
},
{
"epoch": 0.5541808873720137,
"grad_norm": 17.35190773010254,
"learning_rate": 4.699011680143756e-06,
"loss": 2.6094,
"step": 1299
},
{
"epoch": 0.5546075085324232,
"grad_norm": 16.749832153320312,
"learning_rate": 4.694519317160827e-06,
"loss": 2.252,
"step": 1300
},
{
"epoch": 0.5550341296928327,
"grad_norm": 32.088626861572266,
"learning_rate": 4.690026954177898e-06,
"loss": 2.334,
"step": 1301
},
{
"epoch": 0.5554607508532423,
"grad_norm": 17.354429244995117,
"learning_rate": 4.685534591194969e-06,
"loss": 2.0596,
"step": 1302
},
{
"epoch": 0.5558873720136519,
"grad_norm": 20.02385139465332,
"learning_rate": 4.68104222821204e-06,
"loss": 2.3604,
"step": 1303
},
{
"epoch": 0.5563139931740614,
"grad_norm": 17.277572631835938,
"learning_rate": 4.6765498652291105e-06,
"loss": 2.3037,
"step": 1304
},
{
"epoch": 0.556740614334471,
"grad_norm": 21.47422218322754,
"learning_rate": 4.6720575022461816e-06,
"loss": 2.54,
"step": 1305
},
{
"epoch": 0.5571672354948806,
"grad_norm": 27.697263717651367,
"learning_rate": 4.667565139263253e-06,
"loss": 2.1299,
"step": 1306
},
{
"epoch": 0.5575938566552902,
"grad_norm": 26.158000946044922,
"learning_rate": 4.663072776280324e-06,
"loss": 2.5928,
"step": 1307
},
{
"epoch": 0.5580204778156996,
"grad_norm": 29.957881927490234,
"learning_rate": 4.658580413297395e-06,
"loss": 2.4639,
"step": 1308
},
{
"epoch": 0.5584470989761092,
"grad_norm": 20.76220703125,
"learning_rate": 4.654088050314466e-06,
"loss": 2.5986,
"step": 1309
},
{
"epoch": 0.5588737201365188,
"grad_norm": 22.95929718017578,
"learning_rate": 4.649595687331537e-06,
"loss": 2.7158,
"step": 1310
},
{
"epoch": 0.5593003412969283,
"grad_norm": 17.745424270629883,
"learning_rate": 4.645103324348608e-06,
"loss": 1.9844,
"step": 1311
},
{
"epoch": 0.5597269624573379,
"grad_norm": 22.64072036743164,
"learning_rate": 4.640610961365679e-06,
"loss": 2.0156,
"step": 1312
},
{
"epoch": 0.5601535836177475,
"grad_norm": 44.04773712158203,
"learning_rate": 4.636118598382749e-06,
"loss": 3.0254,
"step": 1313
},
{
"epoch": 0.560580204778157,
"grad_norm": 37.88700866699219,
"learning_rate": 4.631626235399821e-06,
"loss": 2.3643,
"step": 1314
},
{
"epoch": 0.5610068259385665,
"grad_norm": 17.318178176879883,
"learning_rate": 4.627133872416892e-06,
"loss": 2.2002,
"step": 1315
},
{
"epoch": 0.5614334470989761,
"grad_norm": 22.529075622558594,
"learning_rate": 4.622641509433963e-06,
"loss": 2.625,
"step": 1316
},
{
"epoch": 0.5618600682593856,
"grad_norm": 29.588624954223633,
"learning_rate": 4.618149146451033e-06,
"loss": 2.2354,
"step": 1317
},
{
"epoch": 0.5622866894197952,
"grad_norm": 20.1776123046875,
"learning_rate": 4.613656783468104e-06,
"loss": 2.7715,
"step": 1318
},
{
"epoch": 0.5627133105802048,
"grad_norm": 18.949871063232422,
"learning_rate": 4.609164420485175e-06,
"loss": 2.3174,
"step": 1319
},
{
"epoch": 0.5631399317406144,
"grad_norm": 23.272371292114258,
"learning_rate": 4.604672057502246e-06,
"loss": 2.3906,
"step": 1320
},
{
"epoch": 0.5635665529010239,
"grad_norm": 28.942964553833008,
"learning_rate": 4.6001796945193174e-06,
"loss": 2.332,
"step": 1321
},
{
"epoch": 0.5639931740614335,
"grad_norm": 17.387943267822266,
"learning_rate": 4.5956873315363885e-06,
"loss": 1.8735,
"step": 1322
},
{
"epoch": 0.564419795221843,
"grad_norm": 28.30118179321289,
"learning_rate": 4.5911949685534595e-06,
"loss": 2.8379,
"step": 1323
},
{
"epoch": 0.5648464163822525,
"grad_norm": 20.047611236572266,
"learning_rate": 4.5867026055705306e-06,
"loss": 2.666,
"step": 1324
},
{
"epoch": 0.5652730375426621,
"grad_norm": 24.70307159423828,
"learning_rate": 4.582210242587602e-06,
"loss": 2.0962,
"step": 1325
},
{
"epoch": 0.5656996587030717,
"grad_norm": 16.165760040283203,
"learning_rate": 4.577717879604673e-06,
"loss": 2.3633,
"step": 1326
},
{
"epoch": 0.5661262798634812,
"grad_norm": 24.163618087768555,
"learning_rate": 4.573225516621743e-06,
"loss": 3.1045,
"step": 1327
},
{
"epoch": 0.5665529010238908,
"grad_norm": 20.49041748046875,
"learning_rate": 4.568733153638815e-06,
"loss": 2.8564,
"step": 1328
},
{
"epoch": 0.5669795221843004,
"grad_norm": 28.167705535888672,
"learning_rate": 4.564240790655886e-06,
"loss": 3.1016,
"step": 1329
},
{
"epoch": 0.5674061433447098,
"grad_norm": 17.828779220581055,
"learning_rate": 4.559748427672957e-06,
"loss": 2.5957,
"step": 1330
},
{
"epoch": 0.5678327645051194,
"grad_norm": 44.40090560913086,
"learning_rate": 4.555256064690027e-06,
"loss": 2.4102,
"step": 1331
},
{
"epoch": 0.568259385665529,
"grad_norm": 20.21436882019043,
"learning_rate": 4.550763701707098e-06,
"loss": 2.3369,
"step": 1332
},
{
"epoch": 0.5686860068259386,
"grad_norm": 19.991886138916016,
"learning_rate": 4.546271338724169e-06,
"loss": 2.2256,
"step": 1333
},
{
"epoch": 0.5691126279863481,
"grad_norm": 19.2490177154541,
"learning_rate": 4.54177897574124e-06,
"loss": 2.0166,
"step": 1334
},
{
"epoch": 0.5695392491467577,
"grad_norm": 16.32038688659668,
"learning_rate": 4.537286612758311e-06,
"loss": 2.1035,
"step": 1335
},
{
"epoch": 0.5699658703071673,
"grad_norm": 16.530920028686523,
"learning_rate": 4.532794249775382e-06,
"loss": 2.3135,
"step": 1336
},
{
"epoch": 0.5703924914675768,
"grad_norm": 24.308748245239258,
"learning_rate": 4.528301886792453e-06,
"loss": 2.4453,
"step": 1337
},
{
"epoch": 0.5708191126279863,
"grad_norm": 32.11763000488281,
"learning_rate": 4.523809523809524e-06,
"loss": 2.5371,
"step": 1338
},
{
"epoch": 0.5712457337883959,
"grad_norm": 21.306795120239258,
"learning_rate": 4.519317160826595e-06,
"loss": 2.2373,
"step": 1339
},
{
"epoch": 0.5716723549488054,
"grad_norm": 26.340106964111328,
"learning_rate": 4.5148247978436664e-06,
"loss": 1.749,
"step": 1340
},
{
"epoch": 0.572098976109215,
"grad_norm": 21.620981216430664,
"learning_rate": 4.510332434860737e-06,
"loss": 2.1904,
"step": 1341
},
{
"epoch": 0.5725255972696246,
"grad_norm": 27.624412536621094,
"learning_rate": 4.5058400718778085e-06,
"loss": 2.3442,
"step": 1342
},
{
"epoch": 0.5729522184300341,
"grad_norm": 18.985275268554688,
"learning_rate": 4.5013477088948796e-06,
"loss": 2.3994,
"step": 1343
},
{
"epoch": 0.5733788395904437,
"grad_norm": 42.2714729309082,
"learning_rate": 4.49685534591195e-06,
"loss": 2.8359,
"step": 1344
},
{
"epoch": 0.5738054607508533,
"grad_norm": 32.8116455078125,
"learning_rate": 4.492362982929021e-06,
"loss": 1.9834,
"step": 1345
},
{
"epoch": 0.5742320819112628,
"grad_norm": 31.643775939941406,
"learning_rate": 4.487870619946092e-06,
"loss": 2.1738,
"step": 1346
},
{
"epoch": 0.5746587030716723,
"grad_norm": 25.283864974975586,
"learning_rate": 4.483378256963163e-06,
"loss": 1.9385,
"step": 1347
},
{
"epoch": 0.5750853242320819,
"grad_norm": 23.998638153076172,
"learning_rate": 4.478885893980234e-06,
"loss": 2.5303,
"step": 1348
},
{
"epoch": 0.5755119453924915,
"grad_norm": 17.42400360107422,
"learning_rate": 4.474393530997305e-06,
"loss": 2.0859,
"step": 1349
},
{
"epoch": 0.575938566552901,
"grad_norm": 23.299835205078125,
"learning_rate": 4.469901168014376e-06,
"loss": 2.5273,
"step": 1350
},
{
"epoch": 0.5763651877133106,
"grad_norm": 21.345651626586914,
"learning_rate": 4.465408805031447e-06,
"loss": 2.7656,
"step": 1351
},
{
"epoch": 0.5767918088737202,
"grad_norm": 29.26232147216797,
"learning_rate": 4.460916442048518e-06,
"loss": 2.3965,
"step": 1352
},
{
"epoch": 0.5772184300341296,
"grad_norm": 20.05579376220703,
"learning_rate": 4.456424079065589e-06,
"loss": 2.4482,
"step": 1353
},
{
"epoch": 0.5776450511945392,
"grad_norm": 16.981386184692383,
"learning_rate": 4.451931716082659e-06,
"loss": 2.0449,
"step": 1354
},
{
"epoch": 0.5780716723549488,
"grad_norm": 33.122074127197266,
"learning_rate": 4.44743935309973e-06,
"loss": 2.3916,
"step": 1355
},
{
"epoch": 0.5784982935153583,
"grad_norm": 22.73659324645996,
"learning_rate": 4.4429469901168014e-06,
"loss": 2.5859,
"step": 1356
},
{
"epoch": 0.5789249146757679,
"grad_norm": 18.3586368560791,
"learning_rate": 4.438454627133873e-06,
"loss": 2.5098,
"step": 1357
},
{
"epoch": 0.5793515358361775,
"grad_norm": 16.827573776245117,
"learning_rate": 4.4339622641509435e-06,
"loss": 2.2197,
"step": 1358
},
{
"epoch": 0.5797781569965871,
"grad_norm": 27.184112548828125,
"learning_rate": 4.429469901168015e-06,
"loss": 2.4912,
"step": 1359
},
{
"epoch": 0.5802047781569966,
"grad_norm": 24.514375686645508,
"learning_rate": 4.424977538185086e-06,
"loss": 2.2236,
"step": 1360
},
{
"epoch": 0.5806313993174061,
"grad_norm": 21.03577423095703,
"learning_rate": 4.420485175202157e-06,
"loss": 2.1973,
"step": 1361
},
{
"epoch": 0.5810580204778157,
"grad_norm": 21.42534637451172,
"learning_rate": 4.415992812219228e-06,
"loss": 2.6367,
"step": 1362
},
{
"epoch": 0.5814846416382252,
"grad_norm": 25.538663864135742,
"learning_rate": 4.411500449236299e-06,
"loss": 2.46,
"step": 1363
},
{
"epoch": 0.5819112627986348,
"grad_norm": 20.800830841064453,
"learning_rate": 4.40700808625337e-06,
"loss": 2.1704,
"step": 1364
},
{
"epoch": 0.5823378839590444,
"grad_norm": 29.959156036376953,
"learning_rate": 4.402515723270441e-06,
"loss": 2.5312,
"step": 1365
},
{
"epoch": 0.5827645051194539,
"grad_norm": 23.058263778686523,
"learning_rate": 4.398023360287512e-06,
"loss": 2.5,
"step": 1366
},
{
"epoch": 0.5831911262798635,
"grad_norm": 25.83173179626465,
"learning_rate": 4.393530997304583e-06,
"loss": 2.3066,
"step": 1367
},
{
"epoch": 0.5836177474402731,
"grad_norm": 22.539011001586914,
"learning_rate": 4.389038634321653e-06,
"loss": 2.6162,
"step": 1368
},
{
"epoch": 0.5840443686006825,
"grad_norm": 40.652252197265625,
"learning_rate": 4.384546271338724e-06,
"loss": 2.3555,
"step": 1369
},
{
"epoch": 0.5844709897610921,
"grad_norm": 29.97538948059082,
"learning_rate": 4.380053908355795e-06,
"loss": 2.5176,
"step": 1370
},
{
"epoch": 0.5848976109215017,
"grad_norm": 21.40877914428711,
"learning_rate": 4.375561545372867e-06,
"loss": 2.5283,
"step": 1371
},
{
"epoch": 0.5853242320819113,
"grad_norm": 22.635761260986328,
"learning_rate": 4.371069182389937e-06,
"loss": 2.3252,
"step": 1372
},
{
"epoch": 0.5857508532423208,
"grad_norm": 22.346757888793945,
"learning_rate": 4.366576819407008e-06,
"loss": 2.2568,
"step": 1373
},
{
"epoch": 0.5861774744027304,
"grad_norm": 19.227197647094727,
"learning_rate": 4.362084456424079e-06,
"loss": 2.4033,
"step": 1374
},
{
"epoch": 0.58660409556314,
"grad_norm": 18.52188491821289,
"learning_rate": 4.3575920934411504e-06,
"loss": 1.7754,
"step": 1375
},
{
"epoch": 0.5870307167235495,
"grad_norm": 60.36956024169922,
"learning_rate": 4.3530997304582215e-06,
"loss": 2.9863,
"step": 1376
},
{
"epoch": 0.587457337883959,
"grad_norm": 23.77344512939453,
"learning_rate": 4.348607367475292e-06,
"loss": 2.3872,
"step": 1377
},
{
"epoch": 0.5878839590443686,
"grad_norm": 20.1966552734375,
"learning_rate": 4.344115004492364e-06,
"loss": 2.457,
"step": 1378
},
{
"epoch": 0.5883105802047781,
"grad_norm": 37.82915496826172,
"learning_rate": 4.339622641509435e-06,
"loss": 2.2109,
"step": 1379
},
{
"epoch": 0.5887372013651877,
"grad_norm": 15.02815055847168,
"learning_rate": 4.335130278526506e-06,
"loss": 1.8213,
"step": 1380
},
{
"epoch": 0.5891638225255973,
"grad_norm": 24.90953826904297,
"learning_rate": 4.330637915543576e-06,
"loss": 2.4609,
"step": 1381
},
{
"epoch": 0.5895904436860068,
"grad_norm": 25.334327697753906,
"learning_rate": 4.326145552560647e-06,
"loss": 2.9844,
"step": 1382
},
{
"epoch": 0.5900170648464164,
"grad_norm": 16.76384925842285,
"learning_rate": 4.321653189577718e-06,
"loss": 2.1152,
"step": 1383
},
{
"epoch": 0.590443686006826,
"grad_norm": 24.324865341186523,
"learning_rate": 4.317160826594789e-06,
"loss": 2.0791,
"step": 1384
},
{
"epoch": 0.5908703071672355,
"grad_norm": 18.31525421142578,
"learning_rate": 4.31266846361186e-06,
"loss": 2.1934,
"step": 1385
},
{
"epoch": 0.591296928327645,
"grad_norm": 29.65024185180664,
"learning_rate": 4.308176100628931e-06,
"loss": 3.0664,
"step": 1386
},
{
"epoch": 0.5917235494880546,
"grad_norm": 66.52584075927734,
"learning_rate": 4.303683737646002e-06,
"loss": 2.7041,
"step": 1387
},
{
"epoch": 0.5921501706484642,
"grad_norm": 43.482704162597656,
"learning_rate": 4.299191374663073e-06,
"loss": 2.3887,
"step": 1388
},
{
"epoch": 0.5925767918088737,
"grad_norm": 44.79536819458008,
"learning_rate": 4.294699011680144e-06,
"loss": 2.5781,
"step": 1389
},
{
"epoch": 0.5930034129692833,
"grad_norm": 35.73846435546875,
"learning_rate": 4.290206648697215e-06,
"loss": 2.2432,
"step": 1390
},
{
"epoch": 0.5934300341296929,
"grad_norm": 36.16798400878906,
"learning_rate": 4.2857142857142855e-06,
"loss": 2.5605,
"step": 1391
},
{
"epoch": 0.5938566552901023,
"grad_norm": 20.952844619750977,
"learning_rate": 4.281221922731357e-06,
"loss": 2.3232,
"step": 1392
},
{
"epoch": 0.5942832764505119,
"grad_norm": 27.37010955810547,
"learning_rate": 4.276729559748428e-06,
"loss": 2.832,
"step": 1393
},
{
"epoch": 0.5947098976109215,
"grad_norm": 15.844951629638672,
"learning_rate": 4.2722371967654994e-06,
"loss": 2.2432,
"step": 1394
},
{
"epoch": 0.5951365187713311,
"grad_norm": 27.62236213684082,
"learning_rate": 4.26774483378257e-06,
"loss": 2.2822,
"step": 1395
},
{
"epoch": 0.5955631399317406,
"grad_norm": 44.5030517578125,
"learning_rate": 4.263252470799641e-06,
"loss": 3.1357,
"step": 1396
},
{
"epoch": 0.5959897610921502,
"grad_norm": 43.49186325073242,
"learning_rate": 4.258760107816712e-06,
"loss": 2.7344,
"step": 1397
},
{
"epoch": 0.5964163822525598,
"grad_norm": 45.643653869628906,
"learning_rate": 4.254267744833783e-06,
"loss": 2.8877,
"step": 1398
},
{
"epoch": 0.5968430034129693,
"grad_norm": 44.29466247558594,
"learning_rate": 4.249775381850854e-06,
"loss": 2.4814,
"step": 1399
},
{
"epoch": 0.5972696245733788,
"grad_norm": 36.38447570800781,
"learning_rate": 4.245283018867925e-06,
"loss": 2.5371,
"step": 1400
},
{
"epoch": 0.5976962457337884,
"grad_norm": 34.13217544555664,
"learning_rate": 4.240790655884996e-06,
"loss": 2.2461,
"step": 1401
},
{
"epoch": 0.5981228668941979,
"grad_norm": 16.927581787109375,
"learning_rate": 4.236298292902067e-06,
"loss": 1.9912,
"step": 1402
},
{
"epoch": 0.5985494880546075,
"grad_norm": 19.867677688598633,
"learning_rate": 4.231805929919138e-06,
"loss": 1.8281,
"step": 1403
},
{
"epoch": 0.5989761092150171,
"grad_norm": 41.92014694213867,
"learning_rate": 4.227313566936209e-06,
"loss": 2.6885,
"step": 1404
},
{
"epoch": 0.5994027303754266,
"grad_norm": 31.410295486450195,
"learning_rate": 4.222821203953279e-06,
"loss": 2.1963,
"step": 1405
},
{
"epoch": 0.5998293515358362,
"grad_norm": 22.472976684570312,
"learning_rate": 4.218328840970351e-06,
"loss": 2.3354,
"step": 1406
},
{
"epoch": 0.6002559726962458,
"grad_norm": 22.231199264526367,
"learning_rate": 4.213836477987422e-06,
"loss": 2.6934,
"step": 1407
},
{
"epoch": 0.6006825938566553,
"grad_norm": 25.221227645874023,
"learning_rate": 4.209344115004493e-06,
"loss": 2.5557,
"step": 1408
},
{
"epoch": 0.6011092150170648,
"grad_norm": 19.79888343811035,
"learning_rate": 4.204851752021563e-06,
"loss": 2.6304,
"step": 1409
},
{
"epoch": 0.6015358361774744,
"grad_norm": 27.240476608276367,
"learning_rate": 4.2003593890386345e-06,
"loss": 2.042,
"step": 1410
},
{
"epoch": 0.601962457337884,
"grad_norm": 23.848093032836914,
"learning_rate": 4.1958670260557055e-06,
"loss": 2.3359,
"step": 1411
},
{
"epoch": 0.6023890784982935,
"grad_norm": 27.17098617553711,
"learning_rate": 4.1913746630727765e-06,
"loss": 2.7607,
"step": 1412
},
{
"epoch": 0.6028156996587031,
"grad_norm": 35.22904968261719,
"learning_rate": 4.186882300089848e-06,
"loss": 2.5801,
"step": 1413
},
{
"epoch": 0.6032423208191127,
"grad_norm": 35.9609260559082,
"learning_rate": 4.182389937106919e-06,
"loss": 2.4004,
"step": 1414
},
{
"epoch": 0.6036689419795221,
"grad_norm": 19.513574600219727,
"learning_rate": 4.17789757412399e-06,
"loss": 2.5693,
"step": 1415
},
{
"epoch": 0.6040955631399317,
"grad_norm": 20.018802642822266,
"learning_rate": 4.173405211141061e-06,
"loss": 2.0693,
"step": 1416
},
{
"epoch": 0.6045221843003413,
"grad_norm": 19.38519287109375,
"learning_rate": 4.168912848158132e-06,
"loss": 1.853,
"step": 1417
},
{
"epoch": 0.6049488054607508,
"grad_norm": 27.481489181518555,
"learning_rate": 4.164420485175202e-06,
"loss": 2.7305,
"step": 1418
},
{
"epoch": 0.6053754266211604,
"grad_norm": 19.81107521057129,
"learning_rate": 4.159928122192273e-06,
"loss": 2.1338,
"step": 1419
},
{
"epoch": 0.60580204778157,
"grad_norm": 44.185691833496094,
"learning_rate": 4.155435759209345e-06,
"loss": 2.8506,
"step": 1420
},
{
"epoch": 0.6062286689419796,
"grad_norm": 29.988082885742188,
"learning_rate": 4.150943396226416e-06,
"loss": 2.2617,
"step": 1421
},
{
"epoch": 0.606655290102389,
"grad_norm": 19.156795501708984,
"learning_rate": 4.146451033243486e-06,
"loss": 2.647,
"step": 1422
},
{
"epoch": 0.6070819112627986,
"grad_norm": 25.175708770751953,
"learning_rate": 4.141958670260557e-06,
"loss": 2.2324,
"step": 1423
},
{
"epoch": 0.6075085324232082,
"grad_norm": 26.030254364013672,
"learning_rate": 4.137466307277628e-06,
"loss": 2.2109,
"step": 1424
},
{
"epoch": 0.6079351535836177,
"grad_norm": 15.288980484008789,
"learning_rate": 4.132973944294699e-06,
"loss": 2.0322,
"step": 1425
},
{
"epoch": 0.6083617747440273,
"grad_norm": 15.24339771270752,
"learning_rate": 4.12848158131177e-06,
"loss": 2.0342,
"step": 1426
},
{
"epoch": 0.6087883959044369,
"grad_norm": 28.109180450439453,
"learning_rate": 4.123989218328841e-06,
"loss": 2.2969,
"step": 1427
},
{
"epoch": 0.6092150170648464,
"grad_norm": 38.712154388427734,
"learning_rate": 4.119496855345912e-06,
"loss": 2.4268,
"step": 1428
},
{
"epoch": 0.609641638225256,
"grad_norm": 41.02090835571289,
"learning_rate": 4.1150044923629835e-06,
"loss": 2.208,
"step": 1429
},
{
"epoch": 0.6100682593856656,
"grad_norm": 19.8539981842041,
"learning_rate": 4.1105121293800545e-06,
"loss": 2.5723,
"step": 1430
},
{
"epoch": 0.610494880546075,
"grad_norm": 17.092798233032227,
"learning_rate": 4.1060197663971255e-06,
"loss": 1.7119,
"step": 1431
},
{
"epoch": 0.6109215017064846,
"grad_norm": 22.7246036529541,
"learning_rate": 4.101527403414196e-06,
"loss": 2.2485,
"step": 1432
},
{
"epoch": 0.6113481228668942,
"grad_norm": 21.90310287475586,
"learning_rate": 4.097035040431267e-06,
"loss": 2.0273,
"step": 1433
},
{
"epoch": 0.6117747440273038,
"grad_norm": 28.43157958984375,
"learning_rate": 4.092542677448338e-06,
"loss": 2.8447,
"step": 1434
},
{
"epoch": 0.6122013651877133,
"grad_norm": 18.50580406188965,
"learning_rate": 4.08805031446541e-06,
"loss": 2.1816,
"step": 1435
},
{
"epoch": 0.6126279863481229,
"grad_norm": 22.50284767150879,
"learning_rate": 4.08355795148248e-06,
"loss": 2.0723,
"step": 1436
},
{
"epoch": 0.6130546075085325,
"grad_norm": 28.248563766479492,
"learning_rate": 4.079065588499551e-06,
"loss": 2.1426,
"step": 1437
},
{
"epoch": 0.613481228668942,
"grad_norm": 38.63645935058594,
"learning_rate": 4.074573225516622e-06,
"loss": 2.375,
"step": 1438
},
{
"epoch": 0.6139078498293515,
"grad_norm": 27.894947052001953,
"learning_rate": 4.070080862533693e-06,
"loss": 2.0645,
"step": 1439
},
{
"epoch": 0.6143344709897611,
"grad_norm": 37.0277099609375,
"learning_rate": 4.065588499550764e-06,
"loss": 2.3076,
"step": 1440
},
{
"epoch": 0.6147610921501706,
"grad_norm": 15.98538589477539,
"learning_rate": 4.061096136567835e-06,
"loss": 1.8379,
"step": 1441
},
{
"epoch": 0.6151877133105802,
"grad_norm": 16.836027145385742,
"learning_rate": 4.056603773584906e-06,
"loss": 2.3564,
"step": 1442
},
{
"epoch": 0.6156143344709898,
"grad_norm": 31.81623649597168,
"learning_rate": 4.052111410601977e-06,
"loss": 2.6572,
"step": 1443
},
{
"epoch": 0.6160409556313993,
"grad_norm": 38.87391662597656,
"learning_rate": 4.047619047619048e-06,
"loss": 2.4619,
"step": 1444
},
{
"epoch": 0.6164675767918089,
"grad_norm": 24.75901222229004,
"learning_rate": 4.043126684636119e-06,
"loss": 2.6758,
"step": 1445
},
{
"epoch": 0.6168941979522184,
"grad_norm": 26.73711395263672,
"learning_rate": 4.0386343216531895e-06,
"loss": 2.3291,
"step": 1446
},
{
"epoch": 0.617320819112628,
"grad_norm": 15.336953163146973,
"learning_rate": 4.0341419586702606e-06,
"loss": 2.1064,
"step": 1447
},
{
"epoch": 0.6177474402730375,
"grad_norm": 27.878210067749023,
"learning_rate": 4.029649595687332e-06,
"loss": 2.3994,
"step": 1448
},
{
"epoch": 0.6181740614334471,
"grad_norm": 20.88320541381836,
"learning_rate": 4.025157232704403e-06,
"loss": 2.1064,
"step": 1449
},
{
"epoch": 0.6186006825938567,
"grad_norm": 25.652420043945312,
"learning_rate": 4.020664869721474e-06,
"loss": 2.083,
"step": 1450
},
{
"epoch": 0.6190273037542662,
"grad_norm": 20.050722122192383,
"learning_rate": 4.016172506738545e-06,
"loss": 2.2979,
"step": 1451
},
{
"epoch": 0.6194539249146758,
"grad_norm": 15.306723594665527,
"learning_rate": 4.011680143755616e-06,
"loss": 2.1504,
"step": 1452
},
{
"epoch": 0.6198805460750854,
"grad_norm": 19.287073135375977,
"learning_rate": 4.007187780772687e-06,
"loss": 2.8262,
"step": 1453
},
{
"epoch": 0.6203071672354948,
"grad_norm": 14.914201736450195,
"learning_rate": 4.002695417789758e-06,
"loss": 1.9775,
"step": 1454
},
{
"epoch": 0.6207337883959044,
"grad_norm": 18.950130462646484,
"learning_rate": 3.998203054806828e-06,
"loss": 2.4258,
"step": 1455
},
{
"epoch": 0.621160409556314,
"grad_norm": 15.977020263671875,
"learning_rate": 3.9937106918239e-06,
"loss": 2.1719,
"step": 1456
},
{
"epoch": 0.6215870307167235,
"grad_norm": 22.02812385559082,
"learning_rate": 3.989218328840971e-06,
"loss": 2.5156,
"step": 1457
},
{
"epoch": 0.6220136518771331,
"grad_norm": 17.025178909301758,
"learning_rate": 3.984725965858042e-06,
"loss": 2.5469,
"step": 1458
},
{
"epoch": 0.6224402730375427,
"grad_norm": 39.999542236328125,
"learning_rate": 3.980233602875112e-06,
"loss": 2.7939,
"step": 1459
},
{
"epoch": 0.6228668941979523,
"grad_norm": 16.99581527709961,
"learning_rate": 3.975741239892183e-06,
"loss": 2.2998,
"step": 1460
},
{
"epoch": 0.6232935153583617,
"grad_norm": 18.14431381225586,
"learning_rate": 3.971248876909254e-06,
"loss": 1.8926,
"step": 1461
},
{
"epoch": 0.6237201365187713,
"grad_norm": 16.469980239868164,
"learning_rate": 3.966756513926325e-06,
"loss": 2.5195,
"step": 1462
},
{
"epoch": 0.6241467576791809,
"grad_norm": 18.794111251831055,
"learning_rate": 3.962264150943396e-06,
"loss": 2.4385,
"step": 1463
},
{
"epoch": 0.6245733788395904,
"grad_norm": 22.943275451660156,
"learning_rate": 3.9577717879604675e-06,
"loss": 2.3149,
"step": 1464
},
{
"epoch": 0.625,
"grad_norm": 30.07889175415039,
"learning_rate": 3.9532794249775385e-06,
"loss": 2.1279,
"step": 1465
},
{
"epoch": 0.6254266211604096,
"grad_norm": 36.66804885864258,
"learning_rate": 3.9487870619946096e-06,
"loss": 2.7715,
"step": 1466
},
{
"epoch": 0.6258532423208191,
"grad_norm": 18.29961585998535,
"learning_rate": 3.944294699011681e-06,
"loss": 2.1836,
"step": 1467
},
{
"epoch": 0.6262798634812287,
"grad_norm": 27.083736419677734,
"learning_rate": 3.939802336028752e-06,
"loss": 2.2803,
"step": 1468
},
{
"epoch": 0.6267064846416383,
"grad_norm": 31.700674057006836,
"learning_rate": 3.935309973045822e-06,
"loss": 2.7314,
"step": 1469
},
{
"epoch": 0.6271331058020477,
"grad_norm": 20.692028045654297,
"learning_rate": 3.930817610062894e-06,
"loss": 2.6025,
"step": 1470
},
{
"epoch": 0.6275597269624573,
"grad_norm": 20.040754318237305,
"learning_rate": 3.926325247079965e-06,
"loss": 2.0977,
"step": 1471
},
{
"epoch": 0.6279863481228669,
"grad_norm": 24.181251525878906,
"learning_rate": 3.921832884097036e-06,
"loss": 2.8213,
"step": 1472
},
{
"epoch": 0.6284129692832765,
"grad_norm": 22.22473907470703,
"learning_rate": 3.917340521114106e-06,
"loss": 2.2432,
"step": 1473
},
{
"epoch": 0.628839590443686,
"grad_norm": 21.3482666015625,
"learning_rate": 3.912848158131177e-06,
"loss": 2.5049,
"step": 1474
},
{
"epoch": 0.6292662116040956,
"grad_norm": 22.529102325439453,
"learning_rate": 3.908355795148248e-06,
"loss": 2.0825,
"step": 1475
},
{
"epoch": 0.6296928327645052,
"grad_norm": 29.344873428344727,
"learning_rate": 3.903863432165319e-06,
"loss": 2.3721,
"step": 1476
},
{
"epoch": 0.6301194539249146,
"grad_norm": 23.979862213134766,
"learning_rate": 3.89937106918239e-06,
"loss": 2.4229,
"step": 1477
},
{
"epoch": 0.6305460750853242,
"grad_norm": 20.146678924560547,
"learning_rate": 3.894878706199461e-06,
"loss": 2.2549,
"step": 1478
},
{
"epoch": 0.6309726962457338,
"grad_norm": 32.02549743652344,
"learning_rate": 3.890386343216532e-06,
"loss": 2.1494,
"step": 1479
},
{
"epoch": 0.6313993174061433,
"grad_norm": 15.963859558105469,
"learning_rate": 3.885893980233603e-06,
"loss": 2.3057,
"step": 1480
},
{
"epoch": 0.6318259385665529,
"grad_norm": 26.127859115600586,
"learning_rate": 3.881401617250674e-06,
"loss": 2.8711,
"step": 1481
},
{
"epoch": 0.6322525597269625,
"grad_norm": 16.796703338623047,
"learning_rate": 3.876909254267745e-06,
"loss": 2.0557,
"step": 1482
},
{
"epoch": 0.632679180887372,
"grad_norm": 21.77611541748047,
"learning_rate": 3.872416891284816e-06,
"loss": 2.1187,
"step": 1483
},
{
"epoch": 0.6331058020477816,
"grad_norm": 26.624557495117188,
"learning_rate": 3.8679245283018875e-06,
"loss": 2.5098,
"step": 1484
},
{
"epoch": 0.6335324232081911,
"grad_norm": 27.21763801574707,
"learning_rate": 3.8634321653189586e-06,
"loss": 1.9473,
"step": 1485
},
{
"epoch": 0.6339590443686007,
"grad_norm": 17.299362182617188,
"learning_rate": 3.858939802336029e-06,
"loss": 2.7949,
"step": 1486
},
{
"epoch": 0.6343856655290102,
"grad_norm": 20.601594924926758,
"learning_rate": 3.8544474393531e-06,
"loss": 2.415,
"step": 1487
},
{
"epoch": 0.6348122866894198,
"grad_norm": 16.806154251098633,
"learning_rate": 3.849955076370171e-06,
"loss": 2.1221,
"step": 1488
},
{
"epoch": 0.6352389078498294,
"grad_norm": 23.250965118408203,
"learning_rate": 3.845462713387242e-06,
"loss": 2.2686,
"step": 1489
},
{
"epoch": 0.6356655290102389,
"grad_norm": 24.61576271057129,
"learning_rate": 3.840970350404313e-06,
"loss": 2.1787,
"step": 1490
},
{
"epoch": 0.6360921501706485,
"grad_norm": 16.336050033569336,
"learning_rate": 3.836477987421384e-06,
"loss": 1.9048,
"step": 1491
},
{
"epoch": 0.636518771331058,
"grad_norm": 18.35066795349121,
"learning_rate": 3.831985624438455e-06,
"loss": 2.2012,
"step": 1492
},
{
"epoch": 0.6369453924914675,
"grad_norm": 27.973064422607422,
"learning_rate": 3.827493261455526e-06,
"loss": 2.7266,
"step": 1493
},
{
"epoch": 0.6373720136518771,
"grad_norm": 18.747730255126953,
"learning_rate": 3.823000898472597e-06,
"loss": 1.8916,
"step": 1494
},
{
"epoch": 0.6377986348122867,
"grad_norm": 19.106430053710938,
"learning_rate": 3.818508535489668e-06,
"loss": 2.2539,
"step": 1495
},
{
"epoch": 0.6382252559726962,
"grad_norm": 31.52516746520996,
"learning_rate": 3.8140161725067388e-06,
"loss": 1.9854,
"step": 1496
},
{
"epoch": 0.6386518771331058,
"grad_norm": 22.284278869628906,
"learning_rate": 3.80952380952381e-06,
"loss": 2.2305,
"step": 1497
},
{
"epoch": 0.6390784982935154,
"grad_norm": 32.087974548339844,
"learning_rate": 3.8050314465408813e-06,
"loss": 2.6914,
"step": 1498
},
{
"epoch": 0.639505119453925,
"grad_norm": 23.919710159301758,
"learning_rate": 3.800539083557952e-06,
"loss": 2.2656,
"step": 1499
},
{
"epoch": 0.6399317406143344,
"grad_norm": 34.100765228271484,
"learning_rate": 3.796046720575023e-06,
"loss": 2.2314,
"step": 1500
},
{
"epoch": 0.640358361774744,
"grad_norm": 25.631540298461914,
"learning_rate": 3.791554357592094e-06,
"loss": 1.9922,
"step": 1501
},
{
"epoch": 0.6407849829351536,
"grad_norm": 20.974817276000977,
"learning_rate": 3.7870619946091646e-06,
"loss": 2.8428,
"step": 1502
},
{
"epoch": 0.6412116040955631,
"grad_norm": 20.296192169189453,
"learning_rate": 3.7825696316262357e-06,
"loss": 1.9897,
"step": 1503
},
{
"epoch": 0.6416382252559727,
"grad_norm": 24.699539184570312,
"learning_rate": 3.7780772686433063e-06,
"loss": 2.4717,
"step": 1504
},
{
"epoch": 0.6420648464163823,
"grad_norm": 18.056325912475586,
"learning_rate": 3.7735849056603777e-06,
"loss": 2.0938,
"step": 1505
},
{
"epoch": 0.6424914675767918,
"grad_norm": 28.3708553314209,
"learning_rate": 3.769092542677449e-06,
"loss": 2.582,
"step": 1506
},
{
"epoch": 0.6429180887372014,
"grad_norm": 23.238800048828125,
"learning_rate": 3.76460017969452e-06,
"loss": 2.3086,
"step": 1507
},
{
"epoch": 0.643344709897611,
"grad_norm": 26.795642852783203,
"learning_rate": 3.7601078167115905e-06,
"loss": 2.4912,
"step": 1508
},
{
"epoch": 0.6437713310580204,
"grad_norm": 29.788188934326172,
"learning_rate": 3.7556154537286615e-06,
"loss": 2.4717,
"step": 1509
},
{
"epoch": 0.64419795221843,
"grad_norm": 22.72492218017578,
"learning_rate": 3.7511230907457325e-06,
"loss": 2.1494,
"step": 1510
},
{
"epoch": 0.6446245733788396,
"grad_norm": 42.80537414550781,
"learning_rate": 3.746630727762803e-06,
"loss": 2.3721,
"step": 1511
},
{
"epoch": 0.6450511945392492,
"grad_norm": 29.475852966308594,
"learning_rate": 3.742138364779874e-06,
"loss": 2.7007,
"step": 1512
},
{
"epoch": 0.6454778156996587,
"grad_norm": 18.431743621826172,
"learning_rate": 3.7376460017969457e-06,
"loss": 2.1729,
"step": 1513
},
{
"epoch": 0.6459044368600683,
"grad_norm": 25.59565544128418,
"learning_rate": 3.7331536388140167e-06,
"loss": 2.335,
"step": 1514
},
{
"epoch": 0.6463310580204779,
"grad_norm": 17.255495071411133,
"learning_rate": 3.7286612758310873e-06,
"loss": 1.9688,
"step": 1515
},
{
"epoch": 0.6467576791808873,
"grad_norm": 16.406015396118164,
"learning_rate": 3.7241689128481584e-06,
"loss": 2.2363,
"step": 1516
},
{
"epoch": 0.6471843003412969,
"grad_norm": 16.080997467041016,
"learning_rate": 3.7196765498652294e-06,
"loss": 1.8213,
"step": 1517
},
{
"epoch": 0.6476109215017065,
"grad_norm": 18.1623477935791,
"learning_rate": 3.7151841868823e-06,
"loss": 1.9053,
"step": 1518
},
{
"epoch": 0.648037542662116,
"grad_norm": 29.810514450073242,
"learning_rate": 3.710691823899371e-06,
"loss": 2.0469,
"step": 1519
},
{
"epoch": 0.6484641638225256,
"grad_norm": 18.05474090576172,
"learning_rate": 3.7061994609164426e-06,
"loss": 2.0127,
"step": 1520
},
{
"epoch": 0.6488907849829352,
"grad_norm": 31.862010955810547,
"learning_rate": 3.7017070979335136e-06,
"loss": 2.2441,
"step": 1521
},
{
"epoch": 0.6493174061433447,
"grad_norm": 40.58087921142578,
"learning_rate": 3.6972147349505842e-06,
"loss": 2.1729,
"step": 1522
},
{
"epoch": 0.6497440273037542,
"grad_norm": 22.69669532775879,
"learning_rate": 3.6927223719676553e-06,
"loss": 2.2363,
"step": 1523
},
{
"epoch": 0.6501706484641638,
"grad_norm": 25.206192016601562,
"learning_rate": 3.6882300089847263e-06,
"loss": 2.6113,
"step": 1524
},
{
"epoch": 0.6505972696245734,
"grad_norm": 50.6636962890625,
"learning_rate": 3.683737646001797e-06,
"loss": 2.6367,
"step": 1525
},
{
"epoch": 0.6510238907849829,
"grad_norm": 36.94301986694336,
"learning_rate": 3.679245283018868e-06,
"loss": 2.2275,
"step": 1526
},
{
"epoch": 0.6514505119453925,
"grad_norm": 21.014408111572266,
"learning_rate": 3.6747529200359395e-06,
"loss": 2.4297,
"step": 1527
},
{
"epoch": 0.6518771331058021,
"grad_norm": 30.594039916992188,
"learning_rate": 3.6702605570530105e-06,
"loss": 2.4385,
"step": 1528
},
{
"epoch": 0.6523037542662116,
"grad_norm": 43.26289367675781,
"learning_rate": 3.665768194070081e-06,
"loss": 2.3418,
"step": 1529
},
{
"epoch": 0.6527303754266212,
"grad_norm": 30.854293823242188,
"learning_rate": 3.661275831087152e-06,
"loss": 2.2285,
"step": 1530
},
{
"epoch": 0.6531569965870307,
"grad_norm": 33.77555847167969,
"learning_rate": 3.656783468104223e-06,
"loss": 2.2373,
"step": 1531
},
{
"epoch": 0.6535836177474402,
"grad_norm": 36.544769287109375,
"learning_rate": 3.652291105121294e-06,
"loss": 2.8721,
"step": 1532
},
{
"epoch": 0.6540102389078498,
"grad_norm": 32.035823822021484,
"learning_rate": 3.647798742138365e-06,
"loss": 2.292,
"step": 1533
},
{
"epoch": 0.6544368600682594,
"grad_norm": 21.970911026000977,
"learning_rate": 3.6433063791554363e-06,
"loss": 2.0488,
"step": 1534
},
{
"epoch": 0.6548634812286689,
"grad_norm": 25.027658462524414,
"learning_rate": 3.6388140161725074e-06,
"loss": 2.6094,
"step": 1535
},
{
"epoch": 0.6552901023890785,
"grad_norm": 21.497413635253906,
"learning_rate": 3.634321653189578e-06,
"loss": 2.5479,
"step": 1536
},
{
"epoch": 0.6557167235494881,
"grad_norm": 26.59954261779785,
"learning_rate": 3.629829290206649e-06,
"loss": 2.3184,
"step": 1537
},
{
"epoch": 0.6561433447098977,
"grad_norm": 20.841184616088867,
"learning_rate": 3.6253369272237197e-06,
"loss": 2.332,
"step": 1538
},
{
"epoch": 0.6565699658703071,
"grad_norm": 25.912158966064453,
"learning_rate": 3.6208445642407907e-06,
"loss": 2.498,
"step": 1539
},
{
"epoch": 0.6569965870307167,
"grad_norm": 15.258447647094727,
"learning_rate": 3.6163522012578618e-06,
"loss": 2.0029,
"step": 1540
},
{
"epoch": 0.6574232081911263,
"grad_norm": 20.002315521240234,
"learning_rate": 3.6118598382749332e-06,
"loss": 2.2607,
"step": 1541
},
{
"epoch": 0.6578498293515358,
"grad_norm": 19.588613510131836,
"learning_rate": 3.607367475292004e-06,
"loss": 2.5439,
"step": 1542
},
{
"epoch": 0.6582764505119454,
"grad_norm": 24.894855499267578,
"learning_rate": 3.602875112309075e-06,
"loss": 2.5576,
"step": 1543
},
{
"epoch": 0.658703071672355,
"grad_norm": 16.874536514282227,
"learning_rate": 3.598382749326146e-06,
"loss": 2.0537,
"step": 1544
},
{
"epoch": 0.6591296928327645,
"grad_norm": 18.829151153564453,
"learning_rate": 3.5938903863432166e-06,
"loss": 2.1377,
"step": 1545
},
{
"epoch": 0.659556313993174,
"grad_norm": 19.12894058227539,
"learning_rate": 3.5893980233602876e-06,
"loss": 2.3105,
"step": 1546
},
{
"epoch": 0.6599829351535836,
"grad_norm": 16.351715087890625,
"learning_rate": 3.5849056603773586e-06,
"loss": 2.627,
"step": 1547
},
{
"epoch": 0.6604095563139932,
"grad_norm": 27.592687606811523,
"learning_rate": 3.58041329739443e-06,
"loss": 2.6953,
"step": 1548
},
{
"epoch": 0.6608361774744027,
"grad_norm": 39.048057556152344,
"learning_rate": 3.5759209344115007e-06,
"loss": 2.4189,
"step": 1549
},
{
"epoch": 0.6612627986348123,
"grad_norm": 17.709571838378906,
"learning_rate": 3.5714285714285718e-06,
"loss": 2.3457,
"step": 1550
},
{
"epoch": 0.6616894197952219,
"grad_norm": 34.29948806762695,
"learning_rate": 3.566936208445643e-06,
"loss": 2.085,
"step": 1551
},
{
"epoch": 0.6621160409556314,
"grad_norm": 19.947063446044922,
"learning_rate": 3.5624438454627134e-06,
"loss": 2.7549,
"step": 1552
},
{
"epoch": 0.662542662116041,
"grad_norm": 20.71929359436035,
"learning_rate": 3.5579514824797845e-06,
"loss": 1.9424,
"step": 1553
},
{
"epoch": 0.6629692832764505,
"grad_norm": 25.915510177612305,
"learning_rate": 3.5534591194968555e-06,
"loss": 2.5254,
"step": 1554
},
{
"epoch": 0.66339590443686,
"grad_norm": 18.495929718017578,
"learning_rate": 3.548966756513927e-06,
"loss": 2.0322,
"step": 1555
},
{
"epoch": 0.6638225255972696,
"grad_norm": 33.1831169128418,
"learning_rate": 3.5444743935309976e-06,
"loss": 2.4834,
"step": 1556
},
{
"epoch": 0.6642491467576792,
"grad_norm": 24.745229721069336,
"learning_rate": 3.5399820305480687e-06,
"loss": 2.4292,
"step": 1557
},
{
"epoch": 0.6646757679180887,
"grad_norm": 20.10657501220703,
"learning_rate": 3.5354896675651397e-06,
"loss": 2.0835,
"step": 1558
},
{
"epoch": 0.6651023890784983,
"grad_norm": 20.765737533569336,
"learning_rate": 3.5309973045822103e-06,
"loss": 1.7295,
"step": 1559
},
{
"epoch": 0.6655290102389079,
"grad_norm": 38.23501205444336,
"learning_rate": 3.5265049415992814e-06,
"loss": 2.0771,
"step": 1560
},
{
"epoch": 0.6659556313993175,
"grad_norm": 29.63912010192871,
"learning_rate": 3.5220125786163524e-06,
"loss": 2.3281,
"step": 1561
},
{
"epoch": 0.6663822525597269,
"grad_norm": 32.683170318603516,
"learning_rate": 3.517520215633424e-06,
"loss": 2.3936,
"step": 1562
},
{
"epoch": 0.6668088737201365,
"grad_norm": 23.847415924072266,
"learning_rate": 3.5130278526504945e-06,
"loss": 2.6152,
"step": 1563
},
{
"epoch": 0.6672354948805461,
"grad_norm": 33.72971725463867,
"learning_rate": 3.5085354896675656e-06,
"loss": 2.4717,
"step": 1564
},
{
"epoch": 0.6676621160409556,
"grad_norm": 29.628982543945312,
"learning_rate": 3.5040431266846366e-06,
"loss": 2.7285,
"step": 1565
},
{
"epoch": 0.6680887372013652,
"grad_norm": 36.2376823425293,
"learning_rate": 3.4995507637017072e-06,
"loss": 2.5703,
"step": 1566
},
{
"epoch": 0.6685153583617748,
"grad_norm": 26.659250259399414,
"learning_rate": 3.4950584007187783e-06,
"loss": 1.9951,
"step": 1567
},
{
"epoch": 0.6689419795221843,
"grad_norm": 29.95134162902832,
"learning_rate": 3.4905660377358493e-06,
"loss": 2.7998,
"step": 1568
},
{
"epoch": 0.6693686006825939,
"grad_norm": 31.50892448425293,
"learning_rate": 3.4860736747529208e-06,
"loss": 2.1104,
"step": 1569
},
{
"epoch": 0.6697952218430034,
"grad_norm": 23.725902557373047,
"learning_rate": 3.4815813117699914e-06,
"loss": 2.6953,
"step": 1570
},
{
"epoch": 0.6702218430034129,
"grad_norm": 24.071653366088867,
"learning_rate": 3.4770889487870624e-06,
"loss": 1.9536,
"step": 1571
},
{
"epoch": 0.6706484641638225,
"grad_norm": 19.715450286865234,
"learning_rate": 3.4725965858041335e-06,
"loss": 1.8994,
"step": 1572
},
{
"epoch": 0.6710750853242321,
"grad_norm": 19.194473266601562,
"learning_rate": 3.468104222821204e-06,
"loss": 2.0479,
"step": 1573
},
{
"epoch": 0.6715017064846417,
"grad_norm": 19.590009689331055,
"learning_rate": 3.463611859838275e-06,
"loss": 2.2793,
"step": 1574
},
{
"epoch": 0.6719283276450512,
"grad_norm": 27.58205795288086,
"learning_rate": 3.4591194968553458e-06,
"loss": 2.5371,
"step": 1575
},
{
"epoch": 0.6723549488054608,
"grad_norm": 26.039403915405273,
"learning_rate": 3.4546271338724172e-06,
"loss": 2.1006,
"step": 1576
},
{
"epoch": 0.6727815699658704,
"grad_norm": 33.50947570800781,
"learning_rate": 3.4501347708894883e-06,
"loss": 2.7207,
"step": 1577
},
{
"epoch": 0.6732081911262798,
"grad_norm": 43.135765075683594,
"learning_rate": 3.4456424079065593e-06,
"loss": 2.3198,
"step": 1578
},
{
"epoch": 0.6736348122866894,
"grad_norm": 26.70039939880371,
"learning_rate": 3.44115004492363e-06,
"loss": 2.9517,
"step": 1579
},
{
"epoch": 0.674061433447099,
"grad_norm": 20.953989028930664,
"learning_rate": 3.436657681940701e-06,
"loss": 2.5205,
"step": 1580
},
{
"epoch": 0.6744880546075085,
"grad_norm": 19.561216354370117,
"learning_rate": 3.432165318957772e-06,
"loss": 2.1191,
"step": 1581
},
{
"epoch": 0.6749146757679181,
"grad_norm": 24.886316299438477,
"learning_rate": 3.4276729559748427e-06,
"loss": 2.126,
"step": 1582
},
{
"epoch": 0.6753412969283277,
"grad_norm": 21.290424346923828,
"learning_rate": 3.423180592991914e-06,
"loss": 2.2529,
"step": 1583
},
{
"epoch": 0.6757679180887372,
"grad_norm": 34.25997543334961,
"learning_rate": 3.418688230008985e-06,
"loss": 2.0249,
"step": 1584
},
{
"epoch": 0.6761945392491467,
"grad_norm": 22.012805938720703,
"learning_rate": 3.4141958670260562e-06,
"loss": 2.7158,
"step": 1585
},
{
"epoch": 0.6766211604095563,
"grad_norm": 24.38823699951172,
"learning_rate": 3.409703504043127e-06,
"loss": 1.8994,
"step": 1586
},
{
"epoch": 0.6770477815699659,
"grad_norm": 22.14129066467285,
"learning_rate": 3.405211141060198e-06,
"loss": 2.041,
"step": 1587
},
{
"epoch": 0.6774744027303754,
"grad_norm": 23.154438018798828,
"learning_rate": 3.400718778077269e-06,
"loss": 1.7139,
"step": 1588
},
{
"epoch": 0.677901023890785,
"grad_norm": 17.61654281616211,
"learning_rate": 3.3962264150943395e-06,
"loss": 1.834,
"step": 1589
},
{
"epoch": 0.6783276450511946,
"grad_norm": 18.777099609375,
"learning_rate": 3.3917340521114106e-06,
"loss": 2.3018,
"step": 1590
},
{
"epoch": 0.6787542662116041,
"grad_norm": 22.44559097290039,
"learning_rate": 3.387241689128482e-06,
"loss": 1.6294,
"step": 1591
},
{
"epoch": 0.6791808873720137,
"grad_norm": 21.086833953857422,
"learning_rate": 3.382749326145553e-06,
"loss": 2.1709,
"step": 1592
},
{
"epoch": 0.6796075085324232,
"grad_norm": 25.184696197509766,
"learning_rate": 3.3782569631626237e-06,
"loss": 2.3477,
"step": 1593
},
{
"epoch": 0.6800341296928327,
"grad_norm": 23.782960891723633,
"learning_rate": 3.3737646001796948e-06,
"loss": 2.8232,
"step": 1594
},
{
"epoch": 0.6804607508532423,
"grad_norm": 41.1099739074707,
"learning_rate": 3.369272237196766e-06,
"loss": 2.3916,
"step": 1595
},
{
"epoch": 0.6808873720136519,
"grad_norm": 20.48354721069336,
"learning_rate": 3.3647798742138364e-06,
"loss": 2.2881,
"step": 1596
},
{
"epoch": 0.6813139931740614,
"grad_norm": 19.21649742126465,
"learning_rate": 3.3602875112309075e-06,
"loss": 2.042,
"step": 1597
},
{
"epoch": 0.681740614334471,
"grad_norm": 20.342247009277344,
"learning_rate": 3.355795148247979e-06,
"loss": 2.1675,
"step": 1598
},
{
"epoch": 0.6821672354948806,
"grad_norm": 26.08676528930664,
"learning_rate": 3.35130278526505e-06,
"loss": 2.0991,
"step": 1599
},
{
"epoch": 0.6825938566552902,
"grad_norm": 20.84765625,
"learning_rate": 3.3468104222821206e-06,
"loss": 1.8174,
"step": 1600
},
{
"epoch": 0.6830204778156996,
"grad_norm": 18.198646545410156,
"learning_rate": 3.3423180592991917e-06,
"loss": 2.1973,
"step": 1601
},
{
"epoch": 0.6834470989761092,
"grad_norm": 20.363811492919922,
"learning_rate": 3.3378256963162627e-06,
"loss": 2.6855,
"step": 1602
},
{
"epoch": 0.6838737201365188,
"grad_norm": 21.353164672851562,
"learning_rate": 3.3333333333333333e-06,
"loss": 2.6387,
"step": 1603
},
{
"epoch": 0.6843003412969283,
"grad_norm": 33.03525161743164,
"learning_rate": 3.3288409703504044e-06,
"loss": 1.668,
"step": 1604
},
{
"epoch": 0.6847269624573379,
"grad_norm": 22.72640609741211,
"learning_rate": 3.324348607367476e-06,
"loss": 1.9023,
"step": 1605
},
{
"epoch": 0.6851535836177475,
"grad_norm": 22.232295989990234,
"learning_rate": 3.319856244384547e-06,
"loss": 2.5342,
"step": 1606
},
{
"epoch": 0.685580204778157,
"grad_norm": 21.776939392089844,
"learning_rate": 3.3153638814016175e-06,
"loss": 2.4629,
"step": 1607
},
{
"epoch": 0.6860068259385665,
"grad_norm": 32.66164016723633,
"learning_rate": 3.3108715184186885e-06,
"loss": 2.165,
"step": 1608
},
{
"epoch": 0.6864334470989761,
"grad_norm": 32.1452522277832,
"learning_rate": 3.3063791554357596e-06,
"loss": 2.0732,
"step": 1609
},
{
"epoch": 0.6868600682593856,
"grad_norm": 19.777278900146484,
"learning_rate": 3.30188679245283e-06,
"loss": 2.4092,
"step": 1610
},
{
"epoch": 0.6872866894197952,
"grad_norm": 19.71466064453125,
"learning_rate": 3.2973944294699013e-06,
"loss": 2.1514,
"step": 1611
},
{
"epoch": 0.6877133105802048,
"grad_norm": 17.147441864013672,
"learning_rate": 3.2929020664869727e-06,
"loss": 1.9487,
"step": 1612
},
{
"epoch": 0.6881399317406144,
"grad_norm": 44.140228271484375,
"learning_rate": 3.2884097035040433e-06,
"loss": 2.7715,
"step": 1613
},
{
"epoch": 0.6885665529010239,
"grad_norm": 33.57496643066406,
"learning_rate": 3.2839173405211144e-06,
"loss": 2.7432,
"step": 1614
},
{
"epoch": 0.6889931740614335,
"grad_norm": 18.664264678955078,
"learning_rate": 3.2794249775381854e-06,
"loss": 2.5352,
"step": 1615
},
{
"epoch": 0.689419795221843,
"grad_norm": 23.951438903808594,
"learning_rate": 3.274932614555256e-06,
"loss": 2.2256,
"step": 1616
},
{
"epoch": 0.6898464163822525,
"grad_norm": 23.820545196533203,
"learning_rate": 3.270440251572327e-06,
"loss": 2.1924,
"step": 1617
},
{
"epoch": 0.6902730375426621,
"grad_norm": 20.0013427734375,
"learning_rate": 3.265947888589398e-06,
"loss": 2.165,
"step": 1618
},
{
"epoch": 0.6906996587030717,
"grad_norm": 20.567819595336914,
"learning_rate": 3.2614555256064696e-06,
"loss": 2.75,
"step": 1619
},
{
"epoch": 0.6911262798634812,
"grad_norm": 16.655508041381836,
"learning_rate": 3.2569631626235402e-06,
"loss": 1.8428,
"step": 1620
},
{
"epoch": 0.6915529010238908,
"grad_norm": 17.113733291625977,
"learning_rate": 3.2524707996406113e-06,
"loss": 2.2402,
"step": 1621
},
{
"epoch": 0.6919795221843004,
"grad_norm": 27.453786849975586,
"learning_rate": 3.2479784366576823e-06,
"loss": 2.207,
"step": 1622
},
{
"epoch": 0.6924061433447098,
"grad_norm": 32.38252258300781,
"learning_rate": 3.243486073674753e-06,
"loss": 2.2246,
"step": 1623
},
{
"epoch": 0.6928327645051194,
"grad_norm": 18.718177795410156,
"learning_rate": 3.238993710691824e-06,
"loss": 2.3203,
"step": 1624
},
{
"epoch": 0.693259385665529,
"grad_norm": 37.67975616455078,
"learning_rate": 3.234501347708895e-06,
"loss": 1.8008,
"step": 1625
},
{
"epoch": 0.6936860068259386,
"grad_norm": 25.621017456054688,
"learning_rate": 3.2300089847259665e-06,
"loss": 2.2832,
"step": 1626
},
{
"epoch": 0.6941126279863481,
"grad_norm": 18.550310134887695,
"learning_rate": 3.225516621743037e-06,
"loss": 2.1826,
"step": 1627
},
{
"epoch": 0.6945392491467577,
"grad_norm": 31.40336799621582,
"learning_rate": 3.221024258760108e-06,
"loss": 2.0342,
"step": 1628
},
{
"epoch": 0.6949658703071673,
"grad_norm": 21.5510196685791,
"learning_rate": 3.216531895777179e-06,
"loss": 2.5488,
"step": 1629
},
{
"epoch": 0.6953924914675768,
"grad_norm": 16.434173583984375,
"learning_rate": 3.21203953279425e-06,
"loss": 1.8301,
"step": 1630
},
{
"epoch": 0.6958191126279863,
"grad_norm": 23.86673355102539,
"learning_rate": 3.207547169811321e-06,
"loss": 1.6816,
"step": 1631
},
{
"epoch": 0.6962457337883959,
"grad_norm": 22.883087158203125,
"learning_rate": 3.203054806828392e-06,
"loss": 2.374,
"step": 1632
},
{
"epoch": 0.6966723549488054,
"grad_norm": 18.144968032836914,
"learning_rate": 3.1985624438454634e-06,
"loss": 2.3555,
"step": 1633
},
{
"epoch": 0.697098976109215,
"grad_norm": 19.690826416015625,
"learning_rate": 3.194070080862534e-06,
"loss": 2.1045,
"step": 1634
},
{
"epoch": 0.6975255972696246,
"grad_norm": 23.947898864746094,
"learning_rate": 3.189577717879605e-06,
"loss": 2.3008,
"step": 1635
},
{
"epoch": 0.6979522184300341,
"grad_norm": 21.14999008178711,
"learning_rate": 3.185085354896676e-06,
"loss": 2.1064,
"step": 1636
},
{
"epoch": 0.6983788395904437,
"grad_norm": 26.988557815551758,
"learning_rate": 3.1805929919137467e-06,
"loss": 1.9912,
"step": 1637
},
{
"epoch": 0.6988054607508533,
"grad_norm": 27.08625602722168,
"learning_rate": 3.1761006289308178e-06,
"loss": 2.4316,
"step": 1638
},
{
"epoch": 0.6992320819112628,
"grad_norm": 23.413265228271484,
"learning_rate": 3.171608265947889e-06,
"loss": 2.2451,
"step": 1639
},
{
"epoch": 0.6996587030716723,
"grad_norm": 21.323143005371094,
"learning_rate": 3.1671159029649603e-06,
"loss": 1.9639,
"step": 1640
},
{
"epoch": 0.7000853242320819,
"grad_norm": 23.912181854248047,
"learning_rate": 3.162623539982031e-06,
"loss": 2.1152,
"step": 1641
},
{
"epoch": 0.7005119453924915,
"grad_norm": 23.545801162719727,
"learning_rate": 3.158131176999102e-06,
"loss": 2.4561,
"step": 1642
},
{
"epoch": 0.700938566552901,
"grad_norm": 29.719528198242188,
"learning_rate": 3.153638814016173e-06,
"loss": 2.4512,
"step": 1643
},
{
"epoch": 0.7013651877133106,
"grad_norm": 16.677494049072266,
"learning_rate": 3.1491464510332436e-06,
"loss": 2.0938,
"step": 1644
},
{
"epoch": 0.7017918088737202,
"grad_norm": 29.851078033447266,
"learning_rate": 3.1446540880503146e-06,
"loss": 2.5488,
"step": 1645
},
{
"epoch": 0.7022184300341296,
"grad_norm": 26.422513961791992,
"learning_rate": 3.1401617250673853e-06,
"loss": 2.4424,
"step": 1646
},
{
"epoch": 0.7026450511945392,
"grad_norm": 22.49760627746582,
"learning_rate": 3.1356693620844567e-06,
"loss": 2.6123,
"step": 1647
},
{
"epoch": 0.7030716723549488,
"grad_norm": 37.57074737548828,
"learning_rate": 3.1311769991015278e-06,
"loss": 2.4365,
"step": 1648
},
{
"epoch": 0.7034982935153583,
"grad_norm": 19.417375564575195,
"learning_rate": 3.126684636118599e-06,
"loss": 2.084,
"step": 1649
},
{
"epoch": 0.7039249146757679,
"grad_norm": 46.048404693603516,
"learning_rate": 3.1221922731356694e-06,
"loss": 2.7559,
"step": 1650
},
{
"epoch": 0.7043515358361775,
"grad_norm": 24.032854080200195,
"learning_rate": 3.1176999101527405e-06,
"loss": 2.4941,
"step": 1651
},
{
"epoch": 0.7047781569965871,
"grad_norm": 27.37352180480957,
"learning_rate": 3.1132075471698115e-06,
"loss": 2.5059,
"step": 1652
},
{
"epoch": 0.7052047781569966,
"grad_norm": 18.343231201171875,
"learning_rate": 3.108715184186882e-06,
"loss": 1.9766,
"step": 1653
},
{
"epoch": 0.7056313993174061,
"grad_norm": 32.41136169433594,
"learning_rate": 3.1042228212039536e-06,
"loss": 2.7559,
"step": 1654
},
{
"epoch": 0.7060580204778157,
"grad_norm": 18.58074951171875,
"learning_rate": 3.0997304582210247e-06,
"loss": 2.3477,
"step": 1655
},
{
"epoch": 0.7064846416382252,
"grad_norm": 17.456533432006836,
"learning_rate": 3.0952380952380957e-06,
"loss": 2.0459,
"step": 1656
},
{
"epoch": 0.7069112627986348,
"grad_norm": 16.901012420654297,
"learning_rate": 3.0907457322551663e-06,
"loss": 2.0396,
"step": 1657
},
{
"epoch": 0.7073378839590444,
"grad_norm": 15.16879653930664,
"learning_rate": 3.0862533692722374e-06,
"loss": 1.8633,
"step": 1658
},
{
"epoch": 0.7077645051194539,
"grad_norm": 22.42111587524414,
"learning_rate": 3.0817610062893084e-06,
"loss": 2.4502,
"step": 1659
},
{
"epoch": 0.7081911262798635,
"grad_norm": 27.441150665283203,
"learning_rate": 3.077268643306379e-06,
"loss": 2.5537,
"step": 1660
},
{
"epoch": 0.7086177474402731,
"grad_norm": 24.77878761291504,
"learning_rate": 3.0727762803234505e-06,
"loss": 1.7925,
"step": 1661
},
{
"epoch": 0.7090443686006825,
"grad_norm": 22.958045959472656,
"learning_rate": 3.0682839173405216e-06,
"loss": 2.5142,
"step": 1662
},
{
"epoch": 0.7094709897610921,
"grad_norm": 28.620180130004883,
"learning_rate": 3.0637915543575926e-06,
"loss": 2.3408,
"step": 1663
},
{
"epoch": 0.7098976109215017,
"grad_norm": 32.69829177856445,
"learning_rate": 3.0592991913746632e-06,
"loss": 2.2031,
"step": 1664
},
{
"epoch": 0.7103242320819113,
"grad_norm": 45.391334533691406,
"learning_rate": 3.0548068283917343e-06,
"loss": 2.5068,
"step": 1665
},
{
"epoch": 0.7107508532423208,
"grad_norm": 20.9996280670166,
"learning_rate": 3.0503144654088053e-06,
"loss": 2.5391,
"step": 1666
},
{
"epoch": 0.7111774744027304,
"grad_norm": 23.97358512878418,
"learning_rate": 3.045822102425876e-06,
"loss": 2.5625,
"step": 1667
},
{
"epoch": 0.71160409556314,
"grad_norm": 20.434316635131836,
"learning_rate": 3.0413297394429474e-06,
"loss": 2.1904,
"step": 1668
},
{
"epoch": 0.7120307167235495,
"grad_norm": 28.51020622253418,
"learning_rate": 3.0368373764600184e-06,
"loss": 2.4521,
"step": 1669
},
{
"epoch": 0.712457337883959,
"grad_norm": 16.132415771484375,
"learning_rate": 3.0323450134770895e-06,
"loss": 1.5845,
"step": 1670
},
{
"epoch": 0.7128839590443686,
"grad_norm": 23.020078659057617,
"learning_rate": 3.02785265049416e-06,
"loss": 2.9268,
"step": 1671
},
{
"epoch": 0.7133105802047781,
"grad_norm": 27.02225685119629,
"learning_rate": 3.023360287511231e-06,
"loss": 2.2266,
"step": 1672
},
{
"epoch": 0.7137372013651877,
"grad_norm": 19.597509384155273,
"learning_rate": 3.018867924528302e-06,
"loss": 2.6143,
"step": 1673
},
{
"epoch": 0.7141638225255973,
"grad_norm": 22.84983253479004,
"learning_rate": 3.014375561545373e-06,
"loss": 1.9414,
"step": 1674
},
{
"epoch": 0.7145904436860068,
"grad_norm": 29.941665649414062,
"learning_rate": 3.009883198562444e-06,
"loss": 2.7354,
"step": 1675
},
{
"epoch": 0.7150170648464164,
"grad_norm": 22.785655975341797,
"learning_rate": 3.0053908355795153e-06,
"loss": 2.1377,
"step": 1676
},
{
"epoch": 0.715443686006826,
"grad_norm": 19.041711807250977,
"learning_rate": 3.0008984725965864e-06,
"loss": 2.1201,
"step": 1677
},
{
"epoch": 0.7158703071672355,
"grad_norm": 19.834877014160156,
"learning_rate": 2.996406109613657e-06,
"loss": 2.1113,
"step": 1678
},
{
"epoch": 0.716296928327645,
"grad_norm": 16.664960861206055,
"learning_rate": 2.991913746630728e-06,
"loss": 2.0352,
"step": 1679
},
{
"epoch": 0.7167235494880546,
"grad_norm": 27.215831756591797,
"learning_rate": 2.987421383647799e-06,
"loss": 2.1787,
"step": 1680
},
{
"epoch": 0.7171501706484642,
"grad_norm": 19.89889907836914,
"learning_rate": 2.9829290206648697e-06,
"loss": 2.4629,
"step": 1681
},
{
"epoch": 0.7175767918088737,
"grad_norm": 23.80775260925293,
"learning_rate": 2.9784366576819408e-06,
"loss": 2.3076,
"step": 1682
},
{
"epoch": 0.7180034129692833,
"grad_norm": 34.298248291015625,
"learning_rate": 2.9739442946990122e-06,
"loss": 1.8589,
"step": 1683
},
{
"epoch": 0.7184300341296929,
"grad_norm": 29.29851722717285,
"learning_rate": 2.969451931716083e-06,
"loss": 2.1162,
"step": 1684
},
{
"epoch": 0.7188566552901023,
"grad_norm": 23.561893463134766,
"learning_rate": 2.964959568733154e-06,
"loss": 2.3486,
"step": 1685
},
{
"epoch": 0.7192832764505119,
"grad_norm": 19.6323299407959,
"learning_rate": 2.960467205750225e-06,
"loss": 1.9863,
"step": 1686
},
{
"epoch": 0.7197098976109215,
"grad_norm": 31.969934463500977,
"learning_rate": 2.9559748427672955e-06,
"loss": 2.4453,
"step": 1687
},
{
"epoch": 0.7201365187713311,
"grad_norm": 24.619426727294922,
"learning_rate": 2.9514824797843666e-06,
"loss": 2.8027,
"step": 1688
},
{
"epoch": 0.7205631399317406,
"grad_norm": 20.186494827270508,
"learning_rate": 2.9469901168014376e-06,
"loss": 2.4297,
"step": 1689
},
{
"epoch": 0.7209897610921502,
"grad_norm": 18.27107048034668,
"learning_rate": 2.942497753818509e-06,
"loss": 2.1455,
"step": 1690
},
{
"epoch": 0.7214163822525598,
"grad_norm": 21.095502853393555,
"learning_rate": 2.9380053908355797e-06,
"loss": 2.6606,
"step": 1691
},
{
"epoch": 0.7218430034129693,
"grad_norm": 20.978729248046875,
"learning_rate": 2.9335130278526508e-06,
"loss": 2.2842,
"step": 1692
},
{
"epoch": 0.7222696245733788,
"grad_norm": 21.04619026184082,
"learning_rate": 2.929020664869722e-06,
"loss": 1.2656,
"step": 1693
},
{
"epoch": 0.7226962457337884,
"grad_norm": 28.32137680053711,
"learning_rate": 2.9245283018867924e-06,
"loss": 2.3525,
"step": 1694
},
{
"epoch": 0.7231228668941979,
"grad_norm": 22.19200325012207,
"learning_rate": 2.9200359389038635e-06,
"loss": 2.0947,
"step": 1695
},
{
"epoch": 0.7235494880546075,
"grad_norm": 19.34542465209961,
"learning_rate": 2.9155435759209345e-06,
"loss": 2.1494,
"step": 1696
},
{
"epoch": 0.7239761092150171,
"grad_norm": 23.154399871826172,
"learning_rate": 2.911051212938006e-06,
"loss": 2.0,
"step": 1697
},
{
"epoch": 0.7244027303754266,
"grad_norm": 27.6765079498291,
"learning_rate": 2.9065588499550766e-06,
"loss": 2.1758,
"step": 1698
},
{
"epoch": 0.7248293515358362,
"grad_norm": 30.37624740600586,
"learning_rate": 2.9020664869721477e-06,
"loss": 2.7207,
"step": 1699
},
{
"epoch": 0.7252559726962458,
"grad_norm": 19.52189064025879,
"learning_rate": 2.8975741239892187e-06,
"loss": 1.8457,
"step": 1700
},
{
"epoch": 0.7256825938566553,
"grad_norm": 31.081382751464844,
"learning_rate": 2.8930817610062893e-06,
"loss": 2.3071,
"step": 1701
},
{
"epoch": 0.7261092150170648,
"grad_norm": 23.929893493652344,
"learning_rate": 2.8885893980233604e-06,
"loss": 2.3467,
"step": 1702
},
{
"epoch": 0.7265358361774744,
"grad_norm": 19.086774826049805,
"learning_rate": 2.8840970350404314e-06,
"loss": 2.0127,
"step": 1703
},
{
"epoch": 0.726962457337884,
"grad_norm": 30.935726165771484,
"learning_rate": 2.879604672057503e-06,
"loss": 2.4414,
"step": 1704
},
{
"epoch": 0.7273890784982935,
"grad_norm": 33.78768539428711,
"learning_rate": 2.8751123090745735e-06,
"loss": 1.8154,
"step": 1705
},
{
"epoch": 0.7278156996587031,
"grad_norm": 19.40523338317871,
"learning_rate": 2.8706199460916445e-06,
"loss": 1.8164,
"step": 1706
},
{
"epoch": 0.7282423208191127,
"grad_norm": 38.8382682800293,
"learning_rate": 2.8661275831087156e-06,
"loss": 2.1211,
"step": 1707
},
{
"epoch": 0.7286689419795221,
"grad_norm": 26.457233428955078,
"learning_rate": 2.861635220125786e-06,
"loss": 2.0469,
"step": 1708
},
{
"epoch": 0.7290955631399317,
"grad_norm": 22.73707389831543,
"learning_rate": 2.8571428571428573e-06,
"loss": 1.8389,
"step": 1709
},
{
"epoch": 0.7295221843003413,
"grad_norm": 49.22174072265625,
"learning_rate": 2.8526504941599283e-06,
"loss": 2.457,
"step": 1710
},
{
"epoch": 0.7299488054607508,
"grad_norm": 26.232208251953125,
"learning_rate": 2.8481581311769998e-06,
"loss": 2.3633,
"step": 1711
},
{
"epoch": 0.7303754266211604,
"grad_norm": 23.40569496154785,
"learning_rate": 2.8436657681940704e-06,
"loss": 2.4282,
"step": 1712
},
{
"epoch": 0.73080204778157,
"grad_norm": 26.51075553894043,
"learning_rate": 2.8391734052111414e-06,
"loss": 2.5537,
"step": 1713
},
{
"epoch": 0.7312286689419796,
"grad_norm": 20.640514373779297,
"learning_rate": 2.8346810422282125e-06,
"loss": 2.0527,
"step": 1714
},
{
"epoch": 0.731655290102389,
"grad_norm": 22.921810150146484,
"learning_rate": 2.830188679245283e-06,
"loss": 2.1309,
"step": 1715
},
{
"epoch": 0.7320819112627986,
"grad_norm": 29.728538513183594,
"learning_rate": 2.825696316262354e-06,
"loss": 2.7529,
"step": 1716
},
{
"epoch": 0.7325085324232082,
"grad_norm": 24.323043823242188,
"learning_rate": 2.821203953279425e-06,
"loss": 2.083,
"step": 1717
},
{
"epoch": 0.7329351535836177,
"grad_norm": 26.60840606689453,
"learning_rate": 2.8167115902964962e-06,
"loss": 2.4717,
"step": 1718
},
{
"epoch": 0.7333617747440273,
"grad_norm": 21.946855545043945,
"learning_rate": 2.8122192273135673e-06,
"loss": 2.3496,
"step": 1719
},
{
"epoch": 0.7337883959044369,
"grad_norm": 26.811389923095703,
"learning_rate": 2.8077268643306383e-06,
"loss": 2.1406,
"step": 1720
},
{
"epoch": 0.7342150170648464,
"grad_norm": 18.540809631347656,
"learning_rate": 2.803234501347709e-06,
"loss": 2.0864,
"step": 1721
},
{
"epoch": 0.734641638225256,
"grad_norm": 23.249303817749023,
"learning_rate": 2.79874213836478e-06,
"loss": 2.2393,
"step": 1722
},
{
"epoch": 0.7350682593856656,
"grad_norm": 20.395145416259766,
"learning_rate": 2.794249775381851e-06,
"loss": 2.4502,
"step": 1723
},
{
"epoch": 0.735494880546075,
"grad_norm": 30.138233184814453,
"learning_rate": 2.7897574123989217e-06,
"loss": 2.1201,
"step": 1724
},
{
"epoch": 0.7359215017064846,
"grad_norm": 26.507150650024414,
"learning_rate": 2.785265049415993e-06,
"loss": 1.9185,
"step": 1725
},
{
"epoch": 0.7363481228668942,
"grad_norm": 15.685582160949707,
"learning_rate": 2.780772686433064e-06,
"loss": 2.0293,
"step": 1726
},
{
"epoch": 0.7367747440273038,
"grad_norm": 26.94832420349121,
"learning_rate": 2.776280323450135e-06,
"loss": 1.9082,
"step": 1727
},
{
"epoch": 0.7372013651877133,
"grad_norm": 28.36393165588379,
"learning_rate": 2.771787960467206e-06,
"loss": 1.8506,
"step": 1728
},
{
"epoch": 0.7376279863481229,
"grad_norm": 17.436843872070312,
"learning_rate": 2.767295597484277e-06,
"loss": 2.1221,
"step": 1729
},
{
"epoch": 0.7380546075085325,
"grad_norm": 23.560590744018555,
"learning_rate": 2.762803234501348e-06,
"loss": 2.4292,
"step": 1730
},
{
"epoch": 0.738481228668942,
"grad_norm": 17.839962005615234,
"learning_rate": 2.7583108715184185e-06,
"loss": 2.2773,
"step": 1731
},
{
"epoch": 0.7389078498293515,
"grad_norm": 33.70856475830078,
"learning_rate": 2.75381850853549e-06,
"loss": 2.2441,
"step": 1732
},
{
"epoch": 0.7393344709897611,
"grad_norm": 28.266454696655273,
"learning_rate": 2.749326145552561e-06,
"loss": 2.4609,
"step": 1733
},
{
"epoch": 0.7397610921501706,
"grad_norm": 20.637826919555664,
"learning_rate": 2.744833782569632e-06,
"loss": 2.3477,
"step": 1734
},
{
"epoch": 0.7401877133105802,
"grad_norm": 18.084033966064453,
"learning_rate": 2.7403414195867027e-06,
"loss": 2.168,
"step": 1735
},
{
"epoch": 0.7406143344709898,
"grad_norm": 26.160219192504883,
"learning_rate": 2.7358490566037738e-06,
"loss": 2.1494,
"step": 1736
},
{
"epoch": 0.7410409556313993,
"grad_norm": 25.901765823364258,
"learning_rate": 2.731356693620845e-06,
"loss": 2.126,
"step": 1737
},
{
"epoch": 0.7414675767918089,
"grad_norm": 18.53731346130371,
"learning_rate": 2.7268643306379154e-06,
"loss": 1.915,
"step": 1738
},
{
"epoch": 0.7418941979522184,
"grad_norm": 25.867483139038086,
"learning_rate": 2.722371967654987e-06,
"loss": 2.4033,
"step": 1739
},
{
"epoch": 0.742320819112628,
"grad_norm": 29.431753158569336,
"learning_rate": 2.717879604672058e-06,
"loss": 1.8164,
"step": 1740
},
{
"epoch": 0.7427474402730375,
"grad_norm": 21.479326248168945,
"learning_rate": 2.713387241689129e-06,
"loss": 1.9727,
"step": 1741
},
{
"epoch": 0.7431740614334471,
"grad_norm": 17.52713966369629,
"learning_rate": 2.7088948787061996e-06,
"loss": 1.5908,
"step": 1742
},
{
"epoch": 0.7436006825938567,
"grad_norm": 19.50530242919922,
"learning_rate": 2.7044025157232706e-06,
"loss": 1.6367,
"step": 1743
},
{
"epoch": 0.7440273037542662,
"grad_norm": 35.69096755981445,
"learning_rate": 2.6999101527403417e-06,
"loss": 1.9648,
"step": 1744
},
{
"epoch": 0.7444539249146758,
"grad_norm": 44.18523406982422,
"learning_rate": 2.6954177897574123e-06,
"loss": 2.3213,
"step": 1745
},
{
"epoch": 0.7448805460750854,
"grad_norm": 22.357803344726562,
"learning_rate": 2.6909254267744838e-06,
"loss": 1.9614,
"step": 1746
},
{
"epoch": 0.7453071672354948,
"grad_norm": 21.612682342529297,
"learning_rate": 2.686433063791555e-06,
"loss": 2.5068,
"step": 1747
},
{
"epoch": 0.7457337883959044,
"grad_norm": 31.66046714782715,
"learning_rate": 2.681940700808626e-06,
"loss": 2.7129,
"step": 1748
},
{
"epoch": 0.746160409556314,
"grad_norm": 23.753938674926758,
"learning_rate": 2.6774483378256965e-06,
"loss": 2.2041,
"step": 1749
},
{
"epoch": 0.7465870307167235,
"grad_norm": 28.22380256652832,
"learning_rate": 2.6729559748427675e-06,
"loss": 2.2656,
"step": 1750
},
{
"epoch": 0.7470136518771331,
"grad_norm": 22.26675796508789,
"learning_rate": 2.6684636118598386e-06,
"loss": 2.3828,
"step": 1751
},
{
"epoch": 0.7474402730375427,
"grad_norm": 18.363147735595703,
"learning_rate": 2.663971248876909e-06,
"loss": 2.0376,
"step": 1752
},
{
"epoch": 0.7478668941979523,
"grad_norm": 18.026317596435547,
"learning_rate": 2.6594788858939802e-06,
"loss": 2.0869,
"step": 1753
},
{
"epoch": 0.7482935153583617,
"grad_norm": 23.124431610107422,
"learning_rate": 2.6549865229110517e-06,
"loss": 1.8809,
"step": 1754
},
{
"epoch": 0.7487201365187713,
"grad_norm": 20.112951278686523,
"learning_rate": 2.6504941599281223e-06,
"loss": 1.9185,
"step": 1755
},
{
"epoch": 0.7491467576791809,
"grad_norm": 19.65840721130371,
"learning_rate": 2.6460017969451934e-06,
"loss": 2.3105,
"step": 1756
},
{
"epoch": 0.7495733788395904,
"grad_norm": 17.392486572265625,
"learning_rate": 2.6415094339622644e-06,
"loss": 2.0371,
"step": 1757
},
{
"epoch": 0.75,
"grad_norm": 23.173776626586914,
"learning_rate": 2.637017070979335e-06,
"loss": 3.1699,
"step": 1758
},
{
"epoch": 0.7504266211604096,
"grad_norm": 35.75828552246094,
"learning_rate": 2.632524707996406e-06,
"loss": 2.25,
"step": 1759
},
{
"epoch": 0.7508532423208191,
"grad_norm": 19.388690948486328,
"learning_rate": 2.628032345013477e-06,
"loss": 2.043,
"step": 1760
},
{
"epoch": 0.7512798634812287,
"grad_norm": 29.570627212524414,
"learning_rate": 2.6235399820305486e-06,
"loss": 2.1875,
"step": 1761
},
{
"epoch": 0.7517064846416383,
"grad_norm": 24.603408813476562,
"learning_rate": 2.6190476190476192e-06,
"loss": 2.0171,
"step": 1762
},
{
"epoch": 0.7521331058020477,
"grad_norm": 43.5400276184082,
"learning_rate": 2.6145552560646903e-06,
"loss": 2.4141,
"step": 1763
},
{
"epoch": 0.7525597269624573,
"grad_norm": 20.218944549560547,
"learning_rate": 2.6100628930817613e-06,
"loss": 2.2295,
"step": 1764
},
{
"epoch": 0.7529863481228669,
"grad_norm": 16.16379737854004,
"learning_rate": 2.605570530098832e-06,
"loss": 1.7656,
"step": 1765
},
{
"epoch": 0.7534129692832765,
"grad_norm": 18.501811981201172,
"learning_rate": 2.601078167115903e-06,
"loss": 2.2217,
"step": 1766
},
{
"epoch": 0.753839590443686,
"grad_norm": 24.61173439025879,
"learning_rate": 2.596585804132974e-06,
"loss": 2.2686,
"step": 1767
},
{
"epoch": 0.7542662116040956,
"grad_norm": 25.14869499206543,
"learning_rate": 2.5920934411500455e-06,
"loss": 2.0107,
"step": 1768
},
{
"epoch": 0.7546928327645052,
"grad_norm": 25.4429931640625,
"learning_rate": 2.587601078167116e-06,
"loss": 2.4429,
"step": 1769
},
{
"epoch": 0.7551194539249146,
"grad_norm": 34.942222595214844,
"learning_rate": 2.583108715184187e-06,
"loss": 2.2051,
"step": 1770
},
{
"epoch": 0.7555460750853242,
"grad_norm": 58.64693069458008,
"learning_rate": 2.578616352201258e-06,
"loss": 2.1289,
"step": 1771
},
{
"epoch": 0.7559726962457338,
"grad_norm": 20.511198043823242,
"learning_rate": 2.574123989218329e-06,
"loss": 1.7251,
"step": 1772
},
{
"epoch": 0.7563993174061433,
"grad_norm": 19.378904342651367,
"learning_rate": 2.5696316262354e-06,
"loss": 2.2139,
"step": 1773
},
{
"epoch": 0.7568259385665529,
"grad_norm": 26.095415115356445,
"learning_rate": 2.565139263252471e-06,
"loss": 2.1777,
"step": 1774
},
{
"epoch": 0.7572525597269625,
"grad_norm": 22.2766056060791,
"learning_rate": 2.5606469002695424e-06,
"loss": 2.3418,
"step": 1775
},
{
"epoch": 0.757679180887372,
"grad_norm": 23.590354919433594,
"learning_rate": 2.556154537286613e-06,
"loss": 2.1255,
"step": 1776
},
{
"epoch": 0.7581058020477816,
"grad_norm": 20.151622772216797,
"learning_rate": 2.551662174303684e-06,
"loss": 2.3418,
"step": 1777
},
{
"epoch": 0.7585324232081911,
"grad_norm": 22.800518035888672,
"learning_rate": 2.547169811320755e-06,
"loss": 2.1279,
"step": 1778
},
{
"epoch": 0.7589590443686007,
"grad_norm": 20.73748207092285,
"learning_rate": 2.5426774483378257e-06,
"loss": 2.291,
"step": 1779
},
{
"epoch": 0.7593856655290102,
"grad_norm": 33.89189147949219,
"learning_rate": 2.5381850853548968e-06,
"loss": 2.3872,
"step": 1780
},
{
"epoch": 0.7598122866894198,
"grad_norm": 28.626930236816406,
"learning_rate": 2.533692722371968e-06,
"loss": 2.3867,
"step": 1781
},
{
"epoch": 0.7602389078498294,
"grad_norm": 18.091033935546875,
"learning_rate": 2.5292003593890393e-06,
"loss": 1.8428,
"step": 1782
},
{
"epoch": 0.7606655290102389,
"grad_norm": 24.8004093170166,
"learning_rate": 2.52470799640611e-06,
"loss": 2.3076,
"step": 1783
},
{
"epoch": 0.7610921501706485,
"grad_norm": 27.434419631958008,
"learning_rate": 2.520215633423181e-06,
"loss": 2.0391,
"step": 1784
},
{
"epoch": 0.761518771331058,
"grad_norm": 21.95925521850586,
"learning_rate": 2.515723270440252e-06,
"loss": 2.0225,
"step": 1785
},
{
"epoch": 0.7619453924914675,
"grad_norm": 20.50782585144043,
"learning_rate": 2.5112309074573226e-06,
"loss": 1.7798,
"step": 1786
},
{
"epoch": 0.7623720136518771,
"grad_norm": 23.383893966674805,
"learning_rate": 2.5067385444743936e-06,
"loss": 1.7754,
"step": 1787
},
{
"epoch": 0.7627986348122867,
"grad_norm": 30.2733154296875,
"learning_rate": 2.5022461814914647e-06,
"loss": 1.896,
"step": 1788
},
{
"epoch": 0.7632252559726962,
"grad_norm": 50.34253692626953,
"learning_rate": 2.4977538185085357e-06,
"loss": 3.1006,
"step": 1789
},
{
"epoch": 0.7636518771331058,
"grad_norm": 31.713327407836914,
"learning_rate": 2.4932614555256063e-06,
"loss": 2.1416,
"step": 1790
},
{
"epoch": 0.7640784982935154,
"grad_norm": 24.32350730895996,
"learning_rate": 2.488769092542678e-06,
"loss": 2.6396,
"step": 1791
},
{
"epoch": 0.764505119453925,
"grad_norm": 20.80146026611328,
"learning_rate": 2.4842767295597484e-06,
"loss": 2.4419,
"step": 1792
},
{
"epoch": 0.7649317406143344,
"grad_norm": 17.979833602905273,
"learning_rate": 2.4797843665768195e-06,
"loss": 2.0811,
"step": 1793
},
{
"epoch": 0.765358361774744,
"grad_norm": 33.28745651245117,
"learning_rate": 2.4752920035938905e-06,
"loss": 1.9746,
"step": 1794
},
{
"epoch": 0.7657849829351536,
"grad_norm": 20.66737174987793,
"learning_rate": 2.4707996406109616e-06,
"loss": 2.0127,
"step": 1795
},
{
"epoch": 0.7662116040955631,
"grad_norm": 24.25516700744629,
"learning_rate": 2.4663072776280326e-06,
"loss": 2.1421,
"step": 1796
},
{
"epoch": 0.7666382252559727,
"grad_norm": 31.5616455078125,
"learning_rate": 2.4618149146451032e-06,
"loss": 2.6055,
"step": 1797
},
{
"epoch": 0.7670648464163823,
"grad_norm": 25.29386329650879,
"learning_rate": 2.4573225516621747e-06,
"loss": 2.2949,
"step": 1798
},
{
"epoch": 0.7674914675767918,
"grad_norm": 19.253997802734375,
"learning_rate": 2.4528301886792453e-06,
"loss": 2.3252,
"step": 1799
},
{
"epoch": 0.7679180887372014,
"grad_norm": 32.48329162597656,
"learning_rate": 2.4483378256963164e-06,
"loss": 2.1895,
"step": 1800
},
{
"epoch": 0.768344709897611,
"grad_norm": 28.606746673583984,
"learning_rate": 2.4438454627133874e-06,
"loss": 1.7744,
"step": 1801
},
{
"epoch": 0.7687713310580204,
"grad_norm": 26.95733642578125,
"learning_rate": 2.4393530997304585e-06,
"loss": 2.6348,
"step": 1802
},
{
"epoch": 0.76919795221843,
"grad_norm": 24.73307991027832,
"learning_rate": 2.4348607367475295e-06,
"loss": 2.4717,
"step": 1803
},
{
"epoch": 0.7696245733788396,
"grad_norm": 33.115753173828125,
"learning_rate": 2.4303683737646e-06,
"loss": 2.4561,
"step": 1804
},
{
"epoch": 0.7700511945392492,
"grad_norm": 24.048250198364258,
"learning_rate": 2.4258760107816716e-06,
"loss": 1.9028,
"step": 1805
},
{
"epoch": 0.7704778156996587,
"grad_norm": 24.314634323120117,
"learning_rate": 2.421383647798742e-06,
"loss": 2.0,
"step": 1806
},
{
"epoch": 0.7709044368600683,
"grad_norm": 29.49398422241211,
"learning_rate": 2.4168912848158133e-06,
"loss": 2.3843,
"step": 1807
},
{
"epoch": 0.7713310580204779,
"grad_norm": 17.011449813842773,
"learning_rate": 2.4123989218328843e-06,
"loss": 1.7373,
"step": 1808
},
{
"epoch": 0.7717576791808873,
"grad_norm": 31.276538848876953,
"learning_rate": 2.4079065588499553e-06,
"loss": 2.4697,
"step": 1809
},
{
"epoch": 0.7721843003412969,
"grad_norm": 31.9920711517334,
"learning_rate": 2.4034141958670264e-06,
"loss": 2.0557,
"step": 1810
},
{
"epoch": 0.7726109215017065,
"grad_norm": 23.170621871948242,
"learning_rate": 2.398921832884097e-06,
"loss": 2.3584,
"step": 1811
},
{
"epoch": 0.773037542662116,
"grad_norm": 25.794071197509766,
"learning_rate": 2.3944294699011685e-06,
"loss": 2.5537,
"step": 1812
},
{
"epoch": 0.7734641638225256,
"grad_norm": 22.407812118530273,
"learning_rate": 2.389937106918239e-06,
"loss": 2.1099,
"step": 1813
},
{
"epoch": 0.7738907849829352,
"grad_norm": 20.0296630859375,
"learning_rate": 2.38544474393531e-06,
"loss": 1.8984,
"step": 1814
},
{
"epoch": 0.7743174061433447,
"grad_norm": 18.59185218811035,
"learning_rate": 2.380952380952381e-06,
"loss": 2.2271,
"step": 1815
},
{
"epoch": 0.7747440273037542,
"grad_norm": 23.263097763061523,
"learning_rate": 2.3764600179694522e-06,
"loss": 2.3281,
"step": 1816
},
{
"epoch": 0.7751706484641638,
"grad_norm": 20.494384765625,
"learning_rate": 2.3719676549865233e-06,
"loss": 2.3438,
"step": 1817
},
{
"epoch": 0.7755972696245734,
"grad_norm": 30.46561050415039,
"learning_rate": 2.367475292003594e-06,
"loss": 1.9331,
"step": 1818
},
{
"epoch": 0.7760238907849829,
"grad_norm": 18.048294067382812,
"learning_rate": 2.3629829290206654e-06,
"loss": 2.1006,
"step": 1819
},
{
"epoch": 0.7764505119453925,
"grad_norm": 22.645097732543945,
"learning_rate": 2.358490566037736e-06,
"loss": 2.4961,
"step": 1820
},
{
"epoch": 0.7768771331058021,
"grad_norm": 36.97308349609375,
"learning_rate": 2.353998203054807e-06,
"loss": 2.7051,
"step": 1821
},
{
"epoch": 0.7773037542662116,
"grad_norm": 20.397632598876953,
"learning_rate": 2.349505840071878e-06,
"loss": 2.3193,
"step": 1822
},
{
"epoch": 0.7777303754266212,
"grad_norm": 18.061418533325195,
"learning_rate": 2.345013477088949e-06,
"loss": 1.8418,
"step": 1823
},
{
"epoch": 0.7781569965870307,
"grad_norm": 26.37966537475586,
"learning_rate": 2.34052111410602e-06,
"loss": 2.2393,
"step": 1824
},
{
"epoch": 0.7785836177474402,
"grad_norm": 18.162370681762695,
"learning_rate": 2.3360287511230908e-06,
"loss": 2.0254,
"step": 1825
},
{
"epoch": 0.7790102389078498,
"grad_norm": 24.261823654174805,
"learning_rate": 2.331536388140162e-06,
"loss": 2.1113,
"step": 1826
},
{
"epoch": 0.7794368600682594,
"grad_norm": 36.012611389160156,
"learning_rate": 2.327044025157233e-06,
"loss": 1.8486,
"step": 1827
},
{
"epoch": 0.7798634812286689,
"grad_norm": 21.801942825317383,
"learning_rate": 2.322551662174304e-06,
"loss": 1.9272,
"step": 1828
},
{
"epoch": 0.7802901023890785,
"grad_norm": 20.881542205810547,
"learning_rate": 2.3180592991913745e-06,
"loss": 2.333,
"step": 1829
},
{
"epoch": 0.7807167235494881,
"grad_norm": 18.64068031311035,
"learning_rate": 2.313566936208446e-06,
"loss": 2.1831,
"step": 1830
},
{
"epoch": 0.7811433447098977,
"grad_norm": 20.850994110107422,
"learning_rate": 2.3090745732255166e-06,
"loss": 2.0732,
"step": 1831
},
{
"epoch": 0.7815699658703071,
"grad_norm": 22.28333282470703,
"learning_rate": 2.3045822102425877e-06,
"loss": 1.7607,
"step": 1832
},
{
"epoch": 0.7819965870307167,
"grad_norm": 30.814220428466797,
"learning_rate": 2.3000898472596587e-06,
"loss": 2.5879,
"step": 1833
},
{
"epoch": 0.7824232081911263,
"grad_norm": 31.240434646606445,
"learning_rate": 2.2955974842767298e-06,
"loss": 2.3564,
"step": 1834
},
{
"epoch": 0.7828498293515358,
"grad_norm": 25.117435455322266,
"learning_rate": 2.291105121293801e-06,
"loss": 2.0645,
"step": 1835
},
{
"epoch": 0.7832764505119454,
"grad_norm": 45.28719711303711,
"learning_rate": 2.2866127583108714e-06,
"loss": 1.9028,
"step": 1836
},
{
"epoch": 0.783703071672355,
"grad_norm": 17.77985191345215,
"learning_rate": 2.282120395327943e-06,
"loss": 2.1016,
"step": 1837
},
{
"epoch": 0.7841296928327645,
"grad_norm": 31.70467185974121,
"learning_rate": 2.2776280323450135e-06,
"loss": 2.3066,
"step": 1838
},
{
"epoch": 0.784556313993174,
"grad_norm": 20.322813034057617,
"learning_rate": 2.2731356693620846e-06,
"loss": 2.3457,
"step": 1839
},
{
"epoch": 0.7849829351535836,
"grad_norm": 22.35746955871582,
"learning_rate": 2.2686433063791556e-06,
"loss": 2.2314,
"step": 1840
},
{
"epoch": 0.7854095563139932,
"grad_norm": 19.00843048095703,
"learning_rate": 2.2641509433962266e-06,
"loss": 2.0537,
"step": 1841
},
{
"epoch": 0.7858361774744027,
"grad_norm": 23.502687454223633,
"learning_rate": 2.2596585804132977e-06,
"loss": 2.5869,
"step": 1842
},
{
"epoch": 0.7862627986348123,
"grad_norm": 23.143768310546875,
"learning_rate": 2.2551662174303683e-06,
"loss": 1.8096,
"step": 1843
},
{
"epoch": 0.7866894197952219,
"grad_norm": 16.373092651367188,
"learning_rate": 2.2506738544474398e-06,
"loss": 1.606,
"step": 1844
},
{
"epoch": 0.7871160409556314,
"grad_norm": 18.551097869873047,
"learning_rate": 2.2461814914645104e-06,
"loss": 1.8193,
"step": 1845
},
{
"epoch": 0.787542662116041,
"grad_norm": 22.2662410736084,
"learning_rate": 2.2416891284815814e-06,
"loss": 1.958,
"step": 1846
},
{
"epoch": 0.7879692832764505,
"grad_norm": 49.5590705871582,
"learning_rate": 2.2371967654986525e-06,
"loss": 2.3447,
"step": 1847
},
{
"epoch": 0.78839590443686,
"grad_norm": 42.248992919921875,
"learning_rate": 2.2327044025157235e-06,
"loss": 2.0059,
"step": 1848
},
{
"epoch": 0.7888225255972696,
"grad_norm": 58.42856216430664,
"learning_rate": 2.2282120395327946e-06,
"loss": 2.5,
"step": 1849
},
{
"epoch": 0.7892491467576792,
"grad_norm": 35.04775619506836,
"learning_rate": 2.223719676549865e-06,
"loss": 2.4609,
"step": 1850
},
{
"epoch": 0.7896757679180887,
"grad_norm": 21.540834426879883,
"learning_rate": 2.2192273135669367e-06,
"loss": 2.0635,
"step": 1851
},
{
"epoch": 0.7901023890784983,
"grad_norm": 46.99656295776367,
"learning_rate": 2.2147349505840073e-06,
"loss": 2.1025,
"step": 1852
},
{
"epoch": 0.7905290102389079,
"grad_norm": 27.469890594482422,
"learning_rate": 2.2102425876010783e-06,
"loss": 2.3389,
"step": 1853
},
{
"epoch": 0.7909556313993175,
"grad_norm": 23.75452995300293,
"learning_rate": 2.2057502246181494e-06,
"loss": 2.1201,
"step": 1854
},
{
"epoch": 0.7913822525597269,
"grad_norm": 19.295225143432617,
"learning_rate": 2.2012578616352204e-06,
"loss": 1.9268,
"step": 1855
},
{
"epoch": 0.7918088737201365,
"grad_norm": 24.537572860717773,
"learning_rate": 2.1967654986522915e-06,
"loss": 2.1846,
"step": 1856
},
{
"epoch": 0.7922354948805461,
"grad_norm": 25.888763427734375,
"learning_rate": 2.192273135669362e-06,
"loss": 2.3018,
"step": 1857
},
{
"epoch": 0.7926621160409556,
"grad_norm": 17.448314666748047,
"learning_rate": 2.1877807726864336e-06,
"loss": 1.6963,
"step": 1858
},
{
"epoch": 0.7930887372013652,
"grad_norm": 31.14927101135254,
"learning_rate": 2.183288409703504e-06,
"loss": 2.1484,
"step": 1859
},
{
"epoch": 0.7935153583617748,
"grad_norm": 28.934234619140625,
"learning_rate": 2.1787960467205752e-06,
"loss": 2.0508,
"step": 1860
},
{
"epoch": 0.7939419795221843,
"grad_norm": 19.254159927368164,
"learning_rate": 2.174303683737646e-06,
"loss": 1.8345,
"step": 1861
},
{
"epoch": 0.7943686006825939,
"grad_norm": 22.62273406982422,
"learning_rate": 2.1698113207547173e-06,
"loss": 2.6045,
"step": 1862
},
{
"epoch": 0.7947952218430034,
"grad_norm": 26.880126953125,
"learning_rate": 2.165318957771788e-06,
"loss": 2.0889,
"step": 1863
},
{
"epoch": 0.7952218430034129,
"grad_norm": 25.2117977142334,
"learning_rate": 2.160826594788859e-06,
"loss": 2.2568,
"step": 1864
},
{
"epoch": 0.7956484641638225,
"grad_norm": 47.15989303588867,
"learning_rate": 2.15633423180593e-06,
"loss": 1.9131,
"step": 1865
},
{
"epoch": 0.7960750853242321,
"grad_norm": 43.305965423583984,
"learning_rate": 2.151841868823001e-06,
"loss": 2.1338,
"step": 1866
},
{
"epoch": 0.7965017064846417,
"grad_norm": 31.032060623168945,
"learning_rate": 2.147349505840072e-06,
"loss": 2.2998,
"step": 1867
},
{
"epoch": 0.7969283276450512,
"grad_norm": 31.633068084716797,
"learning_rate": 2.1428571428571427e-06,
"loss": 2.02,
"step": 1868
},
{
"epoch": 0.7973549488054608,
"grad_norm": 25.194854736328125,
"learning_rate": 2.138364779874214e-06,
"loss": 2.2529,
"step": 1869
},
{
"epoch": 0.7977815699658704,
"grad_norm": 23.624929428100586,
"learning_rate": 2.133872416891285e-06,
"loss": 2.0996,
"step": 1870
},
{
"epoch": 0.7982081911262798,
"grad_norm": 24.722183227539062,
"learning_rate": 2.129380053908356e-06,
"loss": 2.0752,
"step": 1871
},
{
"epoch": 0.7986348122866894,
"grad_norm": 28.390928268432617,
"learning_rate": 2.124887690925427e-06,
"loss": 1.793,
"step": 1872
},
{
"epoch": 0.799061433447099,
"grad_norm": 17.568288803100586,
"learning_rate": 2.120395327942498e-06,
"loss": 1.8057,
"step": 1873
},
{
"epoch": 0.7994880546075085,
"grad_norm": 25.567970275878906,
"learning_rate": 2.115902964959569e-06,
"loss": 1.7949,
"step": 1874
},
{
"epoch": 0.7999146757679181,
"grad_norm": 20.945209503173828,
"learning_rate": 2.1114106019766396e-06,
"loss": 2.3408,
"step": 1875
},
{
"epoch": 0.8003412969283277,
"grad_norm": 23.48021697998047,
"learning_rate": 2.106918238993711e-06,
"loss": 2.0088,
"step": 1876
},
{
"epoch": 0.8007679180887372,
"grad_norm": 31.483278274536133,
"learning_rate": 2.1024258760107817e-06,
"loss": 2.6606,
"step": 1877
},
{
"epoch": 0.8011945392491467,
"grad_norm": 21.021120071411133,
"learning_rate": 2.0979335130278528e-06,
"loss": 1.9497,
"step": 1878
},
{
"epoch": 0.8016211604095563,
"grad_norm": 20.318021774291992,
"learning_rate": 2.093441150044924e-06,
"loss": 2.3223,
"step": 1879
},
{
"epoch": 0.8020477815699659,
"grad_norm": 33.12303924560547,
"learning_rate": 2.088948787061995e-06,
"loss": 1.3691,
"step": 1880
},
{
"epoch": 0.8024744027303754,
"grad_norm": 20.866891860961914,
"learning_rate": 2.084456424079066e-06,
"loss": 1.8916,
"step": 1881
},
{
"epoch": 0.802901023890785,
"grad_norm": 33.17778396606445,
"learning_rate": 2.0799640610961365e-06,
"loss": 2.2188,
"step": 1882
},
{
"epoch": 0.8033276450511946,
"grad_norm": 27.898754119873047,
"learning_rate": 2.075471698113208e-06,
"loss": 2.0117,
"step": 1883
},
{
"epoch": 0.8037542662116041,
"grad_norm": 25.94223976135254,
"learning_rate": 2.0709793351302786e-06,
"loss": 1.7227,
"step": 1884
},
{
"epoch": 0.8041808873720137,
"grad_norm": 21.092864990234375,
"learning_rate": 2.0664869721473496e-06,
"loss": 1.9883,
"step": 1885
},
{
"epoch": 0.8046075085324232,
"grad_norm": 28.999971389770508,
"learning_rate": 2.0619946091644207e-06,
"loss": 1.9463,
"step": 1886
},
{
"epoch": 0.8050341296928327,
"grad_norm": 22.959543228149414,
"learning_rate": 2.0575022461814917e-06,
"loss": 1.6489,
"step": 1887
},
{
"epoch": 0.8054607508532423,
"grad_norm": 26.268436431884766,
"learning_rate": 2.0530098831985628e-06,
"loss": 1.9351,
"step": 1888
},
{
"epoch": 0.8058873720136519,
"grad_norm": 24.802099227905273,
"learning_rate": 2.0485175202156334e-06,
"loss": 2.6807,
"step": 1889
},
{
"epoch": 0.8063139931740614,
"grad_norm": 24.925498962402344,
"learning_rate": 2.044025157232705e-06,
"loss": 2.2598,
"step": 1890
},
{
"epoch": 0.806740614334471,
"grad_norm": 27.905561447143555,
"learning_rate": 2.0395327942497755e-06,
"loss": 1.9902,
"step": 1891
},
{
"epoch": 0.8071672354948806,
"grad_norm": 29.847583770751953,
"learning_rate": 2.0350404312668465e-06,
"loss": 2.1709,
"step": 1892
},
{
"epoch": 0.8075938566552902,
"grad_norm": 27.835908889770508,
"learning_rate": 2.0305480682839176e-06,
"loss": 2.6738,
"step": 1893
},
{
"epoch": 0.8080204778156996,
"grad_norm": 31.87246322631836,
"learning_rate": 2.0260557053009886e-06,
"loss": 2.5508,
"step": 1894
},
{
"epoch": 0.8084470989761092,
"grad_norm": 22.407638549804688,
"learning_rate": 2.0215633423180597e-06,
"loss": 2.5898,
"step": 1895
},
{
"epoch": 0.8088737201365188,
"grad_norm": 23.444799423217773,
"learning_rate": 2.0170709793351303e-06,
"loss": 2.5332,
"step": 1896
},
{
"epoch": 0.8093003412969283,
"grad_norm": 24.204666137695312,
"learning_rate": 2.0125786163522013e-06,
"loss": 1.9404,
"step": 1897
},
{
"epoch": 0.8097269624573379,
"grad_norm": 22.92106056213379,
"learning_rate": 2.0080862533692724e-06,
"loss": 1.8535,
"step": 1898
},
{
"epoch": 0.8101535836177475,
"grad_norm": 24.548683166503906,
"learning_rate": 2.0035938903863434e-06,
"loss": 1.8535,
"step": 1899
},
{
"epoch": 0.810580204778157,
"grad_norm": 20.212099075317383,
"learning_rate": 1.999101527403414e-06,
"loss": 2.0874,
"step": 1900
},
{
"epoch": 0.8110068259385665,
"grad_norm": 20.215652465820312,
"learning_rate": 1.9946091644204855e-06,
"loss": 1.9922,
"step": 1901
},
{
"epoch": 0.8114334470989761,
"grad_norm": 23.670936584472656,
"learning_rate": 1.990116801437556e-06,
"loss": 2.3438,
"step": 1902
},
{
"epoch": 0.8118600682593856,
"grad_norm": 29.262372970581055,
"learning_rate": 1.985624438454627e-06,
"loss": 2.1133,
"step": 1903
},
{
"epoch": 0.8122866894197952,
"grad_norm": 38.6277961730957,
"learning_rate": 1.981132075471698e-06,
"loss": 2.124,
"step": 1904
},
{
"epoch": 0.8127133105802048,
"grad_norm": 34.57408142089844,
"learning_rate": 1.9766397124887693e-06,
"loss": 2.1533,
"step": 1905
},
{
"epoch": 0.8131399317406144,
"grad_norm": 22.756662368774414,
"learning_rate": 1.9721473495058403e-06,
"loss": 2.4146,
"step": 1906
},
{
"epoch": 0.8135665529010239,
"grad_norm": 23.002607345581055,
"learning_rate": 1.967654986522911e-06,
"loss": 1.7754,
"step": 1907
},
{
"epoch": 0.8139931740614335,
"grad_norm": 22.411535263061523,
"learning_rate": 1.9631626235399824e-06,
"loss": 2.0752,
"step": 1908
},
{
"epoch": 0.814419795221843,
"grad_norm": 43.06706619262695,
"learning_rate": 1.958670260557053e-06,
"loss": 2.0039,
"step": 1909
},
{
"epoch": 0.8148464163822525,
"grad_norm": 33.41255187988281,
"learning_rate": 1.954177897574124e-06,
"loss": 2.0869,
"step": 1910
},
{
"epoch": 0.8152730375426621,
"grad_norm": 18.505645751953125,
"learning_rate": 1.949685534591195e-06,
"loss": 1.9316,
"step": 1911
},
{
"epoch": 0.8156996587030717,
"grad_norm": 25.759122848510742,
"learning_rate": 1.945193171608266e-06,
"loss": 2.1436,
"step": 1912
},
{
"epoch": 0.8161262798634812,
"grad_norm": 22.365253448486328,
"learning_rate": 1.940700808625337e-06,
"loss": 2.2036,
"step": 1913
},
{
"epoch": 0.8165529010238908,
"grad_norm": 21.153892517089844,
"learning_rate": 1.936208445642408e-06,
"loss": 2.0137,
"step": 1914
},
{
"epoch": 0.8169795221843004,
"grad_norm": 20.937232971191406,
"learning_rate": 1.9317160826594793e-06,
"loss": 2.2578,
"step": 1915
},
{
"epoch": 0.8174061433447098,
"grad_norm": 19.788347244262695,
"learning_rate": 1.92722371967655e-06,
"loss": 1.9375,
"step": 1916
},
{
"epoch": 0.8178327645051194,
"grad_norm": 35.839080810546875,
"learning_rate": 1.922731356693621e-06,
"loss": 2.4355,
"step": 1917
},
{
"epoch": 0.818259385665529,
"grad_norm": 24.199609756469727,
"learning_rate": 1.918238993710692e-06,
"loss": 2.417,
"step": 1918
},
{
"epoch": 0.8186860068259386,
"grad_norm": 17.443004608154297,
"learning_rate": 1.913746630727763e-06,
"loss": 1.7695,
"step": 1919
},
{
"epoch": 0.8191126279863481,
"grad_norm": 42.2618293762207,
"learning_rate": 1.909254267744834e-06,
"loss": 1.9414,
"step": 1920
},
{
"epoch": 0.8195392491467577,
"grad_norm": 20.51604652404785,
"learning_rate": 1.904761904761905e-06,
"loss": 2.1553,
"step": 1921
},
{
"epoch": 0.8199658703071673,
"grad_norm": 18.936643600463867,
"learning_rate": 1.900269541778976e-06,
"loss": 2.4023,
"step": 1922
},
{
"epoch": 0.8203924914675768,
"grad_norm": 28.36236000061035,
"learning_rate": 1.895777178796047e-06,
"loss": 1.9023,
"step": 1923
},
{
"epoch": 0.8208191126279863,
"grad_norm": 17.966564178466797,
"learning_rate": 1.8912848158131178e-06,
"loss": 1.5234,
"step": 1924
},
{
"epoch": 0.8212457337883959,
"grad_norm": 23.814815521240234,
"learning_rate": 1.8867924528301889e-06,
"loss": 2.0996,
"step": 1925
},
{
"epoch": 0.8216723549488054,
"grad_norm": 22.847030639648438,
"learning_rate": 1.88230008984726e-06,
"loss": 2.1987,
"step": 1926
},
{
"epoch": 0.822098976109215,
"grad_norm": 23.444608688354492,
"learning_rate": 1.8778077268643308e-06,
"loss": 1.8022,
"step": 1927
},
{
"epoch": 0.8225255972696246,
"grad_norm": 16.979387283325195,
"learning_rate": 1.8733153638814016e-06,
"loss": 1.7539,
"step": 1928
},
{
"epoch": 0.8229522184300341,
"grad_norm": 21.849397659301758,
"learning_rate": 1.8688230008984728e-06,
"loss": 2.2803,
"step": 1929
},
{
"epoch": 0.8233788395904437,
"grad_norm": 27.620882034301758,
"learning_rate": 1.8643306379155437e-06,
"loss": 2.2627,
"step": 1930
},
{
"epoch": 0.8238054607508533,
"grad_norm": 44.100711822509766,
"learning_rate": 1.8598382749326147e-06,
"loss": 1.9717,
"step": 1931
},
{
"epoch": 0.8242320819112628,
"grad_norm": 25.661659240722656,
"learning_rate": 1.8553459119496855e-06,
"loss": 1.8379,
"step": 1932
},
{
"epoch": 0.8246587030716723,
"grad_norm": 23.532838821411133,
"learning_rate": 1.8508535489667568e-06,
"loss": 2.4883,
"step": 1933
},
{
"epoch": 0.8250853242320819,
"grad_norm": 33.77391815185547,
"learning_rate": 1.8463611859838276e-06,
"loss": 2.2764,
"step": 1934
},
{
"epoch": 0.8255119453924915,
"grad_norm": 33.51873016357422,
"learning_rate": 1.8418688230008985e-06,
"loss": 1.7642,
"step": 1935
},
{
"epoch": 0.825938566552901,
"grad_norm": 19.887741088867188,
"learning_rate": 1.8373764600179697e-06,
"loss": 2.0903,
"step": 1936
},
{
"epoch": 0.8263651877133106,
"grad_norm": 33.04109573364258,
"learning_rate": 1.8328840970350406e-06,
"loss": 2.3115,
"step": 1937
},
{
"epoch": 0.8267918088737202,
"grad_norm": 21.50177574157715,
"learning_rate": 1.8283917340521116e-06,
"loss": 2.127,
"step": 1938
},
{
"epoch": 0.8272184300341296,
"grad_norm": 24.439558029174805,
"learning_rate": 1.8238993710691824e-06,
"loss": 2.4102,
"step": 1939
},
{
"epoch": 0.8276450511945392,
"grad_norm": 31.59778594970703,
"learning_rate": 1.8194070080862537e-06,
"loss": 2.3262,
"step": 1940
},
{
"epoch": 0.8280716723549488,
"grad_norm": 23.083101272583008,
"learning_rate": 1.8149146451033245e-06,
"loss": 2.3291,
"step": 1941
},
{
"epoch": 0.8284982935153583,
"grad_norm": 17.35047149658203,
"learning_rate": 1.8104222821203954e-06,
"loss": 1.8423,
"step": 1942
},
{
"epoch": 0.8289249146757679,
"grad_norm": 22.005908966064453,
"learning_rate": 1.8059299191374666e-06,
"loss": 1.9536,
"step": 1943
},
{
"epoch": 0.8293515358361775,
"grad_norm": 22.35020637512207,
"learning_rate": 1.8014375561545374e-06,
"loss": 2.1006,
"step": 1944
},
{
"epoch": 0.8297781569965871,
"grad_norm": 26.536985397338867,
"learning_rate": 1.7969451931716083e-06,
"loss": 2.0322,
"step": 1945
},
{
"epoch": 0.8302047781569966,
"grad_norm": 20.950786590576172,
"learning_rate": 1.7924528301886793e-06,
"loss": 2.1045,
"step": 1946
},
{
"epoch": 0.8306313993174061,
"grad_norm": 28.460447311401367,
"learning_rate": 1.7879604672057504e-06,
"loss": 2.0654,
"step": 1947
},
{
"epoch": 0.8310580204778157,
"grad_norm": 24.567773818969727,
"learning_rate": 1.7834681042228214e-06,
"loss": 2.2036,
"step": 1948
},
{
"epoch": 0.8314846416382252,
"grad_norm": 26.11224937438965,
"learning_rate": 1.7789757412398922e-06,
"loss": 1.8745,
"step": 1949
},
{
"epoch": 0.8319112627986348,
"grad_norm": 22.54220962524414,
"learning_rate": 1.7744833782569635e-06,
"loss": 1.6982,
"step": 1950
},
{
"epoch": 0.8323378839590444,
"grad_norm": 24.108007431030273,
"learning_rate": 1.7699910152740343e-06,
"loss": 2.042,
"step": 1951
},
{
"epoch": 0.8327645051194539,
"grad_norm": 16.941404342651367,
"learning_rate": 1.7654986522911052e-06,
"loss": 1.5225,
"step": 1952
},
{
"epoch": 0.8331911262798635,
"grad_norm": 31.694730758666992,
"learning_rate": 1.7610062893081762e-06,
"loss": 1.9097,
"step": 1953
},
{
"epoch": 0.8336177474402731,
"grad_norm": 20.774200439453125,
"learning_rate": 1.7565139263252473e-06,
"loss": 1.9844,
"step": 1954
},
{
"epoch": 0.8340443686006825,
"grad_norm": 27.799217224121094,
"learning_rate": 1.7520215633423183e-06,
"loss": 2.0264,
"step": 1955
},
{
"epoch": 0.8344709897610921,
"grad_norm": 30.80106544494629,
"learning_rate": 1.7475292003593891e-06,
"loss": 2.6523,
"step": 1956
},
{
"epoch": 0.8348976109215017,
"grad_norm": 20.44435691833496,
"learning_rate": 1.7430368373764604e-06,
"loss": 1.7144,
"step": 1957
},
{
"epoch": 0.8353242320819113,
"grad_norm": 31.098176956176758,
"learning_rate": 1.7385444743935312e-06,
"loss": 2.502,
"step": 1958
},
{
"epoch": 0.8357508532423208,
"grad_norm": 25.127107620239258,
"learning_rate": 1.734052111410602e-06,
"loss": 2.125,
"step": 1959
},
{
"epoch": 0.8361774744027304,
"grad_norm": 27.822410583496094,
"learning_rate": 1.7295597484276729e-06,
"loss": 2.1826,
"step": 1960
},
{
"epoch": 0.83660409556314,
"grad_norm": 22.251161575317383,
"learning_rate": 1.7250673854447441e-06,
"loss": 2.0444,
"step": 1961
},
{
"epoch": 0.8370307167235495,
"grad_norm": 23.80228042602539,
"learning_rate": 1.720575022461815e-06,
"loss": 2.4683,
"step": 1962
},
{
"epoch": 0.837457337883959,
"grad_norm": 22.37078857421875,
"learning_rate": 1.716082659478886e-06,
"loss": 2.1426,
"step": 1963
},
{
"epoch": 0.8378839590443686,
"grad_norm": 19.09315299987793,
"learning_rate": 1.711590296495957e-06,
"loss": 1.8899,
"step": 1964
},
{
"epoch": 0.8383105802047781,
"grad_norm": 23.49970817565918,
"learning_rate": 1.7070979335130281e-06,
"loss": 2.5645,
"step": 1965
},
{
"epoch": 0.8387372013651877,
"grad_norm": 68.0501480102539,
"learning_rate": 1.702605570530099e-06,
"loss": 2.2539,
"step": 1966
},
{
"epoch": 0.8391638225255973,
"grad_norm": 24.918535232543945,
"learning_rate": 1.6981132075471698e-06,
"loss": 2.1025,
"step": 1967
},
{
"epoch": 0.8395904436860068,
"grad_norm": 30.718915939331055,
"learning_rate": 1.693620844564241e-06,
"loss": 1.6104,
"step": 1968
},
{
"epoch": 0.8400170648464164,
"grad_norm": 24.890552520751953,
"learning_rate": 1.6891284815813119e-06,
"loss": 2.416,
"step": 1969
},
{
"epoch": 0.840443686006826,
"grad_norm": 23.353469848632812,
"learning_rate": 1.684636118598383e-06,
"loss": 2.7842,
"step": 1970
},
{
"epoch": 0.8408703071672355,
"grad_norm": 21.08226776123047,
"learning_rate": 1.6801437556154537e-06,
"loss": 1.8774,
"step": 1971
},
{
"epoch": 0.841296928327645,
"grad_norm": 20.481369018554688,
"learning_rate": 1.675651392632525e-06,
"loss": 2.2529,
"step": 1972
},
{
"epoch": 0.8417235494880546,
"grad_norm": 37.47206115722656,
"learning_rate": 1.6711590296495958e-06,
"loss": 2.1377,
"step": 1973
},
{
"epoch": 0.8421501706484642,
"grad_norm": 21.767244338989258,
"learning_rate": 1.6666666666666667e-06,
"loss": 1.8887,
"step": 1974
},
{
"epoch": 0.8425767918088737,
"grad_norm": 25.5743350982666,
"learning_rate": 1.662174303683738e-06,
"loss": 2.293,
"step": 1975
},
{
"epoch": 0.8430034129692833,
"grad_norm": 21.288652420043945,
"learning_rate": 1.6576819407008088e-06,
"loss": 1.9062,
"step": 1976
},
{
"epoch": 0.8434300341296929,
"grad_norm": 23.8049259185791,
"learning_rate": 1.6531895777178798e-06,
"loss": 2.0957,
"step": 1977
},
{
"epoch": 0.8438566552901023,
"grad_norm": 24.071441650390625,
"learning_rate": 1.6486972147349506e-06,
"loss": 2.2852,
"step": 1978
},
{
"epoch": 0.8442832764505119,
"grad_norm": 23.946063995361328,
"learning_rate": 1.6442048517520217e-06,
"loss": 1.9229,
"step": 1979
},
{
"epoch": 0.8447098976109215,
"grad_norm": 18.481111526489258,
"learning_rate": 1.6397124887690927e-06,
"loss": 1.8364,
"step": 1980
},
{
"epoch": 0.8451365187713311,
"grad_norm": 17.902070999145508,
"learning_rate": 1.6352201257861635e-06,
"loss": 1.9219,
"step": 1981
},
{
"epoch": 0.8455631399317406,
"grad_norm": 23.107677459716797,
"learning_rate": 1.6307277628032348e-06,
"loss": 1.8792,
"step": 1982
},
{
"epoch": 0.8459897610921502,
"grad_norm": 16.212055206298828,
"learning_rate": 1.6262353998203056e-06,
"loss": 1.9043,
"step": 1983
},
{
"epoch": 0.8464163822525598,
"grad_norm": 29.245019912719727,
"learning_rate": 1.6217430368373765e-06,
"loss": 1.8955,
"step": 1984
},
{
"epoch": 0.8468430034129693,
"grad_norm": 29.921899795532227,
"learning_rate": 1.6172506738544475e-06,
"loss": 2.2031,
"step": 1985
},
{
"epoch": 0.8472696245733788,
"grad_norm": 29.58133316040039,
"learning_rate": 1.6127583108715186e-06,
"loss": 2.3086,
"step": 1986
},
{
"epoch": 0.8476962457337884,
"grad_norm": 30.39230728149414,
"learning_rate": 1.6082659478885896e-06,
"loss": 2.1631,
"step": 1987
},
{
"epoch": 0.8481228668941979,
"grad_norm": 24.831098556518555,
"learning_rate": 1.6037735849056604e-06,
"loss": 2.2432,
"step": 1988
},
{
"epoch": 0.8485494880546075,
"grad_norm": 36.24596405029297,
"learning_rate": 1.5992812219227317e-06,
"loss": 2.2324,
"step": 1989
},
{
"epoch": 0.8489761092150171,
"grad_norm": 25.26513671875,
"learning_rate": 1.5947888589398025e-06,
"loss": 2.6475,
"step": 1990
},
{
"epoch": 0.8494027303754266,
"grad_norm": 22.898345947265625,
"learning_rate": 1.5902964959568734e-06,
"loss": 1.9658,
"step": 1991
},
{
"epoch": 0.8498293515358362,
"grad_norm": 24.595247268676758,
"learning_rate": 1.5858041329739444e-06,
"loss": 2.373,
"step": 1992
},
{
"epoch": 0.8502559726962458,
"grad_norm": 41.001983642578125,
"learning_rate": 1.5813117699910154e-06,
"loss": 2.5127,
"step": 1993
},
{
"epoch": 0.8506825938566553,
"grad_norm": 29.098054885864258,
"learning_rate": 1.5768194070080865e-06,
"loss": 2.0771,
"step": 1994
},
{
"epoch": 0.8511092150170648,
"grad_norm": 19.65403175354004,
"learning_rate": 1.5723270440251573e-06,
"loss": 2.1309,
"step": 1995
},
{
"epoch": 0.8515358361774744,
"grad_norm": 36.26100158691406,
"learning_rate": 1.5678346810422284e-06,
"loss": 1.751,
"step": 1996
},
{
"epoch": 0.851962457337884,
"grad_norm": 34.61759567260742,
"learning_rate": 1.5633423180592994e-06,
"loss": 1.8232,
"step": 1997
},
{
"epoch": 0.8523890784982935,
"grad_norm": 29.321847915649414,
"learning_rate": 1.5588499550763702e-06,
"loss": 1.8682,
"step": 1998
},
{
"epoch": 0.8528156996587031,
"grad_norm": 20.491769790649414,
"learning_rate": 1.554357592093441e-06,
"loss": 1.7466,
"step": 1999
},
{
"epoch": 0.8532423208191127,
"grad_norm": 25.29309844970703,
"learning_rate": 1.5498652291105123e-06,
"loss": 1.8062,
"step": 2000
},
{
"epoch": 0.8536689419795221,
"grad_norm": 21.117168426513672,
"learning_rate": 1.5453728661275832e-06,
"loss": 2.0132,
"step": 2001
},
{
"epoch": 0.8540955631399317,
"grad_norm": 26.844043731689453,
"learning_rate": 1.5408805031446542e-06,
"loss": 1.8477,
"step": 2002
},
{
"epoch": 0.8545221843003413,
"grad_norm": 28.656776428222656,
"learning_rate": 1.5363881401617253e-06,
"loss": 2.5469,
"step": 2003
},
{
"epoch": 0.8549488054607508,
"grad_norm": 20.729461669921875,
"learning_rate": 1.5318957771787963e-06,
"loss": 1.8193,
"step": 2004
},
{
"epoch": 0.8553754266211604,
"grad_norm": 24.91880226135254,
"learning_rate": 1.5274034141958671e-06,
"loss": 2.4961,
"step": 2005
},
{
"epoch": 0.85580204778157,
"grad_norm": 19.17313575744629,
"learning_rate": 1.522911051212938e-06,
"loss": 1.9546,
"step": 2006
},
{
"epoch": 0.8562286689419796,
"grad_norm": 17.52025604248047,
"learning_rate": 1.5184186882300092e-06,
"loss": 1.6777,
"step": 2007
},
{
"epoch": 0.856655290102389,
"grad_norm": 20.038908004760742,
"learning_rate": 1.51392632524708e-06,
"loss": 1.9248,
"step": 2008
},
{
"epoch": 0.8570819112627986,
"grad_norm": 41.33238220214844,
"learning_rate": 1.509433962264151e-06,
"loss": 2.4092,
"step": 2009
},
{
"epoch": 0.8575085324232082,
"grad_norm": 30.137481689453125,
"learning_rate": 1.504941599281222e-06,
"loss": 2.084,
"step": 2010
},
{
"epoch": 0.8579351535836177,
"grad_norm": 17.346654891967773,
"learning_rate": 1.5004492362982932e-06,
"loss": 1.4453,
"step": 2011
},
{
"epoch": 0.8583617747440273,
"grad_norm": 26.113149642944336,
"learning_rate": 1.495956873315364e-06,
"loss": 2.5786,
"step": 2012
},
{
"epoch": 0.8587883959044369,
"grad_norm": 26.139968872070312,
"learning_rate": 1.4914645103324349e-06,
"loss": 2.1934,
"step": 2013
},
{
"epoch": 0.8592150170648464,
"grad_norm": 24.80109214782715,
"learning_rate": 1.4869721473495061e-06,
"loss": 2.3604,
"step": 2014
},
{
"epoch": 0.859641638225256,
"grad_norm": 25.143037796020508,
"learning_rate": 1.482479784366577e-06,
"loss": 2.4048,
"step": 2015
},
{
"epoch": 0.8600682593856656,
"grad_norm": 29.503969192504883,
"learning_rate": 1.4779874213836478e-06,
"loss": 1.9688,
"step": 2016
},
{
"epoch": 0.860494880546075,
"grad_norm": 20.356889724731445,
"learning_rate": 1.4734950584007188e-06,
"loss": 2.2007,
"step": 2017
},
{
"epoch": 0.8609215017064846,
"grad_norm": 26.912221908569336,
"learning_rate": 1.4690026954177899e-06,
"loss": 2.3154,
"step": 2018
},
{
"epoch": 0.8613481228668942,
"grad_norm": 20.486494064331055,
"learning_rate": 1.464510332434861e-06,
"loss": 2.3149,
"step": 2019
},
{
"epoch": 0.8617747440273038,
"grad_norm": 23.620391845703125,
"learning_rate": 1.4600179694519317e-06,
"loss": 2.1152,
"step": 2020
},
{
"epoch": 0.8622013651877133,
"grad_norm": 19.62716293334961,
"learning_rate": 1.455525606469003e-06,
"loss": 1.7969,
"step": 2021
},
{
"epoch": 0.8626279863481229,
"grad_norm": 32.85416030883789,
"learning_rate": 1.4510332434860738e-06,
"loss": 2.248,
"step": 2022
},
{
"epoch": 0.8630546075085325,
"grad_norm": 18.510087966918945,
"learning_rate": 1.4465408805031447e-06,
"loss": 1.9766,
"step": 2023
},
{
"epoch": 0.863481228668942,
"grad_norm": 23.601825714111328,
"learning_rate": 1.4420485175202157e-06,
"loss": 2.5127,
"step": 2024
},
{
"epoch": 0.8639078498293515,
"grad_norm": 42.78569412231445,
"learning_rate": 1.4375561545372868e-06,
"loss": 2.3135,
"step": 2025
},
{
"epoch": 0.8643344709897611,
"grad_norm": 22.61996841430664,
"learning_rate": 1.4330637915543578e-06,
"loss": 1.7754,
"step": 2026
},
{
"epoch": 0.8647610921501706,
"grad_norm": 20.096586227416992,
"learning_rate": 1.4285714285714286e-06,
"loss": 2.1143,
"step": 2027
},
{
"epoch": 0.8651877133105802,
"grad_norm": 32.09673309326172,
"learning_rate": 1.4240790655884999e-06,
"loss": 1.9922,
"step": 2028
},
{
"epoch": 0.8656143344709898,
"grad_norm": 23.80088233947754,
"learning_rate": 1.4195867026055707e-06,
"loss": 2.1055,
"step": 2029
},
{
"epoch": 0.8660409556313993,
"grad_norm": 31.186294555664062,
"learning_rate": 1.4150943396226415e-06,
"loss": 2.3418,
"step": 2030
},
{
"epoch": 0.8664675767918089,
"grad_norm": 24.158933639526367,
"learning_rate": 1.4106019766397126e-06,
"loss": 2.0342,
"step": 2031
},
{
"epoch": 0.8668941979522184,
"grad_norm": 23.584606170654297,
"learning_rate": 1.4061096136567836e-06,
"loss": 2.3428,
"step": 2032
},
{
"epoch": 0.867320819112628,
"grad_norm": 19.908782958984375,
"learning_rate": 1.4016172506738545e-06,
"loss": 1.9004,
"step": 2033
},
{
"epoch": 0.8677474402730375,
"grad_norm": 19.742876052856445,
"learning_rate": 1.3971248876909255e-06,
"loss": 1.5688,
"step": 2034
},
{
"epoch": 0.8681740614334471,
"grad_norm": 22.75543975830078,
"learning_rate": 1.3926325247079966e-06,
"loss": 2.5547,
"step": 2035
},
{
"epoch": 0.8686006825938567,
"grad_norm": 32.865848541259766,
"learning_rate": 1.3881401617250676e-06,
"loss": 2.0195,
"step": 2036
},
{
"epoch": 0.8690273037542662,
"grad_norm": 25.184295654296875,
"learning_rate": 1.3836477987421384e-06,
"loss": 2.1816,
"step": 2037
},
{
"epoch": 0.8694539249146758,
"grad_norm": 19.12177085876465,
"learning_rate": 1.3791554357592093e-06,
"loss": 2.3477,
"step": 2038
},
{
"epoch": 0.8698805460750854,
"grad_norm": 21.0264949798584,
"learning_rate": 1.3746630727762805e-06,
"loss": 1.7646,
"step": 2039
},
{
"epoch": 0.8703071672354948,
"grad_norm": 24.255212783813477,
"learning_rate": 1.3701707097933514e-06,
"loss": 2.2432,
"step": 2040
},
{
"epoch": 0.8707337883959044,
"grad_norm": 18.39045524597168,
"learning_rate": 1.3656783468104224e-06,
"loss": 1.6699,
"step": 2041
},
{
"epoch": 0.871160409556314,
"grad_norm": 18.828027725219727,
"learning_rate": 1.3611859838274934e-06,
"loss": 2.2451,
"step": 2042
},
{
"epoch": 0.8715870307167235,
"grad_norm": 37.50020980834961,
"learning_rate": 1.3566936208445645e-06,
"loss": 1.7896,
"step": 2043
},
{
"epoch": 0.8720136518771331,
"grad_norm": 19.664005279541016,
"learning_rate": 1.3522012578616353e-06,
"loss": 1.8555,
"step": 2044
},
{
"epoch": 0.8724402730375427,
"grad_norm": 17.749042510986328,
"learning_rate": 1.3477088948787062e-06,
"loss": 1.6689,
"step": 2045
},
{
"epoch": 0.8728668941979523,
"grad_norm": 21.117351531982422,
"learning_rate": 1.3432165318957774e-06,
"loss": 2.3311,
"step": 2046
},
{
"epoch": 0.8732935153583617,
"grad_norm": 20.475221633911133,
"learning_rate": 1.3387241689128482e-06,
"loss": 1.9712,
"step": 2047
},
{
"epoch": 0.8737201365187713,
"grad_norm": 22.14813804626465,
"learning_rate": 1.3342318059299193e-06,
"loss": 2.1699,
"step": 2048
},
{
"epoch": 0.8741467576791809,
"grad_norm": 22.878765106201172,
"learning_rate": 1.3297394429469901e-06,
"loss": 2.1074,
"step": 2049
},
{
"epoch": 0.8745733788395904,
"grad_norm": 20.329811096191406,
"learning_rate": 1.3252470799640612e-06,
"loss": 1.9375,
"step": 2050
},
{
"epoch": 0.875,
"grad_norm": 23.280149459838867,
"learning_rate": 1.3207547169811322e-06,
"loss": 1.877,
"step": 2051
},
{
"epoch": 0.8754266211604096,
"grad_norm": 19.771005630493164,
"learning_rate": 1.316262353998203e-06,
"loss": 1.9712,
"step": 2052
},
{
"epoch": 0.8758532423208191,
"grad_norm": 18.957622528076172,
"learning_rate": 1.3117699910152743e-06,
"loss": 1.6733,
"step": 2053
},
{
"epoch": 0.8762798634812287,
"grad_norm": 22.287721633911133,
"learning_rate": 1.3072776280323451e-06,
"loss": 2.3984,
"step": 2054
},
{
"epoch": 0.8767064846416383,
"grad_norm": 28.450151443481445,
"learning_rate": 1.302785265049416e-06,
"loss": 2.6621,
"step": 2055
},
{
"epoch": 0.8771331058020477,
"grad_norm": 23.43517303466797,
"learning_rate": 1.298292902066487e-06,
"loss": 1.9453,
"step": 2056
},
{
"epoch": 0.8775597269624573,
"grad_norm": 24.089502334594727,
"learning_rate": 1.293800539083558e-06,
"loss": 2.1045,
"step": 2057
},
{
"epoch": 0.8779863481228669,
"grad_norm": 26.210840225219727,
"learning_rate": 1.289308176100629e-06,
"loss": 2.2441,
"step": 2058
},
{
"epoch": 0.8784129692832765,
"grad_norm": 19.64763832092285,
"learning_rate": 1.2848158131177e-06,
"loss": 2.0576,
"step": 2059
},
{
"epoch": 0.878839590443686,
"grad_norm": 24.991710662841797,
"learning_rate": 1.2803234501347712e-06,
"loss": 2.0898,
"step": 2060
},
{
"epoch": 0.8792662116040956,
"grad_norm": 19.32874298095703,
"learning_rate": 1.275831087151842e-06,
"loss": 2.3027,
"step": 2061
},
{
"epoch": 0.8796928327645052,
"grad_norm": 17.936874389648438,
"learning_rate": 1.2713387241689129e-06,
"loss": 1.6777,
"step": 2062
},
{
"epoch": 0.8801194539249146,
"grad_norm": 26.58176040649414,
"learning_rate": 1.266846361185984e-06,
"loss": 2.4883,
"step": 2063
},
{
"epoch": 0.8805460750853242,
"grad_norm": 24.924253463745117,
"learning_rate": 1.262353998203055e-06,
"loss": 2.4746,
"step": 2064
},
{
"epoch": 0.8809726962457338,
"grad_norm": 22.25920295715332,
"learning_rate": 1.257861635220126e-06,
"loss": 1.6597,
"step": 2065
},
{
"epoch": 0.8813993174061433,
"grad_norm": 21.69879722595215,
"learning_rate": 1.2533692722371968e-06,
"loss": 2.0581,
"step": 2066
},
{
"epoch": 0.8818259385665529,
"grad_norm": 20.946352005004883,
"learning_rate": 1.2488769092542679e-06,
"loss": 1.8125,
"step": 2067
},
{
"epoch": 0.8822525597269625,
"grad_norm": 25.7633056640625,
"learning_rate": 1.244384546271339e-06,
"loss": 1.8984,
"step": 2068
},
{
"epoch": 0.882679180887372,
"grad_norm": 22.418216705322266,
"learning_rate": 1.2398921832884097e-06,
"loss": 2.0986,
"step": 2069
},
{
"epoch": 0.8831058020477816,
"grad_norm": 24.003862380981445,
"learning_rate": 1.2353998203054808e-06,
"loss": 2.0674,
"step": 2070
},
{
"epoch": 0.8835324232081911,
"grad_norm": 34.044063568115234,
"learning_rate": 1.2309074573225516e-06,
"loss": 2.0288,
"step": 2071
},
{
"epoch": 0.8839590443686007,
"grad_norm": 24.938892364501953,
"learning_rate": 1.2264150943396227e-06,
"loss": 2.3838,
"step": 2072
},
{
"epoch": 0.8843856655290102,
"grad_norm": 33.812461853027344,
"learning_rate": 1.2219227313566937e-06,
"loss": 2.0166,
"step": 2073
},
{
"epoch": 0.8848122866894198,
"grad_norm": 40.91825866699219,
"learning_rate": 1.2174303683737648e-06,
"loss": 2.5576,
"step": 2074
},
{
"epoch": 0.8852389078498294,
"grad_norm": 25.86836814880371,
"learning_rate": 1.2129380053908358e-06,
"loss": 2.1416,
"step": 2075
},
{
"epoch": 0.8856655290102389,
"grad_norm": 20.361167907714844,
"learning_rate": 1.2084456424079066e-06,
"loss": 2.2021,
"step": 2076
},
{
"epoch": 0.8860921501706485,
"grad_norm": 35.148014068603516,
"learning_rate": 1.2039532794249777e-06,
"loss": 2.2529,
"step": 2077
},
{
"epoch": 0.886518771331058,
"grad_norm": 19.91792869567871,
"learning_rate": 1.1994609164420485e-06,
"loss": 2.0381,
"step": 2078
},
{
"epoch": 0.8869453924914675,
"grad_norm": 19.79247283935547,
"learning_rate": 1.1949685534591195e-06,
"loss": 1.8823,
"step": 2079
},
{
"epoch": 0.8873720136518771,
"grad_norm": 21.4259033203125,
"learning_rate": 1.1904761904761906e-06,
"loss": 2.6367,
"step": 2080
},
{
"epoch": 0.8877986348122867,
"grad_norm": 22.293596267700195,
"learning_rate": 1.1859838274932616e-06,
"loss": 2.4014,
"step": 2081
},
{
"epoch": 0.8882252559726962,
"grad_norm": 30.97688102722168,
"learning_rate": 1.1814914645103327e-06,
"loss": 1.9829,
"step": 2082
},
{
"epoch": 0.8886518771331058,
"grad_norm": 27.427173614501953,
"learning_rate": 1.1769991015274035e-06,
"loss": 2.7266,
"step": 2083
},
{
"epoch": 0.8890784982935154,
"grad_norm": 23.10563087463379,
"learning_rate": 1.1725067385444746e-06,
"loss": 2.4023,
"step": 2084
},
{
"epoch": 0.889505119453925,
"grad_norm": 24.243976593017578,
"learning_rate": 1.1680143755615454e-06,
"loss": 2.5977,
"step": 2085
},
{
"epoch": 0.8899317406143344,
"grad_norm": 26.19727325439453,
"learning_rate": 1.1635220125786164e-06,
"loss": 2.165,
"step": 2086
},
{
"epoch": 0.890358361774744,
"grad_norm": 21.771196365356445,
"learning_rate": 1.1590296495956873e-06,
"loss": 2.2637,
"step": 2087
},
{
"epoch": 0.8907849829351536,
"grad_norm": 20.06059455871582,
"learning_rate": 1.1545372866127583e-06,
"loss": 1.9355,
"step": 2088
},
{
"epoch": 0.8912116040955631,
"grad_norm": 17.9766902923584,
"learning_rate": 1.1500449236298294e-06,
"loss": 1.5938,
"step": 2089
},
{
"epoch": 0.8916382252559727,
"grad_norm": 27.272201538085938,
"learning_rate": 1.1455525606469004e-06,
"loss": 1.9111,
"step": 2090
},
{
"epoch": 0.8920648464163823,
"grad_norm": 19.23063850402832,
"learning_rate": 1.1410601976639714e-06,
"loss": 1.9775,
"step": 2091
},
{
"epoch": 0.8924914675767918,
"grad_norm": 32.15581130981445,
"learning_rate": 1.1365678346810423e-06,
"loss": 2.1719,
"step": 2092
},
{
"epoch": 0.8929180887372014,
"grad_norm": 17.017885208129883,
"learning_rate": 1.1320754716981133e-06,
"loss": 1.5283,
"step": 2093
},
{
"epoch": 0.893344709897611,
"grad_norm": 28.969139099121094,
"learning_rate": 1.1275831087151842e-06,
"loss": 2.2002,
"step": 2094
},
{
"epoch": 0.8937713310580204,
"grad_norm": 30.85367774963379,
"learning_rate": 1.1230907457322552e-06,
"loss": 2.4268,
"step": 2095
},
{
"epoch": 0.89419795221843,
"grad_norm": 21.826454162597656,
"learning_rate": 1.1185983827493262e-06,
"loss": 2.123,
"step": 2096
},
{
"epoch": 0.8946245733788396,
"grad_norm": 25.08101463317871,
"learning_rate": 1.1141060197663973e-06,
"loss": 1.9346,
"step": 2097
},
{
"epoch": 0.8950511945392492,
"grad_norm": 42.20564270019531,
"learning_rate": 1.1096136567834683e-06,
"loss": 2.4873,
"step": 2098
},
{
"epoch": 0.8954778156996587,
"grad_norm": 23.242401123046875,
"learning_rate": 1.1051212938005392e-06,
"loss": 2.5371,
"step": 2099
},
{
"epoch": 0.8959044368600683,
"grad_norm": 24.964221954345703,
"learning_rate": 1.1006289308176102e-06,
"loss": 2.2559,
"step": 2100
},
{
"epoch": 0.8963310580204779,
"grad_norm": 26.806692123413086,
"learning_rate": 1.096136567834681e-06,
"loss": 2.0908,
"step": 2101
},
{
"epoch": 0.8967576791808873,
"grad_norm": 27.2700252532959,
"learning_rate": 1.091644204851752e-06,
"loss": 2.1367,
"step": 2102
},
{
"epoch": 0.8971843003412969,
"grad_norm": 16.515722274780273,
"learning_rate": 1.087151841868823e-06,
"loss": 1.8257,
"step": 2103
},
{
"epoch": 0.8976109215017065,
"grad_norm": 29.197978973388672,
"learning_rate": 1.082659478885894e-06,
"loss": 2.2861,
"step": 2104
},
{
"epoch": 0.898037542662116,
"grad_norm": 32.033565521240234,
"learning_rate": 1.078167115902965e-06,
"loss": 2.1191,
"step": 2105
},
{
"epoch": 0.8984641638225256,
"grad_norm": 24.29416847229004,
"learning_rate": 1.073674752920036e-06,
"loss": 1.9277,
"step": 2106
},
{
"epoch": 0.8988907849829352,
"grad_norm": 21.460018157958984,
"learning_rate": 1.069182389937107e-06,
"loss": 2.2949,
"step": 2107
},
{
"epoch": 0.8993174061433447,
"grad_norm": 23.199054718017578,
"learning_rate": 1.064690026954178e-06,
"loss": 2.0605,
"step": 2108
},
{
"epoch": 0.8997440273037542,
"grad_norm": 19.786056518554688,
"learning_rate": 1.060197663971249e-06,
"loss": 1.981,
"step": 2109
},
{
"epoch": 0.9001706484641638,
"grad_norm": 27.960031509399414,
"learning_rate": 1.0557053009883198e-06,
"loss": 2.126,
"step": 2110
},
{
"epoch": 0.9005972696245734,
"grad_norm": 24.085905075073242,
"learning_rate": 1.0512129380053909e-06,
"loss": 2.0376,
"step": 2111
},
{
"epoch": 0.9010238907849829,
"grad_norm": 20.048545837402344,
"learning_rate": 1.046720575022462e-06,
"loss": 1.6416,
"step": 2112
},
{
"epoch": 0.9014505119453925,
"grad_norm": 21.789766311645508,
"learning_rate": 1.042228212039533e-06,
"loss": 2.3467,
"step": 2113
},
{
"epoch": 0.9018771331058021,
"grad_norm": 19.590885162353516,
"learning_rate": 1.037735849056604e-06,
"loss": 1.7197,
"step": 2114
},
{
"epoch": 0.9023037542662116,
"grad_norm": 24.716259002685547,
"learning_rate": 1.0332434860736748e-06,
"loss": 1.9697,
"step": 2115
},
{
"epoch": 0.9027303754266212,
"grad_norm": 23.647457122802734,
"learning_rate": 1.0287511230907459e-06,
"loss": 2.0835,
"step": 2116
},
{
"epoch": 0.9031569965870307,
"grad_norm": 24.428579330444336,
"learning_rate": 1.0242587601078167e-06,
"loss": 1.959,
"step": 2117
},
{
"epoch": 0.9035836177474402,
"grad_norm": 20.997983932495117,
"learning_rate": 1.0197663971248877e-06,
"loss": 1.7334,
"step": 2118
},
{
"epoch": 0.9040102389078498,
"grad_norm": 21.304641723632812,
"learning_rate": 1.0152740341419588e-06,
"loss": 1.4746,
"step": 2119
},
{
"epoch": 0.9044368600682594,
"grad_norm": 32.69042205810547,
"learning_rate": 1.0107816711590298e-06,
"loss": 2.3721,
"step": 2120
},
{
"epoch": 0.9048634812286689,
"grad_norm": 26.37349510192871,
"learning_rate": 1.0062893081761007e-06,
"loss": 2.2334,
"step": 2121
},
{
"epoch": 0.9052901023890785,
"grad_norm": 37.03302001953125,
"learning_rate": 1.0017969451931717e-06,
"loss": 2.292,
"step": 2122
},
{
"epoch": 0.9057167235494881,
"grad_norm": 22.1188907623291,
"learning_rate": 9.973045822102428e-07,
"loss": 2.0234,
"step": 2123
},
{
"epoch": 0.9061433447098977,
"grad_norm": 27.15044593811035,
"learning_rate": 9.928122192273136e-07,
"loss": 2.0137,
"step": 2124
},
{
"epoch": 0.9065699658703071,
"grad_norm": 24.961999893188477,
"learning_rate": 9.883198562443846e-07,
"loss": 2.5054,
"step": 2125
},
{
"epoch": 0.9069965870307167,
"grad_norm": 30.560245513916016,
"learning_rate": 9.838274932614555e-07,
"loss": 1.8213,
"step": 2126
},
{
"epoch": 0.9074232081911263,
"grad_norm": 22.483095169067383,
"learning_rate": 9.793351302785265e-07,
"loss": 1.5615,
"step": 2127
},
{
"epoch": 0.9078498293515358,
"grad_norm": 28.84935760498047,
"learning_rate": 9.748427672955975e-07,
"loss": 2.5317,
"step": 2128
},
{
"epoch": 0.9082764505119454,
"grad_norm": 22.43422508239746,
"learning_rate": 9.703504043126686e-07,
"loss": 1.897,
"step": 2129
},
{
"epoch": 0.908703071672355,
"grad_norm": 23.057863235473633,
"learning_rate": 9.658580413297396e-07,
"loss": 2.1621,
"step": 2130
},
{
"epoch": 0.9091296928327645,
"grad_norm": 23.351160049438477,
"learning_rate": 9.613656783468105e-07,
"loss": 1.6934,
"step": 2131
},
{
"epoch": 0.909556313993174,
"grad_norm": 29.212770462036133,
"learning_rate": 9.568733153638815e-07,
"loss": 2.124,
"step": 2132
},
{
"epoch": 0.9099829351535836,
"grad_norm": 26.707088470458984,
"learning_rate": 9.523809523809525e-07,
"loss": 2.3135,
"step": 2133
},
{
"epoch": 0.9104095563139932,
"grad_norm": 23.58577537536621,
"learning_rate": 9.478885893980235e-07,
"loss": 2.2168,
"step": 2134
},
{
"epoch": 0.9108361774744027,
"grad_norm": 21.69426155090332,
"learning_rate": 9.433962264150944e-07,
"loss": 2.4932,
"step": 2135
},
{
"epoch": 0.9112627986348123,
"grad_norm": 34.198604583740234,
"learning_rate": 9.389038634321654e-07,
"loss": 2.1826,
"step": 2136
},
{
"epoch": 0.9116894197952219,
"grad_norm": 20.484844207763672,
"learning_rate": 9.344115004492364e-07,
"loss": 1.6392,
"step": 2137
},
{
"epoch": 0.9121160409556314,
"grad_norm": 48.64004135131836,
"learning_rate": 9.299191374663074e-07,
"loss": 1.8081,
"step": 2138
},
{
"epoch": 0.912542662116041,
"grad_norm": 36.88442611694336,
"learning_rate": 9.254267744833784e-07,
"loss": 2.5791,
"step": 2139
},
{
"epoch": 0.9129692832764505,
"grad_norm": 31.48349380493164,
"learning_rate": 9.209344115004492e-07,
"loss": 1.9502,
"step": 2140
},
{
"epoch": 0.91339590443686,
"grad_norm": 21.401731491088867,
"learning_rate": 9.164420485175203e-07,
"loss": 2.2046,
"step": 2141
},
{
"epoch": 0.9138225255972696,
"grad_norm": 22.95062255859375,
"learning_rate": 9.119496855345912e-07,
"loss": 1.7771,
"step": 2142
},
{
"epoch": 0.9142491467576792,
"grad_norm": 17.692584991455078,
"learning_rate": 9.074573225516623e-07,
"loss": 1.7725,
"step": 2143
},
{
"epoch": 0.9146757679180887,
"grad_norm": 25.14078140258789,
"learning_rate": 9.029649595687333e-07,
"loss": 2.1123,
"step": 2144
},
{
"epoch": 0.9151023890784983,
"grad_norm": 23.51230239868164,
"learning_rate": 8.984725965858041e-07,
"loss": 2.0317,
"step": 2145
},
{
"epoch": 0.9155290102389079,
"grad_norm": 40.93922805786133,
"learning_rate": 8.939802336028752e-07,
"loss": 2.1279,
"step": 2146
},
{
"epoch": 0.9159556313993175,
"grad_norm": 31.59501075744629,
"learning_rate": 8.894878706199461e-07,
"loss": 2.0869,
"step": 2147
},
{
"epoch": 0.9163822525597269,
"grad_norm": 43.97781753540039,
"learning_rate": 8.849955076370172e-07,
"loss": 2.0508,
"step": 2148
},
{
"epoch": 0.9168088737201365,
"grad_norm": 29.16642951965332,
"learning_rate": 8.805031446540881e-07,
"loss": 1.8281,
"step": 2149
},
{
"epoch": 0.9172354948805461,
"grad_norm": 19.41118812561035,
"learning_rate": 8.760107816711592e-07,
"loss": 1.7861,
"step": 2150
},
{
"epoch": 0.9176621160409556,
"grad_norm": 24.558164596557617,
"learning_rate": 8.715184186882302e-07,
"loss": 2.6377,
"step": 2151
},
{
"epoch": 0.9180887372013652,
"grad_norm": 16.41124153137207,
"learning_rate": 8.67026055705301e-07,
"loss": 1.6436,
"step": 2152
},
{
"epoch": 0.9185153583617748,
"grad_norm": 19.3815860748291,
"learning_rate": 8.625336927223721e-07,
"loss": 1.6597,
"step": 2153
},
{
"epoch": 0.9189419795221843,
"grad_norm": 35.49598693847656,
"learning_rate": 8.58041329739443e-07,
"loss": 2.2285,
"step": 2154
},
{
"epoch": 0.9193686006825939,
"grad_norm": 28.167762756347656,
"learning_rate": 8.535489667565141e-07,
"loss": 2.2881,
"step": 2155
},
{
"epoch": 0.9197952218430034,
"grad_norm": 27.260639190673828,
"learning_rate": 8.490566037735849e-07,
"loss": 2.5161,
"step": 2156
},
{
"epoch": 0.9202218430034129,
"grad_norm": 21.667072296142578,
"learning_rate": 8.445642407906559e-07,
"loss": 2.2041,
"step": 2157
},
{
"epoch": 0.9206484641638225,
"grad_norm": 19.246713638305664,
"learning_rate": 8.400718778077269e-07,
"loss": 1.7842,
"step": 2158
},
{
"epoch": 0.9210750853242321,
"grad_norm": 20.60174560546875,
"learning_rate": 8.355795148247979e-07,
"loss": 1.9756,
"step": 2159
},
{
"epoch": 0.9215017064846417,
"grad_norm": 20.229415893554688,
"learning_rate": 8.31087151841869e-07,
"loss": 1.6904,
"step": 2160
},
{
"epoch": 0.9219283276450512,
"grad_norm": 19.543737411499023,
"learning_rate": 8.265947888589399e-07,
"loss": 2.252,
"step": 2161
},
{
"epoch": 0.9223549488054608,
"grad_norm": 20.05359649658203,
"learning_rate": 8.221024258760108e-07,
"loss": 1.5576,
"step": 2162
},
{
"epoch": 0.9227815699658704,
"grad_norm": 25.795568466186523,
"learning_rate": 8.176100628930818e-07,
"loss": 1.7412,
"step": 2163
},
{
"epoch": 0.9232081911262798,
"grad_norm": 25.85951805114746,
"learning_rate": 8.131176999101528e-07,
"loss": 2.7568,
"step": 2164
},
{
"epoch": 0.9236348122866894,
"grad_norm": 25.40492057800293,
"learning_rate": 8.086253369272238e-07,
"loss": 2.125,
"step": 2165
},
{
"epoch": 0.924061433447099,
"grad_norm": 21.45697784423828,
"learning_rate": 8.041329739442948e-07,
"loss": 2.0469,
"step": 2166
},
{
"epoch": 0.9244880546075085,
"grad_norm": 27.524532318115234,
"learning_rate": 7.996406109613658e-07,
"loss": 2.1284,
"step": 2167
},
{
"epoch": 0.9249146757679181,
"grad_norm": 25.877559661865234,
"learning_rate": 7.951482479784367e-07,
"loss": 2.2461,
"step": 2168
},
{
"epoch": 0.9253412969283277,
"grad_norm": 27.39493179321289,
"learning_rate": 7.906558849955077e-07,
"loss": 2.1006,
"step": 2169
},
{
"epoch": 0.9257679180887372,
"grad_norm": 31.304426193237305,
"learning_rate": 7.861635220125787e-07,
"loss": 2.4561,
"step": 2170
},
{
"epoch": 0.9261945392491467,
"grad_norm": 33.025875091552734,
"learning_rate": 7.816711590296497e-07,
"loss": 1.6978,
"step": 2171
},
{
"epoch": 0.9266211604095563,
"grad_norm": 29.005495071411133,
"learning_rate": 7.771787960467205e-07,
"loss": 2.3926,
"step": 2172
},
{
"epoch": 0.9270477815699659,
"grad_norm": 46.53617477416992,
"learning_rate": 7.726864330637916e-07,
"loss": 2.0522,
"step": 2173
},
{
"epoch": 0.9274744027303754,
"grad_norm": 24.20000457763672,
"learning_rate": 7.681940700808626e-07,
"loss": 2.2446,
"step": 2174
},
{
"epoch": 0.927901023890785,
"grad_norm": 25.25002670288086,
"learning_rate": 7.637017070979336e-07,
"loss": 2.0361,
"step": 2175
},
{
"epoch": 0.9283276450511946,
"grad_norm": 21.676692962646484,
"learning_rate": 7.592093441150046e-07,
"loss": 1.8105,
"step": 2176
},
{
"epoch": 0.9287542662116041,
"grad_norm": 29.155437469482422,
"learning_rate": 7.547169811320755e-07,
"loss": 2.2046,
"step": 2177
},
{
"epoch": 0.9291808873720137,
"grad_norm": 29.175691604614258,
"learning_rate": 7.502246181491466e-07,
"loss": 2.5146,
"step": 2178
},
{
"epoch": 0.9296075085324232,
"grad_norm": 25.129615783691406,
"learning_rate": 7.457322551662174e-07,
"loss": 1.9575,
"step": 2179
},
{
"epoch": 0.9300341296928327,
"grad_norm": 24.428255081176758,
"learning_rate": 7.412398921832885e-07,
"loss": 1.7617,
"step": 2180
},
{
"epoch": 0.9304607508532423,
"grad_norm": 25.9346866607666,
"learning_rate": 7.367475292003594e-07,
"loss": 1.9551,
"step": 2181
},
{
"epoch": 0.9308873720136519,
"grad_norm": 23.452688217163086,
"learning_rate": 7.322551662174305e-07,
"loss": 2.2588,
"step": 2182
},
{
"epoch": 0.9313139931740614,
"grad_norm": 33.63352584838867,
"learning_rate": 7.277628032345015e-07,
"loss": 2.2471,
"step": 2183
},
{
"epoch": 0.931740614334471,
"grad_norm": 19.231609344482422,
"learning_rate": 7.232704402515723e-07,
"loss": 1.998,
"step": 2184
},
{
"epoch": 0.9321672354948806,
"grad_norm": 18.095294952392578,
"learning_rate": 7.187780772686434e-07,
"loss": 1.6978,
"step": 2185
},
{
"epoch": 0.9325938566552902,
"grad_norm": 20.472152709960938,
"learning_rate": 7.142857142857143e-07,
"loss": 1.9365,
"step": 2186
},
{
"epoch": 0.9330204778156996,
"grad_norm": 19.606014251708984,
"learning_rate": 7.097933513027854e-07,
"loss": 2.1846,
"step": 2187
},
{
"epoch": 0.9334470989761092,
"grad_norm": 19.07863998413086,
"learning_rate": 7.053009883198563e-07,
"loss": 2.166,
"step": 2188
},
{
"epoch": 0.9338737201365188,
"grad_norm": 21.699003219604492,
"learning_rate": 7.008086253369272e-07,
"loss": 2.1631,
"step": 2189
},
{
"epoch": 0.9343003412969283,
"grad_norm": 26.416015625,
"learning_rate": 6.963162623539983e-07,
"loss": 1.8184,
"step": 2190
},
{
"epoch": 0.9347269624573379,
"grad_norm": 28.63286590576172,
"learning_rate": 6.918238993710692e-07,
"loss": 1.9531,
"step": 2191
},
{
"epoch": 0.9351535836177475,
"grad_norm": 29.21709632873535,
"learning_rate": 6.873315363881403e-07,
"loss": 2.0801,
"step": 2192
},
{
"epoch": 0.935580204778157,
"grad_norm": 45.255577087402344,
"learning_rate": 6.828391734052112e-07,
"loss": 2.0156,
"step": 2193
},
{
"epoch": 0.9360068259385665,
"grad_norm": 17.908376693725586,
"learning_rate": 6.783468104222822e-07,
"loss": 1.876,
"step": 2194
},
{
"epoch": 0.9364334470989761,
"grad_norm": 18.84600257873535,
"learning_rate": 6.738544474393531e-07,
"loss": 1.7788,
"step": 2195
},
{
"epoch": 0.9368600682593856,
"grad_norm": 21.277381896972656,
"learning_rate": 6.693620844564241e-07,
"loss": 2.0059,
"step": 2196
},
{
"epoch": 0.9372866894197952,
"grad_norm": 26.39069938659668,
"learning_rate": 6.648697214734951e-07,
"loss": 2.1885,
"step": 2197
},
{
"epoch": 0.9377133105802048,
"grad_norm": 20.444717407226562,
"learning_rate": 6.603773584905661e-07,
"loss": 1.6426,
"step": 2198
},
{
"epoch": 0.9381399317406144,
"grad_norm": 26.616222381591797,
"learning_rate": 6.558849955076372e-07,
"loss": 2.0957,
"step": 2199
},
{
"epoch": 0.9385665529010239,
"grad_norm": 22.825220108032227,
"learning_rate": 6.51392632524708e-07,
"loss": 1.8066,
"step": 2200
},
{
"epoch": 0.9389931740614335,
"grad_norm": 35.218353271484375,
"learning_rate": 6.46900269541779e-07,
"loss": 1.8926,
"step": 2201
},
{
"epoch": 0.939419795221843,
"grad_norm": 18.171491622924805,
"learning_rate": 6.4240790655885e-07,
"loss": 1.9795,
"step": 2202
},
{
"epoch": 0.9398464163822525,
"grad_norm": 22.466522216796875,
"learning_rate": 6.37915543575921e-07,
"loss": 2.0566,
"step": 2203
},
{
"epoch": 0.9402730375426621,
"grad_norm": 29.288331985473633,
"learning_rate": 6.33423180592992e-07,
"loss": 2.625,
"step": 2204
},
{
"epoch": 0.9406996587030717,
"grad_norm": 17.484268188476562,
"learning_rate": 6.28930817610063e-07,
"loss": 1.7607,
"step": 2205
},
{
"epoch": 0.9411262798634812,
"grad_norm": 29.735485076904297,
"learning_rate": 6.244384546271339e-07,
"loss": 2.2158,
"step": 2206
},
{
"epoch": 0.9415529010238908,
"grad_norm": 18.694398880004883,
"learning_rate": 6.199460916442049e-07,
"loss": 1.5278,
"step": 2207
},
{
"epoch": 0.9419795221843004,
"grad_norm": 22.542926788330078,
"learning_rate": 6.154537286612758e-07,
"loss": 1.8477,
"step": 2208
},
{
"epoch": 0.9424061433447098,
"grad_norm": 41.35770034790039,
"learning_rate": 6.109613656783469e-07,
"loss": 2.6221,
"step": 2209
},
{
"epoch": 0.9428327645051194,
"grad_norm": 22.501956939697266,
"learning_rate": 6.064690026954179e-07,
"loss": 1.8408,
"step": 2210
},
{
"epoch": 0.943259385665529,
"grad_norm": 21.826805114746094,
"learning_rate": 6.019766397124888e-07,
"loss": 2.208,
"step": 2211
},
{
"epoch": 0.9436860068259386,
"grad_norm": 19.94651985168457,
"learning_rate": 5.974842767295598e-07,
"loss": 1.9448,
"step": 2212
},
{
"epoch": 0.9441126279863481,
"grad_norm": 24.20819091796875,
"learning_rate": 5.929919137466308e-07,
"loss": 1.8105,
"step": 2213
},
{
"epoch": 0.9445392491467577,
"grad_norm": 22.356653213500977,
"learning_rate": 5.884995507637018e-07,
"loss": 1.5435,
"step": 2214
},
{
"epoch": 0.9449658703071673,
"grad_norm": 30.155969619750977,
"learning_rate": 5.840071877807727e-07,
"loss": 2.3643,
"step": 2215
},
{
"epoch": 0.9453924914675768,
"grad_norm": 21.76837921142578,
"learning_rate": 5.795148247978436e-07,
"loss": 1.7051,
"step": 2216
},
{
"epoch": 0.9458191126279863,
"grad_norm": 25.253297805786133,
"learning_rate": 5.750224618149147e-07,
"loss": 2.3926,
"step": 2217
},
{
"epoch": 0.9462457337883959,
"grad_norm": 17.783184051513672,
"learning_rate": 5.705300988319857e-07,
"loss": 1.874,
"step": 2218
},
{
"epoch": 0.9466723549488054,
"grad_norm": 26.631513595581055,
"learning_rate": 5.660377358490567e-07,
"loss": 1.7959,
"step": 2219
},
{
"epoch": 0.947098976109215,
"grad_norm": 23.53889274597168,
"learning_rate": 5.615453728661276e-07,
"loss": 2.2017,
"step": 2220
},
{
"epoch": 0.9475255972696246,
"grad_norm": 29.182485580444336,
"learning_rate": 5.570530098831986e-07,
"loss": 1.8428,
"step": 2221
},
{
"epoch": 0.9479522184300341,
"grad_norm": 25.79732894897461,
"learning_rate": 5.525606469002696e-07,
"loss": 1.8906,
"step": 2222
},
{
"epoch": 0.9483788395904437,
"grad_norm": 23.194297790527344,
"learning_rate": 5.480682839173405e-07,
"loss": 2.1357,
"step": 2223
},
{
"epoch": 0.9488054607508533,
"grad_norm": 24.474523544311523,
"learning_rate": 5.435759209344115e-07,
"loss": 2.0537,
"step": 2224
},
{
"epoch": 0.9492320819112628,
"grad_norm": 24.2773380279541,
"learning_rate": 5.390835579514825e-07,
"loss": 1.9922,
"step": 2225
},
{
"epoch": 0.9496587030716723,
"grad_norm": 24.227388381958008,
"learning_rate": 5.345911949685535e-07,
"loss": 2.1426,
"step": 2226
},
{
"epoch": 0.9500853242320819,
"grad_norm": 24.159053802490234,
"learning_rate": 5.300988319856245e-07,
"loss": 2.1465,
"step": 2227
},
{
"epoch": 0.9505119453924915,
"grad_norm": 30.254880905151367,
"learning_rate": 5.256064690026954e-07,
"loss": 2.248,
"step": 2228
},
{
"epoch": 0.950938566552901,
"grad_norm": 29.864953994750977,
"learning_rate": 5.211141060197665e-07,
"loss": 1.7378,
"step": 2229
},
{
"epoch": 0.9513651877133106,
"grad_norm": 24.123804092407227,
"learning_rate": 5.166217430368374e-07,
"loss": 2.5049,
"step": 2230
},
{
"epoch": 0.9517918088737202,
"grad_norm": 42.50309753417969,
"learning_rate": 5.121293800539083e-07,
"loss": 2.2256,
"step": 2231
},
{
"epoch": 0.9522184300341296,
"grad_norm": 23.206518173217773,
"learning_rate": 5.076370170709794e-07,
"loss": 2.498,
"step": 2232
},
{
"epoch": 0.9526450511945392,
"grad_norm": 23.798446655273438,
"learning_rate": 5.031446540880503e-07,
"loss": 1.8755,
"step": 2233
},
{
"epoch": 0.9530716723549488,
"grad_norm": 20.48027229309082,
"learning_rate": 4.986522911051214e-07,
"loss": 1.7832,
"step": 2234
},
{
"epoch": 0.9534982935153583,
"grad_norm": 24.295406341552734,
"learning_rate": 4.941599281221923e-07,
"loss": 1.8711,
"step": 2235
},
{
"epoch": 0.9539249146757679,
"grad_norm": 25.365156173706055,
"learning_rate": 4.896675651392633e-07,
"loss": 2.3271,
"step": 2236
},
{
"epoch": 0.9543515358361775,
"grad_norm": 29.6678466796875,
"learning_rate": 4.851752021563343e-07,
"loss": 1.7764,
"step": 2237
},
{
"epoch": 0.9547781569965871,
"grad_norm": 23.474645614624023,
"learning_rate": 4.806828391734052e-07,
"loss": 2.3232,
"step": 2238
},
{
"epoch": 0.9552047781569966,
"grad_norm": 24.556297302246094,
"learning_rate": 4.7619047619047623e-07,
"loss": 2.2383,
"step": 2239
},
{
"epoch": 0.9556313993174061,
"grad_norm": 35.68256378173828,
"learning_rate": 4.716981132075472e-07,
"loss": 1.7432,
"step": 2240
},
{
"epoch": 0.9560580204778157,
"grad_norm": 23.850263595581055,
"learning_rate": 4.672057502246182e-07,
"loss": 1.9561,
"step": 2241
},
{
"epoch": 0.9564846416382252,
"grad_norm": 24.310489654541016,
"learning_rate": 4.627133872416892e-07,
"loss": 2.6641,
"step": 2242
},
{
"epoch": 0.9569112627986348,
"grad_norm": 26.781076431274414,
"learning_rate": 4.5822102425876014e-07,
"loss": 2.415,
"step": 2243
},
{
"epoch": 0.9573378839590444,
"grad_norm": 24.84231948852539,
"learning_rate": 4.5372866127583113e-07,
"loss": 2.3594,
"step": 2244
},
{
"epoch": 0.9577645051194539,
"grad_norm": 22.890363693237305,
"learning_rate": 4.4923629829290207e-07,
"loss": 2.3408,
"step": 2245
},
{
"epoch": 0.9581911262798635,
"grad_norm": 21.693756103515625,
"learning_rate": 4.4474393530997306e-07,
"loss": 2.0381,
"step": 2246
},
{
"epoch": 0.9586177474402731,
"grad_norm": 26.951416015625,
"learning_rate": 4.4025157232704405e-07,
"loss": 2.0249,
"step": 2247
},
{
"epoch": 0.9590443686006825,
"grad_norm": 30.898317337036133,
"learning_rate": 4.357592093441151e-07,
"loss": 1.9487,
"step": 2248
},
{
"epoch": 0.9594709897610921,
"grad_norm": 18.807350158691406,
"learning_rate": 4.3126684636118604e-07,
"loss": 1.4497,
"step": 2249
},
{
"epoch": 0.9598976109215017,
"grad_norm": 19.728879928588867,
"learning_rate": 4.2677448337825703e-07,
"loss": 2.0342,
"step": 2250
},
{
"epoch": 0.9603242320819113,
"grad_norm": 19.783090591430664,
"learning_rate": 4.2228212039532797e-07,
"loss": 1.6553,
"step": 2251
},
{
"epoch": 0.9607508532423208,
"grad_norm": 21.11236572265625,
"learning_rate": 4.1778975741239896e-07,
"loss": 1.6157,
"step": 2252
},
{
"epoch": 0.9611774744027304,
"grad_norm": 24.241262435913086,
"learning_rate": 4.1329739442946995e-07,
"loss": 2.0508,
"step": 2253
},
{
"epoch": 0.96160409556314,
"grad_norm": 22.40394401550293,
"learning_rate": 4.088050314465409e-07,
"loss": 2.0752,
"step": 2254
},
{
"epoch": 0.9620307167235495,
"grad_norm": 21.405790328979492,
"learning_rate": 4.043126684636119e-07,
"loss": 2.0957,
"step": 2255
},
{
"epoch": 0.962457337883959,
"grad_norm": 19.25254249572754,
"learning_rate": 3.998203054806829e-07,
"loss": 1.957,
"step": 2256
},
{
"epoch": 0.9628839590443686,
"grad_norm": 27.212743759155273,
"learning_rate": 3.9532794249775386e-07,
"loss": 2.1802,
"step": 2257
},
{
"epoch": 0.9633105802047781,
"grad_norm": 31.379535675048828,
"learning_rate": 3.9083557951482485e-07,
"loss": 1.7871,
"step": 2258
},
{
"epoch": 0.9637372013651877,
"grad_norm": 20.31130027770996,
"learning_rate": 3.863432165318958e-07,
"loss": 2.0615,
"step": 2259
},
{
"epoch": 0.9641638225255973,
"grad_norm": 26.714658737182617,
"learning_rate": 3.818508535489668e-07,
"loss": 2.6816,
"step": 2260
},
{
"epoch": 0.9645904436860068,
"grad_norm": 30.048847198486328,
"learning_rate": 3.773584905660378e-07,
"loss": 2.9863,
"step": 2261
},
{
"epoch": 0.9650170648464164,
"grad_norm": 19.18850326538086,
"learning_rate": 3.728661275831087e-07,
"loss": 1.9072,
"step": 2262
},
{
"epoch": 0.965443686006826,
"grad_norm": 39.271331787109375,
"learning_rate": 3.683737646001797e-07,
"loss": 2.2754,
"step": 2263
},
{
"epoch": 0.9658703071672355,
"grad_norm": 20.29155158996582,
"learning_rate": 3.6388140161725075e-07,
"loss": 1.7832,
"step": 2264
},
{
"epoch": 0.966296928327645,
"grad_norm": 21.201148986816406,
"learning_rate": 3.593890386343217e-07,
"loss": 1.8188,
"step": 2265
},
{
"epoch": 0.9667235494880546,
"grad_norm": 21.81801414489746,
"learning_rate": 3.548966756513927e-07,
"loss": 1.6567,
"step": 2266
},
{
"epoch": 0.9671501706484642,
"grad_norm": 22.927953720092773,
"learning_rate": 3.504043126684636e-07,
"loss": 2.3242,
"step": 2267
},
{
"epoch": 0.9675767918088737,
"grad_norm": 24.306936264038086,
"learning_rate": 3.459119496855346e-07,
"loss": 1.502,
"step": 2268
},
{
"epoch": 0.9680034129692833,
"grad_norm": 22.41961669921875,
"learning_rate": 3.414195867026056e-07,
"loss": 2.335,
"step": 2269
},
{
"epoch": 0.9684300341296929,
"grad_norm": 23.747060775756836,
"learning_rate": 3.3692722371967654e-07,
"loss": 1.9102,
"step": 2270
},
{
"epoch": 0.9688566552901023,
"grad_norm": 22.018306732177734,
"learning_rate": 3.3243486073674753e-07,
"loss": 1.8613,
"step": 2271
},
{
"epoch": 0.9692832764505119,
"grad_norm": 22.868671417236328,
"learning_rate": 3.279424977538186e-07,
"loss": 2.1055,
"step": 2272
},
{
"epoch": 0.9697098976109215,
"grad_norm": 29.00540542602539,
"learning_rate": 3.234501347708895e-07,
"loss": 2.8096,
"step": 2273
},
{
"epoch": 0.9701365187713311,
"grad_norm": 21.77559471130371,
"learning_rate": 3.189577717879605e-07,
"loss": 1.9175,
"step": 2274
},
{
"epoch": 0.9705631399317406,
"grad_norm": 18.843427658081055,
"learning_rate": 3.144654088050315e-07,
"loss": 1.7012,
"step": 2275
},
{
"epoch": 0.9709897610921502,
"grad_norm": 24.1773681640625,
"learning_rate": 3.0997304582210244e-07,
"loss": 1.5874,
"step": 2276
},
{
"epoch": 0.9714163822525598,
"grad_norm": 19.41014862060547,
"learning_rate": 3.0548068283917343e-07,
"loss": 1.395,
"step": 2277
},
{
"epoch": 0.9718430034129693,
"grad_norm": 25.66033363342285,
"learning_rate": 3.009883198562444e-07,
"loss": 3.2275,
"step": 2278
},
{
"epoch": 0.9722696245733788,
"grad_norm": 30.046892166137695,
"learning_rate": 2.964959568733154e-07,
"loss": 1.9316,
"step": 2279
},
{
"epoch": 0.9726962457337884,
"grad_norm": 20.475860595703125,
"learning_rate": 2.9200359389038635e-07,
"loss": 1.7725,
"step": 2280
},
{
"epoch": 0.9731228668941979,
"grad_norm": 24.3115177154541,
"learning_rate": 2.8751123090745734e-07,
"loss": 1.875,
"step": 2281
},
{
"epoch": 0.9735494880546075,
"grad_norm": 18.358522415161133,
"learning_rate": 2.8301886792452833e-07,
"loss": 1.6748,
"step": 2282
},
{
"epoch": 0.9739761092150171,
"grad_norm": 20.390560150146484,
"learning_rate": 2.785265049415993e-07,
"loss": 1.7437,
"step": 2283
},
{
"epoch": 0.9744027303754266,
"grad_norm": 36.942039489746094,
"learning_rate": 2.7403414195867026e-07,
"loss": 2.3301,
"step": 2284
},
{
"epoch": 0.9748293515358362,
"grad_norm": 21.772939682006836,
"learning_rate": 2.6954177897574125e-07,
"loss": 2.1104,
"step": 2285
},
{
"epoch": 0.9752559726962458,
"grad_norm": 22.524234771728516,
"learning_rate": 2.6504941599281224e-07,
"loss": 1.8799,
"step": 2286
},
{
"epoch": 0.9756825938566553,
"grad_norm": 23.909032821655273,
"learning_rate": 2.6055705300988324e-07,
"loss": 1.793,
"step": 2287
},
{
"epoch": 0.9761092150170648,
"grad_norm": 34.19670486450195,
"learning_rate": 2.560646900269542e-07,
"loss": 2.3564,
"step": 2288
},
{
"epoch": 0.9765358361774744,
"grad_norm": 23.46267318725586,
"learning_rate": 2.5157232704402517e-07,
"loss": 1.8989,
"step": 2289
},
{
"epoch": 0.976962457337884,
"grad_norm": 25.99132537841797,
"learning_rate": 2.4707996406109616e-07,
"loss": 1.6729,
"step": 2290
},
{
"epoch": 0.9773890784982935,
"grad_norm": 19.899215698242188,
"learning_rate": 2.4258760107816715e-07,
"loss": 2.0142,
"step": 2291
},
{
"epoch": 0.9778156996587031,
"grad_norm": 32.16071701049805,
"learning_rate": 2.3809523809523811e-07,
"loss": 2.2734,
"step": 2292
},
{
"epoch": 0.9782423208191127,
"grad_norm": 21.155961990356445,
"learning_rate": 2.336028751123091e-07,
"loss": 1.6182,
"step": 2293
},
{
"epoch": 0.9786689419795221,
"grad_norm": 19.982471466064453,
"learning_rate": 2.2911051212938007e-07,
"loss": 2.0479,
"step": 2294
},
{
"epoch": 0.9790955631399317,
"grad_norm": 26.075380325317383,
"learning_rate": 2.2461814914645103e-07,
"loss": 2.0459,
"step": 2295
},
{
"epoch": 0.9795221843003413,
"grad_norm": 21.82196617126465,
"learning_rate": 2.2012578616352203e-07,
"loss": 1.7998,
"step": 2296
},
{
"epoch": 0.9799488054607508,
"grad_norm": 17.56633186340332,
"learning_rate": 2.1563342318059302e-07,
"loss": 1.6953,
"step": 2297
},
{
"epoch": 0.9803754266211604,
"grad_norm": 20.110218048095703,
"learning_rate": 2.1114106019766398e-07,
"loss": 2.0469,
"step": 2298
},
{
"epoch": 0.98080204778157,
"grad_norm": 20.577871322631836,
"learning_rate": 2.0664869721473497e-07,
"loss": 1.853,
"step": 2299
},
{
"epoch": 0.9812286689419796,
"grad_norm": 26.926685333251953,
"learning_rate": 2.0215633423180594e-07,
"loss": 2.0679,
"step": 2300
},
{
"epoch": 0.981655290102389,
"grad_norm": 26.033254623413086,
"learning_rate": 1.9766397124887693e-07,
"loss": 2.124,
"step": 2301
},
{
"epoch": 0.9820819112627986,
"grad_norm": 36.75836944580078,
"learning_rate": 1.931716082659479e-07,
"loss": 1.7207,
"step": 2302
},
{
"epoch": 0.9825085324232082,
"grad_norm": 17.078088760375977,
"learning_rate": 1.886792452830189e-07,
"loss": 1.6279,
"step": 2303
},
{
"epoch": 0.9829351535836177,
"grad_norm": 23.182506561279297,
"learning_rate": 1.8418688230008985e-07,
"loss": 2.2988,
"step": 2304
},
{
"epoch": 0.9833617747440273,
"grad_norm": 25.592905044555664,
"learning_rate": 1.7969451931716084e-07,
"loss": 2.0176,
"step": 2305
},
{
"epoch": 0.9837883959044369,
"grad_norm": 21.9675350189209,
"learning_rate": 1.752021563342318e-07,
"loss": 1.9124,
"step": 2306
},
{
"epoch": 0.9842150170648464,
"grad_norm": 26.200014114379883,
"learning_rate": 1.707097933513028e-07,
"loss": 1.8384,
"step": 2307
},
{
"epoch": 0.984641638225256,
"grad_norm": 37.688297271728516,
"learning_rate": 1.6621743036837377e-07,
"loss": 1.9126,
"step": 2308
},
{
"epoch": 0.9850682593856656,
"grad_norm": 33.95469284057617,
"learning_rate": 1.6172506738544476e-07,
"loss": 2.4209,
"step": 2309
},
{
"epoch": 0.985494880546075,
"grad_norm": 32.37906265258789,
"learning_rate": 1.5723270440251575e-07,
"loss": 2.2676,
"step": 2310
},
{
"epoch": 0.9859215017064846,
"grad_norm": 22.61172866821289,
"learning_rate": 1.5274034141958671e-07,
"loss": 2.1543,
"step": 2311
},
{
"epoch": 0.9863481228668942,
"grad_norm": 23.850786209106445,
"learning_rate": 1.482479784366577e-07,
"loss": 2.0547,
"step": 2312
},
{
"epoch": 0.9867747440273038,
"grad_norm": 26.80950164794922,
"learning_rate": 1.4375561545372867e-07,
"loss": 2.3789,
"step": 2313
},
{
"epoch": 0.9872013651877133,
"grad_norm": 27.270933151245117,
"learning_rate": 1.3926325247079966e-07,
"loss": 2.1729,
"step": 2314
},
{
"epoch": 0.9876279863481229,
"grad_norm": 20.2312068939209,
"learning_rate": 1.3477088948787063e-07,
"loss": 1.8989,
"step": 2315
},
{
"epoch": 0.9880546075085325,
"grad_norm": 28.13861846923828,
"learning_rate": 1.3027852650494162e-07,
"loss": 2.0293,
"step": 2316
},
{
"epoch": 0.988481228668942,
"grad_norm": 25.935266494750977,
"learning_rate": 1.2578616352201258e-07,
"loss": 1.7109,
"step": 2317
},
{
"epoch": 0.9889078498293515,
"grad_norm": 36.28443145751953,
"learning_rate": 1.2129380053908357e-07,
"loss": 2.1348,
"step": 2318
},
{
"epoch": 0.9893344709897611,
"grad_norm": 22.1302547454834,
"learning_rate": 1.1680143755615455e-07,
"loss": 1.7305,
"step": 2319
},
{
"epoch": 0.9897610921501706,
"grad_norm": 34.13624954223633,
"learning_rate": 1.1230907457322552e-07,
"loss": 2.2808,
"step": 2320
},
{
"epoch": 0.9901877133105802,
"grad_norm": 22.001792907714844,
"learning_rate": 1.0781671159029651e-07,
"loss": 2.0962,
"step": 2321
},
{
"epoch": 0.9906143344709898,
"grad_norm": 23.512155532836914,
"learning_rate": 1.0332434860736749e-07,
"loss": 1.915,
"step": 2322
},
{
"epoch": 0.9910409556313993,
"grad_norm": 20.13483238220215,
"learning_rate": 9.883198562443847e-08,
"loss": 1.8555,
"step": 2323
},
{
"epoch": 0.9914675767918089,
"grad_norm": 23.738903045654297,
"learning_rate": 9.433962264150944e-08,
"loss": 2.0503,
"step": 2324
},
{
"epoch": 0.9918941979522184,
"grad_norm": 21.183456420898438,
"learning_rate": 8.984725965858042e-08,
"loss": 2.0312,
"step": 2325
},
{
"epoch": 0.992320819112628,
"grad_norm": 45.3771858215332,
"learning_rate": 8.53548966756514e-08,
"loss": 2.8213,
"step": 2326
},
{
"epoch": 0.9927474402730375,
"grad_norm": 28.3593807220459,
"learning_rate": 8.086253369272238e-08,
"loss": 1.8647,
"step": 2327
},
{
"epoch": 0.9931740614334471,
"grad_norm": 35.657875061035156,
"learning_rate": 7.637017070979336e-08,
"loss": 1.9424,
"step": 2328
},
{
"epoch": 0.9936006825938567,
"grad_norm": 30.0842227935791,
"learning_rate": 7.187780772686433e-08,
"loss": 2.043,
"step": 2329
},
{
"epoch": 0.9940273037542662,
"grad_norm": 23.042972564697266,
"learning_rate": 6.738544474393531e-08,
"loss": 2.0967,
"step": 2330
},
{
"epoch": 0.9944539249146758,
"grad_norm": 21.70021629333496,
"learning_rate": 6.289308176100629e-08,
"loss": 1.8965,
"step": 2331
},
{
"epoch": 0.9948805460750854,
"grad_norm": 27.594707489013672,
"learning_rate": 5.8400718778077276e-08,
"loss": 1.9629,
"step": 2332
},
{
"epoch": 0.9953071672354948,
"grad_norm": 22.49936866760254,
"learning_rate": 5.3908355795148254e-08,
"loss": 2.335,
"step": 2333
},
{
"epoch": 0.9957337883959044,
"grad_norm": 37.52388381958008,
"learning_rate": 4.941599281221923e-08,
"loss": 2.9873,
"step": 2334
},
{
"epoch": 0.996160409556314,
"grad_norm": 36.102996826171875,
"learning_rate": 4.492362982929021e-08,
"loss": 2.1113,
"step": 2335
},
{
"epoch": 0.9965870307167235,
"grad_norm": 23.30735969543457,
"learning_rate": 4.043126684636119e-08,
"loss": 2.0635,
"step": 2336
},
{
"epoch": 0.9970136518771331,
"grad_norm": 26.220535278320312,
"learning_rate": 3.593890386343217e-08,
"loss": 1.7402,
"step": 2337
},
{
"epoch": 0.9974402730375427,
"grad_norm": 22.964366912841797,
"learning_rate": 3.1446540880503146e-08,
"loss": 2.0645,
"step": 2338
},
{
"epoch": 0.9978668941979523,
"grad_norm": 36.568817138671875,
"learning_rate": 2.6954177897574127e-08,
"loss": 1.9258,
"step": 2339
},
{
"epoch": 0.9982935153583617,
"grad_norm": 23.925901412963867,
"learning_rate": 2.2461814914645105e-08,
"loss": 1.8818,
"step": 2340
},
{
"epoch": 0.9987201365187713,
"grad_norm": 25.192325592041016,
"learning_rate": 1.7969451931716084e-08,
"loss": 2.4082,
"step": 2341
},
{
"epoch": 0.9991467576791809,
"grad_norm": 46.00706100463867,
"learning_rate": 1.3477088948787064e-08,
"loss": 2.2656,
"step": 2342
},
{
"epoch": 0.9995733788395904,
"grad_norm": 24.589725494384766,
"learning_rate": 8.984725965858042e-09,
"loss": 2.0283,
"step": 2343
},
{
"epoch": 1.0,
"grad_norm": 20.995540618896484,
"learning_rate": 4.492362982929021e-09,
"loss": 2.2715,
"step": 2344
},
{
"epoch": 1.0,
"step": 2344,
"total_flos": 2.0877700871959347e+18,
"train_loss": 2.621050004666169,
"train_runtime": 1424.1008,
"train_samples_per_second": 421.318,
"train_steps_per_second": 1.646
}
],
"logging_steps": 1,
"max_steps": 2344,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.0877700871959347e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}