{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1172, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008532423208191126, "grad_norm": 476.3564758300781, "learning_rate": 0.0, "loss": 9.6172, "step": 1 }, { "epoch": 0.0017064846416382253, "grad_norm": 473.1830139160156, "learning_rate": 1.6949152542372883e-07, "loss": 9.1602, "step": 2 }, { "epoch": 0.002559726962457338, "grad_norm": 576.52734375, "learning_rate": 3.3898305084745766e-07, "loss": 10.8633, "step": 3 }, { "epoch": 0.0034129692832764505, "grad_norm": 394.45654296875, "learning_rate": 5.084745762711865e-07, "loss": 7.9258, "step": 4 }, { "epoch": 0.004266211604095563, "grad_norm": 435.7214660644531, "learning_rate": 6.779661016949153e-07, "loss": 10.0195, "step": 5 }, { "epoch": 0.005119453924914676, "grad_norm": 417.97314453125, "learning_rate": 8.474576271186441e-07, "loss": 9.6953, "step": 6 }, { "epoch": 0.005972696245733789, "grad_norm": 379.0962219238281, "learning_rate": 1.016949152542373e-06, "loss": 9.4453, "step": 7 }, { "epoch": 0.006825938566552901, "grad_norm": 333.2943420410156, "learning_rate": 1.186440677966102e-06, "loss": 7.7344, "step": 8 }, { "epoch": 0.007679180887372013, "grad_norm": 312.8900451660156, "learning_rate": 1.3559322033898307e-06, "loss": 7.8281, "step": 9 }, { "epoch": 0.008532423208191127, "grad_norm": 365.3479919433594, "learning_rate": 1.5254237288135596e-06, "loss": 9.1172, "step": 10 }, { "epoch": 0.00938566552901024, "grad_norm": 267.8406982421875, "learning_rate": 1.6949152542372882e-06, "loss": 8.2539, "step": 11 }, { "epoch": 0.010238907849829351, "grad_norm": 393.07293701171875, "learning_rate": 1.8644067796610171e-06, "loss": 9.0859, "step": 12 }, { "epoch": 0.011092150170648464, "grad_norm": 585.0484008789062, "learning_rate": 2.033898305084746e-06, "loss": 9.7812, "step": 13 }, { "epoch": 0.011945392491467578, "grad_norm": 658.748779296875, "learning_rate": 2.203389830508475e-06, "loss": 9.1484, "step": 14 }, { "epoch": 0.012798634812286689, "grad_norm": 607.95068359375, "learning_rate": 2.372881355932204e-06, "loss": 9.6484, "step": 15 }, { "epoch": 0.013651877133105802, "grad_norm": 722.521728515625, "learning_rate": 2.5423728813559323e-06, "loss": 10.4219, "step": 16 }, { "epoch": 0.014505119453924915, "grad_norm": 504.6495666503906, "learning_rate": 2.7118644067796613e-06, "loss": 7.9531, "step": 17 }, { "epoch": 0.015358361774744027, "grad_norm": 480.6727294921875, "learning_rate": 2.8813559322033903e-06, "loss": 7.3555, "step": 18 }, { "epoch": 0.016211604095563138, "grad_norm": 289.67529296875, "learning_rate": 3.0508474576271192e-06, "loss": 6.4062, "step": 19 }, { "epoch": 0.017064846416382253, "grad_norm": 143.08592224121094, "learning_rate": 3.2203389830508473e-06, "loss": 5.4492, "step": 20 }, { "epoch": 0.017918088737201365, "grad_norm": 145.436279296875, "learning_rate": 3.3898305084745763e-06, "loss": 5.2383, "step": 21 }, { "epoch": 0.01877133105802048, "grad_norm": 363.3115539550781, "learning_rate": 3.5593220338983053e-06, "loss": 6.668, "step": 22 }, { "epoch": 0.01962457337883959, "grad_norm": 311.4585266113281, "learning_rate": 3.7288135593220342e-06, "loss": 6.0078, "step": 23 }, { "epoch": 0.020477815699658702, "grad_norm": 132.5367431640625, "learning_rate": 3.898305084745763e-06, "loss": 4.8789, "step": 24 }, { "epoch": 0.021331058020477817, "grad_norm": 59.49905776977539, "learning_rate": 4.067796610169492e-06, "loss": 4.9141, "step": 25 }, { "epoch": 0.02218430034129693, "grad_norm": 144.73834228515625, "learning_rate": 4.23728813559322e-06, "loss": 5.2695, "step": 26 }, { "epoch": 0.02303754266211604, "grad_norm": 72.64948272705078, "learning_rate": 4.40677966101695e-06, "loss": 5.3594, "step": 27 }, { "epoch": 0.023890784982935155, "grad_norm": 67.38402557373047, "learning_rate": 4.576271186440678e-06, "loss": 4.5137, "step": 28 }, { "epoch": 0.024744027303754267, "grad_norm": 70.24736785888672, "learning_rate": 4.745762711864408e-06, "loss": 4.7305, "step": 29 }, { "epoch": 0.025597269624573378, "grad_norm": 37.224700927734375, "learning_rate": 4.915254237288136e-06, "loss": 4.5859, "step": 30 }, { "epoch": 0.026450511945392493, "grad_norm": 52.372982025146484, "learning_rate": 5.084745762711865e-06, "loss": 4.6504, "step": 31 }, { "epoch": 0.027303754266211604, "grad_norm": 58.09052658081055, "learning_rate": 5.254237288135594e-06, "loss": 5.0195, "step": 32 }, { "epoch": 0.028156996587030716, "grad_norm": 273.7731018066406, "learning_rate": 5.423728813559323e-06, "loss": 4.877, "step": 33 }, { "epoch": 0.02901023890784983, "grad_norm": 257.3023986816406, "learning_rate": 5.593220338983051e-06, "loss": 5.1367, "step": 34 }, { "epoch": 0.029863481228668942, "grad_norm": 37.15542984008789, "learning_rate": 5.7627118644067805e-06, "loss": 4.498, "step": 35 }, { "epoch": 0.030716723549488054, "grad_norm": 57.637229919433594, "learning_rate": 5.932203389830509e-06, "loss": 4.498, "step": 36 }, { "epoch": 0.031569965870307165, "grad_norm": 161.52142333984375, "learning_rate": 6.1016949152542385e-06, "loss": 4.8535, "step": 37 }, { "epoch": 0.032423208191126277, "grad_norm": 144.53648376464844, "learning_rate": 6.271186440677966e-06, "loss": 5.0547, "step": 38 }, { "epoch": 0.033276450511945395, "grad_norm": 97.21565246582031, "learning_rate": 6.440677966101695e-06, "loss": 4.4238, "step": 39 }, { "epoch": 0.034129692832764506, "grad_norm": 189.5275115966797, "learning_rate": 6.610169491525424e-06, "loss": 4.6973, "step": 40 }, { "epoch": 0.03498293515358362, "grad_norm": 52.70186233520508, "learning_rate": 6.779661016949153e-06, "loss": 4.2754, "step": 41 }, { "epoch": 0.03583617747440273, "grad_norm": 204.4432830810547, "learning_rate": 6.949152542372882e-06, "loss": 5.0195, "step": 42 }, { "epoch": 0.03668941979522184, "grad_norm": 179.79095458984375, "learning_rate": 7.1186440677966106e-06, "loss": 5.0684, "step": 43 }, { "epoch": 0.03754266211604096, "grad_norm": 119.27928924560547, "learning_rate": 7.288135593220339e-06, "loss": 4.6699, "step": 44 }, { "epoch": 0.03839590443686007, "grad_norm": 83.18463897705078, "learning_rate": 7.4576271186440685e-06, "loss": 4.0254, "step": 45 }, { "epoch": 0.03924914675767918, "grad_norm": 121.39764404296875, "learning_rate": 7.627118644067797e-06, "loss": 4.6602, "step": 46 }, { "epoch": 0.04010238907849829, "grad_norm": 32.138038635253906, "learning_rate": 7.796610169491526e-06, "loss": 4.5078, "step": 47 }, { "epoch": 0.040955631399317405, "grad_norm": 105.33744812011719, "learning_rate": 7.966101694915255e-06, "loss": 4.1465, "step": 48 }, { "epoch": 0.041808873720136516, "grad_norm": 163.0584259033203, "learning_rate": 8.135593220338983e-06, "loss": 4.7871, "step": 49 }, { "epoch": 0.042662116040955635, "grad_norm": 90.8143081665039, "learning_rate": 8.305084745762712e-06, "loss": 4.3184, "step": 50 }, { "epoch": 0.043515358361774746, "grad_norm": 130.4494171142578, "learning_rate": 8.47457627118644e-06, "loss": 4.6992, "step": 51 }, { "epoch": 0.04436860068259386, "grad_norm": 120.24535369873047, "learning_rate": 8.64406779661017e-06, "loss": 4.6289, "step": 52 }, { "epoch": 0.04522184300341297, "grad_norm": 129.9118194580078, "learning_rate": 8.8135593220339e-06, "loss": 4.2207, "step": 53 }, { "epoch": 0.04607508532423208, "grad_norm": 96.74512481689453, "learning_rate": 8.983050847457628e-06, "loss": 4.1035, "step": 54 }, { "epoch": 0.04692832764505119, "grad_norm": 127.90748596191406, "learning_rate": 9.152542372881356e-06, "loss": 4.7188, "step": 55 }, { "epoch": 0.04778156996587031, "grad_norm": 79.55717468261719, "learning_rate": 9.322033898305085e-06, "loss": 3.7441, "step": 56 }, { "epoch": 0.04863481228668942, "grad_norm": 107.03738403320312, "learning_rate": 9.491525423728815e-06, "loss": 4.3281, "step": 57 }, { "epoch": 0.04948805460750853, "grad_norm": 137.00802612304688, "learning_rate": 9.661016949152544e-06, "loss": 4.5039, "step": 58 }, { "epoch": 0.050341296928327645, "grad_norm": 56.521270751953125, "learning_rate": 9.830508474576272e-06, "loss": 4.1992, "step": 59 }, { "epoch": 0.051194539249146756, "grad_norm": 91.46454620361328, "learning_rate": 1e-05, "loss": 4.2109, "step": 60 }, { "epoch": 0.05204778156996587, "grad_norm": 77.05010986328125, "learning_rate": 9.991015274034143e-06, "loss": 3.8574, "step": 61 }, { "epoch": 0.052901023890784986, "grad_norm": 23.841175079345703, "learning_rate": 9.982030548068285e-06, "loss": 3.9512, "step": 62 }, { "epoch": 0.0537542662116041, "grad_norm": 84.35244750976562, "learning_rate": 9.973045822102425e-06, "loss": 3.9473, "step": 63 }, { "epoch": 0.05460750853242321, "grad_norm": 127.69181823730469, "learning_rate": 9.96406109613657e-06, "loss": 3.8828, "step": 64 }, { "epoch": 0.05546075085324232, "grad_norm": 27.332721710205078, "learning_rate": 9.955076370170711e-06, "loss": 3.8457, "step": 65 }, { "epoch": 0.05631399317406143, "grad_norm": 127.41008758544922, "learning_rate": 9.946091644204853e-06, "loss": 4.3711, "step": 66 }, { "epoch": 0.05716723549488054, "grad_norm": 50.57866287231445, "learning_rate": 9.937106918238994e-06, "loss": 3.4824, "step": 67 }, { "epoch": 0.05802047781569966, "grad_norm": 47.33999252319336, "learning_rate": 9.928122192273136e-06, "loss": 3.8301, "step": 68 }, { "epoch": 0.05887372013651877, "grad_norm": 28.65631675720215, "learning_rate": 9.919137466307278e-06, "loss": 3.5703, "step": 69 }, { "epoch": 0.059726962457337884, "grad_norm": 100.2359619140625, "learning_rate": 9.91015274034142e-06, "loss": 4.5332, "step": 70 }, { "epoch": 0.060580204778156996, "grad_norm": 27.0611572265625, "learning_rate": 9.901168014375562e-06, "loss": 3.5859, "step": 71 }, { "epoch": 0.06143344709897611, "grad_norm": 119.60940551757812, "learning_rate": 9.892183288409704e-06, "loss": 4.0703, "step": 72 }, { "epoch": 0.06228668941979522, "grad_norm": 113.57787322998047, "learning_rate": 9.883198562443846e-06, "loss": 4.0352, "step": 73 }, { "epoch": 0.06313993174061433, "grad_norm": 22.393367767333984, "learning_rate": 9.874213836477988e-06, "loss": 4.0293, "step": 74 }, { "epoch": 0.06399317406143344, "grad_norm": 77.3707504272461, "learning_rate": 9.86522911051213e-06, "loss": 3.6855, "step": 75 }, { "epoch": 0.06484641638225255, "grad_norm": 23.650131225585938, "learning_rate": 9.856244384546273e-06, "loss": 3.8066, "step": 76 }, { "epoch": 0.06569965870307168, "grad_norm": 78.92176055908203, "learning_rate": 9.847259658580413e-06, "loss": 4.0781, "step": 77 }, { "epoch": 0.06655290102389079, "grad_norm": 57.146488189697266, "learning_rate": 9.838274932614557e-06, "loss": 3.8672, "step": 78 }, { "epoch": 0.0674061433447099, "grad_norm": 65.50660705566406, "learning_rate": 9.829290206648699e-06, "loss": 3.2881, "step": 79 }, { "epoch": 0.06825938566552901, "grad_norm": 71.74410247802734, "learning_rate": 9.820305480682841e-06, "loss": 3.8984, "step": 80 }, { "epoch": 0.06911262798634812, "grad_norm": 34.78994369506836, "learning_rate": 9.811320754716981e-06, "loss": 3.8711, "step": 81 }, { "epoch": 0.06996587030716724, "grad_norm": 115.17135620117188, "learning_rate": 9.802336028751123e-06, "loss": 4.6289, "step": 82 }, { "epoch": 0.07081911262798635, "grad_norm": 74.1488037109375, "learning_rate": 9.793351302785265e-06, "loss": 4.1133, "step": 83 }, { "epoch": 0.07167235494880546, "grad_norm": 60.60784149169922, "learning_rate": 9.784366576819408e-06, "loss": 4.1113, "step": 84 }, { "epoch": 0.07252559726962457, "grad_norm": 138.30191040039062, "learning_rate": 9.77538185085355e-06, "loss": 3.7441, "step": 85 }, { "epoch": 0.07337883959044368, "grad_norm": 29.931669235229492, "learning_rate": 9.766397124887692e-06, "loss": 3.375, "step": 86 }, { "epoch": 0.07423208191126279, "grad_norm": 53.22774887084961, "learning_rate": 9.757412398921834e-06, "loss": 3.6758, "step": 87 }, { "epoch": 0.07508532423208192, "grad_norm": 38.70452880859375, "learning_rate": 9.748427672955976e-06, "loss": 4.002, "step": 88 }, { "epoch": 0.07593856655290103, "grad_norm": 30.55535125732422, "learning_rate": 9.739442946990118e-06, "loss": 4.207, "step": 89 }, { "epoch": 0.07679180887372014, "grad_norm": 27.83077049255371, "learning_rate": 9.73045822102426e-06, "loss": 3.4453, "step": 90 }, { "epoch": 0.07764505119453925, "grad_norm": 121.7099380493164, "learning_rate": 9.7214734950584e-06, "loss": 3.9688, "step": 91 }, { "epoch": 0.07849829351535836, "grad_norm": 81.67149353027344, "learning_rate": 9.712488769092544e-06, "loss": 3.8965, "step": 92 }, { "epoch": 0.07935153583617748, "grad_norm": 39.18846893310547, "learning_rate": 9.703504043126686e-06, "loss": 3.8633, "step": 93 }, { "epoch": 0.08020477815699659, "grad_norm": 84.66485595703125, "learning_rate": 9.694519317160828e-06, "loss": 3.9961, "step": 94 }, { "epoch": 0.0810580204778157, "grad_norm": 82.30975341796875, "learning_rate": 9.685534591194969e-06, "loss": 3.793, "step": 95 }, { "epoch": 0.08191126279863481, "grad_norm": 88.6453628540039, "learning_rate": 9.676549865229111e-06, "loss": 3.8477, "step": 96 }, { "epoch": 0.08276450511945392, "grad_norm": 105.96221160888672, "learning_rate": 9.667565139263253e-06, "loss": 3.9941, "step": 97 }, { "epoch": 0.08361774744027303, "grad_norm": 23.890165328979492, "learning_rate": 9.658580413297395e-06, "loss": 3.3945, "step": 98 }, { "epoch": 0.08447098976109214, "grad_norm": 77.99059295654297, "learning_rate": 9.649595687331537e-06, "loss": 3.6816, "step": 99 }, { "epoch": 0.08532423208191127, "grad_norm": 68.72335052490234, "learning_rate": 9.64061096136568e-06, "loss": 3.8086, "step": 100 }, { "epoch": 0.08617747440273038, "grad_norm": 48.387054443359375, "learning_rate": 9.631626235399821e-06, "loss": 3.7148, "step": 101 }, { "epoch": 0.08703071672354949, "grad_norm": 30.06928825378418, "learning_rate": 9.622641509433963e-06, "loss": 3.1797, "step": 102 }, { "epoch": 0.0878839590443686, "grad_norm": 103.71221923828125, "learning_rate": 9.613656783468106e-06, "loss": 3.4102, "step": 103 }, { "epoch": 0.08873720136518772, "grad_norm": 22.561519622802734, "learning_rate": 9.604672057502246e-06, "loss": 3.4941, "step": 104 }, { "epoch": 0.08959044368600683, "grad_norm": 36.27552032470703, "learning_rate": 9.595687331536388e-06, "loss": 4.0645, "step": 105 }, { "epoch": 0.09044368600682594, "grad_norm": 60.4101448059082, "learning_rate": 9.58670260557053e-06, "loss": 3.1895, "step": 106 }, { "epoch": 0.09129692832764505, "grad_norm": 31.981599807739258, "learning_rate": 9.577717879604674e-06, "loss": 3.123, "step": 107 }, { "epoch": 0.09215017064846416, "grad_norm": 51.39161682128906, "learning_rate": 9.568733153638814e-06, "loss": 3.8027, "step": 108 }, { "epoch": 0.09300341296928327, "grad_norm": 17.14482307434082, "learning_rate": 9.559748427672956e-06, "loss": 3.5996, "step": 109 }, { "epoch": 0.09385665529010238, "grad_norm": 79.9706802368164, "learning_rate": 9.550763701707098e-06, "loss": 3.2578, "step": 110 }, { "epoch": 0.0947098976109215, "grad_norm": 19.872787475585938, "learning_rate": 9.54177897574124e-06, "loss": 3.0957, "step": 111 }, { "epoch": 0.09556313993174062, "grad_norm": 50.38517761230469, "learning_rate": 9.532794249775383e-06, "loss": 3.6895, "step": 112 }, { "epoch": 0.09641638225255973, "grad_norm": 50.98223876953125, "learning_rate": 9.523809523809525e-06, "loss": 3.7188, "step": 113 }, { "epoch": 0.09726962457337884, "grad_norm": 46.27577590942383, "learning_rate": 9.514824797843667e-06, "loss": 3.2383, "step": 114 }, { "epoch": 0.09812286689419795, "grad_norm": 43.620479583740234, "learning_rate": 9.505840071877809e-06, "loss": 3.6348, "step": 115 }, { "epoch": 0.09897610921501707, "grad_norm": 73.57115173339844, "learning_rate": 9.496855345911951e-06, "loss": 3.1875, "step": 116 }, { "epoch": 0.09982935153583618, "grad_norm": 29.671640396118164, "learning_rate": 9.487870619946093e-06, "loss": 3.4141, "step": 117 }, { "epoch": 0.10068259385665529, "grad_norm": 37.94879150390625, "learning_rate": 9.478885893980234e-06, "loss": 3.4414, "step": 118 }, { "epoch": 0.1015358361774744, "grad_norm": 51.39364242553711, "learning_rate": 9.469901168014376e-06, "loss": 3.4434, "step": 119 }, { "epoch": 0.10238907849829351, "grad_norm": 46.911163330078125, "learning_rate": 9.460916442048518e-06, "loss": 3.4355, "step": 120 }, { "epoch": 0.10324232081911262, "grad_norm": 32.2253303527832, "learning_rate": 9.451931716082661e-06, "loss": 3.2402, "step": 121 }, { "epoch": 0.10409556313993173, "grad_norm": 87.56474304199219, "learning_rate": 9.442946990116802e-06, "loss": 3.5059, "step": 122 }, { "epoch": 0.10494880546075085, "grad_norm": 75.4452896118164, "learning_rate": 9.433962264150944e-06, "loss": 3.1016, "step": 123 }, { "epoch": 0.10580204778156997, "grad_norm": 21.062419891357422, "learning_rate": 9.424977538185086e-06, "loss": 3.5176, "step": 124 }, { "epoch": 0.10665529010238908, "grad_norm": 34.950862884521484, "learning_rate": 9.415992812219228e-06, "loss": 3.2168, "step": 125 }, { "epoch": 0.1075085324232082, "grad_norm": 89.45964813232422, "learning_rate": 9.40700808625337e-06, "loss": 3.25, "step": 126 }, { "epoch": 0.1083617747440273, "grad_norm": 58.562896728515625, "learning_rate": 9.398023360287512e-06, "loss": 3.7793, "step": 127 }, { "epoch": 0.10921501706484642, "grad_norm": 54.15276336669922, "learning_rate": 9.389038634321654e-06, "loss": 3.5, "step": 128 }, { "epoch": 0.11006825938566553, "grad_norm": 32.4635124206543, "learning_rate": 9.380053908355796e-06, "loss": 2.915, "step": 129 }, { "epoch": 0.11092150170648464, "grad_norm": 22.57988739013672, "learning_rate": 9.371069182389939e-06, "loss": 2.9502, "step": 130 }, { "epoch": 0.11177474402730375, "grad_norm": 38.44780731201172, "learning_rate": 9.36208445642408e-06, "loss": 2.9336, "step": 131 }, { "epoch": 0.11262798634812286, "grad_norm": 21.83592414855957, "learning_rate": 9.353099730458221e-06, "loss": 2.9707, "step": 132 }, { "epoch": 0.11348122866894197, "grad_norm": 24.39005470275879, "learning_rate": 9.344115004492363e-06, "loss": 3.2578, "step": 133 }, { "epoch": 0.11433447098976109, "grad_norm": 59.925758361816406, "learning_rate": 9.335130278526505e-06, "loss": 3.2988, "step": 134 }, { "epoch": 0.11518771331058021, "grad_norm": 74.08988189697266, "learning_rate": 9.326145552560647e-06, "loss": 2.7236, "step": 135 }, { "epoch": 0.11604095563139932, "grad_norm": 63.953453063964844, "learning_rate": 9.31716082659479e-06, "loss": 3.1807, "step": 136 }, { "epoch": 0.11689419795221843, "grad_norm": 36.688720703125, "learning_rate": 9.308176100628931e-06, "loss": 2.8018, "step": 137 }, { "epoch": 0.11774744027303755, "grad_norm": 60.307430267333984, "learning_rate": 9.299191374663074e-06, "loss": 3.1572, "step": 138 }, { "epoch": 0.11860068259385666, "grad_norm": 28.88834571838379, "learning_rate": 9.290206648697216e-06, "loss": 3.4766, "step": 139 }, { "epoch": 0.11945392491467577, "grad_norm": 40.18682861328125, "learning_rate": 9.281221922731358e-06, "loss": 3.8594, "step": 140 }, { "epoch": 0.12030716723549488, "grad_norm": 74.8680648803711, "learning_rate": 9.272237196765498e-06, "loss": 3.1719, "step": 141 }, { "epoch": 0.12116040955631399, "grad_norm": 42.63037109375, "learning_rate": 9.263252470799642e-06, "loss": 2.8574, "step": 142 }, { "epoch": 0.1220136518771331, "grad_norm": 32.380043029785156, "learning_rate": 9.254267744833784e-06, "loss": 3.2363, "step": 143 }, { "epoch": 0.12286689419795221, "grad_norm": 71.21893310546875, "learning_rate": 9.245283018867926e-06, "loss": 3.1865, "step": 144 }, { "epoch": 0.12372013651877133, "grad_norm": 27.608762741088867, "learning_rate": 9.236298292902067e-06, "loss": 2.9258, "step": 145 }, { "epoch": 0.12457337883959044, "grad_norm": 26.77503776550293, "learning_rate": 9.227313566936209e-06, "loss": 3.0938, "step": 146 }, { "epoch": 0.12542662116040956, "grad_norm": 103.02552795410156, "learning_rate": 9.21832884097035e-06, "loss": 3.1328, "step": 147 }, { "epoch": 0.12627986348122866, "grad_norm": 83.52420806884766, "learning_rate": 9.209344115004493e-06, "loss": 2.9922, "step": 148 }, { "epoch": 0.12713310580204779, "grad_norm": 73.67784118652344, "learning_rate": 9.200359389038635e-06, "loss": 2.8223, "step": 149 }, { "epoch": 0.12798634812286688, "grad_norm": 23.8249454498291, "learning_rate": 9.191374663072777e-06, "loss": 3.3398, "step": 150 }, { "epoch": 0.128839590443686, "grad_norm": 69.4936294555664, "learning_rate": 9.182389937106919e-06, "loss": 3.4326, "step": 151 }, { "epoch": 0.1296928327645051, "grad_norm": 88.59197998046875, "learning_rate": 9.173405211141061e-06, "loss": 3.4414, "step": 152 }, { "epoch": 0.13054607508532423, "grad_norm": 29.386064529418945, "learning_rate": 9.164420485175203e-06, "loss": 3.0303, "step": 153 }, { "epoch": 0.13139931740614336, "grad_norm": 38.90749740600586, "learning_rate": 9.155435759209345e-06, "loss": 2.9619, "step": 154 }, { "epoch": 0.13225255972696245, "grad_norm": 74.41140747070312, "learning_rate": 9.146451033243486e-06, "loss": 3.4102, "step": 155 }, { "epoch": 0.13310580204778158, "grad_norm": 44.66842269897461, "learning_rate": 9.13746630727763e-06, "loss": 2.6963, "step": 156 }, { "epoch": 0.13395904436860068, "grad_norm": 33.257205963134766, "learning_rate": 9.128481581311772e-06, "loss": 3.1924, "step": 157 }, { "epoch": 0.1348122866894198, "grad_norm": 100.31049346923828, "learning_rate": 9.119496855345914e-06, "loss": 3.3809, "step": 158 }, { "epoch": 0.1356655290102389, "grad_norm": 54.77112579345703, "learning_rate": 9.110512129380054e-06, "loss": 3.5938, "step": 159 }, { "epoch": 0.13651877133105803, "grad_norm": 18.681119918823242, "learning_rate": 9.101527403414196e-06, "loss": 3.4785, "step": 160 }, { "epoch": 0.13737201365187712, "grad_norm": 62.4477424621582, "learning_rate": 9.092542677448338e-06, "loss": 3.2266, "step": 161 }, { "epoch": 0.13822525597269625, "grad_norm": 97.80989074707031, "learning_rate": 9.08355795148248e-06, "loss": 3.0898, "step": 162 }, { "epoch": 0.13907849829351535, "grad_norm": 91.97032928466797, "learning_rate": 9.074573225516622e-06, "loss": 3.6445, "step": 163 }, { "epoch": 0.13993174061433447, "grad_norm": 31.477741241455078, "learning_rate": 9.065588499550765e-06, "loss": 3.3359, "step": 164 }, { "epoch": 0.1407849829351536, "grad_norm": 21.278085708618164, "learning_rate": 9.056603773584907e-06, "loss": 3.3047, "step": 165 }, { "epoch": 0.1416382252559727, "grad_norm": 52.15373229980469, "learning_rate": 9.047619047619049e-06, "loss": 3.5176, "step": 166 }, { "epoch": 0.14249146757679182, "grad_norm": 14.358907699584961, "learning_rate": 9.03863432165319e-06, "loss": 2.8945, "step": 167 }, { "epoch": 0.14334470989761092, "grad_norm": 33.82578659057617, "learning_rate": 9.029649595687333e-06, "loss": 3.3242, "step": 168 }, { "epoch": 0.14419795221843004, "grad_norm": 72.78608703613281, "learning_rate": 9.020664869721473e-06, "loss": 3.416, "step": 169 }, { "epoch": 0.14505119453924914, "grad_norm": 97.01647186279297, "learning_rate": 9.011680143755617e-06, "loss": 3.25, "step": 170 }, { "epoch": 0.14590443686006827, "grad_norm": 54.42570114135742, "learning_rate": 9.002695417789759e-06, "loss": 3.1309, "step": 171 }, { "epoch": 0.14675767918088736, "grad_norm": 26.412174224853516, "learning_rate": 8.9937106918239e-06, "loss": 3.4609, "step": 172 }, { "epoch": 0.1476109215017065, "grad_norm": 71.91547393798828, "learning_rate": 8.984725965858042e-06, "loss": 3.1289, "step": 173 }, { "epoch": 0.14846416382252559, "grad_norm": 66.65043640136719, "learning_rate": 8.975741239892184e-06, "loss": 3.2383, "step": 174 }, { "epoch": 0.1493174061433447, "grad_norm": 164.26414489746094, "learning_rate": 8.966756513926326e-06, "loss": 4.3633, "step": 175 }, { "epoch": 0.15017064846416384, "grad_norm": 73.09919738769531, "learning_rate": 8.957771787960468e-06, "loss": 3.2148, "step": 176 }, { "epoch": 0.15102389078498293, "grad_norm": 40.517093658447266, "learning_rate": 8.94878706199461e-06, "loss": 2.9707, "step": 177 }, { "epoch": 0.15187713310580206, "grad_norm": 76.36444854736328, "learning_rate": 8.939802336028752e-06, "loss": 2.9424, "step": 178 }, { "epoch": 0.15273037542662116, "grad_norm": 119.0013198852539, "learning_rate": 8.930817610062894e-06, "loss": 3.1953, "step": 179 }, { "epoch": 0.15358361774744028, "grad_norm": 103.3395004272461, "learning_rate": 8.921832884097036e-06, "loss": 3.543, "step": 180 }, { "epoch": 0.15443686006825938, "grad_norm": 106.20706176757812, "learning_rate": 8.912848158131178e-06, "loss": 3.7734, "step": 181 }, { "epoch": 0.1552901023890785, "grad_norm": 53.621829986572266, "learning_rate": 8.903863432165319e-06, "loss": 3.1689, "step": 182 }, { "epoch": 0.1561433447098976, "grad_norm": 47.70130920410156, "learning_rate": 8.89487870619946e-06, "loss": 3.1543, "step": 183 }, { "epoch": 0.15699658703071673, "grad_norm": 38.16180419921875, "learning_rate": 8.885893980233603e-06, "loss": 2.6494, "step": 184 }, { "epoch": 0.15784982935153583, "grad_norm": 89.42051696777344, "learning_rate": 8.876909254267747e-06, "loss": 3.5508, "step": 185 }, { "epoch": 0.15870307167235495, "grad_norm": 60.47245407104492, "learning_rate": 8.867924528301887e-06, "loss": 3.6777, "step": 186 }, { "epoch": 0.15955631399317405, "grad_norm": 24.777610778808594, "learning_rate": 8.85893980233603e-06, "loss": 2.3027, "step": 187 }, { "epoch": 0.16040955631399317, "grad_norm": 28.14823341369629, "learning_rate": 8.849955076370171e-06, "loss": 3.0215, "step": 188 }, { "epoch": 0.1612627986348123, "grad_norm": 33.189239501953125, "learning_rate": 8.840970350404313e-06, "loss": 3.2891, "step": 189 }, { "epoch": 0.1621160409556314, "grad_norm": 25.520509719848633, "learning_rate": 8.831985624438455e-06, "loss": 2.8379, "step": 190 }, { "epoch": 0.16296928327645052, "grad_norm": 55.70583724975586, "learning_rate": 8.823000898472598e-06, "loss": 3.25, "step": 191 }, { "epoch": 0.16382252559726962, "grad_norm": 26.865032196044922, "learning_rate": 8.81401617250674e-06, "loss": 3.1055, "step": 192 }, { "epoch": 0.16467576791808874, "grad_norm": 22.27757453918457, "learning_rate": 8.805031446540882e-06, "loss": 2.5625, "step": 193 }, { "epoch": 0.16552901023890784, "grad_norm": 22.520416259765625, "learning_rate": 8.796046720575024e-06, "loss": 2.8887, "step": 194 }, { "epoch": 0.16638225255972697, "grad_norm": 18.727357864379883, "learning_rate": 8.787061994609166e-06, "loss": 2.6807, "step": 195 }, { "epoch": 0.16723549488054607, "grad_norm": 19.526918411254883, "learning_rate": 8.778077268643306e-06, "loss": 2.9512, "step": 196 }, { "epoch": 0.1680887372013652, "grad_norm": 25.042152404785156, "learning_rate": 8.769092542677448e-06, "loss": 3.168, "step": 197 }, { "epoch": 0.1689419795221843, "grad_norm": 21.94442367553711, "learning_rate": 8.76010781671159e-06, "loss": 3.3008, "step": 198 }, { "epoch": 0.1697952218430034, "grad_norm": 18.068660736083984, "learning_rate": 8.751123090745734e-06, "loss": 2.999, "step": 199 }, { "epoch": 0.17064846416382254, "grad_norm": 54.0893669128418, "learning_rate": 8.742138364779875e-06, "loss": 2.8115, "step": 200 }, { "epoch": 0.17150170648464164, "grad_norm": 87.9207992553711, "learning_rate": 8.733153638814017e-06, "loss": 3.6152, "step": 201 }, { "epoch": 0.17235494880546076, "grad_norm": 33.18696975708008, "learning_rate": 8.724168912848159e-06, "loss": 3.0527, "step": 202 }, { "epoch": 0.17320819112627986, "grad_norm": 36.34266662597656, "learning_rate": 8.715184186882301e-06, "loss": 3.2754, "step": 203 }, { "epoch": 0.17406143344709898, "grad_norm": 55.32210922241211, "learning_rate": 8.706199460916443e-06, "loss": 3.9746, "step": 204 }, { "epoch": 0.17491467576791808, "grad_norm": 27.2241268157959, "learning_rate": 8.697214734950583e-06, "loss": 3.4297, "step": 205 }, { "epoch": 0.1757679180887372, "grad_norm": 19.6944580078125, "learning_rate": 8.688230008984727e-06, "loss": 3.2139, "step": 206 }, { "epoch": 0.1766211604095563, "grad_norm": 14.382315635681152, "learning_rate": 8.67924528301887e-06, "loss": 2.7715, "step": 207 }, { "epoch": 0.17747440273037543, "grad_norm": 20.982158660888672, "learning_rate": 8.670260557053011e-06, "loss": 3.3418, "step": 208 }, { "epoch": 0.17832764505119453, "grad_norm": 23.547433853149414, "learning_rate": 8.661275831087152e-06, "loss": 2.9199, "step": 209 }, { "epoch": 0.17918088737201365, "grad_norm": 43.464237213134766, "learning_rate": 8.652291105121294e-06, "loss": 3.0234, "step": 210 }, { "epoch": 0.18003412969283278, "grad_norm": 41.555118560791016, "learning_rate": 8.643306379155436e-06, "loss": 3.0107, "step": 211 }, { "epoch": 0.18088737201365188, "grad_norm": 42.51097869873047, "learning_rate": 8.634321653189578e-06, "loss": 3.1074, "step": 212 }, { "epoch": 0.181740614334471, "grad_norm": 35.8163948059082, "learning_rate": 8.62533692722372e-06, "loss": 3.0488, "step": 213 }, { "epoch": 0.1825938566552901, "grad_norm": 48.06075668334961, "learning_rate": 8.616352201257862e-06, "loss": 2.8594, "step": 214 }, { "epoch": 0.18344709897610922, "grad_norm": 31.809709548950195, "learning_rate": 8.607367475292004e-06, "loss": 3.0605, "step": 215 }, { "epoch": 0.18430034129692832, "grad_norm": 33.539005279541016, "learning_rate": 8.598382749326146e-06, "loss": 2.8164, "step": 216 }, { "epoch": 0.18515358361774745, "grad_norm": 30.759517669677734, "learning_rate": 8.589398023360288e-06, "loss": 3.4043, "step": 217 }, { "epoch": 0.18600682593856654, "grad_norm": 46.54279327392578, "learning_rate": 8.58041329739443e-06, "loss": 3.3164, "step": 218 }, { "epoch": 0.18686006825938567, "grad_norm": 17.588998794555664, "learning_rate": 8.571428571428571e-06, "loss": 2.8232, "step": 219 }, { "epoch": 0.18771331058020477, "grad_norm": 40.2357063293457, "learning_rate": 8.562443845462715e-06, "loss": 3.6035, "step": 220 }, { "epoch": 0.1885665529010239, "grad_norm": 30.986467361450195, "learning_rate": 8.553459119496857e-06, "loss": 3.2402, "step": 221 }, { "epoch": 0.189419795221843, "grad_norm": 71.314453125, "learning_rate": 8.544474393530999e-06, "loss": 2.8965, "step": 222 }, { "epoch": 0.19027303754266212, "grad_norm": 60.04804611206055, "learning_rate": 8.53548966756514e-06, "loss": 3.2324, "step": 223 }, { "epoch": 0.19112627986348124, "grad_norm": 25.102706909179688, "learning_rate": 8.526504941599281e-06, "loss": 3.1973, "step": 224 }, { "epoch": 0.19197952218430034, "grad_norm": 58.8226203918457, "learning_rate": 8.517520215633423e-06, "loss": 3.291, "step": 225 }, { "epoch": 0.19283276450511946, "grad_norm": 80.93440246582031, "learning_rate": 8.508535489667566e-06, "loss": 4.0488, "step": 226 }, { "epoch": 0.19368600682593856, "grad_norm": 15.695361137390137, "learning_rate": 8.499550763701708e-06, "loss": 2.875, "step": 227 }, { "epoch": 0.1945392491467577, "grad_norm": 18.42605209350586, "learning_rate": 8.49056603773585e-06, "loss": 3.0059, "step": 228 }, { "epoch": 0.19539249146757678, "grad_norm": 20.952381134033203, "learning_rate": 8.481581311769992e-06, "loss": 3.6758, "step": 229 }, { "epoch": 0.1962457337883959, "grad_norm": 33.53485107421875, "learning_rate": 8.472596585804134e-06, "loss": 2.6758, "step": 230 }, { "epoch": 0.197098976109215, "grad_norm": 18.76603889465332, "learning_rate": 8.463611859838276e-06, "loss": 3.0684, "step": 231 }, { "epoch": 0.19795221843003413, "grad_norm": 61.98395538330078, "learning_rate": 8.454627133872418e-06, "loss": 3.3242, "step": 232 }, { "epoch": 0.19880546075085323, "grad_norm": 24.302837371826172, "learning_rate": 8.445642407906558e-06, "loss": 3.5234, "step": 233 }, { "epoch": 0.19965870307167236, "grad_norm": 58.31713104248047, "learning_rate": 8.436657681940702e-06, "loss": 3.7285, "step": 234 }, { "epoch": 0.20051194539249148, "grad_norm": 36.07301712036133, "learning_rate": 8.427672955974844e-06, "loss": 3.2148, "step": 235 }, { "epoch": 0.20136518771331058, "grad_norm": 20.333580017089844, "learning_rate": 8.418688230008986e-06, "loss": 3.2344, "step": 236 }, { "epoch": 0.2022184300341297, "grad_norm": 19.519014358520508, "learning_rate": 8.409703504043127e-06, "loss": 2.7363, "step": 237 }, { "epoch": 0.2030716723549488, "grad_norm": 17.365482330322266, "learning_rate": 8.400718778077269e-06, "loss": 3.0752, "step": 238 }, { "epoch": 0.20392491467576793, "grad_norm": 14.840271949768066, "learning_rate": 8.391734052111411e-06, "loss": 2.9824, "step": 239 }, { "epoch": 0.20477815699658702, "grad_norm": 19.23467254638672, "learning_rate": 8.382749326145553e-06, "loss": 2.9961, "step": 240 }, { "epoch": 0.20563139931740615, "grad_norm": 68.42906188964844, "learning_rate": 8.373764600179695e-06, "loss": 3.4277, "step": 241 }, { "epoch": 0.20648464163822525, "grad_norm": 26.17658042907715, "learning_rate": 8.364779874213837e-06, "loss": 2.6973, "step": 242 }, { "epoch": 0.20733788395904437, "grad_norm": 18.755210876464844, "learning_rate": 8.35579514824798e-06, "loss": 3.1953, "step": 243 }, { "epoch": 0.20819112627986347, "grad_norm": 17.498382568359375, "learning_rate": 8.346810422282121e-06, "loss": 3.0918, "step": 244 }, { "epoch": 0.2090443686006826, "grad_norm": 44.80198669433594, "learning_rate": 8.337825696316264e-06, "loss": 2.4717, "step": 245 }, { "epoch": 0.2098976109215017, "grad_norm": 45.40264892578125, "learning_rate": 8.328840970350404e-06, "loss": 2.9092, "step": 246 }, { "epoch": 0.21075085324232082, "grad_norm": 28.791826248168945, "learning_rate": 8.319856244384546e-06, "loss": 2.9834, "step": 247 }, { "epoch": 0.21160409556313994, "grad_norm": 26.131162643432617, "learning_rate": 8.31087151841869e-06, "loss": 2.9482, "step": 248 }, { "epoch": 0.21245733788395904, "grad_norm": 45.71311950683594, "learning_rate": 8.301886792452832e-06, "loss": 3.1572, "step": 249 }, { "epoch": 0.21331058020477817, "grad_norm": 27.159473419189453, "learning_rate": 8.292902066486972e-06, "loss": 2.8818, "step": 250 }, { "epoch": 0.21416382252559726, "grad_norm": 24.58170509338379, "learning_rate": 8.283917340521114e-06, "loss": 3.2188, "step": 251 }, { "epoch": 0.2150170648464164, "grad_norm": 25.872392654418945, "learning_rate": 8.274932614555256e-06, "loss": 3.2979, "step": 252 }, { "epoch": 0.2158703071672355, "grad_norm": 23.962533950805664, "learning_rate": 8.265947888589399e-06, "loss": 3.4648, "step": 253 }, { "epoch": 0.2167235494880546, "grad_norm": 17.57655143737793, "learning_rate": 8.25696316262354e-06, "loss": 2.7656, "step": 254 }, { "epoch": 0.2175767918088737, "grad_norm": 57.268821716308594, "learning_rate": 8.247978436657683e-06, "loss": 3.459, "step": 255 }, { "epoch": 0.21843003412969283, "grad_norm": 24.482690811157227, "learning_rate": 8.238993710691825e-06, "loss": 3.584, "step": 256 }, { "epoch": 0.21928327645051193, "grad_norm": 25.513710021972656, "learning_rate": 8.230008984725967e-06, "loss": 3.4512, "step": 257 }, { "epoch": 0.22013651877133106, "grad_norm": 55.66819381713867, "learning_rate": 8.221024258760109e-06, "loss": 3.0996, "step": 258 }, { "epoch": 0.22098976109215018, "grad_norm": 50.33326721191406, "learning_rate": 8.212039532794251e-06, "loss": 3.498, "step": 259 }, { "epoch": 0.22184300341296928, "grad_norm": 30.594228744506836, "learning_rate": 8.203054806828391e-06, "loss": 3.1953, "step": 260 }, { "epoch": 0.2226962457337884, "grad_norm": 20.082685470581055, "learning_rate": 8.194070080862534e-06, "loss": 3.2734, "step": 261 }, { "epoch": 0.2235494880546075, "grad_norm": 23.434057235717773, "learning_rate": 8.185085354896676e-06, "loss": 3.0488, "step": 262 }, { "epoch": 0.22440273037542663, "grad_norm": 75.93733215332031, "learning_rate": 8.17610062893082e-06, "loss": 3.4014, "step": 263 }, { "epoch": 0.22525597269624573, "grad_norm": 34.896339416503906, "learning_rate": 8.16711590296496e-06, "loss": 2.9199, "step": 264 }, { "epoch": 0.22610921501706485, "grad_norm": 14.582112312316895, "learning_rate": 8.158131176999102e-06, "loss": 2.9688, "step": 265 }, { "epoch": 0.22696245733788395, "grad_norm": 35.01908874511719, "learning_rate": 8.149146451033244e-06, "loss": 3.0977, "step": 266 }, { "epoch": 0.22781569965870307, "grad_norm": 18.878812789916992, "learning_rate": 8.140161725067386e-06, "loss": 2.9307, "step": 267 }, { "epoch": 0.22866894197952217, "grad_norm": 38.857398986816406, "learning_rate": 8.131176999101528e-06, "loss": 2.8672, "step": 268 }, { "epoch": 0.2295221843003413, "grad_norm": 21.270587921142578, "learning_rate": 8.12219227313567e-06, "loss": 2.79, "step": 269 }, { "epoch": 0.23037542662116042, "grad_norm": 55.603302001953125, "learning_rate": 8.113207547169812e-06, "loss": 3.1348, "step": 270 }, { "epoch": 0.23122866894197952, "grad_norm": 60.84667205810547, "learning_rate": 8.104222821203954e-06, "loss": 3.1846, "step": 271 }, { "epoch": 0.23208191126279865, "grad_norm": 21.8660888671875, "learning_rate": 8.095238095238097e-06, "loss": 2.4336, "step": 272 }, { "epoch": 0.23293515358361774, "grad_norm": 47.333717346191406, "learning_rate": 8.086253369272239e-06, "loss": 3.2207, "step": 273 }, { "epoch": 0.23378839590443687, "grad_norm": 41.70451354980469, "learning_rate": 8.077268643306379e-06, "loss": 2.8359, "step": 274 }, { "epoch": 0.23464163822525597, "grad_norm": 35.146942138671875, "learning_rate": 8.068283917340521e-06, "loss": 2.6738, "step": 275 }, { "epoch": 0.2354948805460751, "grad_norm": 16.213695526123047, "learning_rate": 8.059299191374663e-06, "loss": 2.8086, "step": 276 }, { "epoch": 0.2363481228668942, "grad_norm": 47.92852783203125, "learning_rate": 8.050314465408805e-06, "loss": 2.8867, "step": 277 }, { "epoch": 0.23720136518771331, "grad_norm": 42.166404724121094, "learning_rate": 8.041329739442947e-06, "loss": 3.125, "step": 278 }, { "epoch": 0.2380546075085324, "grad_norm": 24.129230499267578, "learning_rate": 8.03234501347709e-06, "loss": 3.043, "step": 279 }, { "epoch": 0.23890784982935154, "grad_norm": 16.145126342773438, "learning_rate": 8.023360287511232e-06, "loss": 3.3672, "step": 280 }, { "epoch": 0.23976109215017063, "grad_norm": 68.74685668945312, "learning_rate": 8.014375561545374e-06, "loss": 3.1924, "step": 281 }, { "epoch": 0.24061433447098976, "grad_norm": 25.997495651245117, "learning_rate": 8.005390835579516e-06, "loss": 3.0293, "step": 282 }, { "epoch": 0.24146757679180889, "grad_norm": 53.29498291015625, "learning_rate": 7.996406109613656e-06, "loss": 2.9922, "step": 283 }, { "epoch": 0.24232081911262798, "grad_norm": 15.400269508361816, "learning_rate": 7.9874213836478e-06, "loss": 3.1387, "step": 284 }, { "epoch": 0.2431740614334471, "grad_norm": 20.252784729003906, "learning_rate": 7.978436657681942e-06, "loss": 2.9648, "step": 285 }, { "epoch": 0.2440273037542662, "grad_norm": 57.464752197265625, "learning_rate": 7.969451931716084e-06, "loss": 3.6719, "step": 286 }, { "epoch": 0.24488054607508533, "grad_norm": 43.41289520263672, "learning_rate": 7.960467205750224e-06, "loss": 2.8242, "step": 287 }, { "epoch": 0.24573378839590443, "grad_norm": 17.788986206054688, "learning_rate": 7.951482479784367e-06, "loss": 3.1875, "step": 288 }, { "epoch": 0.24658703071672355, "grad_norm": 15.844386100769043, "learning_rate": 7.942497753818509e-06, "loss": 2.9785, "step": 289 }, { "epoch": 0.24744027303754265, "grad_norm": 58.50739669799805, "learning_rate": 7.93351302785265e-06, "loss": 2.9883, "step": 290 }, { "epoch": 0.24829351535836178, "grad_norm": 60.459251403808594, "learning_rate": 7.924528301886793e-06, "loss": 2.8467, "step": 291 }, { "epoch": 0.24914675767918087, "grad_norm": 48.048728942871094, "learning_rate": 7.915543575920935e-06, "loss": 3.1133, "step": 292 }, { "epoch": 0.25, "grad_norm": 25.91424560546875, "learning_rate": 7.906558849955077e-06, "loss": 2.5762, "step": 293 }, { "epoch": 0.2508532423208191, "grad_norm": 65.37548065185547, "learning_rate": 7.897574123989219e-06, "loss": 3.1191, "step": 294 }, { "epoch": 0.25170648464163825, "grad_norm": 48.94771194458008, "learning_rate": 7.888589398023361e-06, "loss": 2.6484, "step": 295 }, { "epoch": 0.2525597269624573, "grad_norm": 45.35071563720703, "learning_rate": 7.879604672057503e-06, "loss": 3.1436, "step": 296 }, { "epoch": 0.25341296928327645, "grad_norm": 50.98272705078125, "learning_rate": 7.870619946091644e-06, "loss": 2.5635, "step": 297 }, { "epoch": 0.25426621160409557, "grad_norm": 24.23581886291504, "learning_rate": 7.861635220125787e-06, "loss": 3.0156, "step": 298 }, { "epoch": 0.2551194539249147, "grad_norm": 61.62641906738281, "learning_rate": 7.85265049415993e-06, "loss": 3.0684, "step": 299 }, { "epoch": 0.25597269624573377, "grad_norm": 31.647212982177734, "learning_rate": 7.843665768194072e-06, "loss": 2.6011, "step": 300 }, { "epoch": 0.2568259385665529, "grad_norm": 29.92403793334961, "learning_rate": 7.834681042228212e-06, "loss": 3.0156, "step": 301 }, { "epoch": 0.257679180887372, "grad_norm": 40.79433059692383, "learning_rate": 7.825696316262354e-06, "loss": 3.2021, "step": 302 }, { "epoch": 0.25853242320819114, "grad_norm": 21.312335968017578, "learning_rate": 7.816711590296496e-06, "loss": 2.7988, "step": 303 }, { "epoch": 0.2593856655290102, "grad_norm": 31.760292053222656, "learning_rate": 7.807726864330638e-06, "loss": 3.2148, "step": 304 }, { "epoch": 0.26023890784982934, "grad_norm": 20.41486167907715, "learning_rate": 7.79874213836478e-06, "loss": 2.9932, "step": 305 }, { "epoch": 0.26109215017064846, "grad_norm": 25.601892471313477, "learning_rate": 7.789757412398922e-06, "loss": 2.751, "step": 306 }, { "epoch": 0.2619453924914676, "grad_norm": 24.824922561645508, "learning_rate": 7.780772686433065e-06, "loss": 3.0527, "step": 307 }, { "epoch": 0.2627986348122867, "grad_norm": 41.55419158935547, "learning_rate": 7.771787960467207e-06, "loss": 2.877, "step": 308 }, { "epoch": 0.2636518771331058, "grad_norm": 31.784027099609375, "learning_rate": 7.762803234501349e-06, "loss": 3.3652, "step": 309 }, { "epoch": 0.2645051194539249, "grad_norm": 36.64832305908203, "learning_rate": 7.75381850853549e-06, "loss": 3.2246, "step": 310 }, { "epoch": 0.26535836177474403, "grad_norm": 59.69587707519531, "learning_rate": 7.744833782569631e-06, "loss": 2.8984, "step": 311 }, { "epoch": 0.26621160409556316, "grad_norm": 29.08624267578125, "learning_rate": 7.735849056603775e-06, "loss": 2.8701, "step": 312 }, { "epoch": 0.26706484641638223, "grad_norm": 34.791015625, "learning_rate": 7.726864330637917e-06, "loss": 2.5869, "step": 313 }, { "epoch": 0.26791808873720135, "grad_norm": 55.99431610107422, "learning_rate": 7.717879604672058e-06, "loss": 2.9512, "step": 314 }, { "epoch": 0.2687713310580205, "grad_norm": 30.719738006591797, "learning_rate": 7.7088948787062e-06, "loss": 3.1318, "step": 315 }, { "epoch": 0.2696245733788396, "grad_norm": 24.247756958007812, "learning_rate": 7.699910152740342e-06, "loss": 3.3125, "step": 316 }, { "epoch": 0.27047781569965873, "grad_norm": 19.7833309173584, "learning_rate": 7.690925426774484e-06, "loss": 2.8135, "step": 317 }, { "epoch": 0.2713310580204778, "grad_norm": 50.110103607177734, "learning_rate": 7.681940700808626e-06, "loss": 2.707, "step": 318 }, { "epoch": 0.2721843003412969, "grad_norm": 21.2917423248291, "learning_rate": 7.672955974842768e-06, "loss": 2.5205, "step": 319 }, { "epoch": 0.27303754266211605, "grad_norm": 33.85706329345703, "learning_rate": 7.66397124887691e-06, "loss": 2.7705, "step": 320 }, { "epoch": 0.2738907849829352, "grad_norm": 59.83601760864258, "learning_rate": 7.654986522911052e-06, "loss": 3.1875, "step": 321 }, { "epoch": 0.27474402730375425, "grad_norm": 61.3809928894043, "learning_rate": 7.646001796945194e-06, "loss": 3.1445, "step": 322 }, { "epoch": 0.27559726962457337, "grad_norm": 77.88739776611328, "learning_rate": 7.637017070979336e-06, "loss": 2.9951, "step": 323 }, { "epoch": 0.2764505119453925, "grad_norm": 47.42338180541992, "learning_rate": 7.6280323450134775e-06, "loss": 3.1553, "step": 324 }, { "epoch": 0.2773037542662116, "grad_norm": 31.061063766479492, "learning_rate": 7.61904761904762e-06, "loss": 3.2158, "step": 325 }, { "epoch": 0.2781569965870307, "grad_norm": 19.786115646362305, "learning_rate": 7.6100628930817626e-06, "loss": 2.5645, "step": 326 }, { "epoch": 0.2790102389078498, "grad_norm": 18.45869255065918, "learning_rate": 7.601078167115904e-06, "loss": 2.6465, "step": 327 }, { "epoch": 0.27986348122866894, "grad_norm": 43.45344924926758, "learning_rate": 7.592093441150046e-06, "loss": 2.7793, "step": 328 }, { "epoch": 0.28071672354948807, "grad_norm": 42.01021957397461, "learning_rate": 7.583108715184188e-06, "loss": 2.8398, "step": 329 }, { "epoch": 0.2815699658703072, "grad_norm": 22.967981338500977, "learning_rate": 7.574123989218329e-06, "loss": 3.2949, "step": 330 }, { "epoch": 0.28242320819112626, "grad_norm": 48.885189056396484, "learning_rate": 7.565139263252471e-06, "loss": 2.7598, "step": 331 }, { "epoch": 0.2832764505119454, "grad_norm": 83.62909698486328, "learning_rate": 7.5561545372866126e-06, "loss": 3.6533, "step": 332 }, { "epoch": 0.2841296928327645, "grad_norm": 102.9934310913086, "learning_rate": 7.5471698113207555e-06, "loss": 2.9414, "step": 333 }, { "epoch": 0.28498293515358364, "grad_norm": 22.28573989868164, "learning_rate": 7.538185085354898e-06, "loss": 3.0723, "step": 334 }, { "epoch": 0.2858361774744027, "grad_norm": 20.038768768310547, "learning_rate": 7.52920035938904e-06, "loss": 3.2168, "step": 335 }, { "epoch": 0.28668941979522183, "grad_norm": 17.91118049621582, "learning_rate": 7.520215633423181e-06, "loss": 3.1191, "step": 336 }, { "epoch": 0.28754266211604096, "grad_norm": 23.860708236694336, "learning_rate": 7.511230907457323e-06, "loss": 2.8877, "step": 337 }, { "epoch": 0.2883959044368601, "grad_norm": 25.451940536499023, "learning_rate": 7.502246181491465e-06, "loss": 2.9814, "step": 338 }, { "epoch": 0.28924914675767915, "grad_norm": 44.59007263183594, "learning_rate": 7.493261455525606e-06, "loss": 3.0049, "step": 339 }, { "epoch": 0.2901023890784983, "grad_norm": 14.881490707397461, "learning_rate": 7.484276729559748e-06, "loss": 2.5664, "step": 340 }, { "epoch": 0.2909556313993174, "grad_norm": 44.933406829833984, "learning_rate": 7.475292003593891e-06, "loss": 2.8262, "step": 341 }, { "epoch": 0.29180887372013653, "grad_norm": 44.93138122558594, "learning_rate": 7.4663072776280334e-06, "loss": 2.9414, "step": 342 }, { "epoch": 0.29266211604095566, "grad_norm": 17.614646911621094, "learning_rate": 7.457322551662175e-06, "loss": 2.627, "step": 343 }, { "epoch": 0.2935153583617747, "grad_norm": 34.46635055541992, "learning_rate": 7.448337825696317e-06, "loss": 2.9785, "step": 344 }, { "epoch": 0.29436860068259385, "grad_norm": 65.03298950195312, "learning_rate": 7.439353099730459e-06, "loss": 2.6504, "step": 345 }, { "epoch": 0.295221843003413, "grad_norm": 21.21845245361328, "learning_rate": 7.4303683737646e-06, "loss": 2.8721, "step": 346 }, { "epoch": 0.2960750853242321, "grad_norm": 33.291107177734375, "learning_rate": 7.421383647798742e-06, "loss": 2.6748, "step": 347 }, { "epoch": 0.29692832764505117, "grad_norm": 18.167999267578125, "learning_rate": 7.412398921832885e-06, "loss": 2.6123, "step": 348 }, { "epoch": 0.2977815699658703, "grad_norm": 41.19535446166992, "learning_rate": 7.403414195867027e-06, "loss": 3.4941, "step": 349 }, { "epoch": 0.2986348122866894, "grad_norm": 28.60065269470215, "learning_rate": 7.3944294699011685e-06, "loss": 3.0127, "step": 350 }, { "epoch": 0.29948805460750855, "grad_norm": 28.26238441467285, "learning_rate": 7.3854447439353106e-06, "loss": 2.791, "step": 351 }, { "epoch": 0.3003412969283277, "grad_norm": 31.91538429260254, "learning_rate": 7.376460017969453e-06, "loss": 2.7793, "step": 352 }, { "epoch": 0.30119453924914674, "grad_norm": 19.526973724365234, "learning_rate": 7.367475292003594e-06, "loss": 2.8916, "step": 353 }, { "epoch": 0.30204778156996587, "grad_norm": 39.5968132019043, "learning_rate": 7.358490566037736e-06, "loss": 2.6309, "step": 354 }, { "epoch": 0.302901023890785, "grad_norm": 53.58340072631836, "learning_rate": 7.349505840071879e-06, "loss": 2.9912, "step": 355 }, { "epoch": 0.3037542662116041, "grad_norm": 38.25190353393555, "learning_rate": 7.340521114106021e-06, "loss": 2.7041, "step": 356 }, { "epoch": 0.3046075085324232, "grad_norm": 15.885865211486816, "learning_rate": 7.331536388140162e-06, "loss": 3.0859, "step": 357 }, { "epoch": 0.3054607508532423, "grad_norm": 19.57723617553711, "learning_rate": 7.322551662174304e-06, "loss": 3.0664, "step": 358 }, { "epoch": 0.30631399317406144, "grad_norm": 63.955474853515625, "learning_rate": 7.313566936208446e-06, "loss": 2.7041, "step": 359 }, { "epoch": 0.30716723549488056, "grad_norm": 32.2467155456543, "learning_rate": 7.304582210242588e-06, "loss": 3.2461, "step": 360 }, { "epoch": 0.30802047781569963, "grad_norm": 30.897621154785156, "learning_rate": 7.29559748427673e-06, "loss": 3.4238, "step": 361 }, { "epoch": 0.30887372013651876, "grad_norm": 18.033702850341797, "learning_rate": 7.286612758310873e-06, "loss": 3.0957, "step": 362 }, { "epoch": 0.3097269624573379, "grad_norm": 41.53689956665039, "learning_rate": 7.277628032345015e-06, "loss": 2.918, "step": 363 }, { "epoch": 0.310580204778157, "grad_norm": 43.60328674316406, "learning_rate": 7.268643306379156e-06, "loss": 3.0557, "step": 364 }, { "epoch": 0.31143344709897613, "grad_norm": 55.88149642944336, "learning_rate": 7.259658580413298e-06, "loss": 2.7529, "step": 365 }, { "epoch": 0.3122866894197952, "grad_norm": 46.07794952392578, "learning_rate": 7.250673854447439e-06, "loss": 2.835, "step": 366 }, { "epoch": 0.31313993174061433, "grad_norm": 19.480363845825195, "learning_rate": 7.2416891284815814e-06, "loss": 2.4336, "step": 367 }, { "epoch": 0.31399317406143346, "grad_norm": 78.5523681640625, "learning_rate": 7.2327044025157235e-06, "loss": 2.9111, "step": 368 }, { "epoch": 0.3148464163822526, "grad_norm": 56.7245979309082, "learning_rate": 7.2237196765498665e-06, "loss": 2.7549, "step": 369 }, { "epoch": 0.31569965870307165, "grad_norm": 94.528076171875, "learning_rate": 7.214734950584008e-06, "loss": 2.9033, "step": 370 }, { "epoch": 0.3165529010238908, "grad_norm": 62.33586883544922, "learning_rate": 7.20575022461815e-06, "loss": 3.4121, "step": 371 }, { "epoch": 0.3174061433447099, "grad_norm": 19.558395385742188, "learning_rate": 7.196765498652292e-06, "loss": 2.9961, "step": 372 }, { "epoch": 0.318259385665529, "grad_norm": 41.45348358154297, "learning_rate": 7.187780772686433e-06, "loss": 3.1143, "step": 373 }, { "epoch": 0.3191126279863481, "grad_norm": 42.826805114746094, "learning_rate": 7.178796046720575e-06, "loss": 3.0479, "step": 374 }, { "epoch": 0.3199658703071672, "grad_norm": 60.3271598815918, "learning_rate": 7.169811320754717e-06, "loss": 3.6201, "step": 375 }, { "epoch": 0.32081911262798635, "grad_norm": 15.749074935913086, "learning_rate": 7.16082659478886e-06, "loss": 3.1504, "step": 376 }, { "epoch": 0.3216723549488055, "grad_norm": 28.352935791015625, "learning_rate": 7.1518418688230015e-06, "loss": 2.6055, "step": 377 }, { "epoch": 0.3225255972696246, "grad_norm": 22.02720069885254, "learning_rate": 7.1428571428571436e-06, "loss": 2.6562, "step": 378 }, { "epoch": 0.32337883959044367, "grad_norm": 32.356258392333984, "learning_rate": 7.133872416891286e-06, "loss": 2.502, "step": 379 }, { "epoch": 0.3242320819112628, "grad_norm": 55.318992614746094, "learning_rate": 7.124887690925427e-06, "loss": 3.8184, "step": 380 }, { "epoch": 0.3250853242320819, "grad_norm": 13.6019926071167, "learning_rate": 7.115902964959569e-06, "loss": 2.7568, "step": 381 }, { "epoch": 0.32593856655290104, "grad_norm": 30.500629425048828, "learning_rate": 7.106918238993711e-06, "loss": 2.5703, "step": 382 }, { "epoch": 0.3267918088737201, "grad_norm": 19.479543685913086, "learning_rate": 7.097933513027854e-06, "loss": 3.1357, "step": 383 }, { "epoch": 0.32764505119453924, "grad_norm": 14.30429744720459, "learning_rate": 7.088948787061995e-06, "loss": 2.4258, "step": 384 }, { "epoch": 0.32849829351535836, "grad_norm": 46.64712142944336, "learning_rate": 7.079964061096137e-06, "loss": 2.4863, "step": 385 }, { "epoch": 0.3293515358361775, "grad_norm": 14.86281681060791, "learning_rate": 7.0709793351302794e-06, "loss": 2.5391, "step": 386 }, { "epoch": 0.3302047781569966, "grad_norm": 24.936386108398438, "learning_rate": 7.061994609164421e-06, "loss": 3.0732, "step": 387 }, { "epoch": 0.3310580204778157, "grad_norm": 23.111101150512695, "learning_rate": 7.053009883198563e-06, "loss": 2.9229, "step": 388 }, { "epoch": 0.3319112627986348, "grad_norm": 28.36639976501465, "learning_rate": 7.044025157232705e-06, "loss": 3.0352, "step": 389 }, { "epoch": 0.33276450511945393, "grad_norm": 22.827180862426758, "learning_rate": 7.035040431266848e-06, "loss": 3.0488, "step": 390 }, { "epoch": 0.33361774744027306, "grad_norm": 24.18820571899414, "learning_rate": 7.026055705300989e-06, "loss": 2.1445, "step": 391 }, { "epoch": 0.33447098976109213, "grad_norm": 38.80826950073242, "learning_rate": 7.017070979335131e-06, "loss": 3.0645, "step": 392 }, { "epoch": 0.33532423208191126, "grad_norm": 39.28004455566406, "learning_rate": 7.008086253369273e-06, "loss": 2.6309, "step": 393 }, { "epoch": 0.3361774744027304, "grad_norm": 16.985010147094727, "learning_rate": 6.9991015274034144e-06, "loss": 2.792, "step": 394 }, { "epoch": 0.3370307167235495, "grad_norm": 18.20982551574707, "learning_rate": 6.9901168014375565e-06, "loss": 3.1348, "step": 395 }, { "epoch": 0.3378839590443686, "grad_norm": 23.503843307495117, "learning_rate": 6.981132075471699e-06, "loss": 2.4434, "step": 396 }, { "epoch": 0.3387372013651877, "grad_norm": 69.62710571289062, "learning_rate": 6.9721473495058416e-06, "loss": 2.8184, "step": 397 }, { "epoch": 0.3395904436860068, "grad_norm": 47.18648910522461, "learning_rate": 6.963162623539983e-06, "loss": 2.9189, "step": 398 }, { "epoch": 0.34044368600682595, "grad_norm": 40.54623794555664, "learning_rate": 6.954177897574125e-06, "loss": 2.7744, "step": 399 }, { "epoch": 0.3412969283276451, "grad_norm": 26.348918914794922, "learning_rate": 6.945193171608267e-06, "loss": 2.7061, "step": 400 }, { "epoch": 0.34215017064846415, "grad_norm": 26.754854202270508, "learning_rate": 6.936208445642408e-06, "loss": 2.8203, "step": 401 }, { "epoch": 0.3430034129692833, "grad_norm": 23.61028289794922, "learning_rate": 6.92722371967655e-06, "loss": 2.502, "step": 402 }, { "epoch": 0.3438566552901024, "grad_norm": 51.461273193359375, "learning_rate": 6.9182389937106915e-06, "loss": 3.4434, "step": 403 }, { "epoch": 0.3447098976109215, "grad_norm": 61.8600959777832, "learning_rate": 6.9092542677448345e-06, "loss": 3.0664, "step": 404 }, { "epoch": 0.3455631399317406, "grad_norm": 36.4835319519043, "learning_rate": 6.9002695417789766e-06, "loss": 2.5781, "step": 405 }, { "epoch": 0.3464163822525597, "grad_norm": 20.035572052001953, "learning_rate": 6.891284815813119e-06, "loss": 2.7451, "step": 406 }, { "epoch": 0.34726962457337884, "grad_norm": 23.01044273376465, "learning_rate": 6.88230008984726e-06, "loss": 2.8359, "step": 407 }, { "epoch": 0.34812286689419797, "grad_norm": 34.898773193359375, "learning_rate": 6.873315363881402e-06, "loss": 2.8604, "step": 408 }, { "epoch": 0.34897610921501704, "grad_norm": 17.49709701538086, "learning_rate": 6.864330637915544e-06, "loss": 2.75, "step": 409 }, { "epoch": 0.34982935153583616, "grad_norm": 32.71485900878906, "learning_rate": 6.855345911949685e-06, "loss": 3.1094, "step": 410 }, { "epoch": 0.3506825938566553, "grad_norm": 19.570741653442383, "learning_rate": 6.846361185983828e-06, "loss": 2.9473, "step": 411 }, { "epoch": 0.3515358361774744, "grad_norm": 29.854347229003906, "learning_rate": 6.83737646001797e-06, "loss": 2.8809, "step": 412 }, { "epoch": 0.35238907849829354, "grad_norm": 43.52353286743164, "learning_rate": 6.8283917340521124e-06, "loss": 2.8164, "step": 413 }, { "epoch": 0.3532423208191126, "grad_norm": 26.43115997314453, "learning_rate": 6.819407008086254e-06, "loss": 2.4014, "step": 414 }, { "epoch": 0.35409556313993173, "grad_norm": 25.89423370361328, "learning_rate": 6.810422282120396e-06, "loss": 2.6406, "step": 415 }, { "epoch": 0.35494880546075086, "grad_norm": 35.70558166503906, "learning_rate": 6.801437556154538e-06, "loss": 2.4619, "step": 416 }, { "epoch": 0.35580204778157, "grad_norm": 33.59202194213867, "learning_rate": 6.792452830188679e-06, "loss": 2.8213, "step": 417 }, { "epoch": 0.35665529010238906, "grad_norm": 25.010950088500977, "learning_rate": 6.783468104222821e-06, "loss": 2.8096, "step": 418 }, { "epoch": 0.3575085324232082, "grad_norm": 23.81590461730957, "learning_rate": 6.774483378256964e-06, "loss": 2.3193, "step": 419 }, { "epoch": 0.3583617747440273, "grad_norm": 42.35072708129883, "learning_rate": 6.765498652291106e-06, "loss": 2.6592, "step": 420 }, { "epoch": 0.35921501706484643, "grad_norm": 40.074851989746094, "learning_rate": 6.7565139263252475e-06, "loss": 2.6318, "step": 421 }, { "epoch": 0.36006825938566556, "grad_norm": 32.20216751098633, "learning_rate": 6.7475292003593895e-06, "loss": 3.2891, "step": 422 }, { "epoch": 0.3609215017064846, "grad_norm": 37.29304122924805, "learning_rate": 6.738544474393532e-06, "loss": 2.9023, "step": 423 }, { "epoch": 0.36177474402730375, "grad_norm": 32.8192024230957, "learning_rate": 6.729559748427673e-06, "loss": 2.9736, "step": 424 }, { "epoch": 0.3626279863481229, "grad_norm": 23.262601852416992, "learning_rate": 6.720575022461815e-06, "loss": 2.4893, "step": 425 }, { "epoch": 0.363481228668942, "grad_norm": 40.62841796875, "learning_rate": 6.711590296495958e-06, "loss": 2.9531, "step": 426 }, { "epoch": 0.3643344709897611, "grad_norm": 30.518091201782227, "learning_rate": 6.7026055705301e-06, "loss": 2.4004, "step": 427 }, { "epoch": 0.3651877133105802, "grad_norm": 28.2186336517334, "learning_rate": 6.693620844564241e-06, "loss": 2.8008, "step": 428 }, { "epoch": 0.3660409556313993, "grad_norm": 14.801209449768066, "learning_rate": 6.684636118598383e-06, "loss": 2.4902, "step": 429 }, { "epoch": 0.36689419795221845, "grad_norm": 19.97643280029297, "learning_rate": 6.675651392632525e-06, "loss": 2.6104, "step": 430 }, { "epoch": 0.3677474402730375, "grad_norm": 33.75346374511719, "learning_rate": 6.666666666666667e-06, "loss": 2.6221, "step": 431 }, { "epoch": 0.36860068259385664, "grad_norm": 31.13344383239746, "learning_rate": 6.657681940700809e-06, "loss": 2.9883, "step": 432 }, { "epoch": 0.36945392491467577, "grad_norm": 24.86776351928711, "learning_rate": 6.648697214734952e-06, "loss": 3.3652, "step": 433 }, { "epoch": 0.3703071672354949, "grad_norm": 55.863922119140625, "learning_rate": 6.639712488769094e-06, "loss": 2.8779, "step": 434 }, { "epoch": 0.371160409556314, "grad_norm": 14.440893173217773, "learning_rate": 6.630727762803235e-06, "loss": 2.8555, "step": 435 }, { "epoch": 0.3720136518771331, "grad_norm": 16.983476638793945, "learning_rate": 6.621743036837377e-06, "loss": 3.1016, "step": 436 }, { "epoch": 0.3728668941979522, "grad_norm": 17.240015029907227, "learning_rate": 6.612758310871519e-06, "loss": 2.8311, "step": 437 }, { "epoch": 0.37372013651877134, "grad_norm": 18.871740341186523, "learning_rate": 6.60377358490566e-06, "loss": 2.8125, "step": 438 }, { "epoch": 0.37457337883959047, "grad_norm": 36.629554748535156, "learning_rate": 6.5947888589398025e-06, "loss": 2.7119, "step": 439 }, { "epoch": 0.37542662116040953, "grad_norm": 22.077198028564453, "learning_rate": 6.5858041329739454e-06, "loss": 2.5117, "step": 440 }, { "epoch": 0.37627986348122866, "grad_norm": 36.363304138183594, "learning_rate": 6.576819407008087e-06, "loss": 3.335, "step": 441 }, { "epoch": 0.3771331058020478, "grad_norm": 17.072507858276367, "learning_rate": 6.567834681042229e-06, "loss": 2.8789, "step": 442 }, { "epoch": 0.3779863481228669, "grad_norm": 30.818571090698242, "learning_rate": 6.558849955076371e-06, "loss": 2.8848, "step": 443 }, { "epoch": 0.378839590443686, "grad_norm": 35.21898651123047, "learning_rate": 6.549865229110512e-06, "loss": 2.6309, "step": 444 }, { "epoch": 0.3796928327645051, "grad_norm": 24.29189109802246, "learning_rate": 6.540880503144654e-06, "loss": 2.8311, "step": 445 }, { "epoch": 0.38054607508532423, "grad_norm": 27.211631774902344, "learning_rate": 6.531895777178796e-06, "loss": 2.7578, "step": 446 }, { "epoch": 0.38139931740614336, "grad_norm": 20.377609252929688, "learning_rate": 6.522911051212939e-06, "loss": 2.5107, "step": 447 }, { "epoch": 0.3822525597269625, "grad_norm": 31.11419677734375, "learning_rate": 6.5139263252470805e-06, "loss": 3.0205, "step": 448 }, { "epoch": 0.38310580204778155, "grad_norm": 27.811227798461914, "learning_rate": 6.5049415992812226e-06, "loss": 2.5117, "step": 449 }, { "epoch": 0.3839590443686007, "grad_norm": 27.51049041748047, "learning_rate": 6.495956873315365e-06, "loss": 2.748, "step": 450 }, { "epoch": 0.3848122866894198, "grad_norm": 23.872310638427734, "learning_rate": 6.486972147349506e-06, "loss": 3.1309, "step": 451 }, { "epoch": 0.3856655290102389, "grad_norm": 25.639631271362305, "learning_rate": 6.477987421383648e-06, "loss": 3.0293, "step": 452 }, { "epoch": 0.386518771331058, "grad_norm": 78.10359954833984, "learning_rate": 6.46900269541779e-06, "loss": 2.7129, "step": 453 }, { "epoch": 0.3873720136518771, "grad_norm": 33.32661056518555, "learning_rate": 6.460017969451933e-06, "loss": 3.0215, "step": 454 }, { "epoch": 0.38822525597269625, "grad_norm": 18.216999053955078, "learning_rate": 6.451033243486074e-06, "loss": 2.6982, "step": 455 }, { "epoch": 0.3890784982935154, "grad_norm": 15.191929817199707, "learning_rate": 6.442048517520216e-06, "loss": 2.6016, "step": 456 }, { "epoch": 0.38993174061433444, "grad_norm": 20.197877883911133, "learning_rate": 6.433063791554358e-06, "loss": 2.791, "step": 457 }, { "epoch": 0.39078498293515357, "grad_norm": 20.7491397857666, "learning_rate": 6.4240790655885e-06, "loss": 2.3955, "step": 458 }, { "epoch": 0.3916382252559727, "grad_norm": 14.716793060302734, "learning_rate": 6.415094339622642e-06, "loss": 2.6826, "step": 459 }, { "epoch": 0.3924914675767918, "grad_norm": 23.43107032775879, "learning_rate": 6.406109613656784e-06, "loss": 3.1709, "step": 460 }, { "epoch": 0.39334470989761094, "grad_norm": 17.54826545715332, "learning_rate": 6.397124887690927e-06, "loss": 2.3867, "step": 461 }, { "epoch": 0.39419795221843, "grad_norm": 83.90989685058594, "learning_rate": 6.388140161725068e-06, "loss": 2.9434, "step": 462 }, { "epoch": 0.39505119453924914, "grad_norm": 69.95578002929688, "learning_rate": 6.37915543575921e-06, "loss": 2.5518, "step": 463 }, { "epoch": 0.39590443686006827, "grad_norm": 53.826499938964844, "learning_rate": 6.370170709793352e-06, "loss": 2.4453, "step": 464 }, { "epoch": 0.3967576791808874, "grad_norm": 18.601577758789062, "learning_rate": 6.3611859838274934e-06, "loss": 2.9531, "step": 465 }, { "epoch": 0.39761092150170646, "grad_norm": 34.0197868347168, "learning_rate": 6.3522012578616355e-06, "loss": 2.4746, "step": 466 }, { "epoch": 0.3984641638225256, "grad_norm": 44.096134185791016, "learning_rate": 6.343216531895778e-06, "loss": 2.8955, "step": 467 }, { "epoch": 0.3993174061433447, "grad_norm": 24.102462768554688, "learning_rate": 6.3342318059299205e-06, "loss": 2.792, "step": 468 }, { "epoch": 0.40017064846416384, "grad_norm": 17.850786209106445, "learning_rate": 6.325247079964062e-06, "loss": 2.3228, "step": 469 }, { "epoch": 0.40102389078498296, "grad_norm": 21.928977966308594, "learning_rate": 6.316262353998204e-06, "loss": 2.6504, "step": 470 }, { "epoch": 0.40187713310580203, "grad_norm": 25.40172004699707, "learning_rate": 6.307277628032346e-06, "loss": 2.6084, "step": 471 }, { "epoch": 0.40273037542662116, "grad_norm": 16.1717586517334, "learning_rate": 6.298292902066487e-06, "loss": 2.0674, "step": 472 }, { "epoch": 0.4035836177474403, "grad_norm": 29.402706146240234, "learning_rate": 6.289308176100629e-06, "loss": 2.916, "step": 473 }, { "epoch": 0.4044368600682594, "grad_norm": 23.10331153869629, "learning_rate": 6.2803234501347705e-06, "loss": 2.6895, "step": 474 }, { "epoch": 0.4052901023890785, "grad_norm": 23.997806549072266, "learning_rate": 6.2713387241689135e-06, "loss": 2.6953, "step": 475 }, { "epoch": 0.4061433447098976, "grad_norm": 39.14799118041992, "learning_rate": 6.2623539982030556e-06, "loss": 2.625, "step": 476 }, { "epoch": 0.4069965870307167, "grad_norm": 29.97382164001465, "learning_rate": 6.253369272237198e-06, "loss": 3.2129, "step": 477 }, { "epoch": 0.40784982935153585, "grad_norm": 17.89264678955078, "learning_rate": 6.244384546271339e-06, "loss": 2.6367, "step": 478 }, { "epoch": 0.4087030716723549, "grad_norm": 40.39899444580078, "learning_rate": 6.235399820305481e-06, "loss": 2.8965, "step": 479 }, { "epoch": 0.40955631399317405, "grad_norm": 48.41572952270508, "learning_rate": 6.226415094339623e-06, "loss": 3.0703, "step": 480 }, { "epoch": 0.4104095563139932, "grad_norm": 31.183040618896484, "learning_rate": 6.217430368373764e-06, "loss": 2.1592, "step": 481 }, { "epoch": 0.4112627986348123, "grad_norm": 36.558773040771484, "learning_rate": 6.208445642407907e-06, "loss": 2.3945, "step": 482 }, { "epoch": 0.4121160409556314, "grad_norm": 19.117509841918945, "learning_rate": 6.199460916442049e-06, "loss": 3.0605, "step": 483 }, { "epoch": 0.4129692832764505, "grad_norm": 48.38492965698242, "learning_rate": 6.1904761904761914e-06, "loss": 2.291, "step": 484 }, { "epoch": 0.4138225255972696, "grad_norm": 21.41737937927246, "learning_rate": 6.181491464510333e-06, "loss": 2.8252, "step": 485 }, { "epoch": 0.41467576791808874, "grad_norm": 42.10402297973633, "learning_rate": 6.172506738544475e-06, "loss": 3.0918, "step": 486 }, { "epoch": 0.41552901023890787, "grad_norm": 46.72148895263672, "learning_rate": 6.163522012578617e-06, "loss": 2.6162, "step": 487 }, { "epoch": 0.41638225255972694, "grad_norm": 21.43707847595215, "learning_rate": 6.154537286612758e-06, "loss": 2.2754, "step": 488 }, { "epoch": 0.41723549488054607, "grad_norm": 16.188798904418945, "learning_rate": 6.145552560646901e-06, "loss": 2.5078, "step": 489 }, { "epoch": 0.4180887372013652, "grad_norm": 19.047313690185547, "learning_rate": 6.136567834681043e-06, "loss": 2.9209, "step": 490 }, { "epoch": 0.4189419795221843, "grad_norm": 36.833744049072266, "learning_rate": 6.127583108715185e-06, "loss": 2.5791, "step": 491 }, { "epoch": 0.4197952218430034, "grad_norm": 19.706417083740234, "learning_rate": 6.1185983827493264e-06, "loss": 2.9268, "step": 492 }, { "epoch": 0.4206484641638225, "grad_norm": 33.76554870605469, "learning_rate": 6.1096136567834685e-06, "loss": 2.4229, "step": 493 }, { "epoch": 0.42150170648464164, "grad_norm": 35.278297424316406, "learning_rate": 6.100628930817611e-06, "loss": 2.7832, "step": 494 }, { "epoch": 0.42235494880546076, "grad_norm": 25.52465057373047, "learning_rate": 6.091644204851752e-06, "loss": 2.8623, "step": 495 }, { "epoch": 0.4232081911262799, "grad_norm": 50.70538330078125, "learning_rate": 6.082659478885895e-06, "loss": 2.7324, "step": 496 }, { "epoch": 0.42406143344709896, "grad_norm": 33.23221206665039, "learning_rate": 6.073674752920037e-06, "loss": 2.8828, "step": 497 }, { "epoch": 0.4249146757679181, "grad_norm": 44.4984245300293, "learning_rate": 6.064690026954179e-06, "loss": 2.7217, "step": 498 }, { "epoch": 0.4257679180887372, "grad_norm": 40.04144287109375, "learning_rate": 6.05570530098832e-06, "loss": 2.4414, "step": 499 }, { "epoch": 0.42662116040955633, "grad_norm": 31.808074951171875, "learning_rate": 6.046720575022462e-06, "loss": 2.5635, "step": 500 }, { "epoch": 0.4274744027303754, "grad_norm": 27.19110679626465, "learning_rate": 6.037735849056604e-06, "loss": 2.6348, "step": 501 }, { "epoch": 0.4283276450511945, "grad_norm": 52.006351470947266, "learning_rate": 6.028751123090746e-06, "loss": 3.4004, "step": 502 }, { "epoch": 0.42918088737201365, "grad_norm": 49.40862274169922, "learning_rate": 6.019766397124888e-06, "loss": 2.6865, "step": 503 }, { "epoch": 0.4300341296928328, "grad_norm": 17.69283676147461, "learning_rate": 6.010781671159031e-06, "loss": 2.4824, "step": 504 }, { "epoch": 0.4308873720136519, "grad_norm": 20.699617385864258, "learning_rate": 6.001796945193173e-06, "loss": 3.0488, "step": 505 }, { "epoch": 0.431740614334471, "grad_norm": 56.287269592285156, "learning_rate": 5.992812219227314e-06, "loss": 3.0703, "step": 506 }, { "epoch": 0.4325938566552901, "grad_norm": 19.926307678222656, "learning_rate": 5.983827493261456e-06, "loss": 2.7871, "step": 507 }, { "epoch": 0.4334470989761092, "grad_norm": 33.74576187133789, "learning_rate": 5.974842767295598e-06, "loss": 2.9238, "step": 508 }, { "epoch": 0.43430034129692835, "grad_norm": 27.027666091918945, "learning_rate": 5.965858041329739e-06, "loss": 2.2666, "step": 509 }, { "epoch": 0.4351535836177474, "grad_norm": 49.87267303466797, "learning_rate": 5.9568733153638815e-06, "loss": 2.4355, "step": 510 }, { "epoch": 0.43600682593856654, "grad_norm": 33.75191879272461, "learning_rate": 5.9478885893980244e-06, "loss": 2.1299, "step": 511 }, { "epoch": 0.43686006825938567, "grad_norm": 23.760793685913086, "learning_rate": 5.938903863432166e-06, "loss": 2.9541, "step": 512 }, { "epoch": 0.4377133105802048, "grad_norm": 19.85642433166504, "learning_rate": 5.929919137466308e-06, "loss": 2.9023, "step": 513 }, { "epoch": 0.43856655290102387, "grad_norm": 42.32032775878906, "learning_rate": 5.92093441150045e-06, "loss": 2.8652, "step": 514 }, { "epoch": 0.439419795221843, "grad_norm": 32.215065002441406, "learning_rate": 5.911949685534591e-06, "loss": 3.1729, "step": 515 }, { "epoch": 0.4402730375426621, "grad_norm": 31.592498779296875, "learning_rate": 5.902964959568733e-06, "loss": 2.5439, "step": 516 }, { "epoch": 0.44112627986348124, "grad_norm": 26.448612213134766, "learning_rate": 5.893980233602875e-06, "loss": 2.3545, "step": 517 }, { "epoch": 0.44197952218430037, "grad_norm": 49.79834747314453, "learning_rate": 5.884995507637018e-06, "loss": 2.8105, "step": 518 }, { "epoch": 0.44283276450511944, "grad_norm": 45.154701232910156, "learning_rate": 5.8760107816711595e-06, "loss": 2.9541, "step": 519 }, { "epoch": 0.44368600682593856, "grad_norm": 41.03085708618164, "learning_rate": 5.8670260557053015e-06, "loss": 2.8398, "step": 520 }, { "epoch": 0.4445392491467577, "grad_norm": 21.58003807067871, "learning_rate": 5.858041329739444e-06, "loss": 2.7559, "step": 521 }, { "epoch": 0.4453924914675768, "grad_norm": 21.989830017089844, "learning_rate": 5.849056603773585e-06, "loss": 3.0488, "step": 522 }, { "epoch": 0.4462457337883959, "grad_norm": 14.759679794311523, "learning_rate": 5.840071877807727e-06, "loss": 2.9902, "step": 523 }, { "epoch": 0.447098976109215, "grad_norm": 14.680983543395996, "learning_rate": 5.831087151841869e-06, "loss": 2.4238, "step": 524 }, { "epoch": 0.44795221843003413, "grad_norm": 20.452116012573242, "learning_rate": 5.822102425876012e-06, "loss": 2.7881, "step": 525 }, { "epoch": 0.44880546075085326, "grad_norm": 15.639450073242188, "learning_rate": 5.813117699910153e-06, "loss": 2.6191, "step": 526 }, { "epoch": 0.4496587030716723, "grad_norm": 38.403995513916016, "learning_rate": 5.804132973944295e-06, "loss": 2.6895, "step": 527 }, { "epoch": 0.45051194539249145, "grad_norm": 14.627884864807129, "learning_rate": 5.795148247978437e-06, "loss": 2.7598, "step": 528 }, { "epoch": 0.4513651877133106, "grad_norm": 17.250015258789062, "learning_rate": 5.786163522012579e-06, "loss": 2.5088, "step": 529 }, { "epoch": 0.4522184300341297, "grad_norm": 36.893882751464844, "learning_rate": 5.777178796046721e-06, "loss": 2.6611, "step": 530 }, { "epoch": 0.45307167235494883, "grad_norm": 29.592458724975586, "learning_rate": 5.768194070080863e-06, "loss": 2.3877, "step": 531 }, { "epoch": 0.4539249146757679, "grad_norm": 29.255516052246094, "learning_rate": 5.759209344115006e-06, "loss": 3.1191, "step": 532 }, { "epoch": 0.454778156996587, "grad_norm": 27.445293426513672, "learning_rate": 5.750224618149147e-06, "loss": 2.4629, "step": 533 }, { "epoch": 0.45563139931740615, "grad_norm": 20.05036735534668, "learning_rate": 5.741239892183289e-06, "loss": 2.5107, "step": 534 }, { "epoch": 0.4564846416382253, "grad_norm": 16.583898544311523, "learning_rate": 5.732255166217431e-06, "loss": 3.3027, "step": 535 }, { "epoch": 0.45733788395904434, "grad_norm": 34.19240188598633, "learning_rate": 5.723270440251572e-06, "loss": 2.3574, "step": 536 }, { "epoch": 0.45819112627986347, "grad_norm": 29.18450164794922, "learning_rate": 5.7142857142857145e-06, "loss": 2.3818, "step": 537 }, { "epoch": 0.4590443686006826, "grad_norm": 26.387821197509766, "learning_rate": 5.705300988319857e-06, "loss": 2.709, "step": 538 }, { "epoch": 0.4598976109215017, "grad_norm": 19.58378028869629, "learning_rate": 5.6963162623539995e-06, "loss": 2.6631, "step": 539 }, { "epoch": 0.46075085324232085, "grad_norm": 25.05061912536621, "learning_rate": 5.687331536388141e-06, "loss": 2.6152, "step": 540 }, { "epoch": 0.4616040955631399, "grad_norm": 33.281044006347656, "learning_rate": 5.678346810422283e-06, "loss": 2.6494, "step": 541 }, { "epoch": 0.46245733788395904, "grad_norm": 15.196967124938965, "learning_rate": 5.669362084456425e-06, "loss": 2.9629, "step": 542 }, { "epoch": 0.46331058020477817, "grad_norm": 48.015869140625, "learning_rate": 5.660377358490566e-06, "loss": 2.7188, "step": 543 }, { "epoch": 0.4641638225255973, "grad_norm": 40.080692291259766, "learning_rate": 5.651392632524708e-06, "loss": 3.4395, "step": 544 }, { "epoch": 0.46501706484641636, "grad_norm": 14.495575904846191, "learning_rate": 5.64240790655885e-06, "loss": 2.5498, "step": 545 }, { "epoch": 0.4658703071672355, "grad_norm": 20.11421775817871, "learning_rate": 5.6334231805929925e-06, "loss": 2.4463, "step": 546 }, { "epoch": 0.4667235494880546, "grad_norm": 22.752029418945312, "learning_rate": 5.6244384546271346e-06, "loss": 2.376, "step": 547 }, { "epoch": 0.46757679180887374, "grad_norm": 31.627084732055664, "learning_rate": 5.615453728661277e-06, "loss": 2.3809, "step": 548 }, { "epoch": 0.4684300341296928, "grad_norm": 45.1749382019043, "learning_rate": 5.606469002695418e-06, "loss": 2.6025, "step": 549 }, { "epoch": 0.46928327645051193, "grad_norm": 14.6268892288208, "learning_rate": 5.59748427672956e-06, "loss": 2.3457, "step": 550 }, { "epoch": 0.47013651877133106, "grad_norm": 22.34581756591797, "learning_rate": 5.588499550763702e-06, "loss": 2.9863, "step": 551 }, { "epoch": 0.4709897610921502, "grad_norm": 30.1942195892334, "learning_rate": 5.579514824797843e-06, "loss": 3.0391, "step": 552 }, { "epoch": 0.4718430034129693, "grad_norm": 44.00593566894531, "learning_rate": 5.570530098831986e-06, "loss": 3.0767, "step": 553 }, { "epoch": 0.4726962457337884, "grad_norm": 20.9268798828125, "learning_rate": 5.561545372866128e-06, "loss": 2.3945, "step": 554 }, { "epoch": 0.4735494880546075, "grad_norm": 17.279001235961914, "learning_rate": 5.55256064690027e-06, "loss": 2.2637, "step": 555 }, { "epoch": 0.47440273037542663, "grad_norm": 35.23509979248047, "learning_rate": 5.543575920934412e-06, "loss": 2.5107, "step": 556 }, { "epoch": 0.47525597269624575, "grad_norm": 26.05479621887207, "learning_rate": 5.534591194968554e-06, "loss": 2.457, "step": 557 }, { "epoch": 0.4761092150170648, "grad_norm": 50.16437530517578, "learning_rate": 5.525606469002696e-06, "loss": 2.9473, "step": 558 }, { "epoch": 0.47696245733788395, "grad_norm": 27.86246109008789, "learning_rate": 5.516621743036837e-06, "loss": 2.6533, "step": 559 }, { "epoch": 0.4778156996587031, "grad_norm": 43.706398010253906, "learning_rate": 5.50763701707098e-06, "loss": 2.8643, "step": 560 }, { "epoch": 0.4786689419795222, "grad_norm": 41.53769302368164, "learning_rate": 5.498652291105122e-06, "loss": 2.7725, "step": 561 }, { "epoch": 0.47952218430034127, "grad_norm": 46.46355438232422, "learning_rate": 5.489667565139264e-06, "loss": 3.3379, "step": 562 }, { "epoch": 0.4803754266211604, "grad_norm": 20.3262882232666, "learning_rate": 5.4806828391734054e-06, "loss": 2.7764, "step": 563 }, { "epoch": 0.4812286689419795, "grad_norm": 25.64153480529785, "learning_rate": 5.4716981132075475e-06, "loss": 2.6289, "step": 564 }, { "epoch": 0.48208191126279865, "grad_norm": 23.21479606628418, "learning_rate": 5.46271338724169e-06, "loss": 2.793, "step": 565 }, { "epoch": 0.48293515358361777, "grad_norm": 25.748003005981445, "learning_rate": 5.453728661275831e-06, "loss": 2.5664, "step": 566 }, { "epoch": 0.48378839590443684, "grad_norm": 15.132332801818848, "learning_rate": 5.444743935309974e-06, "loss": 2.5977, "step": 567 }, { "epoch": 0.48464163822525597, "grad_norm": 22.88768768310547, "learning_rate": 5.435759209344116e-06, "loss": 2.4688, "step": 568 }, { "epoch": 0.4854948805460751, "grad_norm": 40.691104888916016, "learning_rate": 5.426774483378258e-06, "loss": 2.627, "step": 569 }, { "epoch": 0.4863481228668942, "grad_norm": 19.89053726196289, "learning_rate": 5.417789757412399e-06, "loss": 2.6426, "step": 570 }, { "epoch": 0.4872013651877133, "grad_norm": 52.61355209350586, "learning_rate": 5.408805031446541e-06, "loss": 3.0352, "step": 571 }, { "epoch": 0.4880546075085324, "grad_norm": 25.755590438842773, "learning_rate": 5.399820305480683e-06, "loss": 2.8721, "step": 572 }, { "epoch": 0.48890784982935154, "grad_norm": 49.290321350097656, "learning_rate": 5.390835579514825e-06, "loss": 2.9473, "step": 573 }, { "epoch": 0.48976109215017066, "grad_norm": 17.155630111694336, "learning_rate": 5.3818508535489676e-06, "loss": 2.6191, "step": 574 }, { "epoch": 0.4906143344709898, "grad_norm": 20.74138641357422, "learning_rate": 5.37286612758311e-06, "loss": 2.5684, "step": 575 }, { "epoch": 0.49146757679180886, "grad_norm": 38.56920623779297, "learning_rate": 5.363881401617252e-06, "loss": 2.6221, "step": 576 }, { "epoch": 0.492320819112628, "grad_norm": 31.176231384277344, "learning_rate": 5.354896675651393e-06, "loss": 2.3271, "step": 577 }, { "epoch": 0.4931740614334471, "grad_norm": 54.210899353027344, "learning_rate": 5.345911949685535e-06, "loss": 2.4258, "step": 578 }, { "epoch": 0.49402730375426623, "grad_norm": 21.1136474609375, "learning_rate": 5.336927223719677e-06, "loss": 2.6562, "step": 579 }, { "epoch": 0.4948805460750853, "grad_norm": 16.198816299438477, "learning_rate": 5.327942497753818e-06, "loss": 3.1621, "step": 580 }, { "epoch": 0.49573378839590443, "grad_norm": 25.892831802368164, "learning_rate": 5.3189577717879605e-06, "loss": 2.793, "step": 581 }, { "epoch": 0.49658703071672355, "grad_norm": 15.013483047485352, "learning_rate": 5.3099730458221034e-06, "loss": 2.4639, "step": 582 }, { "epoch": 0.4974402730375427, "grad_norm": 35.05656051635742, "learning_rate": 5.300988319856245e-06, "loss": 2.8516, "step": 583 }, { "epoch": 0.49829351535836175, "grad_norm": 27.9871768951416, "learning_rate": 5.292003593890387e-06, "loss": 2.4189, "step": 584 }, { "epoch": 0.4991467576791809, "grad_norm": 15.608467102050781, "learning_rate": 5.283018867924529e-06, "loss": 2.5889, "step": 585 }, { "epoch": 0.5, "grad_norm": 31.001338958740234, "learning_rate": 5.27403414195867e-06, "loss": 2.3174, "step": 586 }, { "epoch": 0.5008532423208191, "grad_norm": 31.35817527770996, "learning_rate": 5.265049415992812e-06, "loss": 2.3955, "step": 587 }, { "epoch": 0.5017064846416383, "grad_norm": 23.374814987182617, "learning_rate": 5.256064690026954e-06, "loss": 3.2852, "step": 588 }, { "epoch": 0.5025597269624573, "grad_norm": 22.058351516723633, "learning_rate": 5.247079964061097e-06, "loss": 2.4004, "step": 589 }, { "epoch": 0.5034129692832765, "grad_norm": 41.5433235168457, "learning_rate": 5.2380952380952384e-06, "loss": 2.8564, "step": 590 }, { "epoch": 0.5042662116040956, "grad_norm": 20.69767189025879, "learning_rate": 5.2291105121293805e-06, "loss": 2.623, "step": 591 }, { "epoch": 0.5051194539249146, "grad_norm": 28.56145477294922, "learning_rate": 5.220125786163523e-06, "loss": 2.6143, "step": 592 }, { "epoch": 0.5059726962457338, "grad_norm": 64.19149780273438, "learning_rate": 5.211141060197664e-06, "loss": 3.0439, "step": 593 }, { "epoch": 0.5068259385665529, "grad_norm": 42.88633346557617, "learning_rate": 5.202156334231806e-06, "loss": 2.3398, "step": 594 }, { "epoch": 0.507679180887372, "grad_norm": 25.10820770263672, "learning_rate": 5.193171608265948e-06, "loss": 2.2949, "step": 595 }, { "epoch": 0.5085324232081911, "grad_norm": 24.562023162841797, "learning_rate": 5.184186882300091e-06, "loss": 2.7129, "step": 596 }, { "epoch": 0.5093856655290102, "grad_norm": 36.790157318115234, "learning_rate": 5.175202156334232e-06, "loss": 2.5508, "step": 597 }, { "epoch": 0.5102389078498294, "grad_norm": 24.073081970214844, "learning_rate": 5.166217430368374e-06, "loss": 2.5186, "step": 598 }, { "epoch": 0.5110921501706485, "grad_norm": 35.33838653564453, "learning_rate": 5.157232704402516e-06, "loss": 2.4971, "step": 599 }, { "epoch": 0.5119453924914675, "grad_norm": 28.3082275390625, "learning_rate": 5.148247978436658e-06, "loss": 2.8848, "step": 600 }, { "epoch": 0.5127986348122867, "grad_norm": 25.457237243652344, "learning_rate": 5.1392632524708e-06, "loss": 2.7832, "step": 601 }, { "epoch": 0.5136518771331058, "grad_norm": 34.88469314575195, "learning_rate": 5.130278526504942e-06, "loss": 2.4941, "step": 602 }, { "epoch": 0.514505119453925, "grad_norm": 41.899715423583984, "learning_rate": 5.121293800539085e-06, "loss": 3.002, "step": 603 }, { "epoch": 0.515358361774744, "grad_norm": 27.774612426757812, "learning_rate": 5.112309074573226e-06, "loss": 2.8643, "step": 604 }, { "epoch": 0.5162116040955631, "grad_norm": 67.86431121826172, "learning_rate": 5.103324348607368e-06, "loss": 2.8418, "step": 605 }, { "epoch": 0.5170648464163823, "grad_norm": 27.578550338745117, "learning_rate": 5.09433962264151e-06, "loss": 2.2207, "step": 606 }, { "epoch": 0.5179180887372014, "grad_norm": 31.7324275970459, "learning_rate": 5.085354896675651e-06, "loss": 3.3027, "step": 607 }, { "epoch": 0.5187713310580204, "grad_norm": 20.27518081665039, "learning_rate": 5.0763701707097935e-06, "loss": 2.2842, "step": 608 }, { "epoch": 0.5196245733788396, "grad_norm": 27.45115089416504, "learning_rate": 5.067385444743936e-06, "loss": 2.2988, "step": 609 }, { "epoch": 0.5204778156996587, "grad_norm": 40.405704498291016, "learning_rate": 5.0584007187780785e-06, "loss": 3.1377, "step": 610 }, { "epoch": 0.5213310580204779, "grad_norm": 15.520742416381836, "learning_rate": 5.04941599281222e-06, "loss": 2.3604, "step": 611 }, { "epoch": 0.5221843003412969, "grad_norm": 28.408700942993164, "learning_rate": 5.040431266846362e-06, "loss": 2.5693, "step": 612 }, { "epoch": 0.523037542662116, "grad_norm": 33.49451446533203, "learning_rate": 5.031446540880504e-06, "loss": 2.6279, "step": 613 }, { "epoch": 0.5238907849829352, "grad_norm": 17.04746437072754, "learning_rate": 5.022461814914645e-06, "loss": 2.5713, "step": 614 }, { "epoch": 0.5247440273037542, "grad_norm": 35.0278205871582, "learning_rate": 5.013477088948787e-06, "loss": 2.4004, "step": 615 }, { "epoch": 0.5255972696245734, "grad_norm": 23.72642707824707, "learning_rate": 5.004492362982929e-06, "loss": 2.832, "step": 616 }, { "epoch": 0.5264505119453925, "grad_norm": 17.700857162475586, "learning_rate": 4.9955076370170715e-06, "loss": 2.8467, "step": 617 }, { "epoch": 0.5273037542662116, "grad_norm": 26.797855377197266, "learning_rate": 4.986522911051213e-06, "loss": 2.7002, "step": 618 }, { "epoch": 0.5281569965870307, "grad_norm": 15.89353084564209, "learning_rate": 4.977538185085356e-06, "loss": 2.8301, "step": 619 }, { "epoch": 0.5290102389078498, "grad_norm": 17.26994514465332, "learning_rate": 4.968553459119497e-06, "loss": 2.4023, "step": 620 }, { "epoch": 0.5298634812286689, "grad_norm": 50.756412506103516, "learning_rate": 4.959568733153639e-06, "loss": 2.1123, "step": 621 }, { "epoch": 0.5307167235494881, "grad_norm": 42.956947326660156, "learning_rate": 4.950584007187781e-06, "loss": 2.2637, "step": 622 }, { "epoch": 0.5315699658703071, "grad_norm": 22.47896385192871, "learning_rate": 4.941599281221923e-06, "loss": 2.7529, "step": 623 }, { "epoch": 0.5324232081911263, "grad_norm": 18.382062911987305, "learning_rate": 4.932614555256065e-06, "loss": 2.6665, "step": 624 }, { "epoch": 0.5332764505119454, "grad_norm": 45.38220977783203, "learning_rate": 4.9236298292902065e-06, "loss": 2.4453, "step": 625 }, { "epoch": 0.5341296928327645, "grad_norm": 51.33029556274414, "learning_rate": 4.914645103324349e-06, "loss": 3.1621, "step": 626 }, { "epoch": 0.5349829351535836, "grad_norm": 39.17404556274414, "learning_rate": 4.905660377358491e-06, "loss": 2.9941, "step": 627 }, { "epoch": 0.5358361774744027, "grad_norm": 56.7110595703125, "learning_rate": 4.896675651392633e-06, "loss": 2.9219, "step": 628 }, { "epoch": 0.5366894197952219, "grad_norm": 15.27424144744873, "learning_rate": 4.887690925426775e-06, "loss": 2.5547, "step": 629 }, { "epoch": 0.537542662116041, "grad_norm": 22.87930679321289, "learning_rate": 4.878706199460917e-06, "loss": 2.3506, "step": 630 }, { "epoch": 0.53839590443686, "grad_norm": 53.75349807739258, "learning_rate": 4.869721473495059e-06, "loss": 2.3652, "step": 631 }, { "epoch": 0.5392491467576792, "grad_norm": 30.84817123413086, "learning_rate": 4.8607367475292e-06, "loss": 2.9297, "step": 632 }, { "epoch": 0.5401023890784983, "grad_norm": 20.29245948791504, "learning_rate": 4.851752021563343e-06, "loss": 2.9795, "step": 633 }, { "epoch": 0.5409556313993175, "grad_norm": 17.75739097595215, "learning_rate": 4.842767295597484e-06, "loss": 2.4072, "step": 634 }, { "epoch": 0.5418088737201365, "grad_norm": 21.76918601989746, "learning_rate": 4.8337825696316265e-06, "loss": 2.5088, "step": 635 }, { "epoch": 0.5426621160409556, "grad_norm": 31.445209503173828, "learning_rate": 4.824797843665769e-06, "loss": 2.4473, "step": 636 }, { "epoch": 0.5435153583617748, "grad_norm": 43.67631530761719, "learning_rate": 4.815813117699911e-06, "loss": 2.7129, "step": 637 }, { "epoch": 0.5443686006825939, "grad_norm": 41.579044342041016, "learning_rate": 4.806828391734053e-06, "loss": 2.6562, "step": 638 }, { "epoch": 0.5452218430034129, "grad_norm": 22.987728118896484, "learning_rate": 4.797843665768194e-06, "loss": 2.8525, "step": 639 }, { "epoch": 0.5460750853242321, "grad_norm": 17.46269416809082, "learning_rate": 4.788858939802337e-06, "loss": 2.6797, "step": 640 }, { "epoch": 0.5469283276450512, "grad_norm": 14.546910285949707, "learning_rate": 4.779874213836478e-06, "loss": 2.4775, "step": 641 }, { "epoch": 0.5477815699658704, "grad_norm": 29.865407943725586, "learning_rate": 4.77088948787062e-06, "loss": 2.6279, "step": 642 }, { "epoch": 0.5486348122866894, "grad_norm": 14.915757179260254, "learning_rate": 4.761904761904762e-06, "loss": 2.5225, "step": 643 }, { "epoch": 0.5494880546075085, "grad_norm": 29.640846252441406, "learning_rate": 4.7529200359389045e-06, "loss": 2.4248, "step": 644 }, { "epoch": 0.5503412969283277, "grad_norm": 44.55379104614258, "learning_rate": 4.7439353099730466e-06, "loss": 2.6348, "step": 645 }, { "epoch": 0.5511945392491467, "grad_norm": 18.605289459228516, "learning_rate": 4.734950584007188e-06, "loss": 2.2568, "step": 646 }, { "epoch": 0.5520477815699659, "grad_norm": 22.488618850708008, "learning_rate": 4.725965858041331e-06, "loss": 2.7637, "step": 647 }, { "epoch": 0.552901023890785, "grad_norm": 18.215923309326172, "learning_rate": 4.716981132075472e-06, "loss": 2.4238, "step": 648 }, { "epoch": 0.5537542662116041, "grad_norm": 48.733970642089844, "learning_rate": 4.707996406109614e-06, "loss": 2.998, "step": 649 }, { "epoch": 0.5546075085324232, "grad_norm": 20.413524627685547, "learning_rate": 4.699011680143756e-06, "loss": 2.335, "step": 650 }, { "epoch": 0.5554607508532423, "grad_norm": 46.819068908691406, "learning_rate": 4.690026954177898e-06, "loss": 3.0381, "step": 651 }, { "epoch": 0.5563139931740614, "grad_norm": 26.987506866455078, "learning_rate": 4.68104222821204e-06, "loss": 2.7188, "step": 652 }, { "epoch": 0.5571672354948806, "grad_norm": 37.12288284301758, "learning_rate": 4.6720575022461816e-06, "loss": 2.4189, "step": 653 }, { "epoch": 0.5580204778156996, "grad_norm": 36.88205337524414, "learning_rate": 4.663072776280324e-06, "loss": 2.6191, "step": 654 }, { "epoch": 0.5588737201365188, "grad_norm": 14.645258903503418, "learning_rate": 4.654088050314466e-06, "loss": 1.9053, "step": 655 }, { "epoch": 0.5597269624573379, "grad_norm": 17.644990921020508, "learning_rate": 4.645103324348608e-06, "loss": 2.7158, "step": 656 }, { "epoch": 0.560580204778157, "grad_norm": 70.53995513916016, "learning_rate": 4.636118598382749e-06, "loss": 3.0059, "step": 657 }, { "epoch": 0.5614334470989761, "grad_norm": 50.95589065551758, "learning_rate": 4.627133872416892e-06, "loss": 3.1006, "step": 658 }, { "epoch": 0.5622866894197952, "grad_norm": 41.75291061401367, "learning_rate": 4.618149146451033e-06, "loss": 2.29, "step": 659 }, { "epoch": 0.5631399317406144, "grad_norm": 32.955204010009766, "learning_rate": 4.609164420485175e-06, "loss": 2.3164, "step": 660 }, { "epoch": 0.5639931740614335, "grad_norm": 33.37961196899414, "learning_rate": 4.6001796945193174e-06, "loss": 2.6953, "step": 661 }, { "epoch": 0.5648464163822525, "grad_norm": 41.79698181152344, "learning_rate": 4.5911949685534595e-06, "loss": 2.916, "step": 662 }, { "epoch": 0.5656996587030717, "grad_norm": 18.266895294189453, "learning_rate": 4.582210242587602e-06, "loss": 2.1611, "step": 663 }, { "epoch": 0.5665529010238908, "grad_norm": 29.101303100585938, "learning_rate": 4.573225516621743e-06, "loss": 2.749, "step": 664 }, { "epoch": 0.5674061433447098, "grad_norm": 58.654640197753906, "learning_rate": 4.564240790655886e-06, "loss": 3.3467, "step": 665 }, { "epoch": 0.568259385665529, "grad_norm": 52.830665588378906, "learning_rate": 4.555256064690027e-06, "loss": 3.0654, "step": 666 }, { "epoch": 0.5691126279863481, "grad_norm": 39.432003021240234, "learning_rate": 4.546271338724169e-06, "loss": 2.9551, "step": 667 }, { "epoch": 0.5699658703071673, "grad_norm": 19.742292404174805, "learning_rate": 4.537286612758311e-06, "loss": 2.8438, "step": 668 }, { "epoch": 0.5708191126279863, "grad_norm": 31.633556365966797, "learning_rate": 4.528301886792453e-06, "loss": 2.6396, "step": 669 }, { "epoch": 0.5716723549488054, "grad_norm": 58.045066833496094, "learning_rate": 4.519317160826595e-06, "loss": 2.7832, "step": 670 }, { "epoch": 0.5725255972696246, "grad_norm": 55.10057067871094, "learning_rate": 4.510332434860737e-06, "loss": 3.0498, "step": 671 }, { "epoch": 0.5733788395904437, "grad_norm": 41.77906799316406, "learning_rate": 4.5013477088948796e-06, "loss": 2.9658, "step": 672 }, { "epoch": 0.5742320819112628, "grad_norm": 27.726163864135742, "learning_rate": 4.492362982929021e-06, "loss": 2.7373, "step": 673 }, { "epoch": 0.5750853242320819, "grad_norm": 16.946115493774414, "learning_rate": 4.483378256963163e-06, "loss": 3.0039, "step": 674 }, { "epoch": 0.575938566552901, "grad_norm": 21.34795379638672, "learning_rate": 4.474393530997305e-06, "loss": 3.1045, "step": 675 }, { "epoch": 0.5767918088737202, "grad_norm": 45.67304229736328, "learning_rate": 4.465408805031447e-06, "loss": 2.7686, "step": 676 }, { "epoch": 0.5776450511945392, "grad_norm": 34.049530029296875, "learning_rate": 4.456424079065589e-06, "loss": 2.4395, "step": 677 }, { "epoch": 0.5784982935153583, "grad_norm": 36.776790618896484, "learning_rate": 4.44743935309973e-06, "loss": 2.8105, "step": 678 }, { "epoch": 0.5793515358361775, "grad_norm": 17.602291107177734, "learning_rate": 4.438454627133873e-06, "loss": 2.9131, "step": 679 }, { "epoch": 0.5802047781569966, "grad_norm": 27.473234176635742, "learning_rate": 4.429469901168015e-06, "loss": 2.8359, "step": 680 }, { "epoch": 0.5810580204778157, "grad_norm": 22.257198333740234, "learning_rate": 4.420485175202157e-06, "loss": 2.4365, "step": 681 }, { "epoch": 0.5819112627986348, "grad_norm": 53.0062141418457, "learning_rate": 4.411500449236299e-06, "loss": 2.2266, "step": 682 }, { "epoch": 0.5827645051194539, "grad_norm": 35.180240631103516, "learning_rate": 4.402515723270441e-06, "loss": 2.6904, "step": 683 }, { "epoch": 0.5836177474402731, "grad_norm": 28.43410873413086, "learning_rate": 4.393530997304583e-06, "loss": 2.4229, "step": 684 }, { "epoch": 0.5844709897610921, "grad_norm": 18.973915100097656, "learning_rate": 4.384546271338724e-06, "loss": 2.5684, "step": 685 }, { "epoch": 0.5853242320819113, "grad_norm": 66.25029754638672, "learning_rate": 4.375561545372867e-06, "loss": 2.3496, "step": 686 }, { "epoch": 0.5861774744027304, "grad_norm": 32.91408157348633, "learning_rate": 4.366576819407008e-06, "loss": 2.3613, "step": 687 }, { "epoch": 0.5870307167235495, "grad_norm": 27.794586181640625, "learning_rate": 4.3575920934411504e-06, "loss": 3.0039, "step": 688 }, { "epoch": 0.5878839590443686, "grad_norm": 33.872276306152344, "learning_rate": 4.348607367475292e-06, "loss": 2.5781, "step": 689 }, { "epoch": 0.5887372013651877, "grad_norm": 23.393707275390625, "learning_rate": 4.339622641509435e-06, "loss": 2.4434, "step": 690 }, { "epoch": 0.5895904436860068, "grad_norm": 27.395784378051758, "learning_rate": 4.330637915543576e-06, "loss": 2.3867, "step": 691 }, { "epoch": 0.590443686006826, "grad_norm": 27.155609130859375, "learning_rate": 4.321653189577718e-06, "loss": 2.7422, "step": 692 }, { "epoch": 0.591296928327645, "grad_norm": 25.652048110961914, "learning_rate": 4.31266846361186e-06, "loss": 2.4365, "step": 693 }, { "epoch": 0.5921501706484642, "grad_norm": 22.866825103759766, "learning_rate": 4.303683737646002e-06, "loss": 2.7734, "step": 694 }, { "epoch": 0.5930034129692833, "grad_norm": 43.10763931274414, "learning_rate": 4.294699011680144e-06, "loss": 2.5938, "step": 695 }, { "epoch": 0.5938566552901023, "grad_norm": 24.86405372619629, "learning_rate": 4.2857142857142855e-06, "loss": 2.2871, "step": 696 }, { "epoch": 0.5947098976109215, "grad_norm": 25.735193252563477, "learning_rate": 4.276729559748428e-06, "loss": 2.4424, "step": 697 }, { "epoch": 0.5955631399317406, "grad_norm": 17.094524383544922, "learning_rate": 4.26774483378257e-06, "loss": 3.0898, "step": 698 }, { "epoch": 0.5964163822525598, "grad_norm": 32.05537796020508, "learning_rate": 4.258760107816712e-06, "loss": 3.0322, "step": 699 }, { "epoch": 0.5972696245733788, "grad_norm": 19.903535842895508, "learning_rate": 4.249775381850854e-06, "loss": 2.2871, "step": 700 }, { "epoch": 0.5981228668941979, "grad_norm": 28.79990005493164, "learning_rate": 4.240790655884996e-06, "loss": 2.6797, "step": 701 }, { "epoch": 0.5989761092150171, "grad_norm": 13.19858169555664, "learning_rate": 4.231805929919138e-06, "loss": 2.417, "step": 702 }, { "epoch": 0.5998293515358362, "grad_norm": 23.84611701965332, "learning_rate": 4.222821203953279e-06, "loss": 2.5889, "step": 703 }, { "epoch": 0.6006825938566553, "grad_norm": 47.766387939453125, "learning_rate": 4.213836477987422e-06, "loss": 2.335, "step": 704 }, { "epoch": 0.6015358361774744, "grad_norm": 41.32902526855469, "learning_rate": 4.204851752021563e-06, "loss": 2.9268, "step": 705 }, { "epoch": 0.6023890784982935, "grad_norm": 27.452489852905273, "learning_rate": 4.1958670260557055e-06, "loss": 2.2168, "step": 706 }, { "epoch": 0.6032423208191127, "grad_norm": 19.42645263671875, "learning_rate": 4.186882300089848e-06, "loss": 2.5527, "step": 707 }, { "epoch": 0.6040955631399317, "grad_norm": 21.670026779174805, "learning_rate": 4.17789757412399e-06, "loss": 2.3047, "step": 708 }, { "epoch": 0.6049488054607508, "grad_norm": 41.55127716064453, "learning_rate": 4.168912848158132e-06, "loss": 2.7568, "step": 709 }, { "epoch": 0.60580204778157, "grad_norm": 18.455894470214844, "learning_rate": 4.159928122192273e-06, "loss": 2.1895, "step": 710 }, { "epoch": 0.606655290102389, "grad_norm": 32.20492172241211, "learning_rate": 4.150943396226416e-06, "loss": 2.6816, "step": 711 }, { "epoch": 0.6075085324232082, "grad_norm": 21.960777282714844, "learning_rate": 4.141958670260557e-06, "loss": 2.7148, "step": 712 }, { "epoch": 0.6083617747440273, "grad_norm": 18.987390518188477, "learning_rate": 4.132973944294699e-06, "loss": 2.4961, "step": 713 }, { "epoch": 0.6092150170648464, "grad_norm": 19.618938446044922, "learning_rate": 4.123989218328841e-06, "loss": 2.1768, "step": 714 }, { "epoch": 0.6100682593856656, "grad_norm": 20.22203254699707, "learning_rate": 4.1150044923629835e-06, "loss": 2.5034, "step": 715 }, { "epoch": 0.6109215017064846, "grad_norm": 25.868797302246094, "learning_rate": 4.1060197663971255e-06, "loss": 2.2061, "step": 716 }, { "epoch": 0.6117747440273038, "grad_norm": 19.47433090209961, "learning_rate": 4.097035040431267e-06, "loss": 3.0068, "step": 717 }, { "epoch": 0.6126279863481229, "grad_norm": 34.66838836669922, "learning_rate": 4.08805031446541e-06, "loss": 2.7236, "step": 718 }, { "epoch": 0.613481228668942, "grad_norm": 59.0142822265625, "learning_rate": 4.079065588499551e-06, "loss": 3.1172, "step": 719 }, { "epoch": 0.6143344709897611, "grad_norm": 64.44792938232422, "learning_rate": 4.070080862533693e-06, "loss": 2.5957, "step": 720 }, { "epoch": 0.6151877133105802, "grad_norm": 16.721546173095703, "learning_rate": 4.061096136567835e-06, "loss": 2.8066, "step": 721 }, { "epoch": 0.6160409556313993, "grad_norm": 24.300310134887695, "learning_rate": 4.052111410601977e-06, "loss": 2.7002, "step": 722 }, { "epoch": 0.6168941979522184, "grad_norm": 34.62942123413086, "learning_rate": 4.043126684636119e-06, "loss": 2.9609, "step": 723 }, { "epoch": 0.6177474402730375, "grad_norm": 26.634056091308594, "learning_rate": 4.0341419586702606e-06, "loss": 2.3535, "step": 724 }, { "epoch": 0.6186006825938567, "grad_norm": 40.439910888671875, "learning_rate": 4.025157232704403e-06, "loss": 2.9062, "step": 725 }, { "epoch": 0.6194539249146758, "grad_norm": 17.941150665283203, "learning_rate": 4.016172506738545e-06, "loss": 2.7451, "step": 726 }, { "epoch": 0.6203071672354948, "grad_norm": 14.031157493591309, "learning_rate": 4.007187780772687e-06, "loss": 2.5586, "step": 727 }, { "epoch": 0.621160409556314, "grad_norm": 12.27566146850586, "learning_rate": 3.998203054806828e-06, "loss": 2.3438, "step": 728 }, { "epoch": 0.6220136518771331, "grad_norm": 13.139644622802734, "learning_rate": 3.989218328840971e-06, "loss": 2.4688, "step": 729 }, { "epoch": 0.6228668941979523, "grad_norm": 16.500751495361328, "learning_rate": 3.980233602875112e-06, "loss": 2.7188, "step": 730 }, { "epoch": 0.6237201365187713, "grad_norm": 28.81122398376465, "learning_rate": 3.971248876909254e-06, "loss": 2.3291, "step": 731 }, { "epoch": 0.6245733788395904, "grad_norm": 25.21991539001465, "learning_rate": 3.962264150943396e-06, "loss": 2.5391, "step": 732 }, { "epoch": 0.6254266211604096, "grad_norm": 29.478809356689453, "learning_rate": 3.9532794249775385e-06, "loss": 2.7256, "step": 733 }, { "epoch": 0.6262798634812287, "grad_norm": 20.742538452148438, "learning_rate": 3.944294699011681e-06, "loss": 3.0352, "step": 734 }, { "epoch": 0.6271331058020477, "grad_norm": 94.33541107177734, "learning_rate": 3.935309973045822e-06, "loss": 2.3477, "step": 735 }, { "epoch": 0.6279863481228669, "grad_norm": 14.510876655578613, "learning_rate": 3.926325247079965e-06, "loss": 2.127, "step": 736 }, { "epoch": 0.628839590443686, "grad_norm": 21.58650016784668, "learning_rate": 3.917340521114106e-06, "loss": 2.2949, "step": 737 }, { "epoch": 0.6296928327645052, "grad_norm": 17.82122802734375, "learning_rate": 3.908355795148248e-06, "loss": 2.8574, "step": 738 }, { "epoch": 0.6305460750853242, "grad_norm": 18.0192813873291, "learning_rate": 3.89937106918239e-06, "loss": 2.3711, "step": 739 }, { "epoch": 0.6313993174061433, "grad_norm": 24.10041618347168, "learning_rate": 3.890386343216532e-06, "loss": 2.9199, "step": 740 }, { "epoch": 0.6322525597269625, "grad_norm": 27.862274169921875, "learning_rate": 3.881401617250674e-06, "loss": 2.8037, "step": 741 }, { "epoch": 0.6331058020477816, "grad_norm": 16.650089263916016, "learning_rate": 3.872416891284816e-06, "loss": 2.2358, "step": 742 }, { "epoch": 0.6339590443686007, "grad_norm": 24.472097396850586, "learning_rate": 3.8634321653189586e-06, "loss": 2.3711, "step": 743 }, { "epoch": 0.6348122866894198, "grad_norm": 14.919700622558594, "learning_rate": 3.8544474393531e-06, "loss": 2.7002, "step": 744 }, { "epoch": 0.6356655290102389, "grad_norm": 13.569048881530762, "learning_rate": 3.845462713387242e-06, "loss": 2.3213, "step": 745 }, { "epoch": 0.636518771331058, "grad_norm": 24.573429107666016, "learning_rate": 3.836477987421384e-06, "loss": 2.7637, "step": 746 }, { "epoch": 0.6373720136518771, "grad_norm": 18.409570693969727, "learning_rate": 3.827493261455526e-06, "loss": 2.4854, "step": 747 }, { "epoch": 0.6382252559726962, "grad_norm": 14.060251235961914, "learning_rate": 3.818508535489668e-06, "loss": 2.5479, "step": 748 }, { "epoch": 0.6390784982935154, "grad_norm": 22.34339714050293, "learning_rate": 3.80952380952381e-06, "loss": 2.5029, "step": 749 }, { "epoch": 0.6399317406143344, "grad_norm": 19.189834594726562, "learning_rate": 3.800539083557952e-06, "loss": 2.126, "step": 750 }, { "epoch": 0.6407849829351536, "grad_norm": 51.201881408691406, "learning_rate": 3.791554357592094e-06, "loss": 2.3213, "step": 751 }, { "epoch": 0.6416382252559727, "grad_norm": 35.899330139160156, "learning_rate": 3.7825696316262357e-06, "loss": 2.7754, "step": 752 }, { "epoch": 0.6424914675767918, "grad_norm": 25.565492630004883, "learning_rate": 3.7735849056603777e-06, "loss": 2.5352, "step": 753 }, { "epoch": 0.643344709897611, "grad_norm": 17.710491180419922, "learning_rate": 3.76460017969452e-06, "loss": 2.4473, "step": 754 }, { "epoch": 0.64419795221843, "grad_norm": 30.0711612701416, "learning_rate": 3.7556154537286615e-06, "loss": 2.9932, "step": 755 }, { "epoch": 0.6450511945392492, "grad_norm": 33.850616455078125, "learning_rate": 3.746630727762803e-06, "loss": 2.5303, "step": 756 }, { "epoch": 0.6459044368600683, "grad_norm": 29.517227172851562, "learning_rate": 3.7376460017969457e-06, "loss": 2.1904, "step": 757 }, { "epoch": 0.6467576791808873, "grad_norm": 35.89356994628906, "learning_rate": 3.7286612758310873e-06, "loss": 3.1494, "step": 758 }, { "epoch": 0.6476109215017065, "grad_norm": 42.21514129638672, "learning_rate": 3.7196765498652294e-06, "loss": 2.5229, "step": 759 }, { "epoch": 0.6484641638225256, "grad_norm": 17.243484497070312, "learning_rate": 3.710691823899371e-06, "loss": 2.9141, "step": 760 }, { "epoch": 0.6493174061433447, "grad_norm": 24.45879364013672, "learning_rate": 3.7017070979335136e-06, "loss": 2.208, "step": 761 }, { "epoch": 0.6501706484641638, "grad_norm": 21.869504928588867, "learning_rate": 3.6927223719676553e-06, "loss": 2.5908, "step": 762 }, { "epoch": 0.6510238907849829, "grad_norm": 18.739221572875977, "learning_rate": 3.683737646001797e-06, "loss": 2.5801, "step": 763 }, { "epoch": 0.6518771331058021, "grad_norm": 26.45047378540039, "learning_rate": 3.6747529200359395e-06, "loss": 2.8115, "step": 764 }, { "epoch": 0.6527303754266212, "grad_norm": 19.305646896362305, "learning_rate": 3.665768194070081e-06, "loss": 2.1982, "step": 765 }, { "epoch": 0.6535836177474402, "grad_norm": 46.113304138183594, "learning_rate": 3.656783468104223e-06, "loss": 3.0215, "step": 766 }, { "epoch": 0.6544368600682594, "grad_norm": 46.685699462890625, "learning_rate": 3.647798742138365e-06, "loss": 2.7422, "step": 767 }, { "epoch": 0.6552901023890785, "grad_norm": 20.04216957092285, "learning_rate": 3.6388140161725074e-06, "loss": 2.8691, "step": 768 }, { "epoch": 0.6561433447098977, "grad_norm": 18.197967529296875, "learning_rate": 3.629829290206649e-06, "loss": 2.9648, "step": 769 }, { "epoch": 0.6569965870307167, "grad_norm": 20.51030731201172, "learning_rate": 3.6208445642407907e-06, "loss": 2.209, "step": 770 }, { "epoch": 0.6578498293515358, "grad_norm": 16.829713821411133, "learning_rate": 3.6118598382749332e-06, "loss": 2.5537, "step": 771 }, { "epoch": 0.658703071672355, "grad_norm": 21.681400299072266, "learning_rate": 3.602875112309075e-06, "loss": 2.7969, "step": 772 }, { "epoch": 0.659556313993174, "grad_norm": 25.944387435913086, "learning_rate": 3.5938903863432166e-06, "loss": 2.6719, "step": 773 }, { "epoch": 0.6604095563139932, "grad_norm": 51.966121673583984, "learning_rate": 3.5849056603773586e-06, "loss": 2.8584, "step": 774 }, { "epoch": 0.6612627986348123, "grad_norm": 18.35639190673828, "learning_rate": 3.5759209344115007e-06, "loss": 2.3818, "step": 775 }, { "epoch": 0.6621160409556314, "grad_norm": 18.182228088378906, "learning_rate": 3.566936208445643e-06, "loss": 2.2686, "step": 776 }, { "epoch": 0.6629692832764505, "grad_norm": 16.234655380249023, "learning_rate": 3.5579514824797845e-06, "loss": 2.2441, "step": 777 }, { "epoch": 0.6638225255972696, "grad_norm": 34.618080139160156, "learning_rate": 3.548966756513927e-06, "loss": 3.0039, "step": 778 }, { "epoch": 0.6646757679180887, "grad_norm": 20.892868041992188, "learning_rate": 3.5399820305480687e-06, "loss": 2.5215, "step": 779 }, { "epoch": 0.6655290102389079, "grad_norm": 21.178865432739258, "learning_rate": 3.5309973045822103e-06, "loss": 2.126, "step": 780 }, { "epoch": 0.6663822525597269, "grad_norm": 15.986398696899414, "learning_rate": 3.5220125786163524e-06, "loss": 2.4512, "step": 781 }, { "epoch": 0.6672354948805461, "grad_norm": 18.40003204345703, "learning_rate": 3.5130278526504945e-06, "loss": 2.3096, "step": 782 }, { "epoch": 0.6680887372013652, "grad_norm": 28.919540405273438, "learning_rate": 3.5040431266846366e-06, "loss": 2.292, "step": 783 }, { "epoch": 0.6689419795221843, "grad_norm": 20.11212158203125, "learning_rate": 3.4950584007187783e-06, "loss": 2.3701, "step": 784 }, { "epoch": 0.6697952218430034, "grad_norm": 22.81437110900879, "learning_rate": 3.4860736747529208e-06, "loss": 2.2822, "step": 785 }, { "epoch": 0.6706484641638225, "grad_norm": 46.97941970825195, "learning_rate": 3.4770889487870624e-06, "loss": 2.252, "step": 786 }, { "epoch": 0.6715017064846417, "grad_norm": 21.700454711914062, "learning_rate": 3.468104222821204e-06, "loss": 2.3457, "step": 787 }, { "epoch": 0.6723549488054608, "grad_norm": 18.99515151977539, "learning_rate": 3.4591194968553458e-06, "loss": 2.2598, "step": 788 }, { "epoch": 0.6732081911262798, "grad_norm": 17.171161651611328, "learning_rate": 3.4501347708894883e-06, "loss": 2.3037, "step": 789 }, { "epoch": 0.674061433447099, "grad_norm": 19.60438346862793, "learning_rate": 3.44115004492363e-06, "loss": 2.5396, "step": 790 }, { "epoch": 0.6749146757679181, "grad_norm": 21.116247177124023, "learning_rate": 3.432165318957772e-06, "loss": 2.4775, "step": 791 }, { "epoch": 0.6757679180887372, "grad_norm": 17.95282745361328, "learning_rate": 3.423180592991914e-06, "loss": 2.2549, "step": 792 }, { "epoch": 0.6766211604095563, "grad_norm": 37.449928283691406, "learning_rate": 3.4141958670260562e-06, "loss": 2.3965, "step": 793 }, { "epoch": 0.6774744027303754, "grad_norm": 16.76734161376953, "learning_rate": 3.405211141060198e-06, "loss": 2.4512, "step": 794 }, { "epoch": 0.6783276450511946, "grad_norm": 16.08464813232422, "learning_rate": 3.3962264150943395e-06, "loss": 2.2861, "step": 795 }, { "epoch": 0.6791808873720137, "grad_norm": 22.598896026611328, "learning_rate": 3.387241689128482e-06, "loss": 2.0, "step": 796 }, { "epoch": 0.6800341296928327, "grad_norm": 30.063629150390625, "learning_rate": 3.3782569631626237e-06, "loss": 2.3936, "step": 797 }, { "epoch": 0.6808873720136519, "grad_norm": 15.586498260498047, "learning_rate": 3.369272237196766e-06, "loss": 2.3857, "step": 798 }, { "epoch": 0.681740614334471, "grad_norm": 47.479331970214844, "learning_rate": 3.3602875112309075e-06, "loss": 2.5176, "step": 799 }, { "epoch": 0.6825938566552902, "grad_norm": 29.33695411682129, "learning_rate": 3.35130278526505e-06, "loss": 2.6494, "step": 800 }, { "epoch": 0.6834470989761092, "grad_norm": 49.52241516113281, "learning_rate": 3.3423180592991917e-06, "loss": 2.666, "step": 801 }, { "epoch": 0.6843003412969283, "grad_norm": 30.229463577270508, "learning_rate": 3.3333333333333333e-06, "loss": 2.4409, "step": 802 }, { "epoch": 0.6851535836177475, "grad_norm": 18.470956802368164, "learning_rate": 3.324348607367476e-06, "loss": 2.4453, "step": 803 }, { "epoch": 0.6860068259385665, "grad_norm": 68.55836486816406, "learning_rate": 3.3153638814016175e-06, "loss": 2.7158, "step": 804 }, { "epoch": 0.6868600682593856, "grad_norm": 23.803735733032227, "learning_rate": 3.3063791554357596e-06, "loss": 2.4375, "step": 805 }, { "epoch": 0.6877133105802048, "grad_norm": 61.04603958129883, "learning_rate": 3.2973944294699013e-06, "loss": 2.7373, "step": 806 }, { "epoch": 0.6885665529010239, "grad_norm": 40.623558044433594, "learning_rate": 3.2884097035040433e-06, "loss": 2.3535, "step": 807 }, { "epoch": 0.689419795221843, "grad_norm": 44.683109283447266, "learning_rate": 3.2794249775381854e-06, "loss": 2.5059, "step": 808 }, { "epoch": 0.6902730375426621, "grad_norm": 23.082717895507812, "learning_rate": 3.270440251572327e-06, "loss": 2.3584, "step": 809 }, { "epoch": 0.6911262798634812, "grad_norm": 31.238815307617188, "learning_rate": 3.2614555256064696e-06, "loss": 2.2852, "step": 810 }, { "epoch": 0.6919795221843004, "grad_norm": 16.465364456176758, "learning_rate": 3.2524707996406113e-06, "loss": 2.9834, "step": 811 }, { "epoch": 0.6928327645051194, "grad_norm": 16.760278701782227, "learning_rate": 3.243486073674753e-06, "loss": 2.457, "step": 812 }, { "epoch": 0.6936860068259386, "grad_norm": 24.745893478393555, "learning_rate": 3.234501347708895e-06, "loss": 2.2397, "step": 813 }, { "epoch": 0.6945392491467577, "grad_norm": 22.20821762084961, "learning_rate": 3.225516621743037e-06, "loss": 2.3564, "step": 814 }, { "epoch": 0.6953924914675768, "grad_norm": 34.78770065307617, "learning_rate": 3.216531895777179e-06, "loss": 2.7852, "step": 815 }, { "epoch": 0.6962457337883959, "grad_norm": 17.53403091430664, "learning_rate": 3.207547169811321e-06, "loss": 2.1963, "step": 816 }, { "epoch": 0.697098976109215, "grad_norm": 17.553361892700195, "learning_rate": 3.1985624438454634e-06, "loss": 2.3682, "step": 817 }, { "epoch": 0.6979522184300341, "grad_norm": 16.97439956665039, "learning_rate": 3.189577717879605e-06, "loss": 2.4453, "step": 818 }, { "epoch": 0.6988054607508533, "grad_norm": 24.054723739624023, "learning_rate": 3.1805929919137467e-06, "loss": 2.6201, "step": 819 }, { "epoch": 0.6996587030716723, "grad_norm": 16.930429458618164, "learning_rate": 3.171608265947889e-06, "loss": 2.1953, "step": 820 }, { "epoch": 0.7005119453924915, "grad_norm": 36.9339599609375, "learning_rate": 3.162623539982031e-06, "loss": 2.6875, "step": 821 }, { "epoch": 0.7013651877133106, "grad_norm": 20.728759765625, "learning_rate": 3.153638814016173e-06, "loss": 2.7783, "step": 822 }, { "epoch": 0.7022184300341296, "grad_norm": 32.32343673706055, "learning_rate": 3.1446540880503146e-06, "loss": 3.0117, "step": 823 }, { "epoch": 0.7030716723549488, "grad_norm": 27.187162399291992, "learning_rate": 3.1356693620844567e-06, "loss": 2.3828, "step": 824 }, { "epoch": 0.7039249146757679, "grad_norm": 19.576969146728516, "learning_rate": 3.126684636118599e-06, "loss": 2.1914, "step": 825 }, { "epoch": 0.7047781569965871, "grad_norm": 19.67875099182129, "learning_rate": 3.1176999101527405e-06, "loss": 2.374, "step": 826 }, { "epoch": 0.7056313993174061, "grad_norm": 51.096885681152344, "learning_rate": 3.108715184186882e-06, "loss": 3.0088, "step": 827 }, { "epoch": 0.7064846416382252, "grad_norm": 16.370563507080078, "learning_rate": 3.0997304582210247e-06, "loss": 2.3779, "step": 828 }, { "epoch": 0.7073378839590444, "grad_norm": 30.746566772460938, "learning_rate": 3.0907457322551663e-06, "loss": 2.6177, "step": 829 }, { "epoch": 0.7081911262798635, "grad_norm": 30.207935333251953, "learning_rate": 3.0817610062893084e-06, "loss": 2.4072, "step": 830 }, { "epoch": 0.7090443686006825, "grad_norm": 29.116840362548828, "learning_rate": 3.0727762803234505e-06, "loss": 2.0972, "step": 831 }, { "epoch": 0.7098976109215017, "grad_norm": 26.794530868530273, "learning_rate": 3.0637915543575926e-06, "loss": 2.333, "step": 832 }, { "epoch": 0.7107508532423208, "grad_norm": 18.36752700805664, "learning_rate": 3.0548068283917343e-06, "loss": 2.6602, "step": 833 }, { "epoch": 0.71160409556314, "grad_norm": 22.439231872558594, "learning_rate": 3.045822102425876e-06, "loss": 2.7002, "step": 834 }, { "epoch": 0.712457337883959, "grad_norm": 51.330665588378906, "learning_rate": 3.0368373764600184e-06, "loss": 2.8867, "step": 835 }, { "epoch": 0.7133105802047781, "grad_norm": 24.982059478759766, "learning_rate": 3.02785265049416e-06, "loss": 2.415, "step": 836 }, { "epoch": 0.7141638225255973, "grad_norm": 40.434627532958984, "learning_rate": 3.018867924528302e-06, "loss": 2.1611, "step": 837 }, { "epoch": 0.7150170648464164, "grad_norm": 43.82883834838867, "learning_rate": 3.009883198562444e-06, "loss": 2.3281, "step": 838 }, { "epoch": 0.7158703071672355, "grad_norm": 43.11958312988281, "learning_rate": 3.0008984725965864e-06, "loss": 2.1943, "step": 839 }, { "epoch": 0.7167235494880546, "grad_norm": 17.046653747558594, "learning_rate": 2.991913746630728e-06, "loss": 2.3779, "step": 840 }, { "epoch": 0.7175767918088737, "grad_norm": 20.74578857421875, "learning_rate": 2.9829290206648697e-06, "loss": 2.3047, "step": 841 }, { "epoch": 0.7184300341296929, "grad_norm": 48.83142852783203, "learning_rate": 2.9739442946990122e-06, "loss": 2.5049, "step": 842 }, { "epoch": 0.7192832764505119, "grad_norm": 42.406375885009766, "learning_rate": 2.964959568733154e-06, "loss": 2.5117, "step": 843 }, { "epoch": 0.7201365187713311, "grad_norm": 55.99921798706055, "learning_rate": 2.9559748427672955e-06, "loss": 2.6826, "step": 844 }, { "epoch": 0.7209897610921502, "grad_norm": 40.09762191772461, "learning_rate": 2.9469901168014376e-06, "loss": 2.6357, "step": 845 }, { "epoch": 0.7218430034129693, "grad_norm": 22.537761688232422, "learning_rate": 2.9380053908355797e-06, "loss": 3.0352, "step": 846 }, { "epoch": 0.7226962457337884, "grad_norm": 16.046295166015625, "learning_rate": 2.929020664869722e-06, "loss": 2.4648, "step": 847 }, { "epoch": 0.7235494880546075, "grad_norm": 20.101272583007812, "learning_rate": 2.9200359389038635e-06, "loss": 2.335, "step": 848 }, { "epoch": 0.7244027303754266, "grad_norm": 40.1048469543457, "learning_rate": 2.911051212938006e-06, "loss": 2.3594, "step": 849 }, { "epoch": 0.7252559726962458, "grad_norm": 47.38935852050781, "learning_rate": 2.9020664869721477e-06, "loss": 2.0469, "step": 850 }, { "epoch": 0.7261092150170648, "grad_norm": 24.103130340576172, "learning_rate": 2.8930817610062893e-06, "loss": 1.9531, "step": 851 }, { "epoch": 0.726962457337884, "grad_norm": 22.3649845123291, "learning_rate": 2.8840970350404314e-06, "loss": 2.3027, "step": 852 }, { "epoch": 0.7278156996587031, "grad_norm": 36.170406341552734, "learning_rate": 2.8751123090745735e-06, "loss": 2.2656, "step": 853 }, { "epoch": 0.7286689419795221, "grad_norm": 30.652938842773438, "learning_rate": 2.8661275831087156e-06, "loss": 2.3604, "step": 854 }, { "epoch": 0.7295221843003413, "grad_norm": 26.317873001098633, "learning_rate": 2.8571428571428573e-06, "loss": 2.1924, "step": 855 }, { "epoch": 0.7303754266211604, "grad_norm": 52.10211944580078, "learning_rate": 2.8481581311769998e-06, "loss": 2.3828, "step": 856 }, { "epoch": 0.7312286689419796, "grad_norm": 22.478017807006836, "learning_rate": 2.8391734052111414e-06, "loss": 2.5449, "step": 857 }, { "epoch": 0.7320819112627986, "grad_norm": 21.12700080871582, "learning_rate": 2.830188679245283e-06, "loss": 1.9521, "step": 858 }, { "epoch": 0.7329351535836177, "grad_norm": 20.594982147216797, "learning_rate": 2.821203953279425e-06, "loss": 2.1904, "step": 859 }, { "epoch": 0.7337883959044369, "grad_norm": 21.516183853149414, "learning_rate": 2.8122192273135673e-06, "loss": 2.5332, "step": 860 }, { "epoch": 0.734641638225256, "grad_norm": 25.919029235839844, "learning_rate": 2.803234501347709e-06, "loss": 2.3511, "step": 861 }, { "epoch": 0.735494880546075, "grad_norm": 20.421133041381836, "learning_rate": 2.794249775381851e-06, "loss": 2.3271, "step": 862 }, { "epoch": 0.7363481228668942, "grad_norm": 23.43620491027832, "learning_rate": 2.785265049415993e-06, "loss": 2.0537, "step": 863 }, { "epoch": 0.7372013651877133, "grad_norm": 17.840322494506836, "learning_rate": 2.776280323450135e-06, "loss": 2.1328, "step": 864 }, { "epoch": 0.7380546075085325, "grad_norm": 33.441341400146484, "learning_rate": 2.767295597484277e-06, "loss": 2.5332, "step": 865 }, { "epoch": 0.7389078498293515, "grad_norm": 21.710899353027344, "learning_rate": 2.7583108715184185e-06, "loss": 2.665, "step": 866 }, { "epoch": 0.7397610921501706, "grad_norm": 23.452516555786133, "learning_rate": 2.749326145552561e-06, "loss": 2.1514, "step": 867 }, { "epoch": 0.7406143344709898, "grad_norm": 17.572235107421875, "learning_rate": 2.7403414195867027e-06, "loss": 1.7598, "step": 868 }, { "epoch": 0.7414675767918089, "grad_norm": 32.19004440307617, "learning_rate": 2.731356693620845e-06, "loss": 2.1289, "step": 869 }, { "epoch": 0.742320819112628, "grad_norm": 22.450124740600586, "learning_rate": 2.722371967654987e-06, "loss": 2.4658, "step": 870 }, { "epoch": 0.7431740614334471, "grad_norm": 19.069766998291016, "learning_rate": 2.713387241689129e-06, "loss": 2.2441, "step": 871 }, { "epoch": 0.7440273037542662, "grad_norm": 23.94462776184082, "learning_rate": 2.7044025157232706e-06, "loss": 2.1426, "step": 872 }, { "epoch": 0.7448805460750854, "grad_norm": 36.22708511352539, "learning_rate": 2.6954177897574123e-06, "loss": 2.228, "step": 873 }, { "epoch": 0.7457337883959044, "grad_norm": 21.32388687133789, "learning_rate": 2.686433063791555e-06, "loss": 2.8701, "step": 874 }, { "epoch": 0.7465870307167235, "grad_norm": 23.661392211914062, "learning_rate": 2.6774483378256965e-06, "loss": 2.4297, "step": 875 }, { "epoch": 0.7474402730375427, "grad_norm": 20.54587173461914, "learning_rate": 2.6684636118598386e-06, "loss": 2.9219, "step": 876 }, { "epoch": 0.7482935153583617, "grad_norm": 24.399003982543945, "learning_rate": 2.6594788858939802e-06, "loss": 2.9365, "step": 877 }, { "epoch": 0.7491467576791809, "grad_norm": 36.97280502319336, "learning_rate": 2.6504941599281223e-06, "loss": 2.376, "step": 878 }, { "epoch": 0.75, "grad_norm": 22.310462951660156, "learning_rate": 2.6415094339622644e-06, "loss": 2.4043, "step": 879 }, { "epoch": 0.7508532423208191, "grad_norm": 27.330747604370117, "learning_rate": 2.632524707996406e-06, "loss": 2.3359, "step": 880 }, { "epoch": 0.7517064846416383, "grad_norm": 20.034399032592773, "learning_rate": 2.6235399820305486e-06, "loss": 2.8594, "step": 881 }, { "epoch": 0.7525597269624573, "grad_norm": 26.571035385131836, "learning_rate": 2.6145552560646903e-06, "loss": 2.0596, "step": 882 }, { "epoch": 0.7534129692832765, "grad_norm": 21.897262573242188, "learning_rate": 2.605570530098832e-06, "loss": 2.4683, "step": 883 }, { "epoch": 0.7542662116040956, "grad_norm": 17.84102439880371, "learning_rate": 2.596585804132974e-06, "loss": 2.1904, "step": 884 }, { "epoch": 0.7551194539249146, "grad_norm": 24.632801055908203, "learning_rate": 2.587601078167116e-06, "loss": 2.1152, "step": 885 }, { "epoch": 0.7559726962457338, "grad_norm": 18.96522331237793, "learning_rate": 2.578616352201258e-06, "loss": 2.5303, "step": 886 }, { "epoch": 0.7568259385665529, "grad_norm": 37.09746170043945, "learning_rate": 2.5696316262354e-06, "loss": 2.335, "step": 887 }, { "epoch": 0.757679180887372, "grad_norm": 31.17850112915039, "learning_rate": 2.5606469002695424e-06, "loss": 2.5059, "step": 888 }, { "epoch": 0.7585324232081911, "grad_norm": 36.89558029174805, "learning_rate": 2.551662174303684e-06, "loss": 2.7695, "step": 889 }, { "epoch": 0.7593856655290102, "grad_norm": 20.49338150024414, "learning_rate": 2.5426774483378257e-06, "loss": 2.6846, "step": 890 }, { "epoch": 0.7602389078498294, "grad_norm": 25.983245849609375, "learning_rate": 2.533692722371968e-06, "loss": 2.4092, "step": 891 }, { "epoch": 0.7610921501706485, "grad_norm": 27.642595291137695, "learning_rate": 2.52470799640611e-06, "loss": 2.0869, "step": 892 }, { "epoch": 0.7619453924914675, "grad_norm": 23.749832153320312, "learning_rate": 2.515723270440252e-06, "loss": 2.125, "step": 893 }, { "epoch": 0.7627986348122867, "grad_norm": 28.204421997070312, "learning_rate": 2.5067385444743936e-06, "loss": 2.1562, "step": 894 }, { "epoch": 0.7636518771331058, "grad_norm": 30.861194610595703, "learning_rate": 2.4977538185085357e-06, "loss": 3.0098, "step": 895 }, { "epoch": 0.764505119453925, "grad_norm": 19.33704376220703, "learning_rate": 2.488769092542678e-06, "loss": 2.4072, "step": 896 }, { "epoch": 0.765358361774744, "grad_norm": 27.73824119567871, "learning_rate": 2.4797843665768195e-06, "loss": 2.2256, "step": 897 }, { "epoch": 0.7662116040955631, "grad_norm": 34.70376968383789, "learning_rate": 2.4707996406109616e-06, "loss": 2.4248, "step": 898 }, { "epoch": 0.7670648464163823, "grad_norm": 29.114303588867188, "learning_rate": 2.4618149146451032e-06, "loss": 2.7227, "step": 899 }, { "epoch": 0.7679180887372014, "grad_norm": 29.096269607543945, "learning_rate": 2.4528301886792453e-06, "loss": 2.4404, "step": 900 }, { "epoch": 0.7687713310580204, "grad_norm": 58.764366149902344, "learning_rate": 2.4438454627133874e-06, "loss": 2.2744, "step": 901 }, { "epoch": 0.7696245733788396, "grad_norm": 30.78232192993164, "learning_rate": 2.4348607367475295e-06, "loss": 2.5342, "step": 902 }, { "epoch": 0.7704778156996587, "grad_norm": 19.90322494506836, "learning_rate": 2.4258760107816716e-06, "loss": 2.1504, "step": 903 }, { "epoch": 0.7713310580204779, "grad_norm": 37.51405715942383, "learning_rate": 2.4168912848158133e-06, "loss": 2.1846, "step": 904 }, { "epoch": 0.7721843003412969, "grad_norm": 16.94844627380371, "learning_rate": 2.4079065588499553e-06, "loss": 2.3501, "step": 905 }, { "epoch": 0.773037542662116, "grad_norm": 28.434951782226562, "learning_rate": 2.398921832884097e-06, "loss": 2.0811, "step": 906 }, { "epoch": 0.7738907849829352, "grad_norm": 16.979534149169922, "learning_rate": 2.389937106918239e-06, "loss": 2.3936, "step": 907 }, { "epoch": 0.7747440273037542, "grad_norm": 22.91834831237793, "learning_rate": 2.380952380952381e-06, "loss": 2.9756, "step": 908 }, { "epoch": 0.7755972696245734, "grad_norm": 22.019638061523438, "learning_rate": 2.3719676549865233e-06, "loss": 2.2075, "step": 909 }, { "epoch": 0.7764505119453925, "grad_norm": 30.827585220336914, "learning_rate": 2.3629829290206654e-06, "loss": 2.4131, "step": 910 }, { "epoch": 0.7773037542662116, "grad_norm": 30.66614532470703, "learning_rate": 2.353998203054807e-06, "loss": 2.2217, "step": 911 }, { "epoch": 0.7781569965870307, "grad_norm": 25.246841430664062, "learning_rate": 2.345013477088949e-06, "loss": 2.3623, "step": 912 }, { "epoch": 0.7790102389078498, "grad_norm": 24.9078311920166, "learning_rate": 2.3360287511230908e-06, "loss": 2.2676, "step": 913 }, { "epoch": 0.7798634812286689, "grad_norm": 19.189767837524414, "learning_rate": 2.327044025157233e-06, "loss": 2.2119, "step": 914 }, { "epoch": 0.7807167235494881, "grad_norm": 21.641551971435547, "learning_rate": 2.3180592991913745e-06, "loss": 2.0054, "step": 915 }, { "epoch": 0.7815699658703071, "grad_norm": 30.206771850585938, "learning_rate": 2.3090745732255166e-06, "loss": 2.165, "step": 916 }, { "epoch": 0.7824232081911263, "grad_norm": 26.020099639892578, "learning_rate": 2.3000898472596587e-06, "loss": 2.5234, "step": 917 }, { "epoch": 0.7832764505119454, "grad_norm": 18.706939697265625, "learning_rate": 2.291105121293801e-06, "loss": 2.2012, "step": 918 }, { "epoch": 0.7841296928327645, "grad_norm": 20.241901397705078, "learning_rate": 2.282120395327943e-06, "loss": 2.0015, "step": 919 }, { "epoch": 0.7849829351535836, "grad_norm": 19.32655906677246, "learning_rate": 2.2731356693620846e-06, "loss": 1.5703, "step": 920 }, { "epoch": 0.7858361774744027, "grad_norm": 22.523513793945312, "learning_rate": 2.2641509433962266e-06, "loss": 2.585, "step": 921 }, { "epoch": 0.7866894197952219, "grad_norm": 36.01555252075195, "learning_rate": 2.2551662174303683e-06, "loss": 1.6499, "step": 922 }, { "epoch": 0.787542662116041, "grad_norm": 27.907073974609375, "learning_rate": 2.2461814914645104e-06, "loss": 2.3066, "step": 923 }, { "epoch": 0.78839590443686, "grad_norm": 19.171142578125, "learning_rate": 2.2371967654986525e-06, "loss": 2.0391, "step": 924 }, { "epoch": 0.7892491467576792, "grad_norm": 35.75897216796875, "learning_rate": 2.2282120395327946e-06, "loss": 2.6104, "step": 925 }, { "epoch": 0.7901023890784983, "grad_norm": 33.932472229003906, "learning_rate": 2.2192273135669367e-06, "loss": 2.6094, "step": 926 }, { "epoch": 0.7909556313993175, "grad_norm": 25.582454681396484, "learning_rate": 2.2102425876010783e-06, "loss": 2.4424, "step": 927 }, { "epoch": 0.7918088737201365, "grad_norm": 27.64750099182129, "learning_rate": 2.2012578616352204e-06, "loss": 2.418, "step": 928 }, { "epoch": 0.7926621160409556, "grad_norm": 345.17022705078125, "learning_rate": 2.192273135669362e-06, "loss": 2.2305, "step": 929 }, { "epoch": 0.7935153583617748, "grad_norm": 54.37593460083008, "learning_rate": 2.183288409703504e-06, "loss": 3.1572, "step": 930 }, { "epoch": 0.7943686006825939, "grad_norm": 29.86464500427246, "learning_rate": 2.174303683737646e-06, "loss": 2.5684, "step": 931 }, { "epoch": 0.7952218430034129, "grad_norm": 22.760496139526367, "learning_rate": 2.165318957771788e-06, "loss": 2.5928, "step": 932 }, { "epoch": 0.7960750853242321, "grad_norm": 18.5999755859375, "learning_rate": 2.15633423180593e-06, "loss": 2.4062, "step": 933 }, { "epoch": 0.7969283276450512, "grad_norm": 26.545793533325195, "learning_rate": 2.147349505840072e-06, "loss": 2.1387, "step": 934 }, { "epoch": 0.7977815699658704, "grad_norm": 17.7139949798584, "learning_rate": 2.138364779874214e-06, "loss": 2.6348, "step": 935 }, { "epoch": 0.7986348122866894, "grad_norm": 29.3115234375, "learning_rate": 2.129380053908356e-06, "loss": 2.624, "step": 936 }, { "epoch": 0.7994880546075085, "grad_norm": 33.964839935302734, "learning_rate": 2.120395327942498e-06, "loss": 2.4248, "step": 937 }, { "epoch": 0.8003412969283277, "grad_norm": 32.854530334472656, "learning_rate": 2.1114106019766396e-06, "loss": 2.75, "step": 938 }, { "epoch": 0.8011945392491467, "grad_norm": 19.812353134155273, "learning_rate": 2.1024258760107817e-06, "loss": 2.209, "step": 939 }, { "epoch": 0.8020477815699659, "grad_norm": 14.995081901550293, "learning_rate": 2.093441150044924e-06, "loss": 2.2539, "step": 940 }, { "epoch": 0.802901023890785, "grad_norm": 40.92326736450195, "learning_rate": 2.084456424079066e-06, "loss": 2.3555, "step": 941 }, { "epoch": 0.8037542662116041, "grad_norm": 40.27710723876953, "learning_rate": 2.075471698113208e-06, "loss": 2.4297, "step": 942 }, { "epoch": 0.8046075085324232, "grad_norm": 25.924610137939453, "learning_rate": 2.0664869721473496e-06, "loss": 2.5576, "step": 943 }, { "epoch": 0.8054607508532423, "grad_norm": 39.17571258544922, "learning_rate": 2.0575022461814917e-06, "loss": 2.5166, "step": 944 }, { "epoch": 0.8063139931740614, "grad_norm": 24.386545181274414, "learning_rate": 2.0485175202156334e-06, "loss": 2.5127, "step": 945 }, { "epoch": 0.8071672354948806, "grad_norm": 23.48984718322754, "learning_rate": 2.0395327942497755e-06, "loss": 2.5996, "step": 946 }, { "epoch": 0.8080204778156996, "grad_norm": 14.407853126525879, "learning_rate": 2.0305480682839176e-06, "loss": 1.7422, "step": 947 }, { "epoch": 0.8088737201365188, "grad_norm": 25.272546768188477, "learning_rate": 2.0215633423180597e-06, "loss": 1.9629, "step": 948 }, { "epoch": 0.8097269624573379, "grad_norm": 41.10344696044922, "learning_rate": 2.0125786163522013e-06, "loss": 2.4893, "step": 949 }, { "epoch": 0.810580204778157, "grad_norm": 34.03232955932617, "learning_rate": 2.0035938903863434e-06, "loss": 2.3135, "step": 950 }, { "epoch": 0.8114334470989761, "grad_norm": 19.308574676513672, "learning_rate": 1.9946091644204855e-06, "loss": 2.3301, "step": 951 }, { "epoch": 0.8122866894197952, "grad_norm": 19.482486724853516, "learning_rate": 1.985624438454627e-06, "loss": 2.3931, "step": 952 }, { "epoch": 0.8131399317406144, "grad_norm": 19.74332046508789, "learning_rate": 1.9766397124887693e-06, "loss": 2.082, "step": 953 }, { "epoch": 0.8139931740614335, "grad_norm": 24.09305763244629, "learning_rate": 1.967654986522911e-06, "loss": 2.5146, "step": 954 }, { "epoch": 0.8148464163822525, "grad_norm": 23.39406394958496, "learning_rate": 1.958670260557053e-06, "loss": 2.9053, "step": 955 }, { "epoch": 0.8156996587030717, "grad_norm": 18.08087921142578, "learning_rate": 1.949685534591195e-06, "loss": 1.9541, "step": 956 }, { "epoch": 0.8165529010238908, "grad_norm": 22.059133529663086, "learning_rate": 1.940700808625337e-06, "loss": 3.1865, "step": 957 }, { "epoch": 0.8174061433447098, "grad_norm": 16.29061508178711, "learning_rate": 1.9317160826594793e-06, "loss": 2.0771, "step": 958 }, { "epoch": 0.818259385665529, "grad_norm": 23.810237884521484, "learning_rate": 1.922731356693621e-06, "loss": 2.8477, "step": 959 }, { "epoch": 0.8191126279863481, "grad_norm": 30.86567497253418, "learning_rate": 1.913746630727763e-06, "loss": 2.082, "step": 960 }, { "epoch": 0.8199658703071673, "grad_norm": 24.14322853088379, "learning_rate": 1.904761904761905e-06, "loss": 2.9727, "step": 961 }, { "epoch": 0.8208191126279863, "grad_norm": 20.15610694885254, "learning_rate": 1.895777178796047e-06, "loss": 2.4414, "step": 962 }, { "epoch": 0.8216723549488054, "grad_norm": 105.63774108886719, "learning_rate": 1.8867924528301889e-06, "loss": 2.3037, "step": 963 }, { "epoch": 0.8225255972696246, "grad_norm": 17.520694732666016, "learning_rate": 1.8778077268643308e-06, "loss": 2.2676, "step": 964 }, { "epoch": 0.8233788395904437, "grad_norm": 26.543331146240234, "learning_rate": 1.8688230008984728e-06, "loss": 2.2607, "step": 965 }, { "epoch": 0.8242320819112628, "grad_norm": 22.674585342407227, "learning_rate": 1.8598382749326147e-06, "loss": 2.4355, "step": 966 }, { "epoch": 0.8250853242320819, "grad_norm": 29.76778793334961, "learning_rate": 1.8508535489667568e-06, "loss": 2.7432, "step": 967 }, { "epoch": 0.825938566552901, "grad_norm": 32.59353256225586, "learning_rate": 1.8418688230008985e-06, "loss": 2.6934, "step": 968 }, { "epoch": 0.8267918088737202, "grad_norm": 24.909629821777344, "learning_rate": 1.8328840970350406e-06, "loss": 1.8232, "step": 969 }, { "epoch": 0.8276450511945392, "grad_norm": 46.205047607421875, "learning_rate": 1.8238993710691824e-06, "loss": 2.8975, "step": 970 }, { "epoch": 0.8284982935153583, "grad_norm": 17.16266441345215, "learning_rate": 1.8149146451033245e-06, "loss": 2.5156, "step": 971 }, { "epoch": 0.8293515358361775, "grad_norm": 104.19178009033203, "learning_rate": 1.8059299191374666e-06, "loss": 2.5391, "step": 972 }, { "epoch": 0.8302047781569966, "grad_norm": 13.162457466125488, "learning_rate": 1.7969451931716083e-06, "loss": 1.8076, "step": 973 }, { "epoch": 0.8310580204778157, "grad_norm": 16.426860809326172, "learning_rate": 1.7879604672057504e-06, "loss": 2.6025, "step": 974 }, { "epoch": 0.8319112627986348, "grad_norm": 18.29142189025879, "learning_rate": 1.7789757412398922e-06, "loss": 2.667, "step": 975 }, { "epoch": 0.8327645051194539, "grad_norm": 17.840328216552734, "learning_rate": 1.7699910152740343e-06, "loss": 2.4551, "step": 976 }, { "epoch": 0.8336177474402731, "grad_norm": 17.80030059814453, "learning_rate": 1.7610062893081762e-06, "loss": 2.5186, "step": 977 }, { "epoch": 0.8344709897610921, "grad_norm": 34.99458694458008, "learning_rate": 1.7520215633423183e-06, "loss": 2.4072, "step": 978 }, { "epoch": 0.8353242320819113, "grad_norm": 17.811429977416992, "learning_rate": 1.7430368373764604e-06, "loss": 2.1963, "step": 979 }, { "epoch": 0.8361774744027304, "grad_norm": 19.649438858032227, "learning_rate": 1.734052111410602e-06, "loss": 2.3896, "step": 980 }, { "epoch": 0.8370307167235495, "grad_norm": 27.297197341918945, "learning_rate": 1.7250673854447441e-06, "loss": 2.375, "step": 981 }, { "epoch": 0.8378839590443686, "grad_norm": 17.75628662109375, "learning_rate": 1.716082659478886e-06, "loss": 1.9648, "step": 982 }, { "epoch": 0.8387372013651877, "grad_norm": 19.516536712646484, "learning_rate": 1.7070979335130281e-06, "loss": 2.3828, "step": 983 }, { "epoch": 0.8395904436860068, "grad_norm": 15.936653137207031, "learning_rate": 1.6981132075471698e-06, "loss": 2.2891, "step": 984 }, { "epoch": 0.840443686006826, "grad_norm": 32.02035903930664, "learning_rate": 1.6891284815813119e-06, "loss": 2.0713, "step": 985 }, { "epoch": 0.841296928327645, "grad_norm": 24.9638671875, "learning_rate": 1.6801437556154537e-06, "loss": 2.3364, "step": 986 }, { "epoch": 0.8421501706484642, "grad_norm": 27.20418930053711, "learning_rate": 1.6711590296495958e-06, "loss": 2.2085, "step": 987 }, { "epoch": 0.8430034129692833, "grad_norm": 18.280986785888672, "learning_rate": 1.662174303683738e-06, "loss": 1.8594, "step": 988 }, { "epoch": 0.8438566552901023, "grad_norm": 52.79808807373047, "learning_rate": 1.6531895777178798e-06, "loss": 2.7715, "step": 989 }, { "epoch": 0.8447098976109215, "grad_norm": 22.51861572265625, "learning_rate": 1.6442048517520217e-06, "loss": 2.7188, "step": 990 }, { "epoch": 0.8455631399317406, "grad_norm": 47.933372497558594, "learning_rate": 1.6352201257861635e-06, "loss": 3.1797, "step": 991 }, { "epoch": 0.8464163822525598, "grad_norm": 20.896522521972656, "learning_rate": 1.6262353998203056e-06, "loss": 2.2227, "step": 992 }, { "epoch": 0.8472696245733788, "grad_norm": 24.34409523010254, "learning_rate": 1.6172506738544475e-06, "loss": 2.2549, "step": 993 }, { "epoch": 0.8481228668941979, "grad_norm": 34.13801574707031, "learning_rate": 1.6082659478885896e-06, "loss": 2.7314, "step": 994 }, { "epoch": 0.8489761092150171, "grad_norm": 20.989660263061523, "learning_rate": 1.5992812219227317e-06, "loss": 2.7432, "step": 995 }, { "epoch": 0.8498293515358362, "grad_norm": 20.589107513427734, "learning_rate": 1.5902964959568734e-06, "loss": 2.4668, "step": 996 }, { "epoch": 0.8506825938566553, "grad_norm": 28.15553092956543, "learning_rate": 1.5813117699910154e-06, "loss": 2.7314, "step": 997 }, { "epoch": 0.8515358361774744, "grad_norm": 15.138993263244629, "learning_rate": 1.5723270440251573e-06, "loss": 2.3604, "step": 998 }, { "epoch": 0.8523890784982935, "grad_norm": 41.964927673339844, "learning_rate": 1.5633423180592994e-06, "loss": 2.4551, "step": 999 }, { "epoch": 0.8532423208191127, "grad_norm": 24.446157455444336, "learning_rate": 1.554357592093441e-06, "loss": 2.75, "step": 1000 }, { "epoch": 0.8540955631399317, "grad_norm": 23.93151092529297, "learning_rate": 1.5453728661275832e-06, "loss": 2.4736, "step": 1001 }, { "epoch": 0.8549488054607508, "grad_norm": 29.804882049560547, "learning_rate": 1.5363881401617253e-06, "loss": 2.0986, "step": 1002 }, { "epoch": 0.85580204778157, "grad_norm": 29.341312408447266, "learning_rate": 1.5274034141958671e-06, "loss": 2.1719, "step": 1003 }, { "epoch": 0.856655290102389, "grad_norm": 19.525724411010742, "learning_rate": 1.5184186882300092e-06, "loss": 2.2354, "step": 1004 }, { "epoch": 0.8575085324232082, "grad_norm": 20.416091918945312, "learning_rate": 1.509433962264151e-06, "loss": 2.5049, "step": 1005 }, { "epoch": 0.8583617747440273, "grad_norm": 12.879405975341797, "learning_rate": 1.5004492362982932e-06, "loss": 2.001, "step": 1006 }, { "epoch": 0.8592150170648464, "grad_norm": 15.967265129089355, "learning_rate": 1.4914645103324349e-06, "loss": 2.4395, "step": 1007 }, { "epoch": 0.8600682593856656, "grad_norm": 27.89084243774414, "learning_rate": 1.482479784366577e-06, "loss": 2.1934, "step": 1008 }, { "epoch": 0.8609215017064846, "grad_norm": 26.392724990844727, "learning_rate": 1.4734950584007188e-06, "loss": 2.5176, "step": 1009 }, { "epoch": 0.8617747440273038, "grad_norm": 40.049503326416016, "learning_rate": 1.464510332434861e-06, "loss": 2.1807, "step": 1010 }, { "epoch": 0.8626279863481229, "grad_norm": 16.034893035888672, "learning_rate": 1.455525606469003e-06, "loss": 2.3408, "step": 1011 }, { "epoch": 0.863481228668942, "grad_norm": 16.689733505249023, "learning_rate": 1.4465408805031447e-06, "loss": 2.1162, "step": 1012 }, { "epoch": 0.8643344709897611, "grad_norm": 19.43463134765625, "learning_rate": 1.4375561545372868e-06, "loss": 2.292, "step": 1013 }, { "epoch": 0.8651877133105802, "grad_norm": 38.333335876464844, "learning_rate": 1.4285714285714286e-06, "loss": 2.9941, "step": 1014 }, { "epoch": 0.8660409556313993, "grad_norm": 20.772680282592773, "learning_rate": 1.4195867026055707e-06, "loss": 2.0811, "step": 1015 }, { "epoch": 0.8668941979522184, "grad_norm": 15.94913387298584, "learning_rate": 1.4106019766397126e-06, "loss": 2.7207, "step": 1016 }, { "epoch": 0.8677474402730375, "grad_norm": 22.80181884765625, "learning_rate": 1.4016172506738545e-06, "loss": 2.5518, "step": 1017 }, { "epoch": 0.8686006825938567, "grad_norm": 24.729825973510742, "learning_rate": 1.3926325247079966e-06, "loss": 2.1699, "step": 1018 }, { "epoch": 0.8694539249146758, "grad_norm": 34.91142654418945, "learning_rate": 1.3836477987421384e-06, "loss": 2.6553, "step": 1019 }, { "epoch": 0.8703071672354948, "grad_norm": 25.28569793701172, "learning_rate": 1.3746630727762805e-06, "loss": 2.25, "step": 1020 }, { "epoch": 0.871160409556314, "grad_norm": 22.64327621459961, "learning_rate": 1.3656783468104224e-06, "loss": 1.7666, "step": 1021 }, { "epoch": 0.8720136518771331, "grad_norm": 20.240745544433594, "learning_rate": 1.3566936208445645e-06, "loss": 2.9014, "step": 1022 }, { "epoch": 0.8728668941979523, "grad_norm": 20.950401306152344, "learning_rate": 1.3477088948787062e-06, "loss": 2.4102, "step": 1023 }, { "epoch": 0.8737201365187713, "grad_norm": 36.6849479675293, "learning_rate": 1.3387241689128482e-06, "loss": 2.8906, "step": 1024 }, { "epoch": 0.8745733788395904, "grad_norm": 31.71055030822754, "learning_rate": 1.3297394429469901e-06, "loss": 2.417, "step": 1025 }, { "epoch": 0.8754266211604096, "grad_norm": 26.664091110229492, "learning_rate": 1.3207547169811322e-06, "loss": 2.6895, "step": 1026 }, { "epoch": 0.8762798634812287, "grad_norm": 21.710546493530273, "learning_rate": 1.3117699910152743e-06, "loss": 2.2451, "step": 1027 }, { "epoch": 0.8771331058020477, "grad_norm": 23.674776077270508, "learning_rate": 1.302785265049416e-06, "loss": 2.6943, "step": 1028 }, { "epoch": 0.8779863481228669, "grad_norm": 17.062026977539062, "learning_rate": 1.293800539083558e-06, "loss": 2.5938, "step": 1029 }, { "epoch": 0.878839590443686, "grad_norm": 19.835830688476562, "learning_rate": 1.2848158131177e-06, "loss": 1.8984, "step": 1030 }, { "epoch": 0.8796928327645052, "grad_norm": 22.001405715942383, "learning_rate": 1.275831087151842e-06, "loss": 2.3711, "step": 1031 }, { "epoch": 0.8805460750853242, "grad_norm": 19.773300170898438, "learning_rate": 1.266846361185984e-06, "loss": 2.2451, "step": 1032 }, { "epoch": 0.8813993174061433, "grad_norm": 20.110618591308594, "learning_rate": 1.257861635220126e-06, "loss": 2.0088, "step": 1033 }, { "epoch": 0.8822525597269625, "grad_norm": 18.571706771850586, "learning_rate": 1.2488769092542679e-06, "loss": 2.6729, "step": 1034 }, { "epoch": 0.8831058020477816, "grad_norm": 22.619691848754883, "learning_rate": 1.2398921832884097e-06, "loss": 2.3291, "step": 1035 }, { "epoch": 0.8839590443686007, "grad_norm": 15.416295051574707, "learning_rate": 1.2309074573225516e-06, "loss": 2.1562, "step": 1036 }, { "epoch": 0.8848122866894198, "grad_norm": 18.906389236450195, "learning_rate": 1.2219227313566937e-06, "loss": 2.2881, "step": 1037 }, { "epoch": 0.8856655290102389, "grad_norm": 26.780014038085938, "learning_rate": 1.2129380053908358e-06, "loss": 2.1504, "step": 1038 }, { "epoch": 0.886518771331058, "grad_norm": 23.079744338989258, "learning_rate": 1.2039532794249777e-06, "loss": 2.5264, "step": 1039 }, { "epoch": 0.8873720136518771, "grad_norm": 18.029769897460938, "learning_rate": 1.1949685534591195e-06, "loss": 2.4541, "step": 1040 }, { "epoch": 0.8882252559726962, "grad_norm": 30.483293533325195, "learning_rate": 1.1859838274932616e-06, "loss": 2.7627, "step": 1041 }, { "epoch": 0.8890784982935154, "grad_norm": 18.396657943725586, "learning_rate": 1.1769991015274035e-06, "loss": 2.1699, "step": 1042 }, { "epoch": 0.8899317406143344, "grad_norm": 42.24234390258789, "learning_rate": 1.1680143755615454e-06, "loss": 2.4609, "step": 1043 }, { "epoch": 0.8907849829351536, "grad_norm": 37.315792083740234, "learning_rate": 1.1590296495956873e-06, "loss": 1.9644, "step": 1044 }, { "epoch": 0.8916382252559727, "grad_norm": 18.035730361938477, "learning_rate": 1.1500449236298294e-06, "loss": 2.2441, "step": 1045 }, { "epoch": 0.8924914675767918, "grad_norm": 33.729732513427734, "learning_rate": 1.1410601976639714e-06, "loss": 2.7402, "step": 1046 }, { "epoch": 0.893344709897611, "grad_norm": 17.152904510498047, "learning_rate": 1.1320754716981133e-06, "loss": 2.2881, "step": 1047 }, { "epoch": 0.89419795221843, "grad_norm": 30.483760833740234, "learning_rate": 1.1230907457322552e-06, "loss": 2.1406, "step": 1048 }, { "epoch": 0.8950511945392492, "grad_norm": 22.160011291503906, "learning_rate": 1.1141060197663973e-06, "loss": 2.4102, "step": 1049 }, { "epoch": 0.8959044368600683, "grad_norm": 21.861427307128906, "learning_rate": 1.1051212938005392e-06, "loss": 2.377, "step": 1050 }, { "epoch": 0.8967576791808873, "grad_norm": 16.11711883544922, "learning_rate": 1.096136567834681e-06, "loss": 2.2207, "step": 1051 }, { "epoch": 0.8976109215017065, "grad_norm": 16.705957412719727, "learning_rate": 1.087151841868823e-06, "loss": 2.4766, "step": 1052 }, { "epoch": 0.8984641638225256, "grad_norm": 21.10558319091797, "learning_rate": 1.078167115902965e-06, "loss": 2.8291, "step": 1053 }, { "epoch": 0.8993174061433447, "grad_norm": 16.919170379638672, "learning_rate": 1.069182389937107e-06, "loss": 2.3379, "step": 1054 }, { "epoch": 0.9001706484641638, "grad_norm": 16.1108455657959, "learning_rate": 1.060197663971249e-06, "loss": 2.0049, "step": 1055 }, { "epoch": 0.9010238907849829, "grad_norm": 14.90361213684082, "learning_rate": 1.0512129380053909e-06, "loss": 2.0732, "step": 1056 }, { "epoch": 0.9018771331058021, "grad_norm": 22.117961883544922, "learning_rate": 1.042228212039533e-06, "loss": 2.4478, "step": 1057 }, { "epoch": 0.9027303754266212, "grad_norm": 17.26648712158203, "learning_rate": 1.0332434860736748e-06, "loss": 2.4834, "step": 1058 }, { "epoch": 0.9035836177474402, "grad_norm": 29.885637283325195, "learning_rate": 1.0242587601078167e-06, "loss": 2.5439, "step": 1059 }, { "epoch": 0.9044368600682594, "grad_norm": 36.06587600708008, "learning_rate": 1.0152740341419588e-06, "loss": 2.3203, "step": 1060 }, { "epoch": 0.9052901023890785, "grad_norm": 23.25389862060547, "learning_rate": 1.0062893081761007e-06, "loss": 1.8232, "step": 1061 }, { "epoch": 0.9061433447098977, "grad_norm": 19.698678970336914, "learning_rate": 9.973045822102428e-07, "loss": 2.1807, "step": 1062 }, { "epoch": 0.9069965870307167, "grad_norm": 20.899768829345703, "learning_rate": 9.883198562443846e-07, "loss": 2.0088, "step": 1063 }, { "epoch": 0.9078498293515358, "grad_norm": 17.271106719970703, "learning_rate": 9.793351302785265e-07, "loss": 2.4834, "step": 1064 }, { "epoch": 0.908703071672355, "grad_norm": 18.44869041442871, "learning_rate": 9.703504043126686e-07, "loss": 2.2676, "step": 1065 }, { "epoch": 0.909556313993174, "grad_norm": 22.660959243774414, "learning_rate": 9.613656783468105e-07, "loss": 2.5537, "step": 1066 }, { "epoch": 0.9104095563139932, "grad_norm": 27.0996150970459, "learning_rate": 9.523809523809525e-07, "loss": 2.2266, "step": 1067 }, { "epoch": 0.9112627986348123, "grad_norm": 34.511531829833984, "learning_rate": 9.433962264150944e-07, "loss": 2.3223, "step": 1068 }, { "epoch": 0.9121160409556314, "grad_norm": 27.928035736083984, "learning_rate": 9.344115004492364e-07, "loss": 2.6211, "step": 1069 }, { "epoch": 0.9129692832764505, "grad_norm": 27.532997131347656, "learning_rate": 9.254267744833784e-07, "loss": 2.2324, "step": 1070 }, { "epoch": 0.9138225255972696, "grad_norm": 14.844276428222656, "learning_rate": 9.164420485175203e-07, "loss": 2.2422, "step": 1071 }, { "epoch": 0.9146757679180887, "grad_norm": 21.835037231445312, "learning_rate": 9.074573225516623e-07, "loss": 2.2881, "step": 1072 }, { "epoch": 0.9155290102389079, "grad_norm": 19.69972801208496, "learning_rate": 8.984725965858041e-07, "loss": 2.1846, "step": 1073 }, { "epoch": 0.9163822525597269, "grad_norm": 15.349184036254883, "learning_rate": 8.894878706199461e-07, "loss": 1.7319, "step": 1074 }, { "epoch": 0.9172354948805461, "grad_norm": 19.928043365478516, "learning_rate": 8.805031446540881e-07, "loss": 2.4658, "step": 1075 }, { "epoch": 0.9180887372013652, "grad_norm": 17.366472244262695, "learning_rate": 8.715184186882302e-07, "loss": 2.2046, "step": 1076 }, { "epoch": 0.9189419795221843, "grad_norm": 29.670156478881836, "learning_rate": 8.625336927223721e-07, "loss": 2.3613, "step": 1077 }, { "epoch": 0.9197952218430034, "grad_norm": 30.919553756713867, "learning_rate": 8.535489667565141e-07, "loss": 1.9961, "step": 1078 }, { "epoch": 0.9206484641638225, "grad_norm": 14.773041725158691, "learning_rate": 8.445642407906559e-07, "loss": 1.9248, "step": 1079 }, { "epoch": 0.9215017064846417, "grad_norm": 24.538284301757812, "learning_rate": 8.355795148247979e-07, "loss": 2.3975, "step": 1080 }, { "epoch": 0.9223549488054608, "grad_norm": 19.382570266723633, "learning_rate": 8.265947888589399e-07, "loss": 2.1125, "step": 1081 }, { "epoch": 0.9232081911262798, "grad_norm": 19.567092895507812, "learning_rate": 8.176100628930818e-07, "loss": 2.7686, "step": 1082 }, { "epoch": 0.924061433447099, "grad_norm": 29.195287704467773, "learning_rate": 8.086253369272238e-07, "loss": 2.8613, "step": 1083 }, { "epoch": 0.9249146757679181, "grad_norm": 20.474294662475586, "learning_rate": 7.996406109613658e-07, "loss": 2.2227, "step": 1084 }, { "epoch": 0.9257679180887372, "grad_norm": 18.846967697143555, "learning_rate": 7.906558849955077e-07, "loss": 2.5205, "step": 1085 }, { "epoch": 0.9266211604095563, "grad_norm": 23.362014770507812, "learning_rate": 7.816711590296497e-07, "loss": 2.3252, "step": 1086 }, { "epoch": 0.9274744027303754, "grad_norm": 23.076448440551758, "learning_rate": 7.726864330637916e-07, "loss": 1.7588, "step": 1087 }, { "epoch": 0.9283276450511946, "grad_norm": 24.485366821289062, "learning_rate": 7.637017070979336e-07, "loss": 2.397, "step": 1088 }, { "epoch": 0.9291808873720137, "grad_norm": 21.237762451171875, "learning_rate": 7.547169811320755e-07, "loss": 2.7598, "step": 1089 }, { "epoch": 0.9300341296928327, "grad_norm": 26.516183853149414, "learning_rate": 7.457322551662174e-07, "loss": 2.7891, "step": 1090 }, { "epoch": 0.9308873720136519, "grad_norm": 21.238964080810547, "learning_rate": 7.367475292003594e-07, "loss": 2.5303, "step": 1091 }, { "epoch": 0.931740614334471, "grad_norm": 30.594709396362305, "learning_rate": 7.277628032345015e-07, "loss": 1.8975, "step": 1092 }, { "epoch": 0.9325938566552902, "grad_norm": 35.4166145324707, "learning_rate": 7.187780772686434e-07, "loss": 1.9258, "step": 1093 }, { "epoch": 0.9334470989761092, "grad_norm": 22.17268943786621, "learning_rate": 7.097933513027854e-07, "loss": 2.4492, "step": 1094 }, { "epoch": 0.9343003412969283, "grad_norm": 17.64724349975586, "learning_rate": 7.008086253369272e-07, "loss": 2.5547, "step": 1095 }, { "epoch": 0.9351535836177475, "grad_norm": 24.73906135559082, "learning_rate": 6.918238993710692e-07, "loss": 2.5254, "step": 1096 }, { "epoch": 0.9360068259385665, "grad_norm": 26.848501205444336, "learning_rate": 6.828391734052112e-07, "loss": 2.6846, "step": 1097 }, { "epoch": 0.9368600682593856, "grad_norm": 20.17809295654297, "learning_rate": 6.738544474393531e-07, "loss": 2.2471, "step": 1098 }, { "epoch": 0.9377133105802048, "grad_norm": 22.41636085510254, "learning_rate": 6.648697214734951e-07, "loss": 1.8472, "step": 1099 }, { "epoch": 0.9385665529010239, "grad_norm": 39.55388641357422, "learning_rate": 6.558849955076372e-07, "loss": 2.3638, "step": 1100 }, { "epoch": 0.939419795221843, "grad_norm": 33.42490005493164, "learning_rate": 6.46900269541779e-07, "loss": 2.4688, "step": 1101 }, { "epoch": 0.9402730375426621, "grad_norm": 21.77603530883789, "learning_rate": 6.37915543575921e-07, "loss": 2.2363, "step": 1102 }, { "epoch": 0.9411262798634812, "grad_norm": 24.46465301513672, "learning_rate": 6.28930817610063e-07, "loss": 2.2061, "step": 1103 }, { "epoch": 0.9419795221843004, "grad_norm": 19.78148651123047, "learning_rate": 6.199460916442049e-07, "loss": 2.5908, "step": 1104 }, { "epoch": 0.9428327645051194, "grad_norm": 31.342111587524414, "learning_rate": 6.109613656783469e-07, "loss": 2.5615, "step": 1105 }, { "epoch": 0.9436860068259386, "grad_norm": 13.717397689819336, "learning_rate": 6.019766397124888e-07, "loss": 2.0635, "step": 1106 }, { "epoch": 0.9445392491467577, "grad_norm": 21.84761619567871, "learning_rate": 5.929919137466308e-07, "loss": 2.293, "step": 1107 }, { "epoch": 0.9453924914675768, "grad_norm": 37.024166107177734, "learning_rate": 5.840071877807727e-07, "loss": 2.249, "step": 1108 }, { "epoch": 0.9462457337883959, "grad_norm": 17.425418853759766, "learning_rate": 5.750224618149147e-07, "loss": 2.3672, "step": 1109 }, { "epoch": 0.947098976109215, "grad_norm": 26.030006408691406, "learning_rate": 5.660377358490567e-07, "loss": 2.124, "step": 1110 }, { "epoch": 0.9479522184300341, "grad_norm": 22.431434631347656, "learning_rate": 5.570530098831986e-07, "loss": 2.3848, "step": 1111 }, { "epoch": 0.9488054607508533, "grad_norm": 19.168577194213867, "learning_rate": 5.480682839173405e-07, "loss": 2.1758, "step": 1112 }, { "epoch": 0.9496587030716723, "grad_norm": 17.07505226135254, "learning_rate": 5.390835579514825e-07, "loss": 2.374, "step": 1113 }, { "epoch": 0.9505119453924915, "grad_norm": 26.080429077148438, "learning_rate": 5.300988319856245e-07, "loss": 2.3418, "step": 1114 }, { "epoch": 0.9513651877133106, "grad_norm": 21.243762969970703, "learning_rate": 5.211141060197665e-07, "loss": 2.1953, "step": 1115 }, { "epoch": 0.9522184300341296, "grad_norm": 15.045170783996582, "learning_rate": 5.121293800539083e-07, "loss": 2.1016, "step": 1116 }, { "epoch": 0.9530716723549488, "grad_norm": 23.7006778717041, "learning_rate": 5.031446540880503e-07, "loss": 2.8184, "step": 1117 }, { "epoch": 0.9539249146757679, "grad_norm": 18.883411407470703, "learning_rate": 4.941599281221923e-07, "loss": 2.6475, "step": 1118 }, { "epoch": 0.9547781569965871, "grad_norm": 25.889921188354492, "learning_rate": 4.851752021563343e-07, "loss": 2.0693, "step": 1119 }, { "epoch": 0.9556313993174061, "grad_norm": 22.17185401916504, "learning_rate": 4.7619047619047623e-07, "loss": 2.4834, "step": 1120 }, { "epoch": 0.9564846416382252, "grad_norm": 22.72771644592285, "learning_rate": 4.672057502246182e-07, "loss": 2.5225, "step": 1121 }, { "epoch": 0.9573378839590444, "grad_norm": 14.462482452392578, "learning_rate": 4.5822102425876014e-07, "loss": 2.0977, "step": 1122 }, { "epoch": 0.9581911262798635, "grad_norm": 16.800121307373047, "learning_rate": 4.4923629829290207e-07, "loss": 2.0635, "step": 1123 }, { "epoch": 0.9590443686006825, "grad_norm": 31.013629913330078, "learning_rate": 4.4025157232704405e-07, "loss": 2.3926, "step": 1124 }, { "epoch": 0.9598976109215017, "grad_norm": 16.524974822998047, "learning_rate": 4.3126684636118604e-07, "loss": 2.1387, "step": 1125 }, { "epoch": 0.9607508532423208, "grad_norm": 25.04954719543457, "learning_rate": 4.2228212039532797e-07, "loss": 2.6875, "step": 1126 }, { "epoch": 0.96160409556314, "grad_norm": 22.931171417236328, "learning_rate": 4.1329739442946995e-07, "loss": 2.2598, "step": 1127 }, { "epoch": 0.962457337883959, "grad_norm": 18.490894317626953, "learning_rate": 4.043126684636119e-07, "loss": 2.0537, "step": 1128 }, { "epoch": 0.9633105802047781, "grad_norm": 18.204050064086914, "learning_rate": 3.9532794249775386e-07, "loss": 2.5342, "step": 1129 }, { "epoch": 0.9641638225255973, "grad_norm": 40.86081314086914, "learning_rate": 3.863432165318958e-07, "loss": 2.4844, "step": 1130 }, { "epoch": 0.9650170648464164, "grad_norm": 14.383349418640137, "learning_rate": 3.773584905660378e-07, "loss": 1.7993, "step": 1131 }, { "epoch": 0.9658703071672355, "grad_norm": 23.310863494873047, "learning_rate": 3.683737646001797e-07, "loss": 2.2402, "step": 1132 }, { "epoch": 0.9667235494880546, "grad_norm": 18.63228416442871, "learning_rate": 3.593890386343217e-07, "loss": 1.998, "step": 1133 }, { "epoch": 0.9675767918088737, "grad_norm": 82.3555908203125, "learning_rate": 3.504043126684636e-07, "loss": 2.5557, "step": 1134 }, { "epoch": 0.9684300341296929, "grad_norm": 16.93051528930664, "learning_rate": 3.414195867026056e-07, "loss": 2.2803, "step": 1135 }, { "epoch": 0.9692832764505119, "grad_norm": 21.192161560058594, "learning_rate": 3.3243486073674753e-07, "loss": 2.7246, "step": 1136 }, { "epoch": 0.9701365187713311, "grad_norm": 18.740169525146484, "learning_rate": 3.234501347708895e-07, "loss": 2.7227, "step": 1137 }, { "epoch": 0.9709897610921502, "grad_norm": 19.469411849975586, "learning_rate": 3.144654088050315e-07, "loss": 2.2979, "step": 1138 }, { "epoch": 0.9718430034129693, "grad_norm": 18.27403450012207, "learning_rate": 3.0548068283917343e-07, "loss": 2.251, "step": 1139 }, { "epoch": 0.9726962457337884, "grad_norm": 24.299030303955078, "learning_rate": 2.964959568733154e-07, "loss": 2.4541, "step": 1140 }, { "epoch": 0.9735494880546075, "grad_norm": 24.530475616455078, "learning_rate": 2.8751123090745734e-07, "loss": 2.1475, "step": 1141 }, { "epoch": 0.9744027303754266, "grad_norm": 25.455007553100586, "learning_rate": 2.785265049415993e-07, "loss": 2.3701, "step": 1142 }, { "epoch": 0.9752559726962458, "grad_norm": 16.469362258911133, "learning_rate": 2.6954177897574125e-07, "loss": 1.9355, "step": 1143 }, { "epoch": 0.9761092150170648, "grad_norm": 19.01254653930664, "learning_rate": 2.6055705300988324e-07, "loss": 2.3057, "step": 1144 }, { "epoch": 0.976962457337884, "grad_norm": 19.243854522705078, "learning_rate": 2.5157232704402517e-07, "loss": 2.4736, "step": 1145 }, { "epoch": 0.9778156996587031, "grad_norm": 22.926416397094727, "learning_rate": 2.4258760107816715e-07, "loss": 1.9111, "step": 1146 }, { "epoch": 0.9786689419795221, "grad_norm": 21.436504364013672, "learning_rate": 2.336028751123091e-07, "loss": 2.5264, "step": 1147 }, { "epoch": 0.9795221843003413, "grad_norm": 18.81846809387207, "learning_rate": 2.2461814914645103e-07, "loss": 2.2744, "step": 1148 }, { "epoch": 0.9803754266211604, "grad_norm": 17.731666564941406, "learning_rate": 2.1563342318059302e-07, "loss": 2.3506, "step": 1149 }, { "epoch": 0.9812286689419796, "grad_norm": 18.162921905517578, "learning_rate": 2.0664869721473497e-07, "loss": 2.3584, "step": 1150 }, { "epoch": 0.9820819112627986, "grad_norm": 17.115859985351562, "learning_rate": 1.9766397124887693e-07, "loss": 2.3174, "step": 1151 }, { "epoch": 0.9829351535836177, "grad_norm": 19.12236785888672, "learning_rate": 1.886792452830189e-07, "loss": 2.8047, "step": 1152 }, { "epoch": 0.9837883959044369, "grad_norm": 46.08415603637695, "learning_rate": 1.7969451931716084e-07, "loss": 2.2979, "step": 1153 }, { "epoch": 0.984641638225256, "grad_norm": 19.95178985595703, "learning_rate": 1.707097933513028e-07, "loss": 2.5273, "step": 1154 }, { "epoch": 0.985494880546075, "grad_norm": 19.663936614990234, "learning_rate": 1.6172506738544476e-07, "loss": 2.1338, "step": 1155 }, { "epoch": 0.9863481228668942, "grad_norm": 18.48255729675293, "learning_rate": 1.5274034141958671e-07, "loss": 2.2705, "step": 1156 }, { "epoch": 0.9872013651877133, "grad_norm": 14.638534545898438, "learning_rate": 1.4375561545372867e-07, "loss": 1.9199, "step": 1157 }, { "epoch": 0.9880546075085325, "grad_norm": 17.064104080200195, "learning_rate": 1.3477088948787063e-07, "loss": 2.3135, "step": 1158 }, { "epoch": 0.9889078498293515, "grad_norm": 24.58094024658203, "learning_rate": 1.2578616352201258e-07, "loss": 2.5918, "step": 1159 }, { "epoch": 0.9897610921501706, "grad_norm": 15.54403305053711, "learning_rate": 1.1680143755615455e-07, "loss": 2.0586, "step": 1160 }, { "epoch": 0.9906143344709898, "grad_norm": 19.432096481323242, "learning_rate": 1.0781671159029651e-07, "loss": 2.0234, "step": 1161 }, { "epoch": 0.9914675767918089, "grad_norm": 23.13845443725586, "learning_rate": 9.883198562443847e-08, "loss": 2.1523, "step": 1162 }, { "epoch": 0.992320819112628, "grad_norm": 19.302988052368164, "learning_rate": 8.984725965858042e-08, "loss": 2.5889, "step": 1163 }, { "epoch": 0.9931740614334471, "grad_norm": 22.22420883178711, "learning_rate": 8.086253369272238e-08, "loss": 2.4434, "step": 1164 }, { "epoch": 0.9940273037542662, "grad_norm": 16.57465362548828, "learning_rate": 7.187780772686433e-08, "loss": 2.2539, "step": 1165 }, { "epoch": 0.9948805460750854, "grad_norm": 29.630794525146484, "learning_rate": 6.289308176100629e-08, "loss": 2.3193, "step": 1166 }, { "epoch": 0.9957337883959044, "grad_norm": 16.782909393310547, "learning_rate": 5.3908355795148254e-08, "loss": 1.9961, "step": 1167 }, { "epoch": 0.9965870307167235, "grad_norm": 25.104806900024414, "learning_rate": 4.492362982929021e-08, "loss": 2.375, "step": 1168 }, { "epoch": 0.9974402730375427, "grad_norm": 15.419062614440918, "learning_rate": 3.593890386343217e-08, "loss": 1.9033, "step": 1169 }, { "epoch": 0.9982935153583617, "grad_norm": 18.09457015991211, "learning_rate": 2.6954177897574127e-08, "loss": 2.3037, "step": 1170 }, { "epoch": 0.9991467576791809, "grad_norm": 20.33756446838379, "learning_rate": 1.7969451931716084e-08, "loss": 2.2764, "step": 1171 }, { "epoch": 1.0, "grad_norm": 22.51215362548828, "learning_rate": 8.984725965858042e-09, "loss": 2.6963, "step": 1172 }, { "epoch": 1.0, "step": 1172, "total_flos": 1.0293976231418266e+18, "train_loss": 2.8942469053300983, "train_runtime": 765.0994, "train_samples_per_second": 392.106, "train_steps_per_second": 1.532 } ], "logging_steps": 1, "max_steps": 1172, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0293976231418266e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }